├── .dockerignore
├── .gitignore
├── .travis.yml
├── Dockerfile
├── LICENSE
├── README.md
├── app
├── pom.xml
└── src
│ ├── main
│ ├── resources
│ │ └── logback.xml
│ └── scala
│ │ └── edu
│ │ └── knowitall
│ │ ├── ollie
│ │ ├── OllieCli.scala
│ │ └── SentenceIterator.scala
│ │ └── openparse
│ │ ├── OpenParseCli.scala
│ │ ├── OpenParseGui.scala
│ │ └── gui
│ │ ├── Dot.scala
│ │ ├── ExtractionEntry.scala
│ │ ├── Parser.scala
│ │ └── Sentence.scala
│ └── test
│ └── resources
│ └── logback-test.xml
├── core
├── build.sbt
├── here.txt
├── pom.xml
├── project
│ └── plugins.sbt
├── scripts
│ ├── applypatterns.sh
│ ├── build_templates.sh
│ ├── create_patterns.sh
│ ├── create_test_train.sh
│ ├── extractor.sh
│ └── keep_common_patterns.sh
└── src
│ ├── main
│ ├── resources
│ │ └── edu
│ │ │ └── knowitall
│ │ │ ├── ollie
│ │ │ ├── cognitiveWords.txt
│ │ │ ├── communicationWords.txt
│ │ │ ├── confidence
│ │ │ │ └── default-classifier.txt
│ │ │ └── prefixWords.txt
│ │ │ └── openparse
│ │ │ ├── categories
│ │ │ ├── location.txt
│ │ │ └── person.txt
│ │ │ └── openparse.model
│ └── scala
│ │ └── edu
│ │ └── knowitall
│ │ ├── common
│ │ └── enrich
│ │ │ └── Traversable.scala
│ │ ├── ollie
│ │ ├── DependencyGraphExtras.scala
│ │ ├── NaryExtraction.scala
│ │ ├── Ollie.scala
│ │ ├── OllieExtraction.scala
│ │ ├── OllieExtractionInstance.scala
│ │ ├── ScoredOllieExtractionInstance.scala
│ │ ├── confidence
│ │ │ ├── OllieConfidenceFunction.scala
│ │ │ ├── OllieFeatureEvaluation.scala
│ │ │ ├── OllieFeatureSet.scala
│ │ │ └── train
│ │ │ │ ├── CrossValidateConfidence.scala
│ │ │ │ └── TrainOllieConfidence.scala
│ │ └── output
│ │ │ └── BratOutput.scala
│ │ └── openparse
│ │ ├── AnalyzePatterns.scala
│ │ ├── BuildPatterns.scala
│ │ ├── ExtractorPattern.scala
│ │ ├── GraphExpansions.scala
│ │ ├── OpenParse.scala
│ │ ├── bootstrap
│ │ ├── FilterTargetExtractions.scala
│ │ ├── FindCommon.scala
│ │ ├── FindTargetArguments.scala
│ │ └── FindTargetExtractions.scala
│ │ ├── eval
│ │ ├── GroupScoredBy.scala
│ │ ├── PrecisionYield.scala
│ │ ├── RankPatterns.scala
│ │ ├── Score.scala
│ │ └── StatisticalSignificance.scala
│ │ ├── extract
│ │ ├── Extraction.scala
│ │ ├── GeneralExtractor.scala
│ │ ├── PatternExtractor.scala
│ │ ├── SpecificExtractor.scala
│ │ └── TemplateExtractor.scala
│ │ └── template
│ │ ├── BuildTemplates.scala
│ │ ├── CountsToConfidence.scala
│ │ ├── GeneralizeTemplate.scala
│ │ └── PassiveReflections.scala
│ └── test
│ ├── resources
│ └── logback-test.xml
│ └── scala
│ └── edu
│ └── knowitall
│ ├── common
│ └── enrich
│ │ └── TraversableSpecTest.scala
│ ├── ollie
│ ├── DependencyGraphExtrasSpec.scala
│ └── confidence
│ │ └── OllieFeatureSetSpec.scala
│ └── openparse
│ ├── BuildPatternsSpec.scala
│ ├── ExtractorPatternSpec.scala
│ ├── OllieSpec.scala
│ ├── OpenParseSpec.scala
│ └── PatternExtractorSpec.scala
├── data
└── training.tsv
├── example
├── pom.xml
└── src
│ └── main
│ ├── java
│ └── example
│ │ └── JavaOllieWrapper.java
│ ├── resouces
│ └── logback.xml
│ └── scala
│ └── ollie
│ └── Example.scala
└── pom.xml
/.dockerignore:
--------------------------------------------------------------------------------
1 | Dockerfile
2 | .dockerignore
3 | .gitignore
4 | .git
5 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | target
2 | .cache
3 | .classpath
4 | .project
5 | .settings
6 | engmalt.linear.mco
7 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: scala
2 | scala:
3 | - "2.9.2"
4 | jdk:
5 | - oraclejdk7
6 | - openjdk7
7 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM maven:3.5.2-jdk-7
2 |
3 | WORKDIR /stage
4 |
5 | COPY ./ /stage/
6 | RUN curl http://www.maltparser.org/mco/english_parser/engmalt.linear-1.7.mco > /stage/engmalt.linear-1.7.mco
7 | RUN mvn clean package
8 |
9 | CMD ["java", "-Xmx512m", "-jar", "ollie-app-1.0.1-SNAPSHOT.jar"]
10 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Ollie Software License Agreement
2 |
3 | Ollie Software
4 | (C) 2011-2012, University of Washington. All rights reserved.
5 | US patent number 7,877,343 and 12/970,155 patent pending
6 |
7 | The University of Washington (UW), Professor Mausam, Michael Schmitz, Robert
8 | Bart, and Stephen Soderland, (Developers) give permission for you and your
9 | laboratory (University) to use Ollie. Ollie is a system that extracts
10 | relational triples from text. Ollie is protected by a United States copyright
11 | and patents. The National Science Foundation supported work on Ollie. Under
12 | University of Washington's patents 7,877,343 (issued) and 12/970,155 (patent
13 | pending), the UW grants to you the non-exclusive right to use patent claims
14 | practiced by the University of Washington's Ollie software solely for
15 | non-commercial purposes and as long as you comply with the terms of this Ollie
16 | Software License Agreement. UW and the Developers allow you to copy and modify
17 | Ollie for non-commercial purposes, and to distribute modifications through
18 | GitHub or directly to the University of Washington, on the following
19 | conditions:
20 |
21 |
22 | 1. Ollie is not used for any commercial purposes, or as part of a system
23 | which has commercial purposes.
24 |
25 |
26 | 2. Any software derived from Ollie must carry prominent notices stating that
27 | you modified it along with the date modified. The derivative must also carry
28 | prominent notices stating that it is released under this Ollie Software
29 | License Agreement.
30 |
31 | If you wish to obtain Ollie or to obtain any patent rights for any commercial
32 | purposes, you will need to contact the University of Washington to see if
33 | rights are available and to negotiate a commercial license and pay a fee. This
34 | includes, but is not limited to, using Ollie to provide services to outside
35 | parties for a fee. In that case please contact:
36 |
37 | UW Center for Commercialization
38 | University of Washington
39 | 4311 11th Ave. NE,
40 | Suite 500 Seattle, WA 98105-4608
41 |
42 | Phone: (206) 543-3970
43 | Email: license@u.washington.edu
44 |
45 |
46 | 3. You retain in Ollie and any modifications to Ollie, the copyright,
47 | trademark, patent or other notices pertaining to Ollie as provided by UW.
48 |
49 |
50 | 4. You provide the Developers with feedback on the use of the Ollie software
51 | in your research, and that the Developers and UW are permitted to use any
52 | information you provide in making changes to the Ollie software. All bug
53 | reports and technical questions shall be sent to: afader@cs.washington.edu.
54 | Modifications may be communicated through GitHub pull requests at:
55 |
56 | https://github.com/knowitall/
57 |
58 |
59 | 5. You acknowledge that the Developers, UW and its licensees may develop
60 | modifications to Ollie that may be substantially similar to your modifications
61 | of Ollie, and that the Developers, UW and its licensees shall not be
62 | constrained in any way by you in UW's or its licensees' use or management of
63 | such modifications. You acknowledge the right of the Developers and UW to
64 | prepare and publish modifications to Ollie that may be substantially similar
65 | or functionally equivalent to your modifications and improvements, and if you
66 | obtain patent protection for any modification or improvement to Ollie you
67 | agree not to allege or enjoin infringement of your patent by the Developers,
68 | the UW or by any of UW's licensees obtaining modifications or improvements to
69 | Ollie from the University of Washington or the Developers.
70 |
71 |
72 | 6. If utilization of the Ollie software results in outcomes which will be
73 | published, please specify the version of Ollie you used and cite the UW
74 | Developers.
75 |
76 | @inproceedings{ollie-emnlp12,
77 | author = {Mausam and Michael Schmitz and Robert Bart and
78 | Stephen Soderland and Oren Etzioni},
79 | title = {Open Language Learning for Information Extraction},
80 | booktitle = {Proceedings of Conference on Empirical Methods in
81 | Natural Language Processing and Computational Natural
82 | Language Learning (EMNLP-CONLL)},
83 | year = {2012}
84 | }
85 |
86 |
87 | 7. Any risk associated with using the Ollie software at your organization is
88 | with you and your organization. Ollie is experimental in nature and is made
89 | available as a research courtesy "AS IS," without obligation by UW to provide
90 | accompanying services or support.
91 |
92 |
93 | UW AND THE AUTHORS EXPRESSLY DISCLAIM ANY AND ALL WARRANTIES REGARDING THE
94 | SOFTWARE, WHETHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO WARRANTIES
95 | PERTAINING TO MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
96 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Ollie
2 |
3 | Ollie is a program that automatically identifies and extracts binary
4 | relationships from English sentences. Ollie is designed for Web-scale
5 | information extraction, where target relations are not specified in advance.
6 |
7 | Ollie is our second-generation information extraction system. Whereas ReVerb operates on flat sequences
9 | of tokens, Ollie works with the tree-like (graph with only small cycles)
10 | representation using Stanford's compression of the dependencies. This allows
11 | Ollie to capture expressions that ReVerb misses, such as long-range relations.
12 |
13 | Ollie also captures context that modifies a binary relation. Presently Ollie
14 | handles attribution (He said/she believes) and enabling conditions (if X
15 | then).
16 |
17 | ## Quick Start
18 |
19 | ### Docker
20 |
21 | You can now run Ollie with a single Docker command.
22 |
23 | ```
24 | docker run -it schmmd/ollie:latest
25 | ```
26 |
27 | To configure Ollie, you can drop into a bash shell with `docker run -it schmmd/ollie:latest /bin/bash`
28 | and run Ollie from the command line.
29 |
30 | ### Local Machine
31 |
32 | If you want to run Ollie on a small amount of text without modifying the source
33 | code, you can use an executable file that can be run from the command line.
34 | Please note that Ollie was built using Scala 2.9 and so it requires Java 7.
35 | Follow these steps to get started:
36 |
37 | 1. Download the latest Ollie binary from
38 | http://knowitall.cs.washington.edu/ollie/ollie-app-latest.jar.
39 |
40 | 2. Download the linear English MaltParser model (engmalt.linear-1.7.mco) from
41 | http://www.maltparser.org/mco/english_parser/engmalt.html
42 | and place it in the same directory as Ollie.
43 |
44 | 3. Run `java -Xmx512m -jar ollie-app-latest.jar yourfile.txt`. The input file
45 | should contain one sentence per line unless `--split` is specified. Omit
46 | the input file for an interactive console.
47 |
48 | ## Examples
49 |
50 | ### Enabling Condition
51 |
52 | An enabling condition is a condition that needs to be met for the extraction to
53 | be true. Certain words demarcate an enabling condition, such as "if" and "when".
54 | Ollie captures enabling conditions if they are present.
55 |
56 | sentence: If I slept past noon, I'd be late for work.
57 | extraction: (I; 'd be late for; work)[enabler=If I slept past noon]
58 |
59 | ### Attribution
60 |
61 | An attribution clause specifies an entity that asserted an extraction and a
62 | verb that specifies the expression. Ollie captures attributions if they are
63 | present.
64 |
65 | sentence: Some people say Barack Obama was not born in the United States.
66 | extraction: (Barack Obama; was not born in; the United States)[attrib=Some people say]
67 |
68 | sentence: Early astronomers believe that the earth is the center of the universe.
69 | extraction: (the earth; is the center of; the universe)[attrib=Early astronomers believe]
70 |
71 | ### Relational noun
72 |
73 | Some relations are expressed without verbs. Ollie can capture these as well as
74 | verb-mediated relations.
75 |
76 | sentence: Microsoft co-founder Bill Gates spoke at a conference on Monday.
77 | extraction: (Bill Gates; be co-founder of; Microsoft)
78 |
79 |
80 | ### N-ary extractions
81 |
82 | Oftentimes, similar relations will specify different aspects of the same event.
83 | Since Ollie captures long-range relations it can capture N-ary extractions by
84 | collapsing extractions where the relation phrase only differs by the
85 | preposition.
86 |
87 | sentence: I learned that the 2012 Sasquatch music festival is scheduled for May 25th until May 28th.
88 | extraction: (the 2012 Sasquatch music festival; is scheduled for; May 25th)
89 | extraction: (the 2012 Sasquatch music festival; is scheduled until; May 28th)
90 | nary: (the 2012 Sasquatch music festival; is scheduled; [for May 25th; to May 28th])
91 |
92 | ## Building
93 |
94 | Building Ollie from source requires Apache Maven (http://maven.apache.org).
95 | First, clone or download the Ollie source from GitHub. Run this command in the
96 | top-level source folder to download the required dependencies, compile, and
97 | create a single jar file.
98 |
99 | mvn clean package
100 |
101 | The compiled class files will be put in the base directory. The single
102 | executable jar file will be written to `ollie-app-VERSION.jar` where `VERSION`
103 | is the version number.
104 |
105 | ## Command Line Interface
106 |
107 | Once you have built Ollie, you can run it from the command line.
108 |
109 | java -Xmx512m -jar ollie-app-VERSION.jar yourfile.txt
110 |
111 | Omit the input file for an interactive console.
112 |
113 | Ollie takes sentences, one-per-line as input or splits text into sentences if
114 | `--split` is specified. Run Ollie with `--usage` to see full usage.
115 |
116 | The Ollie command line tool has a few output formats. The output format is
117 | specified by `--output-format` and a valid format:
118 |
119 | 1. The `interactive` format that is meant to be easily human readable.
120 | 2. The `tabbed` format is meant to be easily parsable. A header will be output
121 | as the first row to label the columns.
122 | 3. `tabbedsingle` is similar to `tabbed` but the extraction is output as (arg1; relation;
123 | arg2) in a single column.
124 | 4. The `serialized` format is meant to be fully deserialized into an
125 | `OllieExtractionInstance` class.
126 |
127 | ## Graphical Interface
128 |
129 | Ollie works on top of a subcomponent called OpenParse. The distinction is
130 | largely technical; OpenParse does not handle attribution and enabling condition
131 | and uses a coarser confidence metric. You can use a GUI application to
132 | visualize the OpenParse extractions in a parse tree. To use it, you will need
133 | to have [graphviz](http://www.graphviz.org/) installed. You can run the GUI
134 | with:
135 |
136 | java -Xms512M -Xmx1g -cp ollie-app-VERSION.jar edu.knowitall.openparse.OpenParseGui
137 |
138 | By default, this application will look for graphviz's `dot` program at
139 | `/usr/bin/dot`. You can specify a location with the `--graphviz` parameter.
140 |
141 | You can try out your own models with `Options->Load Model...`. To see an
142 | example model, look at `openparse.model` in `src/main/resources`. Your model
143 | may have one or more patterns in it. If you want to see pattern matches
144 | (without node expansion) instead of triple extractions, you can choose to show
145 | the raw match with `Options->Raw Matches`. This will allow you to use patterns
146 | that do not capture an arg1, rel, and arg2.
147 |
148 | ## Parsers
149 |
150 | Ollie is packaged to use Malt Parser, one of the fastest dependency parsers
151 | available. You will need the model file (`engmalt.linear-1.7.mco`) in the
152 | directory the application is run from or you will need to specify its location
153 | with the `--malt-model` parameter. Malt Parser models are available online.
154 |
155 | http://www.maltparser.org/mco/english_parser/engmalt.html
156 |
157 | Ollie works with any other parser in the `nlptools` project. For example, it
158 | is easy to swap out Malt for Stanford's parser. Stanford's parser is not a
159 | part of the Ollie distribution by default because of licensing conflicts, but
160 | the Stanford parser was used as the execution parser for the results in the
161 | paper. Malt Parser was used to bootstrap the patterns. We are interested
162 | in Clear parser as an alternative, but it's not a trivial change because Clear
163 | uses a slightly different dependency representation.
164 |
165 | ## Using Eclipse
166 |
167 | To modify the Ollie source code in Eclipse, use the [M2Eclipse
168 | plugin](http://www.sonatype.org/m2eclipse/) along with
169 | [ScalaIDE](http://scala-ide.org/). You can then import the project using
170 | the following.
171 |
172 | File > Import > Existing Maven Projects
173 |
174 | ## Including Ollie as a Dependency
175 |
176 | Add the following as a Maven dependency.
177 |
178 | edu.washington.cs.knowitall.ollie
179 | ollie-core_2.9.2
180 | [1.0.0, )
181 |
182 | The best way to find the latest version is to browse [Maven Central](http://search.maven.org/#search%7Cga%7C1%7Cg%3A%22edu.washington.cs.knowitall%22).
183 |
184 | `ollie-core` does not include a way to parse sentences. You will need to use a
185 | parser supplied by the [nlptools](https://github.com/knowitall/nlptools)
186 | project. The source for `ollie-app` is an excellent example of a project
187 | using `ollie-core` as a dependency. `ollie-app` supplies a parser from
188 | [nlptools](https://github.com/knowitall/nlptools).
189 |
190 | There is an example project that uses Ollie in the `example` folder of the
191 | source distribution.
192 |
193 | ## Training the Confidence Function
194 |
195 | While Ollie comes with a trained confidence function, it is possible to retrain
196 | the confidence function. First, you need to run Ollie over a set of sentences
197 | and store the output in the *serialized* format.
198 |
199 | echo "Michael rolled down the hill." | java -jar ollie-app-1.0.0-SNAPSHOT.jar --serialized --output toannotate.tsv
200 |
201 | Next you need to annotate the extractions. Modify the output file and
202 | **change** the first column to a binary annotation--`1` for correct and `0` for
203 | wrong. Your final file will look similar to `ollie/data/training.tsv`. Now
204 | run the logistic regression trainer.
205 |
206 | java -cp ollie-app-1.0.0-SNAPSHOT.jar edu.washington.cs.knowitall.ollie.confidence.train.TrainOllieConfidence toannotate.tsv
207 |
208 | ## Concurrency
209 |
210 | When operating at web scale, parallelism is essential. While the base Ollie
211 | extractor is immutable and thread safe, the parser may not be thread safe. I
212 | do not know whether Malt parser is thread safe.
213 |
214 | ## FAQ
215 |
216 | 1. How fast is Ollie?
217 |
218 | You should really benchmark Ollie yourself, but on my computer (a new computer in 2011), Ollie processed 5000 high-quality web sentences in 56 seconds, or 89 sentences per second, in a single thread. Ollie is easily parallelizable and the Ollie extractor itself is threadsafe (see Concurrency section).
219 |
220 | ## Contact
221 |
222 | To contact the UW about Ollie, email knowit-ollie@cs.washington.edu.
223 |
224 | ## Citing Ollie
225 | If you use Ollie in your academic work, please cite Ollie with the following
226 | BibTeX citation:
227 |
228 | @inproceedings{ollie-emnlp12,
229 | author = {Mausam and Michael Schmitz and Robert Bart and Stephen Soderland and Oren Etzioni},
230 | title = {Open Language Learning for Information Extraction},
231 | booktitle = {Proceedings of Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning (EMNLP-CONLL)},
232 | year = {2012}
233 | }
234 |
--------------------------------------------------------------------------------
/app/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 | edu.washington.cs.knowitall.ollie
5 | ollie-app
6 | ollie-app
7 | 1.0.1-SNAPSHOT
8 |
9 | edu.washington.cs.knowitall
10 | knowitall-oss
11 | 1.0.2
12 |
13 |
14 | UTF-8
15 | 2.4.0
16 |
17 |
18 |
19 | org.scala-lang
20 | scala-swing
21 | 2.9.2
22 |
23 |
24 | edu.washington.cs.knowitall.ollie
25 | ollie-core_2.9.2
26 | 1.0.2
27 |
28 |
29 | edu.washington.cs.knowitall.nlptools
30 | nlptools-parse-malt_2.9.2
31 | ${nlptools.version}
32 |
33 |
34 | edu.washington.cs.knowitall.nlptools
35 | nlptools-parse-stanford_2.9.2
36 | ${nlptools.version}
37 |
38 |
39 | edu.washington.cs.knowitall.nlptools
40 | nlptools-sentence-opennlp_2.9.2
41 | ${nlptools.version}
42 |
43 |
44 | junit
45 | junit
46 | 4.11
47 | test
48 |
49 |
50 | batik
51 | batik-swing
52 | 1.6-1
53 |
54 |
55 | org.specs2
56 | specs2_2.9.2
57 | 1.12.3
58 | test
59 |
60 |
61 |
62 | ch.qos.logback
63 | logback-classic
64 | 1.0.9
65 |
66 |
67 | ch.qos.logback
68 | logback-core
69 | 1.0.9
70 |
71 |
72 |
73 | src/main/scala
74 | src/test/scala
75 |
76 |
77 |
78 | net.alchim31.maven
79 | scala-maven-plugin
80 | 3.1.1
81 |
82 |
83 | -deprecation
84 | -unchecked
85 |
86 |
87 |
88 |
89 |
90 | compile
91 | testCompile
92 |
93 |
94 |
95 |
96 |
97 | maven-assembly-plugin
98 |
99 | ${project.build.directory}/../..
100 | false
101 |
102 |
103 | edu.knowitall.ollie.OllieCli
104 |
105 |
106 |
107 |
108 |
109 | distro-assembly
110 | package
111 |
112 | single
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
--------------------------------------------------------------------------------
/app/src/main/resources/logback.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
6 |
7 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/app/src/main/scala/edu/knowitall/ollie/SentenceIterator.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.ollie
2 | 
3 | import edu.knowitall.tool.segment.Segmenter
4 | 
5 | /** An iterator over sentences, segmented lazily from paragraphs of input lines.
6 |   *
7 |   * Blank lines delimit paragraphs.  Each paragraph is joined into a single
8 |   * string and run through the supplied segmenter only when its sentences
9 |   * are requested.
10 |   */
11 | class SentenceIterator(sentencer: Segmenter, private var lines: BufferedIterator[String]) extends Iterator[String] {
12 |   var sentences: Iterator[String] = Iterator.empty
13 | 
14 |   // Skip any leading blank lines.  The previous code discarded the iterator
15 |   // returned by dropWhile, so `lines` was never actually advanced past them.
16 |   lines = lines.dropWhile(_.trim.isEmpty).buffered
17 | 
18 |   /** Consume the next paragraph and segment it into sentences,
19 |     * advancing `lines` past the trailing blank separator lines. */
20 |   def nextSentences = {
21 |     val (paragraph, rest) = lines.span(!_.trim.isEmpty)
22 |     lines = rest.dropWhile(_.trim.isEmpty).buffered
23 |     sentencer.segmentTexts(paragraph.mkString(" ")).iterator.buffered
24 |   }
25 | 
26 |   def hasNext: Boolean = {
27 |     if (sentences.hasNext) {
28 |       true
29 |     }
30 |     else if (!lines.hasNext) {
31 |       false
32 |     }
33 |     else {
34 |       sentences = nextSentences
35 |       sentences.hasNext
36 |     }
37 |   }
38 | 
39 |   def next: String = {
40 |     if (sentences.hasNext) {
41 |       sentences.next()
42 |     }
43 |     else {
44 |       // May throw NoSuchElementException when exhausted, per Iterator contract.
45 |       sentences = nextSentences
46 |       sentences.next()
47 |     }
48 |   }
49 | }
--------------------------------------------------------------------------------
/app/src/main/scala/edu/knowitall/openparse/OpenParseCli.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse
2 |
3 | import java.io.{PrintWriter, File}
4 | import java.net.URL
5 |
6 | import scala.collection.Set
7 | import scala.io.Source
8 |
9 | import org.slf4j.LoggerFactory
10 |
11 | import edu.knowitall.collection.immutable.graph.pattern.Match
12 | import edu.knowitall.collection.immutable.graph.Graph
13 | import edu.knowitall.common.Resource.using
14 | import edu.knowitall.common.Timing
15 | import edu.knowitall.tool.parse.MaltParser
16 | import edu.knowitall.openparse.OpenParse.validMatch
17 | import edu.knowitall.openparse.extract.{TemplateExtractor, PatternExtractorType, PatternExtractor, GeneralExtractor, Extraction, DetailedExtraction}
18 | import edu.knowitall.tool.parse.graph.{DependencyNode, DependencyGraph}
19 |
20 | import scopt.OptionParser
21 |
22 | object OpenParseCli {
23 | val logger = LoggerFactory.getLogger(this.getClass)
24 |
25 | abstract class Settings {
26 | def modelUrl: URL
27 | def outputFile: Option[File]
28 | def sentenceFile: File
29 |
30 | def confidenceThreshold: Double
31 | def expandArguments: Boolean
32 | def verbose: Boolean
33 |
34 | def parallel: Boolean
35 | def invincible: Boolean
36 | }
37 |
38 | def main(args: Array[String]) {
39 | object settings extends Settings {
40 | var modelUrl: URL = OpenParse.defaultModelUrl
41 | var outputFile: Option[File] = None
42 | var sentenceFile: File = null
43 |
44 | var confidenceThreshold = 0.0;
45 | var expandArguments: Boolean = true
46 | var verbose: Boolean = false
47 |
48 | var parallel: Boolean = false
49 | var invincible: Boolean = false
50 | }
51 |
52 | val parser = new OptionParser("openparse-cli") {
53 | arg("sentences", "sentence file", { path: String =>
54 | val file = new File(path)
55 | require(file.exists, "file does not exist: " + path)
56 | settings.sentenceFile = file
57 | })
58 | opt(Some("m"), "model", "", "model file", { path: String =>
59 | val file = new File(path)
60 | require(file.exists, "file does not exist: " + path)
61 | settings.modelUrl = file.toURI.toURL
62 | })
63 | doubleOpt(Some("t"), "threshold", "", "confident threshold for shown extractions", { t: Double => settings.confidenceThreshold = t })
64 | opt("o", "output", "output file (otherwise stdout)", { path => settings.outputFile = Some(new File(path)) })
65 |
66 | opt("x", "expand-arguments", "expand extraction arguments", { settings.expandArguments = true })
67 | opt("v", "verbose", "", { settings.verbose = true })
68 |
69 | opt("p", "parallel", "", { settings.parallel = true })
70 | opt("invincible", "", { settings.invincible = true })
71 | }
72 |
73 | if (parser.parse(args)) {
74 | logger.info("args: " + args.mkString(" "))
75 | run(settings)
76 | }
77 | }
78 |
79 | def run(settings: Settings) {
80 | val parser = new MaltParser
81 | def parse(line: String): Option[DependencyGraph] = {
82 | Some(parser.dependencyGraph(line))
83 | }
84 |
85 | val other = new OpenParse.Settings {
86 | var modelUrl = settings.modelUrl
87 | var outputFile = settings.outputFile
88 | var sentenceFile = settings.sentenceFile
89 | var confidenceThreshold = settings.confidenceThreshold
90 | val duplicates = false
91 | var expandArguments = settings.expandArguments
92 | val showAll = false
93 | var verbose = settings.verbose
94 | val collapseVB = false
95 | var parallel = settings.parallel
96 | var invincible = settings.invincible
97 | }
98 |
99 | OpenParse.run(other, parse)
100 | }
101 | }
102 |
--------------------------------------------------------------------------------
/app/src/main/scala/edu/knowitall/openparse/gui/Dot.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse.gui
2 |
3 | import edu.knowitall.openparse.extract.TemplateExtractor
4 | import edu.knowitall.common.Resource.using
5 | import edu.knowitall.tool.parse.graph.DependencyGraph
6 | import edu.knowitall.tool.parse.graph.DependencyNode
7 | import java.io.IOException
8 | import scala.swing.Dialog
9 | import scala.io.Source
10 | import java.io.InputStream
11 | import java.io.OutputStream
12 | import java.io.PrintWriter
13 | import java.io.File
14 |
15 | /** Code pertaining to rendering and converting DOT graphs. */
16 | object Dot {
17 | def dot2svg(graphvizFile: Option[File], dotgraph: String) = {
18 | import sys.process.ProcessIO
19 |
20 | trait InputHandler[A] {
21 | def handle(a: A)(input: OutputStream)
22 | }
23 |
24 | trait OutputHandler[A] {
25 | def handle(output: InputStream)
26 | def value: A
27 | }
28 |
29 | val errHandler = new OutputHandler[String] {
30 | var value: String = null
31 |
32 | def handle(out: InputStream) {
33 | value = Source.fromInputStream(out).mkString
34 | out.close()
35 | }
36 | }
37 |
38 | val inputHandler = new InputHandler[String] {
39 | def handle(a: String)(os: OutputStream) {
40 | val pw = new PrintWriter(os)
41 | pw write a
42 | pw.close()
43 | }
44 | }
45 |
46 | val outputHandler = new OutputHandler[String] {
47 | var value: String = null
48 |
49 | def handle(out: InputStream) {
50 | value = Source.fromInputStream(out).mkString
51 | out.close()
52 | }
53 | }
54 | val io = new ProcessIO(inputHandler.handle(dotgraph), outputHandler.handle, errHandler.handle, false)
55 |
56 | val process = graphvizFile match {
57 | case Some(file) => sys.process.Process(file.getAbsolutePath, Seq("-T", "svg"))
58 | case None => sys.process.Process("dot", Seq("-T", "svg"))
59 | }
60 |
61 | val proc = try (process run io)
62 | catch {
63 | case e: IOException =>
64 | Dialog.showMessage(message = e.getMessage() + ". You may need to install graphviz and add it to the PATH variable, or specify the path to the dot program using the '--graphviz' argument.", messageType = Dialog.Message.Error)
65 | throw e
66 | }
67 |
68 | proc.exitValue() match {
69 | case 0 => outputHandler.value
70 | case x => sys.error("Dot exited with error code: " + x + " with output:\n" + errHandler.value)
71 | }
72 | }
73 |
74 | def svg2xml(svgString: String, nodeClickEvent: String=>Unit) = {
75 | import org.apache.batik.dom.svg.SVGDOMImplementation;
76 | import org.apache.batik.util.XMLResourceDescriptor
77 | import org.apache.batik.dom.svg.SAXSVGDocumentFactory
78 |
79 | val uri = SVGDOMImplementation.SVG_NAMESPACE_URI;
80 |
81 | val doc = using(new java.io.StringReader(svgString)) { reader =>
82 | val parser = XMLResourceDescriptor.getXMLParserClassName();
83 | val f = new SAXSVGDocumentFactory(parser);
84 | f.createSVGDocument(uri, reader);
85 | }
86 |
87 | val gs = doc.getElementsByTagNameNS(uri, "g")
88 | for (i <- 0 until gs.getLength) {
89 | val g = gs.item(i)
90 | val attributes = g.getAttributes
91 | val clazz = attributes.getNamedItem("class").getNodeValue
92 |
93 | if (clazz == "node") {
94 | val children = g.getChildNodes
95 | for (j <- 0 until children.getLength) {
96 | val child = children.item(j)
97 | if (child.getNodeName == "title") {
98 | val text = child.getFirstChild.getNodeValue
99 |
100 | import org.w3c.dom.events._
101 | g.asInstanceOf[EventTarget].addEventListener("click",
102 | new EventListener() {
103 | def handleEvent(e: Event) { nodeClickEvent(text) }
104 | },
105 | true);
106 | }
107 | }
108 | }
109 | }
110 |
111 | doc
112 | }
113 |
114 | def dotgraph(dgraph: DependencyGraph, nodes: Set[DependencyNode]) = {
115 | val nodeStyle = nodes.map((_, "style=filled,color=lightblue"))
116 | dgraph.dot(dgraph.text, nodeStyle.toMap, Map.empty)
117 | }
118 |
119 | def dotgraph(dgraph: DependencyGraph, extraction: ExtractionEntry) = {
120 | def originalNodes(nodes: Iterable[DependencyNode]) = nodes.map { node =>
121 | dgraph.nodes.find(_.indices == node.indices).get
122 | }
123 |
124 | val title = "\\n" + dgraph.text + "\\n" + extraction.toString + "\\n" + extraction.`match`.pattern.toStringF((s: String) => if (s.length < 60) s else s.take(20) + "...") +
125 | (extraction.extractor match { case ex: TemplateExtractor => "\\n" + ex.template case _ => "" })
126 |
127 | // nodes
128 | val darkNodes = extraction.`match`.nodeGroups
129 | val lightNodes = originalNodes(extraction.nodes).toSet -- originalNodes(darkNodes.map(_._2.node))
130 | val filledNodes = (lightNodes zip Stream.continually("style=filled,fillcolor=lightgray")) ++
131 | (darkNodes.map { nodeGroup =>
132 | val style = "style=filled,fillcolor=" + (nodeGroup._1 match {
133 | case "rel" => "salmon1"
134 | case "arg1" | "arg2" => "lightblue"
135 | case "slot0" | "slot1" | "slot2" | "slot3" => "seashell"
136 | case _ => "yellow"
137 | })
138 |
139 | (nodeGroup._2.node, style)
140 | })
141 |
142 | // edges
143 | val solidEdges = extraction.edges.toSet
144 |
145 | val nodeStyle = filledNodes
146 | val edgeStyle = (solidEdges zip Stream.continually("style=filled")) ++
147 | ((dgraph.graph.edges.toSet -- solidEdges.toSet) zip Stream.continually("style=dotted,color=gray"))
148 |
149 | dgraph.dot(title, nodeStyle.toMap, edgeStyle.toMap)
150 | }
151 | }
--------------------------------------------------------------------------------
/app/src/main/scala/edu/knowitall/openparse/gui/ExtractionEntry.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse.gui
2 |
3 | import edu.knowitall.collection.immutable.graph.pattern.Match
4 | import edu.knowitall.tool.parse.graph.DependencyNode
5 | import edu.knowitall.openparse.extract.PatternExtractor
6 | import edu.knowitall.openparse.extract.DetailedExtraction
7 |
8 | /**
9 | * A more generic representation of an extraction.
10 | *
11 | * This is needed to allow for raw matches, which do
12 | * not have an arg1, rel, etc.
13 | */
/**
 * A more generic representation of an extraction.
 *
 * This is needed to allow for raw matches, which do
 * not have an arg1, rel, etc.
 */
case class ExtractionEntry(
  confidence: Option[Double],
  `match`: Match[DependencyNode],
  nodes: Set[DependencyNode],
  extractor: PatternExtractor,
  parser: Parser.ParserEnum,
  string: String = "",
  correct: Option[Boolean]) {

  /**
   * Convenient constructor for instantiating from
   * an OpenParse extraction.
   */
  def this(confidence: Double, extraction: DetailedExtraction, parser: Parser.ParserEnum, correct: Option[Boolean] = None) =
    this(Some(confidence), extraction.`match`, extraction.nodes.toSet, extraction.extractor, parser, extraction.toString, correct)

  /** Edges of the underlying pattern match. */
  def edges = `match`.edges

  /** Copy of this entry annotated as correct or incorrect. */
  def annotate(correct: Boolean) = this.copy(correct = Some(correct))

  /** Copy of this entry with the annotation cleared. */
  def unannotate = this.copy(correct = None)

  // prefix reflecting the gold annotation, empty when unannotated
  private def goldString =
    correct map { c => if (c) "+ " else "- " } getOrElse ""

  override def toString = confidence.map("%1.4f:" format _).getOrElse("") + goldString + string
}
--------------------------------------------------------------------------------
/app/src/main/scala/edu/knowitall/openparse/gui/Parser.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse.gui
2 |
3 | import edu.knowitall.tool.parse.DependencyParser
4 | import edu.knowitall.tool.parse.MaltParser
5 | import edu.knowitall.tool.parse.graph.Dependencies
6 | import edu.knowitall.tool.parse.graph.DependencyGraph
7 |
8 | /** An enumerator for parser options */
/** An enumerator for parser options */
object Parser extends Enumeration {
  type ParserEnum = Value

  val Deserialize = Value("Deserialize")
  val Stanford = Value("Stanford")
  val MaltL = Value("Malt (Linear)")
  val MaltPoly = Value("Malt (Poly)")

  /** The parser option selected by default. */
  def default = MaltL

  /**
   * Instantiate the dependency parser backing the given option,
   * returned alongside the option itself.
   */
  def load(parserType: ParserEnum): (ParserEnum, DependencyParser) = {
    val parser: DependencyParser = parserType match {
      case Stanford => new edu.knowitall.tool.parse.StanfordParser
      case MaltL => new MaltParser()
      case MaltPoly => new MaltParser(modelUrl = new java.io.File("engmalt.poly-1.7.mco").toURI.toURL)
      case Deserialize =>
        // no parsing at all: input strings are deserialized graphs
        new DependencyParser() {
          override def dependencies(input: String) = Dependencies.deserialize(input)
          override def dependencyGraph(input: String) = DependencyGraph.deserialize(input)
        }
    }
    (parserType, parser)
  }
}
29 |
--------------------------------------------------------------------------------
/app/src/main/scala/edu/knowitall/openparse/gui/Sentence.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse.gui
2 |
3 | import scala.util.control.Exception.catching
4 |
5 | import edu.knowitall.tool.parse.graph.DependencyGraph
6 | import edu.knowitall.tool.parse.graph.DependencyGraph.SerializationException
7 | import edu.knowitall.tool.parse.graph.DependencyGraph.deserialize
8 |
9 | /** A representation of the input sentence. */
/** A representation of the input sentence. */
sealed abstract class Sentence
object Sentence {
  /** Raw sentence text that still needs to be parsed. */
  case class Text(text: String) extends Sentence {
    override def toString = text
  }
  /** A sentence already available as a dependency graph. */
  case class Graph(dgraph: DependencyGraph) extends Sentence {
    override def toString = dgraph.serialize
  }

  /**
   * Interpret the string as a serialized dependency graph if it
   * deserializes cleanly, otherwise treat it as raw text.
   */
  def apply(string: String): Sentence = {
    import DependencyGraph._

    catching(classOf[SerializationException])
      .opt(deserialize(string))
      .map(Graph(_): Sentence)
      .getOrElse(Text(string))
  }
}
--------------------------------------------------------------------------------
/app/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
6 |
7 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/core/build.sbt:
--------------------------------------------------------------------------------
// Maven coordinates for the published ollie-core artifact.
organization := "edu.washington.cs.knowitall.ollie"

name := "ollie-core"

description := "Wrapper and implementation for extractors of chunked sentences."

version := "1.0.4-SNAPSHOT"

// Cross-built for Scala 2.9 and 2.10; the head entry is the default scalaVersion.
crossScalaVersions := Seq("2.9.2", "2.10.1")

scalaVersion <<= crossScalaVersions { (vs: Seq[String]) => vs.head }

// nlptools provides parsing/stemming/confidence; logback, junit, and specs2 are test-only.
libraryDependencies ++= Seq(
  "edu.washington.cs.knowitall.nlptools" %% "nlptools-core" % "2.4.1",
  "edu.washington.cs.knowitall.nlptools" %% "nlptools-conf-breeze" % "2.4.1",
  "edu.washington.cs.knowitall.nlptools" %% "nlptools-stem-morpha" % "2.4.1",
  "org.slf4j" % "slf4j-api" % "1.7.2",
  "org.scalaz" %% "scalaz-core" % "7.0.0",
  "ch.qos.logback" % "logback-classic" % "1.0.9" % "test",
  "ch.qos.logback" % "logback-core" % "1.0.9" % "test",
  "junit" % "junit" % "4.11" % "test",
  "org.specs2" %% "specs2" % "1.12.3" % "test")

scalacOptions ++= Seq("-unchecked", "-deprecation")

licenses := Seq("Ollie Software License Agreement" -> url("https://raw.github.com/knowitall/ollie/master/LICENSE"))

homepage := Some(url("http://ollie.cs.washington.edu"))

publishMavenStyle := true

resolvers += "Sonatype OSS Snapshots" at "https://oss.sonatype.org/content/repositories/snapshots"

// Snapshot versions go to the Sonatype snapshot repo, releases to staging.
publishTo <<= version { (v: String) =>
  val nexus = "https://oss.sonatype.org/"
  if (v.trim.endsWith("SNAPSHOT"))
    Some("snapshots" at nexus + "content/repositories/snapshots")
  else
    Some("releases" at nexus + "service/local/staging/deploy/maven2")
}
41 |
42 | pomExtra := (
43 |
44 | https://github.com/knowitall/ollie
45 | scm:git://github.com/knowitall/ollie.git
46 | scm:git:git@github.com:knowitall/ollie.git
47 | HEAD
48 |
49 |
50 |
51 | Michael Schmitz
52 |
53 |
54 | Robert Bart
55 |
56 | )
57 |
--------------------------------------------------------------------------------
/core/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 |
5 | edu.washington.cs.knowitall
6 | knowitall-oss
7 | 1.0.2
8 |
9 | edu.washington.cs.knowitall.ollie
10 | ollie-core_2.9.2
11 | 1.0.4-SNAPSHOT
12 | ollie-core
13 | Ollie is an open information extractor for binary relations.
14 |
15 | https://github.com/knowitall/ollie
16 | scm:git://github.com/knowitall/ollie.git
17 | scm:git:git@github.com:knowitall/ollie.git
18 | HEAD
19 |
20 |
21 |
22 | Ollie Software License Agreement
23 | https://raw.github.com/knowitall/ollie/master/LICENSE
24 | repo
25 |
26 |
27 |
28 | University of Washington CSE
29 | http://cs.washington.edu/
30 |
31 |
32 |
33 | Michael Schmitz
34 |
35 |
36 | Robert Bart
37 |
38 |
39 | 2012
40 |
41 | UTF-8
42 | 2.4.1
43 |
44 |
45 |
46 | edu.washington.cs.knowitall.nlptools
47 | nlptools-core_2.9.2
48 | ${nlptools.version}
49 |
50 |
51 | edu.washington.cs.knowitall.nlptools
52 | nlptools-stem-morpha_2.9.2
53 | ${nlptools.version}
54 |
55 |
56 | edu.washington.cs.knowitall.nlptools
57 | nlptools-conf-breeze_2.9.2
58 | ${nlptools.version}
59 |
60 |
61 | org.scalaz
62 | scalaz-core_2.9.2
63 | 7.0.0
64 |
65 |
66 |
67 | org.slf4j
68 | slf4j-api
69 | 1.7.2
70 |
71 |
72 | ch.qos.logback
73 | logback-classic
74 | 1.0.9
75 | test
76 |
77 |
78 | ch.qos.logback
79 | logback-core
80 | 1.0.9
81 | test
82 |
83 |
84 |
85 | junit
86 | junit
87 | 4.11
88 | test
89 |
90 |
91 | org.specs2
92 | specs2_2.9.2
93 | 1.12.3
94 | test
95 |
96 |
97 |
98 | src/main/scala
99 | src/test/scala
100 |
101 |
102 | net.alchim31.maven
103 | scala-maven-plugin
104 | 3.1.1
105 |
106 |
107 |
108 | compile
109 | testCompile
110 | doc-jar
111 |
112 |
113 |
114 |
115 |
116 | -deprecation
117 | -unchecked
118 |
119 |
120 | -Xms128m
121 | -Xmx1024m
122 |
123 |
124 |
125 |
126 |
127 |
128 |
--------------------------------------------------------------------------------
/core/project/plugins.sbt:
--------------------------------------------------------------------------------
// Ivy-style resolver hosting community sbt plugins.
resolvers += Resolver.url("sbt-plugin-releases", new URL("http://scalasbt.artifactoryonline.com/scalasbt/sbt-plugin-releases")) (Resolver.ivyStylePatterns)

// GPG signing of published artifacts (required for Sonatype releases).
addSbtPlugin("com.jsuereth" % "xsbt-gpg-plugin" % "0.6")
4 |
--------------------------------------------------------------------------------
/core/scripts/applypatterns.sh:
--------------------------------------------------------------------------------
1 | # 1 -- patterns
2 | # 2 -- sentences
3 | mvn -q -e -f ../pom.xml compile exec:java -Dexec.mainClass=edu.washington.cs.knowitall.pattern.PatternExtractor -Dexec.args="--patterns $1 --sentences $2"
4 |
--------------------------------------------------------------------------------
/core/scripts/build_templates.sh:
--------------------------------------------------------------------------------
# Build relation templates from raw patterns.
# 1 -- lda directory (expects $1/raw/patterned.txt; writes $1/templates/)
# Create the output directory; -p makes reruns idempotent instead of failing
# when the directory already exists, and quoting protects paths with spaces.
mkdir -p "$1/templates/"
mvn exec:java -Dexec.mainClass=edu.washington.cs.knowitall.pattern.BuildTemplates -Dexec.args="$1/raw/patterned.txt $1/templates/templates.txt --reltemplates $HOME/public/read/reltemplates.txt --debug $1/templates/"
3 |
--------------------------------------------------------------------------------
/core/scripts/create_patterns.sh:
--------------------------------------------------------------------------------
1 | mvn -q -e exec:java -Dexec.mainClass=edu.washington.cs.knowitall.pattern.BuildTreePatterns -Dexec.args="$1/raw/parsed.txt $1/raw/patterned-all.txt -p --length 3" 2> $1/raw/patterned-all.log
2 |
--------------------------------------------------------------------------------
/core/scripts/create_test_train.sh:
--------------------------------------------------------------------------------
1 | # 1 -- lda directory
2 | ROWS="$1/raw/patterned.txt"
3 | TEST="$1/raw/test.txt"
4 | TRAIN="$1/raw/train.txt"
5 | mvn -q -e exec:java -Dexec.mainClass=edu.washington.cs.knowitall.pattern.lda.CreateTestSet -Dexec.args="$ROWS $TEST $TRAIN"
6 |
7 |
--------------------------------------------------------------------------------
/core/scripts/extractor.sh:
--------------------------------------------------------------------------------
1 | echo "$*"
2 | mvn -q -e exec:java -Dexec.mainClass=edu.washington.cs.knowitall.pattern.OpenParse -Dexec.args="$*"
3 |
--------------------------------------------------------------------------------
/core/scripts/keep_common_patterns.sh:
--------------------------------------------------------------------------------
1 | # 1 -- lda directory
2 | cut -f5 "$1/raw/patterned-all.txt" | sort | uniq -c | sort -nr > "$1/raw/patterns.txt"
3 | mvn -q -e exec:java -Dexec.mainClass=edu.washington.cs.knowitall.pattern.KeepCommonPatterns -Dexec.args="$1/raw/patterned-all.txt 10" > "$1/raw/patterned.txt"
4 |
--------------------------------------------------------------------------------
/core/src/main/resources/edu/knowitall/ollie/cognitiveWords.txt:
--------------------------------------------------------------------------------
1 | accept
2 | admit
3 | affirm
4 | aim
5 | allow
6 | apprehend
7 | assert
8 | attest
9 | aver
10 | avouch
11 | avow
12 | believe
13 | claim
14 | comprehend
15 | confirm
16 | conjecture
17 | consider
18 | contend
19 | define
20 | deny
21 | describe
22 | discover
23 | doubt
24 | dream
25 | envisage
26 | expect
27 | fathom
28 | feel
29 | follow
30 | foreknow
31 | foresee
32 | foretell
33 | grant
34 | grasp
35 | guarantee
36 | guess
37 | hold
38 | hope
39 | identify
40 | imagine
41 | infer
42 | intend
43 | know
44 | maintain
45 | mean
46 | misapprehend
47 | misconstrue
48 | misinterpret
49 | misunderstand
50 | observe
51 | plan
52 | portray
53 | presume
54 | prophesy
55 | propose
56 | reaffirm
57 | realize
58 | recognize
59 | recollect
60 | remember
61 | report
62 | represent
63 | repute
64 | reveal
65 | see
66 | show
67 | speculate
68 | suppose
69 | surmise
70 | suspect
71 | swear
72 | think
73 | trust
74 | understand
75 | vaticinate
76 | visualize
77 | wish
78 | yen
--------------------------------------------------------------------------------
/core/src/main/resources/edu/knowitall/ollie/communicationWords.txt:
--------------------------------------------------------------------------------
1 | acknowledge
2 | acquaint
3 | add
4 | advise
5 | affirm
6 | allege
7 | announce
8 | apprise
9 | articulate
10 | believe
11 | blab
12 | blurt
13 | claim
14 | comment
15 | communicate
16 | confess
17 | confide
18 | confirm
19 | consider
20 | convey
21 | corroborate
22 | declare
23 | deem
24 | demonstrate
25 | disclose
26 | divulge
27 | elaborate
28 | elucidate
29 | establish
30 | esteem
31 | exclaim
32 | explain
33 | explicate
34 | expound
35 | feel
36 | illustrate
37 | imagine
38 | inform
39 | insinuate
40 | insist
41 | intimate
42 | justify
43 | know
44 | leak
45 | lecture
46 | mention
47 | moralize
48 | narrate
49 | note
50 | notify
51 | observe
52 | pose
53 | preach
54 | proclaim
55 | promulgate
56 | propose
57 | prove
58 | rant
59 | rate
60 | read
61 | reaffirm
62 | recite
63 | reckon
64 | recount
65 | reiterate
66 | relate
67 | relay
68 | remark
69 | remember
70 | remind
71 | repeat
72 | reply
73 | report
74 | respond
75 | retort
76 | reveal
77 | say
78 | see
79 | show
80 | sniff
81 | speak
82 | state
83 | suppose
84 | suspect
85 | talk
86 | teach
87 | tell
88 | testify
89 | theorize
90 | think
91 | update
92 | utter
93 | venture
94 | verify
95 | view
96 | voice
97 | write
--------------------------------------------------------------------------------
/core/src/main/resources/edu/knowitall/ollie/confidence/default-classifier.txt:
--------------------------------------------------------------------------------
1 | args start and end with noun 0.030924657084179144
2 | rel ends with of 0.1013506657501542
3 | arg1 contains pronoun 0.19630801348782667
4 | arg2 contains pronoun -0.13341646099789348
5 | long relation -0.3547145229191737
6 | gap of 10 in rel -0.34306426484946456
7 | vacuous extraction -0.6389807893982924
8 | nn edges in pattern 0.9130032848389
9 | arg1 is proper 0.07933280909554899
10 | Intercept 0.0
11 | sentence begins with arg1 -0.1588407285556643
12 | if right before arg1 -1.2206208992816086
13 | arg2 is proper -0.04306420146120506
14 | arg2 borders appositive -0.0017006187220647805
15 | rel contains gerund -0.26200297625650837
16 | arg1 borders appositive -0.13448972417475485
17 | noun-verb-noun in arg1 0.0
18 | prep right after arg2 0.19212879336967245
19 | prep in arg2 0.16539493294341892
20 | arg2 contains infinitive -0.0
21 | prep mismatch in pattern -0.20092201136389673
22 | sentence is imperative 0.11745202578145564
23 | hyp words in rel -0.1449927441123399
24 | sentence ends with arg2 0.11610654106632967
25 | noun-verb-noun in arg2 0.07217080739835992
26 | rel is contiguous 0.12562188545360878
27 | non-contiguous rel -0.1849662870655201
28 | semantic constraints in pattern -0.4343558913425681
29 | openparse confidence 0.43411514029724824
30 | arg1 bad characters -0.40339032821185783
31 | sentence starts with extraction 0.18854224217974247
32 | arg2 bad characters -0.009939551407472108
33 | rel contains verb 0.4757113580400253
34 | rel starts with be 0.0
35 | prep right before arg1 -0.2350155331052106
36 | sentence has question mark 0.0
37 | arg2 before arg1 -0.35791735399208685
38 | arg2 before rel -0.023882392179128745
39 | rel bad characters -0.11794120943690224
40 |
--------------------------------------------------------------------------------
/core/src/main/resources/edu/knowitall/ollie/prefixWords.txt:
--------------------------------------------------------------------------------
1 | after
2 | although
3 | because
4 | before
5 | but
6 | however
7 | if
8 | once
9 | that
10 | though
11 | when
12 | whenever
13 | whether
14 | where
15 | while
16 | would
--------------------------------------------------------------------------------
/core/src/main/resources/edu/knowitall/openparse/categories/location.txt:
--------------------------------------------------------------------------------
1 | abbacy
2 | abode
3 | abutment
4 | abysm
5 | abyss
6 | acme
7 | addition
8 | address
9 | aerie
10 | aerospace
11 | aery
12 | aim
13 | air
14 | airhead
15 | airspace
16 | airway
17 | ambiance
18 | ambience
19 | anchorage
20 | angle
21 | anomaly
22 | antapex
23 | antinode
24 | antipodes
25 | aperture
26 | apex
27 | aphelion
28 | apoapsis
29 | apogee
30 | apojove
31 | apolune
32 | aposelene
33 | approach
34 | apron
35 | archbishopric
36 | archdeaconry
37 | archdiocese
38 | archduchy
39 | area
40 | arena
41 | arrowhead
42 | ashram
43 | asthenosphere
44 | atelier
45 | atmosphere
46 | axil
47 | axis
48 | azimuth
49 | back
50 | backside
51 | backwater
52 | backwoods
53 | backyard
54 | bailiwick
55 | bakehouse
56 | bakery
57 | bakeshop
58 | barb
59 | barony
60 | barren
61 | barrio
62 | barycenter
63 | base
64 | basin
65 | battlefield
66 | battlefront
67 | battleground
68 | beachhead
69 | beak
70 | bearing
71 | beat
72 | bed
73 | bedground
74 | bedside
75 | beehive
76 | beeline
77 | beginning
78 | belly
79 | bellybutton
80 | belt
81 | bent
82 | benthos
83 | berm
84 | berth
85 | bight
86 | bilge
87 | bilges
88 | bilocation
89 | bindery
90 | biosphere
91 | birthplace
92 | bishopric
93 | bitthead
94 | bivouac
95 | block
96 | boatyard
97 | bookbindery
98 | boondocks
99 | border
100 | borderland
101 | borderline
102 | borough
103 | bottom
104 | bound
105 | boundary
106 | bounds
107 | bourn
108 | bourne
109 | bowels
110 | breach
111 | breadbasket
112 | break
113 | brickfield
114 | brickyard
115 | bridgehead
116 | brink
117 | brokerage
118 | brow
119 | buffer
120 | bull
121 | burg
122 | bush
123 | cabstand
124 | caliphate
125 | cambium
126 | camp
127 | campground
128 | campong
129 | campsite
130 | campus
131 | canthus
132 | canton
133 | cap
134 | capital
135 | capitulum
136 | carrefour
137 | casbah
138 | cavern
139 | cavity
140 | cell
141 | cemetery
142 | center
143 | centerfield
144 | central
145 | centre
146 | centrex
147 | centroid
148 | chap
149 | chapiter
150 | charnel
151 | chasm
152 | checkpoint
153 | chink
154 | chokepoint
155 | chromosphere
156 | churchyard
157 | circle
158 | circuit
159 | circumference
160 | circus
161 | city
162 | clear
163 | clearing
164 | cleft
165 | cloverleaf
166 | coalfield
167 | coastline
168 | col
169 | colliery
170 | colony
171 | columbarium
172 | common
173 | commons
174 | commonwealth
175 | commune
176 | community
177 | compartment
178 | confluence
179 | conurbation
180 | core
181 | corium
182 | corncob
183 | corner
184 | corneum
185 | cornfield
186 | country
187 | countryside
188 | county
189 | course
190 | court
191 | cowtown
192 | crack
193 | cradle
194 | cranny
195 | crawlspace
196 | creamery
197 | crenel
198 | crenelle
199 | crest
200 | crevasse
201 | crevice
202 | crinion
203 | croft
204 | crosscut
205 | crossing
206 | crossroad
207 | crossway
208 | crotch
209 | crown
210 | crud
211 | crust
212 | crypt
213 | cubbyhole
214 | culmination
215 | curtilage
216 | cusp
217 | cuticle
218 | dairy
219 | danger
220 | dark
221 | darkness
222 | dateline
223 | dec
224 | declination
225 | defile
226 | delimitation
227 | demarcation
228 | demesne
229 | den
230 | department
231 | dependency
232 | depth
233 | derivation
234 | derma
235 | dermis
236 | desert
237 | desktop
238 | destination
239 | determinant
240 | development
241 | diamond
242 | diastema
243 | dig
244 | diocese
245 | dip
246 | direction
247 | distance
248 | district
249 | divide
250 | dockside
251 | dockyard
252 | dogleg
253 | domain
254 | domicile
255 | dominion
256 | dooryard
257 | downtown
258 | drop
259 | duchy
260 | dukedom
261 | dump
262 | dumpsite
263 | earldom
264 | earreach
265 | earshot
266 | earth
267 | east
268 | ecliptic
269 | edge
270 | edging
271 | element
272 | emirate
273 | empire
274 | emptiness
275 | empyrean
276 | encampment
277 | enclave
278 | enclosure
279 | end
280 | endpoint
281 | entrepot
282 | environment
283 | environs
284 | eparchy
285 | epicenter
286 | epicentre
287 | epidermis
288 | episcopate
289 | epitope
290 | equator
291 | equinoctial
292 | equinox
293 | exaltation
294 | exarchate
295 | excavation
296 | exchange
297 | exosphere
298 | expanse
299 | exterior
300 | extreme
301 | extremity
302 | extremum
303 | exurbia
304 | eye
305 | eyeshot
306 | eyrie
307 | eyry
308 | face
309 | fairground
310 | fairway
311 | farm
312 | farmland
313 | farmplace
314 | farmstead
315 | fatherland
316 | faubourg
317 | fault
318 | faulting
319 | fiefdom
320 | field
321 | fingertip
322 | finish
323 | firebreak
324 | fireguard
325 | fireside
326 | firmament
327 | fishery
328 | fissure
329 | flies
330 | floor
331 | flowerbed
332 | fluke
333 | flyway
334 | focus
335 | foot
336 | foothold
337 | foramen
338 | forefront
339 | forepart
340 | forge
341 | fork
342 | fountainhead
343 | fracture
344 | fringe
345 | front
346 | frontier
347 | funfair
348 | gaff
349 | gap
350 | garden
351 | gasfield
352 | gasworks
353 | geosphere
354 | ghetto
355 | glade
356 | glassworks
357 | goal
358 | goldfield
359 | gorge
360 | grainfield
361 | grange
362 | grassland
363 | grave
364 | graveyard
365 | green
366 | greenbelt
367 | greenway
368 | gridiron
369 | ground
370 | grounds
371 | grove
372 | gulf
373 | habitat
374 | habitation
375 | hairline
376 | hamlet
377 | hand
378 | hangout
379 | harbor
380 | harborage
381 | harbour
382 | harbourage
383 | hatchery
384 | haunt
385 | haven
386 | hayfield
387 | head
388 | heading
389 | headspring
390 | headwater
391 | hearing
392 | heart
393 | hearth
394 | heartland
395 | heath
396 | heathland
397 | heaven
398 | heavens
399 | heel
400 | heights
401 | heliopause
402 | heliosphere
403 | hell
404 | hellhole
405 | hem
406 | hemisphere
407 | hemline
408 | here
409 | heronry
410 | hiatus
411 | hideaway
412 | hideout
413 | high
414 | hilltop
415 | hilum
416 | hinterland
417 | hip
418 | hipline
419 | hole
420 | hollow
421 | holy
422 | home
423 | homeland
424 | hometown
425 | horizon
426 | horst
427 | hotbed
428 | hotspot
429 | house
430 | hub
431 | hydathode
432 | hydrosphere
433 | imperium
434 | inclination
435 | inferno
436 | infield
437 | innersole
438 | inside
439 | insole
440 | interchange
441 | interface
442 | interior
443 | intersection
444 | ionosphere
445 | ironworks
446 | irredenta
447 | irridenta
448 | isarithm
449 | island
450 | isobar
451 | isochrone
452 | isoclinal
453 | isogone
454 | isogram
455 | isohel
456 | isopleth
457 | isotherm
458 | itinerary
459 | job
460 | junction
461 | jungle
462 | junkyard
463 | jurisdiction
464 | justiciary
465 | juxtaposition
466 | kampong
467 | kasbah
468 | key
469 | khanate
470 | kingdom
471 | knothole
472 | kraal
473 | lab
474 | laboratory
475 | lair
476 | land
477 | landmark
478 | landscape
479 | landscaping
480 | latitude
481 | launderette
482 | laundry
483 | lawn
484 | layer
485 | lea
486 | lead
487 | leak
488 | lee
489 | leeward
490 | left
491 | leftfield
492 | lenticel
493 | ley
494 | lie
495 | light
496 | limb
497 | limit
498 | line
499 | lineation
500 | lithosphere
501 | locale
502 | locality
503 | location
504 | locus
505 | longitude
506 | lookout
507 | lot
508 | loxodrome
509 | luff
510 | lumberyard
511 | mandate
512 | mandatory
513 | mansion
514 | mantle
515 | march
516 | marchland
517 | mare
518 | maria
519 | mastaba
520 | mastabah
521 | masthead
522 | matrix
523 | mausoleum
524 | maximum
525 | meadow
526 | mecca
527 | medina
528 | medium
529 | meeting
530 | megalopolis
531 | meridian
532 | mesosphere
533 | mete
534 | metropolis
535 | micropyle
536 | midair
537 | midden
538 | middle
539 | midfield
540 | midland
541 | midpoint
542 | midst
543 | midstream
544 | midway
545 | minefield
546 | minimum
547 | molding
548 | monument
549 | moorage
550 | mooring
551 | motherland
552 | moulding
553 | mouth
554 | municipality
555 | nadir
556 | nape
557 | navel
558 | necropolis
559 | neighborhood
560 | neighbourhood
561 | nest
562 | nib
563 | nidus
564 | nirvana
565 | node
566 | nombril
567 | nook
568 | north
569 | northeast
570 | northland
571 | northwest
572 | notch
573 | nucha
574 | nucleus
575 | oasis
576 | occident
577 | oilfield
578 | omphalos
579 | omphalus
580 | open
581 | opening
582 | orbit
583 | orchard
584 | orient
585 | origin
586 | orphrey
587 | outback
588 | outdoors
589 | outfield
590 | outline
591 | outport
592 | outpost
593 | outside
594 | outskirt
595 | outskirts
596 | outsole
597 | outstation
598 | overhead
599 | overlook
600 | ozonosphere
601 | paddy
602 | paint
603 | palaestra
604 | palate
605 | palatinate
606 | palestra
607 | pallium
608 | pampas
609 | panhandle
610 | paradise
611 | parallel
612 | parcel
613 | paries
614 | parish
615 | park
616 | parkland
617 | part
618 | parterre
619 | parting
620 | parts
621 | pass
622 | pasture
623 | pastureland
624 | patch
625 | patchboard
626 | pate
627 | path
628 | patisserie
629 | patriarchate
630 | pattern
631 | peak
632 | penetralia
633 | perch
634 | perforation
635 | periapsis
636 | perigee
637 | perigon
638 | perihelion
639 | perijove
640 | perilune
641 | periselene
642 | pesthole
643 | photosphere
644 | piazza
645 | pigeonhole
646 | piggery
647 | pike
648 | pinnacle
649 | pinpoint
650 | piscary
651 | piste
652 | pit
653 | pitch
654 | place
655 | plantation
656 | plate
657 | playground
658 | plaza
659 | pleasance
660 | plot
661 | plugboard
662 | pocket
663 | point
664 | pole
665 | poll
666 | polls
667 | pool
668 | pore
669 | port
670 | position
671 | possession
672 | post
673 | pottery
674 | pouch
675 | prairie
676 | precinct
677 | prefecture
678 | premises
679 | presence
680 | preserve
681 | princedom
682 | principality
683 | property
684 | proprioceptor
685 | protectorate
686 | provenance
687 | provenience
688 | province
689 | proximity
690 | puddle
691 | pueblo
692 | punctum
693 | pupil
694 | purlieu
695 | qibla
696 | quadrant
697 | quarter
698 | radius
699 | railhead
700 | railyard
701 | ranch
702 | range
703 | rathole
704 | reach
705 | realm
706 | rear
707 | rearward
708 | refuge
709 | region
710 | rendezvous
711 | rent
712 | repair
713 | repository
714 | reservation
715 | reserve
716 | residence
717 | resort
718 | retreat
719 | rhumb
720 | rift
721 | right
722 | rightfield
723 | rip
724 | roads
725 | roadside
726 | roadstead
727 | rockery
728 | rooftop
729 | rookery
730 | root
731 | rootage
732 | ropewalk
733 | rotary
734 | rough
735 | round
736 | roundabout
737 | roundhouse
738 | route
739 | sac
740 | sack
741 | saddle
742 | saddleback
743 | saddlery
744 | safety
745 | sanctuary
746 | sanctum
747 | sandlot
748 | savanna
749 | savannah
750 | scenario
751 | scene
752 | scenery
753 | schoolyard
754 | scissure
755 | scour
756 | scrapheap
757 | scrubland
758 | scruff
759 | seafront
760 | seam
761 | seaport
762 | seascape
763 | seat
764 | section
765 | sector
766 | see
767 | seedbed
768 | selvage
769 | selvedge
770 | semidesert
771 | semitropics
772 | separation
773 | sepulcher
774 | sepulchre
775 | sepulture
776 | setting
777 | settlement
778 | shadow
779 | shantytown
780 | sheeprun
781 | sheepwalk
782 | sheet
783 | sheikdom
784 | sheikhdom
785 | shift
786 | shipside
787 | shipyard
788 | shire
789 | shop
790 | shoreline
791 | short
792 | shoulder
793 | showplace
794 | shrubbery
795 | side
796 | sign
797 | silhouette
798 | site
799 | situation
800 | skyline
801 | skyway
802 | slack
803 | slip
804 | slit
805 | slot
806 | slum
807 | smithy
808 | snag
809 | snow
810 | sodom
811 | soil
812 | sole
813 | solitude
814 | somewhere
815 | source
816 | south
817 | southeast
818 | southland
819 | southwest
820 | spa
821 | space
822 | spearhead
823 | spearpoint
824 | sphere
825 | spike
826 | split
827 | spoor
828 | spot
829 | sprawl
830 | spread
831 | spring
832 | square
833 | stage
834 | stand
835 | state
836 | station
837 | steps
838 | stoma
839 | stomate
840 | stop
841 | stopover
842 | stratosphere
843 | stratum
844 | stretch
845 | studio
846 | subdivision
847 | substrate
848 | substratum
849 | subtopia
850 | subtropics
851 | suburb
852 | suburbia
853 | sultanate
854 | summit
855 | superstrate
856 | superstratum
857 | surface
858 | surround
859 | surroundings
860 | suzerainty
861 | swath
862 | switchboard
863 | tack
864 | tannery
865 | tape
866 | target
867 | taxistand
868 | tear
869 | tee
870 | telomere
871 | tendency
872 | tenderloin
873 | terminal
874 | termination
875 | terminus
876 | terrain
877 | terreplein
878 | territory
879 | theater
880 | theatre
881 | there
882 | thermosphere
883 | thick
884 | tiltyard
885 | timberline
886 | tip
887 | tiptoe
888 | tiptop
889 | tomb
890 | tonsure
891 | top
892 | topiary
893 | town
894 | township
895 | track
896 | tract
897 | trail
898 | trailhead
899 | treetop
900 | trend
901 | trichion
902 | tropic
903 | tropics
904 | tropopause
905 | troposphere
906 | trusteeship
907 | turf
908 | turnery
909 | umbilicus
910 | underbelly
911 | underside
912 | undersurface
913 | unknown
914 | upside
915 | uptown
916 | vacancy
917 | vacuity
918 | vacuum
919 | vantage
920 | variation
921 | vault
922 | veld
923 | veldt
924 | vent
925 | venue
926 | verge
927 | vertex
928 | viceroyalty
929 | vicinity
930 | view
931 | viewpoint
932 | village
933 | vinery
934 | vineyard
935 | viscounty
936 | void
937 | volcano
938 | wall
939 | ward
940 | warren
941 | washhouse
942 | waste
943 | wasteland
944 | wasteyard
945 | waterfront
946 | waterline
947 | watermark
948 | watershed
949 | waterworks
950 | wavefront
951 | way
952 | wayside
953 | weald
954 | wedge
955 | welkin
956 | wellhead
957 | wellspring
958 | west
959 | wheatfield
960 | whereabouts
961 | wild
962 | wilderness
963 | window
964 | windward
965 | wing
966 | wire
967 | wold
968 | woodlet
969 | work
970 | workplace
971 | workshop
972 | workspace
973 | yard
974 | yardarm
975 | zenith
976 | zodiac
977 | zone
978 |
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/common/enrich/Traversable.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall
2 | package common
3 | package enrich
4 |
5 | import edu.knowitall.collection.immutable.Bag
6 |
7 | import scalaz._
8 | import Scalaz._
9 | import Monoid._
10 |
11 | /**
12 | * Enrichments for traversables.
13 | *
14 | * @author Michael Schmitz
15 | */
object Traversables {
  /** Enrich any TraversableOnce with `histogram`. */
  implicit def traversableOnceTo[T](as: TraversableOnce[T]): SuperTraversableOnce[T] = new SuperTraversableOnce[T](as)

  /** Enrich collections of (key, count) pairs with `mergeHistograms`. */
  implicit def traversableOncePairIntTo[T](as: TraversableOnce[(T, Int)]): SuperTraversableOncePairInt[T] = new SuperTraversableOncePairInt[T](as)

  /** Enrich collections of pairs with `mergeKeys` and the multimap conversions. */
  implicit def traversableOncePairTo[T, U](as: TraversableOnce[(T, U)]): SuperTraversableOncePair[T, U] = new SuperTraversableOncePair[T, U](as)
}
23 |
sealed class SuperTraversableOnce[T](value: TraversableOnce[T]) {
  /** Count the occurrences of each distinct element. */
  def histogram: Map[T, Int] = {
    var counts = Map[T, Int]()
    for (elem <- value) {
      counts += elem -> (counts.getOrElse(elem, 0) + 1)
    }
    counts
  }
}
31 |
sealed class SuperTraversableOncePairInt[T](value: TraversableOnce[(T, Int)]) {
  import Traversables._
  /** Merge histograms by summing the counts of matching keys. */
  def mergeHistograms: Map[T, Int] = value.mergeKeys((x: Int, y: Int) => x + y)
}
36 |
/** Enrichment methods for collections of key/value pairs. */
sealed class SuperTraversableOncePair[T, U](value: TraversableOnce[(T, U)]) {
  /** Combine the values of equal keys using the implicit Semigroup. */
  def mergeKeys(implicit mon: Semigroup[U]): Map[T, U] = {
    value.foldLeft(Map[T, U]()) {
      case (map, (k, v)) =>
        map + (k -> (map.get(k).map(_ |+| v).getOrElse(v)))
    }
  }

  /** Lift each value into F and combine the values of equal keys using the
    * implicit Monoid.
    *
    * FIX(review): the original body bound `pure` to `monoid.zero` and never
    * used `v`, so every value was discarded and each key mapped to a fold of
    * identity elements.  The value must be lifted into F (via scalaz Pure)
    * before it can be combined.  The added implicit is resolved
    * automatically at call sites.
    */
  def mergeKeys[F[_]](implicit monoid: Monoid[F[U]], pure: Pure[F]): Map[T, F[U]] = {
    value.foldLeft(Map[T, F[U]]()) {
      case (map, (k, v)) =>
        val lifted = v.pure[F]
        map + (k -> (map.get(k).map(_ |+| lifted).getOrElse(lifted)))
    }
  }

  /** Combine the values of equal keys using an explicit merge function. */
  def mergeKeys(merge: (U, U) => U): Map[T, U] = {
    value.foldLeft(Map[T, U]()) {
      case (map, (k, v)) =>
        map + (k -> map.get(k).map(merge(_, v)).getOrElse(v))
    }
  }

  /** Group values by key into lists.
    * Values appear in reverse encounter order because they are prepended. */
  def toListMultimap: Map[T, List[U]] = {
    value.foldLeft(Map[T, List[U]]().withDefaultValue(List.empty[U])) {
      case (map, (k, v)) =>
        map + (k -> (v :: map(k)))
    }
  }

  /** Group values by key into sets. */
  def toSetMultimap: Map[T, Set[U]] = {
    value.foldLeft(Map[T, Set[U]]().withDefaultValue(Set.empty[U])) {
      case (map, (k, v)) =>
        map + (k -> (map(k) + v))
    }
  }

  /** Group values by key into bags (multisets), preserving duplicates. */
  def toBagMultimap: Map[T, Bag[U]] = {
    value.foldLeft(Map[T, Bag[U]]().withDefaultValue(Bag.empty[U])) {
      case (map, (k, v)) =>
        val bag = map(k)
        map + (k -> (bag + v))
    }
  }
}
82 |
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/ollie/DependencyGraphExtras.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.ollie
2 |
3 | import edu.knowitall.tool.parse.graph.DependencyGraph
4 | import edu.knowitall.tool.parse.graph.Dependency
5 | import edu.knowitall.tool.parse.graph.DependencyNode
6 | import edu.knowitall.collection.immutable.Interval
7 | import edu.knowitall.tool.tokenize.Tokenizer
8 | import edu.knowitall.collection.immutable.graph.Graph
9 |
/** Enrichments for DependencyGraph that derive new graphs with the
 * grammatical voice of a clause switched (active <-> passive).
 */
class DependencyGraphExtras(dgraph: DependencyGraph) {
  private def graph = dgraph.graph

  /** Rewrite active-voice constructions as passive voice.
   *
   * For each verb with both an nsubj and a dobj, build a new graph where
   * the arguments are swapped and "was"/"by" nodes are inserted, e.g.
   * "Michael ate the pizza" -> "the pizza was eaten by Michael".
   */
  def passiveVoice: Iterable[DependencyGraph] = {
    require(dgraph.nodes forall (_.indices.length == 1))

    // look for active constructions: a verb with both nsubj and dobj
    val activeVoices = this.graph.vertices.filter { v =>
      (v.postag startsWith "VB") &&
      (dgraph.dependencies exists {edge => edge.label == "nsubj" && edge.source == v}) &&
      (dgraph.dependencies exists {edge => edge.label == "dobj" && edge.source == v})
    }

    activeVoices map { v =>
      // both edges are guaranteed by the filter above
      val nsubj = dgraph.dependencies.find(edge => edge.label == "nsubj" && edge.source == v).get
      val dobj = dgraph.dependencies.find(edge => edge.label == "dobj" && edge.source == v).get
      val nsubjInterval = Interval.span(dgraph.graph.inferiors(nsubj.dest).map(_.indices))
      val dobjInterval = Interval.span(dgraph.graph.inferiors(dobj.dest).map(_.indices))

      val nsubjpass = new Dependency(v, dobj.dest, "nsubjpass")

      // synthesize "by" and "was" nodes for the passive formulation
      val by = new DependencyNode("by", "IN", dobjInterval.start, -1)
      val prep = new Dependency(v, by, "prep")
      val pobj = new Dependency(by, nsubj.dest, "pobj")
      val was = new DependencyNode("was", "VBD", v.indices.start, -1)
      val auxpass = new Dependency(nsubj.source, was, "auxpass")

      // adjust the edges
      var edges: Iterable[Dependency] = dgraph.dependencies
      edges = edges.toSet - nsubj - dobj + prep + pobj + auxpass + nsubjpass
      // adjust for the "by" node: shift every node at or after each
      // inserted node by one token position
      def nodeMap = { (v: DependencyNode) =>
        var interval = v.indices
        if (v.indices.start >= by.indices.start && v != by) interval = DependencyGraphExtras.shift(interval, 1)
        if (v.indices.start >= was.indices.start && v != was) interval = DependencyGraphExtras.shift(interval, 1)
        new DependencyNode(v.text, v.postag, interval, v.offset)
      }
      edges = edges.map { e => e mapNodes nodeMap }

      edges = DependencyGraphExtras.swapOrders(edges, graph.inferiors(nsubj.dest) map nodeMap, graph.inferiors(dobj.dest) map nodeMap)

      // create the new graph
      val newGraph = new DependencyGraph(edges.flatMap(_.vertices), edges)
      val text = newGraph.nodes.iterator.map(_.text).mkString(" ")

      // compute the correct offsets
      val offsets = Tokenizer.computeOffsets(newGraph.nodes.iterator.map(_.text).toList, text)
      val nodeOffsetTransformation =
        ((newGraph.graph.vertices.iterator zip offsets.iterator) map {case (node, token) => node -> new DependencyNode(node.text, node.postag, node.indices, token.offset)}).toMap

      newGraph map nodeOffsetTransformation
    }
  }

  /** Rewrite passive-voice constructions as active voice. */
  def activeVoice: Iterable[DependencyGraph] = {
    require(dgraph.nodes forall (_.indices.length == 1))

    // look for passive constructions: a verb with nsubjpass, auxpass, and
    // a "by" preposition that governs a pobj.
    //
    // FIX(review): the original predicate computed this condition but
    // discarded its value (statement position) and only tested for the
    // existence of any "prep" edge.  Vertices passing that weaker test
    // caused the .get calls in the map below to throw when
    // nsubjpass/auxpass were absent.  This filter now guarantees exactly
    // the edges the map body retrieves.
    val passiveVoices = this.graph.vertices.filter { v =>
      (v.postag startsWith "VB") &&
      (dgraph.dependencies exists (edge => edge.label == "nsubjpass" && edge.source == v)) &&
      (dgraph.dependencies exists (edge => edge.label == "auxpass" && edge.source == v)) &&
      (dgraph.dependencies exists (prep => prep.label == "prep" && prep.source == v && prep.dest.text == "by" &&
        dgraph.dependencies.exists(e => e.source == prep.dest && e.label == "pobj")))
    }

    passiveVoices map { v =>
      // all edges are guaranteed by the filter above
      val nsubjpass = dgraph.dependencies.find(edge => edge.label == "nsubjpass" && edge.source == v).get
      val prep = dgraph.dependencies.find(edge => edge.label == "prep" && edge.source == v && edge.dest.text == "by" && dgraph.dependencies.exists(e => e.source == edge.dest && e.label == "pobj")).get
      val pobj = dgraph.dependencies.find(edge => edge.label == "pobj" && edge.source == prep.dest).get
      val auxpass = dgraph.dependencies.find(edge => edge.label == "auxpass" && edge.source == v).get

      val nsubj = new Dependency(v, pobj.dest, "nsubj")
      val dobj = new Dependency(v, nsubjpass.dest, "dobj")

      // adjust the edges
      var edges: Iterable[Dependency] = dgraph.dependencies
      edges = edges.toSet - nsubjpass - auxpass - prep - pobj + nsubj + dobj
      edges = DependencyGraphExtras.swapOrders(edges, graph.inferiors(nsubjpass.dest), graph.inferiors(pobj.dest))

      // renumber nodes consecutively after dropping "was"/"by"
      val nodes = scala.collection.immutable.SortedSet.empty[DependencyNode] ++ edges.flatMap(_.nodes)
      val nodeMap = nodes.iterator.zipWithIndex.map{case (node, i) => node -> new DependencyNode(node.text, node.postag, Interval.singleton(i), -1)}.toMap
      edges = edges.map(_ mapNodes nodeMap)

      // create the new graph
      val newGraph = new DependencyGraph(edges.flatMap(_.vertices), edges)
      val text = newGraph.nodes.iterator.map(_.text).mkString(" ")

      // compute the correct offsets
      val offsets = Tokenizer.computeOffsets(newGraph.nodes.iterator.map(_.text).toList, text)
      val nodeOffsetTransformation =
        ((newGraph.graph.vertices.iterator zip offsets.iterator) map {case (node, token) => node -> new DependencyNode(node.text, node.postag, node.indices, token.offset)}).toMap

      newGraph map nodeOffsetTransformation
    }
  }

  /** All voice-switched variants of this sentence. */
  def switchVoice: Iterable[DependencyGraph] = {
    passiveVoice ++ activeVoice
  }
}
117 |
object DependencyGraphExtras {
  /** Shift both endpoints of an interval by `by` positions. */
  private def shift(interval: Interval, by: Int) = Interval.open(interval.start + by, interval.end + by)

  /** Exchange the token positions and character offsets of two node sets.
    *
    * NOTE(review): assumes `left` lies entirely before `right` (enforced by
    * the require below); nodes belonging to neither set keep their
    * positions -- confirm both sets are contiguous spans at call sites.
    */
  private def swapOrders(edges: Iterable[Dependency], left: scala.collection.Set[DependencyNode], right: scala.collection.Set[DependencyNode]) = {
    val leftInterval = Interval.span(left.map(_.indices))
    val rightInterval = Interval.span(right.map(_.indices))

    require(leftInterval.end <= rightInterval.start)

    // character offsets bounding the gap between the two spans
    val leftOffset = left.iterator.map(_.offset).max
    val rightOffset = right.iterator.map(_.offset).min

    // distance each node set must move, in tokens and in characters
    val tokensBetween = rightInterval.start - leftInterval.end + 1
    val charsBetween = rightOffset - leftOffset

    edges.map(e => e.mapNodes(v =>
      if (left contains v) new DependencyNode(v.text, v.postag, DependencyGraphExtras.shift(v.indices, tokensBetween), v.offset + charsBetween)
      else if (right contains v) new DependencyNode(v.text, v.postag, DependencyGraphExtras.shift(v.indices, -tokensBetween), v.offset - charsBetween)
      else v))
  }
}
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/ollie/NaryExtraction.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.ollie
2 |
3 | import scala.Option.option2Iterable
4 | import scala.collection.SortedSet
5 | import scala.collection.immutable
6 |
7 | import edu.knowitall.collection.immutable.Interval
8 | import edu.knowitall.openparse.extract.DetailedExtraction
9 | import edu.knowitall.openparse.extract.Extraction
10 | import edu.knowitall.openparse.extract.Extraction.AdverbialModifier
11 | import edu.knowitall.openparse.extract.Extraction.ClausalComponent
12 | import edu.knowitall.tool.parse.graph.DependencyNode
13 | import edu.knowitall.tool.postag.Postagger
14 |
15 | /**
16 | * Represents a part {arg1, rel, arg2} of an extraction.
17 | *
18 | * @param string the representation of the part
19 | * @param interval the interval of the part in the source sentence
20 | */
21 | class ExtractionPart(val string: String, val interval: Interval) extends Ordered[ExtractionPart] {
22 | override def compare(that: ExtractionPart) =
23 | this.interval compare that.interval
24 |
25 | override def toString = string.replaceAll("/", "")
26 | }
27 |
28 | /**
29 | * Represents a possible suffix for an extended extraction.
30 | * For example, in the sentence "He ate from 7 until 10."
31 | * there are two suffixes: "from 7" and "until 10".
32 | *
33 | * @param string the text of the suffix
34 | * @param interval the interval of the suffix in the source sentence
35 | * @param confidence the confidence of the suffix
36 | */
class Suffix(
  text: String,
  nodes: SortedSet[DependencyNode],
  val confidence: Double)
extends Extraction.Part(nodes, text) {
  // render as confidence followed by the quoted part, e.g. 0.9500/"from 7"
  override def toString = ("%1.4f" format confidence) + "/\"" + super.toString + "\""

  /** Annotate the suffix with a type. */
  def annotate(string: String) =
    new AnnotatedSuffix(this, string)
}
48 |
49 | /**
50 | * Represents a possible suffix for an extended extraction
51 | * along with an annotation.
52 | *
53 | * For example, in the sentence "He ate from 7 until 10."
54 | * there are two suffixes: "from 7" and "until 10".
55 | *
56 | * @param string the text of the suffix
57 | * @param interval the interval of the suffix in the source sentence
58 | * @param confidence the confidence of the suffix
59 | * @param annotation an annotation for the suffix
60 | */
class AnnotatedSuffix(
  text: String,
  nodes: SortedSet[DependencyNode],
  confidence: Double,
  val annotation: String)
extends Suffix(text, nodes, confidence) {
  /** Convenience constructor: attach an annotation to an existing suffix. */
  def this(suffix: Suffix, annotation: String) =
    this(suffix.text, suffix.nodes, suffix.confidence, annotation)

  // render as annotation followed by the plain suffix rendering
  override def toString = annotation + "/" + super.toString
}
71 |
72 | /**
73 | * A representaiton of an n-ary extraction, i.e.
74 | *
75 | * (Michael, ran, to the store, on Monday, at 2 PM)
76 | *
77 | * N-ary extractions have multiple secondary arguments (objects)
78 | * and these arguments include the preposition.
79 | *
80 | * @param arg1 the first argument
81 | * @param rel the relation
82 | * @param suffixes the suffixes
83 | * @param clausals a clause restricting this extraction to a context
84 | * @param modifier a modifier for this extraction (i.e. attribution)
85 | *
86 | * @author Michael Schmitz
87 | */
88 | class NaryExtraction(val arg1: Extraction.Part, val rel: Extraction.Part, val suffixes: Seq[Suffix], val attributions: Seq[Attribution] = Seq.empty, val enablers: Seq[EnablingCondition] = Seq.empty) {
89 | override def toString =
90 | "(" + arg1.text + ", " + rel.text + ", " + suffixes.map(_.text).mkString(", ") + ")"
91 | }
92 |
object NaryExtraction {
  /** Suffixes order by their span (position) in the source sentence. */
  implicit object SuffixOrdering extends Ordering[Suffix] {
    def compare(x: Suffix, y: Suffix) = x.span.compare(y.span)
  }

  /**
   * Create extended extractions from a collection of extractions
   * from the same sentence.
   */
  def from(extrs: Iterable[(Double, OllieExtractionInstance)]): Iterable[NaryExtraction] = {
    // keep extractions that end with a one-word preposition
    val prepositionEnding = extrs.filter {
      case (conf, inst) =>
        // NOTE(review): `1 + inst.extr.rel.text lastIndexOf ' '` parses as
        // `("1" + text).lastIndexOf(' ')` (Int+String concatenation), which
        // equals text.lastIndexOf(' ') + 1 only because "1" is one character
        // long -- fragile but currently correct; confirm before changing.
        Postagger.simplePrepositions(inst.extr.rel.text drop (1 + inst.extr.rel.text lastIndexOf ' '))
    }

    // break off the preposition
    case class BrokenExtraction(rel: String, preposition: String, extr: (Double, OllieExtraction))
    val split: Iterable[BrokenExtraction] = prepositionEnding.map {
      case (conf, inst) =>
        // take the longest known preposition suffix, not just the last word
        val preps = Postagger.prepositions.filter(inst.extr.rel.text endsWith _)
        val longest = preps.maxBy(_.length)
        BrokenExtraction(inst.extr.rel.text.dropRight(longest.length + 1), longest, (conf, inst.extr))
    }

    // group by the arg1 and text
    split groupBy {
      case BrokenExtraction(rel, preposition, (conf, extr)) =>
        (extr.arg1.text, rel)
    } filter (_._2.size > 1) map {
      // only groups with multiple extractions become n-ary extractions
      case ((arg1, rel), extrs) =>
        // each member contributes a "<prep> <arg2>" suffix
        val suffixes: immutable.SortedSet[Suffix] = extrs.map {
          case BrokenExtraction(rel, prep, (conf, extr)) =>
            new Suffix(prep + " " + extr.arg2.text, extr.arg2.nodes, conf)
        }(scala.collection.breakOut)

        val first = extrs.head.extr._2
        val argument1 = new Extraction.Part(first.arg1.nodes, arg1)
        val relation = new Extraction.Part(first.rel.nodes, rel)

        // deduplicate contexts collected across the group
        val attributions = extrs.flatMap(_.extr._2.attribution).toSet.toSeq
        val enablers = extrs.flatMap(_.extr._2.enabler).toSet.toSeq

        new NaryExtraction(argument1, relation, suffixes.toSeq, enablers = enablers, attributions = attributions)
    }
  }
}
140 |
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/ollie/Ollie.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.ollie
2 |
3 | import scala.io.Source
4 | import edu.knowitall.collection.immutable.Interval
5 | import edu.knowitall.common.Resource.using
6 | import edu.knowitall.openparse.OpenParse
7 | import edu.knowitall.openparse.extract.DetailedExtraction
8 | import edu.knowitall.tool.parse.graph.DependencyGraph
9 | import edu.knowitall.tool.stem.MorphaStemmer
10 | import edu.knowitall.tool.stem.Stemmer
11 |
/** Ollie is an Open Information Extractor that produces binary extractions
  * with context. The constructor takes an OpenParse instance. Ollie extends
  * OpenParse's extractions with enabling conditions and attributions. There
  * is also a trained confidence function for OllieExtractions.
  *
  * @author Michael Schmitz
  */
class Ollie(val openparse: OpenParse) {
  val stemmer = new MorphaStemmer

  /** Construct with the default model. */
  def this() = this(OpenParse.withDefaultModel(OpenParse.Configuration(confidenceThreshold = 0.005)))

  /** Alias for extract. */
  def apply(dgraph: DependencyGraph): Iterable[OllieExtractionInstance] =
    extract(dgraph)

  /**
   * Primary method for getting extractions.
   *
   * Runs OpenParse over the graph, then wraps each extraction with its
   * enabling condition and attribution (when present).
   */
  def extract(dgraph: DependencyGraph): Iterable[OllieExtractionInstance] = {
    val openparseExtrs = openparse.extract(dgraph)

    for {
      (conf, extr) <- openparseExtrs
      enabler = enablingAdverbialClauseHelper(extr)
      attribution = attribClausalComponentHelper(extr)
    } yield new OllieExtractionInstance(
      new OllieExtraction(extr.arg1, extr.rel, extr.arg2, conf, enabler, attribution), dgraph, extr.extractor)
  }

  /** Identify enabling condition, i.e. "if it's raining..." */
  private def enablingAdverbialClauseHelper(extr: DetailedExtraction): Option[EnablingCondition] = {
    extr.modifier map { modifier =>
      // the first node is the marker word ("if"/"when"); the rest is the phrase
      val prefix = modifier.contents.nodes.head.text
      val phrase = modifier.contents.nodes.iterator.drop(1).map(_.text).mkString(" ")

      new EnablingCondition(prefix, phrase, modifier.contents.span)
    }
  }

  /** Identify attributions from clausal components, i.e. "He said..." */
  private def attribClausalComponentHelper(extr: DetailedExtraction): Option[Attribution] = {
    extr.clausal flatMap { clausal =>
      // find the first verb in the clausal rel
      clausal.rel.nodes.find(_.postag.startsWith("VB")).flatMap { node =>
        val normalized = stemmer.stem(node.text.toLowerCase())
        // attributions require a verb of communication ("say") or cognition ("think")
        if (Ollie.communicationWords.contains(normalized) || Ollie.cognitiveWords.contains(normalized)) {
          // FIX(review): removed two interval computations
          // (clausalArgInterval/clausalRelInterval) that were computed and
          // never used in the original -- dead code.
          Some(new Attribution(
            clausal.arg.text,
            clausal.arg.span,
            clausal.rel.text,
            clausal.rel.span))
        } else None
      }
    }
  }
}
71 |
object Ollie {
  implicit def stemmer: Stemmer = MorphaStemmer

  /** Load a word set from a classpath resource, one word per line. */
  private def loadWordSet(name: String): Set[String] =
    using(Source.fromInputStream(classOf[Ollie].getResource(name).openStream())) { source =>
      source.getLines.toSet
    }

  /** A collection of verbs used for communication, i.e. "said" */
  val communicationWords = loadWordSet("communicationWords.txt")

  /** A collection of verbs used for beliefs, i.e. "think" */
  val cognitiveWords = loadWordSet("cognitiveWords.txt")

  /** A collection of prefixes used for enabling conditions, i.e. "if" and "when" */
  val enablerPrefixes = loadWordSet("prefixWords.txt")
}
90 |
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/ollie/OllieExtraction.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.ollie
2 |
3 | import scala.Option.option2Iterable
4 | import scala.collection.breakOut
5 |
6 | import edu.knowitall.collection.immutable.Interval
7 | import edu.knowitall.common.HashCodeHelper
8 | import edu.knowitall.openparse.extract.Extraction.Part
9 | import edu.knowitall.tool.parse.graph.DependencyNode
10 |
11 | /** A base representation for additional context around an extraction. */
sealed abstract class Context {
  /** The display text of this context. */
  def text: String
  /** The token interval of this context in the source sentence. */
  def interval: Interval
}
16 |
17 | /** A representation for an enabling condition.
18 | * An example of an enabling condition is "if it's raining".
19 | */
case class EnablingCondition(
  /** The enabling condition word, i.e. "if" */
  val prefix: String,
  /** The rest of the enabling condition, i.e. "it's raining" */
  val phrase: String,
  /** The token interval of the enabling condition */
  override val interval: Interval) extends Context {
  override def text = prefix + " " + phrase

  // NOTE(review): fields are escaped ("_" -> "_UNSC_") and then joined with
  // "_", but the companion's deserialize splits on "_" and never unescapes,
  // so values containing underscores do not round-trip -- confirm before
  // relying on this format.
  def serialize: String = Seq(prefix, phrase, interval.start.toString, interval.last.toString).map(_.replaceAll("_", "_UNSC_")).mkString("_")
}
31 |
object EnablingCondition {
  /** Inverse of EnablingCondition.serialize.
    *
    * NOTE(review): splits on "_" without restoring the "_UNSC_" escapes
    * written by serialize; inputs whose fields contained underscores
    * produce extra tokens and fail the match below.
    */
  def deserialize(string: String) = {
    val Array(prefix, phrase, intervalStart, intervalLast) = try (string.split("_"))
    catch {
      case e => throw new RuntimeException("could not deserialize EnablingCondition: " + string, e);
    }
    new EnablingCondition(prefix, phrase, Interval.closed(intervalStart.toInt, intervalLast.toInt))
  }
}
41 |
/** A representation for an attribution.
 * An example of an attribution is "Obama believes".
 */
case class Attribution(
  /** The argument of the attribution, i.e. "Obama" */
  val arg: String,
  /** The token interval of the argument of the attribution */
  val argInterval: Interval,
  /** The relation of the attribution, i.e. "believes" */
  val rel: String,
  /** The token interval of the relation of the attribution */
  override val interval: Interval) extends Context {
  override def text = arg + " " + rel

  // NOTE(review): like EnablingCondition, fields are escaped
  // ("_" -> "_UNSC_") and joined with "_", but deserialize never unescapes,
  // so values containing underscores do not round-trip.
  def serialize: String = {
    val fields = Seq(arg, rel, argInterval.start.toString, argInterval.last.toString, interval.start.toString, interval.last.toString)
    fields.map(_.replaceAll("_", "_UNSC_")).mkString("_")
  }
}
61 |
object Attribution {
  /** Inverse of Attribution.serialize.
    *
    * NOTE(review): splits on "_" without restoring the "_UNSC_" escapes,
    * so fields containing underscores fail the match below.
    */
  def deserialize(string: String) = {
    val Array(arg, rel, argIntervalStart, argIntervalLast, relIntervalStart, relIntervalLast) = try (string.split("_"))
    catch {
      case e => throw new RuntimeException("could not deserialize Attribution: " + string, e);
    }
    val argInterval = Interval.closed(argIntervalStart.toInt, argIntervalLast.toInt)
    val relInterval = Interval.closed(relIntervalStart.toInt, relIntervalLast.toInt)

    new Attribution(arg, argInterval, rel, relInterval)
  }
}
74 |
/** A representation of an Ollie extraction, i.e. we could get the following
  * extraction from the example sentence.
  *
  * {{{
  * When I'm dreaming David Bowie sings that Ziggy sucked up into his mind.
  * (Ziggy, sucked up, into his mind)[attribution = "David Bowie"]
  * }}}
  */
class OllieExtraction(
  /** The first argument (subject) of the extraction, i.e. "Ziggy" */
  val arg1: Part,
  /** The relation of the extraction, i.e. "sucked up" */
  val rel: Part,
  /** The second argument (object) of the extraction, i.e. "into his mind" */
  val arg2: Part,
  /** The confidence value from OpenParse. */
  private[ollie] val openparseConfidence: Double,
  /** The enabling condition, if any. I.e. "When I'm dreaming" */
  val enabler: Option[EnablingCondition],
  /** The attribution, if any. I.e. "David Bowie sings that" */
  val attribution: Option[Attribution]) {

  import OllieExtraction.{serializePart, deserializePart}

  override def equals(that: Any) = that match {
    case that: OllieExtraction =>
      this.arg1 == that.arg1 &&
        this.rel == that.rel &&
        this.arg2 == that.arg2 &&
        this.enabler == that.enabler &&
        this.attribution == that.attribution &&
        this.openparseConfidence == that.openparseConfidence
    case _ => false
  }

  override def hashCode = HashCodeHelper(
    this.arg1,
    this.rel,
    this.arg2,
    this.enabler,
    this.attribution,
    this.openparseConfidence)

  /** Serialize to a tab-delimited string; see OllieExtraction.tabDeserialize. */
  def tabSerialize: String = {
    // missing contexts are written as the sentinel "None"
    val enablerString = enabler.map(_.serialize).getOrElse("None")
    val attrString = attribution.map(_.serialize).getOrElse("None")

    val fieldStrings = Seq(arg1, rel, arg2).map(serializePart) ++ Seq("%.05f".format(openparseConfidence), enablerString, attrString)
    fieldStrings.map(_.replaceAll("\t", "_TAB_")).mkString("\t")
  }

  /** The full text of this extraction. */
  def text = Seq(arg1.text, rel.text, arg2.text).mkString(" ")

  /** All the nodes in this extraction. */
  def nodes = arg1.nodes ++ rel.nodes ++ arg2.nodes

  /** The spanning interval of the nodes in this extraction. */
  def span = Interval.span(nodes.map(_.indices))

  override def toString = {
    val annotations = Seq(
      enabler.map("enabler=" + _.text),
      attribution.map("attrib=" + _.text)).flatten
    val suffix = if (annotations.isEmpty) "" else annotations.mkString("[", ";", "]")
    "(%s; %s; %s)".format(arg1.text, rel.text, arg2.text) + suffix
  }
}
151 |
object OllieExtraction {
  /** Header row matching the field order produced by tabSerialize. */
  def tabDelimitedColumns = Seq("Arg1Part", "RelPart", "Arg2Part", "Confidence", "Enabler", "Attribution").mkString("\t")

  /** Consume the first six fields as one extraction.
    * @return the extraction and the unconsumed fields */
  def tabDeserialize(array: Seq[String]): (OllieExtraction, Seq[String]) = {
    array match {
      case Seq(arg1Part, relPart, arg2Part, openparseConfString, enablerString, attrString, rest @ _*) => {
        val parts = Seq(arg1Part, relPart, arg2Part) map deserializePart
        // "None" is the sentinel tabSerialize writes for a missing context
        val enabler = if (enablerString.equals("None")) None else Some(EnablingCondition.deserialize(enablerString))
        val attribution = if (attrString.equals("None")) None else Some(Attribution.deserialize(attrString))
        val extr = new OllieExtraction(parts(0), parts(1), parts(2), openparseConfString.toDouble, enabler, attribution)
        (extr, rest)
      }
    }
  }

  /** Deserialize a complete tab-delimited line; all fields must be consumed. */
  def tabDeserialize(s: String): OllieExtraction = {
    val (extr, rest) = tabDeserialize(s.split("\t"))
    require(rest.isEmpty)
    extr
  }

  /** Serialize a part as its text and its nodes, separated by " ;;; ". */
  def serializePart(part: Part): String = {
    val serializedNodes = part.nodes.iterator.map(_.serialize).mkString("; ")
    Iterable(part.text, serializedNodes).mkString(" ;;; ")
  }

  /** Inverse of serializePart. */
  def deserializePart(string: String): Part = {
    val Array(partText, partNodes) = try (string.split("\\s*;;;\\s*"))
    catch {
      case e => throw new RuntimeException("could not deserialize Extraction.Part: " + string, e);
    }

    val nodesSortedSet: scala.collection.SortedSet[DependencyNode] =
      try (partNodes.split("\\s*;\\s*").map(DependencyNode.deserialize(_))(breakOut))
      catch {
        case e => throw new RuntimeException("could not deserialize Extraction.Part: " + string, e);
      }

    new Part(nodesSortedSet, partText)
  }
}
193 |
194 |
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/ollie/OllieExtractionInstance.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.ollie
2 |
3 | import edu.knowitall.common.HashCodeHelper
4 | import edu.knowitall.openparse.extract.PatternExtractor
5 | import edu.knowitall.tool.parse.graph.DependencyGraph
6 | import scala.util.matching.Regex
7 |
8 | /** OllieExtractionInstance represents an extraction coupled with
9 | * its source sentence.
10 | */
11 | class OllieExtractionInstance(
12 | /** The associated extraction. */
13 | val extr: OllieExtraction,
14 | /** The associated sentence. */
15 | val sent: DependencyGraph,
16 | /** The extractor used. */
17 | val pat: PatternExtractor) {
18 |
19 | override def equals(that: Any) = that match {
20 | case that: OllieExtractionInstance => this.extr == that.extr && this.sent == that.sent
21 | case _ => false
22 | }
23 | override def hashCode = HashCodeHelper(extr, sent)
24 |
25 | def extraction = extr
26 | def sentence = sent
27 | def pattern = pat
28 |
29 | private val passivePatternRegex = new Regex("""^\{arg1:?\w*\} dobj> \{arg2:?\w*\}""")
40 | /** Report if this extraction is an active construction.
41 | * This is a crude measure so false should not be taken to mean
42 | * that it is not active.
43 | *
44 | * An extraction is active if it has a valid passive formulation
45 | * by swapping the arguments and modifying the relation (adding "be"
46 | * and "by").
47 | */
48 | def active: Boolean =
49 | activePatternRegex.pattern.matcher(pat.pattern.serialize).matches()
50 |
51 | def tabSerialize: String = {
52 | val serializedGraph = sent.serialize
53 | val serializedExtr = extr.tabSerialize
54 | Seq(serializedGraph, pat.tabSerialize, serializedExtr).mkString("\t")
55 | }
56 | }
57 |
object OllieExtractionInstance {
  /** Deserialize a complete tab-delimited line; all fields must be consumed. */
  def tabDeserialize(string: String): OllieExtractionInstance = {
    val array = string.split('\t')

    val (extr, rest) = tabDeserialize(array)
    require(rest.isEmpty)

    extr
  }

  /** Consume fields for one instance.
    * @return the instance and the unconsumed fields */
  def tabDeserialize(array: Seq[String]): (OllieExtractionInstance, Seq[String]) = {
    try {
      // field order: dependency graph, then extractor, then extraction
      val Seq(serializedGraph, r0 @ _*) = array

      val graph = DependencyGraph.deserialize(serializedGraph)
      val (pat, r1) = PatternExtractor.tabDeserialize(r0)
      val (extr, r2) = OllieExtraction.tabDeserialize(r1)

      (new OllieExtractionInstance(extr, graph, pat), r2)
    } catch {
      case e => throw new IllegalArgumentException("Could not tab deserialize: " + array.mkString("\t"), e)
    }
  }

  val numFinder = "[0-9]+".r
}
84 |
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/ollie/ScoredOllieExtractionInstance.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.ollie
2 |
3 | import edu.knowitall.tool.conf.Labelled
4 |
/** ScoredOllieExtractionInstance represents a boolean score coupled with
 * an extraction instance.
 *
 * @param score the label for this extraction
 * @param inst the extraction instance labelled
 */
class ScoredOllieExtractionInstance(
  val score: Boolean,
  val inst: OllieExtractionInstance) extends Labelled[OllieExtractionInstance](score, inst) {

  override def toString = score + ":" + inst.extr

  /** Serialize as: label (1/0), readable extraction, then the instance fields. */
  def tabSerialize: String = {
    val label = if (score) 1 else 0
    Seq(label, inst.extr.toString, inst.tabSerialize).mkString("\t")
  }
}
21 |
object ScoredOllieExtractionInstance {
  /** Inverse of tabSerialize. */
  def tabDeserialize(string: String): ScoredOllieExtractionInstance = {
    try {
      // second column is the human-readable extraction; it is ignored here
      val Array(scoreString, _, rest @ _*) = string.split('\t')

      // only "1" (true) and "0" (false) are valid labels
      val score =
        if (scoreString == "1") true
        else if (scoreString == "0") false
        else throw new IllegalArgumentException("bad score: " + scoreString)
      val (inst, r2) = OllieExtractionInstance.tabDeserialize(rest)

      require(r2.isEmpty)

      new ScoredOllieExtractionInstance(score, inst)
    } catch {
      case e => throw new IllegalArgumentException("could not tab deserialize: " + string, e)
    }
  }

  val numFinder = "[0-9]+".r
}
42 | }
43 |
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/ollie/confidence/OllieConfidenceFunction.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.ollie.confidence
2 |
3 | import java.io.InputStream
4 | import java.net.URL
5 | import java.util.Scanner
6 |
7 | import scala.collection.mutable
8 |
9 | import org.slf4j.LoggerFactory
10 |
11 | import edu.knowitall.common.Resource.using
12 | import edu.knowitall.ollie.OllieExtractionInstance
13 | import edu.knowitall.tool.conf.FeatureSet
14 | import edu.knowitall.tool.conf.impl.LogisticRegression
15 |
16 | /** An implementation of logistic regression of features that can be
17 | * represented as a double. */
18 |
/** Loads the logistic-regression confidence function for Ollie
  * extraction instances. */
object OllieConfidenceFunction {
  /** The confidence function is logistic regression over extraction instances. */
  type OllieIndependentConfFunction = LogisticRegression[OllieExtractionInstance]

  val logger = LoggerFactory.getLogger(classOf[OllieIndependentConfFunction])

  /** URL of the bundled default model; fails fast if the resource is missing. */
  val defaultModelUrl = this.getClass.getResource("default-classifier.txt") match {
    case null => throw new IllegalArgumentException("Could not load confidence function resource.")
    case url => url
  }

  /** Load the bundled default classifier with the standard feature set. */
  def loadDefaultClassifier(): OllieIndependentConfFunction =
    fromUrl(OllieFeatureSet, defaultModelUrl)

  /** Read a logistic-regression model from the given URL. */
  def fromUrl(featureSet: FeatureSet[OllieExtractionInstance, Double], url: URL): OllieIndependentConfFunction =
    LogisticRegression.fromUrl(featureSet, url)
}
36 |
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/ollie/confidence/OllieFeatureEvaluation.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.ollie.confidence
2 |
3 | import java.io.File
4 | import java.io.PrintWriter
5 | import java.net.URL
6 |
7 | import scala.io.Source
8 |
9 | import edu.knowitall.common.Analysis
10 | import edu.knowitall.common.Resource.using
11 | import edu.knowitall.ollie.ScoredOllieExtractionInstance
12 | import scopt.OptionParser
13 |
/** Command-line tool that scores labelled extractions with a confidence
  * model and emits a tab-separated report (precision/yield columns plus
  * the raw feature values used by the classifier) for offline evaluation. */
object OllieFeatureEvaluation {
  /** Settings for OpenParse. */
  abstract class Settings {
    /** source file of scored extractions */
    def inputFile: File

    /** file to output; None means stdout */
    def outputFile: Option[File]

    /** confidence model url */
    def confidenceModelUrl: URL
  }

  /** Parse command-line arguments, then run the evaluation. */
  def main(args: Array[String]) = {
    // Mutable settings instance populated by the option parser below.
    var settings = new Settings {
      var inputFile: File = _
      var outputFile: Option[File] = None
      var confidenceModelUrl: URL = OllieConfidenceFunction.defaultModelUrl
    }

    val parser = new OptionParser("feature-eval") {
      opt(Some("c"), "confidence model", "", "confidence model file", { path: String =>
        val file = new File(path)
        require(file.exists, "file does not exist: " + path)
        settings.confidenceModelUrl = file.toURI.toURL
      })

      opt("o", "output", "output file (otherwise stdout)", { path =>
        val file = new File(path)
        settings.outputFile = Some(file)
      })

      arg("input", "input dependencies file", { path: String =>
        val file = new File(path)
        require(file.exists, "input file does not exist: " + path)
        settings.inputFile = file
      })
    }

    if (parser.parse(args)) {
      run(settings)
    }
  }

  /** Score every input line, sort by descending confidence, and write the report. */
  def run(settings: Settings) = {
    val confFunc = OllieConfidenceFunction.fromUrl(OllieFeatureSet, settings.confidenceModelUrl)

    // Deserialize each input line and pair it with its model confidence.
    val extrs = using (Source.fromFile(settings.inputFile)) { source =>
      for (
        line <- source.getLines.toList;
        val scored = ScoredOllieExtractionInstance.tabDeserialize(line);
        val conf = confFunc(scored.inst)
      ) yield (conf, scored)
    }

    // Highest-confidence extractions first.
    val sorted = extrs.sortBy(-_._1).toList

    // Precision/yield curve, seeded with a synthetic (head, 0, 1.0) point.
    // NOTE(review): sorted.head throws on an empty input file -- confirm inputs are non-empty.
    val pyed = (sorted.head, 0, 1.0) +: Analysis.precisionYieldMeta(sorted zip sorted.map(_._2.score))

    // Only report features that actually have a learned weight.
    val featureNames = confFunc.featureSet.featureNames.filter(confFunc.featureWeights.get(_).isDefined).toList.sorted
    using {
      settings.outputFile match {
        case Some(f) => new PrintWriter(f, "UTF8")
        case None => new PrintWriter(System.out)
      }
    } { writer =>
      // Header row: fixed columns followed by one column per feature.
      writer.println((Iterable("score", "conf", "op-conf", "yield", "precision",
        "extr", "enabler", "attrib", "sentence", "dependencies") ++
        featureNames).mkString("\t"))
      // Second row holds the learned weight of each feature, under its column.
      writer.println("\t" * 10 + featureNames.map(confFunc.featureWeights(_).toString).mkString("\t"))
      (pyed) foreach { case ((conf, scored), y, p) =>
        // Recompute each feature value for this instance, in header order.
        val features =
          for (
            featureName <- featureNames;
            val featureValue = confFunc.featureSet(featureName)(scored.inst)
          ) yield featureValue

        writer.println((Iterable(if (scored.score) 1 else 0,
          conf,
          scored.inst.extr.openparseConfidence,
          y,
          p,
          scored.inst.extr.toString,
          scored.inst.extr.enabler.isDefined.toString.toLowerCase,
          scored.inst.extr.attribution.isDefined.toString.toLowerCase,
          scored.inst.sent.text,
          scored.inst.sent.serialize) ++ features).mkString("\t"))
      }
    }
  }
}
105 |
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/ollie/confidence/train/CrossValidateConfidence.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.ollie.confidence.train
2 |
3 | import java.io.File
4 |
5 | import scala.io.Source
6 |
7 | import edu.knowitall.common.Analysis
8 | import edu.knowitall.common.Resource.using
9 | import edu.knowitall.ollie.ScoredOllieExtractionInstance
10 | import edu.knowitall.ollie.confidence.OllieFeatureSet
11 | import edu.knowitall.tool.conf.BreezeLogisticRegressionTrainer
12 | import scopt.mutable.OptionParser
13 |
/** Command-line tool that estimates confidence-function quality by k-fold
  * cross validation: for each fold it trains a logistic-regression
  * classifier on the remaining data, scores the held-out fold, and prints
  * a precision/yield curve and AUC, followed by the average AUC. */
object CrossValidateConfidence {
  def main(args: Array[String]) {
    // Mutable settings populated by the option parser.
    object settings extends Settings {
      var inputFile: File = _
      var outputFile: Option[File] = None
    }

    val parser = new OptionParser("scoreextr") {
      arg("labelled", "labelled extractions", { path: String => settings.inputFile = new File(path) })
      argOpt("output", "output file", { path: String => settings.outputFile = Some(new File(path)) })
    }

    if (parser.parse(args)) {
      run(settings)
    }
  }

  abstract class Settings {
    /** labelled extractions to cross validate over */
    def inputFile: File
    /** not read by run; kept for interface symmetry with TrainOllieConfidence */
    def outputFile: Option[File]

    /** number of cross-validation folds */
    val splits = 10
  }


  def run(settings: Settings) = {
    val trainer = new BreezeLogisticRegressionTrainer(OllieFeatureSet)

    // Load and deserialize all labelled examples.
    val data =
      using (Source.fromFile(settings.inputFile)) { source =>
        (source.getLines map (ScoredOllieExtractionInstance.tabDeserialize)).toList
      }

    // Partition the data into equally-sized folds; withPartial(false)
    // drops a trailing partial fold.
    val splits = data.iterator.sliding(data.size / settings.splits, data.size / settings.splits).withPartial(false)
    val results = for {
      split <- splits.toList

      val test = split
      val training = data filterNot (test contains _)

      val classifier = trainer.train(training)
    } yield {
      for (example <- test) yield {
        val conf = classifier.apply(example.inst)
        // Correct when the 0.5-thresholded prediction matches the label.
        val correct =
          if (conf >= 0.5 && example.score) true
          else if (conf < 0.5 && !example.score) true
          else false
        (conf, correct)
      }
    }

    // Precision/yield curve per fold, ordered by descending confidence.
    val pys = results.map { list =>
      val py = Analysis.precisionYield(list.sortBy(-_._1).map(_._2))

      py
    }

    val aucs = pys.zipWithIndex map { case (py, i) =>
      println("Split " + i)
      py foreach { case (y, p) =>
        println(Iterable(y.toString, "%1.4f" format p).mkString("\t"))
      }

      val auc = Analysis.areaUnderCurve(py)
      println("auc: " + auc)

      println()
      auc
    }

    // val, not var: the average AUC is computed once and never reassigned.
    val auc = breeze.linalg.mean(aucs)
    println("avg auc: " + auc)
  }
}
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/ollie/confidence/train/TrainOllieConfidence.scala:
--------------------------------------------------------------------------------
1 |
2 | package edu.knowitall.ollie.confidence.train
3 |
4 | import java.io.File
5 |
6 | import scala.io.Source
7 |
8 | import edu.knowitall.common.Resource.using
9 | import edu.knowitall.ollie.ScoredOllieExtractionInstance
10 | import edu.knowitall.ollie.confidence.OllieFeatureSet
11 | import edu.knowitall.tool.conf.BreezeLogisticRegressionTrainer
12 | import scopt.mutable.OptionParser
13 |
/** Command-line tool that trains the Ollie confidence classifier from
  * labelled extractions and writes the resulting model to a file or to
  * standard output. */
object TrainOllieConfidence {
  def main(args: Array[String]) {
    object settings extends Settings {
      var inputFile: File = _
      var outputFile: Option[File] = None
    }

    val argParser = new OptionParser("scoreextr") {
      arg("labelled", "labelled extractions", { path: String => settings.inputFile = new File(path) })
      argOpt("output", "output file", { path: String => settings.outputFile = Some(new File(path)) })
    }

    if (argParser.parse(args)) {
      run(settings)
    }
  }

  abstract class Settings {
    /** labelled extractions used as training data */
    def inputFile: File
    /** destination for the trained model; None writes to stdout */
    def outputFile: Option[File]
  }

  def run(settings: Settings) = {
    val trainer = new BreezeLogisticRegressionTrainer(OllieFeatureSet)

    // Read and deserialize every labelled example from the input file.
    val examples =
      using (Source.fromFile(settings.inputFile)) { source =>
        source.getLines.map(ScoredOllieExtractionInstance.tabDeserialize).toList
      }

    // Train, then persist the classifier to the requested destination.
    val classifier = trainer.train(examples)
    settings.outputFile map { file =>
      classifier.saveFile(file)
    } getOrElse {
      classifier.save(System.out)
    }
  }
}
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/ollie/output/BratOutput.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.ollie.output
2 |
3 | import edu.knowitall.ollie.OllieExtractionInstance
4 | import edu.knowitall.openparse.extract.Extraction
5 | import edu.knowitall.collection.immutable.Interval
6 | import edu.knowitall.ollie.ExtractionPart
7 | import edu.knowitall.tool.segment.Segment
8 | import java.io.PrintWriter
9 |
/** Renders Ollie extractions as standoff annotation lines in the style
  * used by the brat annotation tool: one "T" entity line per argument
  * and relation span, and one "R" line linking each argument to its
  * relation.
  *
  * @param extractor function producing extraction instances for a sentence
  */
class BratOutput(extractor: String => Iterable[OllieExtractionInstance]) {
  /** Run the extractor on each segment and write its annotation lines. */
  def process(sentences: Iterable[Segment], writer: PrintWriter) = {
    // a single Document so entity/relation indices are unique across sentences
    val document = new Document()
    for {
      Segment(text, offset) <- sentences
      inst <- extractor(text)
      entry <- document.annotations(inst, offset)
    } {
      writer.println(entry)
    }
  }

  /** Tracks the running entity ("T") and relation ("R") counters for one document. */
  class Document {
    var entityIndex = 0
    var relationIndex = 0

    /** Build the annotation lines for one extraction instance.
      *
      * @param sentenceCharacterOffset character offset of the sentence within the document
      */
    def annotations(inst: OllieExtractionInstance, sentenceCharacterOffset: Int) = {
      // Render one extraction part as "<name> <start> <end>\t<covered text>".
      def partToAnnotation(inst: OllieExtractionInstance, part: Extraction.Part, partName: String) = {
        val tokens = inst.sentence.nodes.toList.slice(part.span.start, part.span.end)
        // NOTE(review): the head token uses `.offset` while the last uses
        // `.offsets.end` -- presumably `.offset` is the start offset; confirm
        // the two accessors agree.
        val charInterval = Interval.open(tokens.head.offset, tokens.last.offsets.end)
        partName + " " + (sentenceCharacterOffset + charInterval.start) + " " + (sentenceCharacterOffset + charInterval.end) + "\t" + inst.sentence.text.substring(charInterval.start, charInterval.end)
      }

      case class LabelledEntry(label: String, entry: String)
      // Prefix the running index with the annotation-type character (T or R).
      def label(identifier: Char, index: Int, entry: String) = LabelledEntry(identifier.toString + index, entry)

      val entries = {
        // one "T" entity per argument...
        val arguments = List(inst.extr.arg1, inst.extr.arg2) map { arg =>
          val labelled = label('T', entityIndex, partToAnnotation(inst, arg, "Argument"))
          entityIndex += 1
          labelled
        }
        // ...and one for the relation itself
        val relation = {
          val labelled = label('T', entityIndex, partToAnnotation(inst, inst.extr.rel, "Relation"))
          entityIndex += 1
          labelled
        }

        val entities = relation :: arguments

        // link each argument to the relation with an "R" annotation
        val relations = arguments zip List("Arg1", "Arg2") map {
          case (entry, edge) =>
            val labelled = label('R', relationIndex, edge + "-of Arg1:" + relation.label + " Arg2:" + entry.label)
            relationIndex += 1
            labelled
        }

        entities ::: relations
      }

      // final standoff line: "<label>\t<entry>"
      entries map {
        case LabelledEntry(label, entry) => label + "\t" + entry
      }
    }
  }
}
66 |
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/AnalyzePatterns.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse
2 |
3 | import java.io.{PrintWriter, File}
4 |
5 | import scala.Option.option2Iterable
6 | import scala.collection.mutable
7 | import scala.io.Source
8 |
9 | import edu.knowitall.collection.immutable.graph.pattern.DirectedEdgeMatcher
10 | import edu.knowitall.common.Resource
11 | import edu.knowitall.tool.parse.graph.{PostagNodeMatcher, LabelEdgeMatcher, DependencyPattern, DependencyGraph}
12 | import edu.knowitall.ollie.Ollie.stemmer
13 |
/** Groups patterned extractions by pattern: first counts every pattern,
  * then writes the tuples, sentences, and dependency strings for each
  * pattern occurring more than 100 times. */
object AnalyzePatterns {
  def main(args: Array[String]) {
    val patternedFilePath = args(0)
    val outputFilePath = args(1)

    println("Counting pattern occurrence...")
    // first pass: tally occurrences of each pattern string
    val patternCounts = mutable.HashMap[String, Int]().withDefaultValue(0)
    Resource.using(Source.fromFile(patternedFilePath, "UTF8")) { source =>
      source.getLines foreach { line =>
        val Array(_, _, _, _, pattern, _, _, _*) = line.split("\t", -1)
        patternCounts(pattern) += 1
      }
    }

    println("Grouping patterns...")
    Resource.using(new PrintWriter(new File(outputFilePath), "UTF8")) { writer =>
      // most frequent patterns first
      val ordered = patternCounts.toList.sortBy(_._2)(Ordering[Int].reverse)
      ordered filter (_._2 > 100) foreach { case (pattern, count) =>
        println(count + ":" + pattern)
        // second pass per kept pattern: emit every line that used it
        Resource.using(Source.fromFile(patternedFilePath, "UTF8")) { source =>
          writer.println(pattern + "\t" + count)
          source.getLines foreach { line =>
            val Array(rel, arg1, arg2, lemmas, p, sentence, deps, _*) = line.split("\t", -1)
            if (p == pattern) {
              writer.println(Iterable(rel, arg1, arg2, lemmas).mkString("\t"))
              writer.println(sentence)
              writer.println(deps)
              writer.println()
            }
          }
        }
      }

      println()
    }
  }
}
51 |
/** Counts the edge labels and postags used across extraction patterns,
  * printing each tally in descending order of frequency. */
object CountPatternComponents {
  def main(args: Array[String]) {
    val patternedFilePath = args(0)

    // label -> occurrence count; postag -> occurrence count
    val edgeCounts = mutable.HashMap[String, Int]().withDefaultValue(0)
    val postagCounts = mutable.HashMap[String, Int]().withDefaultValue(0)
    Resource.using(Source.fromFile(patternedFilePath, "UTF8")) { source =>
      source.getLines foreach { line =>
        val Array(_, _, _, _, pickledPattern, _, _, _*) = line.split("\t", -1)
        val pattern = new ExtractorPattern(DependencyPattern.deserialize(pickledPattern))
        // edge labels live inside directed matchers wrapping a LabelEdgeMatcher
        val labels = pattern.edgeMatchers.toList collect {
          case e: DirectedEdgeMatcher[_] if e.matcher.isInstanceOf[LabelEdgeMatcher] =>
            e.matcher.asInstanceOf[LabelEdgeMatcher].label
        }
        val postags = pattern.baseNodeMatchers.toList collect {
          case m: PostagNodeMatcher => m.postag
        }

        labels foreach { l => edgeCounts(l) += 1 }
        postags foreach { p => postagCounts(p) += 1 }
      }
    }

    println("Postag counts: ")
    postagCounts.toList.sortBy(_._2).reverse foreach { case (k, v) =>
      println(k + "\t" + v)
    }

    println()
    println("Edge counts: ")
    edgeCounts.toList.sortBy(_._2).reverse foreach { case (k, v) =>
      println(k + "\t" + v)
    }
  }
}
94 |
/** Counts edge labels, postags, and postag-label-postag "pieces" over the
  * dependency graphs of patterned sentences, printing each tally in
  * descending order of frequency. */
object CountSentenceComponents {
  def main(args: Array[String]) {
    val patternedFilePath = args(0)

    val edgeCounts = mutable.HashMap[String, Int]().withDefaultValue(0)
    val postagCounts = mutable.HashMap[String, Int]().withDefaultValue(0)
    val pieceCounts = mutable.HashMap[String, Int]().withDefaultValue(0)
    Resource.using(Source.fromFile(patternedFilePath, "UTF8")) { source =>
      source.getLines foreach { line =>
        val Array(_, _, _, _, _, _, pickledGraph, _*) = line.split("\t", -1)
        val graph = DependencyGraph.deserialize(pickledGraph)

        graph.graph.edges.toList.map(_.label) foreach { l =>
          edgeCounts(l) += 1
        }

        graph.graph.vertices.toList.map(_.postag) foreach { postag =>
          postagCounts(postag) += 1
        }

        // count each edge as a postag-label-postag triple, in both directions
        graph.graph.edges foreach { edge =>
          val forward = edge.source.postag + " " + edge.label + " " + edge.dest.postag
          val backward = edge.dest.postag + " " + edge.label + " " + edge.source.postag

          pieceCounts(forward) += 1
          pieceCounts(backward) += 1
        }
      }
    }

    println("Postag counts: ")
    postagCounts.toList.sortBy(_._2).reverse foreach { case (k, v) =>
      println(k + "\t" + v)
    }

    println()
    println("Edge counts: ")
    edgeCounts.toList.sortBy(_._2).reverse foreach { case (k, v) =>
      println(k + "\t" + v)
    }

    println()
    println("Piece counts: ")
    pieceCounts.toList.sortBy(_._2).reverse foreach { case (k, v) =>
      println(k + "\t" + v)
    }
  }
}
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/ExtractorPattern.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse
2 |
3 | import scala.io.Source
4 |
5 | import org.slf4j.LoggerFactory
6 |
7 | import edu.knowitall.collection.immutable.graph.pattern.CaptureNodeMatcher
8 | import edu.knowitall.collection.immutable.graph.pattern.EdgeMatcher
9 | import edu.knowitall.collection.immutable.graph.pattern.Matcher
10 | import edu.knowitall.collection.immutable.graph.pattern.NodeMatcher
11 | import edu.knowitall.collection.immutable.graph.pattern.Pattern
12 | import edu.knowitall.collection.immutable.graph.pattern.TrivialNodeMatcher
13 | import edu.knowitall.ollie.Ollie.stemmer
14 | import edu.knowitall.tool.parse.graph.DependencyNode
15 | import edu.knowitall.tool.parse.graph.DependencyPattern
16 | import edu.knowitall.tool.parse.graph.LabelEdgeMatcher
17 | import edu.knowitall.tool.parse.graph.RegexNodeMatcher
18 | import scalaz._
19 | import scalaz.Scalaz._
20 |
21 | /** A wrapper for a dependency pattern that adds some convenience methods
22 | * for working with patterns intended for extraction of binary relations.
23 | *
24 | * @author Michael Schmitz
25 | */
26 | class ExtractorPattern(matchers: List[Matcher[DependencyNode]]) extends DependencyPattern(matchers) {
27 | val logger = LoggerFactory.getLogger(this.getClass)
28 |
29 | def this(pattern: Pattern[DependencyNode]) = this(pattern.matchers.map { _ match {
30 | case m: ExtractionPartMatcher => m
31 | // lift extractor matchers to a more representitive class
32 | case m: CaptureNodeMatcher[_] => m.alias.take(3) match {
33 | case "arg" => new ArgumentMatcher(m.alias, m.matcher)
34 | case "rel" => new RelationMatcher(m.alias, m.matcher)
35 | case "slo" => new SlotMatcher(m.alias, m.matcher)
36 | case _ => throw new IllegalArgumentException("Unknown capture alias: " + m.alias)
37 | }
38 | // keep everything else the same
39 | case m => m
40 | }})
41 |
42 | override def canEqual(that: Any) = that.isInstanceOf[ExtractorPattern]
43 | override def equals(that: Any) = that match {
44 | case that: ExtractorPattern => (that canEqual this) && this.matchers == that.matchers
45 | case _ => false
46 | }
47 |
48 | def semantic: Boolean = matchers.exists {
49 | case m: RelationMatcher => m.baseNodeMatchers exists { case m: RegexNodeMatcher => true case _ => false }
50 | case _ => false
51 | }
52 |
53 | def valid: Boolean = {
54 | def existsEdge(pred: LabelEdgeMatcher=>Boolean) =
55 | this.baseEdgeMatchers.collect {
56 | case e: LabelEdgeMatcher => e
57 | }exists(pred)
58 |
59 | /* check for multiple prep edges */
60 | def multiplePreps = this.baseEdgeMatchers.collect {
61 | case e: LabelEdgeMatcher => e
62 | }.count(_.label.contains("prep")) > 1
63 |
64 | /* check for a conj_and edge */
65 | def conjAnd = existsEdge(_.label == "conj_and")
66 |
67 | /* check for a conj_and edge */
68 | def conjOr = existsEdge(_.label == "conj_or")
69 |
70 | /* eliminate all conj edges */
71 | def conj = existsEdge(_.label startsWith "conj")
72 |
73 | def slotBordersNN = {
74 | import scalaz._
75 | import Scalaz._
76 |
77 | def isNN(m: Matcher[DependencyNode]) = m match {
78 | case e: NodeMatcher[_] =>
79 | e.baseNodeMatchers exists {
80 | case m: LabelEdgeMatcher if m.label == "nn" => true
81 | case _ => false
82 | }
83 | case _ => false
84 | }
85 |
86 | def isSlot(m: Matcher[DependencyNode]) = m match {
87 | case m: SlotMatcher => true
88 | case _ => false
89 | }
90 |
91 | this.matchers.toZipper.map(_.positions.toStream.exists { z =>
92 | def focusedOnNN(z: Option[Zipper[Matcher[DependencyNode]]]) = z.map(z => isNN(z.focus)).getOrElse(false)
93 | isSlot(z.focus) && (focusedOnNN(z.previous) || focusedOnNN(z.next))
94 | }).getOrElse(false)
95 | }
96 |
97 | if (existsEdge(_.label == "dep")) {
98 | logger.debug("invalid: dep edge: " + this.toString)
99 | return false
100 | }
101 |
102 | if (existsEdge(_.label == "dep")) {
103 | logger.debug("invalid: dep edge: " + this.toString)
104 | return false
105 | }
106 |
107 | /* check if ends with slot */
108 | def slotAtEnd = {
109 | def isSlot(node: NodeMatcher[_]) = node match {
110 | case m: CaptureNodeMatcher[_] => m.alias.startsWith("slot")
111 | case _ => false
112 | }
113 |
114 | !this.nodeMatchers.isEmpty && (isSlot(this.nodeMatchers.head) || isSlot(this.nodeMatchers.last))
115 | }
116 |
117 | val length = edgeMatchers.length
118 |
119 | if (length == 2 && multiplePreps) {
120 | logger.debug("invalid: multiple preps: " + this.toString)
121 | false
122 | }
123 | else if (conjAnd) {
124 | logger.debug("invalid: conj_and: " + this.toString)
125 | false
126 | }
127 | else if (conjOr) {
128 | logger.debug("invalid: conj_or: " + this.toString)
129 | false
130 | }
131 | else if (conj) {
132 | logger.debug("invalid: alt conj: " + this.toString)
133 | false
134 | }
135 | else if (slotAtEnd) {
136 | logger.debug("invalid: ends with slot: " + this.toString)
137 | false
138 | }
139 | else if (slotBordersNN) {
140 | logger.debug("invalid: slot borders nn: " + this.toString)
141 | false
142 | }
143 | else {
144 | true
145 | }
146 | }
147 |
148 | /* determine if the pattern is symmetric, such as:
149 | * {arg1} >prep> {rel} compare(m1s, m2s)
155 | // edge matchers should be equals but opposite
156 | case (((m1: EdgeMatcher[_]) :: m1s), ((m2: EdgeMatcher[_]) :: m2s)) => m1 == m2.flip && compare(m1s, m2s)
157 | // edges and other nodes must be equal
158 | case (((m1: Matcher[_]) :: m1s), ((m2: Matcher[_]) :: m2s)) => m1 == m2 && compare(m1s, m2s)
159 | case (Nil, Nil) => true
160 | case _ => false
161 | }
162 |
163 | compare(matchers, matchers.reverse)
164 | }
165 | }
166 |
/** Command-line entry point: read serialized patterns (from stdin, or
  * from the argument list when arguments are given) and report whether
  * each one is a valid extractor pattern. */
object ExtractorPattern {
  import scala.io.Source
  def main(args: Array[String]) {
    val lines = if (args.isEmpty) Source.stdin.getLines else args.iterator
    lines foreach { line =>
      val extractor = new ExtractorPattern(DependencyPattern.deserialize(line))
      val verdict = if (extractor.valid) "valid" else "invalid"
      println(verdict + ": " + extractor.toString)
    }
  }
}
179 |
/** A dependency node used to match an extraction part (argument,
 * relation, or slot) in a pattern extractor.  Subclasses exist so a
 * capture's role can be recovered from its type.
 *
 * @author Michael Schmitz
 */
sealed abstract class ExtractionPartMatcher(alias: String, matcher: NodeMatcher[DependencyNode])
extends CaptureNodeMatcher[DependencyNode](alias, matcher) {
  // convenience constructor: capture any node under this alias
  def this(alias: String) = this(alias, new TrivialNodeMatcher[DependencyNode])

  // rebuild this matcher with a different underlying node matcher
  def withMatcher(matcher: NodeMatcher[DependencyNode]): ExtractionPartMatcher
}
190 |
/** A dependency node used to match an argument in a pattern extractor.
 *
 * @author Michael Schmitz
 */
class ArgumentMatcher(alias: String, matcher: NodeMatcher[DependencyNode]) extends ExtractionPartMatcher(alias, matcher) {
  def this(alias: String) = this(alias, new TrivialNodeMatcher[DependencyNode])
  // Compare against ArgumentMatcher (not the ExtractionPartMatcher base)
  // so equality is consistent with RelationMatcher and SlotMatcher, which
  // each compare against their own type; previously an ArgumentMatcher
  // could compare equal to a RelationMatcher or SlotMatcher.
  override def canEqual(that: Any) = that.isInstanceOf[ArgumentMatcher]
  override def equals(that: Any) = that match {
    case that: ArgumentMatcher => (that canEqual this) && super.equals(that.asInstanceOf[Any])
    case _ => false
  }

  // rebuild with a different underlying node matcher, preserving the alias
  override def withMatcher(matcher: NodeMatcher[DependencyNode]) = new ArgumentMatcher(this.alias, matcher)
}
205 |
/** A dependency node used to match a relation in a pattern extractor.
 *
 * @author Michael Schmitz
 */
class RelationMatcher(alias: String, matcher: NodeMatcher[DependencyNode])
extends ExtractionPartMatcher(alias, matcher) {
  // only other RelationMatchers may compare equal
  override def canEqual(that: Any) = that.isInstanceOf[RelationMatcher]
  override def equals(that: Any) = that match {
    case that: RelationMatcher => (that canEqual this) && super.equals(that.asInstanceOf[Any])
    case _ => false
  }

  // rebuild with a different underlying node matcher, preserving the alias
  override def withMatcher(matcher: NodeMatcher[DependencyNode]) = new RelationMatcher(this.alias, matcher)
}
220 |
/** A dependency node used to match a slot in a pattern extractor.
 *
 * @author Michael Schmitz
 */
class SlotMatcher(alias: String, matcher: NodeMatcher[DependencyNode])
extends ExtractionPartMatcher(alias, matcher) {
  // only other SlotMatchers may compare equal
  override def canEqual(that: Any) = that.isInstanceOf[SlotMatcher]
  override def equals(that: Any) = that match {
    case that: SlotMatcher => (that canEqual this) && super.equals(that.asInstanceOf[Any])
    case _ => false
  }

  // rebuild with a different underlying node matcher, preserving the alias
  override def withMatcher(matcher: NodeMatcher[DependencyNode]) = new SlotMatcher(this.alias, matcher)
}
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/GraphExpansions.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse
2 |
3 | import scala.collection.Set
4 | import scala.collection.SortedSet
5 |
6 | import edu.knowitall.collection.immutable.graph.{Graph, DirectedEdge}
7 | import edu.knowitall.collection.immutable.graph.Direction
8 | import edu.knowitall.collection.immutable.Interval
9 | import edu.knowitall.tool.parse.graph.{DependencyNode, DependencyGraph}
10 |
/** A collection of helper methods for expanding a node in a graph
  * and/or sentence according to some metric. */
object GraphExpansions {
  /** Take nodes from `inferiors`, moving left and right of `node` in
    * sentence order and stopping in each direction at the first node in
    * `until`; then return every graph node whose index range falls in
    * the spanned interval. */
  def neighborsUntil(graph: DependencyGraph, node: DependencyNode, inferiors: List[DependencyNode], until: Set[DependencyNode]): SortedSet[DependencyNode] = {
    // split `inferiors` into the nodes left and right of `node`
    val lefts = inferiors.takeWhile(_ != node).reverse
    val rights = inferiors.dropWhile(_ != node).drop(1)

    val indices = Interval.span(node.indices :: lefts.takeWhile(!until(_)).map(_.indices) ++ rights.takeWhile(!until(_)).map(_.indices))

    // use the original dependencies nodes in case some information
    // was lost. For example, of is collapsed into the edge prep_of
    graph.nodes.filter(node => node.indices.max >= indices.min && node.indices.max <= indices.max)
  }

  /** Expand `node` across edges with the given `labels`, keeping only
    * inferiors whose intervals are adjacent to the growing span and not
    * in `until`. */
  def expandAdjacent(graph: DependencyGraph, node: DependencyNode, until: Set[DependencyNode], labels: Set[String]) = {
    def takeAdjacent(interval: Interval, nodes: List[DependencyNode], pool: List[DependencyNode]): List[DependencyNode] = pool match {
      // can we add the top node?
      case head :: tail if (head.indices borders interval) && !until.contains(head) =>
        takeAdjacent(interval union head.indices, head :: nodes, tail)
      // otherwise abort
      case _ => nodes
    }

    // it might be possible to simply have an adjacency restriction
    // in this condition
    def cond(e: Graph.Edge[DependencyNode]) =
      labels.contains(e.label)
    val inferiors = graph.graph.inferiors(node, cond).toList.sortBy(_.indices)

    // split into nodes left and right of node
    val lefts = inferiors.takeWhile(_ != node).reverse
    val rights = inferiors.dropWhile(_ != node).drop(1)

    // take adjacent nodes from each list
    val withLefts = takeAdjacent(node.indices, List(node), lefts)
    val expanded = takeAdjacent(node.indices, withLefts, rights)

    SortedSet(expanded: _*)
  }

  /** Expand `node` across edges with the given `labels`, and across "nn"
    * edges, stopping at nodes in `until` (see neighborsUntil). */
  def expand(graph: DependencyGraph, node: DependencyNode, until: Set[DependencyNode], labels: Set[String]) = {
    // don't restrict to adjacent (by interval) because prep_of, etc.
    // remove some nodes that we want to expand across. In the end,
    // we get the span over the inferiors. Do go beneath until
    // nodes because we need them for neighborsUntil.
    def cond(e: Graph.Edge[DependencyNode]) =
      labels.contains(e.label)
    val inferiors = graph.graph.inferiors(node, cond)

    // get all nodes connected by an nn edge
    val nns = graph.graph.connected(node, dedge => dedge.edge.label == "nn")

    // order the nodes by their indices
    val ordered = (inferiors ++ nns).toList.sortBy(_.indices)

    // get neighbors, moving left and right, until a bad node is it
    neighborsUntil(graph, node, ordered, until)
  }

  /** For each successor of `node` reached over an edge matching `pred`,
    * return the full set of that successor's inferiors.
    * NOTE(review): the `without` parameter is not used in this body --
    * confirm whether successors should be filtered by it. */
  def augment(graph: DependencyGraph, node: DependencyNode, without: Set[DependencyNode], pred: Graph.Edge[DependencyNode] => Boolean): List[SortedSet[DependencyNode]] = {
    // don't restrict to adjacent (by interval) because prep_of, etc.
    // remove some nodes that we want to expand across. In the end,
    // we get the span over the inferiors.
    graph.graph.successors(node, pred).map { successor =>
      SortedSet[DependencyNode]() ++ graph.graph.inferiors(successor)
    }.toList
  }

  /**
   * Find all nodes in a components next to the node.
   * @param node components will be found adjacent to this node
   * @param labels components may be connected by edges with any of these labels
   * @param without components may not include any of these nodes
   */
  def components(graph: DependencyGraph, node: DependencyNode, labels: Set[String], without: Set[DependencyNode], nested: Boolean) = {
    // nodes across an allowed label to a subcomponent
    val across = graph.graph.neighbors(node, (dedge: DirectedEdge[_]) => dedge.dir match {
      case Direction.Down if labels.contains(dedge.edge.label) => true
      case _ => false
    })

    across.flatMap { start =>
      // get inferiors without passing back to node
      val inferiors = graph.graph.inferiors(start,
        (e: Graph.Edge[DependencyNode]) =>
          // don't cross a conjunction that goes back an across node
          !((e.label startsWith "conj") && (across contains e.dest)) &&
          // make sure we don't cycle out of the component
          e.dest != node &&
          // make sure we don't descend into another component
          // i.e. "John M. Synge who came to us with his play direct
          // from the Aran Islands , where the material for most of
          // his later works was gathered" if nested is false
          (nested || !labels.contains(e.label)))

      // make sure none of the without nodes are in the component
      if (without.forall(!inferiors.contains(_))) {
        val span = Interval.span(inferiors.map(_.indices).toSeq)
        Some(graph.nodes.filter(node => span.superset(node.indices)).toList)
      } else None
    }
  }
}
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/bootstrap/FilterTargetExtractions.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse.bootstrap
2 |
3 | import scala.io.Source
4 |
5 | import org.slf4j.LoggerFactory
6 |
7 | /** Filter the target extractions. We only want to keep extractions that
8 | * occur more than once and have a relation with more than 15 seeds.
9 | *
10 | * @author Michael Schmitz
11 | */
object FilterTargetExtractions {
  val logger = LoggerFactory.getLogger(this.getClass)

  /** minimum number of seeds a relation needs to be kept */
  final val MIN_RELATION_SEEDS = 15

  def main(args: Array[String]) {
    val inputFile = Source.fromFile(args(0), "UTF8")

    logger.info("reading lines and counting")
    var relationCounts = Map[String, Int]().withDefaultValue(0)
    var seedCounts = Map[(String, String, String, String), Int]().withDefaultValue(0)
    try {
      for (line <- inputFile.getLines) {
        val Array(rel, arg1, arg2, lemmas, _*) = line.split("\t")

        val seed = (rel, arg1, arg2, lemmas)

        // make sure the relation contains at least one of the lemmas
        // this excludes, for example, "be in"
        if (rel.split(" ").exists (lemmas contains _)) {
          seedCounts += seed -> (seedCounts(seed) + 1)
          relationCounts += rel -> (relationCounts(rel) + 1)
        }
      }
    } finally {
      // release the file handle; previously the Source was never closed
      inputFile.close()
    }

    // keep relations with more than 15 seeds
    // and more than 0 lemmas
    val relations: Set[String] =
      (for {
        (rel, count) <- relationCounts;
        if (count > MIN_RELATION_SEEDS)
      } yield (rel))(scala.collection.breakOut)
    logger.info("keeping " + relations.size + "/" + relationCounts.size + " relations")

    // keep seeds that occur more than once
    val seeds =
      for {
        (seed @ (rel, arg1, arg2, lemmas), count) <- seedCounts;
        if count > 1 && relations.contains(rel)
      } yield (seed)

    logger.info("keeping " + seeds.size + "/" + seedCounts.size + " seeds")

    logger.info("printing seeds to keep")
    for (seed <- seeds) {
      println(seed.productIterator.mkString("\t"))
    }
  }
}
60 |
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/bootstrap/FindCommon.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse.bootstrap
2 |
3 | import scala.util.matching.Regex
4 |
5 | /** Common functionality for bootstrap code.
6 | *
7 | * @author Michael Schmitz
8 | */
object FindCommon {
  // postags allowed in a proper argument
  val properPostags = Set("DT", "IN", "NNP", "NNPS")

  /** True when every tag is allowed in a proper argument and at least one
    * of them is a proper noun (NNP or NNPS).
    * NOTE(review): despite the parameter name, callers pass POSTAGS here --
    * confirm before renaming. */
  def proper(lemmas: Array[String]) = {
    val allAllowed = lemmas.forall(properPostags)
    val hasProperNoun = lemmas.exists(tag => tag == "NNP" || tag == "NNPS")
    allAllowed && hasProperNoun
  }

  /** Drop every (postag, token, lemma) triple whose postag equals target. */
  def stripPostag(target: String, part: Seq[(String, String, String)]) = {
    part.filterNot { case (pos, tok, lem) => pos == target }
  }
  /** Drop every triple whose postag fully matches the target regex. */
  def stripPostag(target: Regex, part: Seq[(String, String, String)]) = {
    part.filterNot { case (pos, tok, lem) => target.pattern.matcher(pos).matches }
  }
  /** Drop every triple whose lemma equals target. */
  def stripLemma(target: String, part: Seq[(String, String, String)]) = {
    part.filterNot { case (pos, tok, lem) => lem == target }
  }

  /** Remove determiners (DT) from an argument. */
  def cleanArg(part: Seq[(String, String, String)]) = stripPostag("DT", part)

  /** Zip three lists into a list of triples.  As in the original
    * implementation, this throws if l2 or l3 is shorter than l1. */
  def zip3(l1 : List[String], l2 : List[String],l3 : List[String]) : List[(String, String, String)] =
  {
    @annotation.tailrec
    def loop(r1: List[String], r2: List[String], r3: List[String],
             acc: List[(String, String, String)]): List[(String, String, String)] =
      r1 match {
        case Nil => acc.reverse
        case head :: tail => loop(tail, r2.tail, r3.tail, (head, r2.head, r3.head) :: acc)
      }

    loop(l1, l2, l3, Nil)
  }
}
38 |
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/bootstrap/FindTargetArguments.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse.bootstrap
2 |
3 | import scala.Array.canBuildFrom
4 | import scala.collection.mutable
5 | import scala.io.Source
6 |
7 | import edu.knowitall.tool.stem.MorphaStemmer
8 |
9 | import FindCommon.{zip3, proper, cleanArg}
10 |
/** Determine valid arguments of extractions for the bootstrap process.
12 | *
13 | * Only frequent proper arguments are used.
14 | *
15 | * @author Michael Schmitz
16 | */
object FindTargetArguments {
  import FindCommon._

  // lemmas too generic to be useful bootstrap arguments
  val blacklist = Set("inc", "ltd", "page",
    "vehicle", "turn", "site", "photo", "image", "gallery")

  /** True if the lemma has a reasonable length (3..63 chars) and is not blacklisted. */
  def valid(lemma: String) = {
    lemma.length > 2 && lemma.length < 64 && !blacklist.contains(lemma)
  }

  /** Run over a file with four columns:
    *
    *   string
    *   lemma
    *   postag
    *   count
    *
    * Count all of the proper arguments and print any arguments that
    * exceed the lower bound. The lower bound is specified by the
    * second command-line argument (args(1)); args(0) is the input file. */
  def main(args: Array[String]) {
    val source = Source.fromFile(args(0), "UTF8")
    val lowerBound = args(1).toInt

    // normalized argument lemma -> accumulated count
    val map = new mutable.HashMap[String, Int]().withDefaultValue(0)
    for (line <- source.getLines) {
      try {
        val Array(string, lem, postag, count) = line.split("\t")
        // do our own normalization (the file's lemma column is ignored)
        val lemma = string.split(" ").map(
          MorphaStemmer.lemmatize(_)).mkString(" ")

        if (!string.contains("_")) {
          // remove DT
          val arg = cleanArg(
            zip3(
              postag.split("""\s+""").toList,
              string.split("""\s+""").toList,
              lemma.split("""\s+""").toList))
          val cleanLemma = arg.unzip3._3.mkString(" ")

          // make sure lemma is valid
          // (note: proper expects postags despite its parameter name)
          if (proper(postag.split(" ")) && valid(cleanLemma)) {
            map += cleanLemma -> (map(cleanLemma)+count.toInt)
          }
        }
      }
      catch {
        // lines without exactly four columns are silently skipped
        case e: MatchError =>
      }
    }

    source.close

    // keep only arguments whose accumulated count exceeds the lower bound
    val keepers: List[(String, Int)] = (for ((k, v) <- map if v > lowerBound) yield {
      (k, v)
    })(scala.collection.breakOut)

    // print most frequent first
    keepers.sortBy(_._2).reverse.foreach { case (k, v) => println(k + "\t" + v) }
  }
}
78 |
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/bootstrap/FindTargetExtractions.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse.bootstrap
2 |
3 | import scala.Array.canBuildFrom
4 | import scala.Option.option2Iterable
5 | import scala.io.Source
6 |
7 | import org.slf4j.LoggerFactory
8 |
9 | import edu.knowitall.openparse.OpenParse
10 | import edu.knowitall.tool.stem.MorphaStemmer
11 |
12 | import FindCommon.{zip3, stripPostag, proper, cleanArg}
13 | import scopt.OptionParser
14 |
/** Determine valid extractions for the bootstrap process.
16 | *
17 | * Extractions need frequent arguments from FindTargetArguments
18 | * and cannot contain a negation word.
19 | *
20 | * @author Michael Schmitz
21 | */
object FindTargetExtractions {
  import FindCommon._

  val logger = LoggerFactory.getLogger(this.getClass)

  /** True if any lemma is a negation word. */
  def negated(lemmas: Array[String]) =
    lemmas.contains("not") || lemmas.contains("no") || lemmas.contains("n't") || lemmas.contains("never")

  // lemmas too common to help identify a seed extraction
  val lemmaBlacklist = Set("the", "that", "of")

  def main(args: Array[String]) {

    val parser = new OptionParser("findextr") {
      var extractionFilePath: String = _
      var relationFilePath: Option[String] = None
      var argumentFilePath: String = _

      arg("extractions", "extraction file", { v: String => require(v != null); extractionFilePath = v })
      arg("arguments", "argument file", { v: String => require(v != null); argumentFilePath = v })
      opt("r", "relations", "", "relation file", { v: String => require(v != null); relationFilePath = Some(v) })
    }

    if (parser.parse(args)) {
      // read in the argument files
      val extractions = Source.fromFile(parser.extractionFilePath, "UTF8")
      logger.info("loading targets")
      val relationsRows = parser.relationFilePath.map(Source.fromFile(_, "UTF8").getLines.map(line => line.split("\t")).toList)
      val targets = relationsRows.map(_ map (_(0)))
      val relationLemmaLookup = relationsRows.map(_.map(row => (row(0), row(1).split(" "))).toMap)
      // lemmas of a relation: from the relation file when given, otherwise
      // computed from the relation string minus blacklisted lemmas
      def relationLemmas(relation: String): Seq[String] = {
        relationLemmaLookup match {
          case Some(lookup) => lookup(relation)
          case None => relation.split(" ") filterNot OpenParse.LEMMA_BLACKLIST
        }
      }

      targets match {
        case Some(targets) => logger.info("5 targets: " + targets.take(5).mkString(", "))
        case None => logger.info("No target restriction")
      }
      logger.info("loading arguments")
      val arguments = Source.fromFile(parser.argumentFilePath, "UTF8").getLines.map(line => line.split("\t")(0)).toSet
      logger.info("5 arguments: " + arguments.take(5).mkString(", "))

      // iterate over extractions
      logger.info("iterating over extractions")
      for (line <- extractions.getLines) {
        try {
          val Array(id, arg1String, relationString, arg2String, _, relationLemma, _, arg1Postag, relationPostag, arg2Postag, _, _, _, count, confidence, url, sentence) = line.split("\t", -1)
          val arg1Lemma = arg1String.split(" ").map(MorphaStemmer.lemmatize(_)).mkString(" ")
          val arg2Lemma = arg2String.split(" ").map(MorphaStemmer.lemmatize(_)).mkString(" ")
          // val rs = new RelationString(relationString, relationLemma, relationPostag)
          // rs.correctNormalization()

          // (postag, token, lemma) triples for each extraction part
          val arg1 = zip3(arg1Postag.split("""\s+""").toList, arg1String.split("""\s+""").toList, arg1Lemma.split("""\s+""").toList)
          // val rel = zip3(rs.getPosPred.split("""\s+""").toList, rs.getPred.split("""\s+""").toList, rs.getNormPred.split("""\s+""").toList)
          val rel = zip3(relationPostag.split("""\s+""").toList, relationString.split("""\s+""").toList, relationLemma.split("""\s+""").toList)
          val arg2 = zip3(arg2Postag.split("""\s+""").toList, arg2String.split("""\s+""").toList, arg2Lemma.split("""\s+""").toList)

          // NOTE(review): this implicit appears to be unused -- confirm before removing
          implicit def t2mapper[A, B](t: (A, B)) = new {
            def map[R](f: A => R, g: B => R) = (f(t._1), g(t._2))
          }

          val (arg1cleanPostags, arg1cleanStrings, arg1cleanLemmas) = cleanArg(arg1).unzip3
          val (arg2cleanPostags, arg2cleanStrings, arg2cleanLemmas) = cleanArg(arg2).unzip3
          val (relcleanPostags, relcleanStrings, relcleanLemmas) = {
            // fixed: "RB.*" was passed as a String, which the String overload of
            // stripPostag compares by equality, so adverbs were never stripped;
            // it must be a regex (compare "JJS?".r below)
            val stripped = stripPostag("RB.*".r, stripPostag("DT", rel))
            val beIndex = rel.indexWhere(_._3 == "be")
            val penultimateAdjective =
              if (rel.length - beIndex >= 3 && (rel.drop(beIndex).head._3 startsWith "be") && rel.last._1 == "IN") {
                // return the penultimate if it's VERB ADJECTIVE PREPOSITION
                Some(rel.init.last)
              }
              else None

            (stripPostag("JJS?".r, stripped) ++ penultimateAdjective).unzip3
          }

          val relcleanLemmaString = relcleanLemmas.mkString(" ")
          val arg1cleanLemmaString = arg1cleanLemmas.mkString(" ")
          val arg2cleanLemmaString = arg2cleanLemmas.mkString(" ")

          // ensure the extraction parts are relatively small
          if (relationLemma.length < 64 &&
            // ensure the normalized relation string is a target
            targets.map(_ contains relcleanLemmaString).getOrElse(true) &&
            // ensure arguments are proper
            (proper(arg1Postag.split("\\s+")) ||
              proper(arg2Postag.split("\\s+"))) &&
              arg1cleanLemmaString != arg2cleanLemmaString &&
              // ensure the args are permissible
              arguments.contains(arg1cleanLemmaString) && arguments.contains(arg2cleanLemmaString) &&
              // ensure the unnormalized relation is not negated
              !negated(relationLemma.split(" "))) {

            val lemmas = (arg1cleanLemmas ++ relationLemmas(relcleanLemmaString) ++ arg2cleanLemmas) filterNot lemmaBlacklist

            // emit the seed once per occurrence count
            for (i <- 0 until count.toInt) {
              println(Iterable(
                relcleanLemmaString,
                arg1cleanLemmaString,
                arg2cleanLemmaString,
                lemmas.mkString(" "),
                arg1String, relationString, arg2String, arg1Postag, relationPostag, arg2Postag).mkString("\t"))
            }
          }
        }
        catch {
          // skip malformed lines; previously `case e =>` also swallowed fatal errors
          case e: Exception => // e.printStackTrace
        }
      }

      // release the file handle (was previously leaked)
      extractions.close()
    }
  }
}
136 |
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/eval/GroupScoredBy.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse.eval
2 |
3 | import java.io.File
4 |
5 | import edu.knowitall.common.Analysis
6 |
7 | import scopt.OptionParser
8 |
9 | /** Group scored extractions by precision and yield.
10 | *
11 | * @author Michael Schmitz
12 | */
object GroupScoredBy {
  /** Group scored extractions by the value in the given column, compute
    * each group's precision, and print the groups ordered by descending
    * precision. */
  def main(args: Array[String]) = {
    val parser = new OptionParser("groupscored") {
      var scoredFile: File = _
      var column: Int = 2

      arg("scored", "scored extractions", { path: String => scoredFile = new File(path) })
      intOpt("k", "column", "column", { c: Int => column = c })
    }

    if (parser.parse(args)) {
      require(parser.column >= 2, "column must be >= 2")

      val scores = Score.loadScoredFile(parser.scoredFile)
      // columns 0 and 1 are score and confidence, so column k is extra(k - 2)
      val grouped = scores.groupBy(scored => scored.extra(parser.column - 2))

      // (group title, group precision, group extractions), best precision first
      val scored = (for ((title, scoreds) <- grouped) yield {
        // an unscored extraction cannot contribute to precision
        val precision = Analysis.precision(scoreds.map(scored =>
          scored.score.getOrElse(throw new IllegalArgumentException("unscored extraction: " + scored))))
        (title, precision, scoreds)
      }).toList.sortBy(tuple => (tuple._2, tuple._3.mkString("\t"))).reverse

      scored.foreach { item =>
        println(item._2 + ": " + item._1)
        item._3.sortBy(scored => (scored.confidence, scored.toRow)).iterator.map(_.toRow).foreach(println)
        println()
      }
    }
  }
}
44 |
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/eval/PrecisionYield.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse.eval
2 |
3 | import java.io.{PrintWriter, File}
4 |
5 | import edu.knowitall.common.Resource.using
6 | import edu.knowitall.common.Analysis
7 |
8 | import scopt.OptionParser
9 |
10 | /** Compute precision yield point from scored extractions.
11 | *
12 | * @author Michael Schmitz
13 | */
object PrecisionYield {
  /** Settings: the scored input file and an optional output file
    * (stdout when absent). */
  abstract class Settings {
    def scoredFile: File
    def outputFile: Option[File]
  }

  def main(args: Array[String]) = {
    val settings = new Settings {
      var scoredFile: File = _
      var outputFile: Option[File] = None
    }

    val parser = new OptionParser("precyield") {
      arg("scored", "scored extractions file", { path: String => settings.scoredFile = new File(path) })
      argOpt("output", "output file", { path: String => settings.outputFile = Some(new File(path)) })
    }

    if (parser.parse(args)) {
      run(settings)
    }
  }

  /** Compute (confidence, yield, precision) points for the scored
    * extractions, print the AUC to stdout, and write the points. */
  def run(settings: Settings) = {
    // highest confidence first
    val scores = Score.loadScoredFile(settings.scoredFile).sortBy(_.confidence).reverse
    val input = scores.map(scored => ("%.4f".format(scored.confidence), scored.score.getOrElse(throw new IllegalArgumentException("unscored extraction: " + scored))))

    using {
      settings.outputFile match {
        case Some(file) => new PrintWriter(file, "UTF8")
        case None => new PrintWriter(System.out)
      }
    } { writer =>
      val py = Analysis.precisionYieldMeta(input)
      val area = Analysis.areaUnderCurve(py.map { case (conf, yld, pr) => (yld, pr) })
      // the AUC always goes to stdout, even when points go to a file
      println("auc: " + area)
      // reuse the computed points (previously recomputed a second time)
      for ((conf, yld, pr) <- py) {
        writer.println(conf + "\t" + yld + "\t" + pr)
      }
    }
  }
}
55 |
56 | /** Merge precision yield points into a single file,
57 | * usually so they can be graphed together.
58 | *
59 | * @author Michael Schmitz
60 | */
object MergePYFiles {
  /** Settings: the list of precision-yield files to merge. */
  abstract class Settings {
    def files: List[File]
  }

  def main(args: Array[String]) {
    val settings = new Settings {
      var files: List[File] = Nil
    }

    val parser = new OptionParser("mergepy") {
      arglist("...", "input files", { file: String => settings.files = new File(file) :: settings.files })
    }

    if (parser.parse(args)) {
      run(settings)
    }
  }

  /** Read each file's tab-separated (confidence, yield, precision) rows and
    * print a table with one precision column per input file, keyed by yield. */
  def run(settings: Settings) {
    // per file: (yield, (file index, precision)) pairs
    val points = for ((file, i) <- settings.files.zipWithIndex) yield {
      using(io.Source.fromFile(file, "UTF8")) { source =>
        // skip leading non-data lines (those without a tab)
        source.getLines.dropWhile(line => !(line contains "\t")).map { line =>
          val Array(_, yld, prec) = line.split("\t", -1)
          (yld.toInt, (i, prec.toDouble))
        }.toList
      }
    }

    // header row: one column per file name
    println("\t" + settings.files.map(_.getName).mkString("\t"))
    // one row per yield value, largest yield first; empty cell when a file
    // has no point at that yield
    points.flatten.sortBy(_._1).reverse.groupBy(_._1).toSeq.sortBy(_._1).reverse foreach { case (grp, seq) =>
      var vec = Vector.fill[String](settings.files.size)("")
      seq.foreach {
        case (k, (i, v)) => vec = vec updated (i, "%1.4f" format v)
      }
      println(grp+"\t"+vec.mkString("\t"))
    }
  }
}
100 |
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/eval/RankPatterns.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse.eval
2 |
3 | import java.io.{PrintWriter, File}
4 |
5 | import edu.knowitall.common.Resource.using
6 |
7 | import scopt.OptionParser
8 |
9 | /** Read a scored file and rank the patterns by their precision and frequency.
10 | *
11 | * @author Michael Schmitz
12 | */
13 | object RankPatterns {
14 | abstract class Settings {
15 | def scoredFile: File
16 | def outputFile: Option[File]
17 | }
18 |
19 | def main(args: Array[String]) = {
20 | val settings = new Settings {
21 | var scoredFile: File = _
22 | var outputFile: Option[File] = None
23 | }
24 |
25 | val parser = new OptionParser("rankpat") {
26 | var scoredFile: File = _
27 |
28 | arg("scored", "scored extractions file", { path: String => settings.scoredFile = new File(path) })
29 | argOpt("output", "output file", { path: String => settings.outputFile = Some(new File(path)) })
30 | }
31 |
32 | if (parser.parse(args)) {
33 | run(settings)
34 | }
35 | }
36 |
37 | def run(settings: Settings) = {
38 | val scores = Score.loadScoredFile(settings.scoredFile).sortBy(_.confidence).reverse
39 | val grouped = scores.groupBy(_.extra(0))
40 | .mapValues { scoreds =>
41 | val yld = scoreds.map(scored => if (scored.score.getOrElse(throw new IllegalArgumentException("unscored extraction: " + scored))) 1 else 0).sum
42 | val precision = yld.toDouble / scoreds.size.toDouble
43 | (precision, scoreds.size)
44 | }
45 |
46 | using {
47 | settings.outputFile match {
48 | case Some(file) => new PrintWriter(file, "UTF8")
49 | case None => new PrintWriter(System.out)
50 | }
51 | } { writer =>
52 | for ((pattern, (p, y)) <- grouped.toSeq.sortBy(_._2).reverse) {
53 | writer.println(pattern+"\t"+p+"\t"+y)
54 | }
55 | }
56 | }
57 | }
58 |
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/eval/Score.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse.eval
2 |
3 | import java.io.{PrintWriter, File}
4 |
5 | import scala.io.Source
6 |
7 | import edu.knowitall.common.Resource.using
8 |
9 | import scopt.OptionParser
10 |
11 | /** A main method to annotate extractions,
12 | * using a gold set for previously scored extractions.
13 | *
14 | * @author Michael Schmitz
15 | */
object Score {
  /** Settings for annotating extractions against a gold set. */
  abstract class Settings {
    def extractionFile: File
    def outputFile: File
    def goldFile: Option[File]
    def goldOutputFile: Option[File]
    def confidenceThreshold: Double
    def skipAll: Boolean
    def keepSkipped: Boolean
  }

  def main(args: Array[String]) = {
    object settings extends Settings {
      var extractionFile: File = _
      var outputFile: File = _
      var goldFile: Option[File] = None
      var goldOutputFile: Option[File] = None
      var confidenceThreshold = 0.0
      var skipAll = false
      var keepSkipped = false
    }

    val parser = new OptionParser("scoreextr") {
      arg("extrs", "extractions", { path: String => settings.extractionFile = new File(path) })
      arg("output", "scored output", { path: String => settings.outputFile = new File(path) })
      opt("g", "gold", "gold set", { path: String => settings.goldFile = Some(new File(path)) })
      opt("u", "goldoutput", "output for updated gold set", { path: String => settings.goldOutputFile = Some(new File(path)) })
      doubleOpt("t", "threshold", "confidence threshold for considered extractions", { x: Double => settings.confidenceThreshold = x })
      opt("skip-all", "don't prompt for items not in the gold set", { settings.skipAll = true })
      opt("keep-skipped", "keep unannotated extractions in output file", { settings.keepSkipped = true })
    }

    if (parser.parse(args)) {
      run(settings)
    }
  }

  /** Score the extraction file against the gold set (prompting interactively
    * for unknown extractions unless --skip-all), write the scored rows, and
    * optionally write the updated gold set. */
  def run(settings: Settings) {
    val gold = settings.goldFile match {
      case None => Map[String, Boolean]()
      case Some(goldFile) => GoldSet.load(goldFile)
    }

    val (scoreds, golden) = using(Source.fromFile(settings.extractionFile, "UTF8")) { source =>
      score(source.getLines, gold, settings.confidenceThreshold, !settings.skipAll)
    }

    // print the scored extractions; skipped (unscored) rows are kept only
    // with --keep-skipped
    using(new PrintWriter(settings.outputFile, "UTF8")) { writer =>
      for (scored <- scoreds.filter(scored => settings.keepSkipped || scored.score.isDefined)) {
        writer.println(scored.toRow)
      }
    }

    // output updated gold set
    settings.goldOutputFile match {
      case Some(file) =>
        using(new PrintWriter(file, "UTF8")) { writer =>
          golden.foreach { case (k, v) => writer.println((if (v) 1 else 0) + "\t" + k) }
        }
      case None =>
    }
  }

  /** Load a file of rows produced by Scored.toRow. */
  def loadScoredFile(file: File): Seq[Scored] = {
    using(Source.fromFile(file, "UTF8")) { source =>
      source.getLines.map { line =>
        Scored.fromRow(line)
      }.toList
    }
  }

  /** Score each input line ("confidence<TAB>extraction<TAB>extra...") using
    * the gold set, prompting the user for unknown extractions when prompt is
    * set.  Returns the scored extractions and the updated gold set. */
  def score(lines: Iterator[String], gold: Map[String, Boolean], confidenceThreshold: Double, prompt: Boolean) = {
    // Levenshtein edit distance, used to suggest similar gold entries
    def stringDistance(s1: String, s2: String): Int = {
      def minimum(i1: Int, i2: Int, i3: Int) = math.min(math.min(i1, i2), i3)

      val dist = Array.ofDim[Int](s1.length + 1, s2.length + 1)

      for (idx <- 0 to s1.length) dist(idx)(0) = idx
      for (jdx <- 0 to s2.length) dist(0)(jdx) = jdx

      for (idx <- 1 to s1.length; jdx <- 1 to s2.length)
        dist(idx)(jdx) = minimum (
          dist(idx-1)(jdx ) + 1,
          dist(idx )(jdx-1) + 1,
          dist(idx-1)(jdx-1) + (if (s1(idx-1) == s2(jdx-1)) 0 else 1)
        )
      dist(s1.length)(s2.length)
    }

    // gold entries whose edit distance to extr is small enough to be related
    def suggest(extr: String) = {
      for {
        k <- gold.keys;
        if stringDistance(k, extr) < extr.length / 2
      } yield ((k, gold(k)))
    }

    // ask the user to score one extraction; None means skipped
    def promptScore(index: Int, extr: String, confidence: String, rest: Seq[Any]): Option[Boolean] = {
      println()
      System.out.println("Please score " + index + ": " + confidence + ":" + extr + ". (1/y/0/n/skip) ")
      if (rest.length > 0) println(rest.mkString("\t"))
      suggest(extr) foreach { case (k, v) =>
        println("suggest: " + v + "\t" + k)
      }
      readLine match {
        // fixed: "y" previously mapped to false and "n" to true, inverting
        // every interactively-entered judgment
        case "1" | "y" => Some(true)
        case "0" | "n" => Some(false)
        case "s" | "skip" => None
        case _ => promptScore(index, extr, confidence, rest)
      }
    }

    var golden = gold

    val scored = for {
      (line, index) <- lines.zipWithIndex
      val Array(confidence, extr, rest @ _*) = line.split("\t")
      val conf = confidence.toDouble

      if (conf >= confidenceThreshold)

      val scoreOption = gold.get(extr) match {
        case Some(score) => Some(score)
        case None if prompt => promptScore(index, extr, confidence, rest)
        case None => None
      }
    } yield {
      scoreOption match {
        case Some(score) =>
          // update golden set
          golden += extr -> score
        case None =>
      }

      // output
      Scored(scoreOption, conf, extr, rest)
    }

    (scored.toList, golden)
  }
}
157 |
/** A scored extraction: an optional human judgment, the system confidence,
  * the extraction text, and any extra columns. */
case class Scored(score: Option[Boolean], confidence: Double, extraction: String, extra: Seq[String]) {
  /** Serialize as a tab-separated row: score ("1"/"0", empty when unscored),
    * confidence, extraction, then the extra columns. */
  def toRow = {
    val scoreColumn = score match {
      case Some(true) => "1"
      case Some(false) => "0"
      case None => ""
    }
    Seq(scoreColumn, confidence.toString, extraction, extra.mkString("\t")).mkString("\t")
  }
}
161 |
object Scored {
  /** Parse a row produced by Scored.toRow.
    *
    * An empty score column yields score = None; toRow writes an empty column
    * for unscored rows (kept with --keep-skipped), so round-tripping such
    * rows previously threw here.
    *
    * @throws IllegalArgumentException if the score column is not "1", "0", or empty
    */
  def fromRow(row: String) = {
    val parts = row.split("\t")
    val score = parts(0) match {
      case "1" => Some(true)
      case "0" => Some(false)
      case "" => None
      case _ => throw new IllegalArgumentException("must be 1, 0, or empty: " + parts(0))
    }
    val confidence = parts(1).toDouble
    val extraction = parts(2)
    val extra = parts.drop(3)

    Scored(score, confidence, extraction, extra)
  }
}
177 |
object GoldSet {
  /** Read a gold set: one "label<TAB>extraction" line per entry,
    * where label "1" means correct. */
  def load(file: File) = {
    using(Source.fromFile(file, "UTF8")) { source =>
      val entries = source.getLines.map { line =>
        val parts = line.split("\t")
        val correct = parts(0) == "1"
        parts(1) -> correct
      }
      entries.toMap
    }
  }

  /** Write a gold set in the same "label<TAB>extraction" format. */
  def save(gold: Map[String, Boolean], file: File) = {
    using(new PrintWriter(file, "UTF8")) { writer =>
      for ((extr, correct) <- gold) {
        val label = if (correct) 1 else 0
        writer.println(label + "\t" + extr)
      }
    }
  }
}
194 |
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/eval/StatisticalSignificance.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse.eval
2 |
3 | import java.io.File
4 |
5 | import edu.knowitall.common.{Random, Analysis}
6 |
7 | import scopt.OptionParser
8 |
9 | /** Compute the statistical significance of scored extractions to a baseline.
10 | *
11 | * @author Michael Schmitz
12 | */
object StatisticalSignificance {
  /** Settings: the two scored files to compare and the bootstrap
    * iteration count. */
  abstract class Settings {
    def iterations: Int
    def systemFile: File
    def baselineFile: File
  }

  def main(args: Array[String]) {
    object settings extends Settings {
      var systemFile: File = _
      var baselineFile: File = _
      var iterations: Int = 1000
    }

    val parser = new OptionParser("statsig") {
      arg("system", "scored extractions from the new system", { path: String => settings.systemFile = new File(path) })
      arg("baseline", "scored extractions from the baseline system", { path: String => settings.baselineFile = new File(path) })
      intOpt("i", "iterations", "number of iterations", { n: Int => settings.iterations = n })
    }

    if (parser.parse(args)) {
      run(settings)
    }
  }

  /**
    * Uses the bootstrap test for statistical significance.
    * This is described in the following paper:
    *
    * http://maroo.cs.umass.edu/pub/web/getpdf.php?id=744
    *
    * Note that this function is agnostic to the order of
    * `system` and `baseline`.
    *
    * @param system a metric for the system, i.e. AUC
    * @param baseline a metric for the baseline, i.e. AUC
    * @param sample a lambda that resamples the systems, returning the metric, i.e. AUC
    * @param iterations the number of iterations
    * @return the p-score of the observed difference
    */
  def bootstrapTestWithMetric(system: Double,
      baseline: Double,
      sample: ()=>(Double, Double),
      iterations: Int) = {
    // observed difference between the two systems
    val difference = math.abs(system - baseline)
    // metric differences over resampled datasets
    val sampled = for (i <- 0 until iterations) yield (sample())
    val differences = sampled.map { case (sys, base) => math.abs(sys - base) }
    val average = differences.sum / differences.size.toDouble
    // center the resampled differences around zero
    val normed = differences.map(_ - average)
    // p-score: fraction of centered differences at least as large as observed
    val pscore = normed.count(_ >= difference).toDouble / normed.size.toDouble

    pscore
  }

  /**
    * Uses the bootstrap test for statistical significance.
    * This is described in the following paper:
    *
    * http://maroo.cs.umass.edu/pub/web/getpdf.php?id=744
    *
    * Note that this function is agnostic to the order of
    * `system` and `baseline`.
    *
    * @param system correctness judgments for the system's extractions
    * @param baseline correctness judgments for the baseline's extractions
    * @param metric computes the metric (i.e. AUC) from judgments
    * @param iterations the number of iterations
    */
  def bootstrapTestWithScores(system: Seq[Boolean],
      baseline: Seq[Boolean],
      metric: Seq[Boolean]=>Double,
      iterations: Int, rand: util.Random) = {

    // resample with replacement; the lambda intentionally ignores its
    // argument -- it only drives the number of draws
    def sample(extrs: Seq[Boolean]) =
      metric(extrs.map(extr=>Random.choose(extrs, extrs.size, rand)))

    bootstrapTestWithMetric(metric(system), metric(baseline),
      ()=>(sample(system), sample(baseline)), iterations)
  }

  def run(settings: Settings) {
    val rand = new util.Random

    // AUC of the precision-yield curve for a set of scored extractions
    def areaUnderCurve(scoreds: Seq[Scored]) = {
      val points = Analysis.precisionYieldMeta(scoreds.map(extr => (extr.confidence, extr.score.get)))
      Analysis.areaUnderCurve(points.map { case (conf, yld, prc) => (yld, prc) })
    }

    val systemExtractionsAll: Seq[Scored] =
      Score.loadScoredFile(settings.systemFile).sortBy(-_.confidence)
    val baselineExtractionsAll: Seq[Scored] =
      Score.loadScoredFile(settings.baselineFile).sortBy(-_.confidence)

    // extra(0) presumably holds the source sentence -- TODO confirm.
    // NOTE(review): take(50) on a Seq built from a Set has no defined order,
    // so the sampled sentences may vary between runs.
    val sentences = (systemExtractionsAll.map(_.extra(0)).toSet ++ baselineExtractionsAll.map(_.extra(0)).toSet).toSeq.take(50).toSet

    val systemExtractions = systemExtractionsAll.filter(extr => sentences.contains(extr.extra(0)))
    val baselineExtractions = baselineExtractionsAll.filter(extr => sentences.contains(extr.extra(0)))

    // resample the sentences with replacement and compute each system's AUC
    // over the extractions from the resampled sentences
    def sample(): (Double, Double) = {
      def helper(extrs: Seq[Scored]) = {
        val sent = sentences.map(extr=>Random.choose(sentences, sentences.size, rand))
        val set = sent.flatMap(sent => extrs.filter(sent == _.extra(0))).toSeq.sortBy(_.confidence)
        val auc = areaUnderCurve(set)
        auc
      }

      (helper(systemExtractions), helper(baselineExtractions))
    }

    val pscore = bootstrapTestWithMetric(
      areaUnderCurve(systemExtractions),
      areaUnderCurve(baselineExtractions),
      sample, settings.iterations)

    println(pscore)
  }
}
129 |
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/extract/Extraction.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse.extract
2 |
3 | import scala.collection.{SortedSet, Set}
4 | import edu.knowitall.collection.immutable.graph.pattern.Match
5 | import edu.knowitall.collection.immutable.graph.{Direction, Graph, DirectedEdge}
6 | import edu.knowitall.collection.immutable.Interval
7 | import edu.knowitall.openparse.GraphExpansions.{expand, components, augment}
8 | import edu.knowitall.openparse.OpenParse
9 | import edu.knowitall.tool.parse.graph.{DependencyPattern, DependencyNode, DependencyGraph}
10 | import edu.knowitall.ollie.Ollie.stemmer
11 | import edu.knowitall.tool.stem.Stemmer
12 | import Extraction.{Part, ClausalComponent, AdverbialModifier}
13 | import edu.knowitall.tool.parse.graph.LabelEdgeMatcher
14 | import edu.knowitall.collection.immutable.graph.pattern.DirectedEdgeMatcher
15 |
16 | /** A representation of an OpenParse extraction.
17 | *
18 | * @author Michael Schmitz
19 | */
abstract class Extraction(val relLemmas: Set[String]) {
  /** the text of the first argument */
  def arg1Text: String
  /** the text of the relation */
  def relText: String
  /** the text of the second argument */
  def arg2Text: String

  /** Build an extraction, computing the relation lemmas by stemming each
    * word of the relation text and removing blacklisted lemmas. */
  def this(relText: String) = this(relText.split(" ").map(implicitly[Stemmer].lemmatize(_)).toSet -- OpenParse.LEMMA_BLACKLIST)

  // two extractions are equal iff all three text parts are equal
  override def equals(that: Any) = that match {
    case that: Extraction => (that canEqual this) && that.arg1Text == this.arg1Text && that.relText == this.relText && that.arg2Text == this.arg2Text
    case _ => false
  }
  def canEqual(that: Any) = that.isInstanceOf[Extraction]
  override def hashCode = arg1Text.hashCode + 39 * (relText.hashCode + 39 * arg2Text.hashCode)

  override def toString() = Iterable(arg1Text, relText, arg2Text).mkString("(", "; ", ")")

  /** A lenient match: the relation lemma sets are equal and each argument
    * of one extraction contains (or equals) the other's corresponding
    * argument text. */
  def softMatch(that: Extraction) =
    (that.arg1Text.contains(this.arg1Text) || this.arg1Text.contains(that.arg1Text)) &&
    this.relLemmas == that.relLemmas &&
    (that.arg2Text.contains(this.arg2Text) || this.arg2Text.contains(that.arg2Text))
}
44 |
45 | /** A simple representation of an OpenParse extraction.
46 | *
47 | * @author Michael Schmitz
48 | */
class SimpleExtraction(
    override val arg1Text: String,
    override val relText: String,
    relLemmas: Set[String],
    override val arg2Text: String)
  extends Extraction(relLemmas) {

  /** Build a SimpleExtraction, computing the relation lemmas from the
    * relation text by stemming and removing blacklisted lemmas. */
  def this(arg1Text: String, relText: String, arg2Text: String) = this(arg1Text,
    relText,
    relText.split(" ").map(implicitly[Stemmer].lemmatize(_)).toSet -- OpenParse.LEMMA_BLACKLIST,
    arg2Text)

  /** Create a copy of this extraction with the relation text replaced.
    * The relation lemmas are recomputed from the new relation text,
    * mirroring DetailedExtraction.replaceRelation.
    *
    * Fixed: the `relation` parameter was previously ignored (the copy used
    * `this.relText`), so this method returned an unchanged copy. */
  def replaceRelation(relation: String) =
    new SimpleExtraction(this.arg1Text, relation, this.arg2Text)
}
64 |
/** A more informative representation of an OpenParse extraction.
 *
 * @author Michael Schmitz
 */
class DetailedExtraction(
  val extractor: PatternExtractor,
  val `match`: Match[DependencyNode],
  val arg1: Part,
  val rel: Part,
  val arg2: Part,
  val clausal: Option[ClausalComponent] = None,
  val modifier: Option[AdverbialModifier] = None)
extends Extraction(rel.text) {

  override def arg1Text = arg1.text
  override def relText = rel.text
  override def arg2Text = arg2.text

  // Convenience constructor that wraps raw node sets into Parts.
  def this(extractor: PatternExtractor, mch: Match[DependencyNode],
    arg1Nodes: SortedSet[DependencyNode],
    relNodes: SortedSet[DependencyNode],
    arg2Nodes: SortedSet[DependencyNode]) =
    this(extractor, mch, new Part(arg1Nodes), new Part(relNodes), new Part(arg2Nodes))

  /** all the nodes in this extraction */
  def nodes = arg1.nodes ++ rel.nodes ++ arg2.nodes

  /** all the edges in this extraction */
  def edges = `match`.bipath.path

  /** Copy this extraction, replacing the relation text while keeping the
    * relation nodes (the superclass recomputes lemmas from the new text). */
  def replaceRelation(relation: String) =
    new DetailedExtraction(extractor, `match`, this.arg1, Part(this.rel.nodes, relation), this.arg2, this.clausal, this.modifier)
}
98 |
/** Helpers shared by the extraction representations. */
object DetailedExtraction {
  /** Join the text of each node with single spaces. */
  def nodesToString(nodes: Iterable[DependencyNode]) =
    nodes.map(_.text).mkString(" ")
}
102 |
103 |
/** Includes logic for expanding relations and arguments.
 *
 * @author Michael Schmitz
 */
object Extraction {
  /** Representation of a part of an extraction.
   *
   * @author Michael Schmitz
   */
  case class Part(nodes: SortedSet[DependencyNode], text: String) {
    def this(nodes: SortedSet[DependencyNode]) = {
      this(nodes, DetailedExtraction.nodesToString(nodes))
    }

    def this(nodes: Iterable[DependencyNode]) = {
      this(SortedSet[DependencyNode]() ++ nodes, DetailedExtraction.nodesToString(nodes))
    }

    /** The token interval spanned by this part's nodes. */
    def span = Interval.span(nodes.map(_.indices))
  }
  object Part {
    /** Edges of the match that touch the given node. */
    def connections(m: Match[DependencyNode], node: DependencyNode): Set[Graph.Edge[DependencyNode]] = {
      m.edges.filter(edge => edge.source == node || edge.dest == node).toSet
    }

    /** Edges of the match that touch any node in the given set. */
    def connections(m: Match[DependencyNode], nodes: Set[DependencyNode]): Set[Graph.Edge[DependencyNode]] = {
      m.edges.filter(edge => nodes.contains(edge.source) || nodes.contains(edge.dest)).toSet
    }

    /** Edges of the match that touch any node in the given sequence. */
    def connections(m: Match[DependencyNode], nodes: Seq[DependencyNode]): Set[Graph.Edge[DependencyNode]] = {
      m.edges.filter(edge => nodes.contains(edge.source) || nodes.contains(edge.dest)).toSet
    }
  }
  /** An attribution clause attached to an extraction (e.g. "he said"). */
  case class ClausalComponent(rel: Part, arg: Part) {
    def text = arg.text + " " + rel.text
  }
  /** An adverbial clause modifying an extraction. */
  case class AdverbialModifier(contents: Part) {
    def text = contents.text
  }

  // NOTE(review): both patterns below are identical, and conditionalPattern is
  // never used in this file.  Also, the pattern captures groups named "old"
  // and "arg", yet clausalComponent below reads nodeGroups("rel") -- the
  // assume there would fail as written.  Presumably the string was meant to
  // capture {rel} rather than {old}; TODO confirm against the original
  // pattern definitions.
  private val attributionPattern = DependencyPattern.deserialize("{old} nsubj> {arg}")
  private val conditionalPattern = DependencyPattern.deserialize("{old} nsubj> {arg}")

  /** Build extractions from a pattern match.
    *
    * @param expand whether to expand matched nodes into full phrases
    * @param graph the sentence's dependency graph
    * @param m the pattern match to build from
    * @param ex the extractor that produced the match
    * @return zero or more extractions (several when the relation expands
    *   in more than one way); matches with overlapping arguments yield none
    */
  def fromMatch(expand: Boolean)(graph: DependencyGraph, m: Match[DependencyNode], ex: PatternExtractor): Iterable[DetailedExtraction] = {
    // look for an attribution clause ("he said") attached to `node`
    def clausalComponent(node: DependencyNode, until: Set[DependencyNode]) = {
      attributionPattern.apply(graph.graph, node) match {
        case List(m) =>
          assume(m.nodeGroups.get("rel").isDefined)
          assume(m.nodeGroups.get("arg").isDefined)

          val rel = m.nodeGroups("rel").node
          val arg = m.nodeGroups("arg").node

          val Part(expandedRelNodes, expandedRelText) = expandRelation(graph, rel, until + arg).head
          val expandedArg = expandArgument(graph, arg, until + rel)

          Some(ClausalComponent(Part(expandedRelNodes, expandedRelText), Part(expandedArg, DetailedExtraction.nodesToString(expandedArg))))
        case _ => None
      }
    }

    // look for an adverbial clause attached to `node` by an advcl edge
    def adverbialModifier(node: DependencyNode, until: Set[DependencyNode]): Option[AdverbialModifier] = {
      val neighbors = graph.graph.neighbors(node, dedge => dedge.dir == Direction.Down && dedge.edge.label == "advcl")
      val nodes = neighbors.flatMap(graph.graph.inferiors(_))
      if (nodes.isEmpty) None
      else {
        val span = Interval.span(nodes.map(_.indices))
        // take every token whose indices fall inside the clause's span
        val clause = graph.nodes.filter(node => span.superset(node.indices))
        Some(AdverbialModifier(Part(clause, DetailedExtraction.nodesToString(clause))))
      }
    }

    val groups = m.nodeGroups

    // relation nodes are captured as "rel", "rel1", "rel2", ...
    val rels = groups.filter(_._1 startsWith "rel").toSeq.sortBy(_._1).map(_._2.node)
    if (rels.isEmpty) (throw new IllegalArgumentException("no rel: " + m))
    val arg1 = groups.get("arg1").map(_.node) getOrElse (throw new IllegalArgumentException("no arg1: " + m))
    val arg2 = groups.get("arg2").map(_.node) getOrElse (throw new IllegalArgumentException("no arg2: " + m))

    val expandedArg1 = if (expand) expandArgument(graph, arg1, rels.toSet) else SortedSet(arg1)
    val expandedArg2 = if (expand) expandArgument(graph, arg2, rels.toSet) else SortedSet(arg2)
    val expandRels =
      // hack to exclude rel rel extractions with a second nsubj
      if (rels.size > 0 && rels.tail.exists(rel => graph.graph.dedges(rel).exists(dedge => dedge.dir == Direction.Down && dedge.edge.label == "nsubj"))) {
        Set.empty
      }
      else if (expand) {
        import scalaz._
        import Scalaz._

        // expand each rel node, then take the cartesian product (sequence)
        // of the alternatives so each combination yields one candidate part
        val expansions = rels.map(rel => expandRelation(graph, rel, expandedArg1 ++ expandedArg2).toList).toList.sequence

        expansions.map(expansion => Part(expansion.map(_.nodes).reduce(_ ++ _), expansion.map(_.text).mkString(" ")))
      } else {
        Set(Part(SortedSet.empty[DependencyNode] ++ rels, rels.map(_.text).mkString(" ")))
      }

    for {
      Part(expandedRelNodes, expandedRelText) <- expandRels
      val nodes = expandedArg1 ++ expandedArg2 ++ expandedRelNodes
      val clausal = rels.flatMap(rel => clausalComponent(rel, nodes)).headOption
      val modifier = rels.flatMap(rel => adverbialModifier(rel, nodes)).headOption

      // arguments don't overlap
      if (!(Interval.span(expandedArg1.map(_.indices)(scala.collection.breakOut)) intersects Interval.span(expandedArg2.map(_.indices)(scala.collection.breakOut))))
    } yield (
      new DetailedExtraction(ex, m, new Part(expandedArg1), Part(expandedRelNodes, expandedRelText), new Part(expandedArg2), clausal = clausal, modifier = modifier)
    )

  }

  private val argumentExpansionLabels = Set("det", "prep_of", "amod", "num", "number", "nn", "poss", "quantmod", "neg")
  /** Expand an argument node into the full argument phrase, never crossing
    * any node in `until`. */
  def expandArgument(graph: DependencyGraph, node: DependencyNode, until: Set[DependencyNode]): SortedSet[DependencyNode] = {
    def expandNode(node: DependencyNode) = {
      val expansion = expand(graph, node, until, argumentExpansionLabels)
      // proper nouns are not expanded over clausal components
      if (expansion.exists(_.isProperNoun)) expansion
      else expansion ++ components(graph, node, Set("rcmod", "infmod", "partmod", "ref", "prepc_of"), until, false).flatten
    }

    // expand over any conjunction/disjunction edges to non-verbs
    val nodes = graph.graph.connected(node, (dedge: DirectedEdge[DependencyNode]) =>
      !(dedge.end.postag startsWith "VB") && (dedge.edge.label == "conj_and" || dedge.edge.label == "conj_or"))

    if (nodes.size == 1) {
      // there are no conjunctive edges
      expandNode(node)
    }
    else {
      val flat = nodes.map(expandNode).flatten
      val span = Interval.span(flat.map(_.indices).toSeq)
      // take the nodes that cover all the nodes found
      graph.nodes.filter(node => span.superset(node.indices))
    }
  }

  /** Expand the relation nodes of a match.
   *
   * Multiple parts can be returned if there are multiple dobj or iobjs.
   *
   * @return parts the part (or multiple parts) that describes the relation
   */
  def expandRelation(graph: DependencyGraph, node: DependencyNode, until: Set[DependencyNode]): Set[Part] = {
    // count the adjacent dobj edges. We will only expand across
    // dobj components if there is exactly one adjacent dobj edge.
    // This edge may already be used, but in that case we won't
    // expand over it because of the until set.
    val dobjCount = graph.graph.edges(node).count(_.label == "dobj")
    val iobjCount = graph.graph.edges(node).count(_.label == "iobj")

    // NOTE(review): attachLabels is computed but never read below --
    // possibly vestigial from an earlier implementation.
    var attachLabels = Set[String]()
    if (dobjCount == 1) attachLabels += "dobj"
    if (iobjCount == 1) attachLabels += "iobj"

    /*
     * acomp: "She looks beautiful on Thursday."
     */
    def pred(edge: Graph.Edge[DependencyNode]) =
      // make sure we don't re-add the relation node
      edge.dest != node && (
        // attach adverbs
        edge.label == "advmod" && edge.dest.postag == "RB" ||
        edge.label == "aux" || edge.label == "cop" || edge.label == "auxpass" || edge.label == "prt" || edge.label == "acomp")

    // expand across noun label for relational nouns
    // i.e. "He is the *best* president of the USA"
    val expandNounLabels =
      if (node.postag startsWith "NN") expand(graph, node, until, argumentExpansionLabels)
      else expand(graph, node, until, Set("det", "amod", "num", "number", "nn", "poss", "quantmod", "neg"))

    // modifiers on copulars are stored on a different node
    // i.e. in "he *will* be the president"
    val cops = graph.graph.predecessors(node, (e: Graph.Edge[DependencyNode])=>e.label == "cop").headOption
    val expandCopLabels = cops.map(cop => augment(graph, cop, until, pred)).getOrElse(List.empty)

    // treat "no component" as one empty component so the cartesian
    // product below still produces output
    def f(s: Set[List[DependencyNode]]): Set[List[DependencyNode]] =
      if (s.isEmpty) Set(List())
      else s
    val dobjs = f(components(graph, node, Set("dobj"), until, true))
    val iobjs = f(components(graph, node, Set("iobj"), until, true))

    for (dobj <- dobjs; iobj <- iobjs) yield {
      val expansion = expandCopLabels ++ (expandNounLabels ::
        // make sure that we don't use a label that was
        // already captured by expandNounlabels. This
        // can happen when a verb edges goes between two
        // noun labels.
        ((augment(graph, node, until, pred).map(_ -- expandNounLabels)) :+
        // add subcomponents
        (SortedSet[DependencyNode]() ++ dobj) :+
        (SortedSet[DependencyNode]() ++ iobj)).filterNot { c =>
          // don't add empty components
          c.isEmpty ||
          // don't add components with just "who" or "whom"
          c.size == 1 && c.headOption.map(_.postag == "WP").getOrElse(false)
        })

      // order the pieces by their position in the sentence
      val sorted = expansion.sortBy(nodes => Interval.span(nodes.map(_.indices)))

      // perform a more complicated node->text transformation
      val texts = sorted.map(DetailedExtraction.nodesToString(_))
      Part(expansion.reduce(_ ++ _), texts.mkString(" "))
    }
  }
}
307 |
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/extract/GeneralExtractor.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse.extract
2 |
3 | import org.slf4j.LoggerFactory
4 | import edu.knowitall.collection.immutable.graph.pattern.{Pattern, Match}
5 | import edu.knowitall.collection.immutable.graph.Graph
6 | import edu.knowitall.tool.parse.graph.{DependencyPattern, DependencyNode, DependencyGraph}
7 | import edu.knowitall.ollie.Ollie.stemmer
8 | import GeneralExtractor.logger
9 | import edu.knowitall.openparse.ExtractorPattern
10 |
/** An extractor that is purely specified by a pattern.
 *
 * @param pattern the pattern to extract
 * @param conf the confidence of this extractor
 *
 * @author Michael Schmitz
 */
class GeneralExtractor(pattern: ExtractorPattern, val conf: Double) extends PatternExtractor(pattern) {
  import GeneralExtractor._

  def this(pattern: Pattern[DependencyNode], conf: Double) =
    this(new ExtractorPattern(pattern), conf)

  /** Apply the pattern and pair each extraction with the match that
    * produced it (subclasses such as TemplateExtractor need the match). */
  protected def extractWithMatches(dgraph: DependencyGraph)(implicit
    buildExtraction: (DependencyGraph, Match[DependencyNode], PatternExtractor)=>Iterable[DetailedExtraction],
    validMatch: Graph[DependencyNode]=>Match[DependencyNode]=>Boolean) = {

    // apply pattern and keep valid matches
    val matches = pattern(dgraph.graph)
    if (!matches.isEmpty && logger.isDebugEnabled) logger.debug("matches: " + matches.mkString(", "))

    val filtered = matches.filter(validMatch(dgraph.graph))
    if (!filtered.isEmpty && logger.isDebugEnabled) logger.debug("filtered: " + filtered.mkString(", "))

    for (m <- filtered; extr <- buildExtraction(dgraph, m, this)) yield {
      (extr, m)
    }
  }

  override def extract(dgraph: DependencyGraph)(implicit
    buildExtraction: (DependencyGraph, Match[DependencyNode], PatternExtractor)=>Iterable[DetailedExtraction],
    validMatch: Graph[DependencyNode]=>Match[DependencyNode]=>Boolean) = {
    logger.debug("pattern: " + pattern)

    // drop the matches; callers of extract only want the extractions
    val extractions = this.extractWithMatches(dgraph).map(_._1)
    if (!extractions.isEmpty) logger.debug("extractions: " + extractions.mkString(", "))

    extractions
  }

  // every extraction from this extractor carries the same fixed confidence
  override def confidence(extr: Extraction): Double = {
    this.conf
  }

  /** A maximum confidence for any extraction from this extractor.
   * This is used for optimization. If the minimum confidence is
   * larger than the threshold, we don't need to run this extractor. */
  override def maximumConfidence: Double = this.conf
}
60 |
case object GeneralExtractor extends PatternExtractorType {
  val logger = LoggerFactory.getLogger(this.getClass)

  /** Read extractors from lines of "pattern<TAB>count".
    *
    * A missing count column is logged and treated as a count of 1.
    * @throws IllegalArgumentException on a line with more than two columns
    */
  def fromLines(lines: Iterator[String]): List[GeneralExtractor] = {
    val patterns: List[(Pattern[DependencyNode], Int)] = lines.map { line =>
      line.split("\t") match {
        // full information specified
        case Array(pat, count) => (DependencyPattern.deserialize(pat), count.toInt)
        // assume a count of 1 if nothing is specified
        case Array(pat) => logger.warn("warning: pattern has no count: " + pat); (DependencyPattern.deserialize(pat), 1)
        case _ => throw new IllegalArgumentException("line must have one or two columns: " + line)
      }
    }.toList

    // `patterns` is already a List, so the original trailing .toList was a no-op
    for ((p, conf) <- patterns) yield {
      new GeneralExtractor(new ExtractorPattern(p), conf.toDouble)
    }
  }
}
80 |
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/extract/PatternExtractor.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse.extract
2 |
3 | import java.io.File
4 | import scala.io.Source
5 | import edu.knowitall.collection.immutable.graph.pattern.{Pattern, Match}
6 | import edu.knowitall.collection.immutable.graph.Graph
7 | import edu.knowitall.common.Resource.using
8 | import edu.knowitall.tool.parse.graph.{DependencyNode, DependencyGraph}
9 | import javax.naming.OperationNotSupportedException
10 | import edu.knowitall.collection.immutable.graph.pattern.CaptureNodeMatcher
11 | import edu.knowitall.openparse.ExtractorPattern
12 |
/** An superclass for extractors based on patterns.
 *
 * @param pattern the pattern to extract
 *
 * @author Michael Schmitz
 */
abstract class PatternExtractor(val pattern: ExtractorPattern) {
  /** Apply this extractor's pattern to a dependency graph.
    *
    * @param buildExtraction builds extractions from a pattern match
    * @param validMatch filters matches against the graph
    * @return the extractions produced by valid matches
    */
  def extract(dgraph: DependencyGraph)(implicit
    buildExtraction: (DependencyGraph, Match[DependencyNode], PatternExtractor)=>Iterable[DetailedExtraction],
    validMatch: Graph[DependencyNode]=>Match[DependencyNode]=>Boolean): Iterable[DetailedExtraction]
  /** The confidence of a particular extraction from this extractor. */
  def confidence(extr: Extraction): Double

  /** A maximum confidence for any extraction from this extractor.
   * This is used for optimization. If the minimum confidence is
   * larger than the threshold, we don't need to run this extractor.
   */
  def maximumConfidence: Double

  override def toString = pattern.toString

  // subclasses that support serialization override this
  def tabSerialize: String = throw new OperationNotSupportedException()

  // false by default; overridden by TemplateExtractor
  def prepMismatch: Boolean = false
}
37 |
object PatternExtractor {
  /** Deserialize a pattern extractor from tab-separated fields.
    *
    * The first field names the extractor type; the remaining fields are
    * passed to that type's deserializer.
    * @return the extractor and any unconsumed fields
    * @throws IllegalArgumentException for an unrecognized extractor type
    */
  def tabDeserialize(seq: Seq[String]): (PatternExtractor, Seq[String]) = {
    seq(0).toLowerCase match {
      case "template" => TemplateExtractor.tabDeserialize(seq.drop(1))
      // fail with a descriptive message instead of an opaque MatchError
      case s => throw new IllegalArgumentException("unknown extractor type: " + s)
    }
  }
}
45 |
/** A factory for reading pattern extractors from files or lines. */
abstract class PatternExtractorType {
  /** Read extractors from a UTF-8 file, one serialized pattern per line. */
  def fromFile(file: File): Seq[PatternExtractor] = {
    using (Source.fromFile(file, "UTF8")) { source =>
      fromLines(source.getLines)
    }

  }
  /** Read extractors from lines of serialized patterns. */
  def fromLines(lines: Iterator[String]): Seq[PatternExtractor]

  /** A short display name for this extractor type. */
  def name = this.getClass.getSimpleName
}
57 |
object PatternExtractorType {
  /** Look up an extractor factory by its short name. */
  def apply(string: String) = {
    if (string == "general") GeneralExtractor
    else if (string == "template") TemplateExtractor
    else if (string == "specific") SpecificExtractor
    else throw new IllegalArgumentException("unknown extractor: " + string)
  }
}
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/extract/SpecificExtractor.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse.extract
2 |
3 | import scala.Array.canBuildFrom
4 | import edu.knowitall.collection.immutable.graph.pattern.{Pattern, Match}
5 | import edu.knowitall.collection.immutable.graph.Graph
6 | import edu.knowitall.tool.parse.graph.{DependencyNode, DependencyGraph}
7 | import edu.knowitall.tool.stem.MorphaStemmer
8 | import edu.knowitall.openparse.ExtractorPattern
9 |
/** An extractor that is specified only with a pattern
 * but only works for specific relation lemmas.
 *
 * @param relation the resulting relation string
 * @param relationLemmas the acceptible matched lemmas
 * @param pattern the pattern to extract
 * @param conf the confidence of this extractor
 *
 * @author Michael Schmitz
 */
class SpecificExtractor(val relation: String,
  val relationLemmas: List[String],
  pattern: ExtractorPattern, conf: Double)
extends GeneralExtractor(pattern, conf) {

  def this(relation: String, relationLemmas: List[String], pattern: Pattern[DependencyNode], conf: Double) =
    this(relation, relationLemmas, new ExtractorPattern(pattern), conf)

  /** Extract with the pattern, keeping only extractions whose relation
    * contains every required lemma, then substitute the canonical
    * relation string. */
  override def extract(dgraph: DependencyGraph)(implicit
    buildExtraction: (DependencyGraph, Match[DependencyNode], PatternExtractor)=>Iterable[DetailedExtraction],
    validMatch: Graph[DependencyNode]=>Match[DependencyNode]=>Boolean) = {
    val extractions = super.extract(dgraph)
    extractions.withFilter{ extr =>
      val extrRelationLemmas = extr.rel.text.split(" ").map(MorphaStemmer.lemmatize(_))
      relationLemmas.forall(extrRelationLemmas.contains(_))
    }.map(_.replaceRelation(relation))
  }
}
38 |
case object SpecificExtractor extends PatternExtractorType {
  // SpecificExtractors are not read from pattern files
  def fromLines(lines: Iterator[String]) = throw new UnsupportedOperationException
}
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/extract/TemplateExtractor.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse.extract
2 |
3 | import scala.util.matching.Regex
4 | import org.slf4j.LoggerFactory
5 | import edu.knowitall.collection.immutable.graph.pattern.{Pattern, Match}
6 | import edu.knowitall.collection.immutable.graph.Graph
7 | import edu.knowitall.tool.parse.graph.{DependencyPattern, DependencyNode, DependencyGraph}
8 | import Template.group
9 | import edu.knowitall.ollie.Ollie.stemmer
10 | import edu.knowitall.tool.postag.Postagger
11 | import edu.knowitall.tool.parse.graph.RegexEdgeMatcher
12 | import edu.knowitall.tool.parse.graph.LabelEdgeMatcher
13 | import edu.knowitall.openparse.ExtractorPattern
14 |
/** An extractor that is specified by a pattern and a template.
 * the template can add a "to be" and/or preposition word around
 * the relation. It can also change the preposition word to another
 * preposition (i.e., switch "of" to "in").
 *
 * @param template a template in which to put the relation words
 * @param pattern the pattern to extract
 * @param conf the confidence of this extractor
 *
 * @author Michael Schmitz
 */
class TemplateExtractor(val template: Template, pattern: ExtractorPattern, conf: Double)
extends GeneralExtractor(pattern, conf) {

  def this(template: Template, pattern: Pattern[DependencyNode], conf: Double) =
    this(template, new ExtractorPattern(pattern), conf)

  /** Extract with the pattern, then apply the template to each
    * extraction's relation. */
  override def extract(dgraph: DependencyGraph)(implicit
    buildExtraction: (DependencyGraph, Match[DependencyNode], PatternExtractor)=>Iterable[DetailedExtraction],
    validMatch: Graph[DependencyNode]=>Match[DependencyNode]=>Boolean) = {

    val extractions = super.extractWithMatches(dgraph)

    extractions.map{ case (extr, m) => template(extr, dgraph, m) }
  }

  override def tabSerialize = Iterable("Template", template.serialize, pattern.serialize, conf.toString).mkString("\t")

  override def prepMismatch: Boolean = {
    val trailingPrep = TemplateExtractor.trailingPreposition.findFirstIn(template.serialize)
    val lastPatternPrep = pattern.baseEdgeMatchers.flatMap {
      // Fix: scala.util.matching.Regex does not override equals, so the
      // original comparison `m.labelRegex == new Regex("""prep_(.*)""")`
      // was reference equality against a fresh instance and always false.
      // Compare the pattern strings instead.
      case m: RegexEdgeMatcher if m.labelRegex.toString == """prep_(.*)""" => Some("{prep}")
      case m: LabelEdgeMatcher if m.label startsWith "prep_" => Some(m.label.drop(5))
      case _ => None
    }.lastOption

    // NOTE(review): findFirstIn includes the leading \s in its match, so
    // trailingPrep may carry a leading space while lastPatternPrep does not;
    // confirm the intended comparison semantics.
    trailingPrep == lastPatternPrep
  }
}
54 |
case object TemplateExtractor extends PatternExtractorType {
  val logger = LoggerFactory.getLogger(this.getClass)

  // matches a trailing preposition (or the {prep} placeholder) in a template
  private val trailingPreposition = new Regex("\\s(?:" + Postagger.prepositions.mkString("|") + "|\\{prep\\})$")

  /** Read template extractors from lines of "template<TAB>pattern<TAB>conf".
    * A missing confidence column is logged and treated as 1.0. */
  override def fromLines(lines: Iterator[String]): List[PatternExtractor] = {
    val patterns: List[(Template, Pattern[DependencyNode], Double)] = lines.map { line =>
      line.split("\t") match {
        // full information specified
        case Array(template, pat, conf) =>
          (Template.deserialize(template), DependencyPattern.deserialize(pat), conf.toDouble)
        // assume a count of 1 if nothing is specified
        case Array(template, pat) =>
          logger.warn("warning: pattern has no confidence: " + pat);
          (Template.deserialize(template), DependencyPattern.deserialize(pat), 1.0)
        case _ => throw new IllegalArgumentException("line must have two or three columns: " +line)
      }
    }.toList

    // (an unused maxBy over the confidences was removed here; it also threw
    // on empty input)
    (for ((template, pattern, conf) <- patterns) yield {
      new TemplateExtractor(template, new ExtractorPattern(pattern), conf)
    }).toList
  }

  /** Deserialize from a single tab-separated string.
    *
    * Fix: the original split the string, discarded the result, and
    * returned Unit; it now delegates to the Seq overload. */
  def tabDeserialize(string: String): (TemplateExtractor, Seq[String]) =
    tabDeserialize(string.split("\t").toSeq)

  /** Deserialize from tab-separated fields: template, pattern, confidence.
    * @return the extractor and any unconsumed fields */
  def tabDeserialize(parts: Seq[String]): (TemplateExtractor, Seq[String]) = {
    val Seq(templateString, patternString, confString, rest @ _*) = parts

    val template = Template.deserialize(templateString)
    val pattern = new ExtractorPattern(DependencyPattern.deserialize(patternString))
    val conf = confString.toDouble

    (new TemplateExtractor(template, pattern, conf), rest)
  }
}
94 |
/** A relation template, e.g. "be {rel} of", into which the matched
  * relation words are substituted.
  *
  * @param template the template string containing {group} placeholders
  * @param be whether to prefix the relation with "be"
  */
case class Template(template: String, be: Boolean) {
  import Template._
  /** Apply this template to an extraction, producing a copy of the
    * extraction with the templated relation text. */
  def apply(extr: DetailedExtraction, dgraph: DependencyGraph, m: Match[DependencyNode]) = {
    // resolve a {group} placeholder name to its text
    def matchGroup(name: String): String = name match {
      case "rel" => extr.relText
      case "arg1" => extr.arg1Text
      case "arg2" => extr.arg2Text
      case _ => m.groups(name).text
    }

    // don't add the be if we attach a verb using a cop, aux, or auxpass edge.
    // there are a lot of examples where adding "be" makes it very messy
    // "She has practiced law, with Foo, Bar."
    // don't want: (Bar; be has practiced with; Foo)
    // This is somewhat of a hack that makes bad patterns look less bad.
    val prefix = if (be &&
      !(dgraph.graph.neighbors(m.nodeGroups.getOrElse("rel", m.nodeGroups("rel1")).node, dedge => (dedge.edge.label startsWith "aux") || dedge.edge.label == "cop") filter (_.postag startsWith "VB") exists (neighbor => extr.rel.nodes contains neighbor))) {
      "be"
    }
    else ""

    // pull out the modals because they must preceed the prefix
    // also include "to"
    val modals = extr.rel.nodes.filter(node => (node.postag startsWith "MD") ||
      (node.postag == "TO"))

    // horrible escape is required. See JavaDoc for Match.replaceAll
    // or https://issues.scala-lang.org/browse/SI-5437
    var rel = group.replaceAllIn(template, (gm: Regex.Match) => matchGroup(gm.group(1))
      .replaceAll("_", " ")
      .replaceAll("""\\""", """\\\\""")
      .replaceAll("""\$""", """\\\$"""))

    if (!prefix.isEmpty) {
      if (modals.isEmpty) {
        rel = prefix + " " + rel
      } else {
        // insert the prefix directly after the last modal instead
        val regex = new Regex("(^.*\\b(?:" + modals.iterator.map(_.text).mkString("|") + "))\\b")
        rel = regex.replaceAllIn(rel, "$1 " + prefix)
      }
    }

    extr.replaceRelation(rel)
  }

  override def toString = (if (be) "be " else "") + template

  /** The inverse of Template.deserialize. */
  def serialize = this.toString
}
144 |
object Template {
  // matches a template placeholder such as {rel}, capturing its name
  val group = """\{(.*?)}""".r

  /** Parse a serialized template; a leading "be " sets the be flag. */
  def deserialize(string: String) =
    if (string startsWith "be ") Template(string.drop(3), true)
    else Template(string, false)
}
156 |
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/template/CountsToConfidence.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse.template
2 |
3 | import scopt.mutable.OptionParser
4 | import java.io.File
5 | import edu.knowitall.common.Resource.using
6 | import scala.io.Source
7 | import java.io.PrintWriter
8 |
/** Convert a file of (pattern, count) rows into (pattern, confidence)
  * rows by normalizing each count by the maximum count in the file. */
object CountsToConfidence {
  abstract class Settings {
    // input file of tab-separated pattern, count pairs
    def sourceFile: File
    // optional output file; stdout when None
    def destFile: Option[File]
  }

  def main(args: Array[String]) = {
    object settings extends Settings {
      var sourceFile: File = _
      var destFile: Option[File] = None
    }

    val parser = new OptionParser("convertconf") {
      arg("source", "file with pattern, count pairs", { path: String => settings.sourceFile = new File(path) })
      argOpt("dest", "optional parameter to specify output to a file", { path: String => settings.destFile = Some(new File(path)) })
    }

    if (parser.parse(args)) {
      run(settings)
    }
  }

  def run(settings: Settings) = {
    using (Source.fromFile(settings.sourceFile)) { source =>
      using (
        settings.destFile match {
          case Some(file) => new PrintWriter(file)
          case None => new PrintWriter(System.out)
        }
      ) { output =>
        // the first line is echoed through unchanged (treated as a header)
        // NOTE(review): `it.next` throws on an empty input file -- confirm
        // inputs always contain at least one line.
        val lines = {
          val it = source.getLines
          val first = it.next
          output.println(first)
          it.toList
        }

        // the largest count, used to normalize each confidence into (0, 1]
        val max = lines.map(_.split("\t").last.toInt).max

        for (line <- lines) {
          val parts = line.split("\t")
          val count = parts.last.toInt
          // replace the trailing count column with count / max (4 decimals)
          output.println(parts.take(parts.length - 1).mkString("\t") + "\t" + ("%1.4f" format (count.toDouble / max.toDouble)))
        }
      }
    }
  }
}
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/template/GeneralizeTemplate.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse.template
2 |
3 | import java.io.{PrintWriter, File}
4 |
5 | import scala.collection.immutable
6 | import scala.io.Source
7 |
8 | import edu.knowitall.collection.immutable.graph.pattern.{NodeMatcher, ConjunctiveNodeMatcher}
9 | import edu.knowitall.common.Resource.using
10 | import edu.knowitall.openparse.{SlotMatcher, RelationMatcher, ExtractorPattern, ExtractionPartMatcher}
11 | import edu.knowitall.tool.parse.graph.{RegexNodeMatcher, PostagNodeMatcher, DependencyPattern, DependencyNode}
12 | import edu.knowitall.ollie.Ollie.stemmer
13 |
14 | import scopt.OptionParser
15 |
/** Generalize semantic restrictions to semantic classes.
 *
 * @author Michael Schmitz
 */
object GeneralizeTemplates {
  abstract class Settings {
    // input file of template, pattern, count triples
    def sourceFile: File
    // optional output file; stdout when None
    def destFile: Option[File]

    // the semantic classes we know how to generalize to
    val categories = List("person", "location")
  }

  def main(args: Array[String]) = {
    object settings extends Settings {
      var sourceFile: File = null
      var destFile: Option[File] = None
    }

    val parser = new OptionParser("buildtemp") {
      arg("source", "file with source relation, pattern pairs", { path: String => settings.sourceFile = new File(path) })
      argOpt("dest", "optional parameter to specify output to a file", { path: String => settings.destFile = Some(new File(path)) })
    }

    if (parser.parse(args)) {
      run(settings)
    }
  }

  /** Extract the postag and lemma list from a conjunctive part matcher
    * that restricts both postag and lemmas, otherwise None. */
  def lexicalRestrictions(extractionPartMatcher: ExtractionPartMatcher) = {
    extractionPartMatcher.matcher match {
      case m: ConjunctiveNodeMatcher[_] =>
        val postag = (m.matchers.collect { case m: PostagNodeMatcher => m } head).postag
        val lemmas = (m.matchers.collect { case m: RegexNodeMatcher => m } head).regex.toString.split("\\|").toSeq
        Some(postag, lemmas)
      case _ => None
    }
  }

  /** A named semantic class and its member lemmas. */
  case class Category(name: String, elements: Set[String]) {
    override def toString = "Category(" + name + ")"
  }

  /** Load each named category's members from a classpath resource.
    * NOTE(review): loads "categories/<name>.txt"; the resources shown in
    * this repository live under edu/knowitall/openparse/categories --
    * confirm the classloader path resolves. */
  def loadCategories(categories: Seq[String]) = {
    def loadCategory(name: String) = {
      val elements =
        using(this.getClass.getClassLoader.getResourceAsStream("categories/" + name + ".txt")) { stream =>
          using(Source.fromInputStream(stream)) { source =>
            source.getLines().toSet
          }
        }

      Category(name, elements)
    }

    (for (cat <- categories) yield (loadCategory(cat))).toList
  }

  def run(settings: Settings) {
    val categories = loadCategories(settings.categories)

    // replace a lexical restriction with a broader category-based one when
    // the observed lemmas overlap a known category strongly enough
    def generalize(matcher: NodeMatcher[DependencyNode], postag: String, lemmas: Set[String]) = {
      // overlap score between a category and the observed lemmas;
      // categories sharing fewer than 5 lemmas score 0.
      // Fix: a duplicate, discarded computation of the ratio that preceded
      // the conditional was removed (it had no effect).
      def distance(cat: Category) = {
        val intersectSize = (cat.elements intersect lemmas).size
        if (intersectSize < 5) 0.0
        else intersectSize.toDouble / lemmas.size.toDouble
      }
      // don't generalize thin evidence
      if (lemmas.size < 10) matcher
      else {
        postag match {
          case "NN" | "NNS" =>
            val overlaps = categories map (cat => (cat, distance(cat))) sortBy (-_._2)
            if (overlaps.iterator.map(_._2).sum > 0.75) {
              // union the strongly-overlapping categories with any lemmas
              // they don't already cover
              val categories = overlaps.filter(_._2 > 0.10).map(_._1)
              val uncategorized = lemmas -- categories.flatMap(_.elements)
              val elements = immutable.SortedSet[String]() ++ categories.flatMap(_.elements) ++ uncategorized
              new ConjunctiveNodeMatcher(new PostagNodeMatcher(postag), new RegexNodeMatcher(elements.mkString("|").r))
            } else matcher
          case m => matcher
        }
      }
    }

    var templates =
      using(Source.fromFile(settings.sourceFile, "UTF8")) { source =>
        source.getLines().map { line =>
          val Array(template, pattern, count) = line.split("\t")
          ((template, new ExtractorPattern(DependencyPattern.deserialize(pattern))), count.toInt)
        }.toList
      }

    // generalize the slot and relation matchers of each pattern
    templates = templates.map {
      case ((template, pattern), count) =>
        val matchers = pattern.matchers.map { matcher =>
          matcher match {
            case m: ExtractionPartMatcher if m.isInstanceOf[SlotMatcher] || m.isInstanceOf[RelationMatcher] =>
              lexicalRestrictions(m) match {
                case Some((postag, lemmas)) => m.withMatcher(generalize(m.matcher, postag, lemmas.toSet))
                case None => m
              }
            case m => m
          }
        }

        ((template, new ExtractorPattern(matchers)), count)
    }

    using (
      settings.destFile match {
        case Some(file) => new PrintWriter(file, "UTF8")
        case None => new PrintWriter(System.out)
      })
    { writer =>
      templates map { case ((template, pattern), count) => Iterable(template, pattern, count).mkString("\t") } foreach writer.println
    }
  }
}
133 |
--------------------------------------------------------------------------------
/core/src/main/scala/edu/knowitall/openparse/template/PassiveReflections.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse.template
2 |
3 | import java.io.{PrintWriter, File}
4 | import scala.Option.option2Iterable
5 | import scala.annotation.elidable
6 | import scala.collection.immutable
7 | import scala.io.Source
8 | import scala.util.matching.Regex
9 | import org.slf4j.LoggerFactory
10 | import edu.knowitall.collection.immutable.graph.pattern.{Matcher, ConjunctiveNodeMatcher, DirectedEdgeMatcher, CaptureEdgeMatcher}
11 | import edu.knowitall.collection.immutable.Bag
12 | import edu.knowitall.common.Resource.using
13 | import edu.knowitall.common.enrich.Traversables.traversableOncePairTo
14 | import edu.knowitall.openparse.{SlotMatcher, RelationMatcher, ExtractorPattern, ArgumentMatcher}
15 | import edu.knowitall.tool.parse.graph.{RegexNodeMatcher, RegexEdgeMatcher, PostagNodeMatcher, LabelEdgeMatcher, DependencyPattern, DependencyNode}
16 | import edu.knowitall.tool.postag.Postagger
17 | import edu.knowitall.ollie.Ollie.stemmer
18 | import scalaz.Scalaz._
19 | import scalaz._
20 | import scopt.OptionParser
21 | import edu.knowitall.collection.immutable.graph.pattern.CaptureNodeMatcher
22 |
/** A main method for adding active and passive conversions
  * of patterns to a model file.  BuildTemplates removes
  * duplicate patterns, removing a lot of garbage but also
  * the active/passive conversions.
  *
  * @author Michael Schmitz
  */
object PassiveReflections {
  val logger = LoggerFactory.getLogger(this.getClass)

  /** Command-line settings: the input model file and an optional output file. */
  abstract class Settings {
    def sourceFile: File
    def destFile: Option[File]
  }

  def main(args: Array[String]) {
    val settings = new Settings {
      var sourceFile: File = null
      var destFile: Option[File] = None
    }

    val parser = new OptionParser("passivemodel") {
      arg("source", "input model file", { path: String => settings.sourceFile = new File(path) })
      argOpt("dest", "output model file", { path: String => settings.destFile = Some(new File(path)) })
    }

    if (parser.parse(args)) {
      run(settings)
    }
  }

  def run(settings: Settings) {
    // Swap the arg1 and arg2 capture nodes of a pattern, turning an
    // active-voice pattern into its passive reflection (and vice versa).
    def switchArgs(pattern: ExtractorPattern) = {
      // Locate a capture node by alias, failing with a descriptive message
      // (previously a bare `.get` that threw NoSuchElementException).
      def capture(alias: String) =
        pattern.matchers.find {
          case m: CaptureNodeMatcher[_] => m.alias == alias
          case _ => false
        }.getOrElse(throw new IllegalArgumentException("no capture node '" + alias + "' in pattern: " + pattern))

      val arg1 = capture("arg1")
      val arg2 = capture("arg2")

      new ExtractorPattern(pattern.matchers.map {
        case m: CaptureNodeMatcher[_] if m.alias == "arg1" => arg2
        case m: CaptureNodeMatcher[_] if m.alias == "arg2" => arg1
        case m => m
      })
    }

    // Read the model.  "UTF8" matches the encoding used when model files
    // are written below and by the template tools, instead of the
    // platform default charset.
    // NOTE(review): the first line is skipped, presumably a header row --
    // confirm against the model-file format.
    val patterns = using {
      Source.fromFile(settings.sourceFile, "UTF8")
    } { source =>
      source.getLines.drop(1).map { line =>
        val Array(template, pattern, count) = line.split("\t")
        (template, new ExtractorPattern(DependencyPattern.deserialize(pattern)), count)
      }.toList
    }

    using(
      settings.destFile match {
        case Some(file) => new PrintWriter(file, "UTF8")
        case None => new PrintWriter(System.out)
      }) { output =>
      patterns.foreach {
        case (template, pattern, count) =>
          // Always echo the original row.
          output.println(Iterable(template, pattern, count).mkString("\t"))

          if (pattern.baseEdgeMatchers.exists { case m: LabelEdgeMatcher => m.label == "nsubj" case _ => false }) {
            // Active-voice pattern: print the passive conversion.
            if (!(template startsWith "be ")) {
              output.println(Iterable("be " + template, switchArgs(pattern), count).mkString("\t"))
            }
          } else if (pattern.baseEdgeMatchers.exists { case m: LabelEdgeMatcher => m.label == "nsubjpass" case _ => false }) {
            // Passive-voice pattern: print the active conversion.
            if (template startsWith "be ") {
              output.println(Iterable(template.drop(3), switchArgs(pattern), count).mkString("\t"))
            }
          }
      }
    }
  }
}
--------------------------------------------------------------------------------
/core/src/test/resources/logback-test.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
6 |
7 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/core/src/test/scala/edu/knowitall/common/enrich/TraversableSpecTest.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.common.enrich
2 |
3 | import edu.knowitall.collection.immutable.Bag
4 |
5 | import org.junit.runner.RunWith
6 | import org.specs2.mutable.Specification
7 | import org.specs2.runner.JUnitRunner
8 |
@RunWith(classOf[JUnitRunner])
object TraversableSpecTest extends Specification {
  import Traversables._

  "simple histogram works fine" in {
    val ordered = List(1, 2, 2, 3, 3, 3).histogram
    val shuffled = List(3, 2, 1, 3, 2, 3).histogram
    // A histogram is independent of the element order.
    ordered must_== shuffled
    ordered must haveTheSameElementsAs(List((1, 1), (2, 2), (3, 3)))
  }

  "histogram from partials works fine" in {
    val partials = List((1, 1), (2, 2), (2, 2), (3, 3), (3, 3), (3, 3))
    val merged = partials.mergeHistograms
    val mergedReversed = partials.reverse.mergeHistograms
    val mergedByKey = partials.mergeKeys(_ + _)
    // Merging is order-independent and agrees with the generic key merge.
    merged must_== mergedReversed
    merged must_== mergedByKey
    merged must haveTheSameElementsAs(List((1, 1), (2, 4), (3, 9)))
  }

  "list multimaps works fine" in {
    val pairs = List(1 -> 1, 1 -> 2, 1 -> 1, 2 -> 2)
    val multimap = pairs.toListMultimap

    // List multimaps keep duplicates and insertion order.
    multimap must haveTheSameElementsAs(Map(1 -> List(1, 2, 1), 2 -> List(2)))

    val withExtra = multimap.toSeq :+ (1 -> List(2, 3, 4, 5))
    val combined = withExtra.mergeKeys(_ ++ _)

    combined must haveTheSameElementsAs(Map(1 -> List(1, 2, 1, 2, 3, 4, 5), 2 -> List(2)))
  }

  "set multimaps works fine" in {
    val pairs = List(1 -> 1, 1 -> 2, 1 -> 1, 2 -> 2)
    val multimap = pairs.toSetMultimap

    // Set multimaps collapse duplicate values.
    multimap must haveTheSameElementsAs(Map(1 -> Set(1, 2), 2 -> Set(2)))

    val withExtra = multimap.toSeq :+ (1 -> Set(2, 3, 4, 5))
    val combined = withExtra.mergeKeys(_ ++ _)

    combined must haveTheSameElementsAs(Map(1 -> Set(1, 2, 3, 4, 5), 2 -> Set(2)))
  }

  "bag multimaps works fine" in {
    val pairs = List(1 -> 1, 1 -> 2, 1 -> 1, 2 -> 2)
    val multimap = pairs.toBagMultimap

    // Bag multimaps keep duplicate values but are unordered.
    multimap must haveTheSameElementsAs(Map(1 -> Bag(1, 1, 2), 2 -> Bag(2)))

    val withExtra = multimap.toSeq :+ (1 -> Bag(2, 3, 4, 5))
    val combined = withExtra.mergeKeys(_ ++ _)

    combined must haveTheSameElementsAs(Map(1 -> Bag(1, 1, 2, 2, 3, 4, 5), 2 -> Bag(2)))
  }
}
66 |
--------------------------------------------------------------------------------
/core/src/test/scala/edu/knowitall/ollie/DependencyGraphExtrasSpec.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.ollie
2 |
3 | import org.junit.runner.RunWith
4 | import org.specs2.mutable.Specification
5 | import org.specs2.runner.JUnitRunner
6 | import edu.knowitall.tool.parse.graph.DependencyGraph
7 |
@RunWith(classOf[JUnitRunner])
object DependencyGraphExtrasTest extends Specification {
  // NOTE(review): the serialized graphs below are copied verbatim from
  // parser output; any byte change breaks the equality assertions.

  "switch to passive voice works" in {
    // Active-voice graph for "Michael hit the ball."
    val graph = DependencyGraph.deserialize("nsubj(hit_VBD_1_8, Michael_NNP_0_0); dobj(hit_VBD_1_8, ball_NN_3_16); punct(hit_VBD_1_8, ._._4_20); det(ball_NN_3_16, the_DT_2_12)")
    val extras = new DependencyGraphExtras(graph)

    val switched = extras.switchVoice

    // Exactly one reflected graph is produced, and it serializes to the
    // expected passive-voice form ("... was hit by Michael").
    switched.size must_== 1
    switched.head.serialize must_== "det(ball_NN_1_4, the_DT_0_0); auxpass(hit_VBD_2_13, was_VBD_1_9); nsubjpass(hit_VBD_2_13, ball_NN_1_4); prep(hit_VBD_2_13, by_IN_3_17); punct(hit_VBD_2_13, ._._6_28); pobj(by_IN_3_17, Michael_NNP_4_20)"
  }

  "switch to active voice works" in {
    // Passive-voice graph for "The ball was hit by Michael."
    val graph = DependencyGraph.deserialize("det(ball_NN_1_4, The_DT_0_0); nsubjpass(hit_VBN_3_13, ball_NN_1_4); auxpass(hit_VBN_3_13, was_VBD_2_9); prep(hit_VBN_3_13, by_IN_4_17); punct(hit_VBN_3_13, ._._6_27); pobj(by_IN_4_17, Michael_NNP_5_20)")
    val extras = new DependencyGraphExtras(graph)

    val switched = extras.switchVoice

    // One reflected graph, serialized in active-voice form.
    switched.size must_== 1
    switched.head.serialize must_== "nsubj(hit_VBN_1_8, Michael_NNP_0_0); dobj(hit_VBN_1_8, ball_NN_3_16); punct(hit_VBN_1_8, ._._4_21); det(ball_NN_3_16, The_DT_2_12)"
  }
}
30 |
--------------------------------------------------------------------------------
/core/src/test/scala/edu/knowitall/ollie/confidence/OllieFeatureSetSpec.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.ollie.confidence
2 |
3 | import org.junit._
4 | import org.junit.Assert._
5 | import org.specs2.mutable.Specification
6 | import org.junit.runner.RunWith
7 | import org.specs2.runner.JUnitRunner
8 | import edu.knowitall.tool.parse.graph.DependencyGraph
9 | import edu.knowitall.ollie.Ollie
10 | import edu.knowitall.ollie.OllieExtractionInstance
11 | import edu.knowitall.ollie.ScoredOllieExtractionInstance
12 | import edu.knowitall.openparse.OpenParse
13 | import org.junit.runner.RunWith
14 | import org.specs2.runner.JUnitRunner
15 |
@RunWith(classOf[JUnitRunner])
object OllieFeatureSetSpec extends Specification {
  // Extractor backed by the default model bundled with the jar.
  val ollie = new Ollie(OpenParse.withDefaultModel())

  "if right before arg1" in {
    // Pre-parsed graph copied verbatim from parser output; the sentence
    // contains an "if" token directly before an extraction argument.
    val graph = DependencyGraph.deserialize("poss(father_NN_2_12, his_PRP$_1_8); punct(father_NN_2_12, ,_,_3_19); appos(father_NN_2_12, Whitechapel_NNP_4_21); punct(father_NN_2_12, ,_,_5_33); advmod(betrays_VBZ_6_35, However_RB_0_0); nsubj(betrays_VBZ_6_35, father_NN_2_12); dobj(betrays_VBZ_6_35, whereabouts_NN_8_47); punct(betrays_VBZ_6_35, ,_,_9_59); xcomp(betrays_VBZ_6_35, fearing_VBG_10_61); punct(betrays_VBZ_6_35, ._._27_149); poss(whereabouts_NN_8_47, his_PRP$_7_43); ccomp(fearing_VBG_10_61, die_VB_15_87); poss(son_NN_13_78, his_PRP$_12_74); complm(die_VB_15_87, that_IN_11_69); nsubj(die_VB_15_87, son_NN_13_78); aux(die_VB_15_87, will_MD_14_82); advcl(die_VB_15_87, captured_VBN_20_104); mark(captured_VBN_20_104, if_IN_16_91); nsubjpass(captured_VBN_20_104, he_PRP_17_94); auxpass(captured_VBN_20_104, is_VBZ_18_97); neg(captured_VBN_20_104, not_RB_19_100); cc(captured_VBN_20_104, and_CC_21_113); conj(captured_VBN_20_104, returned_VBN_22_117); dobj(captured_VBN_20_104, home_NN_23_126); prep(captured_VBN_20_104, to_TO_24_131); pobj(to_TO_24_131, plantation_NN_26_138); det(plantation_NN_26_138, the_DT_25_134)")
    val extrs = ollie.extract(graph)

    // NOTE(review): assumes the third extraction (index 2) is the one
    // whose arg1 is preceded by "if" -- this is tied to the model's
    // extraction order; verify if the bundled model changes.
    val extr = extrs.toSeq(2)
    OllieFeatures.ifRightBeforeArg1(extr) must_== 1.0
  }
}
28 |
--------------------------------------------------------------------------------
/core/src/test/scala/edu/knowitall/openparse/BuildPatternsSpec.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse
2 |
3 | import org.junit._
4 | import org.junit.Assert._
5 | import org.specs2.mutable.Specification
6 | import org.junit.runner.RunWith
7 | import org.specs2.runner.JUnitRunner
8 |
9 | import edu.knowitall.tool.parse.graph.DependencyGraph
10 | import edu.knowitall.tool.stem.MorphaStemmer
11 |
@RunWith(classOf[JUnitRunner])
object BuildPatternsSpecTest extends Specification {
  /** Deserialize the pickled graph in a test row and search it for
    * relation patterns connecting the arguments.
    *
    * @param row (relation, arg1, arg2, space-separated lemmas, pickled graph)
    * @param maxLength optional cap on the pattern length
    * @return the patterns found, paired with auxiliary match data
    */
  def findPatterns(row: (String, String, String, String, String), maxLength: Option[Int] = None) = {
    val (rel, arg1, arg2, lemmasString, pickled) = row
    val lemmas = lemmasString.split("\\s+").toSet
    val graph = DependencyGraph.deserialize(pickled).map(_.lemmatize(MorphaStemmer)).normalize
    BuildPatterns.findRelationPatterns(graph, rel, arg1, arg2, lemmas, maxLength)
  }

  "A pattern is found when the argument overlap" in {
    // Fixed: this was `val row, (arg1, rel, arg2, lemmas, pickled) = ...`,
    // which bound the tuple twice and mislabeled its components (the first
    // element is the relation, not arg1); the extra bindings were unused.
    val row = ("be marry to", "hillary clinton", "bill clinton", "hillary clinton marry bill", "cc(married_VBN_11_0, And_CC_0_0); nn(Clinton_NNP_2_0, Hillary_NNP_1_0); nsubjpass(married_VBN_11_0, Clinton_NNP_2_0); punct(Clinton_NNP_2_0, _,_3_0); dep(know_VBP_8_0, who_WP_4_0); punct(know_VBP_8_0, _,_5_0); mark(know_VBP_8_0, as_IN_6_0); nsubj(know_VBP_8_0, we_PRP_7_0); rcmod(Clinton_NNP_2_0, know_VBP_8_0); punct(Clinton_NNP_2_0, _,_9_0); auxpass(married_VBN_11_0, is_VBZ_10_0); nn(Clinton_NNP_14_0, Bill_NNP_13_0); prep_to(married_VBN_11_0, Clinton_NNP_14_0); punct(married_VBN_11_0, ._._15_0)")
    val patterns = findPatterns(row)
    patterns.size must_== 1
    patterns.head._1.toString must_== "{arg1} prep_to> {arg2}"
  }

  "A pattern is found with exactly one slot" in {
    val row = ("arrive in", "barack obama", "afghanistan", "barack obama arrive afghanistan", "(to_TO_4_0), (in_IN_12_0), (on_IN_14_0), (or_CC_16_0), (for_IN_20_0), (to_TO_23_0), (and_CC_27_0), (in_IN_29_0), (of_IN_34_0), (from_IN_38_0), poss(trip_NN_3_0, his_PRP$_1_0); amod(trip_NN_3_0, two-day_JJ_2_0); pobj(After_IN_0_0, trip_NN_3_0); prep_to(trip_NN_3_0, Afghanistan_NNP_5_0); punct(trip_NN_3_0, ,_,_6_0); nn(Obama_NNP_10_0, U.S._NNP_7_0); nn(Obama_NNP_10_0, Senator_NNP_8_0); nn(Obama_NNP_10_0, Barack_NNP_9_0); nsubj(arrived_VBD_11_0, Obama_NNP_10_0); rcmod(trip_NN_3_0, arrived_VBD_11_0); prep_in(arrived_VBD_11_0, Iraq_NNP_13_0); prep_on(arrived_VBD_11_0, Monday_NNP_15_0); prep_on(arrived_VBD_11_0, July_NNP_17_0); conj_or(Monday_NNP_15_0, July_NNP_17_0); num(July_NNP_17_0, 21_CD_18_0); punct(trip_NN_3_0, ,_,_19_0); det(visit_NN_22_0, a_DT_21_0); prep_for(trip_NN_3_0, visit_NN_22_0); det(East_NNP_26_0, the_DT_24_0); nn(East_NNP_26_0, Middle_NNP_25_0); prep_to(visit_NN_22_0, East_NNP_26_0); prep_to(visit_NN_22_0, Europe_NNP_28_0); conj_and(East_NNP_26_0, Europe_NNP_28_0); poss(capacity_NN_31_0, his_PRP$_30_0); prep_in(visit_NN_22_0, capacity_NN_31_0); det(member_NN_33_0, a_DT_32_0); dep(capacity_NN_31_0, member_NN_33_0); det(Senate_NNP_37_0, the_DT_35_0); nn(Senate_NNP_37_0, U.S._NNP_36_0); prep_of(member_NN_33_0, Senate_NNP_37_0); prep_from(member_NN_33_0, Illinois_NNP_39_0); punct(After_IN_0_0, ._._40_0)")
    val patterns = findPatterns(row)
    patterns.size must_== 1
    patterns.head._1.toString must_== "{arg1} prep_to> {arg2}"
  }

  "A pattern is NOT found because of a length restriction" in {
    // Same row as above, but capped at length 2.
    val row = ("arrive in", "barack obama", "afghanistan", "barack obama arrive afghanistan", "(to_TO_4_0), (in_IN_12_0), (on_IN_14_0), (or_CC_16_0), (for_IN_20_0), (to_TO_23_0), (and_CC_27_0), (in_IN_29_0), (of_IN_34_0), (from_IN_38_0), poss(trip_NN_3_0, his_PRP$_1_0); amod(trip_NN_3_0, two-day_JJ_2_0); pobj(After_IN_0_0, trip_NN_3_0); prep_to(trip_NN_3_0, Afghanistan_NNP_5_0); punct(trip_NN_3_0, ,_,_6_0); nn(Obama_NNP_10_0, U.S._NNP_7_0); nn(Obama_NNP_10_0, Senator_NNP_8_0); nn(Obama_NNP_10_0, Barack_NNP_9_0); nsubj(arrived_VBD_11_0, Obama_NNP_10_0); rcmod(trip_NN_3_0, arrived_VBD_11_0); prep_in(arrived_VBD_11_0, Iraq_NNP_13_0); prep_on(arrived_VBD_11_0, Monday_NNP_15_0); prep_on(arrived_VBD_11_0, July_NNP_17_0); conj_or(Monday_NNP_15_0, July_NNP_17_0); num(July_NNP_17_0, 21_CD_18_0); punct(trip_NN_3_0, ,_,_19_0); det(visit_NN_22_0, a_DT_21_0); prep_for(trip_NN_3_0, visit_NN_22_0); det(East_NNP_26_0, the_DT_24_0); nn(East_NNP_26_0, Middle_NNP_25_0); prep_to(visit_NN_22_0, East_NNP_26_0); prep_to(visit_NN_22_0, Europe_NNP_28_0); conj_and(East_NNP_26_0, Europe_NNP_28_0); poss(capacity_NN_31_0, his_PRP$_30_0); prep_in(visit_NN_22_0, capacity_NN_31_0); det(member_NN_33_0, a_DT_32_0); dep(capacity_NN_31_0, member_NN_33_0); det(Senate_NNP_37_0, the_DT_35_0); nn(Senate_NNP_37_0, U.S._NNP_36_0); prep_of(member_NN_33_0, Senate_NNP_37_0); prep_from(member_NN_33_0, Illinois_NNP_39_0); punct(After_IN_0_0, ._._40_0)")
    val patterns = findPatterns(row, Some(2))
    patterns.size must_== 0
  }

  // rel rel
  "A pattern is found" in {
    val row = ("be bear a", "queequag", "slave", "bear queequag slave", "(in_IN_5_0), (._._7_0), nsubjpass(born_VBN_2_0, Queequag_NNP_0_0); auxpass(born_VBN_2_0, was_VBD_1_0); dobj(born_VBN_2_0, slave_NN_4_0); det(slave_NN_4_0, a_DT_3_0); prep_in(slave_NN_4_0, Africa_NNP_6_0)")
    val patterns = findPatterns(row, Some(2))
    patterns.size must_== 1
    patterns.head._1.toString must_== "{arg1} dobj> {arg2}"
  }

  // NOTE(review): the next four examples share the same description
  // string; distinct titles would make failure reports clearer, but the
  // strings are left unchanged to preserve spec output.
  "A single pattern is found with a slot instead of a rel rel" in {
    val row = ("be elect president of", "barack obama", "unite state", "barack obama unite state elect president", "(of_IN_5_0), (._._9_0), nn(Obama_NNP_1_0, Barack_NNP_0_0); nsubjpass(elected_VBN_3_0, Obama_NNP_1_0); auxpass(elected_VBN_3_0, was_VBD_2_0); dobj(elected_VBN_3_0, president_NN_4_0); prep_of(president_NN_4_0, States_NNPS_8_0); det(States_NNPS_8_0, the_DT_6_0); nn(States_NNPS_8_0, United_NNP_7_0)")
    val patterns = findPatterns(row)
    patterns.size must_== 1
    patterns.head._1.toString must_== "{arg1} dobj> {rel1:postag=NN} >prep_of> {arg2}"
  }

  "A single pattern is found with a slot instead of a rel rel" in {
    val row = ("be team locate in", "mariner", "seattle", "mariner team locate seattle", "(in_IN_6_0), (._._8_0), det(Mariners_NNPS_1_0, The_DT_0_0); nsubj(team_NN_4_0, Mariners_NNPS_1_0); cop(team_NN_4_0, are_VBP_2_0); det(team_NN_4_0, a_DT_3_0); partmod(team_NN_4_0, located_VBN_5_0); prep_in(located_VBN_5_0, Seattle_NNP_7_0)")
    val patterns = findPatterns(row)
    patterns.head._1.toString must_== "{arg1} partmod> {rel1:postag=VBN} >prep_in> {arg2}"
  }

  "A single pattern is found with a slot instead of a rel rel" in {
    val row = ("be going populate", "human", "earth", "human go populate earth", "(._._7_0), nsubj(going_VBG_2_0, Humans_NNS_0_0); aux(going_VBG_2_0, are_VBP_1_0); xcomp(going_VBG_2_0, populate_VB_4_0); aux(populate_VB_4_0, to_TO_3_0); dobj(populate_VB_4_0, earth_NN_6_0); det(earth_NN_6_0, the_DT_5_0)")
    val patterns = findPatterns(row)
    patterns.size must_== 1
    patterns.head._1.toString must_== "{arg1} xcomp> {rel:postag=VB} >dobj> {arg2}"
  }

  "A single pattern is found with a slot instead of a rel rel" in {
    val row = ("have crush on", "juliette", "romeo", "juliette have crush romeo", "(on_IN_4_0), (._._6_0), nsubj(has_VBZ_1_0, Juliette_NNP_0_0); dobj(has_VBZ_1_0, crush_NN_3_0); det(crush_NN_3_0, a_DT_2_0); prep_on(crush_NN_3_0, Romeo_NNP_5_0)")
    val patterns = findPatterns(row)
    patterns.size must_== 1
    patterns.head._1.toString must_== "{arg1} dobj> {rel1:postag=NN} >prep_on> {arg2}"
  }
}
76 |
--------------------------------------------------------------------------------
/core/src/test/scala/edu/knowitall/openparse/ExtractorPatternSpec.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse
2 |
3 | import org.junit._
4 | import org.junit.Assert._
5 | import org.specs2.mutable.Specification
6 | import org.junit.runner.RunWith
7 | import org.specs2.runner.JUnitRunner
8 |
9 | import edu.knowitall.tool.parse.graph.DependencyPattern
10 | import edu.knowitall.ollie.Ollie.stemmer
11 |
@RunWith(classOf[JUnitRunner])
object ExtractorPatternSpecTest extends Specification {
  /** Register one example asserting the `symmetric` property of the
    * extractor pattern deserialized from `pattern`. */
  def testSymmetric(pattern: String, symmetric: Boolean) {
    val label = pattern + " is " + (if (symmetric) "symmetric" else "not symmetric")
    label in {
      val deserialized = new ExtractorPattern(DependencyPattern.deserialize(pattern))
      deserialized.symmetric must be_==(symmetric)
    }
  }

  testSymmetric("{arg1} dobj> {arg2}", false)
  testSymmetric("{arg1} nsubj> {arg2}", true)
  testSymmetric("{arg1} prep_of> {arg2}", true)
  testSymmetric("{rel:postag=NN} nn> {arg2}", false)
}
25 |
--------------------------------------------------------------------------------
/core/src/test/scala/edu/knowitall/openparse/OllieSpec.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse
2 |
3 | import org.junit._
4 | import org.junit.Assert._
5 | import org.specs2.mutable.Specification
6 | import org.junit.runner.RunWith
7 | import org.specs2.runner.JUnitRunner
8 | import edu.knowitall.tool.parse.graph.DependencyGraph
9 | import edu.knowitall.tool.stem.MorphaStemmer
10 | import edu.knowitall.ollie.Ollie
11 | import edu.knowitall.ollie.OllieExtractionInstance
12 | import edu.knowitall.ollie.ScoredOllieExtractionInstance
13 | import edu.knowitall.ollie.confidence.OllieConfidenceFunction
14 |
@RunWith(classOf[JUnitRunner])
object OllieSpecTest extends Specification {
  // Extractor backed by the default model bundled with the jar.
  val ollie = new Ollie(OpenParse.withDefaultModel())
  // Default confidence classifier shipped with the jar.
  val conf = OllieConfidenceFunction.loadDefaultClassifier()

  "Ollie finds an example extraction" in {
    // Pre-parsed graph for "OpenParse finds an example extraction."
    val graph = DependencyGraph.deserialize("(._._5_37), nsubj(finds_VBZ_1_10, OpenParse_NNP_0_0); dobj(finds_VBZ_1_10, extraction_NN_4_27); det(extraction_NN_4_27, an_DT_2_16); nn(extraction_NN_4_27, example_NN_3_19)")
    val extrs = ollie.extract(graph)

    // Round-tripping through tab (de)serialization must be lossless,
    // both for a plain instance and for a scored instance.
    val extr = extrs.head
    extr must_== OllieExtractionInstance.tabDeserialize(extr.tabSerialize)

    val scored = new ScoredOllieExtractionInstance(true, extr)
    scored must_== ScoredOllieExtractionInstance.tabDeserialize(scored.tabSerialize)
  }

  "Ollie confidence function executes" in {
    val graph = DependencyGraph.deserialize("(._._5_37), nsubj(finds_VBZ_1_10, OpenParse_NNP_0_0); dobj(finds_VBZ_1_10, extraction_NN_4_27); det(extraction_NN_4_27, an_DT_2_16); nn(extraction_NN_4_27, example_NN_3_19)")
    val extrs = ollie.extract(graph)
    // Applying the confidence function to every extraction must not throw.
    extrs map conf must not(throwA[Exception])
  }
}
37 |
--------------------------------------------------------------------------------
/core/src/test/scala/edu/knowitall/openparse/OpenParseSpec.scala:
--------------------------------------------------------------------------------
1 | package edu.knowitall.openparse
2 |
3 | import org.junit._
4 | import org.junit.Assert._
5 | import org.specs2.mutable.Specification
6 | import org.junit.runner.RunWith
7 | import org.specs2.runner.JUnitRunner
8 |
9 | import edu.knowitall.tool.parse.graph.DependencyGraph
10 | import edu.knowitall.tool.stem.MorphaStemmer
11 |
@RunWith(classOf[JUnitRunner])
object OpenParseSpecTest extends Specification {
  // Extractor under test, backed by the bundled default model.
  val openparse = OpenParse.withDefaultModel()

  "OpenParse finds an example extraction" in {
    // Pre-parsed graph for "OpenParse finds an example extraction."
    val depGraph = DependencyGraph.deserialize("(._._5_37), nsubj(finds_VBZ_1_10, OpenParse_NNP_0_0); dobj(finds_VBZ_1_10, extraction_NN_4_27); det(extraction_NN_4_27, an_DT_2_16); nn(extraction_NN_4_27, example_NN_3_19)")
    val extractions = openparse.extract(depGraph)

    extractions.size must_== 1
    extractions.head._2.toString must_== "(OpenParse; finds; an example extraction)"
  }
}
24 |
--------------------------------------------------------------------------------
/example/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 | edu.washington.cs.knowitall.ollie
5 | ollie-example
6 | ollie-example
7 | 1.0.0-SNAPSHOT
8 |
9 | edu.washington.cs.knowitall
10 | knowitall-oss
11 | 1.0.2
12 |
13 |
14 | UTF-8
15 | 2.4.0
16 |
17 |
18 |
19 | edu.washington.cs.knowitall.ollie
20 | ollie-core_2.9.2
21 | 1.0.2
22 |
23 |
24 | edu.washington.cs.knowitall.nlptools
25 | nlptools-parse-malt_2.9.2
26 | ${nlptools.version}
27 |
28 |
29 |
30 | ch.qos.logback
31 | logback-classic
32 | 1.0.7
33 |
34 |
35 | ch.qos.logback
36 | logback-core
37 | 1.0.7
38 |
39 |
40 |
41 | src/main/scala
42 | src/test/scala
43 |
44 |
45 | src/main/resources
46 |
47 |
48 |
49 |
50 | net.alchim31.maven
51 | scala-maven-plugin
52 | 3.1.0
53 |
54 |
55 | -deprecation
56 | -unchecked
57 |
58 |
59 |
60 |
61 |
62 | compile
63 | testCompile
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
--------------------------------------------------------------------------------
/example/src/main/java/example/JavaOllieWrapper.java:
--------------------------------------------------------------------------------
1 | package example;
2 |
3 | import java.io.File;
4 | import java.net.MalformedURLException;
5 |
6 | import edu.knowitall.ollie.Ollie;
7 | import edu.knowitall.ollie.OllieExtraction;
8 | import edu.knowitall.ollie.OllieExtractionInstance;
9 | import edu.knowitall.tool.parse.MaltParser;
10 | import edu.knowitall.tool.parse.graph.DependencyGraph;
11 |
12 | /** This is an example class that shows one way of using Ollie from Java. */
13 | public class JavaOllieWrapper {
14 | // the extractor itself
15 | private Ollie ollie;
16 |
17 | // the parser--a step required before the extractor
18 | private MaltParser maltParser;
19 |
20 | // the path of the malt parser model file
21 | private static final String MALT_PARSER_FILENAME = "engmalt.linear-1.7.mco";
22 |
23 | public JavaOllieWrapper() throws MalformedURLException {
24 | // initialize MaltParser
25 | scala.Option nullOption = scala.Option.apply(null);
26 | maltParser = new MaltParser(new File(MALT_PARSER_FILENAME).toURI().toURL(), nullOption);
27 |
28 | // initialize Ollie
29 | ollie = new Ollie();
30 | }
31 |
32 | /**
33 | * Gets Ollie extractions from a single sentence.
34 | * @param sentence
35 | * @return the set of ollie extractions
36 | */
37 | public Iterable extract(String sentence) {
38 | // parse the sentence
39 | DependencyGraph graph = maltParser.dependencyGraph(sentence);
40 |
41 | // run Ollie over the sentence and convert to a Java collection
42 | Iterable extrs = scala.collection.JavaConversions.asJavaIterable(ollie.extract(graph));
43 | return extrs;
44 | }
45 |
46 | public static void main(String args[]) throws MalformedURLException {
47 | System.out.println(JavaOllieWrapper.class.getResource("/logback.xml"));
48 | // initialize
49 | JavaOllieWrapper ollieWrapper = new JavaOllieWrapper();
50 |
51 | // extract from a single sentence.
52 | String sentence = "President Obama will meet with Congressional leaders on Friday, and House Republicans summoned lawmakers back for a Sunday session, in a last-ditch effort to avert a fiscal crisis brought on by automatic tax increases and spending cuts scheduled to hit next week.";
53 | Iterable extrs = ollieWrapper.extract(sentence);
54 |
55 | // print the extractions.
56 | for (OllieExtractionInstance inst : extrs) {
57 | OllieExtraction extr = inst.extr();
58 | System.out.println(extr.arg1().text()+"\t"+extr.rel().text()+"\t"+extr.arg2().text());
59 | }
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/example/src/main/resouces/logback.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | %d{HH:mm:ss.SSS} [%thread] %-5level %logger{36} - %msg%n
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/example/src/main/scala/ollie/Example.scala:
--------------------------------------------------------------------------------
1 | package ollie
2 |
3 | import edu.knowitall.ollie.Ollie
4 | import edu.knowitall.tool.parse.MaltParser
5 | import scala.io.Source
6 | import edu.knowitall.ollie.confidence.OllieConfidenceFunction
7 |
/** This is an example project that takes lines as input from stdin,
  * parses them, runs the Ollie extractor on them, scores the
  * extractions with a confidence function, and then prints the results.
  *
  * You can run this project with the following command:
  *   mvn clean compile exec:java -Dexec.mainClass=ollie.Example
  *
  * You will need to have engmalt.linear-1.7.mco in the base directory
  * of this example for the program to work.  You can download this
  * file from the MaltParser website:
  *
  *   http://www.maltparser.org/mco/english_parser/engmalt.html
  */
object Example extends App {
  val parser = new MaltParser
  val ollie = new Ollie
  val confidence = OllieConfidenceFunction.loadDefaultClassifier()

  // Read stdin line by line, skipping blank lines.
  Source.stdin.getLines.filter(line => !line.trim.isEmpty).foreach { line =>
    val graph = parser.dependencyGraph(line)
    val instances = ollie.extract(graph)

    println("Extractions:")
    instances.foreach { instance =>
      val score = confidence(instance)
      println(("%.2f" format score) + "\t" + instance.extraction)
    }
    println("Waiting for next input...")
  }
}
37 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | 4.0.0
4 | ollie
5 | ollie
6 | 1.0.0-SNAPSHOT
7 | pom
8 | edu.washington.cs.knowitall.ollie
9 |
10 | UTF-8
11 |
12 |
13 | core
14 | app
15 |
16 |
17 |
--------------------------------------------------------------------------------