├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── build.sbt
└── src
    ├── main
        └── java
        │   └── edu
        │       └── washington
        │           └── cs
        │               └── knowitall
        │                   ├── logic
        │                       ├── ArgFactory.java
        │                       ├── Expression.java
        │                       ├── LogicException.java
        │                       ├── LogicExpression.java
        │                       ├── LogicExpressionParser.java
        │                       └── LogicExpressionParsers.java
        │                   └── regex
        │                       ├── Expression.java
        │                       ├── ExpressionFactory.java
        │                       ├── FiniteAutomaton.java
        │                       ├── Match.java
        │                       ├── RegexException.java
        │                       ├── RegularExpression.java
        │                       ├── RegularExpressionParser.java
        │                       └── RegularExpressionParsers.java
    └── test
        ├── java
            └── edu
            │   └── washington
            │       └── cs
            │           └── knowitall
            │               └── regex
            │                   └── MinMaxTest.java
        └── scala
            └── edu
                └── washington
                    └── cs
                        └── knowitall
                            ├── logic
                                ├── LogicTest.scala
                                └── WordLogicTest.scala
                            └── regex
                                ├── RegularExpressionAssertionTest.scala
                                ├── RegularExpressionNamedGroupTest.scala
                                ├── RegularExpressionPermutationTest.scala
                                ├── RegularExpressionUnnamedGroupTest.scala
                                └── WordRegularExpressionTest.scala


/.gitignore:
--------------------------------------------------------------------------------
1 | .classpath
2 | .project
3 | .settings
4 | .cache
5 | target
6 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: scala
2 | scala:
3 |   - "2.10.2"
4 | jdk:
5 |   - oraclejdk7
6 |   - openjdk7
7 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                    GNU LESSER GENERAL PUBLIC LICENSE
  2 |                        Version 3, 29 June 2007
  3 | 
  4 |  Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
  5 |  Everyone is permitted to copy and distribute verbatim copies
  6 |  of this license document, but changing it is not allowed.
  7 | 
  8 | 
  9 |   This version of the GNU Lesser General Public License incorporates
 10 | the terms and conditions of version 3 of the GNU General Public
 11 | License, supplemented by the additional permissions listed below.
 12 | 
 13 |   0. Additional Definitions.
 14 | 
 15 |   As used herein, "this License" refers to version 3 of the GNU Lesser
 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
 17 | General Public License.
 18 | 
 19 |   "The Library" refers to a covered work governed by this License,
 20 | other than an Application or a Combined Work as defined below.
 21 | 
 22 |   An "Application" is any work that makes use of an interface provided
 23 | by the Library, but which is not otherwise based on the Library.
 24 | Defining a subclass of a class defined by the Library is deemed a mode
 25 | of using an interface provided by the Library.
 26 | 
 27 |   A "Combined Work" is a work produced by combining or linking an
 28 | Application with the Library.  The particular version of the Library
 29 | with which the Combined Work was made is also called the "Linked
 30 | Version".
 31 | 
 32 |   The "Minimal Corresponding Source" for a Combined Work means the
 33 | Corresponding Source for the Combined Work, excluding any source code
 34 | for portions of the Combined Work that, considered in isolation, are
 35 | based on the Application, and not on the Linked Version.
 36 | 
 37 |   The "Corresponding Application Code" for a Combined Work means the
 38 | object code and/or source code for the Application, including any data
 39 | and utility programs needed for reproducing the Combined Work from the
 40 | Application, but excluding the System Libraries of the Combined Work.
 41 | 
 42 |   1. Exception to Section 3 of the GNU GPL.
 43 | 
 44 |   You may convey a covered work under sections 3 and 4 of this License
 45 | without being bound by section 3 of the GNU GPL.
 46 | 
 47 |   2. Conveying Modified Versions.
 48 | 
 49 |   If you modify a copy of the Library, and, in your modifications, a
 50 | facility refers to a function or data to be supplied by an Application
 51 | that uses the facility (other than as an argument passed when the
 52 | facility is invoked), then you may convey a copy of the modified
 53 | version:
 54 | 
 55 |    a) under this License, provided that you make a good faith effort to
 56 |    ensure that, in the event an Application does not supply the
 57 |    function or data, the facility still operates, and performs
 58 |    whatever part of its purpose remains meaningful, or
 59 | 
 60 |    b) under the GNU GPL, with none of the additional permissions of
 61 |    this License applicable to that copy.
 62 | 
 63 |   3. Object Code Incorporating Material from Library Header Files.
 64 | 
 65 |   The object code form of an Application may incorporate material from
 66 | a header file that is part of the Library.  You may convey such object
 67 | code under terms of your choice, provided that, if the incorporated
 68 | material is not limited to numerical parameters, data structure
 69 | layouts and accessors, or small macros, inline functions and templates
 70 | (ten or fewer lines in length), you do both of the following:
 71 | 
 72 |    a) Give prominent notice with each copy of the object code that the
 73 |    Library is used in it and that the Library and its use are
 74 |    covered by this License.
 75 | 
 76 |    b) Accompany the object code with a copy of the GNU GPL and this license
 77 |    document.
 78 | 
 79 |   4. Combined Works.
 80 | 
 81 |   You may convey a Combined Work under terms of your choice that,
 82 | taken together, effectively do not restrict modification of the
 83 | portions of the Library contained in the Combined Work and reverse
 84 | engineering for debugging such modifications, if you also do each of
 85 | the following:
 86 | 
 87 |    a) Give prominent notice with each copy of the Combined Work that
 88 |    the Library is used in it and that the Library and its use are
 89 |    covered by this License.
 90 | 
 91 |    b) Accompany the Combined Work with a copy of the GNU GPL and this license
 92 |    document.
 93 | 
 94 |    c) For a Combined Work that displays copyright notices during
 95 |    execution, include the copyright notice for the Library among
 96 |    these notices, as well as a reference directing the user to the
 97 |    copies of the GNU GPL and this license document.
 98 | 
 99 |    d) Do one of the following:
100 | 
101 |        0) Convey the Minimal Corresponding Source under the terms of this
102 |        License, and the Corresponding Application Code in a form
103 |        suitable for, and under terms that permit, the user to
104 |        recombine or relink the Application with a modified version of
105 |        the Linked Version to produce a modified Combined Work, in the
106 |        manner specified by section 6 of the GNU GPL for conveying
107 |        Corresponding Source.
108 | 
109 |        1) Use a suitable shared library mechanism for linking with the
110 |        Library.  A suitable mechanism is one that (a) uses at run time
111 |        a copy of the Library already present on the user's computer
112 |        system, and (b) will operate properly with a modified version
113 |        of the Library that is interface-compatible with the Linked
114 |        Version.
115 | 
116 |    e) Provide Installation Information, but only if you would otherwise
117 |    be required to provide such information under section 6 of the
118 |    GNU GPL, and only to the extent that such information is
119 |    necessary to install and execute a modified version of the
120 |    Combined Work produced by recombining or relinking the
121 |    Application with a modified version of the Linked Version. (If
122 |    you use option 4d0, the Installation Information must accompany
123 |    the Minimal Corresponding Source and Corresponding Application
124 |    Code. If you use option 4d1, you must provide the Installation
125 |    Information in the manner specified by section 6 of the GNU GPL
126 |    for conveying Corresponding Source.)
127 | 
128 |   5. Combined Libraries.
129 | 
130 |   You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 | 
136 |    a) Accompany the combined library with a copy of the same work based
137 |    on the Library, uncombined with any other library facilities,
138 |    conveyed under the terms of this License.
139 | 
140 |    b) Give prominent notice with the combined library that part of it
141 |    is a work based on the Library, and explaining where to find the
142 |    accompanying uncombined form of the same work.
143 | 
144 |   6. Revised Versions of the GNU Lesser General Public License.
145 | 
146 |   The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 | 
151 |   Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 | 
161 |   If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # OpenRegex
  2 | 
  3 | OpenRegex is written by Michael Schmitz at the Turing Center
  4 | <http://turing.cs.washington.edu/>.  It is licensed under the lesser GPL.
  5 | Please see the LICENSE file for more details.
  6 | 
  7 | 
  8 | ## Introduction
  9 | 
 10 | OpenRegex is an efficient and flexible token-based regular expression language
 11 | and engine.  Most regular expression implementations are closed to run only
 12 | over characters.  Although this is the most common application for regular
 13 | expressions, OpenRegex does not have this restriction.  OpenRegex is open to
 14 | any sequences of user-defined objects.
 15 | 
 16 | 
 17 | ## Applied to Natural Language
 18 | 
 19 | For example, OpenRegex is used in the R2A2 extension to ReVerb, an open-domain
 20 | information extractor, to determine argument boundaries.  In this case, tokens
 21 | are words in English sentences with additional information (the string of the
 22 | word, the part-of-speech tag, and the chunk tag).
 23 | 
 24 |     case class WordToken(string: String, postag: String, chunk: String)
 25 | 
 26 | Now that we have defined our token, we can build up a sentence (a NLP library
 27 | such as OpenNLP can help out here).  We will also need to define a way to
 28 | translate each token in the expression (text between <angled brackets>) into
 29 | an expression that can be applied to a word token.
 30 | 
 31 | ```
 32 |   def compile(string: String): RegularExpression[WordToken] = {
 33 |     // create a parser for regular expression language that have
 34 |     // the same token representation
 35 |     val parser =
 36 |       new RegularExpressionParser[WordToken]() {
 37 |         // Translate a string "part=value" into a BaseExpression that
 38 |         // checks whether the part of a WordToken has value 'value'.
 39 |         override def factory(string: String): BaseExpression[WordToken] = {
 40 |           new BaseExpression[WordToken](string) {
 41 |             val Array(part, quotedValue) = string.split("=")
 42 |             val value = quotedValue.drop(1).take(quotedValue.size - 2)
 43 |             override def apply(entity: WordToken) = {
 44 |               part match {
 45 |                 case "string" => entity.string equalsIgnoreCase value
 46 |                 case "postag" => entity.postag equalsIgnoreCase value
 47 |                 case "chunk" => entity.chunk equalsIgnoreCase value
 48 |               }
 49 |             }
 50 |           }
 51 |         }
 52 |       }
 53 | 
 54 |     parser.parse(string)
 55 |   }
 56 | ```
 57 | 
 58 | Now we can compile a regular expression and apply it to a sentence.  Consider
 59 | the following pattern.  The first line defines a non-matching group that
 60 | matches a determiner ("a", "an", or "the").  The second line matches a sequence
 61 | of part-of-speech tags ("JJ" is adjective, "NNP" is proper noun, and "NN" is
 62 | common noun).
 63 | 
 64 |     (?:<string='a'> | <string='an'> | <string='the'>)?
 65 |     <postag="JJ">* <postag='NNP'>+ <postag='NN'>+ <postag='NNP'>+
 66 | 
 67 | We can try applying it to a couple of sentences.
 68 | 
 69 | 1.  The US president Barack Obama is travelling to Mexico.
 70 | 
 71 | ```
 72 |     regex.find(sentence).groups.get(0) matches "The US president Barack Obama"
 73 | ```
 74 | 
 75 | 2.  If all the ice melted from the frigid Earth continent Antarctica, sea
 76 |     levels would rise hundreds of feet.
 77 | 
 78 | ```
 79 |     regex.find(sentence).groups.get(0) matches "the frigid Earth continent Antarctica"
 80 | ```
 81 | 
 82 | We may want to pull out the text from certain parts of our match.  We can do
 83 | this with either named or unnamed groups.  Consider the following new form of
 84 | the pattern and the sentence in example 2.
 85 | 
 86 | ```
 87 |       (?:<string="a"> | <string="an"> | <string="the">)? <postag="JJ">*
 88 |       (<arg1>:<postag='NNP'>+) (<rel>:<postag='NN'>+) (<arg2>:<postag='NNP'>+)
 89 | 
 90 |       regex.find(sentence).groups.get(0) matches "the frigid Earth continent Antarctica"
 91 |       regex.find(sentence).groups.get(1) matches "Earth"
 92 |       regex.find(sentence).groups.get(2) matches "continent"
 93 |       regex.find(sentence).groups.get(3) matches "Antarctica"
 94 | 
 95 |       regex.find(sentence).group("arg1") matches "Earth"
 96 |       regex.find(sentence).group("rel")  matches "continent"
 97 |       regex.find(sentence).group("arg2") matches "Antarctica"
 98 | ```
 99 | 
100 | ## Supported Constructs
101 | 
102 | The regular expression library supports the following constructs.
103 | 
104 | ```
105 |     | alternation
106 |     ? option
107 |     * Kleene-star
108 |     + plus
109 |     ^ beginning
110 |     $ end
111 |     {x,y}     match at least x but not more than y times
112 |     ()        matching groups
113 |     (?:)      non-matching groups
114 |     (<name>:) named groups
115 | ```
116 | 
117 | Most of these operators work the same as in java.util.regex.  Presently,
118 | however, alternation binds to its immediate neighbors.  This means that `<a> <b> | <c>`
119 | means `<a> (?:<b> | <c>)` whereas in Java it would mean `(?:<a> <b>) | <c>`.
120 | This may change in a future release so it is advised that the
121 | alternation arguments be made explicit with non-matching groups.
122 | 
123 | All operators are greedy, and there are no non-greedy counterparts.
124 | Backreferences are not supported because the underlying representation only
125 | supports regular languages (backreferences are not regular).
126 | 
127 | 
128 | ## Simple Java Example
129 | 
130 | The NLP example is rather complex but it shows the power of OpenRegex.  For a
131 | simpler example, look at RegularExpressions.word.  This is a static factory
132 | method for a simple word-based regular expression where only the string is
133 | considered.  This factory is used in the test cases.
134 | 
135 | You can also play around with RegularExpressions.word by running the main
136 | method in RegularExpression and specifying an expression with arg1.
137 | 
138 |     sbt 'run-main edu.washington.cs.knowitall.regex.RegularExpression "<the> <fat>* <cows> <are> <mooing> (?:<loudly>)?"'
139 | 
140 | 
141 | ## Logic Expressions
142 | 
143 | Included is an engine for parsing and evaluating logic expressions.  For
144 | example, you might want to extend the NLP regular expression language to be
145 | able to check multiple fields in a single regular expression token.  If you
146 | assumed each regular expression token to be a logic expression, you could
147 | write patterns such as the following.
148 | 
149 | ```
150 |     <string="the" & postag="DT"> <postag="JJ"> <string="earth" | postag="NNP">
151 | ```
152 | 
153 | Extending the regular expression in this way is easy.  It only involves
154 | rewriting the apply method in BaseExpression inside the compile method.
155 | Most of the code below existed before--now it's just moved outside the
156 | apply method.
157 | 
158 | ```
159 |     val logic = new LogicExpressionParser[WordToken] {
160 |       override def factory(expr: String) = {
161 |         new Arg.Pred[WordToken](expr) {
162 |           val Array(part, quotedValue) = expr.split("=")
163 |           val value = quotedValue.drop(1).take(quotedValue.size - 2)
164 |           override def apply(entity: WordToken) = part match {
165 |             case "string" => entity.string == value
166 |             case "postag" => entity.postag == value
167 |             case "chunk" => entity.chunk == value
168 |           }
169 |         }
170 |       }
171 |     }.parse(value)
172 | 
173 |     override def apply(entity: WordToken) = {
174 |       logic.apply(entity)
175 |     }
176 | ```
177 | 
178 | Play around with logic expressions by using the main method in LogicExpression.
179 | 
180 |     sbt 'run-main edu.washington.cs.knowitall.logic.LogicExpression'
181 |  
182 | You can enter logic expressions such as "true & false" or "true | false" and
183 | have them evaluated interactively.
184 | 
185 | 
186 | ## Implementation
187 | 
188 | Regular expressions are evaluated using Thompson NFA, which is fast and does not have
189 | the pathological cases that most regular expression libraries have.  For more
190 | information about Thompson NFA in comparison to recursive backtracking, read
191 | http://swtch.com/~rsc/regexp/regexp1.html.  Future work may involve compiling
192 | NFAs to DFAs.
193 | 
194 | 
195 | ## Future Work
196 | 
197 | 1.  Compile to DFA.
198 | 2.  Use parser combinators for parsing regular expressions.
199 | 


--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
 1 | organization := "edu.washington.cs.knowitall"
 2 | 
 3 | name := "openregex"
 4 | 
 5 | description := "OpenRegex is an efficient and flexible library for running regular expressions over sequences of user-defined objects."
 6 | 
 7 | version := "1.1.2-SNAPSHOT" // a SNAPSHOT suffix makes publishTo (below) target the snapshots repository
 8 | 
 9 | libraryDependencies ++= Seq("com.google.code.findbugs" % "jsr305" % "2.0.1",
10 |     "com.google.guava" % "guava" % "15.0",
11 |     "org.scala-lang" % "scala-library" % "2.10.2" % "test", // this entry and the ones below are test-scoped
12 |     "junit" % "junit" % "4.10" % "test",
13 |     "org.specs2" % "specs2_2.10" % "2.2.2" % "test",
14 |     "org.scalacheck" % "scalacheck_2.10" % "1.10.1" % "test")
15 | 
16 | licenses := Seq("LGPL (GNU Lesser General Public License)" -> url("http://www.gnu.org/licenses/lgpl.html"))
17 | 
18 | homepage := Some(url("https://github.com/knowitall/openregex"))
19 | 
20 | publishMavenStyle := true
21 | 
22 | publishTo <<= version { (v: String) => // NOTE(review): `<<=` is deprecated in newer sbt releases; confirm the sbt version before migrating
23 |   val nexus = "https://oss.sonatype.org/"
24 |   if (v.trim.endsWith("SNAPSHOT")) // snapshot versions go to the snapshots repository
25 |     Some("snapshots" at nexus + "content/repositories/snapshots")
26 |   else // everything else goes to the release staging endpoint
27 |     Some("releases"  at nexus + "service/local/staging/deploy/maven2")
28 | }
29 | 
30 | pomExtra := (
31 |   <scm>
32 |     <url>https://github.com/knowitall/openregex</url>
33 |     <connection>scm:git://github.com/knowitall/openregex.git</connection>
34 |     <developerConnection>scm:git:git@github.com:knowitall/openregex.git</developerConnection>
35 |     <tag>HEAD</tag>
36 |   </scm>
37 |   <developers>
38 |    <developer>
39 |       <name>Michael Schmitz</name>
40 |     </developer>
41 |   </developers>)
42 | 


--------------------------------------------------------------------------------
/src/main/java/edu/washington/cs/knowitall/logic/ArgFactory.java:
--------------------------------------------------------------------------------
 1 | package edu.washington.cs.knowitall.logic;
 2 | 
 3 | import com.google.common.base.Function;
 4 | 
 5 | /**
 6 |  * An abstract factory class that converts the string representation of
 7 |  * an argument into a token.  This token uses the supplied delegate to
 8 |  * evaluate the expression against an entity into a boolean.
 9 |  *
10 |  * @author Michael Schmitz <schmmd@cs.washington.edu>
11 |  *
12 |  * @param <E>  the type of entity the created arguments are applied to
13 |  */
14 | public abstract class ArgFactory<E> implements Function<String, Expression.Arg<E>> {
15 |     /**
16 |      * Converts the supplied string into an {@link Expression.Arg}.
17 |      */
18 |     public abstract Expression.Arg<E> create(String string);
19 | 
20 |     /**
21 |      * Implements the {@code Function} interface by delegating to {@link #create}.
22 |      */
23 |     @Override
24 |     public Expression.Arg<E> apply(String string) {
25 |         return this.create(string);
26 |     }
27 | }
28 | 


--------------------------------------------------------------------------------
/src/main/java/edu/washington/cs/knowitall/logic/Expression.java:
--------------------------------------------------------------------------------
  1 | package edu.washington.cs.knowitall.logic;
  2 | 
  3 | import com.google.common.base.Predicate;
  4 | 
  5 | /**
  6 |  * Superclass for expressions in a Logic Expression.
  7 |  *
  8 |  * @author Michael Schmitz <schmmd@cs.washington.edu>
  9 |  */
 10 | public abstract class Expression<E> {
 11 |     /**
 12 |      * An expression that can be applied.
 13 |      */
 14 |     public static abstract class Apply<E> extends Expression<E> {
 15 |         /**
 16 |          * Apply this expression to an entity to get true or false.
 17 |          */
 18 |         public abstract boolean apply(E entity);
 19 |     }
 20 | 
 21 |     /**
 22 |      * An operator expression.
 23 |      */
 24 |     public static abstract class Op<E> extends Apply<E> {
 25 |         /**
 26 |          * @return  true if this has precedence over that
 27 |          */
 28 |         public boolean preceeds(Op<?> that) {
 29 |             return this.precedence() < that.precedence(); // a smaller number means higher precedence
 30 |         }
 31 | 
 32 |         /**
 33 |          * The precedence of this operator.  A smaller number denotes higher
 34 |          * precedence.
 35 |          *
 36 |          * @return  the precedence level of this operator
 37 |          */
 38 |         public abstract int precedence();
 39 | 
 40 |         /**
 41 |          * An operator that takes a single argument, such as negation.
 42 |          */
 43 |         public static abstract class Mon<E> extends Op<E> {
 44 |             public Apply<E> sub;
 45 | 
 46 |             public String toString(String symbol) {
 47 |                 if (sub == null) { // no operand set: print only the symbol
 48 |                     return symbol;
 49 |                 }
 50 |                 else {
 51 |                     return symbol + "(" + sub.toString() + ")";
 52 |                 }
 53 |             }
 54 | 
 55 |             /**
 56 |              * The negation operator.
 57 |              */
 58 |             public static class Not<E> extends Mon<E> {
 59 |                 public String toString() {
 60 |                     return super.toString("!");
 61 |                 }
 62 | 
 63 |                 @Override
 64 |                 public boolean apply(E entity) {
 65 |                     return !sub.apply(entity);
 66 |                 }
 67 | 
 68 |                 @Override
 69 |                 public int precedence() {
 70 |                     return 0; // smaller than And (1) and Or (2), so negation has the highest precedence
 71 |                 }
 72 |             }
 73 |         }
 74 | 
 75 |         /**
 76 |          * An operator that takes two arguments, such as disjunction.
 77 |          */
 78 |         public static abstract class Bin<E> extends Op<E> {
 79 |             public Apply<E> left;
 80 |             public Apply<E> right;
 81 | 
 82 |             public String toString(String symbol) {
 83 |                 if (left == null || right == null) { // an operand is missing: print only the symbol
 84 |                     return symbol;
 85 |                 }
 86 |                 else {
 87 |                     return "(" + left.toString() + " " + symbol + " " + right.toString() + ")";
 88 |                 }
 89 |             }
 90 | 
 91 |             /**
 92 |              * The conjunction (logical and) operator.
 93 |              */
 94 |             public static class And<E> extends Bin<E> {
 95 |                 public String toString() {
 96 |                     return super.toString("&");
 97 |                 }
 98 | 
 99 |                 @Override
100 |                 public boolean apply(E entity) {
101 |                     return left.apply(entity) && right.apply(entity); // short-circuits: right is not applied when left is false
102 |                 }
103 | 
104 |                 @Override
105 |                 public int precedence() {
106 |                     return 1;
107 |                 }
108 |             }
109 | 
110 |             /**
111 |              * The disjunction (logical or) operator.
112 |              */
113 |             public static class Or<E> extends Bin<E> {
114 |                 public String toString() {
115 |                     return super.toString("|");
116 |                 }
117 | 
118 |                 @Override
119 |                 public boolean apply(E entity) {
120 |                     return left.apply(entity) || right.apply(entity); // short-circuits: right is not applied when left is true
121 |                 }
122 | 
123 |                 @Override
124 |                 public int precedence() {
125 |                     return 2;
126 |                 }
127 |             }
128 |         }
129 |     }
130 | 
131 |     /**
132 |      * An expression that evaluates to true or false.
133 |      */
134 |     public static abstract class Arg<E> extends Apply<E> implements Predicate<E> {
135 |         /**
136 |          * An expression that evaluates to true or false by applying a
137 |          * predicate to the supplied entity.
138 |          */
139 |         public static abstract class Pred<E> extends Arg<E> {
140 |             private String description;
141 | 
142 |             public Pred(String description) {
143 |                 this.description = description;
144 |             }
145 | 
146 |             @Override
147 |             public abstract boolean apply(E entity);
148 | 
149 |             public String getDescription() {
150 |                 return this.description;
151 |             }
152 | 
153 |             public String toString() {
154 |                 return this.getDescription();
155 |             }
156 |         }
157 | 
158 |         /**
159 |          * An expression that is a constant value--either true or false.
160 |          */
161 |         public static class Value<E> extends Arg<E> {
162 |             private boolean value;
163 | 
164 |             public Value(boolean value) {
165 |                 this.value = value;
166 |             }
167 | 
168 |             @Override
169 |             public boolean apply(E entity) {
170 |                 return this.apply(); // constant: the entity is ignored
171 |             }
172 | 
173 |             public boolean apply() {
174 |                 return value;
175 |             }
176 | 
177 |             @Override
178 |             public String toString() {
179 |                 return Boolean.toString(this.value);
180 |             }
181 |         }
182 |     }
183 | 
184 |     /**
185 |      * A parenthesis, used for grouping.  These are only used prior to building
186 |      * the AST.
187 |      */
188 |     public static class Paren<E> extends Expression<E> {
189 |         /**
190 |          * A left parenthesis.
191 |          */
192 |         public static class L<E> extends Paren<E> {
193 |             public String toString() {
194 |                 return "(";
195 |             }
196 |         }
197 | 
198 |         /**
199 |          * A right parenthesis.
200 |          */
201 |         public static class R<E> extends Paren<E> {
202 |             public String toString() {
203 |                 return ")";
204 |             }
205 |         }
206 |     }
207 | }
208 | 


--------------------------------------------------------------------------------
/src/main/java/edu/washington/cs/knowitall/logic/LogicException.java:
--------------------------------------------------------------------------------
 1 | package edu.washington.cs.knowitall.logic;
 2 | 
 3 | /**
 4 |  * Unchecked base exception for errors in logic expressions.
 5 |  * @author Michael Schmitz <schmmd@cs.washington.edu>
 6 |  */
 7 | public class LogicException extends RuntimeException {
 8 |     private static final long serialVersionUID = 1L;
 9 | 
10 |     public LogicException(String message) {
11 |         super(message);
12 |     }
13 | 
14 |     public LogicException(String message, Exception e) {
15 |         super(message, e); // e is recorded as the cause
16 |     }
17 | 
18 |     /**
19 |      * Exception while applying an expression to an object.
20 |      */
21 |     public static class ApplyLogicException extends LogicException {
22 |         private static final long serialVersionUID = 1L;
23 | 
24 |         public ApplyLogicException(String message, Exception e) {
25 |             super(message, e);
26 |         }
27 | 
28 |         public ApplyLogicException(String message) {
29 |             super(message);
30 |         }
31 |     }
32 | 
33 |     /**
34 |      * Exception while converting the tokens into a valid expression.
35 |      */
36 |     public static class CompileLogicException extends LogicException {
37 |         private static final long serialVersionUID = 1L;
38 | 
39 |         public CompileLogicException(String message, Exception e) {
40 |             super(message, e);
41 |         }
42 | 
43 |         public CompileLogicException(String message) {
44 |             super(message);
45 |         }
46 |     }
47 | 
48 |     /**
49 |      * Exception while tokenizing the logic expression string.
50 |      */
51 |     public static class TokenizeLogicException extends LogicException {
52 |         private static final long serialVersionUID = 1L;
53 | 
54 |         public TokenizeLogicException(String message, Exception e) {
55 |             super(message, e);
56 |         }
57 | 
58 |         public TokenizeLogicException(String message) {
59 |             super(message);
60 |         }
61 |     }
62 | }
63 | 


--------------------------------------------------------------------------------
/src/main/java/edu/washington/cs/knowitall/logic/LogicExpression.java:
--------------------------------------------------------------------------------
  1 | package edu.washington.cs.knowitall.logic;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.EmptyStackException;
  5 | import java.util.LinkedList;
  6 | import java.util.List;
  7 | import java.util.Scanner;
  8 | import java.util.Stack;
  9 | 
 10 | import com.google.common.base.Function;
 11 | import com.google.common.base.Predicate;
 12 | 
 13 | import edu.washington.cs.knowitall.logic.Expression.Apply;
 14 | import edu.washington.cs.knowitall.logic.Expression.Arg;
 15 | import edu.washington.cs.knowitall.logic.Expression.Op;
 16 | import edu.washington.cs.knowitall.logic.Expression.Paren;
 17 | import edu.washington.cs.knowitall.logic.LogicException.ApplyLogicException;
 18 | import edu.washington.cs.knowitall.logic.LogicException.CompileLogicException;
 19 | import edu.washington.cs.knowitall.logic.LogicException.TokenizeLogicException;
 20 | 
 21 | /**
 22 |  * A logic expression engine that operates over user specified objects.
 23 |  *
 24 |  * @author Michael Schmitz <schmmd@cs.washington.edu>
 25 |  *
 26 |  * @param  <E>  the type of the base expressions
 27 |  */
 28 | public class LogicExpression<E> implements Predicate<E> {
 29 |     private final Apply<E> expression;
 30 | 
 31 |     /***
 32 |      *
 33 |      * @param input an infix representation of the logic expression.
 34 |      * @throws TokenizeLogicException
 35 |      * @throws CompileLogicException
 36 |      */
 37 |     protected LogicExpression(List<Expression<E>> expressions)
 38 |             throws TokenizeLogicException, CompileLogicException {
 39 |         // put in reverse polish notation
 40 |         List<Expression<E>> rpn = rpn(expressions);
 41 | 
 42 |         // compile the expression
 43 |         expression = buildAst(rpn);
 44 |     }
 45 | 
 46 |     /***
 47 |      * Compile an infix list of tokens into an expression tree.
 48 |      * @param rpn a list of tokens in infix form.
 49 |      * @return an expression tree.
 50 |      */
 51 |     public static <E> LogicExpression<E> compile(
 52 |             final List<Expression<E>> expressions) {
 53 |         return new LogicExpression<E>(expressions);
 54 |     }
 55 | 
 56 |     /***
 57 |      * Helper factory method to instantiate a LogicExpression.
 58 |      * @param  input  The string to parse.
 59 |      * @param  factoryDelegate  The factory to build tokens.
 60 |      * @return  a new LogicExpression
 61 |      */
 62 |     public static <E> LogicExpression<E> compile(final String input,
 63 |             final Function<String, Arg<E>> factoryDelegate) {
 64 |         return new LogicExpressionParser<E>() {
 65 |             @Override
 66 |             public Arg<E> factory(String argument) {
 67 |                 return factoryDelegate.apply(argument);
 68 |             }
 69 |         }.parse(input);
 70 |     }
 71 | 
 72 |     @Override
 73 |     public String toString() {
 74 |         if (this.isEmpty()) {
 75 |             return "(empty)";
 76 |         }
 77 |         else {
 78 |             return expression.toString();
 79 |         }
 80 |     }
 81 | 
 82 | 
 83 |     /***
 84 |      * If the expression is empty, it returns true for all inputs.
 85 |      * @return true iff the expression is empty.
 86 |      */
 87 |     public boolean isEmpty() {
 88 |         return this.expression == null;
 89 |     }
 90 | 
 91 |     @Override
 92 |     public boolean apply(E entity) {
 93 |         if (this.isEmpty()) {
 94 |             return true;
 95 |         }
 96 |         else {
 97 |             return this.expression.apply(entity);
 98 |         }
 99 |     }
100 | 
101 |     /***
102 |      * Compile a rpn list of tokens into an expression tree.
103 |      * @param rpn a list of tokens in infix form.
104 |      * @return an expression tree.
105 |      */
106 |     public static <E> Apply<E> buildAst(List<Expression<E>> rpn) {
107 |         if (rpn.isEmpty()) {
108 |             return null;
109 |         }
110 | 
111 |         Stack<Apply<E>> stack = new Stack<Apply<E>>();
112 |         for (Expression<E> tok : rpn) {
113 |             if (tok instanceof Arg<?>) {
114 |                 stack.push((Arg<E>) tok);
115 |             } else if (tok instanceof Op<?>) {
116 |                 try {
117 |                     if (tok instanceof Op.Mon<?>){
118 |                        Apply<E> sub = stack.pop();
119 | 
120 |                         Op.Mon<E> mon = (Op.Mon<E>) tok;
121 | 
122 |                         mon.sub = sub;
123 | 
124 |                         stack.push(mon);
125 |                     }
126 |                     if (tok instanceof Op.Bin<?>) {
127 |                         Apply<E> arg2 = stack.pop();
128 |                         Apply<E> arg1 = stack.pop();
129 | 
130 |                         Op.Bin<E> bin = (Op.Bin<E>) tok;
131 | 
132 |                         bin.left = arg1;
133 |                         bin.right = arg2;
134 | 
135 |                         stack.push(bin);
136 |                     }
137 |                 }
138 |                 catch (EmptyStackException e) {
139 |                     throw new CompileLogicException(
140 |                             "No argument for operator (stack empty): "
141 |                             + tok.toString());
142 |                 }
143 |             }
144 |         }
145 | 
146 |         if (stack.size() > 1) {
147 |             throw new ApplyLogicException(
148 |                     "Stack has multiple elements after apply: " + stack.toString());
149 |         }
150 | 
151 |         if (stack.size() == 0) {
152 |             throw new ApplyLogicException(
153 |                     "Stack has zero elements after apply.");
154 |         }
155 | 
156 |         if (!(stack.peek() instanceof Apply<?>)) {
157 |             throw new ApplyLogicException(
158 |                     "Stack contains non-appliable tokens after apply: " + stack.toString());
159 |         }
160 | 
161 |         return (stack.pop());
162 |     }
163 | 
164 |     /***
165 |      * Return a list of the arguments contained in the expression.
166 |      * @return
167 |      */
168 |     public List<String> getArgs() {
169 |         List<String> args = new ArrayList<String>();
170 |         getArgs(this.expression, args);
171 | 
172 |         return args;
173 |     }
174 | 
175 |     /***
176 |      * Private helper method to recursively find arguments.
177 |      * @param apply the expression tree to search.
178 |      * @param args the resulting list of arguments.
179 |      */
180 |     private void getArgs(Apply<?> apply, List<String> args) {
181 |         if (apply instanceof Op.Bin<?>) {
182 |             Op.Bin<?> bin = (Op.Bin<?>) apply;
183 | 
184 |             getArgs(bin.left, args);
185 |             getArgs(bin.right, args);
186 |         }
187 |         else if (apply instanceof Arg.Pred<?>) {
188 |             args.add(((Arg.Pred<?>)apply).getDescription());
189 |         }
190 |     }
191 | 
192 |     /***
193 |      * Converts an infix logic representation into a postfix logic representation.
194 |      * @param tokens a list of tokens in infix form.
195 |      * @return a list of tokens in postfix (rpn) form.
196 |      * @throws CompileLogicException
197 |      */
198 |     public List<Expression<E>> rpn(List<Expression<E>> tokens)
199 |             throws CompileLogicException {
200 |         // intermediate storage
201 |         Stack<Expression<E>> stack = new Stack<Expression<E>>();
202 | 
203 |         // final rpn output
204 |         LinkedList<Expression<E>> output = new LinkedList<Expression<E>>();
205 | 
206 |         for (Expression<E> tok : tokens) {
207 |             if (tok instanceof Paren.L<?>) {
208 |                 stack.push(tok);
209 |             } else if (tok instanceof Paren.R<?>) {
210 |                 Expression<E> top;
211 |                 do {
212 |                     top = stack.pop();
213 | 
214 |                     if (!(top instanceof Paren.L<?>)) {
215 |                         output.offer(top);
216 |                     }
217 | 
218 |                 } while (!(top instanceof Paren.L<?>));
219 | 
220 |             } else if (tok instanceof Op.Mon<?>) {
221 |                 stack.push(tok);
222 |             } else if (tok instanceof Op.Bin<?>) {
223 |                 // higher precedence
224 |                 while (!stack.isEmpty() && stack.peek() instanceof Op<?>
225 |                         && ((Op<?>)stack.peek()).preceeds((Op<?>)tok)) {
226 |                     output.offer(stack.pop());
227 |                 }
228 | 
229 |                 stack.push(tok);
230 |             } else if (tok instanceof Arg<?>) {
231 |                 output.offer(tok);
232 |             }
233 |         }
234 | 
235 |         // empty out items remaining ni the stack
236 |         while (!stack.isEmpty()) {
237 |             Expression<E> top = stack.pop();
238 | 
239 |             if (top instanceof Paren.L<?> || top instanceof Paren.R<?>) {
240 |                 throw new CompileLogicException("Unbalanced parentheses.");
241 |             }
242 | 
243 |             output.offer(top);
244 |         }
245 | 
246 |         return output;
247 |     }
248 | 
249 | 
250 |     /***
251 |      * Iteractively interpret logic statements from stdin such as "true | (true & false)".
252 |      * @param args
253 |      */
254 |     public static void main(String[] args) {
255 |         Scanner scan = new Scanner(System.in);
256 | 
257 |         while (scan.hasNextLine()) {
258 |             String line = scan.nextLine();
259 | 
260 |             LogicExpression<String> expr = LogicExpressionParsers.trivial.parse(line);
261 | 
262 |             System.out.println("string: " + expr.toString());
263 |             System.out.println("value:  " + expr.apply(null));
264 |             System.out.println();
265 |         }
266 | 
267 |         scan.close();
268 |     }
269 | }
270 | 


--------------------------------------------------------------------------------
/src/main/java/edu/washington/cs/knowitall/logic/LogicExpressionParser.java:
--------------------------------------------------------------------------------
  1 | package edu.washington.cs.knowitall.logic;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.List;
  5 | import java.util.Stack;
  6 | import java.util.regex.Matcher;
  7 | import java.util.regex.Pattern;
  8 | 
  9 | import com.google.common.base.Function;
 10 | import com.google.common.collect.Lists;
 11 | 
 12 | import edu.washington.cs.knowitall.logic.Expression.Arg;
 13 | import edu.washington.cs.knowitall.logic.Expression.Op;
 14 | import edu.washington.cs.knowitall.logic.Expression.Paren;
 15 | import edu.washington.cs.knowitall.logic.LogicException.TokenizeLogicException;
 16 | 
 17 | /**
 18 |  * A parser that turns strings into {@link LogicExpression}s over user specified objects.
 19 |  *
 20 |  * @author Michael Schmitz <schmmd@cs.washington.edu>
 21 |  *
 22 |  * @param  <E>  the type of the base expressions
 23 |  */
 24 | abstract public class LogicExpressionParser<E> implements Function<String, LogicExpression<E>> {
 25 |     /***
 26 |      * Create a LogicExpression object from the supplied string.
 27 |      * @param string
 28 |      * @return
 29 |      */
 30 |     public LogicExpression<E> parse(String string) {
 31 |         List<Expression<E>> expressions = this.tokenize(string);
 32 |         return new LogicExpression<E>(expressions);
 33 |     }
 34 | 
 35 |     @Override
 36 |     public LogicExpression<E> apply(String string) {
 37 |         return this.parse(string);
 38 |     }
 39 | 
 40 |     /***
 41 |      * The factory method creates an argument from the supplied token string.
 42 |      * @param  argument  a string representation of a token
 43 |      * @return  an evaluatable representation of a token
 44 |      */
 45 |     public abstract Arg<E> factory(String argument);
 46 | 
 47 |     public final static Pattern doubleQuoteStringLiteralRegex =
 48 |             Pattern.compile("\"" + "([^\"\\p{Cntrl}\\\\]*+(?:\\\\[\\\\'\"bfnrt])*+(?:\\\\u[a-fA-F0-9]{4})*+)*+" + "\"");
 49 |     public final static Pattern singleQuoteStringLiteralRegex =
 50 |             Pattern.compile("'" + "(?:[^']*+)" + "'");
 51 |     public final static Pattern regexLiteralRegex =
 52 |             Pattern.compile("/" + "(?:(?:[^/\\\\]*+(?:\\\\)*+(?:\\\\/)*+)*+)" + "/");
 53 |     private final static List<Pattern> literalPatterns = Lists.newArrayList(
 54 |             doubleQuoteStringLiteralRegex, singleQuoteStringLiteralRegex,
 55 |             regexLiteralRegex);
 56 | 
 57 |     /***
 58 |      * The readToken method reads a token from the remaining LogicExpression string.
 59 |      *
 60 |      * A token may contain a string.  If it contains parentheses, the token
 61 |      * will last until the parentheses are balanced.  And &, |, or unbalanced )
 62 |      * will mark the end of a token.
 63 |      *
 64 |      * This is a default implementation that may be overriden.
 65 |      * @param  remainder  the remaining text to tokenize
 66 |      * @return  a token from the beginning on `remaining`
 67 |      */
 68 |     public String readToken(String remainder) {
 69 |         final String token;
 70 |         try {
 71 |             Stack<Character> parens = new Stack<Character>();
 72 | 
 73 |             int nextExpression;
 74 |             for (nextExpression = 0; nextExpression < remainder.length(); nextExpression++) {
 75 |                 char c = remainder.charAt(nextExpression);
 76 | 
 77 |                 // check for quotation
 78 |                 String match = null;
 79 |                 for (Pattern pattern : literalPatterns) {
 80 |                     Matcher matcher = pattern.matcher(remainder).region(
 81 |                             nextExpression, remainder.length());
 82 |                     if (matcher.lookingAt()) {
 83 |                         match = matcher.group(0);
 84 |                         break;
 85 |                     }
 86 |                 }
 87 | 
 88 |                 if (match != null) {
 89 |                     // we found and can consume a quotation
 90 |                     nextExpression += match.length() - 1;
 91 |                 } else if (c == '(') {
 92 |                     parens.push(c);
 93 |                 } else if (c == ')') {
 94 |                     if (parens.isEmpty()) {
 95 |                         break;
 96 |                     } else {
 97 |                         parens.pop();
 98 |                     }
 99 |                 } else if (c == '&' || c == '|') {
100 |                     break;
101 |                 }
102 |             }
103 | 
104 |             token = remainder.substring(0, nextExpression).trim();
105 |         } catch (Exception e) {
106 |             throw new TokenizeLogicException("Error parsing token: "
107 |                     + remainder, e);
108 |         }
109 | 
110 |         if (token.isEmpty()) {
111 |             throw new TokenizeLogicException("zero-length token found.");
112 |         }
113 | 
114 |         return token;
115 |     }
116 | 
117 |     /***
118 |      * Convert an infix string logic representation to an infix list of tokens.
119 |      * @param input an infix string logic representation.
120 |      * @param factory a delegate that converts a string representation of an
121 |      * argument into a token object.  @return
122 |      *
123 |      * @throws TokenizeLogicException
124 |      */
125 |     public List<Expression<E>> tokenize(String input)
126 |     throws TokenizeLogicException {
127 |         List<Expression<E>> tokens = new ArrayList<Expression<E>>();
128 | 
129 |         int i = 0;
130 |         while (i < input.length()) {
131 |             String substring = input.substring(i);
132 |             char firstChar = substring.charAt(0);
133 | 
134 |             if (firstChar == ' ') {
135 |                 i += 1;
136 |                 continue;
137 |             }
138 |             else if (firstChar == '(') {
139 |                 tokens.add(new Paren.L<E>());
140 |                 i += 1;
141 |             } else if (firstChar == ')') {
142 |                 tokens.add(new Paren.R<E>());
143 |                 i += 1;
144 |             } else if (firstChar == '!') {
145 |                 tokens.add(new Op.Mon.Not<E>());
146 |                 i += 1;
147 |             } else if (firstChar == '&') {
148 |                 tokens.add(new Op.Bin.And<E>());
149 |                 i += 1;
150 |             } else if (firstChar == '|') {
151 |                 tokens.add(new Op.Bin.Or<E>());
152 |                 i += 1;
153 |             } else {
154 |                 // parse out the token
155 |                 String token = this.readToken(substring);
156 | 
157 |                 tokens.add(factory(token));
158 |                 i += token.length();
159 |             }
160 |         }
161 | 
162 |         return tokens;
163 |     }
164 | }
165 | 


--------------------------------------------------------------------------------
/src/main/java/edu/washington/cs/knowitall/logic/LogicExpressionParsers.java:
--------------------------------------------------------------------------------
 1 | package edu.washington.cs.knowitall.logic;
 2 | 
 3 | /**
 4 |  * Static factories for logic expressions over basic objects.
 5 |  *
 6 |  * @author Michael Schmitz <schmmd@cs.washington.edu>
 7 |  */
 8 | class LogicExpressionParsers {
 9 |     /**
10 |      * Logic expressions where "true" evaluates to true and "false" evaluates to
11 |      * false. For example:
12 |      *
13 |      * (true | false) & true
14 |      *
15 |      * This logic expression is trivial because it's value is independent of the
16 |      * object it is applied to.
17 |      */
18 |     public final static LogicExpressionParser<String> trivial =
19 |         new LogicExpressionParser<String>() {
20 |             @Override
21 |             public Expression.Arg<String> factory(final String string) {
22 |                 return new Expression.Arg.Pred<String>(string) {
23 |                     @Override
24 |                     public boolean apply(String entity) {
25 |                         return "true".equals(string);
26 |                     }
27 |                 };
28 |             }
29 |         };
30 | 
31 |     /**
32 |      * Logic expressions where tokens are strings.  A token is true if it
33 |      * matches the input string.
34 |      */
35 |     public final static LogicExpressionParser<String> stringMatch =
36 |         new LogicExpressionParser<String>() {
37 |             @Override
38 |             public Expression.Arg<String> factory(final String token) {
39 |                 return new Expression.Arg.Pred<String>(token) {
40 |                     final String string = token.substring(1, token.length() - 1);
41 | 
42 |                     @Override
43 |                     public boolean apply(String entity) {
44 |                         return entity.equals(string);
45 |                     }
46 |                 };
47 |             }
48 |         };
49 | }
50 | 


--------------------------------------------------------------------------------
/src/main/java/edu/washington/cs/knowitall/regex/Expression.java:
--------------------------------------------------------------------------------
  1 | package edu.washington.cs.knowitall.regex;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.Iterator;
  5 | import java.util.List;
  6 | 
  7 | import com.google.common.base.Joiner;
  8 | import com.google.common.base.Predicate;
  9 | 
 10 | import edu.washington.cs.knowitall.regex.FiniteAutomaton.Automaton;
 11 | import edu.washington.cs.knowitall.regex.FiniteAutomaton.State;
 12 | 
 13 | /**
 14 |  * Interface for a component of a regular expression.
 15 |  *
 16 |  * @author Michael Schmitz <schmmd@cs.washington.edu>
 17 |  */
 18 | public interface Expression<E> extends Predicate<E> {
 19 | 
 20 |     public Automaton<E> build();
 21 | 
 22 |     public int minMatchingLength();
 23 | 
 24 |     /**
 25 |      * Represents a matching group that is referred to by order number.
 26 |      *     {@code (<foo> <bar>+)}
 27 |      * @author Michael Schmitz <schmmd@cs.washington.edu>
 28 |      *
 29 |      * @param <E>
 30 |      */
 31 |     public class MatchingGroup<E> implements Expression<E> {
 32 |         public final List<Expression<E>> expressions;
 33 | 
 34 |         public MatchingGroup(List<Expression<E>> expressions) {
 35 |             this.expressions = expressions;
 36 |         }
 37 | 
 38 |         @Override
 39 |         public boolean apply(E entity) {
 40 |             throw new UnsupportedOperationException();
 41 |         }
 42 | 
 43 |         public String subexpString() {
 44 |             List<String> subs = new ArrayList<String>(this.expressions.size());
 45 |             for (Expression<E> expr : this.expressions) {
 46 |                 subs.add(expr.toString());
 47 |             }
 48 | 
 49 |             return Joiner.on(" ").join(subs);
 50 |         }
 51 | 
 52 |         @Override
 53 |         public String toString() {
 54 |             return "(" + subexpString() + ")";
 55 |         }
 56 | 
 57 |         /**
 58 |          * Convert the expression into a NFA.
 59 |          */
 60 |         @Override
 61 |         public Automaton<E> build() {
 62 |             Automaton<E> auto = new Automaton<E>(this);
 63 | 
 64 |             Iterator<Expression<E>> exprIterator = this.expressions.iterator();
 65 |             Automaton<E> sub;
 66 | 
 67 |             // connect the start to the first subexpression
 68 |             State<E> prev = auto.start;
 69 |             if (exprIterator.hasNext()) {
 70 |                 sub = exprIterator.next().build();
 71 |                 auto.start.connect(sub.start);
 72 |                 prev = sub.end;
 73 |             }
 74 |             while (exprIterator.hasNext()) {
 75 |                 Expression<E> expr = exprIterator.next();
 76 |                 sub = expr.build();
 77 | 
 78 |                 State<E> connector = new State<E>();
 79 | 
 80 |                 prev.connect(connector);
 81 |                 connector.connect(sub.start);
 82 |                 prev = sub.end;
 83 |             }
 84 | 
 85 |             prev.connect(auto.end);
 86 | 
 87 |             return auto;
 88 |         }
 89 | 
 90 |         @Override
 91 |         public int minMatchingLength() {
 92 |             int len = 0;
 93 |             for (Expression<E> expr : this.expressions) {
 94 |                 len += expr.minMatchingLength();
 95 |             }
 96 |             return len;
 97 |         }
 98 |     }
 99 | 
100 |     /**
101 |      * Represents a matching group that is referred to by name.
102 |      *     {@code (<name>:<foo> <bar>+)}
103 |      * @author Michael Schmitz <schmmd@cs.washington.edu>
104 |      *
105 |      * @param <E>
106 |      */
107 |     public class NamedGroup<E> extends MatchingGroup<E> {
108 |         public final String name;
109 | 
110 |         public NamedGroup(String name, List<Expression<E>> expressions) {
111 |             super(expressions);
112 |             this.name = name;
113 |         }
114 | 
115 |         @Override
116 |         public String toString() {
117 |             return "(<"+this.name+">:" + super.subexpString() + ")";
118 |         }
119 |     }
120 | 
121 |     /**
122 |      * Represents a non-matching group.
123 |      *     {@code (?:<foo> <bar>+)}
124 |      * @author Michael Schmitz <schmmd@cs.washington.edu>
125 |      *
126 |      * @param <E>
127 |      */
128 |     public class NonMatchingGroup<E> extends MatchingGroup<E> {
129 |         public NonMatchingGroup(List<Expression<E>> expressions) {
130 |             super(expressions);
131 |         }
132 | 
133 |         @Override
134 |         public String toString() {
135 |             return "(?:" + super.subexpString() + ")";
136 |         }
137 |     }
138 | 
139 |     /**
140 |      * Disjunction of two expressions.
141 |      *     {@code <foo>|<bar>}
142 |      * @author Michael Schmitz <schmmd@cs.washington.edu>
143 |      *
144 |      * @param <E>
145 |      */
146 |     public static class Or<E> implements Expression<E> {
147 |         public final Expression<E> expr1;
148 |         public final Expression<E> expr2;
149 | 
150 |         public Or(Expression<E> expr1, Expression<E> expr2) {
151 |             this.expr1 = expr1;
152 |             this.expr2 = expr2;
153 |         }
154 | 
155 |         @Override
156 |         public boolean apply(E entity) {
157 |             return true;
158 |         }
159 | 
160 |         @Override
161 |         public String toString() {
162 |             return this.expr1.toString() + " | " + this.expr2.toString();
163 |         }
164 | 
165 |         /**
166 |          * Convert the expression into a NFA.
167 |          */
168 |         @Override
169 |         public Automaton<E> build() {
170 |             Automaton<E> auto = new Automaton<E>(this);
171 | 
172 |             Automaton<E> sub1 = this.expr1.build();
173 |             Automaton<E> sub2 = this.expr2.build();
174 | 
175 |             // attach the sub automata
176 |             auto.start.connect(sub1.start);
177 |             auto.start.connect(sub2.start);
178 |             sub1.end.connect(auto.end);
179 |             sub2.end.connect(auto.end);
180 | 
181 |             return auto;
182 |         }
183 | 
184 |         @Override
185 |         public int minMatchingLength() {
186 |             int left = this.expr1.minMatchingLength();
187 |             int right = this.expr2.minMatchingLength();
188 |             if (left < right)
189 |               return left;
190 |             else
191 |               return right;
192 |         }
193 |     }
194 | 
195 |     /**
196 |      * Kleene-star: zero or more of the enclosed expression.
197 |      *     {@code <foo>*}
198 |      * @author Michael Schmitz <schmmd@cs.washington.edu>
199 |      *
200 |      * @param <E>
201 |      */
202 |     public static class Star<E> implements Expression<E> {
203 |         public final Expression<E> expr;
204 | 
205 |         public Star(Expression<E> expr) {
206 |             this.expr = expr;
207 |         }
208 | 
209 |         @Override
210 |         public boolean apply(E entity) {
211 |             return this.expr.apply(entity);
212 |         }
213 | 
214 |         @Override
215 |         public String toString() {
216 |             return this.expr.toString() + "*";
217 |         }
218 | 
219 |         /**
220 |          * Convert the expression into a NFA.
221 |          */
222 |         @Override
223 |         public Automaton<E> build() {
224 |             Automaton<E> auto = new Automaton<E>(this);
225 | 
226 |             Automaton<E> sub = this.expr.build();
227 | 
228 |             // run it again
229 |             sub.end.connect(sub.start);
230 | 
231 |             // attach the sub automaton
232 |             auto.start.connect(sub.start);
233 |             sub.end.connect(auto.end);
234 | 
235 |             // skip it completely
236 |             auto.start.connect(auto.end);
237 | 
238 |             return auto;
239 |         }
240 | 
241 |         @Override
242 |         public int minMatchingLength() {
243 |             return 0;
244 |         }
245 |     }
246 | 
247 |     /**
248 |      * One or more of the enclosed expression.  Plus(expr) is equivalent to
249 |      * expr followed by Star(expr).
250 |      *     {@code <foo>+} is the same as {@code <foo> <foo>*}
251 |      * @author Michael Schmitz <schmmd@cs.washington.edu>
252 |      *
253 |      * @param <E>
254 |      */
255 |     public static class Plus<E> implements Expression<E> {
256 |         public final Expression<E> expr;
257 | 
258 |         public Plus(Expression<E> expr) {
259 |             this.expr = expr;
260 |         }
261 | 
262 |         @Override
263 |         public boolean apply(E entity) {
264 |             return this.expr.apply(entity);
265 |         }
266 | 
267 |         @Override
268 |         public String toString() {
269 |             return this.expr.toString() + "+";
270 |         }
271 | 
272 |         /**
273 |          * Convert the expression into a NFA.
274 |          */
275 |         @Override
276 |         public Automaton<E> build() {
277 |             Automaton<E> auto = new Automaton<E>(this);
278 | 
279 |             Automaton<E> sub = this.expr.build();
280 | 
281 |             // run it again
282 |             sub.end.connect(sub.start);
283 | 
284 |             // attach the sub automaton
285 |             auto.start.connect(sub.start);
286 |             sub.end.connect(auto.end);
287 | 
288 |             return auto;
289 |         }
290 | 
291 |         @Override
292 |         public int minMatchingLength() {
293 |             return 1;
294 |         }
295 |     }
296 | 
297 |     /**
298 |      * Zero or one of the enclosed expression.
299 |      *     {@code <foo>?}
300 |      * @author Michael Schmitz <schmmd@cs.washington.edu>
301 |      *
302 |      * @param <E>
303 |      */
304 |     public static class Option<E> implements Expression<E> {
305 |         Expression<E> expr;
306 | 
307 |         public Option(Expression<E> expr) {
308 |             this.expr = expr;
309 |         }
310 | 
311 |         @Override
312 |         public boolean apply(E entity) {
313 |             return this.expr.apply(entity);
314 |         }
315 | 
316 |         @Override
317 |         public String toString() {
318 |             return this.expr.toString() + "?";
319 |         }
320 | 
321 |         /**
322 |          * Convert the expression into a NFA.
323 |          */
324 |         @Override
325 |         public Automaton<E> build() {
326 |             Automaton<E> auto = new Automaton<E>(this);
327 | 
328 |             Automaton<E> sub = this.expr.build();
329 | 
330 |             // attach the sub automaton
331 |             auto.start.connect(sub.start);
332 |             sub.end.connect(auto.end);
333 | 
334 |             // skip it completely
335 |             auto.start.connect(auto.end);
336 | 
337 |             return auto;
338 |         }
339 | 
340 |         @Override
341 |         public int minMatchingLength() {
342 |             return 0;
343 |         }
344 |     }
345 | 
346 |     /**
347 |      * A minimum to maximum number of occurrences of the enclosed expression.
348 |      *     {@code <foo>{1,3}}
349 |      * @author Daniel Naber
350 |      *
351 |      * @param <E>
352 |      */
353 |     public static class MinMax<E> implements Expression<E> {
354 |         Expression<E> expr;
355 |         final int minOccurrences;
356 |         final int maxOccurrences;
357 | 
358 |         /**
359 |          * @param minOccurrences minimum occurrences, must be >= 0 and <= maxOccurrences
360 |          * @param maxOccurrences maximum occurrences, must be >= 1 - you should prefer small values,
361 |          *                       as the use of large values will create a large automaton that takes a lot of memory
362 |          * @throws IllegalArgumentException if either bound is out of range or the bounds are inverted
363 |          */
364 |         public MinMax(Expression<E> expr, int minOccurrences, int maxOccurrences) {
365 |             this.expr = expr;
366 |             if (minOccurrences < 0 || maxOccurrences < 1) {
367 |                 throw new IllegalArgumentException("minOccurrences must be >= 0 and maxOccurrences must be >= 1: "
368 |                         + minOccurrences + ", " + maxOccurrences);
369 |             }
370 |             if (minOccurrences > maxOccurrences) {
371 |                 throw new IllegalArgumentException("minOccurrences must be <= maxOccurrences: "
372 |                         + minOccurrences + " > " + maxOccurrences);
373 |             }
374 |             this.minOccurrences = minOccurrences;
375 |             this.maxOccurrences = maxOccurrences;
376 |         }
377 | 
378 |         @Override
379 |         public boolean apply(E entity) {
380 |             return this.expr.apply(entity);
381 |         }
382 | 
383 |         @Override
384 |         public String toString() {
385 |             return this.expr.toString() + "{" + minOccurrences + "," + maxOccurrences + "}";
386 |         }
387 | 
388 |         /**
389 |          * Convert the expression into a NFA: a chain of {@code maxOccurrences} copies of the sub automaton.
390 |          */
391 |         @Override
392 |         public Automaton<E> build() {
393 |             Automaton<E> auto = new Automaton<E>(this);
394 |             List<Automaton<E>> subAutos = new ArrayList<Automaton<E>>();
395 |             for (int i = 0; i < maxOccurrences; i++) {
396 |                 subAutos.add(this.expr.build());
397 |             }
398 | 
399 |             // attach the first sub automaton
400 |             auto.start.connect(subAutos.get(0).start);
401 | 
402 |             // attach the sub automatons among themselves and with the end
403 |             for (int i = 0; i < subAutos.size(); i++) {
404 |                 Automaton<E> sub = subAutos.get(i);
405 |                 if (i >= minOccurrences - 1) {
406 |                     sub.end.connect(auto.end);
407 |                 }
408 |                 if (i < subAutos.size() - 1) {
409 |                     sub.end.connect(subAutos.get(i + 1).start);
410 |                 }
411 |             }
412 | 
413 |             if (minOccurrences == 0) {
414 |                 // skip it completely
415 |                 auto.start.connect(auto.end);
416 |             }
417 | 
418 |             return auto;
419 |         }
420 | 
421 |         /**
422 |          * @return the fewest tokens consumed: each of the minOccurrences repetitions needs the sub expression's minimum.
423 |          */
424 |         @Override
425 |         public int minMatchingLength() {
426 |             return this.minOccurrences * this.expr.minMatchingLength();
427 |         }
428 |     }
429 | 
430 |     /**
431 |      * An expression with no subexpression that is evaluated against a token
432 |      * using the supplied delegate.
433 |      * @author Michael Schmitz <schmmd@cs.washington.edu>
434 |      *
435 |      * @param <E> the type of the entities (tokens) the expression is applied to
436 |      */
437 |     static abstract class BaseExpression<E> implements Expression<E> {
438 |         public final String source; // text of the expression, echoed by toString()
439 | 
440 |         public BaseExpression(String source) {
441 |             this.source = source;
442 |         }
443 | 
444 |         /**
445 |          * The delegate that decides whether this expression accepts the given token.
446 |          */
447 |         @Override
448 |         public abstract boolean apply(E entity);
449 | 
450 |         public String toString() {
451 |             return "<" + this.source + ">"; // source wrapped in angle brackets
452 |         }
453 | 
454 |         /**
455 |          * Convert the expression into a NFA: one edge from start to end, guarded by this expression.
456 |          */
457 |         @Override
458 |         public Automaton<E> build() {
459 |             Automaton<E> auto = new Automaton<E>(this);
460 | 
461 |             auto.start.connect(auto.end, this);
462 | 
463 |             return auto;
464 |         }
465 | 
466 |         @Override
467 |         public int minMatchingLength() {
468 |             return 1; // the single edge built by build() is taken for exactly one token
469 |         }
470 |     }
471 | 
472 |     /**
473 |      * A non-consuming expression that matches a token against a property of
474 |      * the text, such as the start or end of a line.
475 |      * @author Michael Schmitz <schmmd@cs.washington.edu>
476 |      *
477 |      * @param <E> the type of the entities (tokens) in the evaluated list
478 |      */
479 |     static abstract class AssertionExpression<E> implements Expression<E> {
480 |         @Override
481 |         public boolean apply(E entity) {
482 |             return false; // never accepts a token; assertions are decided by the three-argument apply
483 |         }
484 |         // hasStart: the list begins at the start token; tokens: tokens not yet consumed; count: size of the initial list
485 |         public abstract boolean apply(boolean hasStart, List<E> tokens, int count);
486 | 
487 |         /**
488 |          * Convert the expression into a NFA: one edge from start to end, labelled with this assertion.
489 |          */
490 |         @Override
491 |         public Automaton<E> build() {
492 |             Automaton<E> auto = new Automaton<E>(this);
493 | 
494 |             auto.start.connect(auto.end, this);
495 | 
496 |             return auto;
497 |         }
498 | 
499 |         @Override
500 |         public int minMatchingLength() {
501 |             return 0; // assertions do not consume tokens
502 |         }
503 |     }
504 | 
505 |     /**
506 |      * A non-consuming expression that matches the start of a line.
507 |      *     {@code ^<foo>}
508 |      * @author Michael Schmitz <schmmd@cs.washington.edu>
509 |      *
510 |      * @param <E> the type of the entities (tokens) in the evaluated list
511 |      */
512 |     static class StartAssertion<E> extends AssertionExpression<E> {
513 |         @Override
514 |         public boolean apply(boolean hasStart, List<E> tokens, int count) {
515 |             return hasStart && tokens.size() == count; // list starts at the beginning and nothing is consumed yet
516 |         }
517 | 
518 |         @Override
519 |         public String toString() {
520 |             return "^";
521 |         }
522 |     }
523 | 
524 |     /**
525 |      * A non-consuming expression that matches the end of a line.
526 |      *     {@code <foo>$}
527 |      * @author Michael Schmitz <schmmd@cs.washington.edu>
528 |      *
529 |      * @param <E> the type of the entities (tokens) in the evaluated list
530 |      */
531 |     static class EndAssertion<E> extends AssertionExpression<E> {
532 |         @Override
533 |         public boolean apply(boolean hasStart, List<E> tokens, int count) {
534 |             return tokens.isEmpty(); // holds once no tokens remain; hasStart and count are not consulted
535 |         }
536 | 
537 |         @Override
538 |         public String toString() {
539 |             return "$";
540 |         }
541 |     }
542 | }
543 | 


--------------------------------------------------------------------------------
/src/main/java/edu/washington/cs/knowitall/regex/ExpressionFactory.java:
--------------------------------------------------------------------------------
 1 | package edu.washington.cs.knowitall.regex;
 2 | 
 3 | import com.google.common.base.Function;
 4 | 
 5 | import edu.washington.cs.knowitall.regex.Expression.BaseExpression;
 6 | 
 7 | /**
 8 |  * Wrapper class for a Guava Function.  Used to unpickle an expression string
 9 |  * into a {@link BaseExpression}.
10 |  *
11 |  * @author Michael Schmitz <schmmd@cs.washington.edu>
12 |  */
13 | public abstract class ExpressionFactory<E> implements Function<String, BaseExpression<E>> {
14 |     public abstract BaseExpression<E> create(String token); // supplied by subclasses
15 |     // Function entry point; simply delegates to create(token)
16 |     public BaseExpression<E> apply(String token) {
17 |         return this.create(token);
18 |     }
19 | }
20 | 


--------------------------------------------------------------------------------
/src/main/java/edu/washington/cs/knowitall/regex/FiniteAutomaton.java:
--------------------------------------------------------------------------------
  1 | package edu.washington.cs.knowitall.regex;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.Iterator;
  5 | import java.util.List;
  6 | import java.util.concurrent.atomic.AtomicInteger;
  7 | 
  8 | import com.google.common.base.Predicate;
  9 | import com.google.common.collect.Iterables;
 10 | import com.google.common.collect.Lists;
 11 | 
 12 | import edu.washington.cs.knowitall.regex.Expression.AssertionExpression;
 13 | import edu.washington.cs.knowitall.regex.Expression.MatchingGroup;
 14 | 
 15 | /**
 16 |  * A finite automaton implementation.  There is support for epsilon
 17 |  * transitions (NFA) but if those are omitted then this works as an
 18 |  * implementation of a DFA.
 19 |  *
 20 |  * @author Michael Schmitz <schmmd@cs.washington.edu>
 21 |  */
 22 | public class FiniteAutomaton {
 23 |     /**
 24 |      * A component automaton with a single start state and a single end
 25 |      * state.
 26 |      * @author Michael Schmitz <schmmd@cs.washington.edu>
 27 |      *
 28 |      * @param <E>
 29 |      */
 30 |     public static class Automaton<E> {
 31 |         public final StartState<E> start;
 32 |         public final EndState<E> end;
 33 | 
 34 |         public Automaton(StartState<E> start, EndState<E> end) {
 35 |             this.start = start;
 36 |             this.end = end;
 37 |         }
 38 | 
 39 |         public Automaton(Expression<E> expr) {
 40 |             this.start = new StartState<E>(expr);
 41 |             this.end = new EndState<E>(expr);
 42 |         }
 43 | 
 44 |         public boolean apply(List<E> tokens) {
 45 |             return this.evaluate(tokens, true) != null;
 46 |         }
 47 | 
 48 |         public int minMatchingLength() {
 49 |             return start.minMatchingLength();
 50 |         }
 51 | 
 52 |         public Match.FinalMatch<E> lookingAt(List<E> tokens) {
 53 |             return lookingAt(tokens, 0);
 54 |         }
 55 | 
 56 |         /**
 57 |          * @return null if no match, otherwise a representation of the match
 58 |          */
 59 |         public Match.FinalMatch<E> lookingAt(List<E> tokens, int startIndex) {
 60 |             if (tokens.size() - startIndex - this.minMatchingLength() < 0) {
 61 |                 // don't try if we can't possible match
 62 |                 return null;
 63 |             }
 64 |             else {
 65 |                 List<E> sublist = tokens.subList(startIndex, tokens.size());
 66 | 
 67 |                 Step<E> path = this.evaluate(sublist, startIndex == 0);
 68 |                 if (path == null) {
 69 |                     return null;
 70 |                 }
 71 | 
 72 |                 // build list of edges
 73 |                 List<AbstractEdge<E>> edges = new ArrayList<AbstractEdge<E>>();
 74 |                 while (path.state != this.start) {
 75 |                     edges.add(path.path);
 76 |                     path = path.prev;
 77 |                 }
 78 | 
 79 |                 Match.IntermediateMatch<E> match = new Match.IntermediateMatch<E>();
 80 |                 buildMatch(sublist.iterator(), null, new AtomicInteger(startIndex), this.start,
 81 |                            Lists.reverse(edges).iterator(), match);
 82 |                 return new Match.FinalMatch<E>(match);
 83 |             }
 84 |         }
 85 | 
 86 |         /**
 87 |          * Retrace the path through the NFA and produce an object that
 88 |          * represents the match.  Recurses at each start state it leaves.
 89 |          * @param tokenIterator an iterator over the tokens.
 90 |          * @param expression the expression whose group is being built; null at the top level.
 91 |          * @param index the index assigned to the next consumed token.
 92 |          * @param state the present state.
 93 |          * @param edgeIterator an iterator over the edges in the solution, shared by all recursion levels.
 94 |          * @param match the match that receives the group (if any) and its contents.
 95 |          * @return the state at which retracing stopped.
 96 |          */
 97 |         private State<E> buildMatch(Iterator<E> tokenIterator, Expression<E> expression,
 98 |                 AtomicInteger index, State<E> state, Iterator<AbstractEdge<E>> edgeIterator,
 99 |                 Match.IntermediateMatch<E> match) {
100 | 
101 |             Match.IntermediateMatch<E> newMatch = new Match.IntermediateMatch<E>();
102 |             // follow edges until they run out or the end state belonging to 'expression' is reached
103 |             while (edgeIterator.hasNext() && !((state instanceof EndState<?>)
104 |                    && ((EndState<E>)state).expression == expression)) {
105 | 
106 |                 AbstractEdge<E> edge = edgeIterator.next();
107 | 
108 |                 // dispatch on the kind of edge that was taken
109 |                 if (edge instanceof Edge<?>
110 |                     && !(((Edge<?>) edge).expression instanceof AssertionExpression<?>)) {
111 |                     // consume a token, this is the base case
112 |                     E token = tokenIterator.next();
113 |                     newMatch.add(((Edge<E>)edge).expression, token, index.getAndIncrement());
114 | 
115 |                     state = edge.dest;
116 |                 }
117 |                 else if (state instanceof StartState<?>) {
118 |                     // recurse on StartState so we have a group for that match
119 |                     Expression<E> expr = ((StartState<E>)state).expression;
120 |                     state = buildMatch(tokenIterator, expr, index, edge.dest, edgeIterator, newMatch);
121 |                     assert(state instanceof EndState<?> && ((EndState<?>)state).expression == expr);
122 |                 }
123 |                 else {
124 |                     assert(edge instanceof Epsilon<?>);
125 |                     state = edge.dest; // plain epsilon step; no token is consumed
126 |                 }
127 |             }
128 | 
129 |             // add the sub match group
130 |             if (expression != null
131 |                 && (!newMatch.isEmpty() || expression instanceof MatchingGroup<?>)) {
132 |                 // create a wrapper for the expressions it matched
133 |                 Match.Group<E> pair = new Match.Group<E>(expression);
134 |                 for (Match.Group<E> p : newMatch.pairs()) {
135 |                     if (p.expr instanceof Expression.BaseExpression<?>) {
136 |                         pair.addTokens(p); // only BaseExpression pairs contribute tokens to the group
137 |                     }
138 |                 }
139 | 
140 |                 // add it
141 |                 match.add(pair);
142 |             }
143 | 
144 |             // add the contents of the sub match group
145 |             match.addAll(newMatch.pairs());
146 | 
147 |             return state;
148 |         }
149 | 
150 |         /**
151 |          * A representation of a movement from a state to another, with a
152 |          * backreference to the previous state.  This is used in building
153 |          * a match object once a solution has been found.
154 |          * @author Michael Schmitz <schmmd@cs.washington.edu>
155 |          *
156 |          * @param <E>
157 |          */
158 |         private static class Step<E> {
159 |             public final State<E> state;
160 |             public final Step<E> prev;
161 |             public final AbstractEdge<E> path;
162 | 
163 |             public Step(State<E> state) {
164 |                 this(state, null, null);
165 |             }
166 | 
167 |             public Step(State<E> state, Step<E> prev, AbstractEdge<E> path) {
168 |                 this.state = state;
169 |                 this.prev = prev;
170 |                 this.path = path;
171 |             }
172 | 
173 |             public String toString() {
174 |                 return this.state.toString();
175 |             }
176 |         }
177 | 
178 |         /**
179 |          * Expand all epsilon transitions for the supplied steps.  That is,
180 |          * add all states available via an epsilon transition from a supplied
181 |          * state to the list.
182 |          * @param steps the steps to expand; newly reached steps are appended to this same list
183 |          */
184 |         private void expandEpsilons(List<Step<E>> steps) {
185 |             int size = steps.size(); // fixed up front: expandEpsilon appends to steps during the loop
186 |             for (int i = 0; i < size; i++) {
187 |                 Step<E> step = steps.get(i);
188 | 
189 |                 expandEpsilon(step, steps);
190 |             }
191 |         }
192 | 
193 |         /**
194 |          * Expand all epsilon transitions for the specified step.  That is,
195 |          * add all states available via an epsilon transition from step.state.
196 |          * @param step the step whose state is expanded
197 |          * @param steps the list that receives the new steps; also used to detect states already reached
198 |          */
199 |         private void expandEpsilon(Step<E> step, List<Step<E>> steps) {
200 |             // loop over the epsilon edges leaving this state
201 |             for (final Epsilon<E> edge : step.state.epsilons) {
202 | 
203 |                 // try free edges if they do not lead to an existing
204 |                 // step
205 |                 if (!Iterables.any(steps,
206 |                                 new Predicate<Step<E>>() {
207 |                                     @Override
208 |                                     public boolean apply(Step<E> step) {
209 |                                         return step.state == edge.dest;
210 |                                     }
211 |                                 })) {
212 |                     Step<E> newstep = new Step<E>(edge.dest, step, edge);
213 |                     steps.add(newstep);
214 |                     expandEpsilon(newstep, steps); // follow chains of epsilon edges recursively
215 |                 }
216 |             }
217 |         }
218 | 
219 |         /**
220 |          * Expand any state that has an assertion edge if the assertion passes
221 |          * given the present state.
222 |          * @param steps the steps whose outgoing edges are inspected
223 |          * @param newsteps receives one step for every assertion edge that passes
224 |          * @param hasStart true iff the tokens contains the start token.
225 |          * @param tokens the tokens passed on to each assertion
226 |          * @param totalTokens the count passed on to each assertion
227 |          */
228 |         private void expandAssertions(List<Step<E>> steps, List<Step<E>> newsteps, boolean hasStart,
229 |                                       List<E> tokens, int totalTokens) {
230 |             for (Step<E> step : steps) {
231 |                 for (final Edge<E> edge : step.state.edges) {
232 |                     if (edge.expression instanceof AssertionExpression<?>) { // other edges are skipped here
233 |                         AssertionExpression<E> assertion = (AssertionExpression<E>)edge.expression;
234 | 
235 |                         if (assertion.apply(hasStart, tokens, totalTokens)) {
236 |                             newsteps.add(new Step<E>(edge.dest, step, edge));
237 |                         }
238 |                     }
239 |                 }
240 |             }
241 |         }
242 | 
243 |         private Step<E> evaluate(List<E> tokens, boolean hasStart) {
244 |             List<Step<E>> steps = new ArrayList<Step<E>>();
245 |             steps.add(new Step<E>(this.start));
246 |             return evaluate(tokens, steps, hasStart);
247 |         }
248 | 
249 |         /**
250 |          * Evaluate the NFA against the list of tokens using the Thompson NFA
251 |          * algorithm.  A solution must consume at least one token; the one leaving the fewest tokens wins.
252 |          * @param tokens the tokens to evaluate against
253 |          * @param steps present list of accessible states.
254 |          * @param hasStart true iff tokens contains the start token.
255 |          * @return the Step that reached the end state for the longest match found, or null if there is none.
256 |          */
257 |         private Step<E> evaluate(List<E> tokens, List<Step<E>> steps, boolean hasStart) {
258 |             int totalTokens = tokens.size();
259 | 
260 |             int solutionTokensLeft = totalTokens; // tokens left unconsumed by the best solution so far
261 |             Step<E> solution = null;
262 |             while (!steps.isEmpty()) {
263 | 
264 |                 expandEpsilons(steps);
265 | 
266 |                 List<Step<E>> intermediate = new ArrayList<Step<E>>(steps);
267 |                 List<Step<E>> newsteps = new ArrayList<Step<E>>(steps.size() * 2);
268 |                 do {
269 | 
270 |                     // check if at end
271 |                     for (Step<E> step : intermediate) {
272 |                         if (step.state == this.end) {
273 |                             if (tokens.size() == totalTokens) {
274 |                                 // can't succeed if no tokens are consumed
275 |                             }
276 |                             else {
277 |                                 // we have reached the end; keep it only if fewer tokens are left than for the best so far
278 |                                 if (tokens.size() < solutionTokensLeft) {
279 |                                     solution = step;
280 |                                     solutionTokensLeft = tokens.size();
281 |                                 }
282 |                             }
283 |                         }
284 |                     }
285 | 
286 |                     // handle assertions
287 |                     newsteps.clear();
288 |                     expandAssertions(intermediate, newsteps, hasStart, tokens, totalTokens);
289 |                     expandEpsilons(newsteps);
290 | 
291 |                     intermediate.clear();
292 |                     intermediate.addAll(newsteps);
293 | 
294 |                     steps.addAll(newsteps);
295 |                 } while (newsteps.size() > 0); // repeat while passing assertions keep opening new steps
296 | 
297 |                 newsteps.clear();
298 |                 if (!tokens.isEmpty()) {
299 |                     for (Step<E> step : steps) {
300 |                         for (final Edge<E> edge : step.state.edges) {
301 |                             // try other edges if they match the current token (AssertionExpression.apply(E) is always false)
302 |                             if (edge.apply(tokens.get(0))) {
303 |                                 newsteps.add(new Step<E>(edge.dest, step, edge));
304 |                             }
305 |                         }
306 |                     }
307 | 
308 |                     // consume a token
309 |                     tokens = tokens.subList(1, tokens.size());
310 |                 }
311 | 
312 |                 steps = newsteps; // empty when the tokens ran out or no edge matched, which ends the loop
313 |             }
314 | 
315 |             return solution;
316 |         }
317 |     }
318 | 
319 |     /**
320 |      * Representation of a state in the automaton.
321 |      * @author Michael Schmitz <schmmd@cs.washington.edu>
322 |      *
323 |      * @param <E>
324 |      */
325 |     public static class State<E> {
326 |         public final List<Edge<E>> edges = new ArrayList<Edge<E>>();
327 |         public final List<Epsilon<E>> epsilons = new ArrayList<Epsilon<E>>();
328 | 
329 |         /**
330 |          * Add an epsilon transition between this state and dest.
331 |          * @param dest the state to connect
332 |          */
333 |         public void connect(State<E> dest) {
334 |             this.epsilons.add(new Epsilon<E>(dest));
335 |         }
336 | 
337 |         /**
338 |          * Add an edge between this state and dest.
339 |          * @param dest the state to connect
340 |          * @param cost the expression of the edge
341 |          */
342 |         public void connect(State<E> dest, Expression<E> cost) {
343 |             this.edges.add(new Edge<E>(dest, cost));
344 |         }
345 | 
346 |         public String toString() {
347 |             return this.getClass().getSimpleName() + ":" + this.edges.size();
348 |         }
349 |     }
350 | 
351 |     /**
352 |      * A start or end state.
353 |      * @author Michael Schmitz <schmmd@cs.washington.edu>
354 |      *
355 |      * @param <E>
356 |      */
357 |     public static class TerminusState<E> extends State<E> {
358 |         public final Expression<E> expression;
359 |         public TerminusState(Expression<E> expression) {
360 |             super();
361 |             this.expression = expression;
362 |         }
363 | 
364 |         public String toString() {
365 |             return this.getClass().getSimpleName()
366 |                    + "("+this.expression.toString()+"):" + this.edges.size();
367 |         }
368 |     }
369 | 
370 |     /**
371 |      * A start state.
372 |      * @author Michael Schmitz <schmmd@cs.washington.edu>
373 |      *
374 |      * @param <E>
375 |      */
376 |     public static class StartState<E> extends TerminusState<E> {
377 |         public StartState(Expression<E> expression) {
378 |             super(expression);
379 |         }
380 | 
381 |         public int minMatchingLength() {
382 |             return this.expression.minMatchingLength();
383 |         }
384 |     }
385 | 
386 |     /**
387 |      * An end state.
388 |      * @author Michael Schmitz <schmmd@cs.washington.edu>
389 |      *
390 |      * @param <E>
391 |      */
392 |     public static class EndState<E> extends TerminusState<E> {
393 |         public EndState(Expression<E> expression) {
394 |             super(expression);
395 |         }
396 |     }
397 | 
398 |     /**
399 |      * An abstract representation of an edge.
400 |      * @author Michael Schmitz <schmmd@cs.washington.edu>
401 |      *
402 |      * @param <E>
403 |      */
404 |     public static abstract class AbstractEdge<E> implements Predicate<E> {
405 |         public final State<E> dest;
406 | 
407 |         public AbstractEdge(State<E> dest) {
408 |             this.dest = dest;
409 |         }
410 |     }
411 | 
412 |     /**
413 |      * An edge with cost {@code expression}.
414 |      * @author Michael Schmitz <schmmd@cs.washington.edu>
415 |      *
416 |      * @param <E>
417 |      */
418 |     public static class Edge<E> extends AbstractEdge<E> {
419 |         public final Expression<E> expression;
420 | 
421 |         public Edge(State<E> dest, Expression<E> base) {
422 |             super(dest);
423 |             this.expression = base;
424 |         }
425 | 
426 |         /**
427 |          * A null expression is rendered as "null" instead of failing, mirroring
428 |          * the null tolerance of {@link #apply}.
429 |          */
430 |         @Override
431 |         public String toString() {
432 |             return "(" + String.valueOf(this.expression) + ") -> " + this.dest.toString();
433 |         }
434 | 
435 |         /** @return true if there is no expression or the expression accepts the entity. */
436 |         @Override
437 |         public boolean apply(E entity) {
438 |             return expression == null || expression.apply(entity);
439 |         }
440 |     }
441 | 
442 |     /**
443 |      * An edge without cost, an epsilon transition.
444 |      * @author Michael Schmitz <schmmd@cs.washington.edu>
445 |      *
446 |      * @param <E>
447 |      */
448 |     public static class Epsilon<E> extends AbstractEdge<E> {
449 |         public Epsilon(State<E> dest) {
450 |             super(dest);
451 |         }
452 | 
453 |         @Override
454 |         public String toString() {
455 |             return "(epsilon) -> " + dest.toString();
456 |         }
457 | 
458 |         @Override
459 |         public boolean apply(E entity) {
460 |             return true;
461 |         }
462 |     }
463 | }
464 | 


--------------------------------------------------------------------------------
/src/main/java/edu/washington/cs/knowitall/regex/Match.java:
--------------------------------------------------------------------------------
  1 | package edu.washington.cs.knowitall.regex;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.Collections;
  5 | import java.util.Collection;
  6 | import java.util.List;
  7 | 
  8 | import com.google.common.base.Function;
  9 | import com.google.common.base.Functions;
 10 | import com.google.common.base.Joiner;
 11 | import com.google.common.collect.Lists;
 12 | 
 13 | import edu.washington.cs.knowitall.regex.Expression.BaseExpression;
 14 | 
 15 | /**
 16 |  * A class to represent a match. Each part of the regular expression is matched
 17 |  * to a sequence of tokens.   A match also stores information about the range
 18 |  * of tokens matched and the matching groups in the match.
 19 |  *
 20 |  * @author Michael Schmitz <schmmd@cs.washington.edu>
 21 |  *
 22 |  * @param <E>
 23 |  */
 24 | public abstract class Match<E> {
 25 |     protected List<Match.Group<E>> pairs;
 26 | 
 27 |     protected Match() {
 28 |       pairs = new ArrayList<Match.Group<E>>();
 29 |     }
 30 | 
 31 |     protected Match(Match<E> match) {
 32 |         this();
 33 |         for (Group<E> pair : match.pairs) {
 34 |             this.add(new Group<E>(pair.expr, pair.tokens));
 35 |         }
 36 |     }
 37 | 
 38 |     public boolean add(Group<E> pair) {
 39 |         return this.pairs.add(pair);
 40 |     }
 41 | 
 42 |     public boolean addAll(Collection<Group<E>> pairs) {
 43 |         boolean allAdded = true;
 44 |         for (Group<E> group : pairs) {
 45 |             // add first, so every element is stored even after an earlier add returned false
 46 |             allAdded = this.add(group) && allAdded;
 47 |         }
 48 |         return allAdded;
 49 |     }
 50 | 
 51 |     /**
 52 |      * Convenience method for add(new Group<E>(expr, token, pos)).
 53 |      * @param expr
 54 |      * @param token
 55 |      * @param pos
 56 |      * @return
 57 |      */
 58 |     public boolean add(Expression<E> expr, E token, int pos) {
 59 |         return this.add(new Group<E>(expr, token, pos));
 60 |     }
 61 | 
 62 |     /**
 63 |      * True iff this match contains no pairs.  This should only happen on an
 64 |      * IntermediateMatch that has not had any pairs added to it yet.
 65 |      */
 66 |     public boolean isEmpty() {
 67 |         return this.pairs.isEmpty();
 68 |     }
 69 | 
 70 |     @Override
 71 |     public String toString() {
 72 |         return "[" + Joiner.on(", ").join(
 73 |           Lists.transform(this.pairs, Functions.toStringFunction())) + "]";
 74 |     }
 75 | 
 76 |     public String toMultilineString() {
 77 |         return Joiner.on("\n").join(Lists.transform(this.pairs,
 78 |           Functions.toStringFunction()));
 79 |     }
 80 | 
 81 |     /**
 82 |      * @return the index of the first token matched (inclusive start).
 83 |      */
 84 |     public abstract int startIndex();
 85 | 
 86 |     /**
 87 |      * @return the index one past of the last token matched (exclusive end).
 88 |      */
 89 |     public abstract int endIndex();
 90 | 
 91 |     /**
 92 |      * Pairs differ from the matching groups in that each regular expression
 93 |      * element has a pair to associate the element with the text matched.
 94 |      * For example, 'a*' might be associated with 'a a a a'.
 95 |      *
 96 |      * @return all pairs in this match.
 97 |      */
 98 |     public List<Group<E>> pairs() {
 99 |         return Collections.unmodifiableList(this.pairs);
100 |     }
101 | 
102 |     /**
103 |      * @return all matching groups (named and unnamed).
104 |      */
105 |     public abstract List<Group<E>> groups();
106 | 
107 |     /**
108 |      * @return all matched tokens.
109 |      */
110 |     public abstract List<E> tokens();
111 | 
112 |     /**
113 |      * @return the number of tokens in the match.
114 |      */
115 |     public int length() {
116 |         return this.tokens().size();
117 |     }
118 | 
119 |     /**
120 |      * Retrieve a group by name.
121 |      * @param name the name of the group to retrieve.
122 |      * @return the associated group.
123 |      */
124 |     public Group<E> group(String name) {
125 |         for (Group<E> group : this.groups()) {
126 |             if (group.expr instanceof Expression.NamedGroup<?>) {
127 |                 Expression.NamedGroup<E> namedGroup = (Expression.NamedGroup<E>) group.expr;
128 |                 if (namedGroup.name.equals(name)) {
129 |                     return group;
130 |                 }
131 |             }
132 |         }
133 | 
134 |         return null;
135 |     }
136 | 
137 |     /**
138 |      * A match representation that has efficient method calls but is immutable.
139 |      * @author Michael Schmitz <schmmd@cs.washington.edu>
140 |      *
141 |      * @param <E>
142 |      */
143 |     protected final static class FinalMatch<E> extends Match<E> {
144 |         private final int startIndex;
145 |         private final List<E> tokens;
146 |         private final List<Group<E>> groups;
147 | 
148 |         protected FinalMatch(Match<E> m) {
149 |             super(m);
150 |             this.startIndex = m.startIndex();
151 |             this.tokens = Collections.unmodifiableList(m.tokens());
152 |             this.groups = Collections.unmodifiableList(m.groups());
153 |         }
154 | 
155 |         public int startIndex() {
156 |             return this.startIndex;
157 |         }
158 | 
159 |         public int endIndex() {
160 |             return this.startIndex() + this.tokens.size();
161 |         }
162 | 
163 |         public List<E> tokens() {
164 |             return this.tokens;
165 |         }
166 | 
167 |         @Override
168 |         public List<Match.Group<E>> groups() {
169 |             return this.groups;
170 |         }
171 |     }
172 | 
173 |     /**
174 |      * A match representation that is mutable but many method calls compute
175 |      * values instead of returning stored values.  This is a good in-between
176 |      * while building a match object.
177 |      * @author Michael Schmitz <schmmd@cs.washington.edu>
178 |      *
179 |      * @param <E>
180 |      */
181 |     protected final static class IntermediateMatch<E> extends Match<E> {
182 |         protected IntermediateMatch() {
183 |             super();
184 |         }
185 | 
186 |         @Override
187 |         public List<E> tokens() {
188 |             List<E> tokens = new ArrayList<E>();
189 |             for (Match.Group<E> pair : this.pairs) {
190 |                 if (pair.expr instanceof BaseExpression<?>) {
191 |                     tokens.addAll(pair.tokens());
192 |                 }
193 |             }
194 | 
195 |             return tokens;
196 |         }
197 | 
198 |         @Override
199 |         public List<Group<E>> groups() {
200 |             List<Group<E>> groups = new ArrayList<Group<E>>();
201 |             for (Group<E> pair : this.pairs) {
202 |                 if (pair.expr instanceof Expression.MatchingGroup<?>
203 |                 && !(pair.expr instanceof Expression.NonMatchingGroup<?>)) {
204 |                     groups.add(pair);
205 |                 }
206 |             }
207 | 
208 |             return groups;
209 |         }
210 | 
211 |         @Override
212 |         public int startIndex() {
213 |             for (Match.Group<E> pair : this.pairs) {
214 |                 if (pair.expr instanceof Expression.BaseExpression<?>) {
215 |                     return pair.tokens.get(0).index;
216 |                 }
217 |             }
218 | 
219 |             return -1;
220 |         }
221 | 
222 |         @Override
223 |         public int endIndex() {
224 |             for (Match.Group<E> pair : Lists.reverse(this.pairs)) {
225 |                 if (pair.expr instanceof Expression.BaseExpression<?>) {
226 |                     return pair.tokens.get(0).index;
227 |                 }
228 |             }
229 | 
230 |             return -1;
231 |         }
232 |     }
233 | 
234 |     /**
235 |      * A captured group in a matched expression.
236 |      * @author Michael Schmitz <schmmd@cs.washington.edu>
237 |      *
238 |      * @param <E>
239 |      */
240 |     public static class Group<E> {
241 |         private static class Token<E> {
242 |             public E entity;
243 |             public int index;
244 | 
245 |             public Token(E entity, int index) {
246 |                 this.entity = entity;
247 |                 this.index = index;
248 |             }
249 | 
250 |             public String toString() {
251 |                 return this.entity.toString();
252 |             }
253 |         }
254 | 
255 |         public final Expression<E> expr;
256 |         private final List<Token<E>> tokens;
257 | 
258 |         public Group(Expression<E> expr, E token, int pos) {
259 |             this(expr, Collections.singletonList(new Token<E>(token, pos)));
260 |         }
261 | 
262 |         public Group(Expression<E> expr, List<Token<E>> tokens) {
263 |             this.expr = expr;
264 |             this.tokens = new ArrayList<Token<E>>(tokens);
265 |         }
266 | 
267 |         public Group(Expression<E> expr) {
268 |             this(expr, new ArrayList<Token<E>>());
269 |         }
270 | 
271 |         /**
272 |          * Add tokens to the group.
273 |          * @param group
274 |          */
275 |         protected void addTokens(Group<E> group) {
276 |             this.tokens.addAll(group.tokens);
277 |         }
278 | 
279 |         /**
280 |          * @return the tokens matched.
281 |          */
282 |         public List<E> tokens() {
283 |             return Lists.transform(this.tokens,
284 |                     new Function<Match.Group.Token<E>, E>() {
285 |                         @Override
286 |                         public E apply(Match.Group.Token<E> token) {
287 |                             return token.entity;
288 |                         }
289 |                     });
290 |         }
291 | 
292 |         /**
293 |          * @return the index of the first token in this group or -1
294 |          */
295 |         public int startIndex() {
296 |             int min = -1;
297 |             for (Token<E> token : this.tokens) {
298 |                 if (min == -1 || token.index < min)
299 |                     min = token.index;
300 |             }
301 | 
302 |             return min;
303 |         }
304 | 
305 |         /**
306 |          * @return the index of the last token in this group or -1
307 |          */
308 |         public int endIndex() {
309 |             int max = -1;
310 |             for (Token<E> token : this.tokens) {
311 |                 if (token.index == -1 || token.index > max)
312 |                     max = token.index;
313 |             }
314 | 
315 |             return max;
316 |         }
317 | 
318 |         /**
319 |          * A string representation of the group.
320 |          * This is a lighter-weight representation than toString.
321 |          */
322 |         public String text() {
323 |             return Joiner.on(" ").join(this.tokens());
324 |         }
325 | 
326 |         /**
327 |          * @return the number of tokens matched.
328 |          */
329 |         public int tokenCount() {
330 |             return this.tokens.size();
331 |         }
332 | 
333 |         @Override
334 |         public String toString() {
335 |             return expr.toString()
336 |                     + ":'"
337 |                     + Joiner.on(" ").join(
338 |                             Lists.transform(this.tokens,
339 |                                     Functions.toStringFunction())) + "'";
340 |         }
341 |     }
342 | }
343 | 


--------------------------------------------------------------------------------
/src/main/java/edu/washington/cs/knowitall/regex/RegexException.java:
--------------------------------------------------------------------------------
 1 | package edu.washington.cs.knowitall.regex;
 2 | 
 3 | /**
 4 |  *
 5 |  * @author Michael Schmitz <schmmd@cs.washington.edu>
 6 |  */
 7 | public class RegexException extends RuntimeException {
 8 |     private static final long serialVersionUID = -3534531866062810681L;
 9 | 
10 |     public RegexException(String message, Exception e) {
11 |         super(message, e);
12 |     }
13 | 
14 |     public RegexException(String message) {
15 |         super(message);
16 |     }
17 | 
18 |     public static class TokenizationRegexException extends RegexException {
19 |         private static final long serialVersionUID = 7064825496455884721L;
20 | 
21 |         public TokenizationRegexException(String message, Exception e) {
22 |             super(message, e);
23 |         }
24 | 
25 |         public TokenizationRegexException(String message) {
26 |             super(message);
27 |         }
28 |     }
29 | }
30 | 


--------------------------------------------------------------------------------
/src/main/java/edu/washington/cs/knowitall/regex/RegularExpression.java:
--------------------------------------------------------------------------------
  1 | package edu.washington.cs.knowitall.regex;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.Arrays;
  5 | import java.util.List;
  6 | import java.util.Scanner;
  7 | 
  8 | import com.google.common.base.Predicate;
  9 | import com.google.common.base.Joiner;
 10 | import com.google.common.base.Function;
 11 | 
 12 | import edu.washington.cs.knowitall.regex.Expression.BaseExpression;
 13 | import edu.washington.cs.knowitall.regex.FiniteAutomaton.Automaton;
 14 | 
 15 | /**
 16 |  * A regular expression engine that operates over sequences of user-specified
 17 |  * objects.
 18 |  *
 19 |  * @author Michael Schmitz <schmmd@cs.washington.edu>
 20 |  *
 21 |  * @param  <E>  the type of the sequence elements
 22 |  */
 23 | public class RegularExpression<E> implements Predicate<List<E>> {
 24 |     public final List<Expression<E>> expressions;
 25 |     public final Automaton<E> auto;
 26 | 
 27 |     public RegularExpression(List<Expression<E>> expressions) {
 28 |         this.expressions = expressions;
 29 |         this.auto = RegularExpression.build(this.expressions);
 30 |     }
 31 | 
 32 |     /***
 33 |      * Create a regular expression without tokenization support.
 34 |      * @param expressions
 35 |      * @return
 36 |      */
 37 |     public static <E> RegularExpression<E> compile(List<Expression<E>> expressions) {
 38 |         return new RegularExpression<E>(expressions);
 39 |     }
 40 | 
 41 |     /***
 42 |      * Create a regular expression from the specified string.
 43 |      * @param expression
 44 |      * @param factoryDelegate
 45 |      * @return
 46 |      */
 47 |     public static <E> RegularExpression<E> compile(final String expression,
 48 |             final Function<String, BaseExpression<E>> factoryDelegate) {
 49 |         return new RegularExpressionParser<E>() {
 50 |             @Override
 51 |             public BaseExpression<E> factory(String token) {
 52 |                 return factoryDelegate.apply(token);
 53 |             }
 54 |         }.parse(expression);
 55 |     }
 56 | 
 57 |     @Override
 58 |     public boolean equals(Object other) {
 59 |         if (! (other instanceof RegularExpression<?>)) {
 60 |             return false;
 61 |         }
 62 | 
 63 |         RegularExpression<?> expression = (RegularExpression<?>) other;
 64 |         return this.toString().equals(expression.toString());
 65 |     }
 66 | 
 67 |     @Override
 68 |     public int hashCode() {
 69 |         return this.toString().hashCode();
 70 |     }
 71 | 
 72 |     @Override
 73 |     public String toString() {
 74 |         List<String> expressions = new ArrayList<String>(
 75 |                 this.expressions.size());
 76 |         for (Expression<E> expr : this.expressions) {
 77 |             expressions.add(expr.toString());
 78 |         }
 79 | 
 80 |         return Joiner.on(" ").join(expressions);
 81 |     }
 82 | 
 83 |     /**
 84 |      * Build an NFA from the list of expressions.
 85 |      * @param exprs
 86 |      * @return
 87 |      */
 88 |     public static <E> Automaton<E> build(List<Expression<E>> exprs) {
 89 |         Expression.MatchingGroup<E> group = new Expression.MatchingGroup<E>(exprs);
 90 |         return group.build();
 91 |     }
 92 | 
 93 |     /**
 94 |      * Apply the expression against a list of tokens.
 95 |      *
 96 |      * @return true iff the expression if found within the tokens.
 97 |      */
 98 |     @Override
 99 |     public boolean apply(List<E> tokens) {
100 |         if (this.find(tokens) != null) {
101 |             return true;
102 |         } else {
103 |             return false;
104 |         }
105 |     }
106 | 
107 |     /**
108 |      * Apply the expression against a list of tokens.
109 |      *
110 |      * @return true iff the expression matches all of the tokens.
111 |      */
112 |     public boolean matches(List<E> tokens) {
113 |         Match<E> match = this.lookingAt(tokens, 0);
114 |         return match != null && match.endIndex() == tokens.size();
115 |     }
116 | 
117 |     /**
118 |      * Find the first match of the regular expression against tokens. This
119 |      * method is slightly slower due to additional memory allocations. However,
120 |      * the response has much greater detail and is very useful for
121 |      * writing/debugging regular expressions.
122 |      *
123 |      * @param tokens
124 |      * @return an object representing the match, or null if no match is found.
125 |      */
126 |     public Match<E> find(List<E> tokens) {
127 |         return this.find(tokens, 0);
128 |     }
129 | 
130 |     /**
131 |      * Find the first match of the regular expression against tokens, starting
132 |      * at the specified index.
133 |      *
134 |      * @param tokens tokens to match against.
135 |      * @param start index to start looking for a match.
136 |      * @return an object representing the match, or null if no match is found.
137 |      */
138 |     public Match<E> find(List<E> tokens, int start) {
139 |         Match<E> match;
140 |         for (int i = start; i <= tokens.size() - auto.minMatchingLength(); i++) {
141 |             match = this.lookingAt(tokens, i);
142 |             if (match != null) {
143 |                 return match;
144 |             }
145 |         }
146 | 
147 |         return null;
148 |     }
149 | 
150 |     /**
151 |      * Determine if the regular expression matches the beginning of the
152 |      * supplied tokens.
153 |      *
154 |      * @param tokens the list of tokens to match.
155 |      * @return an object representing the match, or null if no match is found.
156 |      */
157 |     public Match<E> lookingAt(List<E> tokens) {
158 |         return this.lookingAt(tokens, 0);
159 |     }
160 | 
161 |     /**
162 |      * Determine if the regular expression matches the supplied tokens,
163 |      * starting at the specified index.
164 |      *
165 |      * @param tokens the list of tokens to match.
166 |      * @param start the index where the match should begin.
167 |      * @return an object representing the match, or null if no match is found.
168 |      */
169 |     public Match<E> lookingAt(List<E> tokens, int start) {
170 |         return auto.lookingAt(tokens, start);
171 |     }
172 | 
173 |     public Match<E> match(List<E> tokens) {
174 |         Match<E> match = this.lookingAt(tokens);
175 |         if (match != null && match.endIndex() == tokens.size()) {
176 |             return match;
177 |         }
178 |         else {
179 |             return null;
180 |         }
181 |     }
182 | 
183 |     /**
184 |      * Find all non-overlapping matches of the regular expression against tokens.
185 |      *
186 |      * @param tokens
187 |      * @return an list of objects representing the match.
188 |      */
189 |     public List<Match<E>> findAll(List<E> tokens) {
190 |         List<Match<E>> results = new ArrayList<Match<E>>();
191 | 
192 |         int start = 0;
193 |         Match<E> match;
194 |         do {
195 |             match = this.find(tokens, start);
196 | 
197 |             if (match != null) {
198 |                 start = match.endIndex();
199 | 
200 |                 // match may be empty query string has all optional parts
201 |                 if (!match.isEmpty()) {
202 |                     results.add(match);
203 |                 }
204 |             }
205 |         } while (match != null);
206 | 
207 |         return results;
208 |     }
209 | 
210 |     /**
211 |      * An interactive program that compiles a word-based regular expression
212 |      * specified in arg1 and then reads strings from stdin, evaluating them
213 |      * against the regular expression.
214 |      * @param args
215 |      */
216 |     public static void main(String[] args) {
217 |         Scanner scan = new Scanner(System.in);
218 | 
219 |         RegularExpression<String> regex = RegularExpressionParsers.word.parse(args[0]);
220 |         System.out.println("regex: " + regex);
221 |         System.out.println();
222 | 
223 |         while (scan.hasNextLine()) {
224 |             String line = scan.nextLine();
225 | 
226 |             System.out.println("contains: " + regex.apply(Arrays.asList(line.split("\\s+"))));
227 |             System.out.println("matches:  " + regex.matches(Arrays.asList(line.split("\\s+"))));
228 |             System.out.println();
229 |         }
230 | 
231 |         scan.close();
232 |     }
233 | }
234 | 


--------------------------------------------------------------------------------
/src/main/java/edu/washington/cs/knowitall/regex/RegularExpressionParser.java:
--------------------------------------------------------------------------------
  1 | package edu.washington.cs.knowitall.regex;
  2 | 
  3 | import java.util.ArrayList;
  4 | import java.util.List;
  5 | import java.util.regex.Matcher;
  6 | import java.util.regex.Pattern;
  7 | 
  8 | import com.google.common.base.Function;
  9 | 
 10 | import edu.washington.cs.knowitall.regex.Expression.BaseExpression;
 11 | import edu.washington.cs.knowitall.regex.Expression.EndAssertion;
 12 | import edu.washington.cs.knowitall.regex.Expression.StartAssertion;
 13 | import edu.washington.cs.knowitall.regex.RegexException.TokenizationRegexException;
 14 | 
 15 | /**
 16 |  * A regular expression parser turns strings into RegularExpression
 17 |  * objects.
 18 |  *
 19 |  * @author Michael Schmitz <schmmd@cs.washington.edu>
 20 |  *
 21 |  * @param  <E>  the type of the sequence elements
 22 |  */
 23 | public abstract class RegularExpressionParser<E> implements Function<String, RegularExpression<E>> {
 24 |     /***
 25 |      * The factory method creates an expression from the supplied token string.
 26 |      * @param  token  a string representation of a token
 27 |      * @return  an evaluatable representation of a token
 28 |      */
 29 |     public abstract BaseExpression<E> factory(String token);
 30 | 
 31 |     public RegularExpression<E> parse(String string) {
 32 |         List<Expression<E>> expressions = this.tokenize(string);
 33 |         return new RegularExpression<E>(expressions);
 34 |     }
 35 | 
 36 |     @Override
 37 |     public RegularExpression<E> apply(String string) {
 38 |         return this.parse(string);
 39 |     }
 40 | 
 41 |     /***
 42 |      * Read a token from the remaining text and return it.
 43 |      *
 44 |      * This is a default implementation that is overridable.
 45 |      * In the default implementation, the starting and ending
 46 |      * token characters are not escapable.
 47 |      *
 48 |      * If this implemenation is overridden, A token MUST ALWAYS
 49 |      * start with '<' or '[' and end with '>' or ']'.
 50 |      *
 51 |      * @param remaining
 52 |      * @return
 53 |      */
 54 |     public String readToken(String remaining) {
 55 |         int start = 0;
 56 |         char c = remaining.charAt(0);
 57 | 
 58 |         int end;
 59 |         if (c == '<') {
 60 |             end = indexOfClose(remaining, start, '<', '>');
 61 |         }
 62 |         else if (c == '[' ){
 63 |             end = indexOfClose(remaining, start, '[', ']');
 64 |         }
 65 |         else {
 66 |             throw new IllegalStateException();
 67 |         }
 68 | 
 69 |         // make sure we found the end
 70 |         if (end == -1) {
 71 |             throw new TokenizationRegexException(
 72 |                     "bad token. Non-matching brackets (<> or []): " + start
 73 |                     + ":\"" + remaining.substring(start) + "\"");
 74 |         }
 75 | 
 76 |         String token = remaining.substring(start, end + 1);
 77 |         return token;
 78 |     }
 79 | 
 80 |     /**
 81 |      * Convert a list of tokens (<...>) to a list of expressions.
 82 |      *
 83 |      * @param tokens
 84 |      * @param factory
 85 |      *            Factory class to create a BaseExpression from the text between
 86 |      *            angled brackets.
 87 |      * @return
 88 |      */
 89 |     public List<Expression<E>> tokenize(String string) {
 90 |         List<Expression<E>> expressions = new ArrayList<Expression<E>>();
 91 | 
 92 |         final Pattern whitespacePattern = Pattern.compile("\\s+");
 93 |         final Pattern unaryPattern = Pattern.compile("[*?+]");
 94 |         final Pattern minMaxPattern = Pattern.compile("\\{(\\d+),(\\d+)\\}");
 95 |         final Pattern binaryPattern = Pattern.compile("[|]");
 96 | 
 97 |         List<String> tokens = new ArrayList<String>();
 98 | 
 99 |         char stack = ' ';
100 |         int start = 0;
101 |         while (start < string.length()) {
102 |             Matcher matcher;
103 | 
104 |             // skip whitespace
105 |             if ((matcher = whitespacePattern.matcher(string))
106 |                 .region(start, string.length()).lookingAt()) {
107 |                 start = matcher.end();
108 |                 continue;
109 |             }
110 | 
111 |             char c = string.charAt(start);
112 |             // group, assertion, or token
113 |             if (c == '(' || c == '<' || c == '[' || c == '$' || c == '^') {
114 |                 // group
115 |                 if (string.charAt(start) == '(') {
116 |                     int end = indexOfClose(string, start, '(', ')');
117 |                     if (end == -1) {
118 |                         throw new TokenizationRegexException("unclosed parenthesis: " + start
119 |                                 + ":\"" + string.substring(start) + ")\"");
120 |                     }
121 | 
122 |                     String group = string.substring(start + 1, end);
123 |                     start = end + 1;
124 | 
125 |                     final Pattern namedPattern = Pattern.compile("<(\\w*)>:(.*)");
126 |                     final Pattern unnamedPattern = Pattern.compile("\\?:(.*)");
127 | 
128 |                     // named group (matching)
129 |                     if ((matcher = namedPattern.matcher(group)).matches()) {
130 |                         String groupName = matcher.group(1);
131 |                         group = matcher.group(2);
132 |                         List<Expression<E>> groupExpressions = this.tokenize(group);
133 |                         expressions.add(new Expression.NamedGroup<E>(groupName, groupExpressions));
134 |                     }
135 |                     // unnamed group
136 |                     else if ((matcher = unnamedPattern.matcher(group)).matches()) {
137 |                         group = matcher.group(1);
138 |                         List<Expression<E>> groupExpressions = this.tokenize(group);
139 |                         expressions.add(new Expression.NonMatchingGroup<E>(groupExpressions));
140 |                     }
141 |                     // group (matching)
142 |                     else {
143 |                         List<Expression<E>> groupExpressions = this.tokenize(group);
144 |                         expressions.add(new Expression.MatchingGroup<E>(groupExpressions));
145 |                     }
146 |                 }
147 | 
148 |                 // token
149 |                 else if (c == '<' || c == '[') {
150 |                     String token = readToken(string.substring(start));
151 |                     try {
152 |                         // strip off enclosing characters
153 |                         String tokenInside = token.substring(1, token.length() - 1);
154 |                         BaseExpression<E> base = factory(tokenInside);
155 |                         expressions.add(base);
156 | 
157 |                         start += token.length();
158 |                     }
159 |                     catch (Exception e) {
160 |                         throw new TokenizationRegexException("error parsing token: " + token, e);
161 |                     }
162 |                 }
163 | 
164 |                 // assertion (^)
165 |                 else if (c == '^') {
166 |                     expressions.add(new StartAssertion<E>());
167 |                     start += 1;
168 |                 }
169 | 
170 |                 // assertion ($)
171 |                 else if (c == '$') {
172 |                     expressions.add(new EndAssertion<E>());
173 |                     start += 1;
174 |                 }
175 | 
176 |                 // check if we have a floating OR operator
177 |                 if (stack == '|') {
178 |                     try {
179 |                         stack = ' ';
180 |                         if (expressions.size() < 2) {
181 |                             throw new IllegalStateException(
182 |                                     "OR operator is applied to fewer than 2 elements.");
183 |                         }
184 | 
185 |                         Expression<E> expr1 = expressions.remove(expressions.size() - 1);
186 |                         Expression<E> expr2 = expressions.remove(expressions.size() - 1);
187 |                         expressions.add(new Expression.Or<E>(expr1, expr2));
188 |                     }
189 |                     catch (Exception e) {
190 |                         throw new TokenizationRegexException("error parsing OR (|) operator.", e);
191 |                     }
192 |                 }
193 |             }
194 |             // unary operator
195 |             else if ((matcher = unaryPattern.matcher(string))
196 |                      .region(start, string.length()).lookingAt()) {
197 |                 char operator = matcher.group(0).charAt(0);
198 | 
199 |                 // pop the last expression
200 |                 Expression<E> base = expressions.remove(expressions.size() - 1);
201 | 
202 |                 // add the operator to it
203 |                 Expression<E> expr;
204 |                 if (operator == '?') {
205 |                     expr = new Expression.Option<E>(base);
206 |                 } else if (operator == '*') {
207 |                     expr = new Expression.Star<E>(base);
208 |                 } else if (operator == '+') {
209 |                     expr = new Expression.Plus<E>(base);
210 |                 }
211 |                 else {
212 |                     throw new IllegalStateException();
213 |                 }
214 | 
215 |                 expressions.add(expr);
216 | 
217 |                 start = matcher.end();
218 |             }
219 |             // min/max operator "{x,y}"
220 |             else if ((matcher = minMaxPattern.matcher(string))
221 |                     .region(start, string.length()).lookingAt()) {
222 |                 int minOccurrences = Integer.parseInt(matcher.group(1));
223 |                 int maxOccurrences = Integer.parseInt(matcher.group(2));
224 | 
225 |                 // pop the last expression and add operator
226 |                 Expression<E> base = expressions.remove(expressions.size() - 1);
227 |                 Expression<E> expr = new Expression.MinMax<E>(base, minOccurrences, maxOccurrences);
228 | 
229 |                 expressions.add(expr);
230 | 
231 |                 start = matcher.end();
232 |             }
233 |             // binary operator (alternation)
234 |             else if ((matcher = binaryPattern.matcher(string))
235 |                      .region(start, string.length()).lookingAt()) {
236 |                 tokens.add(matcher.group(0));
237 |                 stack = '|';
238 |                 start = matcher.end();
239 |             }
240 |             else {
241 |                 throw new TokenizationRegexException("unknown symbol: "
242 |                         + string.substring(start));
243 |             }
244 |         }
245 | 
246 |         if (stack == '|') {
247 |             throw new TokenizationRegexException("OR remains on the stack.");
248 |         }
249 | 
250 |         return expressions;
251 |     }
252 | 
253 |     private static int indexOfClose(String string, int start, char open, char close) {
254 |         start--;
255 | 
256 |         int count = 0;
257 |         do {
258 |             start++;
259 | 
260 |             // we hit the end
261 |             if (start >= string.length()) {
262 |                 return -1;
263 |             }
264 | 
265 |             char c = string.charAt(start);
266 | 
267 |             // we hit an open/close
268 |             if (c == open) {
269 |                 count++;
270 |             } else if (c == close) {
271 |                 count--;
272 |             }
273 | 
274 |         } while (count > 0);
275 | 
276 |         return start;
277 |     }
278 | }
279 | 


--------------------------------------------------------------------------------
/src/main/java/edu/washington/cs/knowitall/regex/RegularExpressionParsers.java:
--------------------------------------------------------------------------------
 1 | package edu.washington.cs.knowitall.regex;
 2 | 
 3 | import edu.washington.cs.knowitall.regex.Expression.BaseExpression;
 4 | 
 5 | /**
 6 |  * Static factories for regular expressions over some basic sequences.
 7 |  *
 8 |  * @author Michael Schmitz <schmmd@cs.washington.edu>
 9 |  */
10 | public class RegularExpressionParsers {
11 |     /**
12 |      * Regular expressions over words where sequences are string
13 |      * representations of words.
14 |      */
15 |     public final static RegularExpressionParser<String> word =
16 |         new RegularExpressionParser<String>() {
17 |             @Override public BaseExpression<String> factory(final String string) {
18 |                 return new BaseExpression<String>(string) {
19 |                     @Override public boolean apply(final String token) {
20 |                         return string.equals(token);
21 |                     }
22 |                 };
23 |             }
24 |         };
25 | 
26 |     /**
27 |      * Regular expression over characters, as in java.util.Regex.
28 |      */
29 |     public final static RegularExpressionParser<Character> character =
30 |         new RegularExpressionParser<Character>() {
31 |             @Override public BaseExpression<Character> factory(final String string) {
32 |                 return new BaseExpression<Character>(string) {
33 |                     @Override public boolean apply(final Character token) {
34 |                         return string.equals(token.toString());
35 |                     }
36 |                 };
37 |             }
38 |         };
39 | }
40 | 


--------------------------------------------------------------------------------
/src/test/java/edu/washington/cs/knowitall/regex/MinMaxTest.java:
--------------------------------------------------------------------------------
 1 | package edu.washington.cs.knowitall.regex;
 2 | 
 3 | import org.junit.Test;
 4 | 
 5 | import com.google.common.collect.Lists;
 6 | 
 7 | import java.util.Arrays;
 8 | 
 9 | import static junit.framework.Assert.assertNotNull;
10 | import static junit.framework.Assert.assertNull;
11 | 
12 | public class MinMaxTest {
13 | 
14 |     @Test
15 |     public void testMinMax() {
16 |         RegularExpression<String> regExZeroToOne = getAbcRegex(0, 1);
17 |         assertMatch(regExZeroToOne, "a", "c");
18 |         assertMatch(regExZeroToOne, "a", "b", "c");
19 |         assertNoMatch(regExZeroToOne, "a", "b", "b", "c");
20 | 
21 |         RegularExpression<String> regExOne = getAbcRegex(1, 1);
22 |         assertNoMatch(regExOne, "a", "c");
23 |         assertMatch(regExOne, "a", "b", "c");
24 |         assertNoMatch(regExOne, "a", "b", "b", "c");
25 | 
26 |         RegularExpression<String> regExTwo = getAbcRegex(2, 2);
27 |         assertNoMatch(regExTwo, "a", "c");
28 |         assertNoMatch(regExTwo, "a", "b", "c");
29 |         assertMatch(regExTwo, "a", "b", "b", "c");
30 |         assertNoMatch(regExTwo, "a", "b", "b", "b", "c");
31 | 
32 |         RegularExpression<String> regExOneToTwo = getAbcRegex(1, 2);
33 |         assertNoMatch(regExOneToTwo, "a", "c");
34 |         assertMatch(regExOneToTwo, "a", "b", "c");
35 |         assertMatch(regExOneToTwo, "a", "b", "b", "c");
36 |         assertNoMatch(regExOneToTwo, "a", "b", "b", "b", "c");
37 | 
38 |         RegularExpression<String> regExTwoToFour = getAbcRegex(2, 4);
39 |         assertNoMatch(regExTwoToFour, "a", "c");
40 |         assertNoMatch(regExTwoToFour, "a", "b", "c");
41 |         assertMatch(regExTwoToFour, "a", "b", "b", "c");
42 |         assertMatch(regExTwoToFour, "a", "b", "b", "b", "c");
43 |         assertMatch(regExTwoToFour, "a", "b", "b", "b", "b", "c");
44 |         assertNoMatch(regExTwoToFour, "a", "b", "b", "b", "b", "b", "c");
45 |     }
46 | 
47 |     private void assertMatch(RegularExpression<String> regex, String... input) {
48 |         assertNotNull(regex.find(Arrays.asList(input)));
49 |     }
50 | 
51 |     private void assertNoMatch(RegularExpression<String> regex, String... input) {
52 |         assertNull(regex.find(Arrays.asList(input)));
53 |     }
54 | 
55 |     @Test(expected = IllegalArgumentException.class)
56 |     public void testException1() {
57 |         getAbcRegex(0, 0);
58 |     }
59 | 
60 |     /** Building the regex with max (0) below min (1) is expected to be rejected. */
61 |     @Test(expected = IllegalArgumentException.class)
62 |     public void testException2() {
63 |         final int min = 1;
64 |         final int max = 0;
65 |         getAbcRegex(min, max);
66 |     }
64 | 
65 |     /** Building the regex with a negative min (-1) is expected to be rejected. */
66 |     @Test(expected = IllegalArgumentException.class)
67 |     public void testException3() {
68 |         final int min = -1;
69 |         final int max = 0;
70 |         getAbcRegex(min, max);
71 |     }
69 | 
70 |     /** Building the regex with a negative max (-1) is expected to be rejected. */
71 |     @Test(expected = IllegalArgumentException.class)
72 |     public void testException4() {
73 |         final int min = 0;
74 |         final int max = -1;
75 |         getAbcRegex(min, max);
76 |     }
74 | 
75 |     /**
76 |      * Compiles a three-element pattern: the word {@code <a>}, the word
77 |      * {@code <b>} wrapped in an {@code Expression.MinMax} built from
78 |      * {@code min} and {@code max}, and the word {@code <c>}.
79 |      */
80 |     private RegularExpression<String> getAbcRegex(int min, int max) {
81 |         Expression<String> first = RegularExpressionParsers.word.parse("<a>").expressions.get(0);
82 |         Expression<String> repeatable = RegularExpressionParsers.word.parse("<b>").expressions.get(0);
83 |         Expression<String> last = RegularExpressionParsers.word.parse("<c>").expressions.get(0);
84 |         // All three words are parsed before the MinMax wrapper is constructed.
85 |         Expression<String> repeated = new Expression.MinMax<String>(repeatable, min, max);
86 |         return RegularExpression.compile(Lists.newArrayList(first, repeated, last));
87 |     }
85 | }
86 | 


--------------------------------------------------------------------------------
/src/test/scala/edu/washington/cs/knowitall/logic/LogicTest.scala:
--------------------------------------------------------------------------------
 1 | package edu.washington.cs.knowitall.logic;
 2 | 
 3 | import org.junit.runner.RunWith
 4 | import org.specs2.mutable.Specification
 5 | import org.specs2.runner.JUnitRunner
 6 | import org.specs2.ScalaCheck
 7 | 
 8 | import edu.washington.cs.knowitall.logic.Expression.Arg;
 9 | 
10 | /**
11 |  * Specs for the logic expression parsers: tokenization of escaped quotes,
12 |  * operator precedence, and property-based evaluation of boolean formulas.
13 |  */
14 | @RunWith(classOf[JUnitRunner])
15 | class LogicTest extends Specification with ScalaCheck {
16 |   "escape characters" should {
17 |     "tokenize ok" in {
18 |       // note: escape characters are tokenized but not interpreted
19 |       val matcher = compileStringMatch("\"zebra\" | \"zeb\\\"ra\"")
20 |       matcher.apply("zeb\\\"ra")
21 |     }
22 |   }
23 | 
24 |   "order of operations" should {
25 |     "infer the correct parenthesis" in {
26 |       def rendered(logic: String) = compile(logic).toString()
27 | 
28 |       rendered("false & false & false") must_== "(false & (false & false))"
29 |       rendered("false & false | false") must_== "((false & false) | false)"
30 |       rendered("false | false & false") must_== "(false | (false & false))"
31 |     }
32 |   }
33 | 
34 |   // Each overload registers one example that substitutes generated booleans
35 |   // for the variables of `expr` and compares the parsed result against `f`.
36 |   def eval(expr: String,  f: (Boolean, Boolean) => Boolean) =
37 |     "evaluate ("+expr+") correctly" in {
38 |       check { (a: Boolean, b: Boolean) =>
39 |         val formula = compile(substitute(expr, a, b))
40 |         formula.apply(null) must_== f(a, b)
41 |       }
42 |     }
43 |   def eval(expr: String,  f: (Boolean, Boolean, Boolean) => Boolean) =
44 |     "evaluate ("+expr+") correctly" in {
45 |       check { (a: Boolean, b: Boolean, c: Boolean) =>
46 |         val formula = compile(substitute(expr, a, b, c))
47 |         formula.apply(null) must_== f(a, b, c)
48 |       }
49 |     }
50 |   def eval(expr: String,  f: (Boolean, Boolean, Boolean, Boolean) => Boolean) =
51 |     "evaluate ("+expr+") correctly" in {
52 |       check { (a: Boolean, b: Boolean, c: Boolean, d: Boolean) =>
53 |         val formula = compile(substitute(expr, a, b, c, d))
54 |         formula.apply(null) must_== f(a, b, c, d)
55 |       }
56 |     }
57 | 
58 |   "two variable logic expressions" should {
59 |     eval("a | b", (a: Boolean, b: Boolean) => a | b)
60 |     eval("a & b", (a: Boolean, b: Boolean) => a & b)
61 |   }
62 | 
63 |   "three variable logic expressions" should {
64 |     eval("(a | (b & c))", (a: Boolean, b: Boolean, c: Boolean) => a | (b & c))
65 |     eval("(a & (b & c))", (a: Boolean, b: Boolean, c: Boolean) => a & (b & c))
66 |     eval("(a & (b | c))", (a: Boolean, b: Boolean, c: Boolean) => a & (b | c))
67 |     eval("(a | (b | c))", (a: Boolean, b: Boolean, c: Boolean) => a | (b | c))
68 |   }
69 | 
70 |   "four variable logic expressions" should {
71 |     eval("(a | (b & c & d))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => a | (b & c & d))
72 |     eval("(a | (b & c | d))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => a | (b & c | d))
73 |     eval("(a | (b | c & d))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => a | (b | c & d))
74 |     eval("(a | (b | c | d))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => a | (b | c | d))
75 |     eval("(a & (b & c & d))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => a & (b & c & d))
76 |     eval("(a & (b & c | d))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => a & (b & c | d))
77 |     eval("(a & (b | c & d))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => a & (b | c & d))
78 |     eval("(a & (b | c | d))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => a & (b | c | d))
79 |     eval("((a | b) & (c | d))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => (a | b) & (c | d))
80 |     eval("((a & b) | (c & d))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => (a & b) | (c & d))
81 |     eval("(!(a | b) & (c | d))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => !(a | b) & (c | d))
82 |     eval("((a | b) & !(c | d))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => (a | b) & !(c | d))
83 |     eval("(!((a | b) & !(c | d)))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => !((a | b) & !(c | d)))
84 |   }
85 | 
86 |   // Replaces the variables a, b, c, ... in `expr` with "true"/"false", one
87 |   // variable at a time. Safe for a-d: the inserted literals contain none of
88 |   // the letters b, c or d, and 'a' is replaced before any literal is inserted.
89 |   def substitute(expr: String, varargs: Boolean*) =
90 |     varargs.zipWithIndex.foldLeft(expr) { case (partial, (value, index)) =>
91 |       val variable = ('a' + index).toChar
92 |       partial.replace(variable.toString, value.toString)
93 |     }
94 | 
95 |   def compile(logic: String): LogicExpression[String] = LogicExpressionParsers.trivial.parse(logic)
96 | 
97 |   def compileStringMatch(logic: String): LogicExpression[String] = LogicExpressionParsers.stringMatch.parse(logic)
98 | }
99 | 


--------------------------------------------------------------------------------
/src/test/scala/edu/washington/cs/knowitall/logic/WordLogicTest.scala:
--------------------------------------------------------------------------------
 1 | package edu.washington.cs.knowitall.regex
 2 | 
 3 | import scala.collection.JavaConverters._
 4 | 
 5 | import org.junit.runner.RunWith
 6 | import org.specs2.mutable.Specification
 7 | import org.specs2.runner.JUnitRunner
 8 | 
 9 | import edu.washington.cs.knowitall.logic._
10 | import edu.washington.cs.knowitall.logic.Expression.Arg
11 | 
12 | /** Checks the README's logic-expression example against a custom token type. */
13 | @RunWith(classOf[JUnitRunner])
14 | class WordLogicTest extends Specification {
15 |   case class WordToken(string: String, postag: String, chunk: String)
16 | 
17 |   "README logic example" should {
18 |     "work" in {
19 |       // Parses `string` with a parser whose atoms have the form part='value';
20 |       // each atom compares the named WordToken field with the unquoted value.
21 |       def create(string: String) = {
22 |         val parser = new LogicExpressionParser[WordToken] {
23 |           override def factory(expr: String) = {
24 |             new Arg.Pred[WordToken](expr) {
25 |               val Array(part, quotedValue) = expr.split("=")
26 |               // strip the first and last character (the surrounding quotes)
27 |               val value = quotedValue.drop(1).dropRight(1)
28 |               override def apply(entity: WordToken) = {
29 |                 val actual = part match {
30 |                   case "string" => entity.string
31 |                   case "postag" => entity.postag
32 |                   case "chunk" => entity.chunk
33 |                 }
34 |                 actual == value
35 |               }
36 |             }
37 |           }
38 |         }
39 |         parser.parse(string)
40 |       }
41 | 
42 |       val logic = create("string='the' | postag='JJ'")
43 |       logic(WordToken("the", "foo", "bar")) must beTrue
44 |       logic(WordToken("foo", "JJ", "bar")) must beTrue
45 |       logic(WordToken("foo", "bar", "baz")) must beFalse
46 |     }
47 |   }
48 | }
49 | 


--------------------------------------------------------------------------------
/src/test/scala/edu/washington/cs/knowitall/regex/RegularExpressionAssertionTest.scala:
--------------------------------------------------------------------------------
 1 | package edu.washington.cs.knowitall.regex
 2 | import org.junit.runner.RunWith
 3 | import edu.washington.cs.knowitall.regex.Expression.BaseExpression
 4 | import scala.collection.JavaConversions._
 5 | import org.specs2.mutable.Specification
 6 | import org.specs2.runner.JUnitRunner
 7 | 
 8 | /**
 9 |  * Compares a plain `<is> <a>` pattern with variants that additionally start
10 |  * with the `^` token, end with the `$` token, or both.
11 |  */
12 | @RunWith(classOf[JUnitRunner])
13 | class RegularExpressionAssertionTest extends Specification {
14 |   val regexTokens = List("^", "<is>", "<a>", "$")
15 |   val matchTokens = List("this", "is", "a", "test")
16 | 
17 |   // pattern without "^" and "$"
18 |   val regex = RegularExpressionParsers.word.parse(regexTokens.drop(1).dropRight(1).mkString(" "))
19 |   // pattern without "^" (still ends with "$")
20 |   val regexEnd = RegularExpressionParsers.word.parse(regexTokens.drop(1).mkString(" "))
21 |   // pattern without "$" (still starts with "^")
22 |   val regexStart = RegularExpressionParsers.word.parse(regexTokens.dropRight(1).mkString(" "))
23 |   // pattern with both "^" and "$"
24 |   val regexBoth = RegularExpressionParsers.word.parse(regexTokens.mkString(" "))
25 | 
26 |   // The sample sentence without its first, respectively last, token.
27 |   private val withoutFirst = matchTokens.drop(1)
28 |   private val withoutLast = matchTokens.dropRight(1)
29 | 
30 |   /** Registers an example asserting that `regex.apply(tokens)` equals `value`. */
31 |   def evaluate(regex: RegularExpression[String], tokens: List[String], value: Boolean) = {
32 |     val negation = if (value) "" else "not "
33 |     val description = negation + "be found in '" + tokens.mkString(" ") + "': "
34 |     description in {
35 |       regex.apply(tokens) must beTrue.iff(value)
36 |     }
37 |   }
38 | 
39 |   regex.toString should {
40 |     evaluate(regex, matchTokens, true)
41 |     evaluate(regex, withoutFirst, true)
42 |     evaluate(regex, withoutLast, true)
43 |   }
44 | 
45 |   regexEnd.toString should {
46 |     evaluate(regexEnd, matchTokens, false)
47 |     evaluate(regexEnd, withoutFirst, false)
48 |     evaluate(regexEnd, withoutLast, true)
49 |   }
50 | 
51 |   regexStart.toString should {
52 |     evaluate(regexStart, matchTokens, false)
53 |     evaluate(regexStart, withoutFirst, true)
54 |     evaluate(regexStart, withoutLast, false)
55 |   }
56 | 
57 |   regexBoth.toString should {
58 |     "match 'is a'" in {
59 |       regexBoth.matches(List("is", "a")) must beTrue
60 |     }
61 |     evaluate(regexBoth, matchTokens, false)
62 |     evaluate(regexBoth, withoutFirst, false)
63 |     evaluate(regexBoth, withoutLast, false)
64 |   }
65 | }
66 | 


--------------------------------------------------------------------------------
/src/test/scala/edu/washington/cs/knowitall/regex/RegularExpressionNamedGroupTest.scala:
--------------------------------------------------------------------------------
 1 | package edu.washington.cs.knowitall.regex
 2 | import org.junit.runner.RunWith
 3 | import edu.washington.cs.knowitall.regex.Expression.BaseExpression
 4 | import scala.collection.JavaConversions._
 5 | import org.specs2.mutable.Specification
 6 | import org.specs2.runner.JUnitRunner
 7 | 
 8 | /**
 9 |  * Checks matching and group extraction for a pattern that uses named groups
10 |  * (`(<name>: ...)`) nested inside non-capturing groups (`(?: ...)`).
11 |  */
12 | @RunWith(classOf[JUnitRunner])
13 | class RegularExpressionNamedGroupTest extends Specification {
14 |   val regex = RegularExpressionParsers.word.parse("(<subject>: <I> | (?: <The> (<subjadj>: <crazy>)? <Mariners>)) <know> <all> <of> (<poss>: <her> | (?: <the> (<possadj>: <dirty>?) <King> <'s>)) <secrets>")
15 | 
16 |   // Splits a sentence on single spaces into the token list handed to the regex.
17 |   private def tokenize(sentence: String) = sentence.split(" ").toList
18 | 
19 |   regex.toString should {
20 |     val matches = List("I know all of her secrets",
21 |       "The Mariners know all of her secrets",
22 |       "The Mariners know all of the dirty King 's secrets",
23 |       "The Mariners know all of the King 's secrets",
24 |       "The crazy Mariners know all of the King 's secrets")
25 | 
26 |     matches.foreach { m =>
27 |       "match against " + m in {
28 |         regex.apply(tokenize(m)) must beTrue
29 |       }
30 |     }
31 | 
32 |     // The two group examples carry the sentence in their description so that
33 |     // they remain distinguishable in test reports.
34 |     val allGroupsSentence = "The crazy Mariners know all of the King 's secrets"
35 |     "yield the correct groups for '" + allGroupsSentence + "'" in {
36 |       val m = regex.find(tokenize(allGroupsSentence))
37 |       m.groups().size() must_== 5
38 | 
39 |       m.group("subject").text must_== "The crazy Mariners"
40 |       m.group("subject").startIndex must_== 0
41 |       m.group("subject").endIndex must_== 2
42 | 
43 |       m.group("subjadj").text must_== "crazy"
44 |       m.group("subjadj").startIndex must_== 1
45 |       m.group("subjadj").endIndex must_== 1
46 | 
47 |       m.group("poss").text must_== "the King 's"
48 |       m.group("poss").startIndex must_== 6
49 |       m.group("poss").endIndex must_== 8
50 | 
51 |       // "dirty" is absent here: the possadj group is still reported, with
52 |       // empty text and -1 for both indices.
53 |       m.group("possadj").text must_== ""
54 |       m.group("possadj").startIndex must_== -1
55 |       m.group("possadj").endIndex must_== -1
56 |     }
57 | 
58 |     val fewGroupsSentence = "The Mariners know all of her secrets"
59 |     "yield the correct groups for '" + fewGroupsSentence + "'" in {
60 |       val m = regex.find(tokenize(fewGroupsSentence))
61 |       m.groups().size() must_== 3
62 | 
63 |       m.group("subject").text must_== "The Mariners"
64 |       m.group("subject").startIndex must_== 0
65 |       m.group("subject").endIndex must_== 1
66 | 
67 |       m.group("poss").text must_== "her"
68 |       m.group("poss").startIndex must_== 5
69 |       m.group("poss").endIndex must_== 5
70 |     }
71 |   }
72 | }
73 | 


--------------------------------------------------------------------------------
/src/test/scala/edu/washington/cs/knowitall/regex/RegularExpressionPermutationTest.scala:
--------------------------------------------------------------------------------
 1 | package edu.washington.cs.knowitall.regex
 2 | import org.junit.runner.RunWith
 3 | import edu.washington.cs.knowitall.regex.Expression.BaseExpression
 4 | import scala.collection.JavaConversions._
 5 | import org.specs2.mutable.Specification
 6 | import org.specs2.runner.JUnitRunner
 7 | import scala.collection.immutable.SortedSet
 8 | 
 9 | /**
10 |  * For every ordering of the four quantified tokens, checks `matches` against
11 |  * generated token sequences whose expected outcome is derived per expression.
12 |  */
13 | @RunWith(classOf[JUnitRunner])
14 | class RegularExpressionPermutationTest extends Specification {
15 |   /** A token sequence paired with whether the regex is expected to match it. */
16 |   case class TestCase(tokens: List[String], value: Boolean) extends Ordered[TestCase] {
17 |     // The concatenation is expected to match only when both parts are.
18 |     def extend(test: TestCase) =
19 |       TestCase(tokens ::: test.tokens, value && test.value)
20 | 
21 |     // Order by the space-joined tokens first, then by the expected value.
22 |     def compare(that: TestCase) = {
23 |       val byTokens = tokens.mkString(" ") compare that.tokens.mkString(" ")
24 |       if (byTokens == 0) value compare that.value
25 |       else byTokens
26 |     }
27 |   }
28 | 
29 |   val tokens = List("<this>+", "<is>*", "<a>?", "<test>")
30 |   tokens.permutations.foreach { permutation =>
31 |     permutation.mkString("'", " ", "'") should {
32 |       "match sentences correctly" in {
33 |         val regex = RegularExpressionParsers.word.parse(permutation.mkString(" "))
34 |         val behavesAsExpected = { test: TestCase =>
35 |           regex.matches(test.tokens) aka test.tokens.mkString("'", " ", "'") must beTrue.iff(test.value)
36 |         }
37 | 
38 |         behavesAsExpected.forall(cases(regex))
39 |       }
40 |     }
41 |   }
42 | 
43 |   /** Builds the sorted set of test cases for the expressions of `regex`. */
44 |   def cases(regex: RegularExpression[String]) = {
45 |     // Source text of the base expression wrapped by a quantifier.
46 |     def sourceOf(inner: Expression[_]) =
47 |       inner.asInstanceOf[BaseExpression[String]].source
48 | 
49 |     // For one expression: (sequences expected to fail, sequences expected to
50 |     // match), built from zero, one or two copies of its source text.
51 |     def outcomes(expr: Expression[String]): (List[List[String]], List[List[String]]) = expr match {
52 |       case star: Expression.Star[_] =>
53 |         val source = sourceOf(star.expr)
54 |         (List(), List(List(), List(source), List(source, source)))
55 |       case plus: Expression.Plus[_] =>
56 |         val source = sourceOf(plus.expr)
57 |         (List(List()), List(List(source), List(source, source)))
58 |       case option: Expression.Option[_] =>
59 |         val source = sourceOf(option.expr)
60 |         (List(List(source, source)), List(List(), List(source)))
61 |       case base: Expression.BaseExpression[_] =>
62 |         val source = base.source
63 |         (List(List(), List(source, source)), List(List(source)))
64 |       case _ => (List(), List())
65 |     }
66 | 
67 |     def casesFor(expr: Expression[String]) = {
68 |       val (rejected, accepted) = outcomes(expr)
69 |       rejected.map(TestCase(_, false)) ::: accepted.map(TestCase(_, true))
70 |     }
71 | 
72 |     // Cross product of both lists; an empty `suffixes` leaves `prefixes` as is.
73 |     def crossProduct(prefixes: List[TestCase], suffixes: List[TestCase]) =
74 |       if (suffixes.isEmpty) prefixes
75 |       else for (prefix <- prefixes; suffix <- suffixes) yield (prefix extend suffix)
76 | 
77 |     def build(remaining: List[Expression[String]]): List[TestCase] = remaining match {
78 |       case Nil => List()
79 |       case current :: rest => crossProduct(casesFor(current), build(rest))
80 |     }
81 | 
82 |     SortedSet[TestCase]() ++ build(regex.expressions.toList)
83 |   }
84 | }
85 | 


--------------------------------------------------------------------------------
/src/test/scala/edu/washington/cs/knowitall/regex/RegularExpressionUnnamedGroupTest.scala:
--------------------------------------------------------------------------------
 1 | package edu.washington.cs.knowitall.regex
 2 | import org.junit.runner.RunWith
 3 | import edu.washington.cs.knowitall.regex.Expression.BaseExpression
 4 | import scala.collection.JavaConversions._
 5 | import org.specs2.mutable.Specification
 6 | import org.specs2.runner.JUnitRunner
 7 | 
 8 | /**
 9 |  * Checks matching and group extraction for a pattern built from nested
10 |  * unnamed groups, non-capturing groups (`(?: ...)`) and a `{1,3}` repetition.
11 |  */
12 | @RunWith(classOf[JUnitRunner])
13 | class RegularExpressionTest extends Specification {
14 |   val regex = RegularExpressionParsers.word.parse("<this> <is> (((?:(?: <a> <very>+) | <an>) <amazing>? <new>{1,3}) | (?: <a> <many>* <centuries> <old>)) <test>")
15 | 
16 |   // Splits a sentence on single spaces into the token list handed to the regex.
17 |   private def tokenize(sentence: String) = sentence.split(" ").toList
18 | 
19 |   regex.toString should {
20 |     "match" in {
21 |       regex.apply(tokenize("this is a very very very amazing new test")) must beTrue
22 |       regex.apply(tokenize("this is an amazing new test")) must beTrue
23 |       regex.apply(tokenize("this is a centuries old test")) must beTrue
24 |       regex.apply(tokenize("this is a many many centuries old test")) must beTrue
25 |       // one to three repetitions of "new" are accepted
26 |       regex.apply(tokenize("this is a very new test")) must beTrue
27 |       regex.apply(tokenize("this is a very new new test")) must beTrue
28 |       regex.apply(tokenize("this is a very new new new test")) must beTrue
29 |     }
30 | 
31 |     "not match" in {
32 |       regex.apply(tokenize("this is a amazing new test")) must beFalse
33 |       // a fourth "new" exceeds the {1,3} bound
34 |       regex.apply(tokenize("this is a very new new new new test")) must beFalse
35 |     }
36 | 
37 |     // The two group examples carry the sentence in their description so that
38 |     // they remain distinguishable in test reports.
39 |     val nestedGroupsSentence = "this is a very very very amazing new test"
40 |     "yield the correct groups for '" + nestedGroupsSentence + "'" in {
41 |       val m = regex.find(tokenize(nestedGroupsSentence))
42 |       m.groups().size() must_== 3
43 |       m.groups().get(1).text must_== "a very very very amazing new"
44 |       m.groups().get(2).text must_== "a very very very amazing new"
45 |     }
46 | 
47 |     val singleGroupSentence = "this is a centuries old test"
48 |     "yield the correct groups for '" + singleGroupSentence + "'" in {
49 |       val m = regex.find(tokenize(singleGroupSentence))
50 |       m.groups().size() must_== 2
51 |       m.groups().get(1).text must_== "a centuries old"
52 |     }
53 |   }
54 | }
55 | 


--------------------------------------------------------------------------------
/src/test/scala/edu/washington/cs/knowitall/regex/WordRegularExpressionTest.scala:
--------------------------------------------------------------------------------
 1 | package edu.washington.cs.knowitall.regex
 2 | 
 3 | import scala.collection.JavaConverters._
 4 | 
 5 | import org.junit.runner.RunWith
 6 | import org.specs2.mutable.Specification
 7 | import org.specs2.runner.JUnitRunner
 8 | 
 9 | import edu.washington.cs.knowitall.regex.Expression.BaseExpression
10 | 
11 | /** Checks the README's regular-expression example against a custom token type. */
12 | @RunWith(classOf[JUnitRunner])
13 | class WordRegularExpressionTest extends Specification {
14 |   case class WordToken(string: String, postag: String, chunk: String)
15 | 
16 |   /**
17 |    * Parses `string` into a regular expression over [[WordToken]]s whose base
18 |    * expressions have the form `part='value'`.
19 |    */
20 |   def compile(string: String): RegularExpression[WordToken] = {
21 |     // create a parser for regular expression language that have
22 |     // the same token representation
23 |     val parser =
24 |       new RegularExpressionParser[WordToken]() {
25 |         // Translate an string "part=value" into a BaseExpression that
26 |         // checks whether the part of a WordToken has value 'value'.
27 |         override def factory(string: String): BaseExpression[WordToken] = {
28 |           new BaseExpression[WordToken](string) {
29 |             val Array(part, quotedValue) = string.split("=")
30 |             val value = quotedValue.drop(1).take(quotedValue.size - 2)
31 |             override def apply(entity: WordToken) = {
32 |               // Compare from `value`, which is never null, so that a token
33 |               // field that is null (e.g. chunk below) yields false, not an NPE.
34 |               part match {
35 |                 case "string" => value equalsIgnoreCase entity.string
36 |                 case "postag" => value equalsIgnoreCase entity.postag
37 |                 case "chunk" => value equalsIgnoreCase entity.chunk
38 |               }
39 |             }
40 |           }
41 |         }
42 |       }
43 | 
44 |     parser.parse(string)
45 |   }
46 | 
47 |   "README regex example one" should {
48 |     "work" in {
49 |       // Tokens of: "The US president Barack Obama is travelling to Mexico."
50 |       val tokens = Seq(
51 |         WordToken("The", "DT", null),
52 |         WordToken("US", "NNP", null),
53 |         WordToken("president", "NN", null),
54 |         WordToken("Barack", "NNP", null),
55 |         WordToken("Obama", "NNP", null),
56 |         WordToken("is", "VB", null),
57 |         WordToken("travelling", "VB", null),
58 |         WordToken("to", "TO", null),
59 |         WordToken("Mexico", "NN", null),
60 |         WordToken(".", ".", null))
61 |       val regex = compile("""(?:<string='a'> | <string='an'> | <string='the'>)? <postag='JJ'>* <postag='NNP'>+ <postag='NN'>+ <postag='NNP'>+""")
62 |       // Option(...) turns a null result into None, so a missing match fails
63 |       // the matcher below instead of throwing.
64 |       val matchedText = Option(regex.find(tokens.asJava)).map { found =>
65 |         found.groups.get(0).tokens.asScala.map(_.string).mkString(" ")
66 |       }
67 |       matchedText must beSome("The US president Barack Obama")
68 |     }
69 |   }
70 | }
71 | 


--------------------------------------------------------------------------------