├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── build.sbt └── src ├── main └── java │ └── edu │ └── washington │ └── cs │ └── knowitall │ ├── logic │ ├── ArgFactory.java │ ├── Expression.java │ ├── LogicException.java │ ├── LogicExpression.java │ ├── LogicExpressionParser.java │ └── LogicExpressionParsers.java │ └── regex │ ├── Expression.java │ ├── ExpressionFactory.java │ ├── FiniteAutomaton.java │ ├── Match.java │ ├── RegexException.java │ ├── RegularExpression.java │ ├── RegularExpressionParser.java │ └── RegularExpressionParsers.java └── test ├── java └── edu │ └── washington │ └── cs │ └── knowitall │ └── regex │ └── MinMaxTest.java └── scala └── edu └── washington └── cs └── knowitall ├── logic ├── LogicTest.scala └── WordLogicTest.scala └── regex ├── RegularExpressionAssertionTest.scala ├── RegularExpressionNamedGroupTest.scala ├── RegularExpressionPermutationTest.scala ├── RegularExpressionUnnamedGroupTest.scala └── WordRegularExpressionTest.scala /.gitignore: -------------------------------------------------------------------------------- 1 | .classpath 2 | .project 3 | .settings 4 | .cache 5 | target 6 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: scala 2 | scala: 3 | - "2.10.2" 4 | jdk: 5 | - oraclejdk7 6 | - openjdk7 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU LESSER GENERAL PUBLIC LICENSE 2 | Version 3, 29 June 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 
7 | 8 | 9 | This version of the GNU Lesser General Public License incorporates 10 | the terms and conditions of version 3 of the GNU General Public 11 | License, supplemented by the additional permissions listed below. 12 | 13 | 0. Additional Definitions. 14 | 15 | As used herein, "this License" refers to version 3 of the GNU Lesser 16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU 17 | General Public License. 18 | 19 | "The Library" refers to a covered work governed by this License, 20 | other than an Application or a Combined Work as defined below. 21 | 22 | An "Application" is any work that makes use of an interface provided 23 | by the Library, but which is not otherwise based on the Library. 24 | Defining a subclass of a class defined by the Library is deemed a mode 25 | of using an interface provided by the Library. 26 | 27 | A "Combined Work" is a work produced by combining or linking an 28 | Application with the Library. The particular version of the Library 29 | with which the Combined Work was made is also called the "Linked 30 | Version". 31 | 32 | The "Minimal Corresponding Source" for a Combined Work means the 33 | Corresponding Source for the Combined Work, excluding any source code 34 | for portions of the Combined Work that, considered in isolation, are 35 | based on the Application, and not on the Linked Version. 36 | 37 | The "Corresponding Application Code" for a Combined Work means the 38 | object code and/or source code for the Application, including any data 39 | and utility programs needed for reproducing the Combined Work from the 40 | Application, but excluding the System Libraries of the Combined Work. 41 | 42 | 1. Exception to Section 3 of the GNU GPL. 43 | 44 | You may convey a covered work under sections 3 and 4 of this License 45 | without being bound by section 3 of the GNU GPL. 46 | 47 | 2. Conveying Modified Versions. 
48 | 49 | If you modify a copy of the Library, and, in your modifications, a 50 | facility refers to a function or data to be supplied by an Application 51 | that uses the facility (other than as an argument passed when the 52 | facility is invoked), then you may convey a copy of the modified 53 | version: 54 | 55 | a) under this License, provided that you make a good faith effort to 56 | ensure that, in the event an Application does not supply the 57 | function or data, the facility still operates, and performs 58 | whatever part of its purpose remains meaningful, or 59 | 60 | b) under the GNU GPL, with none of the additional permissions of 61 | this License applicable to that copy. 62 | 63 | 3. Object Code Incorporating Material from Library Header Files. 64 | 65 | The object code form of an Application may incorporate material from 66 | a header file that is part of the Library. You may convey such object 67 | code under terms of your choice, provided that, if the incorporated 68 | material is not limited to numerical parameters, data structure 69 | layouts and accessors, or small macros, inline functions and templates 70 | (ten or fewer lines in length), you do both of the following: 71 | 72 | a) Give prominent notice with each copy of the object code that the 73 | Library is used in it and that the Library and its use are 74 | covered by this License. 75 | 76 | b) Accompany the object code with a copy of the GNU GPL and this license 77 | document. 78 | 79 | 4. Combined Works. 80 | 81 | You may convey a Combined Work under terms of your choice that, 82 | taken together, effectively do not restrict modification of the 83 | portions of the Library contained in the Combined Work and reverse 84 | engineering for debugging such modifications, if you also do each of 85 | the following: 86 | 87 | a) Give prominent notice with each copy of the Combined Work that 88 | the Library is used in it and that the Library and its use are 89 | covered by this License. 
90 | 91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license 92 | document. 93 | 94 | c) For a Combined Work that displays copyright notices during 95 | execution, include the copyright notice for the Library among 96 | these notices, as well as a reference directing the user to the 97 | copies of the GNU GPL and this license document. 98 | 99 | d) Do one of the following: 100 | 101 | 0) Convey the Minimal Corresponding Source under the terms of this 102 | License, and the Corresponding Application Code in a form 103 | suitable for, and under terms that permit, the user to 104 | recombine or relink the Application with a modified version of 105 | the Linked Version to produce a modified Combined Work, in the 106 | manner specified by section 6 of the GNU GPL for conveying 107 | Corresponding Source. 108 | 109 | 1) Use a suitable shared library mechanism for linking with the 110 | Library. A suitable mechanism is one that (a) uses at run time 111 | a copy of the Library already present on the user's computer 112 | system, and (b) will operate properly with a modified version 113 | of the Library that is interface-compatible with the Linked 114 | Version. 115 | 116 | e) Provide Installation Information, but only if you would otherwise 117 | be required to provide such information under section 6 of the 118 | GNU GPL, and only to the extent that such information is 119 | necessary to install and execute a modified version of the 120 | Combined Work produced by recombining or relinking the 121 | Application with a modified version of the Linked Version. (If 122 | you use option 4d0, the Installation Information must accompany 123 | the Minimal Corresponding Source and Corresponding Application 124 | Code. If you use option 4d1, you must provide the Installation 125 | Information in the manner specified by section 6 of the GNU GPL 126 | for conveying Corresponding Source.) 127 | 128 | 5. Combined Libraries. 
129 | 130 | You may place library facilities that are a work based on the 131 | Library side by side in a single library together with other library 132 | facilities that are not Applications and are not covered by this 133 | License, and convey such a combined library under terms of your 134 | choice, if you do both of the following: 135 | 136 | a) Accompany the combined library with a copy of the same work based 137 | on the Library, uncombined with any other library facilities, 138 | conveyed under the terms of this License. 139 | 140 | b) Give prominent notice with the combined library that part of it 141 | is a work based on the Library, and explaining where to find the 142 | accompanying uncombined form of the same work. 143 | 144 | 6. Revised Versions of the GNU Lesser General Public License. 145 | 146 | The Free Software Foundation may publish revised and/or new versions 147 | of the GNU Lesser General Public License from time to time. Such new 148 | versions will be similar in spirit to the present version, but may 149 | differ in detail to address new problems or concerns. 150 | 151 | Each version is given a distinguishing version number. If the 152 | Library as you received it specifies that a certain numbered version 153 | of the GNU Lesser General Public License "or any later version" 154 | applies to it, you have the option of following the terms and 155 | conditions either of that published version or of any later version 156 | published by the Free Software Foundation. If the Library as you 157 | received it does not specify a version number of the GNU Lesser 158 | General Public License, you may choose any version of the GNU Lesser 159 | General Public License ever published by the Free Software Foundation. 
160 | 161 | If the Library as you received it specifies that a proxy can decide 162 | whether future versions of the GNU Lesser General Public License shall 163 | apply, that proxy's public statement of acceptance of any version is 164 | permanent authorization for you to choose that version for the 165 | Library. 166 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # OpenRegex 2 | 3 | OpenRegex is written by Michael Schmitz at the Turing Center 4 | . It is licensed under the lesser GPL. 5 | Please see the LICENSE file for more details. 6 | 7 | 8 | ## Introduction 9 | 10 | OpenRegex is an efficient and flexible token-based regular expression language 11 | and engine. Most regular expression implementations are closed to run only 12 | over characters. Although this is the most common application for regular 13 | expressions, OpenRegex does not have this restriction. OpenRegex is open to 14 | any sequences of user-defined objects. 15 | 16 | 17 | ## Applied to Natural Language 18 | 19 | For example, OpenRegex is used in the R2A2 extension to ReVerb, an open-domain 20 | information extractor, to determine argument boundaries. In this case, tokens 21 | are words in English sentences with additional information (the string of the 22 | word, the part-of-speech tag, and the chunk tag). 23 | 24 | case class WordToken(string: String, postag: String, chunk: String) 25 | 26 | Now that we have defined our token, we can build up a sentence (an NLP library 27 | such as OpenNLP can help out here). We will also need to define a way to 28 | translate each token in the expression (text between angle brackets) into 29 | an expression that can be applied to a word token. 
30 | 31 | ``` 32 | def compile(string: String): RegularExpression[WordToken] = { 33 | // create a parser for regular expression language that have 34 | // the same token representation 35 | val parser = 36 | new RegularExpressionParser[WordToken]() { 37 | // Translate an string "part=value" into a BaseExpression that 38 | // checks whether the part of a WordToken has value 'value'. 39 | override def factory(string: String): BaseExpression[WordToken] = { 40 | new BaseExpression[WordToken](string) { 41 | val Array(part, quotedValue) = string.split("=") 42 | val value = quotedValue.drop(1).take(quotedValue.size - 2) 43 | override def apply(entity: WordToken) = { 44 | part match { 45 | case "string" => entity.string equalsIgnoreCase value 46 | case "postag" => entity.postag equalsIgnoreCase value 47 | case "chunk" => entity.chunk equalsIgnoreCase value 48 | } 49 | } 50 | } 51 | } 52 | } 53 | 54 | parser.parse(string) 55 | } 56 | ``` 57 | 58 | Now we can compile a regular expression and apply it to a sentence. Consider 59 | the following pattern. The first line defines a non-matching group that 60 | matches a determiner ("a", "an", or "the"). The second line matches a sequence 61 | of part-of-speech tags ("JJ" is adjective, "NNP" is proper noun, and "NN" is 62 | common noun). 63 | 64 | (?: | | )? 65 | * + + + 66 | 67 | We can try applying it to a couple of sentences. 68 | 69 | 1. The US president Barack Obama is travelling to Mexico. 70 | 71 | ``` 72 | regex.find(sentence).groups.get(0) matches "The US president Barack Obama" 73 | ``` 74 | 75 | 2. If all the ice melted from the frigid Earth continent Antarctica, sea 76 | levels would rise hundreds of feet. 77 | 78 | ``` 79 | regex.find(sentence).groups.get(0) matches "the frigid Earth continent Antarctica" 80 | ``` 81 | 82 | We may want to pull out the text from certain parts of our match. We can do 83 | this with either named or unnamed groups. 
Consider the following new form of 84 | the pattern and the sentence in example 2. 85 | 86 | ``` 87 | (?: | | )? * 88 | (:+) (:+) (:+) 89 | 90 | regex.find(sentence).groups.get(0) matches "the frigid Earth continent Antarctica" 91 | regex.find(sentence).groups.get(1) matches "Earth" 92 | regex.find(sentence).groups.get(2) matches "continent" 93 | regex.find(sentence).groups.get(3) matches "Antarctica" 94 | 95 | regex.find(sentence).group("arg1") matches "Earth" 96 | regex.find(sentence).group("rel") matches "continent" 97 | regex.find(sentence).group("arg2") matches "Antarctica" 98 | ``` 99 | 100 | ## Supported Constructs 101 | 102 | The regular expression library supports the following constructs. 103 | 104 | ``` 105 | | alternation 106 | ? option 107 | * Kleene-star 108 | + plus 109 | ^ beginning 110 | $ end 111 | {x,y} match at least x but not more than y times 112 | () matching groups 113 | (?:) non-matching groups 114 | (:) named groups 115 | ``` 116 | 117 | Most of these operators work the same as in java.util.regex. Presently, 118 | however, alternation binds to its immediate neighbors. This means that ` | ` 119 | means ` (?: | )` whereas in Java it would mean `(?: ) | `. 120 | This may change in a future release so it is advised that the 121 | alternation arguments be made explicit with non-matching groups. 122 | 123 | All operators are greedy, and there are no non-greedy counterparts. 124 | Backreferences are not supported because the underlying representation only 125 | supports regular languages (backreferences are not regular). 126 | 127 | 128 | ## Simple Java Example 129 | 130 | The NLP example is rather complex but it shows the power of OpenRegex. For a 131 | simpler example, look at RegularExpressions.word. This is a static factory 132 | method for a simple word-based regular expression where only the string is 133 | considered. This factory is used in the test cases. 
134 | 135 | You can also play around with RegularExpressions.word by running the main 136 | method in RegularExpression and specifying an expression with arg1. 137 | 138 | sbt 'run-main edu.washington.cs.knowitall.regex.RegularExpression " * (?:)?"' 139 | 140 | 141 | ## Logic Expressions 142 | 143 | Included is an engine for parsing and evaluating logic expressions. For 144 | example, you might want to extend the NLP regular expression language to be 145 | able to check multiple fields in a single regular expression token. If you 146 | assumed each regular expression token to be a logic expression, you could 147 | write patterns such as the following. 148 | 149 | ``` 150 | 151 | ``` 152 | 153 | Extending the regular expression in this way is easy. It only involves 154 | rewriting the apply method in BaseExpression inside the compile method. 155 | Most of the code below existed before--now it's just moved outside the 156 | apply method. 157 | 158 | ``` 159 | val logic = new LogicExpressionParser[WordToken] { 160 | override def factory(expr: String) = { 161 | new Arg.Pred[WordToken](expr) { 162 | val Array(part, quotedValue) = expr.split("=") 163 | val value = quotedValue.drop(1).take(quotedValue.size - 2) 164 | override def apply(entity: WordToken) = part match { 165 | case "string" => entity.string == value 166 | case "postag" => entity.postag == value 167 | case "chunk" => entity.chunk == value 168 | } 169 | } 170 | } 171 | }.parse(value) 172 | 173 | override def apply(entity: WordToken) = { 174 | logic.apply(entity) 175 | } 176 | ``` 177 | 178 | Play around with logic expression by using the main method in LogicExpression. 179 | 180 | sbt 'run-main edu.washington.cs.knowitall.logic.LogicExpression' 181 | 182 | You can enter logic expressions such as "true & false" or "true | false" and 183 | have them evaluated interactively. 
184 | 185 | 186 | ## Implementation 187 | 188 | Regular expressions are evaluated using Thompson NFA, which is fast and does not have 189 | the pathological cases that most regular expression libraries have. For more 190 | information about Thompson NFA in comparison to recursive backtracking, read 191 | http://swtch.com/~rsc/regexp/regexp1.html. Future work may involve compiling 192 | NFAs to DFAs. 193 | 194 | 195 | ## Future Work 196 | 197 | 1. Compile to DFA. 198 | 2. Use parser combinators for parsing regular expressions. 199 | -------------------------------------------------------------------------------- /build.sbt: -------------------------------------------------------------------------------- 1 | organization := "edu.washington.cs.knowitall" 2 | 3 | name := "openregex" 4 | 5 | description := "OpenRegex is an efficient and flexible library for running regular expressions over sequences of user-defined objects." 6 | 7 | version := "1.1.2-SNAPSHOT" 8 | 9 | libraryDependencies ++= Seq("com.google.code.findbugs" % "jsr305" % "2.0.1", 10 | "com.google.guava" % "guava" % "15.0", 11 | "org.scala-lang" % "scala-library" % "2.10.2" % "test", 12 | "junit" % "junit" % "4.10" % "test", 13 | "org.specs2" % "specs2_2.10" % "2.2.2" % "test", 14 | "org.scalacheck" % "scalacheck_2.10" % "1.10.1" % "test") 15 | 16 | licenses := Seq("LGPL (GNU Lesser General Public License)" -> url("http://www.gnu.org/licenses/lgpl.html")) 17 | 18 | homepage := Some(url("https://github.com/knowitall/openregex")) 19 | 20 | publishMavenStyle := true 21 | 22 | publishTo <<= version { (v: String) => 23 | val nexus = "https://oss.sonatype.org/" 24 | if (v.trim.endsWith("SNAPSHOT")) 25 | Some("snapshots" at nexus + "content/repositories/snapshots") 26 | else 27 | Some("releases" at nexus + "service/local/staging/deploy/maven2") 28 | } 29 | 30 | pomExtra := ( 31 | 32 | https://github.com/knowitall/openregex 33 | scm:git://github.com/knowitall/openregex.git 34 | 
scm:git:git@github.com:knowitall/openregex.git 35 | HEAD 36 | 37 | 38 | 39 | Michael Schmitz 40 | 41 | ) 42 | -------------------------------------------------------------------------------- /src/main/java/edu/washington/cs/knowitall/logic/ArgFactory.java: -------------------------------------------------------------------------------- 1 | package edu.washington.cs.knowitall.logic; 2 | 3 | import com.google.common.base.Function; 4 | 5 | /** 6 | * An abstract factory class that converts the string representation of 7 | * an argument into a token. This token uses the supplied delegate to 8 | * evaluate the expression against an entity into a boolean. 9 | * 10 | * @author Michael Schmitz 11 | * 12 | * @param 13 | */ 14 | public abstract class ArgFactory implements Function> { 15 | /*** 16 | * Converts the supplied string into a token. 17 | */ 18 | public abstract Expression.Arg create(String string); 19 | 20 | /*** 21 | * Method to satisfy abstract superclass. 22 | */ 23 | @Override 24 | public Expression.Arg apply(String string) { 25 | return this.create(string); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/edu/washington/cs/knowitall/logic/Expression.java: -------------------------------------------------------------------------------- 1 | package edu.washington.cs.knowitall.logic; 2 | 3 | import com.google.common.base.Predicate; 4 | 5 | /** 6 | * Superclass for expressions in a Logic Expression. 7 | * 8 | * @author Michael Schmitz 9 | */ 10 | public abstract class Expression { 11 | /** 12 | * An expression that can be applied. 13 | */ 14 | public static abstract class Apply extends Expression { 15 | /** 16 | * Apply this expression to an entity to get true or false. 17 | */ 18 | public abstract boolean apply(E entity); 19 | } 20 | 21 | /** 22 | * An operator expression. 
23 | */ 24 | public static abstract class Op extends Apply { 25 | /** 26 | * @returns true if this has precedence over that 27 | */ 28 | public boolean preceeds(Op that) { 29 | return this.precedence() < that.precedence(); 30 | } 31 | 32 | /** 33 | * The precedence of this operator. A smaller number denotes higher 34 | * precedence. 35 | * 36 | * @returns the precedence level of this operator 37 | */ 38 | public abstract int precedence(); 39 | 40 | /** 41 | * An operator that takes a single argument, such as negation. 42 | */ 43 | public static abstract class Mon extends Op { 44 | public Apply sub; 45 | 46 | public String toString(String symbol) { 47 | if (sub == null) { 48 | return symbol; 49 | } 50 | else { 51 | return symbol + "(" + sub.toString() + ")"; 52 | } 53 | } 54 | 55 | /** 56 | * The negation operator. 57 | */ 58 | public static class Not extends Mon { 59 | public String toString() { 60 | return super.toString("!"); 61 | } 62 | 63 | @Override 64 | public boolean apply(E entity) { 65 | return !sub.apply(entity); 66 | } 67 | 68 | @Override 69 | public int precedence() { 70 | return 0; 71 | } 72 | } 73 | } 74 | 75 | /** 76 | * An operator that takes two arguments, such as disjunction. 77 | */ 78 | public static abstract class Bin extends Op { 79 | public Apply left; 80 | public Apply right; 81 | 82 | public String toString(String symbol) { 83 | if (left == null || right == null) { 84 | return symbol; 85 | } 86 | else { 87 | return "(" + left.toString() + " " + symbol + " " + right.toString() + ")"; 88 | } 89 | } 90 | 91 | /** 92 | * The conjunction (logical and) operator. 
93 | */ 94 | public static class And extends Bin { 95 | public String toString() { 96 | return super.toString("&"); 97 | } 98 | 99 | @Override 100 | public boolean apply(E entity) { 101 | return left.apply(entity) && right.apply(entity); 102 | } 103 | 104 | @Override 105 | public int precedence() { 106 | return 1; 107 | } 108 | } 109 | 110 | /** 111 | * The disjunction (logical or) operator. 112 | */ 113 | public static class Or extends Bin { 114 | public String toString() { 115 | return super.toString("|"); 116 | } 117 | 118 | @Override 119 | public boolean apply(E entity) { 120 | return left.apply(entity) || right.apply(entity); 121 | } 122 | 123 | @Override 124 | public int precedence() { 125 | return 2; 126 | } 127 | } 128 | } 129 | } 130 | 131 | /** 132 | * An expression that evaluates to true or false. 133 | */ 134 | public static abstract class Arg extends Apply implements Predicate { 135 | /** 136 | * An expression that evaluates to true or false by applying a 137 | * predicate to the supplied entity. 138 | */ 139 | public static abstract class Pred extends Arg { 140 | private String description; 141 | 142 | public Pred(String description) { 143 | this.description = description; 144 | } 145 | 146 | @Override 147 | public abstract boolean apply(E entity); 148 | 149 | public String getDescription() { 150 | return this.description; 151 | } 152 | 153 | public String toString() { 154 | return this.getDescription(); 155 | } 156 | } 157 | 158 | /** 159 | * An expression that is a constant value--either true or false. 
160 | */ 161 | public static class Value extends Arg { 162 | private boolean value; 163 | 164 | public Value(boolean value) { 165 | this.value = value; 166 | } 167 | 168 | @Override 169 | public boolean apply(E entity) { 170 | return this.apply(); 171 | } 172 | 173 | public boolean apply() { 174 | return value; 175 | } 176 | 177 | @Override 178 | public String toString() { 179 | return Boolean.toString(this.value); 180 | } 181 | } 182 | } 183 | 184 | /** 185 | * A parenthesis, used for grouping. These are only uses prior to building 186 | * the AST. 187 | */ 188 | public static class Paren extends Expression { 189 | /** 190 | * A left parenthesis. 191 | */ 192 | public static class L extends Paren { 193 | public String toString() { 194 | return "("; 195 | } 196 | } 197 | 198 | /** 199 | * A right parenthesis. 200 | */ 201 | public static class R extends Paren { 202 | public String toString() { 203 | return ")"; 204 | } 205 | } 206 | } 207 | } 208 | -------------------------------------------------------------------------------- /src/main/java/edu/washington/cs/knowitall/logic/LogicException.java: -------------------------------------------------------------------------------- 1 | package edu.washington.cs.knowitall.logic; 2 | 3 | /** 4 | * 5 | * @author Michael Schmitz 6 | */ 7 | public class LogicException extends RuntimeException { 8 | private static final long serialVersionUID = 1L; 9 | 10 | public LogicException(String message) { 11 | super(message); 12 | } 13 | 14 | public LogicException(String message, Exception e) { 15 | super(message, e); 16 | } 17 | 18 | /** 19 | * Exception while applying an expression to an object. 
20 | */ 21 | public static class ApplyLogicException extends LogicException { 22 | private static final long serialVersionUID = 1L; 23 | 24 | public ApplyLogicException(String message, Exception e) { 25 | super(message, e); 26 | } 27 | 28 | public ApplyLogicException(String message) { 29 | super(message); 30 | } 31 | } 32 | 33 | /** 34 | * Exception while converting the tokens into a valid expression. 35 | */ 36 | public static class CompileLogicException extends LogicException { 37 | private static final long serialVersionUID = 1L; 38 | 39 | public CompileLogicException(String message, Exception e) { 40 | super(message, e); 41 | } 42 | 43 | public CompileLogicException(String message) { 44 | super(message); 45 | } 46 | } 47 | 48 | /** 49 | * Exception while tokenizing the logic expression string. 50 | */ 51 | public static class TokenizeLogicException extends LogicException { 52 | private static final long serialVersionUID = 1L; 53 | 54 | public TokenizeLogicException(String message, Exception e) { 55 | super(message, e); 56 | } 57 | 58 | public TokenizeLogicException(String message) { 59 | super(message); 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/java/edu/washington/cs/knowitall/logic/LogicExpression.java: -------------------------------------------------------------------------------- 1 | package edu.washington.cs.knowitall.logic; 2 | 3 | import java.util.ArrayList; 4 | import java.util.EmptyStackException; 5 | import java.util.LinkedList; 6 | import java.util.List; 7 | import java.util.Scanner; 8 | import java.util.Stack; 9 | 10 | import com.google.common.base.Function; 11 | import com.google.common.base.Predicate; 12 | 13 | import edu.washington.cs.knowitall.logic.Expression.Apply; 14 | import edu.washington.cs.knowitall.logic.Expression.Arg; 15 | import edu.washington.cs.knowitall.logic.Expression.Op; 16 | import edu.washington.cs.knowitall.logic.Expression.Paren; 17 | import 
edu.washington.cs.knowitall.logic.LogicException.ApplyLogicException; 18 | import edu.washington.cs.knowitall.logic.LogicException.CompileLogicException; 19 | import edu.washington.cs.knowitall.logic.LogicException.TokenizeLogicException; 20 | 21 | /** 22 | * A logic expression engine that operates over user specified objects. 23 | * 24 | * @author Michael Schmitz 25 | * 26 | * @param the type of the base expressions 27 | */ 28 | public class LogicExpression implements Predicate { 29 | private final Apply expression; 30 | 31 | /*** 32 | * 33 | * @param input an infix representation of the logic expression. 34 | * @throws TokenizeLogicException 35 | * @throws CompileLogicException 36 | */ 37 | protected LogicExpression(List> expressions) 38 | throws TokenizeLogicException, CompileLogicException { 39 | // put in reverse polish notation 40 | List> rpn = rpn(expressions); 41 | 42 | // compile the expression 43 | expression = buildAst(rpn); 44 | } 45 | 46 | /*** 47 | * Compile an infix list of tokens into an expression tree. 48 | * @param rpn a list of tokens in infix form. 49 | * @return an expression tree. 50 | */ 51 | public static LogicExpression compile( 52 | final List> expressions) { 53 | return new LogicExpression(expressions); 54 | } 55 | 56 | /*** 57 | * Helper factory method to instantiate a LogicExpression. 58 | * @param input The string to parse. 59 | * @param factoryDelegate The factory to build tokens. 
60 | * @return a new LogicExpression 61 | */ 62 | public static LogicExpression compile(final String input, 63 | final Function> factoryDelegate) { 64 | return new LogicExpressionParser() { 65 | @Override 66 | public Arg factory(String argument) { 67 | return factoryDelegate.apply(argument); 68 | } 69 | }.parse(input); 70 | } 71 | 72 | @Override 73 | public String toString() { 74 | if (this.isEmpty()) { 75 | return "(empty)"; 76 | } 77 | else { 78 | return expression.toString(); 79 | } 80 | } 81 | 82 | 83 | /*** 84 | * If the expression is empty, it returns true for all inputs. 85 | * @return true iff the expression is empty. 86 | */ 87 | public boolean isEmpty() { 88 | return this.expression == null; 89 | } 90 | 91 | @Override 92 | public boolean apply(E entity) { 93 | if (this.isEmpty()) { 94 | return true; 95 | } 96 | else { 97 | return this.expression.apply(entity); 98 | } 99 | } 100 | 101 | /*** 102 | * Compile a rpn list of tokens into an expression tree. 103 | * @param rpn a list of tokens in infix form. 104 | * @return an expression tree. 
105 | */ 106 | public static Apply buildAst(List> rpn) { 107 | if (rpn.isEmpty()) { 108 | return null; 109 | } 110 | 111 | Stack> stack = new Stack>(); 112 | for (Expression tok : rpn) { 113 | if (tok instanceof Arg) { 114 | stack.push((Arg) tok); 115 | } else if (tok instanceof Op) { 116 | try { 117 | if (tok instanceof Op.Mon){ 118 | Apply sub = stack.pop(); 119 | 120 | Op.Mon mon = (Op.Mon) tok; 121 | 122 | mon.sub = sub; 123 | 124 | stack.push(mon); 125 | } 126 | if (tok instanceof Op.Bin) { 127 | Apply arg2 = stack.pop(); 128 | Apply arg1 = stack.pop(); 129 | 130 | Op.Bin bin = (Op.Bin) tok; 131 | 132 | bin.left = arg1; 133 | bin.right = arg2; 134 | 135 | stack.push(bin); 136 | } 137 | } 138 | catch (EmptyStackException e) { 139 | throw new CompileLogicException( 140 | "No argument for operator (stack empty): " 141 | + tok.toString()); 142 | } 143 | } 144 | } 145 | 146 | if (stack.size() > 1) { 147 | throw new ApplyLogicException( 148 | "Stack has multiple elements after apply: " + stack.toString()); 149 | } 150 | 151 | if (stack.size() == 0) { 152 | throw new ApplyLogicException( 153 | "Stack has zero elements after apply."); 154 | } 155 | 156 | if (!(stack.peek() instanceof Apply)) { 157 | throw new ApplyLogicException( 158 | "Stack contains non-appliable tokens after apply: " + stack.toString()); 159 | } 160 | 161 | return (stack.pop()); 162 | } 163 | 164 | /*** 165 | * Return a list of the arguments contained in the expression. 166 | * @return 167 | */ 168 | public List getArgs() { 169 | List args = new ArrayList(); 170 | getArgs(this.expression, args); 171 | 172 | return args; 173 | } 174 | 175 | /*** 176 | * Private helper method to recursively find arguments. 177 | * @param apply the expression tree to search. 178 | * @param args the resulting list of arguments. 
179 | */ 180 | private void getArgs(Apply apply, List args) { 181 | if (apply instanceof Op.Bin) { 182 | Op.Bin bin = (Op.Bin) apply; 183 | 184 | getArgs(bin.left, args); 185 | getArgs(bin.right, args); 186 | } 187 | else if (apply instanceof Arg.Pred) { 188 | args.add(((Arg.Pred)apply).getDescription()); 189 | } 190 | } 191 | 192 | /*** 193 | * Converts an infix logic representation into a postfix logic representation. 194 | * @param tokens a list of tokens in infix form. 195 | * @return a list of tokens in postfix (rpn) form. 196 | * @throws CompileLogicException 197 | */ 198 | public List> rpn(List> tokens) 199 | throws CompileLogicException { 200 | // intermediate storage 201 | Stack> stack = new Stack>(); 202 | 203 | // final rpn output 204 | LinkedList> output = new LinkedList>(); 205 | 206 | for (Expression tok : tokens) { 207 | if (tok instanceof Paren.L) { 208 | stack.push(tok); 209 | } else if (tok instanceof Paren.R) { 210 | Expression top; 211 | do { 212 | top = stack.pop(); 213 | 214 | if (!(top instanceof Paren.L)) { 215 | output.offer(top); 216 | } 217 | 218 | } while (!(top instanceof Paren.L)); 219 | 220 | } else if (tok instanceof Op.Mon) { 221 | stack.push(tok); 222 | } else if (tok instanceof Op.Bin) { 223 | // higher precedence 224 | while (!stack.isEmpty() && stack.peek() instanceof Op 225 | && ((Op)stack.peek()).preceeds((Op)tok)) { 226 | output.offer(stack.pop()); 227 | } 228 | 229 | stack.push(tok); 230 | } else if (tok instanceof Arg) { 231 | output.offer(tok); 232 | } 233 | } 234 | 235 | // empty out items remaining ni the stack 236 | while (!stack.isEmpty()) { 237 | Expression top = stack.pop(); 238 | 239 | if (top instanceof Paren.L || top instanceof Paren.R) { 240 | throw new CompileLogicException("Unbalanced parentheses."); 241 | } 242 | 243 | output.offer(top); 244 | } 245 | 246 | return output; 247 | } 248 | 249 | 250 | /*** 251 | * Iteractively interpret logic statements from stdin such as "true | (true & false)". 
252 | * @param args 253 | */ 254 | public static void main(String[] args) { 255 | Scanner scan = new Scanner(System.in); 256 | 257 | while (scan.hasNextLine()) { 258 | String line = scan.nextLine(); 259 | 260 | LogicExpression expr = LogicExpressionParsers.trivial.parse(line); 261 | 262 | System.out.println("string: " + expr.toString()); 263 | System.out.println("value: " + expr.apply(null)); 264 | System.out.println(); 265 | } 266 | 267 | scan.close(); 268 | } 269 | } 270 | -------------------------------------------------------------------------------- /src/main/java/edu/washington/cs/knowitall/logic/LogicExpressionParser.java: -------------------------------------------------------------------------------- 1 | package edu.washington.cs.knowitall.logic; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | import java.util.Stack; 6 | import java.util.regex.Matcher; 7 | import java.util.regex.Pattern; 8 | 9 | import com.google.common.base.Function; 10 | import com.google.common.collect.Lists; 11 | 12 | import edu.washington.cs.knowitall.logic.Expression.Arg; 13 | import edu.washington.cs.knowitall.logic.Expression.Op; 14 | import edu.washington.cs.knowitall.logic.Expression.Paren; 15 | import edu.washington.cs.knowitall.logic.LogicException.TokenizeLogicException; 16 | 17 | /** 18 | * A logic expression engine that operates over user specified objects. 19 | * 20 | * @author Michael Schmitz 21 | * 22 | * @param the type of the base expressions 23 | */ 24 | abstract public class LogicExpressionParser implements Function> { 25 | /*** 26 | * Create a LogicExpression object from the supplied string. 
27 | * @param string 28 | * @return 29 | */ 30 | public LogicExpression parse(String string) { 31 | List> expressions = this.tokenize(string); 32 | return new LogicExpression(expressions); 33 | } 34 | 35 | @Override 36 | public LogicExpression apply(String string) { 37 | return this.parse(string); 38 | } 39 | 40 | /*** 41 | * The factory method creates an argument from the supplied token string. 42 | * @param argument a string representation of a token 43 | * @return an evaluatable representation of a token 44 | */ 45 | public abstract Arg factory(String argument); 46 | 47 | public final static Pattern doubleQuoteStringLiteralRegex = 48 | Pattern.compile("\"" + "([^\"\\p{Cntrl}\\\\]*+(?:\\\\[\\\\'\"bfnrt])*+(?:\\\\u[a-fA-F0-9]{4})*+)*+" + "\""); 49 | public final static Pattern singleQuoteStringLiteralRegex = 50 | Pattern.compile("'" + "(?:[^']*+)" + "'"); 51 | public final static Pattern regexLiteralRegex = 52 | Pattern.compile("/" + "(?:(?:[^/\\\\]*+(?:\\\\)*+(?:\\\\/)*+)*+)" + "/"); 53 | private final static List literalPatterns = Lists.newArrayList( 54 | doubleQuoteStringLiteralRegex, singleQuoteStringLiteralRegex, 55 | regexLiteralRegex); 56 | 57 | /*** 58 | * The readToken method reads a token from the remaining LogicExpression string. 59 | * 60 | * A token may contain a string. If it contains parentheses, the token 61 | * will last until the parentheses are balanced. And &, |, or unbalanced ) 62 | * will mark the end of a token. 63 | * 64 | * This is a default implementation that may be overriden. 
65 | * @param remainder the remaining text to tokenize 66 | * @return a token from the beginning on `remaining` 67 | */ 68 | public String readToken(String remainder) { 69 | final String token; 70 | try { 71 | Stack parens = new Stack(); 72 | 73 | int nextExpression; 74 | for (nextExpression = 0; nextExpression < remainder.length(); nextExpression++) { 75 | char c = remainder.charAt(nextExpression); 76 | 77 | // check for quotation 78 | String match = null; 79 | for (Pattern pattern : literalPatterns) { 80 | Matcher matcher = pattern.matcher(remainder).region( 81 | nextExpression, remainder.length()); 82 | if (matcher.lookingAt()) { 83 | match = matcher.group(0); 84 | break; 85 | } 86 | } 87 | 88 | if (match != null) { 89 | // we found and can consume a quotation 90 | nextExpression += match.length() - 1; 91 | } else if (c == '(') { 92 | parens.push(c); 93 | } else if (c == ')') { 94 | if (parens.isEmpty()) { 95 | break; 96 | } else { 97 | parens.pop(); 98 | } 99 | } else if (c == '&' || c == '|') { 100 | break; 101 | } 102 | } 103 | 104 | token = remainder.substring(0, nextExpression).trim(); 105 | } catch (Exception e) { 106 | throw new TokenizeLogicException("Error parsing token: " 107 | + remainder, e); 108 | } 109 | 110 | if (token.isEmpty()) { 111 | throw new TokenizeLogicException("zero-length token found."); 112 | } 113 | 114 | return token; 115 | } 116 | 117 | /*** 118 | * Convert an infix string logic representation to an infix list of tokens. 119 | * @param input an infix string logic representation. 120 | * @param factory a delegate that converts a string representation of an 121 | * argument into a token object. 
@return 122 | * 123 | * @throws TokenizeLogicException 124 | */ 125 | public List> tokenize(String input) 126 | throws TokenizeLogicException { 127 | List> tokens = new ArrayList>(); 128 | 129 | int i = 0; 130 | while (i < input.length()) { 131 | String substring = input.substring(i); 132 | char firstChar = substring.charAt(0); 133 | 134 | if (firstChar == ' ') { 135 | i += 1; 136 | continue; 137 | } 138 | else if (firstChar == '(') { 139 | tokens.add(new Paren.L()); 140 | i += 1; 141 | } else if (firstChar == ')') { 142 | tokens.add(new Paren.R()); 143 | i += 1; 144 | } else if (firstChar == '!') { 145 | tokens.add(new Op.Mon.Not()); 146 | i += 1; 147 | } else if (firstChar == '&') { 148 | tokens.add(new Op.Bin.And()); 149 | i += 1; 150 | } else if (firstChar == '|') { 151 | tokens.add(new Op.Bin.Or()); 152 | i += 1; 153 | } else { 154 | // parse out the token 155 | String token = this.readToken(substring); 156 | 157 | tokens.add(factory(token)); 158 | i += token.length(); 159 | } 160 | } 161 | 162 | return tokens; 163 | } 164 | } 165 | -------------------------------------------------------------------------------- /src/main/java/edu/washington/cs/knowitall/logic/LogicExpressionParsers.java: -------------------------------------------------------------------------------- 1 | package edu.washington.cs.knowitall.logic; 2 | 3 | /** 4 | * Static factories for logic expressions over basic objects. 5 | * 6 | * @author Michael Schmitz 7 | */ 8 | class LogicExpressionParsers { 9 | /** 10 | * Logic expressions where "true" evaluates to true and "false" evaluates to 11 | * false. For example: 12 | * 13 | * (true | false) & true 14 | * 15 | * This logic expression is trivial because it's value is independent of the 16 | * object it is applied to. 
17 | */ 18 | public final static LogicExpressionParser trivial = 19 | new LogicExpressionParser() { 20 | @Override 21 | public Expression.Arg factory(final String string) { 22 | return new Expression.Arg.Pred(string) { 23 | @Override 24 | public boolean apply(String entity) { 25 | return "true".equals(string); 26 | } 27 | }; 28 | } 29 | }; 30 | 31 | /** 32 | * Logic expressions where tokens are strings. A token is true if it 33 | * matches the input string. 34 | */ 35 | public final static LogicExpressionParser stringMatch = 36 | new LogicExpressionParser() { 37 | @Override 38 | public Expression.Arg factory(final String token) { 39 | return new Expression.Arg.Pred(token) { 40 | final String string = token.substring(1, token.length() - 1); 41 | 42 | @Override 43 | public boolean apply(String entity) { 44 | return entity.equals(string); 45 | } 46 | }; 47 | } 48 | }; 49 | } 50 | -------------------------------------------------------------------------------- /src/main/java/edu/washington/cs/knowitall/regex/Expression.java: -------------------------------------------------------------------------------- 1 | package edu.washington.cs.knowitall.regex; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Iterator; 5 | import java.util.List; 6 | 7 | import com.google.common.base.Joiner; 8 | import com.google.common.base.Predicate; 9 | 10 | import edu.washington.cs.knowitall.regex.FiniteAutomaton.Automaton; 11 | import edu.washington.cs.knowitall.regex.FiniteAutomaton.State; 12 | 13 | /** 14 | * Interface for a component of a regular expression. 15 | * 16 | * @author Michael Schmitz 17 | */ 18 | public interface Expression extends Predicate { 19 | 20 | public Automaton build(); 21 | 22 | public int minMatchingLength(); 23 | 24 | /** 25 | * Represents a matching group that is referred to by order number. 
26 | * {@code ( +)} 27 | * @author Michael Schmitz 28 | * 29 | * @param 30 | */ 31 | public class MatchingGroup implements Expression { 32 | public final List> expressions; 33 | 34 | public MatchingGroup(List> expressions) { 35 | this.expressions = expressions; 36 | } 37 | 38 | @Override 39 | public boolean apply(E entity) { 40 | throw new UnsupportedOperationException(); 41 | } 42 | 43 | public String subexpString() { 44 | List subs = new ArrayList(this.expressions.size()); 45 | for (Expression expr : this.expressions) { 46 | subs.add(expr.toString()); 47 | } 48 | 49 | return Joiner.on(" ").join(subs); 50 | } 51 | 52 | @Override 53 | public String toString() { 54 | return "(" + subexpString() + ")"; 55 | } 56 | 57 | /** 58 | * Convert the expression into a NFA. 59 | */ 60 | @Override 61 | public Automaton build() { 62 | Automaton auto = new Automaton(this); 63 | 64 | Iterator> exprIterator = this.expressions.iterator(); 65 | Automaton sub; 66 | 67 | // connect the start to the first subexpression 68 | State prev = auto.start; 69 | if (exprIterator.hasNext()) { 70 | sub = exprIterator.next().build(); 71 | auto.start.connect(sub.start); 72 | prev = sub.end; 73 | } 74 | while (exprIterator.hasNext()) { 75 | Expression expr = exprIterator.next(); 76 | sub = expr.build(); 77 | 78 | State connector = new State(); 79 | 80 | prev.connect(connector); 81 | connector.connect(sub.start); 82 | prev = sub.end; 83 | } 84 | 85 | prev.connect(auto.end); 86 | 87 | return auto; 88 | } 89 | 90 | @Override 91 | public int minMatchingLength() { 92 | int len = 0; 93 | for (Expression expr : this.expressions) { 94 | len += expr.minMatchingLength(); 95 | } 96 | return len; 97 | } 98 | } 99 | 100 | /** 101 | * Represents a matching group that is referred to by name. 
102 | * {@code (: +)} 103 | * @author Michael Schmitz 104 | * 105 | * @param 106 | */ 107 | public class NamedGroup extends MatchingGroup { 108 | public final String name; 109 | 110 | public NamedGroup(String name, List> expressions) { 111 | super(expressions); 112 | this.name = name; 113 | } 114 | 115 | @Override 116 | public String toString() { 117 | return "(<"+this.name+">:" + super.subexpString() + ")"; 118 | } 119 | } 120 | 121 | /** 122 | * Represents a non-matching group. 123 | * {@code (?: +)} 124 | * @author Michael Schmitz 125 | * 126 | * @param 127 | */ 128 | public class NonMatchingGroup extends MatchingGroup { 129 | public NonMatchingGroup(List> expressions) { 130 | super(expressions); 131 | } 132 | 133 | @Override 134 | public String toString() { 135 | return "(?:" + super.subexpString() + ")"; 136 | } 137 | } 138 | 139 | /** 140 | * Disjunction of two experssions. 141 | * {@code |} 142 | * @author Michael Schmitz 143 | * 144 | * @param 145 | */ 146 | public static class Or implements Expression { 147 | public final Expression expr1; 148 | public final Expression expr2; 149 | 150 | public Or(Expression expr1, Expression expr2) { 151 | this.expr1 = expr1; 152 | this.expr2 = expr2; 153 | } 154 | 155 | @Override 156 | public boolean apply(E entity) { 157 | return true; 158 | } 159 | 160 | @Override 161 | public String toString() { 162 | return this.expr1.toString() + " | " + this.expr2.toString(); 163 | } 164 | 165 | /** 166 | * Convert the expression into a NFA. 
167 | */ 168 | @Override 169 | public Automaton build() { 170 | Automaton auto = new Automaton(this); 171 | 172 | Automaton sub1 = this.expr1.build(); 173 | Automaton sub2 = this.expr2.build(); 174 | 175 | // attach the sub automata 176 | auto.start.connect(sub1.start); 177 | auto.start.connect(sub2.start); 178 | sub1.end.connect(auto.end); 179 | sub2.end.connect(auto.end); 180 | 181 | return auto; 182 | } 183 | 184 | @Override 185 | public int minMatchingLength() { 186 | int left = this.expr1.minMatchingLength(); 187 | int right = this.expr2.minMatchingLength(); 188 | if (left < right) 189 | return left; 190 | else 191 | return right; 192 | } 193 | } 194 | 195 | /** 196 | * Kleene-star: zero or more of the enclosed expression. 197 | * {@code *} 198 | * @author Michael Schmitz 199 | * 200 | * @param 201 | */ 202 | public static class Star implements Expression { 203 | public final Expression expr; 204 | 205 | public Star(Expression expr) { 206 | this.expr = expr; 207 | } 208 | 209 | @Override 210 | public boolean apply(E entity) { 211 | return this.expr.apply(entity); 212 | } 213 | 214 | @Override 215 | public String toString() { 216 | return this.expr.toString() + "*"; 217 | } 218 | 219 | /** 220 | * Convert the expression into a NFA. 221 | */ 222 | @Override 223 | public Automaton build() { 224 | Automaton auto = new Automaton(this); 225 | 226 | Automaton sub = this.expr.build(); 227 | 228 | // run it again 229 | sub.end.connect(sub.start); 230 | 231 | // attach the sub automaton 232 | auto.start.connect(sub.start); 233 | sub.end.connect(auto.end); 234 | 235 | // skip it completely 236 | auto.start.connect(auto.end); 237 | 238 | return auto; 239 | } 240 | 241 | @Override 242 | public int minMatchingLength() { 243 | return 0; 244 | } 245 | } 246 | 247 | /** 248 | * One or more of the enclosed expression. Plus(expr) is equivalent to 249 | * expr followed by Star(expr). 
250 | * {@code +} is the same as {@code *} 251 | * @author Michael Schmitz 252 | * 253 | * @param 254 | */ 255 | public static class Plus implements Expression { 256 | public final Expression expr; 257 | 258 | public Plus(Expression expr) { 259 | this.expr = expr; 260 | } 261 | 262 | @Override 263 | public boolean apply(E entity) { 264 | return this.expr.apply(entity); 265 | } 266 | 267 | @Override 268 | public String toString() { 269 | return this.expr.toString() + "+"; 270 | } 271 | 272 | /** 273 | * Convert the expression into a NFA. 274 | */ 275 | @Override 276 | public Automaton build() { 277 | Automaton auto = new Automaton(this); 278 | 279 | Automaton sub = this.expr.build(); 280 | 281 | // run it again 282 | sub.end.connect(sub.start); 283 | 284 | // attach the sub automaton 285 | auto.start.connect(sub.start); 286 | sub.end.connect(auto.end); 287 | 288 | return auto; 289 | } 290 | 291 | @Override 292 | public int minMatchingLength() { 293 | return 1; 294 | } 295 | } 296 | 297 | /** 298 | * Zero or one of the enclosed expression. 299 | * {@code ?} 300 | * @author Michael Schmitz 301 | * 302 | * @param 303 | */ 304 | public static class Option implements Expression { 305 | Expression expr; 306 | 307 | public Option(Expression expr) { 308 | this.expr = expr; 309 | } 310 | 311 | @Override 312 | public boolean apply(E entity) { 313 | return this.expr.apply(entity); 314 | } 315 | 316 | @Override 317 | public String toString() { 318 | return this.expr.toString() + "?"; 319 | } 320 | 321 | /** 322 | * Convert the expression into a NFA. 
323 | */ 324 | @Override 325 | public Automaton build() { 326 | Automaton auto = new Automaton(this); 327 | 328 | Automaton sub = this.expr.build(); 329 | 330 | // attach the sub automaton 331 | auto.start.connect(sub.start); 332 | sub.end.connect(auto.end); 333 | 334 | // skip it completely 335 | auto.start.connect(auto.end); 336 | 337 | return auto; 338 | } 339 | 340 | @Override 341 | public int minMatchingLength() { 342 | return 0; 343 | } 344 | } 345 | 346 | /** 347 | * A minimum to maximum number of occurrences of the enclosed expression. 348 | * {@code {1,3}} 349 | * @author Daniel Naber 350 | * 351 | * @param 352 | */ 353 | public static class MinMax implements Expression { 354 | Expression expr; 355 | final int minOccurrences; 356 | final int maxOccurrences; 357 | 358 | /** 359 | * @param minOccurrences minimum occurrences, must be >= 0 360 | * @param maxOccurrences maximum occurrences, must be >= 1 - you should prefer small values, 361 | * as the use of large values will create a large automaton that takes a lot of memory 362 | */ 363 | public MinMax(Expression expr, int minOccurrences, int maxOccurrences) { 364 | this.expr = expr; 365 | if (minOccurrences < 0 || maxOccurrences < 1) { 366 | throw new IllegalArgumentException("minOccurrences must be >= 0 and maxOccurrences must be >= 1: " 367 | + minOccurrences + ", " + maxOccurrences); 368 | } 369 | if (minOccurrences > maxOccurrences) { 370 | throw new IllegalArgumentException("minOccurrences must be <= maxOccurrences: " 371 | + minOccurrences + " > " + maxOccurrences); 372 | } 373 | this.minOccurrences = minOccurrences; 374 | this.maxOccurrences = maxOccurrences; 375 | } 376 | 377 | @Override 378 | public boolean apply(E entity) { 379 | return this.expr.apply(entity); 380 | } 381 | 382 | @Override 383 | public String toString() { 384 | return this.expr.toString() + "{" + minOccurrences + "," + maxOccurrences + "}"; 385 | } 386 | 387 | /** 388 | * Convert the expression into a NFA. 
389 | */ 390 | @Override 391 | public Automaton build() { 392 | Automaton auto = new Automaton(this); 393 | 394 | List> subAutos = new ArrayList>(); 395 | int numberOfNodes = maxOccurrences; 396 | for (int i = 0; i < numberOfNodes; i++) { 397 | Automaton sub = this.expr.build(); 398 | subAutos.add(sub); 399 | } 400 | 401 | // attach the first sub automaton 402 | auto.start.connect(subAutos.get(0).start); 403 | 404 | // attach the sub automatons among themselves and with the end 405 | for (int i = 0; i < subAutos.size(); i++) { 406 | Automaton sub = subAutos.get(i); 407 | if (i >= minOccurrences - 1) { 408 | sub.end.connect(auto.end); 409 | } 410 | if (i < subAutos.size() - 1) { 411 | Automaton nextSub = subAutos.get(i + 1); 412 | sub.end.connect(nextSub.start); 413 | } 414 | } 415 | 416 | if (minOccurrences == 0) { 417 | // skip it completely 418 | auto.start.connect(auto.end); 419 | } 420 | 421 | return auto; 422 | } 423 | 424 | @Override 425 | public int minMatchingLength() { 426 | return this.minOccurrences; 427 | } 428 | } 429 | 430 | /** 431 | * An expression with no subexpression that is evaluated against a token 432 | * using the supplied delegate. 433 | * @author Michael Schmitz 434 | * 435 | * @param 436 | */ 437 | static abstract class BaseExpression implements Expression { 438 | public final String source; 439 | 440 | public BaseExpression(String source) { 441 | this.source = source; 442 | } 443 | 444 | /** 445 | * The delegate to evaluate the expression against a token. 446 | */ 447 | @Override 448 | public abstract boolean apply(E entity); 449 | 450 | public String toString() { 451 | return "<" + this.source + ">"; 452 | } 453 | 454 | /** 455 | * Convert the expression into a NFA. 
456 | */ 457 | @Override 458 | public Automaton build() { 459 | Automaton auto = new Automaton(this); 460 | 461 | auto.start.connect(auto.end, this); 462 | 463 | return auto; 464 | } 465 | 466 | @Override 467 | public int minMatchingLength() { 468 | return 1; 469 | } 470 | } 471 | 472 | /** 473 | * A non-consuming expression that matches a token against a property of 474 | * the text, such as the start or end of a line. 475 | * @author Michael Schmitz 476 | * 477 | * @param 478 | */ 479 | static abstract class AssertionExpression implements Expression { 480 | @Override 481 | public boolean apply(E entity) { 482 | return false; 483 | } 484 | 485 | public abstract boolean apply(boolean hasStart, List tokens, int count); 486 | 487 | /** 488 | * Convert the expression into a NFA. 489 | */ 490 | @Override 491 | public Automaton build() { 492 | Automaton auto = new Automaton(this); 493 | 494 | auto.start.connect(auto.end, this); 495 | 496 | return auto; 497 | } 498 | 499 | @Override 500 | public int minMatchingLength() { 501 | return 0; 502 | } 503 | } 504 | 505 | /** 506 | * A non-consuming expression that matches the start of a line. 507 | * {@code ^} 508 | * @author Michael Schmitz 509 | * 510 | * @param 511 | */ 512 | static class StartAssertion extends AssertionExpression { 513 | @Override 514 | public boolean apply(boolean hasStart, List tokens, int count) { 515 | return hasStart && tokens.size() == count; 516 | } 517 | 518 | @Override 519 | public String toString() { 520 | return "^"; 521 | } 522 | } 523 | 524 | /** 525 | * A non-consuming expression that matches the end of a line. 
526 | * {@code $} 527 | * @author Michael Schmitz 528 | * 529 | * @param 530 | */ 531 | static class EndAssertion extends AssertionExpression { 532 | @Override 533 | public boolean apply(boolean hasStart, List tokens, int count) { 534 | return tokens.isEmpty(); 535 | } 536 | 537 | @Override 538 | public String toString() { 539 | return "$"; 540 | } 541 | } 542 | } 543 | -------------------------------------------------------------------------------- /src/main/java/edu/washington/cs/knowitall/regex/ExpressionFactory.java: -------------------------------------------------------------------------------- 1 | package edu.washington.cs.knowitall.regex; 2 | 3 | import com.google.common.base.Function; 4 | 5 | import edu.washington.cs.knowitall.regex.Expression.BaseExpression; 6 | 7 | /** 8 | * Wrapper class for a Guava Function. Used to unpickle a expression string 9 | * into a part of a logic expression. 10 | * 11 | * @author Michael Schmitz 12 | */ 13 | public abstract class ExpressionFactory implements Function> { 14 | public abstract BaseExpression create(String token); 15 | 16 | public BaseExpression apply(String token) { 17 | return this.create(token); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/main/java/edu/washington/cs/knowitall/regex/FiniteAutomaton.java: -------------------------------------------------------------------------------- 1 | package edu.washington.cs.knowitall.regex; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Iterator; 5 | import java.util.List; 6 | import java.util.concurrent.atomic.AtomicInteger; 7 | 8 | import com.google.common.base.Predicate; 9 | import com.google.common.collect.Iterables; 10 | import com.google.common.collect.Lists; 11 | 12 | import edu.washington.cs.knowitall.regex.Expression.AssertionExpression; 13 | import edu.washington.cs.knowitall.regex.Expression.MatchingGroup; 14 | 15 | /** 16 | * A finite automaton implementation. 
There is support for epsilon 17 | * transitions (NFA) but if those are omitted then this works as an 18 | * implementation of a DFA. 19 | * 20 | * @author Michael Schmitz 21 | */ 22 | public class FiniteAutomaton { 23 | /** 24 | * A component automaton with a single start state and a single end 25 | * state. 26 | * @author Michael Schmitz 27 | * 28 | * @param 29 | */ 30 | public static class Automaton { 31 | public final StartState start; 32 | public final EndState end; 33 | 34 | public Automaton(StartState start, EndState end) { 35 | this.start = start; 36 | this.end = end; 37 | } 38 | 39 | public Automaton(Expression expr) { 40 | this.start = new StartState(expr); 41 | this.end = new EndState(expr); 42 | } 43 | 44 | public boolean apply(List tokens) { 45 | return this.evaluate(tokens, true) != null; 46 | } 47 | 48 | public int minMatchingLength() { 49 | return start.minMatchingLength(); 50 | } 51 | 52 | public Match.FinalMatch lookingAt(List tokens) { 53 | return lookingAt(tokens, 0); 54 | } 55 | 56 | /** 57 | * @return null if no match, otherwise a representation of the match 58 | */ 59 | public Match.FinalMatch lookingAt(List tokens, int startIndex) { 60 | if (tokens.size() - startIndex - this.minMatchingLength() < 0) { 61 | // don't try if we can't possible match 62 | return null; 63 | } 64 | else { 65 | List sublist = tokens.subList(startIndex, tokens.size()); 66 | 67 | Step path = this.evaluate(sublist, startIndex == 0); 68 | if (path == null) { 69 | return null; 70 | } 71 | 72 | // build list of edges 73 | List> edges = new ArrayList>(); 74 | while (path.state != this.start) { 75 | edges.add(path.path); 76 | path = path.prev; 77 | } 78 | 79 | Match.IntermediateMatch match = new Match.IntermediateMatch(); 80 | buildMatch(sublist.iterator(), null, new AtomicInteger(startIndex), this.start, 81 | Lists.reverse(edges).iterator(), match); 82 | return new Match.FinalMatch(match); 83 | } 84 | } 85 | 86 | /** 87 | * Retrace the path through the NFA and produce an 
object that 88 | * represents the match. 89 | * @param tokenIterator an iterator over the tokens. 90 | * @param expression the expression to match. 91 | * @param index the present index. 92 | * @param state the present state. 93 | * @param edgeIterator an iterator over the edges in the solution. 94 | * @param match the solution. 95 | * @return 96 | */ 97 | private State buildMatch(Iterator tokenIterator, Expression expression, 98 | AtomicInteger index, State state, Iterator> edgeIterator, 99 | Match.IntermediateMatch match) { 100 | 101 | Match.IntermediateMatch newMatch = new Match.IntermediateMatch(); 102 | 103 | while (edgeIterator.hasNext() && !((state instanceof EndState) 104 | && ((EndState)state).expression == expression)) { 105 | 106 | AbstractEdge edge = edgeIterator.next(); 107 | 108 | // run the sub-automaton 109 | if (edge instanceof Edge 110 | && !(((Edge) edge).expression instanceof AssertionExpression)) { 111 | // consume a token, this is the base case 112 | E token = tokenIterator.next(); 113 | newMatch.add(((Edge)edge).expression, token, index.getAndIncrement()); 114 | 115 | state = edge.dest; 116 | } 117 | else if (state instanceof StartState) { 118 | // recurse on StartState so we have a group for that match 119 | Expression expr = ((StartState)state).expression; 120 | state = buildMatch(tokenIterator, expr, index, edge.dest, edgeIterator, newMatch); 121 | assert(state instanceof EndState && ((EndState)state).expression == expr); 122 | } 123 | else { 124 | assert(edge instanceof Epsilon); 125 | state = edge.dest; 126 | } 127 | } 128 | 129 | // add the sub match group 130 | if (expression != null 131 | && (!newMatch.isEmpty() || expression instanceof MatchingGroup)) { 132 | // create a wrapper for the expressions it matched 133 | Match.Group pair = new Match.Group(expression); 134 | for (Match.Group p : newMatch.pairs()) { 135 | if (p.expr instanceof Expression.BaseExpression) { 136 | pair.addTokens(p); 137 | } 138 | } 139 | 140 | // add it 141 | 
match.add(pair); 142 | } 143 | 144 | // add the contents of the sub match group 145 | match.addAll(newMatch.pairs()); 146 | 147 | return state; 148 | } 149 | 150 | /** 151 | * A representation of a movement from a state to another, with a 152 | * backreference to the previous state. This is used in building 153 | * a match object once a solution has been found. 154 | * @author Michael Schmitz 155 | * 156 | * @param 157 | */ 158 | private static class Step { 159 | public final State state; 160 | public final Step prev; 161 | public final AbstractEdge path; 162 | 163 | public Step(State state) { 164 | this(state, null, null); 165 | } 166 | 167 | public Step(State state, Step prev, AbstractEdge path) { 168 | this.state = state; 169 | this.prev = prev; 170 | this.path = path; 171 | } 172 | 173 | public String toString() { 174 | return this.state.toString(); 175 | } 176 | } 177 | 178 | /** 179 | * Expand all epsilon transitions for the supplied steps. That is, 180 | * add all states available via an epsilon transition from a supplied 181 | * state to the list. 182 | * @param steps 183 | */ 184 | private void expandEpsilons(List> steps) { 185 | int size = steps.size(); 186 | for (int i = 0; i < size; i++) { 187 | Step step = steps.get(i); 188 | 189 | expandEpsilon(step, steps); 190 | } 191 | } 192 | 193 | /** 194 | * Expand all epsilon transitions for the specified step. That is, 195 | * add all states avaiable via an epsilon transition from step.state. 
196 | * @param step 197 | * @param steps 198 | */ 199 | private void expandEpsilon(Step step, List> steps) { 200 | // loop over edges 201 | for (final Epsilon edge : step.state.epsilons) { 202 | 203 | // try free edges if they do not lead to an existing 204 | // step 205 | if (!Iterables.any(steps, 206 | new Predicate>() { 207 | @Override 208 | public boolean apply(Step step) { 209 | return step.state == edge.dest; 210 | } 211 | })) { 212 | Step newstep = new Step(edge.dest, step, edge); 213 | steps.add(newstep); 214 | expandEpsilon(newstep, steps); 215 | } 216 | } 217 | } 218 | 219 | /** 220 | * Expand any state that has an assertion edge if the assertion passes 221 | * given the present state. 222 | * @param steps 223 | * @param newsteps 224 | * @param hasStart true iff the tokens contains the start token. 225 | * @param tokens 226 | * @param totalTokens 227 | */ 228 | private void expandAssertions(List> steps, List> newsteps, boolean hasStart, 229 | List tokens, int totalTokens) { 230 | for (Step step : steps) { 231 | for (final Edge edge : step.state.edges) { 232 | if (edge.expression instanceof AssertionExpression) { 233 | AssertionExpression assertion = (AssertionExpression)edge.expression; 234 | 235 | if (assertion.apply(hasStart, tokens, totalTokens)) { 236 | newsteps.add(new Step(edge.dest, step, edge)); 237 | } 238 | } 239 | } 240 | } 241 | } 242 | 243 | private Step evaluate(List tokens, boolean hasStart) { 244 | List> steps = new ArrayList>(); 245 | steps.add(new Step(this.start)); 246 | return evaluate(tokens, steps, hasStart); 247 | } 248 | 249 | /** 250 | * Evaluate the NFA against the list of tokens using the Thompson NFA 251 | * algorithm. 252 | * @param tokens the tokens to evaluate against 253 | * @param steps present list of accessible states. 254 | * @param hasStart true iff tokens contains the start token. 255 | * @return a Step object representing the last transition or null. 
256 | */ 257 | private Step evaluate(List tokens, List> steps, boolean hasStart) { 258 | int totalTokens = tokens.size(); 259 | 260 | int solutionTokensLeft = totalTokens; 261 | Step solution = null; 262 | while (!steps.isEmpty()) { 263 | 264 | expandEpsilons(steps); 265 | 266 | List> intermediate = new ArrayList>(steps); 267 | List> newsteps = new ArrayList>(steps.size() * 2); 268 | do { 269 | 270 | // check if at end 271 | for (Step step : intermediate) { 272 | if (step.state == this.end) { 273 | if (tokens.size() == totalTokens) { 274 | // can't succeed if no tokens are consumed 275 | } 276 | else { 277 | // we have reached the end 278 | if (tokens.size() < solutionTokensLeft) { 279 | solution = step; 280 | solutionTokensLeft = tokens.size(); 281 | } 282 | } 283 | } 284 | } 285 | 286 | // handle assertions 287 | newsteps.clear(); 288 | expandAssertions(intermediate, newsteps, hasStart, tokens, totalTokens); 289 | expandEpsilons(newsteps); 290 | 291 | intermediate.clear(); 292 | intermediate.addAll(newsteps); 293 | 294 | steps.addAll(newsteps); 295 | } while (newsteps.size() > 0); 296 | 297 | newsteps.clear(); 298 | if (!tokens.isEmpty()) { 299 | for (Step step : steps) { 300 | for (final Edge edge : step.state.edges) { 301 | // try other edges if they match the current token 302 | if (edge.apply(tokens.get(0))) { 303 | newsteps.add(new Step(edge.dest, step, edge)); 304 | } 305 | } 306 | } 307 | 308 | // consume a token 309 | tokens = tokens.subList(1, tokens.size()); 310 | } 311 | 312 | steps = newsteps; 313 | } 314 | 315 | return solution; 316 | } 317 | } 318 | 319 | /** 320 | * Representation of a state in the automaton. 321 | * @author Michael Schmitz 322 | * 323 | * @param 324 | */ 325 | public static class State { 326 | public final List> edges = new ArrayList>(); 327 | public final List> epsilons = new ArrayList>(); 328 | 329 | /** 330 | * Add an epsilon transition between this state and dest. 
331 | * @param dest the state to connect 332 | */ 333 | public void connect(State dest) { 334 | this.epsilons.add(new Epsilon(dest)); 335 | } 336 | 337 | /** 338 | * Add an edge between this state and dest. 339 | * @param dest the state to connect 340 | * @param cost the expression of the edge 341 | */ 342 | public void connect(State dest, Expression cost) { 343 | this.edges.add(new Edge(dest, cost)); 344 | } 345 | 346 | public String toString() { 347 | return this.getClass().getSimpleName() + ":" + this.edges.size(); 348 | } 349 | } 350 | 351 | /** 352 | * A start or end state. 353 | * @author Michael Schmitz 354 | * 355 | * @param 356 | */ 357 | public static class TerminusState extends State { 358 | public final Expression expression; 359 | public TerminusState(Expression expression) { 360 | super(); 361 | this.expression = expression; 362 | } 363 | 364 | public String toString() { 365 | return this.getClass().getSimpleName() 366 | + "("+this.expression.toString()+"):" + this.edges.size(); 367 | } 368 | } 369 | 370 | /** 371 | * A start state. 372 | * @author Michael Schmitz 373 | * 374 | * @param 375 | */ 376 | public static class StartState extends TerminusState { 377 | public StartState(Expression expression) { 378 | super(expression); 379 | } 380 | 381 | public int minMatchingLength() { 382 | return this.expression.minMatchingLength(); 383 | } 384 | } 385 | 386 | /** 387 | * An end state. 388 | * @author Michael Schmitz 389 | * 390 | * @param 391 | */ 392 | public static class EndState extends TerminusState { 393 | public EndState(Expression expression) { 394 | super(expression); 395 | } 396 | } 397 | 398 | /** 399 | * An abstract representation of an edge. 
400 | * @author Michael Schmitz 401 | * 402 | * @param 403 | */ 404 | public static abstract class AbstractEdge implements Predicate { 405 | public final State dest; 406 | 407 | public AbstractEdge(State dest) { 408 | this.dest = dest; 409 | } 410 | } 411 | 412 | /** 413 | * An edge with cost {@code expression}. 414 | * @author Michael Schmitz 415 | * 416 | * @param 417 | */ 418 | public static class Edge extends AbstractEdge { 419 | public final Expression expression; 420 | 421 | public Edge(State dest, Expression base) { 422 | super(dest); 423 | this.expression = base; 424 | } 425 | 426 | @Override 427 | public String toString() { 428 | return "(" + this.expression.toString() + ") -> " + this.dest.toString(); 429 | } 430 | 431 | @Override 432 | public boolean apply(E entity) { 433 | if (expression == null) { 434 | return true; 435 | } 436 | else { 437 | return expression.apply(entity); 438 | } 439 | } 440 | } 441 | 442 | /** 443 | * An edge without cost, an epsilon transition. 444 | * @author Michael Schmitz 445 | * 446 | * @param 447 | */ 448 | public static class Epsilon extends AbstractEdge { 449 | public Epsilon(State dest) { 450 | super(dest); 451 | } 452 | 453 | @Override 454 | public String toString() { 455 | return "(epsilon) -> " + dest.toString(); 456 | } 457 | 458 | @Override 459 | public boolean apply(E entity) { 460 | return true; 461 | } 462 | } 463 | } 464 | -------------------------------------------------------------------------------- /src/main/java/edu/washington/cs/knowitall/regex/Match.java: -------------------------------------------------------------------------------- 1 | package edu.washington.cs.knowitall.regex; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Collections; 5 | import java.util.Collection; 6 | import java.util.List; 7 | 8 | import com.google.common.base.Function; 9 | import com.google.common.base.Functions; 10 | import com.google.common.base.Joiner; 11 | import com.google.common.collect.Lists; 12 | 13 | import 
edu.washington.cs.knowitall.regex.Expression.BaseExpression; 14 | 15 | /** 16 | * A class to represent a match. Each part of the regular expression is matched 17 | * to a sequence of tokens. A match also stores information about the range 18 | * of tokens matched and the matching groups in the match. 19 | * 20 | * @author Michael Schmitz 21 | * 22 | * @param 23 | */ 24 | public abstract class Match { 25 | protected List> pairs; 26 | 27 | protected Match() { 28 | pairs = new ArrayList>(); 29 | } 30 | 31 | protected Match(Match match) { 32 | this(); 33 | for (Group pair : match.pairs) { 34 | this.add(new Group(pair.expr, pair.tokens)); 35 | } 36 | } 37 | 38 | public boolean add(Group pair) { 39 | return this.pairs.add(pair); 40 | } 41 | 42 | public boolean addAll(Collection> pairs) { 43 | boolean result = true; 44 | for (Group pair : pairs) { 45 | result &= this.add(pair); 46 | } 47 | 48 | return result; 49 | } 50 | 51 | /** 52 | * Convenience method for add(new Group(expr, token, pos)). 53 | * @param expr 54 | * @param token 55 | * @param pos 56 | * @return 57 | */ 58 | public boolean add(Expression expr, E token, int pos) { 59 | return this.add(new Group(expr, token, pos)); 60 | } 61 | 62 | /** 63 | * True iff this match contains no pairs. This should only happen on an 64 | * IntermediateMatch that has not had any pairs added to it yet. 65 | */ 66 | public boolean isEmpty() { 67 | return this.pairs.isEmpty(); 68 | } 69 | 70 | @Override 71 | public String toString() { 72 | return "[" + Joiner.on(", ").join( 73 | Lists.transform(this.pairs, Functions.toStringFunction())) + "]"; 74 | } 75 | 76 | public String toMultilineString() { 77 | return Joiner.on("\n").join(Lists.transform(this.pairs, 78 | Functions.toStringFunction())); 79 | } 80 | 81 | /** 82 | * @return the index of the first token matched (inclusive start). 83 | */ 84 | public abstract int startIndex(); 85 | 86 | /** 87 | * @return the index one past of the last token matched (exclusive end). 
88 | */ 89 | public abstract int endIndex(); 90 | 91 | /** 92 | * Pairs differ from the matching groups in that each regular expression 93 | * element has a pair to associate the element with the text matched. 94 | * For example, 'a*' might be associated with 'a a a a'. 95 | * 96 | * @return all pairs in this match. 97 | */ 98 | public List> pairs() { 99 | return Collections.unmodifiableList(this.pairs); 100 | } 101 | 102 | /** 103 | * @return all matching groups (named and unnamed). 104 | */ 105 | public abstract List> groups(); 106 | 107 | /** 108 | * @return all matched tokens. 109 | */ 110 | public abstract List tokens(); 111 | 112 | /** 113 | * @return the number of tokens in the match. 114 | */ 115 | public int length() { 116 | return this.tokens().size(); 117 | } 118 | 119 | /** 120 | * Retrieve a group by name. 121 | * @param name the name of the group to retrieve. 122 | * @return the associated group. 123 | */ 124 | public Group group(String name) { 125 | for (Group group : this.groups()) { 126 | if (group.expr instanceof Expression.NamedGroup) { 127 | Expression.NamedGroup namedGroup = (Expression.NamedGroup) group.expr; 128 | if (namedGroup.name.equals(name)) { 129 | return group; 130 | } 131 | } 132 | } 133 | 134 | return null; 135 | } 136 | 137 | /** 138 | * A match representation that has efficient method calls but is immutable. 
139 | * @author Michael Schmitz 140 | * 141 | * @param 142 | */ 143 | protected final static class FinalMatch extends Match { 144 | private final int startIndex; 145 | private final List tokens; 146 | private final List> groups; 147 | 148 | protected FinalMatch(Match m) { 149 | super(m); 150 | this.startIndex = m.startIndex(); 151 | this.tokens = Collections.unmodifiableList(m.tokens()); 152 | this.groups = Collections.unmodifiableList(m.groups()); 153 | } 154 | 155 | public int startIndex() { 156 | return this.startIndex; 157 | } 158 | 159 | public int endIndex() { 160 | return this.startIndex() + this.tokens.size(); 161 | } 162 | 163 | public List tokens() { 164 | return this.tokens; 165 | } 166 | 167 | @Override 168 | public List> groups() { 169 | return this.groups; 170 | } 171 | } 172 | 173 | /** 174 | * A match representation that is mutable but many method calls compute 175 | * values instead of returning stored values. This is a good in-between 176 | * while building a match object. 
177 | * @author Michael Schmitz 178 | * 179 | * @param 180 | */ 181 | protected final static class IntermediateMatch extends Match { 182 | protected IntermediateMatch() { 183 | super(); 184 | } 185 | 186 | @Override 187 | public List tokens() { 188 | List tokens = new ArrayList(); 189 | for (Match.Group pair : this.pairs) { 190 | if (pair.expr instanceof BaseExpression) { 191 | tokens.addAll(pair.tokens()); 192 | } 193 | } 194 | 195 | return tokens; 196 | } 197 | 198 | @Override 199 | public List> groups() { 200 | List> groups = new ArrayList>(); 201 | for (Group pair : this.pairs) { 202 | if (pair.expr instanceof Expression.MatchingGroup 203 | && !(pair.expr instanceof Expression.NonMatchingGroup)) { 204 | groups.add(pair); 205 | } 206 | } 207 | 208 | return groups; 209 | } 210 | 211 | @Override 212 | public int startIndex() { 213 | for (Match.Group pair : this.pairs) { 214 | if (pair.expr instanceof Expression.BaseExpression) { 215 | return pair.tokens.get(0).index; 216 | } 217 | } 218 | 219 | return -1; 220 | } 221 | 222 | @Override 223 | public int endIndex() { 224 | for (Match.Group pair : Lists.reverse(this.pairs)) { 225 | if (pair.expr instanceof Expression.BaseExpression) { 226 | return pair.tokens.get(0).index; 227 | } 228 | } 229 | 230 | return -1; 231 | } 232 | } 233 | 234 | /** 235 | * A captured group in a matched expression. 
236 | * @author Michael Schmitz 237 | * 238 | * @param 239 | */ 240 | public static class Group { 241 | private static class Token { 242 | public E entity; 243 | public int index; 244 | 245 | public Token(E entity, int index) { 246 | this.entity = entity; 247 | this.index = index; 248 | } 249 | 250 | public String toString() { 251 | return this.entity.toString(); 252 | } 253 | } 254 | 255 | public final Expression expr; 256 | private final List> tokens; 257 | 258 | public Group(Expression expr, E token, int pos) { 259 | this(expr, Collections.singletonList(new Token(token, pos))); 260 | } 261 | 262 | public Group(Expression expr, List> tokens) { 263 | this.expr = expr; 264 | this.tokens = new ArrayList>(tokens); 265 | } 266 | 267 | public Group(Expression expr) { 268 | this(expr, new ArrayList>()); 269 | } 270 | 271 | /** 272 | * Add tokens to the group. 273 | * @param group 274 | */ 275 | protected void addTokens(Group group) { 276 | this.tokens.addAll(group.tokens); 277 | } 278 | 279 | /** 280 | * @return the tokens matched. 281 | */ 282 | public List tokens() { 283 | return Lists.transform(this.tokens, 284 | new Function, E>() { 285 | @Override 286 | public E apply(Match.Group.Token token) { 287 | return token.entity; 288 | } 289 | }); 290 | } 291 | 292 | /** 293 | * @return the index of the first token in this group or -1 294 | */ 295 | public int startIndex() { 296 | int min = -1; 297 | for (Token token : this.tokens) { 298 | if (min == -1 || token.index < min) 299 | min = token.index; 300 | } 301 | 302 | return min; 303 | } 304 | 305 | /** 306 | * @return the index of the last token in this group or -1 307 | */ 308 | public int endIndex() { 309 | int max = -1; 310 | for (Token token : this.tokens) { 311 | if (token.index == -1 || token.index > max) 312 | max = token.index; 313 | } 314 | 315 | return max; 316 | } 317 | 318 | /** 319 | * A string representation of the group. 320 | * This is a lighter-weight representation than toString. 
321 | */ 322 | public String text() { 323 | return Joiner.on(" ").join(this.tokens()); 324 | } 325 | 326 | /** 327 | * @return the number of tokens matched. 328 | */ 329 | public int tokenCount() { 330 | return this.tokens.size(); 331 | } 332 | 333 | @Override 334 | public String toString() { 335 | return expr.toString() 336 | + ":'" 337 | + Joiner.on(" ").join( 338 | Lists.transform(this.tokens, 339 | Functions.toStringFunction())) + "'"; 340 | } 341 | } 342 | } 343 | -------------------------------------------------------------------------------- /src/main/java/edu/washington/cs/knowitall/regex/RegexException.java: -------------------------------------------------------------------------------- 1 | package edu.washington.cs.knowitall.regex; 2 | 3 | /** 4 | * 5 | * @author Michael Schmitz 6 | */ 7 | public class RegexException extends RuntimeException { 8 | private static final long serialVersionUID = -3534531866062810681L; 9 | 10 | public RegexException(String message, Exception e) { 11 | super(message, e); 12 | } 13 | 14 | public RegexException(String message) { 15 | super(message); 16 | } 17 | 18 | public static class TokenizationRegexException extends RegexException { 19 | private static final long serialVersionUID = 7064825496455884721L; 20 | 21 | public TokenizationRegexException(String message, Exception e) { 22 | super(message, e); 23 | } 24 | 25 | public TokenizationRegexException(String message) { 26 | super(message); 27 | } 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/main/java/edu/washington/cs/knowitall/regex/RegularExpression.java: -------------------------------------------------------------------------------- 1 | package edu.washington.cs.knowitall.regex; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Arrays; 5 | import java.util.List; 6 | import java.util.Scanner; 7 | 8 | import com.google.common.base.Predicate; 9 | import com.google.common.base.Joiner; 10 | import 
com.google.common.base.Function; 11 | 12 | import edu.washington.cs.knowitall.regex.Expression.BaseExpression; 13 | import edu.washington.cs.knowitall.regex.FiniteAutomaton.Automaton; 14 | 15 | /** 16 | * A regular expression engine that operates over sequences of user-specified 17 | * objects. 18 | * 19 | * @author Michael Schmitz 20 | * 21 | * @param the type of the sequence elements 22 | */ 23 | public class RegularExpression implements Predicate> { 24 | public final List> expressions; 25 | public final Automaton auto; 26 | 27 | public RegularExpression(List> expressions) { 28 | this.expressions = expressions; 29 | this.auto = RegularExpression.build(this.expressions); 30 | } 31 | 32 | /*** 33 | * Create a regular expression without tokenization support. 34 | * @param expressions 35 | * @return 36 | */ 37 | public static RegularExpression compile(List> expressions) { 38 | return new RegularExpression(expressions); 39 | } 40 | 41 | /*** 42 | * Create a regular expression from the specified string. 43 | * @param expression 44 | * @param factoryDelegate 45 | * @return 46 | */ 47 | public static RegularExpression compile(final String expression, 48 | final Function> factoryDelegate) { 49 | return new RegularExpressionParser() { 50 | @Override 51 | public BaseExpression factory(String token) { 52 | return factoryDelegate.apply(token); 53 | } 54 | }.parse(expression); 55 | } 56 | 57 | @Override 58 | public boolean equals(Object other) { 59 | if (! 
(other instanceof RegularExpression)) { 60 | return false; 61 | } 62 | 63 | RegularExpression expression = (RegularExpression) other; 64 | return this.toString().equals(expression.toString()); 65 | } 66 | 67 | @Override 68 | public int hashCode() { 69 | return this.toString().hashCode(); 70 | } 71 | 72 | @Override 73 | public String toString() { 74 | List expressions = new ArrayList( 75 | this.expressions.size()); 76 | for (Expression expr : this.expressions) { 77 | expressions.add(expr.toString()); 78 | } 79 | 80 | return Joiner.on(" ").join(expressions); 81 | } 82 | 83 | /** 84 | * Build an NFA from the list of expressions. 85 | * @param exprs 86 | * @return 87 | */ 88 | public static Automaton build(List> exprs) { 89 | Expression.MatchingGroup group = new Expression.MatchingGroup(exprs); 90 | return group.build(); 91 | } 92 | 93 | /** 94 | * Apply the expression against a list of tokens. 95 | * 96 | * @return true iff the expression if found within the tokens. 97 | */ 98 | @Override 99 | public boolean apply(List tokens) { 100 | if (this.find(tokens) != null) { 101 | return true; 102 | } else { 103 | return false; 104 | } 105 | } 106 | 107 | /** 108 | * Apply the expression against a list of tokens. 109 | * 110 | * @return true iff the expression matches all of the tokens. 111 | */ 112 | public boolean matches(List tokens) { 113 | Match match = this.lookingAt(tokens, 0); 114 | return match != null && match.endIndex() == tokens.size(); 115 | } 116 | 117 | /** 118 | * Find the first match of the regular expression against tokens. This 119 | * method is slightly slower due to additional memory allocations. However, 120 | * the response has much greater detail and is very useful for 121 | * writing/debugging regular expressions. 122 | * 123 | * @param tokens 124 | * @return an object representing the match, or null if no match is found. 
125 | */ 126 | public Match find(List tokens) { 127 | return this.find(tokens, 0); 128 | } 129 | 130 | /** 131 | * Find the first match of the regular expression against tokens, starting 132 | * at the specified index. 133 | * 134 | * @param tokens tokens to match against. 135 | * @param start index to start looking for a match. 136 | * @return an object representing the match, or null if no match is found. 137 | */ 138 | public Match find(List tokens, int start) { 139 | Match match; 140 | for (int i = start; i <= tokens.size() - auto.minMatchingLength(); i++) { 141 | match = this.lookingAt(tokens, i); 142 | if (match != null) { 143 | return match; 144 | } 145 | } 146 | 147 | return null; 148 | } 149 | 150 | /** 151 | * Determine if the regular expression matches the beginning of the 152 | * supplied tokens. 153 | * 154 | * @param tokens the list of tokens to match. 155 | * @return an object representing the match, or null if no match is found. 156 | */ 157 | public Match lookingAt(List tokens) { 158 | return this.lookingAt(tokens, 0); 159 | } 160 | 161 | /** 162 | * Determine if the regular expression matches the supplied tokens, 163 | * starting at the specified index. 164 | * 165 | * @param tokens the list of tokens to match. 166 | * @param start the index where the match should begin. 167 | * @return an object representing the match, or null if no match is found. 168 | */ 169 | public Match lookingAt(List tokens, int start) { 170 | return auto.lookingAt(tokens, start); 171 | } 172 | 173 | public Match match(List tokens) { 174 | Match match = this.lookingAt(tokens); 175 | if (match != null && match.endIndex() == tokens.size()) { 176 | return match; 177 | } 178 | else { 179 | return null; 180 | } 181 | } 182 | 183 | /** 184 | * Find all non-overlapping matches of the regular expression against tokens. 185 | * 186 | * @param tokens 187 | * @return an list of objects representing the match. 
188 | */ 189 | public List> findAll(List tokens) { 190 | List> results = new ArrayList>(); 191 | 192 | int start = 0; 193 | Match match; 194 | do { 195 | match = this.find(tokens, start); 196 | 197 | if (match != null) { 198 | start = match.endIndex(); 199 | 200 | // match may be empty query string has all optional parts 201 | if (!match.isEmpty()) { 202 | results.add(match); 203 | } 204 | } 205 | } while (match != null); 206 | 207 | return results; 208 | } 209 | 210 | /** 211 | * An interactive program that compiles a word-based regular expression 212 | * specified in arg1 and then reads strings from stdin, evaluating them 213 | * against the regular expression. 214 | * @param args 215 | */ 216 | public static void main(String[] args) { 217 | Scanner scan = new Scanner(System.in); 218 | 219 | RegularExpression regex = RegularExpressionParsers.word.parse(args[0]); 220 | System.out.println("regex: " + regex); 221 | System.out.println(); 222 | 223 | while (scan.hasNextLine()) { 224 | String line = scan.nextLine(); 225 | 226 | System.out.println("contains: " + regex.apply(Arrays.asList(line.split("\\s+")))); 227 | System.out.println("matches: " + regex.matches(Arrays.asList(line.split("\\s+")))); 228 | System.out.println(); 229 | } 230 | 231 | scan.close(); 232 | } 233 | } 234 | -------------------------------------------------------------------------------- /src/main/java/edu/washington/cs/knowitall/regex/RegularExpressionParser.java: -------------------------------------------------------------------------------- 1 | package edu.washington.cs.knowitall.regex; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | import java.util.regex.Matcher; 6 | import java.util.regex.Pattern; 7 | 8 | import com.google.common.base.Function; 9 | 10 | import edu.washington.cs.knowitall.regex.Expression.BaseExpression; 11 | import edu.washington.cs.knowitall.regex.Expression.EndAssertion; 12 | import edu.washington.cs.knowitall.regex.Expression.StartAssertion; 13 | 
import edu.washington.cs.knowitall.regex.RegexException.TokenizationRegexException; 14 | 15 | /** 16 | * A regular expression parser turns strings into RegularExpression 17 | * objects. 18 | * 19 | * @author Michael Schmitz 20 | * 21 | * @param the type of the sequence elements 22 | */ 23 | public abstract class RegularExpressionParser implements Function> { 24 | /*** 25 | * The factory method creates an expression from the supplied token string. 26 | * @param token a string representation of a token 27 | * @return an evaluatable representation of a token 28 | */ 29 | public abstract BaseExpression factory(String token); 30 | 31 | public RegularExpression parse(String string) { 32 | List> expressions = this.tokenize(string); 33 | return new RegularExpression(expressions); 34 | } 35 | 36 | @Override 37 | public RegularExpression apply(String string) { 38 | return this.parse(string); 39 | } 40 | 41 | /*** 42 | * Read a token from the remaining text and return it. 43 | * 44 | * This is a default implementation that is overridable. 45 | * In the default implementation, the starting and ending 46 | * token characters are not escapable. 47 | * 48 | * If this implemenation is overridden, A token MUST ALWAYS 49 | * start with '<' or '[' and end with '>' or ']'. 50 | * 51 | * @param remaining 52 | * @return 53 | */ 54 | public String readToken(String remaining) { 55 | int start = 0; 56 | char c = remaining.charAt(0); 57 | 58 | int end; 59 | if (c == '<') { 60 | end = indexOfClose(remaining, start, '<', '>'); 61 | } 62 | else if (c == '[' ){ 63 | end = indexOfClose(remaining, start, '[', ']'); 64 | } 65 | else { 66 | throw new IllegalStateException(); 67 | } 68 | 69 | // make sure we found the end 70 | if (end == -1) { 71 | throw new TokenizationRegexException( 72 | "bad token. 
Non-matching brackets (<> or []): " + start 73 | + ":\"" + remaining.substring(start) + "\""); 74 | } 75 | 76 | String token = remaining.substring(start, end + 1); 77 | return token; 78 | } 79 | 80 | /** 81 | * Convert a list of tokens (<...>) to a list of expressions. 82 | * 83 | * @param tokens 84 | * @param factory 85 | * Factory class to create a BaseExpression from the text between 86 | * angled brackets. 87 | * @return 88 | */ 89 | public List> tokenize(String string) { 90 | List> expressions = new ArrayList>(); 91 | 92 | final Pattern whitespacePattern = Pattern.compile("\\s+"); 93 | final Pattern unaryPattern = Pattern.compile("[*?+]"); 94 | final Pattern minMaxPattern = Pattern.compile("\\{(\\d+),(\\d+)\\}"); 95 | final Pattern binaryPattern = Pattern.compile("[|]"); 96 | 97 | List tokens = new ArrayList(); 98 | 99 | char stack = ' '; 100 | int start = 0; 101 | while (start < string.length()) { 102 | Matcher matcher; 103 | 104 | // skip whitespace 105 | if ((matcher = whitespacePattern.matcher(string)) 106 | .region(start, string.length()).lookingAt()) { 107 | start = matcher.end(); 108 | continue; 109 | } 110 | 111 | char c = string.charAt(start); 112 | // group, assertion, or token 113 | if (c == '(' || c == '<' || c == '[' || c == '$' || c == '^') { 114 | // group 115 | if (string.charAt(start) == '(') { 116 | int end = indexOfClose(string, start, '(', ')'); 117 | if (end == -1) { 118 | throw new TokenizationRegexException("unclosed parenthesis: " + start 119 | + ":\"" + string.substring(start) + ")\""); 120 | } 121 | 122 | String group = string.substring(start + 1, end); 123 | start = end + 1; 124 | 125 | final Pattern namedPattern = Pattern.compile("<(\\w*)>:(.*)"); 126 | final Pattern unnamedPattern = Pattern.compile("\\?:(.*)"); 127 | 128 | // named group (matching) 129 | if ((matcher = namedPattern.matcher(group)).matches()) { 130 | String groupName = matcher.group(1); 131 | group = matcher.group(2); 132 | List> groupExpressions = 
this.tokenize(group); 133 | expressions.add(new Expression.NamedGroup(groupName, groupExpressions)); 134 | } 135 | // unnamed group 136 | else if ((matcher = unnamedPattern.matcher(group)).matches()) { 137 | group = matcher.group(1); 138 | List> groupExpressions = this.tokenize(group); 139 | expressions.add(new Expression.NonMatchingGroup(groupExpressions)); 140 | } 141 | // group (matching) 142 | else { 143 | List> groupExpressions = this.tokenize(group); 144 | expressions.add(new Expression.MatchingGroup(groupExpressions)); 145 | } 146 | } 147 | 148 | // token 149 | else if (c == '<' || c == '[') { 150 | String token = readToken(string.substring(start)); 151 | try { 152 | // strip off enclosing characters 153 | String tokenInside = token.substring(1, token.length() - 1); 154 | BaseExpression base = factory(tokenInside); 155 | expressions.add(base); 156 | 157 | start += token.length(); 158 | } 159 | catch (Exception e) { 160 | throw new TokenizationRegexException("error parsing token: " + token, e); 161 | } 162 | } 163 | 164 | // assertion (^) 165 | else if (c == '^') { 166 | expressions.add(new StartAssertion()); 167 | start += 1; 168 | } 169 | 170 | // assertion ($) 171 | else if (c == '$') { 172 | expressions.add(new EndAssertion()); 173 | start += 1; 174 | } 175 | 176 | // check if we have a floating OR operator 177 | if (stack == '|') { 178 | try { 179 | stack = ' '; 180 | if (expressions.size() < 2) { 181 | throw new IllegalStateException( 182 | "OR operator is applied to fewer than 2 elements."); 183 | } 184 | 185 | Expression expr1 = expressions.remove(expressions.size() - 1); 186 | Expression expr2 = expressions.remove(expressions.size() - 1); 187 | expressions.add(new Expression.Or(expr1, expr2)); 188 | } 189 | catch (Exception e) { 190 | throw new TokenizationRegexException("error parsing OR (|) operator.", e); 191 | } 192 | } 193 | } 194 | // unary operator 195 | else if ((matcher = unaryPattern.matcher(string)) 196 | .region(start, 
string.length()).lookingAt()) { 197 | char operator = matcher.group(0).charAt(0); 198 | 199 | // pop the last expression 200 | Expression base = expressions.remove(expressions.size() - 1); 201 | 202 | // add the operator to it 203 | Expression expr; 204 | if (operator == '?') { 205 | expr = new Expression.Option(base); 206 | } else if (operator == '*') { 207 | expr = new Expression.Star(base); 208 | } else if (operator == '+') { 209 | expr = new Expression.Plus(base); 210 | } 211 | else { 212 | throw new IllegalStateException(); 213 | } 214 | 215 | expressions.add(expr); 216 | 217 | start = matcher.end(); 218 | } 219 | // min/max operator "{x,y}" 220 | else if ((matcher = minMaxPattern.matcher(string)) 221 | .region(start, string.length()).lookingAt()) { 222 | int minOccurrences = Integer.parseInt(matcher.group(1)); 223 | int maxOccurrences = Integer.parseInt(matcher.group(2)); 224 | 225 | // pop the last expression and add operator 226 | Expression base = expressions.remove(expressions.size() - 1); 227 | Expression expr = new Expression.MinMax(base, minOccurrences, maxOccurrences); 228 | 229 | expressions.add(expr); 230 | 231 | start = matcher.end(); 232 | } 233 | // binary operator (alternation) 234 | else if ((matcher = binaryPattern.matcher(string)) 235 | .region(start, string.length()).lookingAt()) { 236 | tokens.add(matcher.group(0)); 237 | stack = '|'; 238 | start = matcher.end(); 239 | } 240 | else { 241 | throw new TokenizationRegexException("unknown symbol: " 242 | + string.substring(start)); 243 | } 244 | } 245 | 246 | if (stack == '|') { 247 | throw new TokenizationRegexException("OR remains on the stack."); 248 | } 249 | 250 | return expressions; 251 | } 252 | 253 | private static int indexOfClose(String string, int start, char open, char close) { 254 | start--; 255 | 256 | int count = 0; 257 | do { 258 | start++; 259 | 260 | // we hit the end 261 | if (start >= string.length()) { 262 | return -1; 263 | } 264 | 265 | char c = string.charAt(start); 266 
| 267 | // we hit an open/close 268 | if (c == open) { 269 | count++; 270 | } else if (c == close) { 271 | count--; 272 | } 273 | 274 | } while (count > 0); 275 | 276 | return start; 277 | } 278 | } 279 | -------------------------------------------------------------------------------- /src/main/java/edu/washington/cs/knowitall/regex/RegularExpressionParsers.java: -------------------------------------------------------------------------------- 1 | package edu.washington.cs.knowitall.regex; 2 | 3 | import edu.washington.cs.knowitall.regex.Expression.BaseExpression; 4 | 5 | /** 6 | * Static factories for regular expressions over some basic sequences. 7 | * 8 | * @author Michael Schmitz 9 | */ 10 | public class RegularExpressionParsers { 11 | /** 12 | * Regular expressions over words where sequences are string 13 | * representations of words. 14 | */ 15 | public final static RegularExpressionParser word = 16 | new RegularExpressionParser() { 17 | @Override public BaseExpression factory(final String string) { 18 | return new BaseExpression(string) { 19 | @Override public boolean apply(final String token) { 20 | return string.equals(token); 21 | } 22 | }; 23 | } 24 | }; 25 | 26 | /** 27 | * Regular expression over characters, as in java.util.Regex. 
28 | */ 29 | public final static RegularExpressionParser character = 30 | new RegularExpressionParser() { 31 | @Override public BaseExpression factory(final String string) { 32 | return new BaseExpression(string) { 33 | @Override public boolean apply(final Character token) { 34 | return string.equals(token.toString()); 35 | } 36 | }; 37 | } 38 | }; 39 | } 40 | -------------------------------------------------------------------------------- /src/test/java/edu/washington/cs/knowitall/regex/MinMaxTest.java: -------------------------------------------------------------------------------- 1 | package edu.washington.cs.knowitall.regex; 2 | 3 | import org.junit.Test; 4 | 5 | import com.google.common.collect.Lists; 6 | 7 | import java.util.Arrays; 8 | 9 | import static junit.framework.Assert.assertNotNull; 10 | import static junit.framework.Assert.assertNull; 11 | 12 | public class MinMaxTest { 13 | 14 | @Test 15 | public void testMinMax() { 16 | RegularExpression regExZeroToOne = getAbcRegex(0, 1); 17 | assertMatch(regExZeroToOne, "a", "c"); 18 | assertMatch(regExZeroToOne, "a", "b", "c"); 19 | assertNoMatch(regExZeroToOne, "a", "b", "b", "c"); 20 | 21 | RegularExpression regExOne = getAbcRegex(1, 1); 22 | assertNoMatch(regExOne, "a", "c"); 23 | assertMatch(regExOne, "a", "b", "c"); 24 | assertNoMatch(regExOne, "a", "b", "b", "c"); 25 | 26 | RegularExpression regExTwo = getAbcRegex(2, 2); 27 | assertNoMatch(regExTwo, "a", "c"); 28 | assertNoMatch(regExTwo, "a", "b", "c"); 29 | assertMatch(regExTwo, "a", "b", "b", "c"); 30 | assertNoMatch(regExTwo, "a", "b", "b", "b", "c"); 31 | 32 | RegularExpression regExOneToTwo = getAbcRegex(1, 2); 33 | assertNoMatch(regExOneToTwo, "a", "c"); 34 | assertMatch(regExOneToTwo, "a", "b", "c"); 35 | assertMatch(regExOneToTwo, "a", "b", "b", "c"); 36 | assertNoMatch(regExOneToTwo, "a", "b", "b", "b", "c"); 37 | 38 | RegularExpression regExTwoToFour = getAbcRegex(2, 4); 39 | assertNoMatch(regExTwoToFour, "a", "c"); 40 | 
assertNoMatch(regExTwoToFour, "a", "b", "c"); 41 | assertMatch(regExTwoToFour, "a", "b", "b", "c"); 42 | assertMatch(regExTwoToFour, "a", "b", "b", "b", "c"); 43 | assertMatch(regExTwoToFour, "a", "b", "b", "b", "b", "c"); 44 | assertNoMatch(regExTwoToFour, "a", "b", "b", "b", "b", "b", "c"); 45 | } 46 | 47 | private void assertMatch(RegularExpression regex, String... input) { 48 | assertNotNull(regex.find(Arrays.asList(input))); 49 | } 50 | 51 | private void assertNoMatch(RegularExpression regex, String... input) { 52 | assertNull(regex.find(Arrays.asList(input))); 53 | } 54 | 55 | @Test(expected = IllegalArgumentException.class) 56 | public void testException1() { 57 | getAbcRegex(0, 0); 58 | } 59 | 60 | @Test(expected = IllegalArgumentException.class) 61 | public void testException2() { 62 | getAbcRegex(1, 0); 63 | } 64 | 65 | @Test(expected = IllegalArgumentException.class) 66 | public void testException3() { 67 | getAbcRegex(-1, 0); 68 | } 69 | 70 | @Test(expected = IllegalArgumentException.class) 71 | public void testException4() { 72 | getAbcRegex(0, -1); 73 | } 74 | 75 | private RegularExpression getAbcRegex(int min, int max) { 76 | Expression wordA = RegularExpressionParsers.word.parse("").expressions.get(0); 77 | Expression wordB = RegularExpressionParsers.word.parse("").expressions.get(0); 78 | Expression wordC = RegularExpressionParsers.word.parse("").expressions.get(0); 79 | return RegularExpression.compile(Lists.newArrayList( 80 | wordA, 81 | new Expression.MinMax(wordB, min, max), 82 | wordC) 83 | ); 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /src/test/scala/edu/washington/cs/knowitall/logic/LogicTest.scala: -------------------------------------------------------------------------------- 1 | package edu.washington.cs.knowitall.logic; 2 | 3 | import org.junit.runner.RunWith 4 | import org.specs2.mutable.Specification 5 | import org.specs2.runner.JUnitRunner 6 | import org.specs2.ScalaCheck 7 | 8 
/**
 * Specification for the logic expression parsers.
 *
 * Covers three things: tokenization of escaped quotes by the string-match
 * parser, the parenthesization the parser infers for `&` and `|`, and
 * property-based evaluation of two, three and four variable expressions
 * against Scala's own boolean operators.
 */
@RunWith(classOf[JUnitRunner])
class LogicTest extends Specification with ScalaCheck {
  "escape characters" should {
    "tokenize ok" in {
      val regex = compileStringMatch("\"zebra\" | \"zeb\\\"ra\"")
      // note: escape characters are tokenized but not interpreted, so the
      // backslash is still part of the value that has to be matched
      regex("zeb\\\"ra") must beTrue
    }
  }

  "order of operations" should {
    "infer the correct parenthesis" in {
      compile("false & false & false").toString() must_== "(false & (false & false))"
      compile("false & false | false").toString() must_== "((false & false) | false)"
      compile("false | false & false").toString() must_== "(false | (false & false))"
    }
  }

  /** Builds the example description shared by all `eval` overloads. */
  private def describe(expr: String): String = s"evaluate ($expr) correctly"

  // The compiled expressions are applied to `null`: the variables have already
  // been replaced by boolean literals, so presumably no entity is consulted
  // (NOTE(review): confirm against LogicExpressionParsers.trivial).

  /**
   * Registers an example checking a two variable expression.
   *
   * @param expr logic expression over the variables `a` and `b`
   * @param f    reference implementation using Scala's boolean operators
   */
  def eval(expr: String, f: (Boolean, Boolean) => Boolean) =
    describe(expr) in {
      check { (a: Boolean, b: Boolean) =>
        compile(substitute(expr, a, b))(null) must_== f(a, b)
      }
    }

  /**
   * Registers an example checking a three variable expression.
   *
   * @param expr logic expression over the variables `a`, `b` and `c`
   * @param f    reference implementation using Scala's boolean operators
   */
  def eval(expr: String, f: (Boolean, Boolean, Boolean) => Boolean) =
    describe(expr) in {
      check { (a: Boolean, b: Boolean, c: Boolean) =>
        compile(substitute(expr, a, b, c))(null) must_== f(a, b, c)
      }
    }

  /**
   * Registers an example checking a four variable expression.
   *
   * @param expr logic expression over the variables `a`, `b`, `c` and `d`
   * @param f    reference implementation using Scala's boolean operators
   */
  def eval(expr: String, f: (Boolean, Boolean, Boolean, Boolean) => Boolean) =
    describe(expr) in {
      check { (a: Boolean, b: Boolean, c: Boolean, d: Boolean) =>
        compile(substitute(expr, a, b, c, d))(null) must_== f(a, b, c, d)
      }
    }

  "two variable logic expressions" should {
    eval("a | b", (a: Boolean, b: Boolean) => a | b)
    eval("a & b", (a: Boolean, b: Boolean) => a & b)
  }

  "three variable logic expressions" should {
    eval("(a | (b & c))", (a: Boolean, b: Boolean, c: Boolean) => (a | (b & c)))
    eval("(a & (b & c))", (a: Boolean, b: Boolean, c: Boolean) => (a & (b & c)))
    eval("(a & (b | c))", (a: Boolean, b: Boolean, c: Boolean) => (a & (b | c)))
    eval("(a | (b | c))", (a: Boolean, b: Boolean, c: Boolean) => (a | (b | c)))
  }

  "four variable logic expressions" should {
    eval("(a | (b & c & d))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => (a | (b & c & d)))
    eval("(a | (b & c | d))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => (a | (b & c | d)))
    eval("(a | (b | c & d))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => (a | (b | c & d)))
    eval("(a | (b | c | d))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => (a | (b | c | d)))
    eval("(a & (b & c & d))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => (a & (b & c & d)))
    eval("(a & (b & c | d))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => (a & (b & c | d)))
    eval("(a & (b | c & d))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => (a & (b | c & d)))
    eval("(a & (b | c | d))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => (a & (b | c | d)))
    eval("((a | b) & (c | d))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => ((a | b) & (c | d)))
    eval("((a & b) | (c & d))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => ((a & b) | (c & d)))
    eval("(!(a | b) & (c | d))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => (!(a | b) & (c | d)))
    eval("((a | b) & !(c | d))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => ((a | b) & !(c | d)))
    eval("(!((a | b) & !(c | d)))", (a: Boolean, b: Boolean, c: Boolean, d: Boolean) => (!((a | b) & !(c | d))))
  }

  /**
   * Replaces the variables `a`, `b`, `c`, ... in `expr` by the literals
   * `true` / `false`, taking the i-th value for the i-th letter.
   *
   * Substitution runs in alphabetical order. That matters because the
   * literal "false" itself contains an `a`: `a` is replaced first, and none
   * of the later variable letters used in this spec occur in "true" or
   * "false".
   *
   * @param expr    expression text containing single-letter variables
   * @param varargs values for `a`, `b`, `c`, ... in that order
   * @return the expression with every variable replaced by a literal
   */
  def substitute(expr: String, varargs: Boolean*): String =
    varargs.zipWithIndex.foldLeft(expr) { case (text, (value, index)) =>
      val variable = ('a' + index).toChar.toString
      text.replace(variable, value.toString)
    }

  /** Parses `logic` with the trivial parser. */
  def compile(logic: String): LogicExpression[String] = LogicExpressionParsers.trivial.parse(logic)

  /** Parses `logic` with the string-match parser. */
  def compileStringMatch(logic: String): LogicExpression[String] = LogicExpressionParsers.stringMatch.parse(logic)
}
/**
 * Runs the logic example from the README: predicates of the form
 * `part='value'` are evaluated against a `WordToken`.
 */
@RunWith(classOf[JUnitRunner])
class WordLogicTest extends Specification {
  case class WordToken(string: String, postag: String, chunk: String)

  "README logic example" should {
    "work" in {
      // Parser whose arguments compare one field of a WordToken with a
      // single-quoted value, e.g. string='the'.
      def create(string: String) = {
        val parser = new LogicExpressionParser[WordToken] {
          override def factory(expr: String) = {
            new Arg.Pred[WordToken](expr) {
              val Array(part, quotedValue) = expr.split("=")
              // strip the surrounding quote characters
              val value = quotedValue.slice(1, quotedValue.length - 1)

              override def apply(entity: WordToken) = {
                val actual = part match {
                  case "string" => entity.string
                  case "postag" => entity.postag
                  case "chunk" => entity.chunk
                }
                actual == value
              }
            }
          }
        }
        parser.parse(string)
      }

      val logic = create("string='the' | postag='JJ'")

      val matchesOnString = logic.apply(WordToken("the", "foo", "bar"))
      matchesOnString must beTrue

      val matchesOnPostag = logic.apply(WordToken("foo", "JJ", "bar"))
      matchesOnPostag must beTrue

      val matchesNeither = logic.apply(WordToken("foo", "bar", "baz"))
      matchesNeither must beFalse
    }
  }
}
/**
 * Specification for the start (`^`) and end (`$`) assertions.
 *
 * The word pattern `<is> <a>` is compiled four times: bare, anchored at the
 * end, anchored at the start, and anchored on both sides. Each variant is
 * then searched for in "this is a test" and in that sentence with the first
 * or the last token removed.
 */
@RunWith(classOf[JUnitRunner])
class RegularExpressionAssertionTest extends Specification {
  // The two word tokens must be `<is>` and `<a>`: every expectation below,
  // including the explicit "match 'is a'" example, presupposes that pattern.
  // Empty strings here would leave nothing between the assertions.
  val regexTokens = List("^", "<is>", "<a>", "$")
  val matchTokens = List("this", "is", "a", "test")

  // tail drops the leading "^", init drops the trailing "$"
  val regex = RegularExpressionParsers.word.parse(regexTokens.tail.init.mkString(" "))
  val regexEnd = RegularExpressionParsers.word.parse(regexTokens.tail.mkString(" "))
  val regexStart = RegularExpressionParsers.word.parse(regexTokens.init.mkString(" "))
  val regexBoth = RegularExpressionParsers.word.parse(regexTokens.mkString(" "))

  /**
   * Registers an example asserting whether `regex` is found in `tokens`.
   *
   * @param regex  compiled expression under test
   * @param tokens sentence to search, one word per element
   * @param value  true if a match is expected, false otherwise
   */
  def evaluate(regex: RegularExpression[String], tokens: List[String], value: Boolean) = {
    val negation = if (value) "" else "not "
    val sentence = tokens.mkString(" ")
    s"${negation}be found in '$sentence': " in {
      regex.apply(tokens) must beTrue.iff(value)
    }
  }

  regex.toString should {
    evaluate(regex, matchTokens, true)
    evaluate(regex, matchTokens.tail, true)
    evaluate(regex, matchTokens.init, true)
  }

  regexEnd.toString should {
    evaluate(regexEnd, matchTokens, false)
    evaluate(regexEnd, matchTokens.tail, false)
    evaluate(regexEnd, matchTokens.init, true)
  }

  regexStart.toString should {
    evaluate(regexStart, matchTokens, false)
    evaluate(regexStart, matchTokens.tail, true)
    evaluate(regexStart, matchTokens.init, false)
  }

  regexBoth.toString should {
    "match 'is a'" in {
      regexBoth.matches(List("is", "a")) must beTrue
    }
    evaluate(regexBoth, matchTokens, false)
    evaluate(regexBoth, matchTokens.tail, false)
    evaluate(regexBoth, matchTokens.init, false)
  }
}
/**
 * Specification for named capture groups (`(<name>: ...)`) in word regular
 * expressions: the pattern must match a set of sentences and report the
 * text and token span of each named group.
 */
@RunWith(classOf[JUnitRunner])
class RegularExpressionNamedGroupTest extends Specification {
  // NOTE(review): pattern reconstructed from the expectations below. They
  // look up the groups "subject", "subjadj", "poss" and "possadj" and match
  // exactly the sentences listed, which a pattern without group names or
  // word tokens cannot satisfy. Confirm against the upstream test.
  val regex = RegularExpressionParsers.word.parse(
    "(<subject>: <I> | (?: <The> (<subjadj>: <crazy>)? <Mariners>)) <know> <all> <of> " +
      "(<poss>: <her> | (?: <the> (<possadj>: <dirty>?) <King> <'s>)) <secrets>")

  /** Splits a sentence into its space-separated word tokens. */
  private def words(sentence: String): List[String] = sentence.split(" ").toList

  regex.toString should {
    val matches = List("I know all of her secrets",
      "The Mariners know all of her secrets",
      "The Mariners know all of the dirty King 's secrets",
      "The Mariners know all of the King 's secrets",
      "The crazy Mariners know all of the King 's secrets")

    matches.foreach { m =>
      "match against " + m in {
        regex.apply(words(m)) must beTrue
      }
    }

    "yield the correct groups when the optional groups take part" in {
      val m = regex.find(words("The crazy Mariners know all of the King 's secrets"))
      // whole match plus the four named groups
      m.groups().size() must_== 5

      m.group("subject").text must_== "The crazy Mariners"
      m.group("subject").startIndex must_== 0
      m.group("subject").endIndex must_== 2

      m.group("subjadj").text must_== "crazy"
      m.group("subjadj").startIndex must_== 1
      m.group("subjadj").endIndex must_== 1

      m.group("poss").text must_== "the King 's"
      m.group("poss").startIndex must_== 6
      m.group("poss").endIndex must_== 8

      // the group took part in the match but consumed no token
      m.group("possadj").text must_== ""
      m.group("possadj").startIndex must_== -1
      m.group("possadj").endIndex must_== -1
    }

    "yield the correct groups when the optional groups are skipped" in {
      val m = regex.find(words("The Mariners know all of her secrets"))
      // whole match plus "subject" and "poss"
      m.groups().size() must_== 3

      m.group("subject").text must_== "The Mariners"
      m.group("subject").startIndex must_== 0
      m.group("subject").endIndex must_== 1

      m.group("poss").text must_== "her"
      m.group("poss").startIndex must_== 5
      m.group("poss").endIndex must_== 5
    }
  }
}
/**
 * Specification for unnamed capture groups combined with the `+`, `*`, `?`
 * and `{min,max}` quantifiers in word regular expressions.
 */
@RunWith(classOf[JUnitRunner])
class RegularExpressionTest extends Specification {
  // NOTE(review): pattern reconstructed from the expectations below; the
  // sentences they accept and reject cannot be told apart by a pattern
  // without word tokens. Confirm against the upstream test.
  val regex = RegularExpressionParsers.word.parse(
    "<this> <is> (((?:(?:<a> <very>+) | <an>) <amazing>? <new>{1,3}) | " +
      "(?:<a> <many>* <centuries> <old>)) <test>")

  /** Splits a sentence into its space-separated word tokens. */
  private def words(sentence: String): List[String] = sentence.split(" ").toList

  regex.toString should {
    "match" in {
      regex.apply(words("this is a very very very amazing new test")) must beTrue
      regex.apply(words("this is a very new test")) must beTrue
      regex.apply(words("this is an amazing new test")) must beTrue
      regex.apply(words("this is a centuries old test")) must beTrue
      regex.apply(words("this is a many many centuries old test")) must beTrue
      regex.apply(words("this is a very new new test")) must beTrue
      regex.apply(words("this is a very new new new test")) must beTrue
    }

    "not match" in {
      // one repetition more than the {1,3} bound allows
      regex.apply(words("this is a very new new new new test")) must beFalse
      regex.apply(words("this is a amazing new test")) must beFalse
    }

    "yield the correct groups for the nested alternative" in {
      val m = regex.find(words("this is a very very very amazing new test"))
      // whole match, outer group and the nested group inside it
      m.groups().size() must_== 3
      m.groups().get(1).text must_== "a very very very amazing new"
      m.groups().get(2).text must_== "a very very very amazing new"
    }

    "yield the correct groups for the non-capturing alternative" in {
      val m = regex.find(words("this is a centuries old test"))
      // whole match and the outer group only
      m.groups().size() must_== 2
      m.groups().get(1).text must_== "a centuries old"
    }
  }
}
/**
 * Runs the first regular expression example from the README against a
 * sentence whose tokens carry a string, a part-of-speech tag and a chunk tag.
 */
@RunWith(classOf[JUnitRunner])
class WordRegularExpressionTest extends Specification {
  case class WordToken(string: String, postag: String, chunk: String)

  /**
   * Compiles `string` into a regular expression over `WordToken`s.
   *
   * Each base token has the form `part='value'`; it matches a WordToken
   * whose `part` field equals `value`, ignoring case.
   */
  def compile(string: String): RegularExpression[WordToken] = {
    val parser = new RegularExpressionParser[WordToken]() {
      // Turns the text "part='value'" into a BaseExpression that tests the
      // named part of a WordToken.
      override def factory(string: String): BaseExpression[WordToken] = {
        new BaseExpression[WordToken](string) {
          val Array(part, quotedValue) = string.split("=")
          // strip the surrounding quote characters
          val value = quotedValue.drop(1).take(quotedValue.size - 2)

          override def apply(entity: WordToken) = {
            val actual = part match {
              case "string" => entity.string
              case "postag" => entity.postag
              case "chunk" => entity.chunk
            }
            actual equalsIgnoreCase value
          }
        }
      }
    }

    parser.parse(string)
  }

  "README regex example one" should {
    "work" in {
      val sentence = "The US president Barack Obama is travelling to Mexico."
      // the sentence above as (word, part-of-speech) pairs; no chunk tags
      val tagged = Seq(
        "The" -> "DT",
        "US" -> "NNP",
        "president" -> "NN",
        "Barack" -> "NNP",
        "Obama" -> "NNP",
        "is" -> "VB",
        "travelling" -> "VB",
        "to" -> "TO",
        "Mexico" -> "NN",
        "." -> ".")
      val tokens = tagged.map { case (word, tag) => WordToken(word, tag, null) }

      // NOTE(review): this literal contains no part='value' tokens at all;
      // it looks as if they were lost. Confirm against the README example.
      val regex = compile("""(?: | | )? * + + +""")

      // find returns null when nothing matches, hence the Option wrapper
      val found = Option(regex.find(tokens.asJava))
      found.size must_== 1

      val wholeMatch = found.get.groups.get(0)
      val matchedText = wholeMatch.tokens.asScala.map(_.string).mkString(" ")
      matchedText must_== "The US president Barack Obama"
    }
  }
}