├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── build.sbt
└── src
├── main
└── java
│ └── edu
│ └── washington
│ └── cs
│ └── knowitall
│ ├── logic
│ ├── ArgFactory.java
│ ├── Expression.java
│ ├── LogicException.java
│ ├── LogicExpression.java
│ ├── LogicExpressionParser.java
│ └── LogicExpressionParsers.java
│ └── regex
│ ├── Expression.java
│ ├── ExpressionFactory.java
│ ├── FiniteAutomaton.java
│ ├── Match.java
│ ├── RegexException.java
│ ├── RegularExpression.java
│ ├── RegularExpressionParser.java
│ └── RegularExpressionParsers.java
└── test
├── java
└── edu
│ └── washington
│ └── cs
│ └── knowitall
│ └── regex
│ └── MinMaxTest.java
└── scala
└── edu
└── washington
└── cs
└── knowitall
├── logic
├── LogicTest.scala
└── WordLogicTest.scala
└── regex
├── RegularExpressionAssertionTest.scala
├── RegularExpressionNamedGroupTest.scala
├── RegularExpressionPermutationTest.scala
├── RegularExpressionUnnamedGroupTest.scala
└── WordRegularExpressionTest.scala
/.gitignore:
--------------------------------------------------------------------------------
1 | .classpath
2 | .project
3 | .settings
4 | .cache
5 | target
6 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: scala
2 | scala:
3 | - "2.10.2"
4 | jdk:
5 | - oraclejdk7
6 | - openjdk7
7 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | GNU LESSER GENERAL PUBLIC LICENSE
2 | Version 3, 29 June 2007
3 |
4 | Copyright (C) 2007 Free Software Foundation, Inc.
5 | Everyone is permitted to copy and distribute verbatim copies
6 | of this license document, but changing it is not allowed.
7 |
8 |
9 | This version of the GNU Lesser General Public License incorporates
10 | the terms and conditions of version 3 of the GNU General Public
11 | License, supplemented by the additional permissions listed below.
12 |
13 | 0. Additional Definitions.
14 |
15 | As used herein, "this License" refers to version 3 of the GNU Lesser
16 | General Public License, and the "GNU GPL" refers to version 3 of the GNU
17 | General Public License.
18 |
19 | "The Library" refers to a covered work governed by this License,
20 | other than an Application or a Combined Work as defined below.
21 |
22 | An "Application" is any work that makes use of an interface provided
23 | by the Library, but which is not otherwise based on the Library.
24 | Defining a subclass of a class defined by the Library is deemed a mode
25 | of using an interface provided by the Library.
26 |
27 | A "Combined Work" is a work produced by combining or linking an
28 | Application with the Library. The particular version of the Library
29 | with which the Combined Work was made is also called the "Linked
30 | Version".
31 |
32 | The "Minimal Corresponding Source" for a Combined Work means the
33 | Corresponding Source for the Combined Work, excluding any source code
34 | for portions of the Combined Work that, considered in isolation, are
35 | based on the Application, and not on the Linked Version.
36 |
37 | The "Corresponding Application Code" for a Combined Work means the
38 | object code and/or source code for the Application, including any data
39 | and utility programs needed for reproducing the Combined Work from the
40 | Application, but excluding the System Libraries of the Combined Work.
41 |
42 | 1. Exception to Section 3 of the GNU GPL.
43 |
44 | You may convey a covered work under sections 3 and 4 of this License
45 | without being bound by section 3 of the GNU GPL.
46 |
47 | 2. Conveying Modified Versions.
48 |
49 | If you modify a copy of the Library, and, in your modifications, a
50 | facility refers to a function or data to be supplied by an Application
51 | that uses the facility (other than as an argument passed when the
52 | facility is invoked), then you may convey a copy of the modified
53 | version:
54 |
55 | a) under this License, provided that you make a good faith effort to
56 | ensure that, in the event an Application does not supply the
57 | function or data, the facility still operates, and performs
58 | whatever part of its purpose remains meaningful, or
59 |
60 | b) under the GNU GPL, with none of the additional permissions of
61 | this License applicable to that copy.
62 |
63 | 3. Object Code Incorporating Material from Library Header Files.
64 |
65 | The object code form of an Application may incorporate material from
66 | a header file that is part of the Library. You may convey such object
67 | code under terms of your choice, provided that, if the incorporated
68 | material is not limited to numerical parameters, data structure
69 | layouts and accessors, or small macros, inline functions and templates
70 | (ten or fewer lines in length), you do both of the following:
71 |
72 | a) Give prominent notice with each copy of the object code that the
73 | Library is used in it and that the Library and its use are
74 | covered by this License.
75 |
76 | b) Accompany the object code with a copy of the GNU GPL and this license
77 | document.
78 |
79 | 4. Combined Works.
80 |
81 | You may convey a Combined Work under terms of your choice that,
82 | taken together, effectively do not restrict modification of the
83 | portions of the Library contained in the Combined Work and reverse
84 | engineering for debugging such modifications, if you also do each of
85 | the following:
86 |
87 | a) Give prominent notice with each copy of the Combined Work that
88 | the Library is used in it and that the Library and its use are
89 | covered by this License.
90 |
91 | b) Accompany the Combined Work with a copy of the GNU GPL and this license
92 | document.
93 |
94 | c) For a Combined Work that displays copyright notices during
95 | execution, include the copyright notice for the Library among
96 | these notices, as well as a reference directing the user to the
97 | copies of the GNU GPL and this license document.
98 |
99 | d) Do one of the following:
100 |
101 | 0) Convey the Minimal Corresponding Source under the terms of this
102 | License, and the Corresponding Application Code in a form
103 | suitable for, and under terms that permit, the user to
104 | recombine or relink the Application with a modified version of
105 | the Linked Version to produce a modified Combined Work, in the
106 | manner specified by section 6 of the GNU GPL for conveying
107 | Corresponding Source.
108 |
109 | 1) Use a suitable shared library mechanism for linking with the
110 | Library. A suitable mechanism is one that (a) uses at run time
111 | a copy of the Library already present on the user's computer
112 | system, and (b) will operate properly with a modified version
113 | of the Library that is interface-compatible with the Linked
114 | Version.
115 |
116 | e) Provide Installation Information, but only if you would otherwise
117 | be required to provide such information under section 6 of the
118 | GNU GPL, and only to the extent that such information is
119 | necessary to install and execute a modified version of the
120 | Combined Work produced by recombining or relinking the
121 | Application with a modified version of the Linked Version. (If
122 | you use option 4d0, the Installation Information must accompany
123 | the Minimal Corresponding Source and Corresponding Application
124 | Code. If you use option 4d1, you must provide the Installation
125 | Information in the manner specified by section 6 of the GNU GPL
126 | for conveying Corresponding Source.)
127 |
128 | 5. Combined Libraries.
129 |
130 | You may place library facilities that are a work based on the
131 | Library side by side in a single library together with other library
132 | facilities that are not Applications and are not covered by this
133 | License, and convey such a combined library under terms of your
134 | choice, if you do both of the following:
135 |
136 | a) Accompany the combined library with a copy of the same work based
137 | on the Library, uncombined with any other library facilities,
138 | conveyed under the terms of this License.
139 |
140 | b) Give prominent notice with the combined library that part of it
141 | is a work based on the Library, and explaining where to find the
142 | accompanying uncombined form of the same work.
143 |
144 | 6. Revised Versions of the GNU Lesser General Public License.
145 |
146 | The Free Software Foundation may publish revised and/or new versions
147 | of the GNU Lesser General Public License from time to time. Such new
148 | versions will be similar in spirit to the present version, but may
149 | differ in detail to address new problems or concerns.
150 |
151 | Each version is given a distinguishing version number. If the
152 | Library as you received it specifies that a certain numbered version
153 | of the GNU Lesser General Public License "or any later version"
154 | applies to it, you have the option of following the terms and
155 | conditions either of that published version or of any later version
156 | published by the Free Software Foundation. If the Library as you
157 | received it does not specify a version number of the GNU Lesser
158 | General Public License, you may choose any version of the GNU Lesser
159 | General Public License ever published by the Free Software Foundation.
160 |
161 | If the Library as you received it specifies that a proxy can decide
162 | whether future versions of the GNU Lesser General Public License shall
163 | apply, that proxy's public statement of acceptance of any version is
164 | permanent authorization for you to choose that version for the
165 | Library.
166 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # OpenRegex
2 |
3 | OpenRegex is written by Michael Schmitz at the Turing Center.
4 | It is licensed under the lesser GPL.
5 | Please see the LICENSE file for more details.
6 |
7 |
8 | ## Introduction
9 |
10 | OpenRegex is an efficient and flexible token-based regular expression language
11 | and engine. Most regular expression implementations are closed to run only
12 | over characters. Although this is the most common application for regular
13 | expressions, OpenRegex does not have this restriction. OpenRegex is open to
14 | any sequences of user-defined objects.
15 |
16 |
17 | ## Applied to Natural Language
18 |
19 | For example, OpenRegex is used in the R2A2 extension to ReVerb, an open-domain
20 | information extractor, to determine argument boundaries. In this case, tokens
21 | are words in English sentences with additional information (the string of the
22 | word, the part-of-speech tag, and the chunk tag).
23 |
24 | case class WordToken(string: String, postag: String, chunk: String)
25 |
26 | Now that we have defined our token, we can build up a sentence (an NLP library
27 | such as OpenNLP can help out here). We will also need to define a way to
28 | translate each token in the expression (text between angle brackets) into
29 | an expression that can be applied to a word token.
30 |
31 | ```
32 | def compile(string: String): RegularExpression[WordToken] = {
33 | // create a parser for regular expression language that have
34 | // the same token representation
35 | val parser =
36 | new RegularExpressionParser[WordToken]() {
37 | // Translate a string "part=value" into a BaseExpression that
38 | // checks whether the part of a WordToken has value 'value'.
39 | override def factory(string: String): BaseExpression[WordToken] = {
40 | new BaseExpression[WordToken](string) {
41 | val Array(part, quotedValue) = string.split("=")
42 | val value = quotedValue.drop(1).take(quotedValue.size - 2)
43 | override def apply(entity: WordToken) = {
44 | part match {
45 | case "string" => entity.string equalsIgnoreCase value
46 | case "postag" => entity.postag equalsIgnoreCase value
47 | case "chunk" => entity.chunk equalsIgnoreCase value
48 | }
49 | }
50 | }
51 | }
52 | }
53 |
54 | parser.parse(string)
55 | }
56 | ```
57 |
58 | Now we can compile a regular expression and apply it to a sentence. Consider
59 | the following pattern. The first line defines a non-matching group that
60 | matches a determiner ("a", "an", or "the"). The second line matches a sequence
61 | of part-of-speech tags ("JJ" is adjective, "NNP" is proper noun, and "NN" is
62 | common noun).
63 |
64 | (?: | | )?
65 | * + + +
66 |
67 | We can try applying it to a couple of sentences.
68 |
69 | 1. The US president Barack Obama is travelling to Mexico.
70 |
71 | ```
72 | regex.find(sentence).groups.get(0) matches "The US president Barack Obama"
73 | ```
74 |
75 | 2. If all the ice melted from the frigid Earth continent Antarctica, sea
76 | levels would rise hundreds of feet.
77 |
78 | ```
79 | regex.find(sentence).groups.get(0) matches "the frigid Earth continent Antarctica"
80 | ```
81 |
82 | We may want to pull out the text from certain parts of our match. We can do
83 | this with either named or unnamed groups. Consider the following new form of
84 | the pattern and the sentence in example 2.
85 |
86 | ```
87 | (?: | | )? *
88 | (:+) (:+) (:+)
89 |
90 | regex.find(sentence).groups.get(0) matches "the frigid Earth continent Antarctica"
91 | regex.find(sentence).groups.get(1) matches "Earth"
92 | regex.find(sentence).groups.get(2) matches "continent"
93 | regex.find(sentence).groups.get(3) matches "Antarctica"
94 |
95 | regex.find(sentence).group("arg1") matches "Earth"
96 | regex.find(sentence).group("rel") matches "continent"
97 | regex.find(sentence).group("arg2") matches "Antarctica"
98 | ```
99 |
100 | ## Supported Constructs
101 |
102 | The regular expression library supports the following constructs.
103 |
104 | ```
105 | | alternation
106 | ? option
107 | * Kleene-star
108 | + plus
109 | ^ beginning
110 | $ end
111 | {x,y} match at least x but not more than y times
112 | () matching groups
113 | (?:) non-matching groups
114 | (:) named groups
115 | ```
116 |
117 | Most of these operators work the same as in java.util.regex. Presently,
118 | however, alternation binds to its immediate neighbors. This means that ` | `
119 | means ` (?: | )` whereas in Java it would mean `(?: ) | `.
120 | This may change in a future release so it is advised that the
121 | alternation arguments be made explicit with non-matching groups.
122 |
123 | All operators are greedy, and there are no non-greedy counterparts.
124 | Backreferences are not supported because the underlying representation only
125 | supports regular languages (backreferences are not regular).
126 |
127 |
128 | ## Simple Java Example
129 |
130 | The NLP example is rather complex but it shows the power of OpenRegex. For a
131 | simpler example, look at RegularExpressions.word. This is a static factory
132 | method for a simple word-based regular expression where only the string is
133 | considered. This factory is used in the test cases.
134 |
135 | You can also play around with RegularExpressions.word by running the main
136 | method in RegularExpression and specifying an expression with arg1.
137 |
138 | sbt 'run-main edu.washington.cs.knowitall.regex.RegularExpression " * (?:)?"'
139 |
140 |
141 | ## Logic Expressions
142 |
143 | Included is an engine for parsing and evaluating logic expressions. For
144 | example, you might want to extend the NLP regular expression language to be
145 | able to check multiple fields in a single regular expression token. If you
146 | assumed each regular expression token to be a logic expression, you could
147 | write patterns such as the following.
148 |
149 | ```
150 |
151 | ```
152 |
153 | Extending the regular expression in this way is easy. It only involves
154 | rewriting the apply method in BaseExpression inside the compile method.
155 | Most of the code below existed before--now it's just moved outside the
156 | apply method.
157 |
158 | ```
159 | val logic = new LogicExpressionParser[WordToken] {
160 | override def factory(expr: String) = {
161 | new Arg.Pred[WordToken](expr) {
162 | val Array(part, quotedValue) = expr.split("=")
163 | val value = quotedValue.drop(1).take(quotedValue.size - 2)
164 | override def apply(entity: WordToken) = part match {
165 | case "string" => entity.string == value
166 | case "postag" => entity.postag == value
167 | case "chunk" => entity.chunk == value
168 | }
169 | }
170 | }
171 | }.parse(value)
172 |
173 | override def apply(entity: WordToken) = {
174 | logic.apply(entity)
175 | }
176 | ```
177 |
178 | Play around with logic expressions by using the main method in LogicExpression.
179 |
180 | sbt 'run-main edu.washington.cs.knowitall.logic.LogicExpression'
181 |
182 | You can enter logic expressions such as "true & false" or "true | false" and
183 | have them evaluated interactively.
184 |
185 |
186 | ## Implementation
187 |
188 | Regular expressions are evaluated using Thompson NFA, which is fast and does not have
189 | the pathological cases that most regular expression libraries have. For more
190 | information about Thompson NFA in comparison to recursive backtracking, read
191 | http://swtch.com/~rsc/regexp/regexp1.html. Future work may involve compiling
192 | NFAs to DFAs.
193 |
194 |
195 | ## Future Work
196 |
197 | 1. Compile to DFA.
198 | 2. Use parser combinators for parsing regular expressions.
199 |
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
1 | organization := "edu.washington.cs.knowitall"
2 |
3 | name := "openregex"
4 |
5 | description := "OpenRegex is an efficient and flexible library for running regular expressions over sequences of user-defined objects."
6 |
7 | version := "1.1.2-SNAPSHOT"
8 |
9 | libraryDependencies ++= Seq("com.google.code.findbugs" % "jsr305" % "2.0.1",
10 | "com.google.guava" % "guava" % "15.0",
11 | "org.scala-lang" % "scala-library" % "2.10.2" % "test",
12 | "junit" % "junit" % "4.10" % "test",
13 | "org.specs2" % "specs2_2.10" % "2.2.2" % "test",
14 | "org.scalacheck" % "scalacheck_2.10" % "1.10.1" % "test")
15 |
16 | licenses := Seq("LGPL (GNU Lesser General Public License)" -> url("http://www.gnu.org/licenses/lgpl.html"))
17 |
18 | homepage := Some(url("https://github.com/knowitall/openregex"))
19 |
20 | publishMavenStyle := true
21 |
22 | publishTo <<= version { (v: String) => // pick the publish repository from the version string
23 | val nexus = "https://oss.sonatype.org/"
24 | if (v.trim.endsWith("SNAPSHOT")) // versions ending in SNAPSHOT go to the snapshots repository
25 | Some("snapshots" at nexus + "content/repositories/snapshots")
26 | else // every other version goes to the release staging repository
27 | Some("releases" at nexus + "service/local/staging/deploy/maven2")
28 | }
29 |
30 | pomExtra := ( // NOTE(review): the XML element tags of this literal appear to be missing in this copy -- confirm against the original build.sbt
31 |
32 | https://github.com/knowitall/openregex
33 | scm:git://github.com/knowitall/openregex.git
34 | scm:git:git@github.com:knowitall/openregex.git
35 | HEAD
36 |
37 |
38 |
39 | Michael Schmitz
40 |
41 | )
42 |
--------------------------------------------------------------------------------
/src/main/java/edu/washington/cs/knowitall/logic/ArgFactory.java:
--------------------------------------------------------------------------------
1 | package edu.washington.cs.knowitall.logic;
2 |
3 | import com.google.common.base.Function;
4 |
5 | /**
6 | * An abstract factory class that converts the string representation of
7 | * an argument into a token. This token uses the supplied delegate to
8 | * evaluate the expression against an entity into a boolean.
9 | *
10 | * @author Michael Schmitz
11 | *
12 | * @param
13 | */
14 | public abstract class ArgFactory implements Function> {
15 | /***
16 | * Converts the supplied string into an argument token (an Expression.Arg).
17 | */
18 | public abstract Expression.Arg create(String string);
19 |
20 | /***
21 | * Implements the Function interface by delegating to create(String).
22 | */
23 | @Override
24 | public Expression.Arg apply(String string) {
25 | return this.create(string);
26 | }
27 | }
28 |
--------------------------------------------------------------------------------
/src/main/java/edu/washington/cs/knowitall/logic/Expression.java:
--------------------------------------------------------------------------------
1 | package edu.washington.cs.knowitall.logic;
2 |
3 | import com.google.common.base.Predicate;
4 |
5 | /**
6 | * Superclass for expressions in a Logic Expression.
7 | *
8 | * @author Michael Schmitz
9 | */
10 | public abstract class Expression {
11 | /**
12 | * An expression that can be applied.
13 | */
14 | public static abstract class Apply extends Expression {
15 | /**
16 | * Apply this expression to an entity to get true or false.
17 | */
18 | public abstract boolean apply(E entity);
19 | }
20 |
21 | /**
22 | * An operator expression.
23 | */
24 | public static abstract class Op extends Apply {
25 | /**
26 | * @return true if this has precedence over that, i.e. its precedence number is strictly smaller
27 | */
28 | public boolean preceeds(Op> that) {
29 | return this.precedence() < that.precedence();
30 | }
31 |
32 | /**
33 | * The precedence of this operator. A smaller number denotes higher
34 | * precedence.
35 | *
36 | * @return the precedence level of this operator
37 | */
38 | public abstract int precedence();
39 |
40 | /**
41 | * An operator that takes a single argument, such as negation.
42 | */
43 | public static abstract class Mon extends Op {
44 | public Apply sub; // the single operand; toString treats null as "not assigned yet"
45 |
46 | public String toString(String symbol) {
47 | if (sub == null) { // no operand assigned: print only the symbol
48 | return symbol;
49 | }
50 | else {
51 | return symbol + "(" + sub.toString() + ")";
52 | }
53 | }
54 |
55 | /**
56 | * The negation operator.
57 | */
58 | public static class Not extends Mon {
59 | public String toString() {
60 | return super.toString("!");
61 | }
62 |
63 | @Override
64 | public boolean apply(E entity) {
65 | return !sub.apply(entity);
66 | }
67 |
68 | @Override
69 | public int precedence() {
70 | return 0; // smallest number of the three operators, so negation binds tightest
71 | }
72 | }
73 | }
74 |
75 | /**
76 | * An operator that takes two arguments, such as disjunction.
77 | */
78 | public static abstract class Bin extends Op {
79 | public Apply left;
80 | public Apply right;
81 |
82 | public String toString(String symbol) {
83 | if (left == null || right == null) { // an operand is missing: print only the symbol
84 | return symbol;
85 | }
86 | else {
87 | return "(" + left.toString() + " " + symbol + " " + right.toString() + ")";
88 | }
89 | }
90 |
91 | /**
92 | * The conjunction (logical and) operator.
93 | */
94 | public static class And extends Bin {
95 | public String toString() {
96 | return super.toString("&");
97 | }
98 |
99 | @Override
100 | public boolean apply(E entity) {
101 | return left.apply(entity) && right.apply(entity);
102 | }
103 |
104 | @Override
105 | public int precedence() {
106 | return 1; // binds tighter than Or (2), looser than Not (0)
107 | }
108 | }
109 |
110 | /**
111 | * The disjunction (logical or) operator.
112 | */
113 | public static class Or extends Bin {
114 | public String toString() {
115 | return super.toString("|");
116 | }
117 |
118 | @Override
119 | public boolean apply(E entity) {
120 | return left.apply(entity) || right.apply(entity);
121 | }
122 |
123 | @Override
124 | public int precedence() {
125 | return 2; // largest number of the three operators, so disjunction binds loosest
126 | }
127 | }
128 | }
129 | }
130 |
131 | /**
132 | * An expression that evaluates to true or false.
133 | */
134 | public static abstract class Arg extends Apply implements Predicate {
135 | /**
136 | * An expression that evaluates to true or false by applying a
137 | * predicate to the supplied entity.
138 | */
139 | public static abstract class Pred extends Arg {
140 | private String description;
141 |
142 | public Pred(String description) {
143 | this.description = description;
144 | }
145 |
146 | @Override
147 | public abstract boolean apply(E entity);
148 |
149 | public String getDescription() {
150 | return this.description;
151 | }
152 |
153 | public String toString() {
154 | return this.getDescription();
155 | }
156 | }
157 |
158 | /**
159 | * An expression that is a constant value--either true or false.
160 | */
161 | public static class Value extends Arg {
162 | private boolean value;
163 |
164 | public Value(boolean value) {
165 | this.value = value;
166 | }
167 |
168 | @Override
169 | public boolean apply(E entity) {
170 | return this.apply(); // the entity is ignored; the constant is returned
171 | }
172 |
173 | public boolean apply() {
174 | return value;
175 | }
176 |
177 | @Override
178 | public String toString() {
179 | return Boolean.toString(this.value);
180 | }
181 | }
182 | }
183 |
184 | /**
185 | * A parenthesis, used for grouping. These are only used prior to building
186 | * the AST.
187 | */
188 | public static class Paren extends Expression {
189 | /**
190 | * A left parenthesis.
191 | */
192 | public static class L extends Paren {
193 | public String toString() {
194 | return "(";
195 | }
196 | }
197 |
198 | /**
199 | * A right parenthesis.
200 | */
201 | public static class R extends Paren {
202 | public String toString() {
203 | return ")";
204 | }
205 | }
206 | }
207 | }
208 |
--------------------------------------------------------------------------------
/src/main/java/edu/washington/cs/knowitall/logic/LogicException.java:
--------------------------------------------------------------------------------
1 | package edu.washington.cs.knowitall.logic;
2 |
3 | /**
4 | * Base unchecked exception for the logic package; nested subclasses cover apply, compile and tokenize errors.
5 | * @author Michael Schmitz
6 | */
7 | public class LogicException extends RuntimeException {
8 | private static final long serialVersionUID = 1L;
9 |
10 | public LogicException(String message) {
11 | super(message);
12 | }
13 |
14 | public LogicException(String message, Exception e) {
15 | super(message, e);
16 | }
17 |
18 | /**
19 | * Exception while applying an expression to an object.
20 | */
21 | public static class ApplyLogicException extends LogicException {
22 | private static final long serialVersionUID = 1L;
23 |
24 | public ApplyLogicException(String message, Exception e) {
25 | super(message, e);
26 | }
27 |
28 | public ApplyLogicException(String message) {
29 | super(message);
30 | }
31 | }
32 |
33 | /**
34 | * Exception while converting the tokens into a valid expression.
35 | */
36 | public static class CompileLogicException extends LogicException {
37 | private static final long serialVersionUID = 1L;
38 |
39 | public CompileLogicException(String message, Exception e) {
40 | super(message, e);
41 | }
42 |
43 | public CompileLogicException(String message) {
44 | super(message);
45 | }
46 | }
47 |
48 | /**
49 | * Exception while tokenizing the logic expression string.
50 | */
51 | public static class TokenizeLogicException extends LogicException {
52 | private static final long serialVersionUID = 1L;
53 |
54 | public TokenizeLogicException(String message, Exception e) {
55 | super(message, e);
56 | }
57 |
58 | public TokenizeLogicException(String message) {
59 | super(message);
60 | }
61 | }
62 | }
63 |
--------------------------------------------------------------------------------
/src/main/java/edu/washington/cs/knowitall/logic/LogicExpression.java:
--------------------------------------------------------------------------------
1 | package edu.washington.cs.knowitall.logic;
2 |
3 | import java.util.ArrayList;
4 | import java.util.EmptyStackException;
5 | import java.util.LinkedList;
6 | import java.util.List;
7 | import java.util.Scanner;
8 | import java.util.Stack;
9 |
10 | import com.google.common.base.Function;
11 | import com.google.common.base.Predicate;
12 |
13 | import edu.washington.cs.knowitall.logic.Expression.Apply;
14 | import edu.washington.cs.knowitall.logic.Expression.Arg;
15 | import edu.washington.cs.knowitall.logic.Expression.Op;
16 | import edu.washington.cs.knowitall.logic.Expression.Paren;
17 | import edu.washington.cs.knowitall.logic.LogicException.ApplyLogicException;
18 | import edu.washington.cs.knowitall.logic.LogicException.CompileLogicException;
19 | import edu.washington.cs.knowitall.logic.LogicException.TokenizeLogicException;
20 |
21 | /**
22 | * A logic expression engine that operates over user specified objects.
23 | *
24 | * @author Michael Schmitz
25 | *
26 | * @param the type of the base expressions
27 | */
28 | public class LogicExpression implements Predicate {
29 | private final Apply expression;
30 |
31 | /***
32 | * Builds the expression tree from a list of tokens given in infix order.
33 | * @param expressions an infix representation of the logic expression.
34 | * @throws TokenizeLogicException
35 | * @throws CompileLogicException
36 | */
37 | protected LogicExpression(List> expressions)
38 | throws TokenizeLogicException, CompileLogicException {
39 | // put in reverse polish notation
40 | List> rpn = rpn(expressions);
41 |
42 | // compile the expression
43 | expression = buildAst(rpn);
44 | }
45 |
46 | /***
47 | * Compile an infix list of tokens into a LogicExpression.
48 | * @param expressions a list of tokens in infix form.
49 | * @return a new LogicExpression built from those tokens.
50 | */
51 | public static LogicExpression compile(
52 | final List> expressions) {
53 | return new LogicExpression(expressions);
54 | }
55 |
56 | /***
57 | * Helper factory method to instantiate a LogicExpression.
58 | * @param input The string to parse.
59 | * @param factoryDelegate The function applied to each argument string to build its Arg token.
60 | * @return a new LogicExpression
61 | */
62 | public static LogicExpression compile(final String input,
63 | final Function> factoryDelegate) {
64 | return new LogicExpressionParser() { // anonymous parser whose factory forwards to the delegate
65 | @Override
66 | public Arg factory(String argument) {
67 | return factoryDelegate.apply(argument);
68 | }
69 | }.parse(input);
70 | }
71 |
72 | @Override
73 | public String toString() { // "(empty)" for an empty expression, otherwise the expression tree's own string form
74 | if (this.isEmpty()) {
75 | return "(empty)";
76 | }
77 | else {
78 | return expression.toString();
79 | }
80 | }
81 |
82 |
83 | /***
84 | * Reports whether this expression is empty; an empty expression's apply returns true for all inputs.
85 | * @return true iff the expression is empty (the expression field is null).
86 | */
87 | public boolean isEmpty() {
88 | return this.expression == null;
89 | }
90 |
91 | @Override
92 | public boolean apply(E entity) { // evaluates the expression tree against the entity
93 | if (this.isEmpty()) {
94 | return true; // an empty expression accepts every entity
95 | }
96 | else {
97 | return this.expression.apply(entity);
98 | }
99 | }
100 |
101 | /***
102 | * Compile a list of tokens in postfix (rpn) order into an expression tree.
103 | * @param rpn a list of tokens in postfix (rpn) form.
104 | * @return an expression tree, or null when rpn is empty.
105 | */
106 | public static Apply buildAst(List> rpn) {
107 | if (rpn.isEmpty()) {
108 | return null;
109 | }
110 |
111 | Stack> stack = new Stack>();
112 | for (Expression tok : rpn) {
113 | if (tok instanceof Arg>) {
114 | stack.push((Arg) tok);
115 | } else if (tok instanceof Op>) {
116 | try {
117 | if (tok instanceof Op.Mon>){
118 | Apply sub = stack.pop();
119 |
120 | Op.Mon mon = (Op.Mon) tok;
121 |
122 | mon.sub = sub; // the operator token itself becomes the tree node
123 |
124 | stack.push(mon);
125 | }
126 | if (tok instanceof Op.Bin>) {
127 | Apply arg2 = stack.pop(); // operands come off in reverse: right first, then left
128 | Apply arg1 = stack.pop();
129 |
130 | Op.Bin bin = (Op.Bin) tok;
131 |
132 | bin.left = arg1;
133 | bin.right = arg2;
134 |
135 | stack.push(bin);
136 | }
137 | }
138 | catch (EmptyStackException e) { // an operator found fewer operands on the stack than it needs
139 | throw new CompileLogicException(
140 | "No argument for operator (stack empty): "
141 | + tok.toString());
142 | }
143 | }
144 | }
145 |
146 | if (stack.size() > 1) { // leftover operands that no operator consumed
147 | throw new ApplyLogicException(
148 | "Stack has multiple elements after apply: " + stack.toString());
149 | }
150 |
151 | if (stack.size() == 0) {
152 | throw new ApplyLogicException(
153 | "Stack has zero elements after apply.");
154 | }
155 |
156 | if (!(stack.peek() instanceof Apply>)) {
157 | throw new ApplyLogicException(
158 | "Stack contains non-appliable tokens after apply: " + stack.toString());
159 | }
160 |
161 | return (stack.pop()); // the single remaining element is the root of the tree
162 | }
163 |
164 | /***
165 | * Return a list of the arguments contained in the expression.
166 | * @return a new list of the argument descriptions gathered by the recursive helper.
167 | */
168 | public List getArgs() {
169 | List args = new ArrayList();
170 | getArgs(this.expression, args);
171 |
172 | return args;
173 | }
174 |
175 | /***
176 | * Private helper method to recursively find arguments, descending through binary and unary operators.
177 | * @param apply the expression tree to search; null (an empty expression) adds nothing.
178 | * @param args the resulting list; each predicate's description is appended in left-to-right order.
179 | */
180 | private void getArgs(Apply<?> apply, List<String> args) {
181 | if (apply instanceof Op.Bin<?>) {
182 | Op.Bin<?> bin = (Op.Bin<?>) apply;
183 | getArgs(bin.left, args);
184 | getArgs(bin.right, args);
185 | } else if (apply instanceof Op.Mon<?>) { // also descend into unary operators so negated predicates are reported
186 | getArgs(((Op.Mon<?>) apply).sub, args);
187 | } else if (apply instanceof Arg.Pred<?>) {
188 | args.add(((Arg.Pred<?>) apply).getDescription());
189 | }
190 | }
191 |
192 | /***
193 | * Converts an infix logic representation into a postfix logic representation.
194 | * @param tokens a list of tokens in infix form.
195 | * @return a list of tokens in postfix (rpn) form.
196 | * @throws CompileLogicException
197 | */
198 | public List> rpn(List> tokens)
199 | throws CompileLogicException {
200 | // intermediate storage
201 | Stack> stack = new Stack>();
202 |
203 | // final rpn output
204 | LinkedList> output = new LinkedList>();
205 |
206 | for (Expression tok : tokens) {
207 | if (tok instanceof Paren.L>) {
208 | stack.push(tok);
209 | } else if (tok instanceof Paren.R>) {
210 | Expression top;
211 | do {
212 | top = stack.pop();
213 |
214 | if (!(top instanceof Paren.L>)) {
215 | output.offer(top);
216 | }
217 |
218 | } while (!(top instanceof Paren.L>));
219 |
220 | } else if (tok instanceof Op.Mon>) {
221 | stack.push(tok);
222 | } else if (tok instanceof Op.Bin>) {
223 | // higher precedence
224 | while (!stack.isEmpty() && stack.peek() instanceof Op>
225 | && ((Op>)stack.peek()).preceeds((Op>)tok)) {
226 | output.offer(stack.pop());
227 | }
228 |
229 | stack.push(tok);
230 | } else if (tok instanceof Arg>) {
231 | output.offer(tok);
232 | }
233 | }
234 |
235 | // empty out items remaining ni the stack
236 | while (!stack.isEmpty()) {
237 | Expression top = stack.pop();
238 |
239 | if (top instanceof Paren.L> || top instanceof Paren.R>) {
240 | throw new CompileLogicException("Unbalanced parentheses.");
241 | }
242 |
243 | output.offer(top);
244 | }
245 |
246 | return output;
247 | }
248 |
249 |
250 | /***
251 | * Iteractively interpret logic statements from stdin such as "true | (true & false)".
252 | * @param args
253 | */
254 | public static void main(String[] args) {
255 | Scanner scan = new Scanner(System.in);
256 |
257 | while (scan.hasNextLine()) {
258 | String line = scan.nextLine();
259 |
260 | LogicExpression expr = LogicExpressionParsers.trivial.parse(line);
261 |
262 | System.out.println("string: " + expr.toString());
263 | System.out.println("value: " + expr.apply(null));
264 | System.out.println();
265 | }
266 |
267 | scan.close();
268 | }
269 | }
270 |
--------------------------------------------------------------------------------
/src/main/java/edu/washington/cs/knowitall/logic/LogicExpressionParser.java:
--------------------------------------------------------------------------------
1 | package edu.washington.cs.knowitall.logic;
2 |
3 | import java.util.ArrayList;
4 | import java.util.List;
5 | import java.util.Stack;
6 | import java.util.regex.Matcher;
7 | import java.util.regex.Pattern;
8 |
9 | import com.google.common.base.Function;
10 | import com.google.common.collect.Lists;
11 |
12 | import edu.washington.cs.knowitall.logic.Expression.Arg;
13 | import edu.washington.cs.knowitall.logic.Expression.Op;
14 | import edu.washington.cs.knowitall.logic.Expression.Paren;
15 | import edu.washington.cs.knowitall.logic.LogicException.TokenizeLogicException;
16 |
17 | /**
18 | * A logic expression engine that operates over user specified objects.
19 | *
20 | * @author Michael Schmitz
21 | *
22 | * @param the type of the base expressions
23 | */
24 | abstract public class LogicExpressionParser implements Function> {
25 | /***
26 | * Create a LogicExpression object from the supplied string.
27 | * @param string
28 | * @return
29 | */
30 | public LogicExpression parse(String string) {
31 | List> expressions = this.tokenize(string);
32 | return new LogicExpression(expressions);
33 | }
34 |
35 | @Override
36 | public LogicExpression apply(String string) {
37 | return this.parse(string);
38 | }
39 |
40 | /***
41 | * The factory method creates an argument from the supplied token string.
42 | * @param argument a string representation of a token
43 | * @return an evaluatable representation of a token
44 | */
45 | public abstract Arg factory(String argument);
46 |
47 | public final static Pattern doubleQuoteStringLiteralRegex =
48 | Pattern.compile("\"" + "([^\"\\p{Cntrl}\\\\]*+(?:\\\\[\\\\'\"bfnrt])*+(?:\\\\u[a-fA-F0-9]{4})*+)*+" + "\"");
49 | public final static Pattern singleQuoteStringLiteralRegex =
50 | Pattern.compile("'" + "(?:[^']*+)" + "'");
51 | public final static Pattern regexLiteralRegex =
52 | Pattern.compile("/" + "(?:(?:[^/\\\\]*+(?:\\\\)*+(?:\\\\/)*+)*+)" + "/");
53 | private final static List literalPatterns = Lists.newArrayList(
54 | doubleQuoteStringLiteralRegex, singleQuoteStringLiteralRegex,
55 | regexLiteralRegex);
56 |
57 | /***
58 | * The readToken method reads a token from the remaining LogicExpression string.
59 | *
60 | * A token may contain a string. If it contains parentheses, the token
61 | * will last until the parentheses are balanced. And &, |, or unbalanced )
62 | * will mark the end of a token.
63 | *
64 | * This is a default implementation that may be overriden.
65 | * @param remainder the remaining text to tokenize
66 | * @return a token from the beginning on `remaining`
67 | */
68 | public String readToken(String remainder) {
69 | final String token;
70 | try {
71 | Stack parens = new Stack();
72 |
73 | int nextExpression;
74 | for (nextExpression = 0; nextExpression < remainder.length(); nextExpression++) {
75 | char c = remainder.charAt(nextExpression);
76 |
77 | // check for quotation
78 | String match = null;
79 | for (Pattern pattern : literalPatterns) {
80 | Matcher matcher = pattern.matcher(remainder).region(
81 | nextExpression, remainder.length());
82 | if (matcher.lookingAt()) {
83 | match = matcher.group(0);
84 | break;
85 | }
86 | }
87 |
88 | if (match != null) {
89 | // we found and can consume a quotation
90 | nextExpression += match.length() - 1;
91 | } else if (c == '(') {
92 | parens.push(c);
93 | } else if (c == ')') {
94 | if (parens.isEmpty()) {
95 | break;
96 | } else {
97 | parens.pop();
98 | }
99 | } else if (c == '&' || c == '|') {
100 | break;
101 | }
102 | }
103 |
104 | token = remainder.substring(0, nextExpression).trim();
105 | } catch (Exception e) {
106 | throw new TokenizeLogicException("Error parsing token: "
107 | + remainder, e);
108 | }
109 |
110 | if (token.isEmpty()) {
111 | throw new TokenizeLogicException("zero-length token found.");
112 | }
113 |
114 | return token;
115 | }
116 |
117 | /***
118 | * Convert an infix string logic representation to an infix list of tokens.
119 | * @param input an infix string logic representation.
120 | * @param factory a delegate that converts a string representation of an
121 | * argument into a token object. @return
122 | *
123 | * @throws TokenizeLogicException
124 | */
125 | public List> tokenize(String input)
126 | throws TokenizeLogicException {
127 | List> tokens = new ArrayList>();
128 |
129 | int i = 0;
130 | while (i < input.length()) {
131 | String substring = input.substring(i);
132 | char firstChar = substring.charAt(0);
133 |
134 | if (firstChar == ' ') {
135 | i += 1;
136 | continue;
137 | }
138 | else if (firstChar == '(') {
139 | tokens.add(new Paren.L());
140 | i += 1;
141 | } else if (firstChar == ')') {
142 | tokens.add(new Paren.R());
143 | i += 1;
144 | } else if (firstChar == '!') {
145 | tokens.add(new Op.Mon.Not());
146 | i += 1;
147 | } else if (firstChar == '&') {
148 | tokens.add(new Op.Bin.And());
149 | i += 1;
150 | } else if (firstChar == '|') {
151 | tokens.add(new Op.Bin.Or());
152 | i += 1;
153 | } else {
154 | // parse out the token
155 | String token = this.readToken(substring);
156 |
157 | tokens.add(factory(token));
158 | i += token.length();
159 | }
160 | }
161 |
162 | return tokens;
163 | }
164 | }
165 |
--------------------------------------------------------------------------------
/src/main/java/edu/washington/cs/knowitall/logic/LogicExpressionParsers.java:
--------------------------------------------------------------------------------
1 | package edu.washington.cs.knowitall.logic;
2 |
3 | /**
4 | * Static factories for logic expressions over basic objects.
5 | *
6 | * @author Michael Schmitz
7 | */
8 | class LogicExpressionParsers {
9 | /**
10 | * Logic expressions where "true" evaluates to true and "false" evaluates to
11 | * false. For example:
12 | *
13 | * (true | false) & true
14 | *
15 | * This logic expression is trivial because it's value is independent of the
16 | * object it is applied to.
17 | */
18 | public final static LogicExpressionParser trivial =
19 | new LogicExpressionParser() {
20 | @Override
21 | public Expression.Arg factory(final String string) {
22 | return new Expression.Arg.Pred(string) {
23 | @Override
24 | public boolean apply(String entity) {
25 | return "true".equals(string);
26 | }
27 | };
28 | }
29 | };
30 |
31 | /**
32 | * Logic expressions where tokens are strings. A token is true if it
33 | * matches the input string.
34 | */
35 | public final static LogicExpressionParser stringMatch =
36 | new LogicExpressionParser() {
37 | @Override
38 | public Expression.Arg factory(final String token) {
39 | return new Expression.Arg.Pred(token) {
40 | final String string = token.substring(1, token.length() - 1);
41 |
42 | @Override
43 | public boolean apply(String entity) {
44 | return entity.equals(string);
45 | }
46 | };
47 | }
48 | };
49 | }
50 |
--------------------------------------------------------------------------------
/src/main/java/edu/washington/cs/knowitall/regex/Expression.java:
--------------------------------------------------------------------------------
1 | package edu.washington.cs.knowitall.regex;
2 |
3 | import java.util.ArrayList;
4 | import java.util.Iterator;
5 | import java.util.List;
6 |
7 | import com.google.common.base.Joiner;
8 | import com.google.common.base.Predicate;
9 |
10 | import edu.washington.cs.knowitall.regex.FiniteAutomaton.Automaton;
11 | import edu.washington.cs.knowitall.regex.FiniteAutomaton.State;
12 |
13 | /**
14 | * Interface for a component of a regular expression.
15 | *
16 | * @author Michael Schmitz
17 | */
18 | public interface Expression extends Predicate {
19 |
20 | public Automaton build();
21 |
22 | public int minMatchingLength();
23 |
24 | /**
25 | * Represents a matching group that is referred to by order number.
26 | * {@code ( +)}
27 | * @author Michael Schmitz
28 | *
29 | * @param
30 | */
31 | public class MatchingGroup implements Expression {
32 | public final List> expressions;
33 |
34 | public MatchingGroup(List> expressions) {
35 | this.expressions = expressions;
36 | }
37 |
38 | @Override
39 | public boolean apply(E entity) {
40 | throw new UnsupportedOperationException();
41 | }
42 |
43 | public String subexpString() {
44 | List subs = new ArrayList(this.expressions.size());
45 | for (Expression expr : this.expressions) {
46 | subs.add(expr.toString());
47 | }
48 |
49 | return Joiner.on(" ").join(subs);
50 | }
51 |
52 | @Override
53 | public String toString() {
54 | return "(" + subexpString() + ")";
55 | }
56 |
57 | /**
58 | * Convert the expression into a NFA.
59 | */
60 | @Override
61 | public Automaton build() {
62 | Automaton auto = new Automaton(this);
63 |
64 | Iterator> exprIterator = this.expressions.iterator();
65 | Automaton sub;
66 |
67 | // connect the start to the first subexpression
68 | State prev = auto.start;
69 | if (exprIterator.hasNext()) {
70 | sub = exprIterator.next().build();
71 | auto.start.connect(sub.start);
72 | prev = sub.end;
73 | }
74 | while (exprIterator.hasNext()) {
75 | Expression expr = exprIterator.next();
76 | sub = expr.build();
77 |
78 | State connector = new State();
79 |
80 | prev.connect(connector);
81 | connector.connect(sub.start);
82 | prev = sub.end;
83 | }
84 |
85 | prev.connect(auto.end);
86 |
87 | return auto;
88 | }
89 |
90 | @Override
91 | public int minMatchingLength() {
92 | int len = 0;
93 | for (Expression expr : this.expressions) {
94 | len += expr.minMatchingLength();
95 | }
96 | return len;
97 | }
98 | }
99 |
100 | /**
101 | * Represents a matching group that is referred to by name.
102 | * {@code (: +)}
103 | * @author Michael Schmitz
104 | *
105 | * @param
106 | */
107 | public class NamedGroup extends MatchingGroup {
108 | public final String name;
109 |
110 | public NamedGroup(String name, List> expressions) {
111 | super(expressions);
112 | this.name = name;
113 | }
114 |
115 | @Override
116 | public String toString() {
117 | return "(<"+this.name+">:" + super.subexpString() + ")";
118 | }
119 | }
120 |
121 | /**
122 | * Represents a non-matching group.
123 | * {@code (?: +)}
124 | * @author Michael Schmitz
125 | *
126 | * @param
127 | */
128 | public class NonMatchingGroup extends MatchingGroup {
129 | public NonMatchingGroup(List> expressions) {
130 | super(expressions);
131 | }
132 |
133 | @Override
134 | public String toString() {
135 | return "(?:" + super.subexpString() + ")";
136 | }
137 | }
138 |
139 | /**
140 | * Disjunction of two experssions.
141 | * {@code |}
142 | * @author Michael Schmitz
143 | *
144 | * @param
145 | */
146 | public static class Or implements Expression {
147 | public final Expression expr1;
148 | public final Expression expr2;
149 |
150 | public Or(Expression expr1, Expression expr2) {
151 | this.expr1 = expr1;
152 | this.expr2 = expr2;
153 | }
154 |
155 | @Override
156 | public boolean apply(E entity) {
157 | return true;
158 | }
159 |
160 | @Override
161 | public String toString() {
162 | return this.expr1.toString() + " | " + this.expr2.toString();
163 | }
164 |
165 | /**
166 | * Convert the expression into a NFA.
167 | */
168 | @Override
169 | public Automaton build() {
170 | Automaton auto = new Automaton(this);
171 |
172 | Automaton sub1 = this.expr1.build();
173 | Automaton sub2 = this.expr2.build();
174 |
175 | // attach the sub automata
176 | auto.start.connect(sub1.start);
177 | auto.start.connect(sub2.start);
178 | sub1.end.connect(auto.end);
179 | sub2.end.connect(auto.end);
180 |
181 | return auto;
182 | }
183 |
184 | @Override
185 | public int minMatchingLength() {
186 | int left = this.expr1.minMatchingLength();
187 | int right = this.expr2.minMatchingLength();
188 | if (left < right)
189 | return left;
190 | else
191 | return right;
192 | }
193 | }
194 |
195 | /**
196 | * Kleene-star: zero or more of the enclosed expression.
197 | * {@code *}
198 | * @author Michael Schmitz
199 | *
200 | * @param
201 | */
202 | public static class Star implements Expression {
203 | public final Expression expr;
204 |
205 | public Star(Expression expr) {
206 | this.expr = expr;
207 | }
208 |
209 | @Override
210 | public boolean apply(E entity) {
211 | return this.expr.apply(entity);
212 | }
213 |
214 | @Override
215 | public String toString() {
216 | return this.expr.toString() + "*";
217 | }
218 |
219 | /**
220 | * Convert the expression into a NFA.
221 | */
222 | @Override
223 | public Automaton build() {
224 | Automaton auto = new Automaton(this);
225 |
226 | Automaton sub = this.expr.build();
227 |
228 | // run it again
229 | sub.end.connect(sub.start);
230 |
231 | // attach the sub automaton
232 | auto.start.connect(sub.start);
233 | sub.end.connect(auto.end);
234 |
235 | // skip it completely
236 | auto.start.connect(auto.end);
237 |
238 | return auto;
239 | }
240 |
241 | @Override
242 | public int minMatchingLength() {
243 | return 0;
244 | }
245 | }
246 |
247 | /**
248 | * One or more of the enclosed expression. Plus(expr) is equivalent to
249 | * expr followed by Star(expr).
250 | * {@code +} is the same as {@code *}
251 | * @author Michael Schmitz
252 | *
253 | * @param
254 | */
255 | public static class Plus implements Expression {
256 | public final Expression expr;
257 |
258 | public Plus(Expression expr) {
259 | this.expr = expr;
260 | }
261 |
262 | @Override
263 | public boolean apply(E entity) {
264 | return this.expr.apply(entity);
265 | }
266 |
267 | @Override
268 | public String toString() {
269 | return this.expr.toString() + "+";
270 | }
271 |
272 | /**
273 | * Convert the expression into a NFA.
274 | */
275 | @Override
276 | public Automaton build() {
277 | Automaton auto = new Automaton(this);
278 |
279 | Automaton sub = this.expr.build();
280 |
281 | // run it again
282 | sub.end.connect(sub.start);
283 |
284 | // attach the sub automaton
285 | auto.start.connect(sub.start);
286 | sub.end.connect(auto.end);
287 |
288 | return auto;
289 | }
290 |
291 | @Override
292 | public int minMatchingLength() {
293 | return 1;
294 | }
295 | }
296 |
297 | /**
298 | * Zero or one of the enclosed expression.
299 | * {@code ?}
300 | * @author Michael Schmitz
301 | *
302 | * @param
303 | */
304 | public static class Option implements Expression {
305 | Expression expr;
306 |
307 | public Option(Expression expr) {
308 | this.expr = expr;
309 | }
310 |
311 | @Override
312 | public boolean apply(E entity) {
313 | return this.expr.apply(entity);
314 | }
315 |
316 | @Override
317 | public String toString() {
318 | return this.expr.toString() + "?";
319 | }
320 |
321 | /**
322 | * Convert the expression into a NFA.
323 | */
324 | @Override
325 | public Automaton build() {
326 | Automaton auto = new Automaton(this);
327 |
328 | Automaton sub = this.expr.build();
329 |
330 | // attach the sub automaton
331 | auto.start.connect(sub.start);
332 | sub.end.connect(auto.end);
333 |
334 | // skip it completely
335 | auto.start.connect(auto.end);
336 |
337 | return auto;
338 | }
339 |
340 | @Override
341 | public int minMatchingLength() {
342 | return 0;
343 | }
344 | }
345 |
346 | /**
347 | * A minimum to maximum number of occurrences of the enclosed expression.
348 | * {@code {1,3}}
349 | * @author Daniel Naber
350 | *
351 | * @param
352 | */
353 | public static class MinMax implements Expression {
354 | Expression expr;
355 | final int minOccurrences;
356 | final int maxOccurrences;
357 |
358 | /**
359 | * @param minOccurrences minimum occurrences, must be >= 0
360 | * @param maxOccurrences maximum occurrences, must be >= 1 - you should prefer small values,
361 | * as the use of large values will create a large automaton that takes a lot of memory
362 | */
363 | public MinMax(Expression expr, int minOccurrences, int maxOccurrences) {
364 | this.expr = expr;
365 | if (minOccurrences < 0 || maxOccurrences < 1) {
366 | throw new IllegalArgumentException("minOccurrences must be >= 0 and maxOccurrences must be >= 1: "
367 | + minOccurrences + ", " + maxOccurrences);
368 | }
369 | if (minOccurrences > maxOccurrences) {
370 | throw new IllegalArgumentException("minOccurrences must be <= maxOccurrences: "
371 | + minOccurrences + " > " + maxOccurrences);
372 | }
373 | this.minOccurrences = minOccurrences;
374 | this.maxOccurrences = maxOccurrences;
375 | }
376 |
377 | @Override
378 | public boolean apply(E entity) {
379 | return this.expr.apply(entity);
380 | }
381 |
382 | @Override
383 | public String toString() {
384 | return this.expr.toString() + "{" + minOccurrences + "," + maxOccurrences + "}";
385 | }
386 |
387 | /**
388 | * Convert the expression into a NFA.
389 | */
390 | @Override
391 | public Automaton build() {
392 | Automaton auto = new Automaton(this);
393 |
394 | List> subAutos = new ArrayList>();
395 | int numberOfNodes = maxOccurrences;
396 | for (int i = 0; i < numberOfNodes; i++) {
397 | Automaton sub = this.expr.build();
398 | subAutos.add(sub);
399 | }
400 |
401 | // attach the first sub automaton
402 | auto.start.connect(subAutos.get(0).start);
403 |
404 | // attach the sub automatons among themselves and with the end
405 | for (int i = 0; i < subAutos.size(); i++) {
406 | Automaton sub = subAutos.get(i);
407 | if (i >= minOccurrences - 1) {
408 | sub.end.connect(auto.end);
409 | }
410 | if (i < subAutos.size() - 1) {
411 | Automaton nextSub = subAutos.get(i + 1);
412 | sub.end.connect(nextSub.start);
413 | }
414 | }
415 |
416 | if (minOccurrences == 0) {
417 | // skip it completely
418 | auto.start.connect(auto.end);
419 | }
420 |
421 | return auto;
422 | }
423 |
424 | @Override
425 | public int minMatchingLength() {
426 | return this.minOccurrences;
427 | }
428 | }
429 |
430 | /**
431 | * An expression with no subexpression that is evaluated against a token
432 | * using the supplied delegate.
433 | * @author Michael Schmitz
434 | *
435 | * @param