├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── bin ├── os-arch.sh └── os-name.sh ├── pom.xml ├── re2-20140304.tgz └── src ├── main └── java │ └── com │ └── logentries │ └── re2 │ ├── EmbeddedLibraryTools.java │ ├── Encoding.java │ ├── LibraryLoader.java │ ├── Options.h │ ├── Options.java │ ├── RE2.cpp │ ├── RE2.h │ ├── RE2.java │ ├── RE2Matcher.java │ ├── RE2MatcherUnicodeWord.java │ ├── RE2String.java │ ├── RegExprException.java │ ├── UTF8CharOffset.java │ ├── entity │ ├── CaptureGroup.java │ └── NamedGroup.java │ └── op.h └── test └── java └── com └── logentries └── re2 ├── GenRegExpr.java ├── GenString.java ├── Main.java ├── TestExceptions.java ├── TestMatcherFind.java ├── TestRE2MatcherUnicodeWord.java ├── TestRandomExpr.java ├── TestThreads.java ├── TestUnicode.java └── TestUtf8CharOffset.java /.gitignore: -------------------------------------------------------------------------------- 1 | *.re2* 2 | *.iml 3 | .idea 4 | obj/ 5 | /re2/** 6 | target/ 7 | src/main/resources 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This library can be distributed and used under the terms of The BSD 3-Clause License. Text of the license can be found here: http://opensource.org/licenses/BSD-3-Clause . 2 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # 2 | # Inspired by https://github.com/xerial/snappy-java/blob/develop/Makefile . 
3 | # 4 | 5 | OBJ=obj 6 | MVN=mvn 7 | NATIVES-TARGET=src/main/resources/NATIVE/$(shell bin/os-arch.sh)/$(shell bin/os-name.sh) 8 | 9 | all: build 10 | build: $(OBJ)/libre2-java.so class 11 | 12 | .re2.extract.stamp: 13 | # hg clone https://re2.googlecode.com/hg re2 14 | # wget http://re2.googlecode.com/files/re2-20140304.tgz -O re2.tgz 15 | tar xvf re2-20140304.tgz 16 | touch .re2.extract.stamp 17 | 18 | .re2.compile.stamp: .re2.extract.stamp 19 | cd re2 && make 20 | touch .re2.compile.stamp 21 | 22 | $(OBJ)/RE2.o: .re2.extract.stamp $(addprefix src/main/java/com/logentries/re2/, RE2.cpp RE2.h) 23 | mkdir -p $(OBJ) 24 | $(CXX) -O3 -g -fPIC -I$(JAVA_HOME)/include -I$(JAVA_HOME)/include/linux -Ire2 -c src/main/java/com/logentries/re2/RE2.cpp -o $(OBJ)/RE2.o 25 | 26 | $(OBJ)/libre2-java.so: $(OBJ)/RE2.o .re2.compile.stamp 27 | $(CXX) -shared -Wl,-soname,libre2-java.so -o $(OBJ)/libre2-java.so $(OBJ)/RE2.o -Lre2/obj/so -lre2 -lpthread 28 | 29 | class: build-class 30 | 31 | build-class: target/libre2-java-1.0-SNAPSHOT.jar 32 | 33 | target/libre2-java-1.0-SNAPSHOT.jar: add-so 34 | $(MVN) package -Dmaven.test.skip=true 35 | 36 | add-so: .re2.compile.stamp $(OBJ)/libre2-java.so 37 | mkdir -p $(NATIVES-TARGET) 38 | cp $(OBJ)/libre2-java.so re2/obj/so/libre2.so $(NATIVES-TARGET) 39 | 40 | lib: add-so 41 | 42 | clean: 43 | rm -fr re2 44 | rm -fr obj 45 | rm -fr target 46 | rm -fr src/main/resources/NATIVE 47 | rm -f .*.stamp 48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | re2-java 2 | ======== 3 | 4 | re2 for Java 5 | 6 | Warning: Only 64bit Linux is supported for now. It should be easy to add support for other platforms. 7 | 8 | ## Licence ## 9 | 10 | Like [RE2 library](http://code.google.com/p/re2/) itself, this library can be distributed and used under the terms of [The BSD 3-Clause License](http://opensource.org/licenses/BSD-3-Clause).
11 | 12 | 13 | ## Installation ## 14 | 15 | ### Requirements ### 16 | * Java 7 (JDK 1.7, never tested on Java 8). Set environment variable `JAVA_HOME` to point to the root directory of JDK. 17 | * Maven 3.x , http://maven.apache.org/ . 18 | - Check that mvn command can be run from your command line. 19 | * gcc 4.5.x or higher. 20 | * Boost C++ Library (http://www.boost.org/), version newer than Stonehenge should be enough. 21 | * wget 22 | 23 | ### Compilation ### 24 | 25 | Simply type: 26 | 27 | $ make 28 | 29 | It unpacks the bundled re2 release (`re2-20140304.tgz`), builds the re2 library in a separate directory and builds another library with the JNI bindings as well. 30 | Finally, a jar file that includes the native `.so` library files is produced in the `target` folder. 31 | 32 | You can type: 33 | 34 | $ make clean 35 | 36 | to clean all files that come into existence during a normal run of make. 37 | 38 | After successful compilation you can run: 39 | 40 | $ mvn test 41 | 42 | But the tests are very time and memory consuming and, at present, they print a lot of debug messages. Sorry if it is annoying, 43 | this binding is actually under development. 44 | 45 | ### Generating header files - example ### 46 | 47 | From root folder containing pom.xml, src/ etc. 48 | 49 | javah -jni -classpath "/home//repos/re2-java/src/main/java" -o src/main/java/com/logentries/re2/RE2.h com.logentries.re2.RE2 50 | 51 | ### Installation ### 52 | 53 | After running `make`, the `target` directory contains a jar file with the library. You can add it to your `classpath`. 54 | Native library files (libre2.so and libre2-java.so) are part of the jar file as well. They are extracted after JVM 55 | startup, saved into temporary files and dynamically loaded into the address space of the JVM. 56 | 57 | ### Changelog ### 58 | 59 | 60 | #### v1.3 61 | - added option `UNICODE_WORD` to transparently handle unicode words and word boundaries. Take a look at the section below.
62 | 63 | #### v1.2 64 | 65 | - added `RE2.compile` static method, similar to `Pattern.compile`. The main difference from the `RE2` constructor 66 | is that the `compile` method doesn't declare a checked exception, so you can avoid a `try/catch` block. 67 | 68 | - support for `RE2String` that can be reused with multiple patterns, in order to avoid multiple copies of the same string. 69 | 70 | - generalization of `RE2.matcher` that now accepts `CharSequence` rather than `String` 71 | 72 | #### v1.1 73 | 74 | - support for `RE2Matcher` 75 | 76 | 77 | ## Usage ## 78 | 79 | For usage of the library, please import `com.logentries.re2.RE2` and `com.logentries.re2.Options` . 80 | 81 | Basic usage of java-re2 is quite similar to the C++ library. 82 | 83 | Static functions `RE2.fullMatch(.)` and `RE2.partialMatch(.)` can be used. 84 | 85 | You can create precompiled RE in this way: 86 | 87 | RE2 re = new RE2("\\d+"); 88 | 89 | As the object allocates some memory that is not under the control of the JVM, it should be freed explicitly. 90 | You can either use member function `dispoze()`, or member function `close()` . 91 | Class RE2 contains an overridden method `finalize()` that is automatically called before the object is destroyed by the Garbage Collector. 92 | This method ensures that the additional memory is freed, releasing it itself if necessary. 93 | But it is usually a bad idea to rely on the Java GC. :-) 94 | 95 | Any attempt to use the object after calling `dispoze()` or `close()` will cause an `IllegalStateException` to be thrown. 96 | 97 | Precompiled RE supports member functions `partialMatch(.)` or `fullMatch(.)`. 98 | 99 | re.fullMatch("2569"); 100 | re.partialMatch("xxx=2569"); 101 | 102 | The `RE2` constructor declares a checked exception that is thrown if the regex is malformed. This is quite annoying if 103 | the regex is a static variable instantiated at startup.
You can instead use the static method `RE2.compile`, which wraps the checked exception 104 | in an unchecked `IllegalArgumentException`. 105 | 106 | public class MyClass { 107 | private static RE2 regex = RE2.compile("..."); 108 | } 109 | 110 | ### Matcher ### 111 | 112 | The `RE2` object also supports a more Java-like interface, similar to `java.util.regex.Pattern` and `java.util.regex.Matcher`. 113 | 114 | RE2 re = new RE2("..(..)"); 115 | RE2Matcher matcher = re.matcher("my input string"); 116 | if (matcher.find()) { 117 | // get matching string(s), 118 | // see java.util.regex.Matcher javadoc or 119 | // com.logentries.re2.RE2Matcher code for additional details 120 | // eg. matcher.group() or matcher.start() and matcher.end() 121 | ... 122 | } 123 | 124 | You can also iterate over the input string, searching for repeated occurrences of the pattern: 125 | 126 | RE2 re = new RE2("bla?"); 127 | RE2Matcher matcher = re.matcher("my bla input string bl bla"); 128 | while (matcher.findNext()) { 129 | // 3 iterations, get positions using matcher.start() and matcher.end() 130 | } 131 | 132 | `RE2Matcher` also implements `java.lang.Iterable`.
133 | It can be used this way: 134 | 135 | int c = 0; 136 | for (MatchResult mr : new RE2("t").matcher("input text")) { 137 | c++; // play with matches using mr.start, mr.end, mr.group 138 | } 139 | assertEquals(3, c); 140 | 141 | This can be very useful when playing with this library in Scala: 142 | 143 | import scala.collection.JavaConversions._ 144 | import com.logentries.re2._ 145 | 146 | new RE2("abc?") matcher "abc and abc ab ab" map( _.group ) foreach println 147 | 148 | If you are not interested in fetching group offsets, you can disable this feature by using: 149 | 150 | RE2Matcher m = new RE2("ab(c?)").matcher("abc and abc ab ab", false); 151 | assertEquals(1, m.GroupCount()); 152 | // now m contains information only for group 0 153 | // so m.start(), m.end() and m.group() 154 | // trying m.{start|end|group}(n : n > 0) always fails 155 | 156 | If your regex is very complex (most likely programmatically composed by concatenating different patterns) and the 157 | number of groups is huge, this can improve performance significantly (data structures to contain all possible matches 158 | are not allocated). 159 | 160 | **NOTE 1**: A `RE2Matcher` object maintains a pointer to a char buffer that is used on the C++ side to manage the current string, in order to avoid a copy for each iteration. 161 | For this reason, `RE2Matcher` implements the `AutoCloseable` interface, so it can be used in a `try-with-resources` statement. 162 | The close method is also called in `finalize()`, so the garbage collector will (sooner or later) free the memory. This is the same pattern that has been used for 163 | the `RE2` object, but usually `RE2` regexes are compiled once and then used multiple times, while `RE2Matcher` objects 164 | are used locally and most likely you will want to release them as soon as they have been used. 165 | In this case, you can use the `try-with-resources` block to make sure you don't miss anything: 166 | 167 | try (RE2Matcher matcher = re.matcher("my bla input string bl bla")) { 168 | matcher.
.... 169 | } 170 | 171 | **NOTE 2**: `RE2Matcher` is not thread-safe, just like `java.util.regex.Matcher` 172 | 173 | ### Re-using strings ### 174 | 175 | Whenever a `RE2Matcher` is created, the content of the string is copied to make it accessible from C++ stack. If you have to 176 | check and search for several patterns on the same string, this could affect performances, because you are copying 177 | the same string multiple times. 178 | 179 | For this reason, from version v1.2, we have implemented a new object, `RE2String` that is a wrapper for a `CharSequence`. 180 | You can create an instance of this object in advance, and then create a `RE2Matcher` using your `RE2String`. This new object 181 | can be re-used multiple times to create matchers for different patterns. 182 | When `RE2Matcher` is created using a `RE2String`, it doesn't copy the string and when you close it (see above about the `AutoCloseable` interface) 183 | simply does nothing. Similarly, `RE2String` implements `AutoCloseable` interface and `finalize` method has been overridden to let the GC 184 | clean resources for you. 185 | 186 | 187 | RE2 regex1 = RE2.compile("\\b[\\d]{5}\\b"); 188 | RE2 regex2 = RE2.compile("\\b[a-zA-Z]{5}\\b"); 189 | 190 | String input = .... 191 | RE2String rstring = new RE2String(input); 192 | 193 | RE2Matcher m1 = regex1.matcher(rstring); 194 | RE2Matcher m2 = regex2.matcher(rstring); 195 | while(m1.find()) { 196 | int endFirst = m1.end(); 197 | if (m2.find(endFirst, endFirst + 10)) { 198 | ... 199 | } 200 | } 201 | 202 | // here m1.close() and m2.close() do nothing 203 | 204 | 205 | ### Submatch extraction ### 206 | 207 | Both static and member match functions support additional parameters in which submatches will be stored. 
208 | Java does not support passing arguments by reference, so we use arrays to store submatches: 209 | 210 | int[] x = new int[1]; 211 | long[] y = new long[1]; 212 | RE2.fullMatch("121:2569856321142", "(\\d+):(\\d+)", x, y); 213 | // x[0] == 121, y[0] == 2569856321142 214 | 215 | Arrays longer than 1 can also be used. Such an array stores as many consecutive submatches as its length: 216 | 217 | int[] x = new int[2]; 218 | String[] s = new String[1]; 219 | long[] y = new long[3]; 220 | RE2 re = new RE2("(\\d+):(\\d+)-([a-zA-Z]+)-(\\d+):(\\d+):(\\d+)"); 221 | re.fullMatch("225:3-xxx-2:2555422298777:7", x, s, y); 222 | // x[0] == 225, x[1] == 3, s[0] == xxx, y[0] == 2, y[1] == 2555422298777, y[2] == 7 223 | 224 | So far, only int[], long[], float[], double[] and String[] are supported. Adding other types should be quite easy. 225 | 226 | ### Little comment about the interface and passing by reference ### 227 | 228 | I know that a lot of Java programmers may complain that an interface based on passing parameters by reference through the trick with arrays 229 | is quite bad practice, a dirty trick, and that it introduces something that is in fact not present in Java. 230 | 231 | But after I tried it in real code I decided that it is the best way to pass the values of submatches. 232 | ~~If you have any idea how to implement it in a different way, please let me know.~~ *See Matcher interface above* 233 | 234 | ### Named capture group extraction ### 235 | 236 | Capture group entities have a sub-string and a reference to the beginning and end index that this string corresponds to 237 | in a matched event. Named capture group entities wrap this and include a name. 238 | 239 | getCaptureGroups() and getCaptureGroupNames() are two methods that are called by getNamedCaptureGroups() to create a list 240 | of NamedGroup entities.
The lists returned by these methods are in order, allowing getNamedCaptureGroups to associate them, 241 | if the length of the returned lists differ we can assume that we cannot maintain association and return an empty list. 242 | 243 | getCaptureGroupNames uses the native RE2 method, getCaptureGroups uses the contributor code to get RE2Matcher objects. 244 | 245 | ### Options ### 246 | 247 | Object `com.logentries.re2.Options` encapsulates possible configuration that is used during creation of the RE2 object. It is more or less equivalent to RE2::Options 248 | from C++ interface. It can be passed as a second argument to RE2 constructor. 249 | 250 | It uses several setter methods to set the configuration values: 251 | 252 | Options opt = new Options(); 253 | opt.setNeverNl(true); 254 | opt.setWordBoundary(false); 255 | 256 | or equivalently: 257 | 258 | Options opt = new Options().setNeverNl(true).setWordBoundary(false); 259 | 260 | `RE2` constructor is now overloaded to support for explicit flag list, to mimic C++ style: 261 | 262 | RE2 regex = new RE2("TGIF?", 263 | Options.CASE_INSENSITIVE, 264 | Options.ENCODING(Encoding.UTF8), 265 | Options.PERL_CLASSES(false) 266 | ); 267 | 268 | see `Options` static fields for further details. 269 | 270 | 271 | ### Unicode words and word boundaries ### 272 | 273 | `RE2` natively supports unicode words (with special character classes like `\pN` and `\pL`) but it does not support unicode word boundaries (`\b`), because it is implemented with a single byte lookahead. Even using multiple bytes as lookahead the resulting automata would be huge and more difficult to handle. 274 | 275 | What we did from version 1.3 is to transparently handle some of these cases, that we think are the most meaningful in a real world scenario. This behavior is enabled through the option `UNICODE_WORD` and works by replacing words and boundary classes with named groups that are then removed in the output. 
While there is no problem replacing `\w` and `\W`, there are some problems with `\b`. The replacement makes sense in cases where the `\b` is used "properly", i.e. without repetitions or non-word characters close to it. In these "extreme" cases what we do is actually changing the semantics of the regular expression, therefore we may produce false matches or missing some of them (with respect to the original expression). 276 | 277 | Anyway this patch works OK in the majority of "normal" cases of usage of `\b`, that is near word characters. For example, without using the option, the regex `\bball\b` would match "ball" in the text "Fußball" (because "ß" is a non-word in ascii). With the option `UNICODE_WORD` we can transparently avoid this match. 278 | -------------------------------------------------------------------------------- /bin/os-arch.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | UNAME=$(uname -i) 4 | 5 | # RHEL seems to use -m flag 6 | if [ "$UNAME" == 'unknown' ]; then 7 | UNAME=$(uname -m) 8 | fi 9 | 10 | if [ "$UNAME" == 'x86_64' ]; then 11 | echo 'amd64' 12 | elif [ "$UNAME" == 'amd64' ]; then 13 | echo 'amd64' 14 | else 15 | echo 'unknown' 16 | fi 17 | -------------------------------------------------------------------------------- /bin/os-name.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | UNAME=$(uname -o) 4 | 5 | if [[ "$UNAME" == *Linux* ]]; then 6 | echo 'Linux' 7 | else 8 | echo 'unknown' 9 | fi 10 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | com.logentries.re2 5 | libre2-java 6 | jar 7 | 1.3.1 8 | libre2-java 9 | http://maven.apache.org 10 | 11 | 12 | 13 | junit 14 | junit 15 | 4.11 16 | test 17 | 18 | 19 | 20 | 21 | 22 | 23 | org.apache.maven.plugins 24 | 
maven-compiler-plugin 25 | 3.2 26 | 27 | 1.7 28 | 1.7 29 | 30 | 31 | 32 | org.apache.maven.plugins 33 | maven-source-plugin 34 | 2.2.1 35 | 36 | 37 | attach-sources 38 | 39 | jar 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | org.apache.maven.plugins 48 | maven-surefire-plugin 49 | 2.17 50 | 51 | true 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /re2-20140304.tgz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SpazioDati/re2-java/4191430bb761a18a6f69f495e52704b17e4f92fa/re2-20140304.tgz -------------------------------------------------------------------------------- /src/main/java/com/logentries/re2/EmbeddedLibraryTools.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Java Bindings for the RE2 Library 3 | * 4 | * (c) 2012 Daniel Fiala 5 | * 6 | */ 7 | 8 | package com.logentries.re2; 9 | 10 | /* 11 | * Inspired by https://github.com/zeromq/jzmq/tree/master/src/org/zeromq . 
12 | */ 13 | 14 | import java.io.BufferedOutputStream; 15 | import java.io.File; 16 | import java.io.FileOutputStream; 17 | import java.io.IOException; 18 | import java.io.InputStream; 19 | import java.io.OutputStream; 20 | import java.net.URL; 21 | import java.util.ArrayList; 22 | import java.util.Collection; 23 | import java.util.Enumeration; 24 | import java.util.List; 25 | import java.util.jar.JarEntry; 26 | import java.util.jar.JarFile; 27 | 28 | public class EmbeddedLibraryTools { 29 | public static final boolean VERBOSE = Boolean.parseBoolean(System.getProperty("verbose","false")); 30 | 31 | public static final boolean LOADED_RE2; 32 | public static final boolean LOADED_RE2_JAVA; 33 | 34 | static { 35 | LOADED_RE2 = loadEmbeddedLibrary("libre2"); 36 | LOADED_RE2_JAVA = LOADED_RE2 && loadEmbeddedLibrary("libre2-java"); 37 | } 38 | 39 | public static String getCurrentPlatformIdentifier() { 40 | String osName = System.getProperty("os.name"); 41 | if (osName.toLowerCase().indexOf("windows") > -1) { 42 | osName = "Windows"; 43 | } 44 | return System.getProperty("os.arch") + "/" + osName; 45 | } 46 | 47 | private static boolean loadEmbeddedLibrary(final String name) { 48 | // attempt to locate embedded native library within JAR at following location: 49 | // /NATIVE/${os.arch}/${os.name}/libre2{,-java}.[so|dylib|dll] 50 | String[] allowedExtensions = new String[]{"so", "dylib", "dll",}; 51 | StringBuilder url = new StringBuilder(); 52 | url.append("/NATIVE/"); 53 | url.append(getCurrentPlatformIdentifier()); 54 | url.append('/'); 55 | url.append(name); 56 | url.append('.'); 57 | URL nativeLibraryUrl = null; 58 | //System.out.println(url.toString()); 59 | // loop through extensions, stopping after finding first one 60 | for (String ext : allowedExtensions) { 61 | if (VERBOSE) { 62 | System.err.println("Looking for native library: " + url.toString() + ext); 63 | } 64 | nativeLibraryUrl = RE2.class.getResource(url.toString() + ext); 65 | if (nativeLibraryUrl != null) 
66 | break; 67 | } 68 | // 69 | if (nativeLibraryUrl != null) { 70 | // native library found within JAR, extract and load 71 | try { 72 | final File libfile = File.createTempFile(name, ".lib"); 73 | libfile.deleteOnExit(); // just in case 74 | // 75 | final InputStream in = nativeLibraryUrl.openStream(); 76 | final OutputStream out = new BufferedOutputStream(new FileOutputStream(libfile)); 77 | // 78 | int len = 0; 79 | byte[] buffer = new byte[8192]; 80 | while ((len = in.read(buffer)) > -1) 81 | out.write(buffer, 0, len); 82 | out.close(); 83 | in.close(); 84 | System.load(libfile.getAbsolutePath()); 85 | //do not delete the lib file now, in certain environments this 86 | //may lead library loading to fail 87 | if (VERBOSE) System.out.println("Loaded "+nativeLibraryUrl.toString()); 88 | 89 | //libfile.delete(); 90 | return true; 91 | } catch (IOException x) { 92 | if (VERBOSE) x.printStackTrace(); 93 | // mission failed, do nothing 94 | } 95 | } // nativeLibraryUrl exists 96 | return false; 97 | } 98 | 99 | private EmbeddedLibraryTools() { 100 | } 101 | 102 | ; 103 | } 104 | -------------------------------------------------------------------------------- /src/main/java/com/logentries/re2/Encoding.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Java Bindings for the RE2 Library 3 | * 4 | * (c) 2012 Daniel Fiala 5 | * 6 | */ 7 | 8 | package com.logentries.re2; 9 | 10 | public enum Encoding { 11 | UTF8, Latin1; 12 | } 13 | -------------------------------------------------------------------------------- /src/main/java/com/logentries/re2/LibraryLoader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Java Bindings for the RE2 Library 3 | * 4 | * (c) 2012 Daniel Fiala 5 | * 6 | */ 7 | 8 | package com.logentries.re2; 9 | 10 | public class LibraryLoader { 11 | static { 12 | if (!EmbeddedLibraryTools.LOADED_RE2) { 13 | System.loadLibrary("re2"); 14 | } 15 | if 
(!EmbeddedLibraryTools.LOADED_RE2_JAVA) { 16 | System.loadLibrary("re2-java"); 17 | } 18 | } 19 | 20 | protected LibraryLoader() { } 21 | } 22 | -------------------------------------------------------------------------------- /src/main/java/com/logentries/re2/Options.h: -------------------------------------------------------------------------------- 1 | /* DO NOT EDIT THIS FILE - it is machine generated */ 2 | #include 3 | /* Header for class com_logentries_re2_Options */ 4 | 5 | #ifndef _Included_com_logentries_re2_Options 6 | #define _Included_com_logentries_re2_Options 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | /* 11 | * Class: com_logentries_re2_Options 12 | * Method: setDefaults 13 | * Signature: ()V 14 | */ 15 | JNIEXPORT void JNICALL Java_com_logentries_re2_Options_setDefaults 16 | (JNIEnv *, jobject); 17 | 18 | #ifdef __cplusplus 19 | } 20 | #endif 21 | #endif 22 | -------------------------------------------------------------------------------- /src/main/java/com/logentries/re2/Options.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Java Bindings for the RE2 Library 3 | * 4 | * (c) 2012 Daniel Fiala 5 | * 6 | */ 7 | 8 | package com.logentries.re2; 9 | 10 | public final class Options extends LibraryLoader { 11 | private Encoding encoding; 12 | private boolean posixSyntax; 13 | private boolean longestMatch; 14 | private boolean logErrors; 15 | private long maxMem; 16 | private boolean literal; 17 | private boolean neverNl; 18 | private boolean neverCapture; 19 | private boolean caseSensitive; 20 | private boolean perlClasses; 21 | private boolean wordBoundary; 22 | private boolean oneLine; 23 | private boolean unicodeWord = false; 24 | 25 | private native void setDefaults(); 26 | 27 | public Options() { 28 | setDefaults(); 29 | } 30 | 31 | public Options setEncoding(final Encoding encoding) { 32 | this.encoding = encoding; 33 | return this; 34 | } 35 | public Options setPosixSyntax(final boolean 
posixSyntax) { 36 | this.posixSyntax = posixSyntax; 37 | return this; 38 | } 39 | public Options setLongestMatch(final boolean longestMatch) { 40 | this.longestMatch = longestMatch; 41 | return this; 42 | } 43 | public Options setLogErrors(final boolean logErrors) { 44 | this.logErrors = logErrors; 45 | return this; 46 | } 47 | public Options setMaxMem(final long maxMem) { 48 | this.maxMem = maxMem; 49 | return this; 50 | } 51 | public Options setLiteral(final boolean literal) { 52 | this.literal = literal; 53 | return this; 54 | } 55 | public Options setNeverNl(final boolean neverNl) { 56 | this.neverNl = neverNl; 57 | return this; 58 | } 59 | public Options setNeverCapture(final boolean neverCapture) { 60 | this.neverCapture = neverCapture; 61 | return this; 62 | } 63 | public Options setCaseSensitive(final boolean caseSensitive) { 64 | this.caseSensitive = caseSensitive; 65 | return this; 66 | } 67 | public Options setCaseInsensitive(final boolean caseInsensitive) { 68 | this.caseSensitive = !caseInsensitive; 69 | return this; 70 | } 71 | public Options setPerlClasses(final boolean perlClasses) { 72 | this.perlClasses = perlClasses; 73 | return this; 74 | } 75 | public Options setWordBoundary(final boolean wordBoundary) { 76 | this.wordBoundary = wordBoundary; 77 | return this; 78 | } 79 | public Options setOneLine(final boolean oneLine) { 80 | this.oneLine = oneLine; 81 | return this; 82 | } 83 | public Options setUnicodeWord(final boolean unicodeWord){ 84 | this.unicodeWord = unicodeWord; 85 | return this; 86 | } 87 | 88 | public boolean isUnicodeWord(){ 89 | return unicodeWord; 90 | } 91 | 92 | /// FLAGS 93 | public static interface Flag { 94 | public void apply(Options opt); 95 | } 96 | 97 | public static final Flag POSIX_SINTAX = POSIX_SINTAX(true); 98 | public static Flag POSIX_SINTAX(final boolean v) { 99 | return new Flag() { 100 | @Override 101 | public void apply(Options opt) { 102 | opt.setPosixSyntax(v); 103 | } 104 | }; 105 | } 106 | public static 
final Flag LONGEST_MATCH = LONGEST_MATCH (true); 107 | public static Flag LONGEST_MATCH(final boolean v) { 108 | return new Flag() { 109 | @Override 110 | public void apply(Options opt) { 111 | opt.setLongestMatch(v); 112 | } 113 | }; 114 | } 115 | public static final Flag LOG_ERRORS = LOG_ERRORS (true); 116 | public static Flag LOG_ERRORS(final boolean v) { 117 | return new Flag() { 118 | @Override 119 | public void apply(Options opt) { 120 | opt.setLogErrors(v); 121 | } 122 | }; 123 | } 124 | public static final Flag LITERAL = LITERAL (true); 125 | public static Flag LITERAL(final boolean v) { 126 | return new Flag() { 127 | @Override 128 | public void apply(Options opt) { 129 | opt.setLiteral(v); 130 | } 131 | }; 132 | } 133 | public static final Flag NEVER_NL = NEVER_NL (true); 134 | public static Flag NEVER_NL(final boolean v) { 135 | return new Flag() { 136 | @Override 137 | public void apply(Options opt) { 138 | opt.setNeverNl(v); 139 | } 140 | }; 141 | } 142 | public static final Flag NEVER_CAPTURE = NEVER_CAPTURE (true); 143 | public static Flag NEVER_CAPTURE(final boolean v) { 144 | return new Flag() { 145 | @Override 146 | public void apply(Options opt) { 147 | opt.setNeverCapture(v); 148 | } 149 | }; 150 | } 151 | public static final Flag CASE_SENSITIVE = CASE_SENSITIVE (true); 152 | public static Flag CASE_SENSITIVE(final boolean v) { 153 | return new Flag() { 154 | @Override 155 | public void apply(Options opt) { 156 | opt.setCaseSensitive(v); 157 | } 158 | }; 159 | } 160 | public static final Flag CASE_INSENSITIVE = CASE_INSENSITIVE (true); 161 | public static Flag CASE_INSENSITIVE(final boolean v) { 162 | return new Flag() { 163 | @Override 164 | public void apply(Options opt) { 165 | opt.setCaseInsensitive(v); 166 | } 167 | }; 168 | } 169 | public static final Flag PERL_CLASSES = PERL_CLASSES (true); 170 | public static Flag PERL_CLASSES(final boolean v) { 171 | return new Flag() { 172 | @Override 173 | public void apply(Options opt) { 174 | 
opt.setPerlClasses(v); 175 | } 176 | }; 177 | } 178 | public static final Flag WORD_BOUNDARY = WORD_BOUNDARY (true); 179 | public static Flag WORD_BOUNDARY(final boolean v) { 180 | return new Flag() { 181 | @Override 182 | public void apply(Options opt) { 183 | opt.setWordBoundary(v); 184 | } 185 | }; 186 | } 187 | public static final Flag UNICODE_WORD = UNICODE_WORD(true); 188 | public static Flag UNICODE_WORD(final boolean v) { 189 | return new Flag() { 190 | @Override 191 | public void apply(Options opt) { 192 | opt.setUnicodeWord(v); 193 | } 194 | }; 195 | } 196 | 197 | public static final Flag ONE_LINE = ONE_LINE (true); 198 | public static Flag ONE_LINE(final boolean v) { 199 | return new Flag() { 200 | @Override 201 | public void apply(Options opt) { 202 | opt.setOneLine(v); 203 | } 204 | }; 205 | } 206 | 207 | public static Flag MAX_MEMORY(final long m) { 208 | return new Flag() { 209 | @Override 210 | public void apply(Options opt) { 211 | opt.setMaxMem(m); 212 | } 213 | }; 214 | } 215 | 216 | public static final Flag UTF8_ENCODING = ENCODING(Encoding.UTF8); 217 | public static final Flag LATIN1_ENCODING = ENCODING(Encoding.Latin1); 218 | public static Flag ENCODING(final Encoding e) { 219 | return new Flag() { 220 | @Override 221 | public void apply(Options opt) { 222 | opt.setEncoding(e); 223 | } 224 | }; 225 | } 226 | 227 | 228 | } 229 | -------------------------------------------------------------------------------- /src/main/java/com/logentries/re2/RE2.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Java Bindings for the RE2 Library 3 | * 4 | * (c) 2012 Daniel Fiala 5 | * 6 | */ 7 | 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "RE2.h" 16 | #include "op.h" 17 | #include "Options.h" 18 | 19 | using re2::StringPiece; 20 | using namespace std; 21 | 22 | template 23 | static Dst safe_cast(Src src) { 24 | Dst dst = static_cast(src); 25 | 
BOOST_VERIFY(static_cast(dst) == src); 26 | BOOST_VERIFY(dst > 0 == src > 0); 27 | return dst; 28 | } 29 | 30 | RE2::Options::Encoding get_re2_encoding(JNIEnv *env, jobject j_encoding) { 31 | jclass j_cls = env->FindClass("com/logentries/re2/Encoding"); 32 | 33 | jmethodID equals_id = env->GetMethodID(j_cls, "equals", "(Ljava/lang/Object;)Z"); 34 | 35 | const char *fields[] = {"UTF8", "Latin1", }; 36 | const RE2::Options::Encoding enc_fields[] = {RE2::Options::EncodingUTF8, RE2::Options::EncodingLatin1, }; 37 | for (int i = 0; i < sizeof(fields)/sizeof(*fields); ++i) { 38 | jfieldID fid = env->GetStaticFieldID(j_cls, fields[i], "Lcom/logentries/re2/Encoding;"); 39 | jobject item = env->GetStaticObjectField(j_cls, fid); 40 | if (env->CallBooleanMethod(item, equals_id, j_encoding)) { 41 | return enc_fields[i]; 42 | } 43 | } 44 | BOOST_VERIFY(0); 45 | } 46 | 47 | jobject get_j_encoding(JNIEnv *env, RE2::Options::Encoding enc) { 48 | const char *fields[] = {"UTF8", "Latin1", }; 49 | const RE2::Options::Encoding enc_fields[] = {RE2::Options::EncodingUTF8, RE2::Options::EncodingLatin1, }; 50 | for (int i = 0; i < sizeof(enc_fields)/sizeof(*enc_fields); ++i) { 51 | RE2::Options::Encoding enc_item = enc_fields[i]; 52 | if (enc_item == enc) { 53 | jclass j_cls = env->FindClass("com/logentries/re2/Encoding"); 54 | jfieldID fid = env->GetStaticFieldID(j_cls, fields[i], "Lcom/logentries/re2/Encoding;"); 55 | jobject item = env->GetStaticObjectField(j_cls, fid); 56 | return item; 57 | } 58 | } 59 | BOOST_VERIFY(0); 60 | } 61 | 62 | static jfieldID get_field_id_safe(JNIEnv *env, jclass j_cls, const char *name, const char *sig) { 63 | jfieldID fid = env->GetFieldID(j_cls, name, sig); 64 | BOOST_VERIFY(fid != NULL); 65 | return fid; 66 | } 67 | 68 | JNIEXPORT void JNICALL Java_com_logentries_re2_Options_setDefaults 69 | (JNIEnv *env, jobject j_this) { 70 | RE2::Options options; 71 | jclass j_cls = env->GetObjectClass(j_this); 72 | env->SetObjectField(j_this, get_field_id_safe(env, 
j_cls, "encoding", "Lcom/logentries/re2/Encoding;"), get_j_encoding(env, options.encoding())); 73 | env->SetBooleanField(j_this, get_field_id_safe(env, j_cls, "posixSyntax", "Z"), options.posix_syntax()); 74 | env->SetBooleanField(j_this, get_field_id_safe(env, j_cls, "longestMatch", "Z"), options.longest_match()); 75 | env->SetBooleanField(j_this, get_field_id_safe(env, j_cls, "logErrors", "Z"), options.log_errors()); 76 | env->SetLongField(j_this, get_field_id_safe(env, j_cls, "maxMem", "J"), safe_cast(options.max_mem())); 77 | env->SetBooleanField(j_this, get_field_id_safe(env, j_cls, "literal", "Z"), options.literal()); 78 | env->SetBooleanField(j_this, get_field_id_safe(env, j_cls, "neverNl", "Z"), options.never_nl()); 79 | env->SetBooleanField(j_this, get_field_id_safe(env, j_cls, "neverCapture", "Z"), options.never_capture()); 80 | env->SetBooleanField(j_this, get_field_id_safe(env, j_cls, "caseSensitive", "Z"), options.case_sensitive()); 81 | env->SetBooleanField(j_this, get_field_id_safe(env, j_cls, "perlClasses", "Z"), options.perl_classes()); 82 | env->SetBooleanField(j_this, get_field_id_safe(env, j_cls, "wordBoundary", "Z"), options.word_boundary()); 83 | } 84 | 85 | static void cpy_options(RE2::Options &options, JNIEnv *env, jobject j_options) { 86 | BOOST_VERIFY(j_options != 0); 87 | jclass j_options_cls = env->GetObjectClass(j_options); 88 | options.set_encoding(get_re2_encoding(env, env->GetObjectField(j_options, get_field_id_safe(env, j_options_cls, "encoding", "Lcom/logentries/re2/Encoding;")))); 89 | options.set_posix_syntax(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "posixSyntax", "Z"))); 90 | options.set_longest_match(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "longestMatch", "Z"))); 91 | options.set_log_errors(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "logErrors", "Z"))); 92 | options.set_max_mem(safe_cast(env->GetLongField(j_options, get_field_id_safe(env, 
j_options_cls, "maxMem", "J")))); 93 | options.set_literal(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "literal", "Z"))); 94 | options.set_never_nl(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "neverNl", "Z"))); 95 | options.set_never_capture(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "neverCapture", "Z"))); 96 | options.set_case_sensitive(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "caseSensitive", "Z"))); 97 | options.set_perl_classes(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "perlClasses", "Z"))); 98 | options.set_word_boundary(env->GetBooleanField(j_options, get_field_id_safe(env, j_options_cls, "wordBoundary", "Z"))); 99 | } 100 | 101 | class Options : public RE2::Options { 102 | public: 103 | Options(JNIEnv *env, jobject j_options) { 104 | if (j_options != 0) { 105 | cpy_options(*this, env, j_options); 106 | } 107 | } 108 | }; 109 | 110 | static bool is_empty_arr(JNIEnv *env, jarray j_arr) { 111 | return j_arr == 0 || env->GetArrayLength(j_arr) == 0; 112 | 113 | } 114 | 115 | static bool throw_RegExprException(JNIEnv *env, const char *msg) { 116 | const char *class_name = "com/logentries/re2/RegExprException" ; 117 | 118 | jclass j_cls = env->FindClass(class_name); 119 | if (j_cls == NULL) { 120 | BOOST_VERIFY(!"Cannot find exception class :-("); 121 | } 122 | 123 | return env->ThrowNew(j_cls, msg) == 0; 124 | } 125 | 126 | JNIEXPORT jlong JNICALL Java_com_logentries_re2_RE2_compileImpl 127 | (JNIEnv *env, jclass cls, jstring j_str, jobject j_options) { 128 | Options options(env, j_options); 129 | const char *str = env->GetStringUTFChars(j_str, 0); 130 | RE2 *pointer = new RE2(str, options); 131 | if (pointer->ok()) { 132 | env->ReleaseStringUTFChars(j_str, str); 133 | jlong j_pointer = reinterpret_cast(pointer); 134 | BOOST_VERIFY(reinterpret_cast(j_pointer) == pointer); 135 | return j_pointer; 136 | } else { 137 | 
throw_RegExprException(env, pointer->error().c_str()); 138 | delete pointer; 139 | return 0; 140 | } 141 | } 142 | 143 | JNIEXPORT void JNICALL Java_com_logentries_re2_RE2_releaseImpl 144 | (JNIEnv *env, jclass cls, jlong j_pointer) { 145 | RE2 *pointer = reinterpret_cast(j_pointer); 146 | //pool.destroy(pointer); 147 | delete pointer; 148 | } 149 | 150 | struct FullMatchCOp { 151 | const char *str_; 152 | const RE2 *pattern_; 153 | 154 | FullMatchCOp(const char *str, const RE2 *pattern) 155 | : str_(str), 156 | pattern_(pattern) 157 | { } 158 | 159 | bool operator()(const RE2::Arg* const args[], const int n) const { 160 | return RE2::FullMatchN(str_, *pattern_, args, n); 161 | } 162 | }; 163 | 164 | JNIEXPORT jboolean JNICALL Java_com_logentries_re2_RE2_fullMatchImpl__Ljava_lang_String_2J_3Ljava_lang_Object_2 165 | (JNIEnv *env, jclass cls, jstring j_str, jlong j_pointer, jobjectArray j_args) { 166 | const char *str = env->GetStringUTFChars(j_str, 0); 167 | RE2 *pointer = reinterpret_cast(j_pointer); 168 | const bool res = is_empty_arr(env, j_args) ? RE2::FullMatch(str, *pointer) : do_op(env, FullMatchCOp(str, pointer), j_args); 169 | env->ReleaseStringUTFChars(j_str, str); 170 | return static_cast(res); 171 | } 172 | 173 | struct PartialMatchCOp { 174 | const char *str_; 175 | const RE2 *pattern_; 176 | 177 | PartialMatchCOp(const char *str, const RE2 *pattern) 178 | : str_(str), 179 | pattern_(pattern) 180 | { } 181 | 182 | bool operator()(const RE2::Arg* const args[], const int n) const { 183 | return RE2::PartialMatchN(str_, *pattern_, args, n); 184 | } 185 | }; 186 | 187 | JNIEXPORT jboolean JNICALL Java_com_logentries_re2_RE2_partialMatchImpl__Ljava_lang_String_2J_3Ljava_lang_Object_2 188 | (JNIEnv *env, jclass cls, jstring j_str, jlong j_pointer, jobjectArray j_args) { 189 | const char *str = env->GetStringUTFChars(j_str, 0); 190 | RE2 *pointer = reinterpret_cast(j_pointer); 191 | const bool res = is_empty_arr(env, j_args) ? 
RE2::PartialMatch(str, *pointer) : do_op(env, PartialMatchCOp(str, pointer), j_args); 192 | env->ReleaseStringUTFChars(j_str, str); 193 | return static_cast(res); 194 | } 195 | 196 | struct FullMatchOp { 197 | const char *str_; 198 | const char *pattern_; 199 | 200 | FullMatchOp(const char *str, const char *pattern) 201 | : str_(str), 202 | pattern_(pattern) 203 | { } 204 | 205 | bool operator()(const RE2::Arg* const args[], const int n) const { 206 | return RE2::FullMatchN(str_, pattern_, args, n); 207 | } 208 | }; 209 | 210 | JNIEXPORT jboolean JNICALL Java_com_logentries_re2_RE2_fullMatchImpl__Ljava_lang_String_2Ljava_lang_String_2_3Ljava_lang_Object_2 211 | (JNIEnv *env, jclass cls, jstring j_str, jstring j_pattern, jobjectArray j_args) { 212 | const char *str = env->GetStringUTFChars(j_str, 0); 213 | const char *pattern = env->GetStringUTFChars(j_pattern, 0); 214 | const bool res = is_empty_arr(env, j_args) ? RE2::FullMatch(str, pattern) : do_op(env, FullMatchOp(str, pattern), j_args); 215 | env->ReleaseStringUTFChars(j_str, str); 216 | env->ReleaseStringUTFChars(j_pattern, pattern); 217 | return static_cast(res); 218 | } 219 | 220 | struct PartialMatchOp { 221 | const char *str_; 222 | const char *pattern_; 223 | 224 | PartialMatchOp(const char *str, const char *pattern) 225 | : str_(str), 226 | pattern_(pattern) 227 | { } 228 | 229 | bool operator()(const RE2::Arg* const args[], const int n) const { 230 | return RE2::PartialMatchN(str_, pattern_, args, n); 231 | } 232 | }; 233 | 234 | JNIEXPORT jboolean JNICALL Java_com_logentries_re2_RE2_partialMatchImpl__Ljava_lang_String_2Ljava_lang_String_2_3Ljava_lang_Object_2 235 | (JNIEnv *env, jclass cls, jstring j_str, jstring j_pattern, jobjectArray j_args) { 236 | const char *str = env->GetStringUTFChars(j_str, 0); 237 | const char *pattern = env->GetStringUTFChars(j_pattern, 0); 238 | const bool res = is_empty_arr(env, j_args) ? 
RE2::PartialMatch(str, pattern) : do_op(env, PartialMatchOp(str, pattern), j_args); 239 | env->ReleaseStringUTFChars(j_str, str); 240 | env->ReleaseStringUTFChars(j_pattern, pattern); 241 | return static_cast(res); 242 | } 243 | 244 | JNIEXPORT jobject JNICALL Java_com_logentries_re2_RE2_getCaptureGroupNamesImpl 245 | (JNIEnv *env, jclass cls, jlong j_pointer) { 246 | RE2 *pointer = reinterpret_cast(j_pointer); 247 | 248 | jclass j_hashmap_class = env->FindClass("java/util/HashMap"); 249 | if (j_hashmap_class == NULL) return NULL; 250 | 251 | jmethodID hashMapCtor = env->GetMethodID(j_hashmap_class, "", "()V"); 252 | jmethodID put_method = env->GetMethodID(j_hashmap_class, "put", "(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object;"); 253 | jobject java_map = env->NewObject(j_hashmap_class, hashMapCtor); 254 | 255 | jclass j_int_class = env->FindClass("java/lang/Integer"); 256 | jmethodID newInt = env->GetStaticMethodID(j_int_class, "valueOf", "(I)Ljava/lang/Integer;"); 257 | 258 | map groupNames = (pointer->CapturingGroupNames()); 259 | map::iterator it; 260 | 261 | for (it = groupNames.begin(); it != groupNames.end(); ++it) { 262 | jstring jvalue = env->NewStringUTF(it->second.c_str()); 263 | jobject jkey = env->CallStaticObjectMethod(j_int_class, newInt, (jint) it->first); 264 | 265 | env->CallObjectMethod(java_map, put_method, jkey, jvalue); 266 | }; 267 | 268 | return java_map; 269 | } 270 | 271 | JNIEXPORT jint JNICALL Java_com_logentries_re2_RE2_numberOfCapturingGroupsImpl 272 | (JNIEnv *env, jclass cls, jlong re2_pointer) { 273 | 274 | RE2 *regex = reinterpret_cast(re2_pointer); 275 | return static_cast(regex->NumberOfCapturingGroups()); 276 | } 277 | 278 | JNIEXPORT jlong JNICALL Java_com_logentries_re2_RE2String_createStringBuffer 279 | (JNIEnv *env, jclass cls, jbyteArray input) { 280 | // const char *str = env->GetStringUTFChars(input, 0); 281 | char* str = (char*) env->GetByteArrayElements(input, 0); 282 | return reinterpret_cast(str); 283 | } 284 
| 285 | 286 | JNIEXPORT void JNICALL Java_com_logentries_re2_RE2String_releaseStringBuffer 287 | (JNIEnv *env, jclass cls, jbyteArray input, jlong j_pointer) { 288 | char *pointer = reinterpret_cast(j_pointer); 289 | env->ReleaseByteArrayElements(input, (jbyte*)pointer, JNI_ABORT); 290 | } 291 | 292 | static const int stackSize = 16 + 1; // see 'kVecSize' in re2.cc 293 | 294 | JNIEXPORT jboolean JNICALL Java_com_logentries_re2_RE2Matcher_findImpl 295 | (JNIEnv *env, jclass cls, jobject matcher, jlong re2_pointer, jlong str_pointer, jint ngroups, jint start, jint end) { 296 | 297 | 298 | RE2 *regex = reinterpret_cast(re2_pointer); 299 | char *str = reinterpret_cast(str_pointer); 300 | 301 | StringPiece* groups; 302 | StringPiece stackgroups[stackSize]; 303 | StringPiece* heapgroups = NULL; 304 | 305 | if (ngroups <= stackSize) { 306 | groups = stackgroups; 307 | } else { 308 | groups = new StringPiece[ngroups]; 309 | heapgroups = groups; 310 | } 311 | 312 | StringPiece text(str); 313 | const bool res = regex->Match(text, start, end, RE2::UNANCHORED, groups, ngroups); 314 | if (res) { 315 | jclass matcher_class = env->FindClass("com/logentries/re2/RE2Matcher"); 316 | jmethodID addID = env->GetStaticMethodID(matcher_class, "addGroup", "(Lcom/logentries/re2/RE2Matcher;II)V"); 317 | for (int i=0; iCallStaticObjectMethod( 320 | matcher_class, 321 | addID, 322 | matcher, 323 | static_cast(groups[i].data() - str), 324 | static_cast(groups[i].data() - str + groups[i].size()) 325 | ); 326 | } else { 327 | env->CallStaticObjectMethod(matcher_class, addID, 328 | matcher, static_cast(-1), static_cast(-1)); 329 | } 330 | } 331 | } 332 | 333 | delete[] heapgroups; 334 | return static_cast(res); 335 | } -------------------------------------------------------------------------------- /src/main/java/com/logentries/re2/RE2.h: -------------------------------------------------------------------------------- 1 | /* DO NOT EDIT THIS FILE - it is machine generated */ 2 | #include 3 | /* 
Header for class com_logentries_re2_RE2 */ 4 | 5 | #ifndef _Included_com_logentries_re2_RE2 6 | #define _Included_com_logentries_re2_RE2 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | /* 11 | * Class: com_logentries_re2_RE2 12 | * Method: compileImpl 13 | * Signature: (Ljava/lang/String;Lcom/logentries/re2/Options;)J 14 | */ 15 | JNIEXPORT jlong JNICALL Java_com_logentries_re2_RE2_compileImpl 16 | (JNIEnv *, jclass, jstring, jobject); 17 | 18 | /* 19 | * Class: com_logentries_re2_RE2 20 | * Method: releaseImpl 21 | * Signature: (J)V 22 | */ 23 | JNIEXPORT void JNICALL Java_com_logentries_re2_RE2_releaseImpl 24 | (JNIEnv *, jclass, jlong); 25 | 26 | /* 27 | * Class: com_logentries_re2_RE2 28 | * Method: fullMatchImpl 29 | * Signature: (Ljava/lang/String;J[Ljava/lang/Object;)Z 30 | */ 31 | JNIEXPORT jboolean JNICALL Java_com_logentries_re2_RE2_fullMatchImpl__Ljava_lang_String_2J_3Ljava_lang_Object_2 32 | (JNIEnv *, jclass, jstring, jlong, jobjectArray); 33 | 34 | /* 35 | * Class: com_logentries_re2_RE2 36 | * Method: partialMatchImpl 37 | * Signature: (Ljava/lang/String;J[Ljava/lang/Object;)Z 38 | */ 39 | JNIEXPORT jboolean JNICALL Java_com_logentries_re2_RE2_partialMatchImpl__Ljava_lang_String_2J_3Ljava_lang_Object_2 40 | (JNIEnv *, jclass, jstring, jlong, jobjectArray); 41 | 42 | /* 43 | * Class: com_logentries_re2_RE2 44 | * Method: fullMatchImpl 45 | * Signature: (Ljava/lang/String;Ljava/lang/String;[Ljava/lang/Object;)Z 46 | */ 47 | JNIEXPORT jboolean JNICALL Java_com_logentries_re2_RE2_fullMatchImpl__Ljava_lang_String_2Ljava_lang_String_2_3Ljava_lang_Object_2 48 | (JNIEnv *, jclass, jstring, jstring, jobjectArray); 49 | 50 | /* 51 | * Class: com_logentries_re2_RE2 52 | * Method: partialMatchImpl 53 | * Signature: (Ljava/lang/String;Ljava/lang/String;[Ljava/lang/Object;)Z 54 | */ 55 | JNIEXPORT jboolean JNICALL Java_com_logentries_re2_RE2_partialMatchImpl__Ljava_lang_String_2Ljava_lang_String_2_3Ljava_lang_Object_2 56 | (JNIEnv *, jclass, jstring, 
jstring, jobjectArray); 57 | 58 | /* 59 | * Class: com_logentries_re2_RE2 60 | * Method: captureGroupNamesImpl 61 | * Signature: (J[Ljava/lang/Object;)Ljava/util/Map; 62 | */ 63 | JNIEXPORT jobject JNICALL Java_com_logentries_re2_RE2_getCaptureGroupNamesImpl 64 | (JNIEnv *, jclass, jlong); 65 | 66 | /* 67 | * Class: com_logentries_re2_RE2 68 | * Method: numberOfCapturingGroupsImpl 69 | * Signature: (J)I 70 | */ 71 | JNIEXPORT jint JNICALL Java_com_logentries_re2_RE2_numberOfCapturingGroupsImpl 72 | (JNIEnv *, jclass, jlong); 73 | 74 | 75 | /* 76 | * Class: com_logentries_re2_RE2String 77 | * Method: createStringBuffer 78 | * Signature: ([B)J 79 | */ 80 | JNIEXPORT jlong JNICALL Java_com_logentries_re2_RE2String_createStringBuffer 81 | (JNIEnv *, jclass, jbyteArray); 82 | 83 | /* 84 | * Class: com_logentries_re2_RE2String 85 | * Method: releaseStringBuffer 86 | * Signature: ([BJ)V 87 | */ 88 | JNIEXPORT void JNICALL Java_com_logentries_re2_RE2String_releaseStringBuffer 89 | (JNIEnv *, jclass, jbyteArray, jlong); 90 | 91 | 92 | /* 93 | * Class: com_logentries_re2_RE2Matcher 94 | * Method: findImpl 95 | * Signature: (Ljava/lang/Object;JJIII)Z 96 | */ 97 | JNIEXPORT jboolean JNICALL Java_com_logentries_re2_RE2Matcher_findImpl 98 | (JNIEnv *, jclass, jobject, jlong, jlong, jint, jint, jint); 99 | 100 | #ifdef __cplusplus 101 | } 102 | #endif 103 | #endif 104 | -------------------------------------------------------------------------------- /src/main/java/com/logentries/re2/RE2.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Java Bindings for the RE2 Library 3 | * 4 | * (c) 2012 Daniel Fiala 5 | * 6 | */ 7 | 8 | package com.logentries.re2; 9 | 10 | import com.logentries.re2.entity.CaptureGroup; 11 | import com.logentries.re2.entity.NamedGroup; 12 | 13 | import java.util.*; 14 | import java.util.regex.MatchResult; 15 | 16 | public final class RE2 extends LibraryLoader implements AutoCloseable { 17 | 18 | private static 
native long compileImpl(final String pattern, final Options options) throws RegExprException; 19 | private static native void releaseImpl(final long pointer); 20 | private static native boolean fullMatchImpl(final String str, final long pointer, Object ... args); 21 | private static native boolean partialMatchImpl(final String str, final long pointer, Object ... args); 22 | private static native boolean fullMatchImpl(final String str, final String pattern, Object ... args); 23 | private static native boolean partialMatchImpl(final String str, final String pattern, Object ... args); 24 | private static native HashMap getCaptureGroupNamesImpl(final long pointer); 25 | private static native int numberOfCapturingGroupsImpl(final long pointer); 26 | 27 | 28 | private long pointer; 29 | private boolean changedGroups = false; 30 | //private Map originalGroupMap = null; 31 | 32 | private void checkState() throws IllegalStateException { 33 | if (pointer == 0) { 34 | throw new IllegalStateException(); 35 | } 36 | } 37 | boolean isClosed() { 38 | return pointer == 0; 39 | } 40 | 41 | public RE2(String pattern, final Options options) throws RegExprException { 42 | if (options.isUnicodeWord()) pattern = patchUnicodeWord(pattern); 43 | pointer = compileImpl(pattern, options); 44 | //if (changedGroups) mapPatchedGroups(); 45 | } 46 | public RE2(String pattern, final Options.Flag... 
options) throws RegExprException { 47 | Options opt = new Options(); 48 | for (Options.Flag f : options) f.apply(opt); 49 | if (opt.isUnicodeWord()) pattern = patchUnicodeWord(pattern); 50 | pointer = compileImpl(pattern, opt); 51 | //if (changedGroups) mapPatchedGroups(); 52 | 53 | } 54 | 55 | //////////////////////////////////////////////////////////////////////////////////// 56 | /* Unicode word patch */ 57 | static final int IDLE = 0, QUOTING = 2; 58 | static final String WORD_BOUNDARY_GNAME = "_ignore_"; 59 | static final String BOUNDARY_REPLACE = boundaryReplace(new String[]{"\\z", "\\A", "[^\\pL\\pN]"}); 60 | static final String WORD_REPLACE = "[\\pL\\pN]"; 61 | static final String NON_WORD_REPLACE = "[^\\pL\\pN]"; 62 | 63 | static String boundaryReplace ( String[] args ) { 64 | String output = "(?P<" + WORD_BOUNDARY_GNAME + ">"; 65 | 66 | for ( String s : args ) 67 | output += "(?P<" + WORD_BOUNDARY_GNAME + ">" + s + ")" + "|"; 68 | 69 | return output.subSequence(0, output.length() - 1) + ")"; 70 | } 71 | 72 | String patchUnicodeWord(String original) { 73 | StringBuilder buffer = new StringBuilder(original.length()); 74 | int state = IDLE; 75 | int wordBoundaryCount = 0; 76 | for (int i=0; i 0) { 83 | if ( next == 'Q') { 84 | buffer.append("\\Q"); 85 | state = QUOTING; 86 | } else if ( next == 'w') { 87 | buffer.append(WORD_REPLACE); 88 | } else if ( next == 'W') { 89 | buffer.append(NON_WORD_REPLACE); 90 | } else if ( next == 'b') { 91 | buffer.append(BOUNDARY_REPLACE); 92 | //wordBoundaryCount += 4; 93 | changedGroups = true; 94 | } else { 95 | buffer.append(c).append(next); 96 | } 97 | i++; 98 | } else { 99 | buffer.append(c); 100 | } 101 | break; 102 | case QUOTING: 103 | if (c == '\\' && next == 'E') { 104 | state = IDLE; 105 | buffer.append("\\E"); 106 | i++; 107 | } else 108 | buffer.append(c); 109 | break; 110 | } 111 | } 112 | 113 | //System.out.println(buffer.toString()); 114 | return buffer.toString(); 115 | } 116 | 117 | Set getIgnoreGropus ( ) 
{ 118 | HashMap groups = getCaptureGroupNameMap(); 119 | int total = numberOfCapturingGroups(); 120 | int offset = 0; 121 | int originalgroup = 1; 122 | 123 | Set output = new HashSet<>(); 124 | 125 | for (int i = 1; i getCaptureGroupNameMap(){ 152 | checkState(); 153 | return getCaptureGroupNamesImpl(pointer); 154 | } 155 | 156 | public void dispoze() { 157 | if (pointer != 0) { 158 | releaseImpl(pointer); 159 | pointer = 0; 160 | } 161 | } 162 | 163 | public void close() { 164 | dispoze(); 165 | } 166 | 167 | protected void finalize() throws Throwable { 168 | dispoze(); 169 | super.finalize(); 170 | } 171 | 172 | static private int checkArg(final Object obj) throws IllegalArgumentException { 173 | if (obj instanceof int[]) { 174 | return ((int[])obj).length; 175 | } 176 | if (obj instanceof long[]) { 177 | return ((long[])obj).length; 178 | } 179 | if (obj instanceof float[]) { 180 | return ((float[])obj).length; 181 | } 182 | if (obj instanceof double[]) { 183 | return ((double[])obj).length; 184 | } 185 | if (obj instanceof String[]) { 186 | return ((String[])obj).length; 187 | } 188 | throw new IllegalArgumentException(); 189 | } 190 | 191 | static private void checkArgs(Object ... args) throws IllegalArgumentException { 192 | int length = 0; 193 | for (Object arg: args) { 194 | if ((length += checkArg(arg)) > 31) { 195 | throw new IllegalArgumentException("Only up to 31 arguments supported"); 196 | } 197 | } 198 | } 199 | 200 | public static boolean fullMatch(final String str, final String pattern, Object ... args) { 201 | checkArgs(args); 202 | return fullMatchImpl(str, pattern, args); 203 | } 204 | 205 | public static boolean partialMatch(final String str, final String pattern, Object ... args) { 206 | checkArgs(args); 207 | return partialMatchImpl(str, pattern, args); 208 | } 209 | 210 | public boolean fullMatch(final String str, Object ... 
args) throws IllegalStateException { 211 | checkState(); 212 | checkArgs(args); 213 | return fullMatchImpl(str, pointer, args); 214 | } 215 | 216 | public boolean partialMatch(final String str, Object ... args) throws IllegalStateException { 217 | checkState(); 218 | checkArgs(args); 219 | return partialMatchImpl(str, pointer, args); 220 | } 221 | 222 | /** 223 | * This method returns ordered names. 224 | * 225 | * @param args 226 | * @return List of names for the capture groups 227 | * @throws IllegalStateException 228 | */ 229 | public List getCaptureGroupNames(Object... args) throws IllegalStateException { 230 | checkState(); 231 | checkArgs(args); 232 | HashMap nameMap = getCaptureGroupNamesImpl(pointer); 233 | return new ArrayList<>(nameMap.values()); 234 | } 235 | 236 | 237 | public RE2Matcher matcher(final CharSequence str) { 238 | return matcher(str, true); 239 | } 240 | public RE2Matcher matcher(final CharSequence str, boolean fetchGroups) { 241 | checkState(); 242 | if (changedGroups) 243 | return new RE2MatcherUnicodeWord(str, this, pointer); 244 | else 245 | return new RE2Matcher(str, this, pointer, fetchGroups); 246 | } 247 | public RE2Matcher matcher(final RE2String str) { 248 | return matcher(str, true); 249 | } 250 | public RE2Matcher matcher(final RE2String str, boolean fetchGroups) { 251 | checkState(); 252 | if (changedGroups) 253 | return new RE2MatcherUnicodeWord(str, this, pointer); 254 | else 255 | return new RE2Matcher(str, this, pointer, fetchGroups); 256 | } 257 | 258 | /** 259 | * Gets the ordered capture groups for this event message and pattern. 260 | * @param str is an events message. 261 | * @return is a list of CaptureGroups. 
262 | */ 263 | public List getCaptureGroups(final String str) { 264 | checkState(); 265 | List captureGroups = new ArrayList<>(); 266 | RE2Matcher re2match = this.matcher(str); 267 | 268 | try { 269 | for (MatchResult match : re2match) { 270 | for (int i = 1; i < match.groupCount(); i++) { 271 | if (match.start() > -1) { 272 | captureGroups.add(new CaptureGroup(match.group(i), match.start(i), match.end(i))); 273 | } 274 | } 275 | } 276 | } catch (IndexOutOfBoundsException e) { 277 | return captureGroups; 278 | } 279 | return captureGroups; 280 | } 281 | 282 | /** 283 | * Returns a list of named capture groups and their position information in the event message. 284 | * @param names is a list of names to match against. 285 | * @param str is an events message. 286 | * @return is a list of named capture groups. 287 | */ 288 | public List getNamedCaptureGroups(List names, final String str) { 289 | List namedGroups = new ArrayList<>(); 290 | List captureGroups = getCaptureGroups(str); 291 | int len = names.size(); 292 | 293 | if (len != captureGroups.size()) { 294 | // Matching text for a named group hasn't been found. 
295 | return namedGroups; 296 | } 297 | 298 | for (int i = 0; i < len; i++) { 299 | if (captureGroups.get(i).start > -1) { 300 | namedGroups.add(new NamedGroup(names.get(i), captureGroups.get(i))); 301 | } 302 | } 303 | return namedGroups; 304 | } 305 | } 306 | -------------------------------------------------------------------------------- /src/main/java/com/logentries/re2/RE2Matcher.java: -------------------------------------------------------------------------------- 1 | package com.logentries.re2; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Iterator; 5 | import java.util.NoSuchElementException; 6 | import java.util.regex.MatchResult; 7 | 8 | public class RE2Matcher implements MatchResult, AutoCloseable, Iterable { 9 | 10 | private static native boolean findImpl( 11 | final Object matcher, 12 | final long re2_pointer, 13 | final long str_pointer, 14 | final int fetch_groups, 15 | final int start, 16 | final int end 17 | ); 18 | 19 | public static class Range { 20 | int start, end; 21 | static Range of(int start, int end) { 22 | Range r = new Range(); 23 | r.start = start; 24 | r.end = end; 25 | return r; 26 | } 27 | } 28 | 29 | public static void addGroup(RE2Matcher obj, int start, int end) { 30 | if (start >= 0 && end >= 0) { 31 | start = obj.utf8input.charPos(start); 32 | end = obj.utf8input.charPos(end); 33 | } 34 | obj.groups.add(Range.of(start, end)); 35 | } 36 | 37 | 38 | protected ArrayList groups; 39 | protected RE2String utf8input; 40 | protected RE2String managedString; 41 | protected long re2Pointer = 0; 42 | protected RE2 regex; 43 | protected boolean matched; 44 | protected boolean fetchGroups; 45 | 46 | RE2Matcher(RE2String input, RE2 regex, long re2Pointer, boolean fetchGroups) { 47 | this.utf8input = input; 48 | this.matched = false; 49 | this.groups = new ArrayList<>(fetchGroups? 
regex.numberOfCapturingGroups() + 1 : 1); 50 | this.re2Pointer = re2Pointer; 51 | this.regex = regex; //to avoid that re2Pointer could be garbaged 52 | this.fetchGroups = fetchGroups; 53 | this.managedString = null; 54 | } 55 | 56 | 57 | RE2Matcher(CharSequence input, RE2 regex, long re2Pointer, boolean fetchGroups) { 58 | this(new RE2String(input), regex, re2Pointer, fetchGroups); 59 | this.managedString = utf8input; 60 | } 61 | public void close() { 62 | if (managedString != null) 63 | managedString.close(); 64 | } 65 | 66 | 67 | 68 | public boolean found() { 69 | return matched; 70 | } 71 | 72 | public boolean findNext() { 73 | if (!matched) return find(); 74 | else return find(end(0)); 75 | } 76 | 77 | public boolean find() { 78 | return find(0); 79 | } 80 | public boolean find(int start) { 81 | return find(start, utf8input.length()); 82 | } 83 | 84 | public boolean find(int start, int end) { 85 | groups.clear(); 86 | matched = false; 87 | 88 | if (utf8input.isClosed()) throw new IllegalStateException("String buffer has been already closed"); 89 | if (regex.isClosed()) throw new IllegalStateException("Regex has been already closed"); 90 | 91 | start = utf8input.bytePos(start); 92 | end = utf8input.bytePos(end); 93 | int ngroups = fetchGroups ? regex.numberOfCapturingGroups() + 1 : 1; 94 | @SuppressWarnings("deprecation") 95 | long stringPointer = utf8input.pointer(); 96 | return matched = findImpl(this, re2Pointer, stringPointer, ngroups, start, end); 97 | } 98 | 99 | private void checkGroup(int group) { 100 | if (!matched) throw new IllegalStateException("The pattern has not been matched!"); 101 | if (group >= groups.size()) throw new IllegalStateException("Group n. 
"+group+" is not in pattern!"); 102 | } 103 | 104 | @Override 105 | public int start() { 106 | return start(0); 107 | } 108 | 109 | @Override 110 | public int start(int group) { 111 | checkGroup(group); 112 | return groups.get(group).start; 113 | } 114 | 115 | @Override 116 | public int end() { 117 | return end(0); 118 | } 119 | 120 | @Override 121 | public int end(int group) { 122 | checkGroup(group); 123 | return groups.get(group).end; 124 | } 125 | 126 | @Override 127 | public String group() { 128 | return group(0); 129 | } 130 | 131 | @Override 132 | public String group(int group) { 133 | checkGroup(group); 134 | if (groups.get(group).start < 0) 135 | return null; 136 | else 137 | return utf8input.subSequence(groups.get(group).start, groups.get(group).end).toString(); 138 | } 139 | 140 | @Override 141 | public int groupCount() { 142 | checkGroup(0); 143 | return groups.size(); 144 | } 145 | 146 | @Override 147 | public Iterator iterator() { 148 | 149 | return new Iterator() { 150 | boolean moved = false; 151 | boolean hasnext = false; 152 | @Override 153 | public boolean hasNext() { 154 | if (!moved) { 155 | hasnext = findNext(); 156 | moved = true; 157 | } 158 | return hasnext; 159 | } 160 | @Override 161 | public MatchResult next() { 162 | if (hasNext()) { 163 | moved = false; 164 | return RE2Matcher.this; 165 | } else 166 | throw new NoSuchElementException(); 167 | } 168 | @Override 169 | public void remove() { 170 | throw new UnsupportedOperationException(); 171 | } 172 | }; 173 | } 174 | 175 | @Override 176 | public String toString() { 177 | StringBuffer buffer = new StringBuffer(); 178 | buffer.append(matched); 179 | for (int i=0; i ignore = regex.getIgnoreGropus(); 23 | for ( int i = 0; i < groupCount(); ++i ) { 24 | if ( !ignore.contains(i) ) 25 | System.out.println("Good: '" + group(i) + "'"); 26 | else 27 | System.out.println("Ignored: '" + group(i) + "'"); 28 | }*/ 29 | 30 | groups = patchGroups(groups, regex.getIgnoreGropus()); 31 | 32 | return 
true; 33 | } 34 | 35 | static ArrayList patchGroups(ArrayList groups, Set ignoreRanges) { 36 | ArrayList patched = new ArrayList<>(); 37 | Set startRanges = new HashSet<>(); 38 | 39 | // remove the groups to ignore 40 | for ( int i = 0; i < groups.size(); ++i ) { 41 | if ( !ignoreRanges.contains(i) ) 42 | patched.add(groups.get(i)); 43 | else { 44 | Range ri = groups.get(i); 45 | if ( ri.start >= 0 && ri.start != ri.end ) { 46 | //System.out.println(ri.start); 47 | startRanges.add(ri.start); 48 | } 49 | } 50 | 51 | } 52 | 53 | // adjust the ranges of the remaining ones 54 | for ( int i = 0; i < patched.size(); ++i ) { 55 | Range ri = patched.get(i); 56 | 57 | if ( ri.start != ri.end && ri.start >= 0 ) { 58 | 59 | if (startRanges.contains(ri.start)) 60 | patched.set(i, Range.of(ri.start + 1, ri.end)); 61 | 62 | // eventually I just modified the range! 63 | ri = patched.get(i); 64 | 65 | if (startRanges.contains(ri.end - 1)) 66 | patched.set(i, Range.of(ri.start, ri.end - 1)); 67 | } 68 | } 69 | 70 | return patched; 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/java/com/logentries/re2/RE2String.java: -------------------------------------------------------------------------------- 1 | package com.logentries.re2; 2 | 3 | import java.nio.ByteBuffer; 4 | import java.nio.CharBuffer; 5 | import java.nio.charset.CharsetEncoder; 6 | import java.nio.charset.CodingErrorAction; 7 | import java.nio.charset.StandardCharsets; 8 | 9 | public class RE2String implements CharSequence, AutoCloseable { 10 | 11 | private static native long createStringBuffer(final byte[] input); 12 | private static native void releaseStringBuffer(final byte[] input, final long pointer); 13 | 14 | private CharSequence input; 15 | private byte[] utf8CString; 16 | private long utf8StringPointer = 0; 17 | private UTF8CharOffset utf8Offset; 18 | 19 | 20 | public RE2String(CharSequence input) { 21 | this.input = input; 22 | try { 23 | 
this.utf8CString = createUtf8CString(input); 24 | } catch (Exception e ){ 25 | throw new IllegalArgumentException("Unable to encode input using UTF-8", e); 26 | } 27 | this.utf8StringPointer = createStringBuffer(utf8CString); 28 | this.utf8Offset = new UTF8CharOffset(input); 29 | } 30 | 31 | public int bytePos(int charPosition) { 32 | check(); 33 | return utf8Offset.fromStringToByte(charPosition); 34 | } 35 | public int charPos(int bytePosition) { 36 | check(); 37 | return utf8Offset.fromByteToChar(bytePosition); 38 | } 39 | public boolean isClosed() { 40 | return utf8StringPointer == 0; 41 | } 42 | 43 | /** 44 | * @deprecated 45 | */ 46 | @Deprecated() 47 | long pointer() { 48 | return utf8StringPointer; 49 | } 50 | 51 | 52 | private void check() { 53 | if (utf8StringPointer == 0) 54 | throw new IllegalStateException("Buffer has been already closed!"); 55 | } 56 | 57 | private void free() { 58 | if (utf8StringPointer != 0) { 59 | releaseStringBuffer(utf8CString, utf8StringPointer); 60 | utf8StringPointer = 0; 61 | } 62 | } 63 | @Override 64 | public void close() { 65 | free(); 66 | } 67 | 68 | @Override 69 | protected void finalize() throws Throwable { 70 | free(); 71 | super.finalize(); 72 | } 73 | 74 | 75 | @Override 76 | public int length() { 77 | return input.length(); 78 | } 79 | 80 | @Override 81 | public char charAt(int index) { 82 | return input.charAt(index); 83 | } 84 | 85 | @Override 86 | public CharSequence subSequence(int start, int end) { 87 | return input.subSequence(start, end); 88 | } 89 | 90 | @Override 91 | public String toString() { 92 | return input.toString(); 93 | } 94 | 95 | private byte[] createUtf8CString(CharSequence s) throws Exception { 96 | CharsetEncoder encoder = StandardCharsets.UTF_8.newEncoder() 97 | .onMalformedInput(CodingErrorAction.REPLACE) 98 | .onUnmappableCharacter(CodingErrorAction.REPLACE); 99 | 100 | ByteBuffer bytes = encoder.encode(CharBuffer.wrap(s)); 101 | 102 | if (bytes.limit() == bytes.capacity()) { 103 | 
ByteBuffer newBuffer = ByteBuffer.allocate(bytes.limit()+1); 104 | System.arraycopy(bytes.array(), 0, newBuffer.array(), 0, bytes.limit()); 105 | bytes = newBuffer; 106 | } else 107 | bytes.limit(bytes.limit()+1); 108 | 109 | bytes.put(bytes.limit()-1, (byte) 0); 110 | return bytes.array(); 111 | } 112 | } 113 | -------------------------------------------------------------------------------- /src/main/java/com/logentries/re2/RegExprException.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Java Bindings for the RE2 Library 3 | * 4 | * (c) 2012 Daniel Fiala 5 | * 6 | */ 7 | 8 | package com.logentries.re2; 9 | 10 | public class RegExprException extends Exception { 11 | public RegExprException(final String msg) { 12 | super(msg); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /src/main/java/com/logentries/re2/UTF8CharOffset.java: -------------------------------------------------------------------------------- 1 | package com.logentries.re2; 2 | 3 | import java.nio.charset.StandardCharsets; 4 | 5 | public class UTF8CharOffset { 6 | 7 | static float AVG_BYTE_PER_CHAR = StandardCharsets.UTF_8.newEncoder().averageBytesPerChar(); 8 | static float MIN_INIT_DIMENSION = 8; 9 | 10 | private int[] byte2char; 11 | private int byteSize; 12 | private int[] char2byte; 13 | private int charSize; 14 | public UTF8CharOffset(CharSequence input) { 15 | this(input, (int)Math.max((int)(input.length() * AVG_BYTE_PER_CHAR),MIN_INIT_DIMENSION)); 16 | } 17 | public UTF8CharOffset(CharSequence input, int utf8Len) { 18 | char2byte = new int[input.length()]; 19 | charSize = input.length(); 20 | byte2char = new int[utf8Len]; 21 | byteSize = 0; 22 | int strPos = 0; 23 | for (int i=0; i byteSize) throw new IndexOutOfBoundsException(""+bytePos); 57 | if (bytePos == byteSize) return charSize; 58 | else return byte2char[bytePos]; 59 | } 60 | 61 | public int fromStringToByte(int charPos) { 62 | if 
/**
 * Immutable holder for a piece of matched text together with the two offsets
 * that locate it.
 */
public class CaptureGroup {
    /** Start offset, stored exactly as passed to the constructor. */
    public final int start;
    /** End offset, stored exactly as passed to the constructor. */
    public final int end;
    /** The matched text, stored exactly as passed to the constructor. */
    public final String matchingText;

    /**
     * @param matchingText the matched text
     * @param start        offset where the match starts
     * @param end          offset where the match ends
     */
    public CaptureGroup(final String matchingText, final int start, final int end) {
        this.start = start;
        this.end = end;
        this.matchingText = matchingText;
    }
}
5 | */ 6 | public class NamedGroup { 7 | public final String name; 8 | public final CaptureGroup captureGroup; 9 | 10 | public NamedGroup(final String name, final CaptureGroup captureGroup) { 11 | this.name = name; 12 | this.captureGroup = captureGroup; 13 | } 14 | 15 | public NamedGroup(final String name, final String matchingText, final int start, final int end) { 16 | this(name, new CaptureGroup(matchingText, start, end)); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/main/java/com/logentries/re2/op.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Java Bindings for the RE2 Library 3 | * 4 | * (c) 2012 Daniel Fiala 5 | * 6 | */ 7 | 8 | #ifndef COM_LOGENTRIES_RE2_OP_H 9 | # define COM_LOGENTRIES_RE2_OP_H 10 | 11 | union JavaRE2_Any { 12 | jint i_; 13 | jlong l_; 14 | jfloat f_; 15 | jdouble d_; 16 | char sbuf_[sizeof(re2::StringPiece)]; // union cannot contain member with constructor 17 | // FIXME: Memory alignment 18 | 19 | re2::StringPiece *get_s(void) { 20 | re2::StringPiece *s = reinterpret_cast(sbuf_); 21 | BOOST_VERIFY( reinterpret_cast(sbuf_) == reinterpret_cast(s) ); 22 | return s; 23 | } 24 | 25 | re2::StringPiece *construct_s(void) { 26 | re2::StringPiece *s = new(sbuf_) re2::StringPiece(); 27 | BOOST_VERIFY(reinterpret_cast(s) == reinterpret_cast(get_s())); 28 | return s; 29 | } 30 | 31 | void destroy_s(void) { 32 | re2::StringPiece *s = get_s(); 33 | s->~StringPiece(); 34 | } 35 | }; 36 | 37 | enum JavaRE2_AnyType { 38 | JavaRE2_INT, 39 | JavaRE2_LONG, 40 | JavaRE2_FLOAT, 41 | JavaRE2_DOUBLE, 42 | JavaRE2_STRING, 43 | }; 44 | 45 | class JavaRE2_Arg { 46 | private: 47 | RE2::Arg arg_; 48 | JavaRE2_Any any_; // arg_ contains pointer to any_ 49 | JavaRE2_AnyType type_; 50 | jarray j_array_; 51 | jsize j_index_; 52 | 53 | private: 54 | RE2::Arg init_arg(const JavaRE2_AnyType type) { 55 | switch (type) { 56 | case JavaRE2_INT: 57 | return 
RE2::Arg(&any_.i_); 58 | case JavaRE2_LONG: 59 | return RE2::Arg(&any_.l_); 60 | case JavaRE2_FLOAT: 61 | return RE2::Arg(&any_.f_); 62 | case JavaRE2_DOUBLE: 63 | return RE2::Arg(&any_.d_); 64 | case JavaRE2_STRING: 65 | any_.construct_s(); 66 | return RE2::Arg(any_.get_s()); 67 | default: 68 | BOOST_VERIFY(0); 69 | } 70 | } 71 | 72 | public: 73 | JavaRE2_Arg(JavaRE2_AnyType type, jarray j_array, const jsize j_index) 74 | : type_(type), 75 | arg_(init_arg(type)), // any_ is set here too 76 | j_array_(j_array), 77 | j_index_(j_index) 78 | { } 79 | 80 | void transfer(JNIEnv *env) { 81 | switch (type_) { 82 | case JavaRE2_INT: { 83 | jintArray j_int_arr = static_cast(j_array_); 84 | env->SetIntArrayRegion(j_int_arr, j_index_, 1, &any_.i_); 85 | } 86 | break; 87 | case JavaRE2_LONG: { 88 | jlongArray j_long_arr = static_cast(j_array_); 89 | env->SetLongArrayRegion(j_long_arr, j_index_, 1, &any_.l_); 90 | } 91 | break; 92 | case JavaRE2_FLOAT: { 93 | jfloatArray j_float_arr = static_cast(j_array_); 94 | env->SetFloatArrayRegion(j_float_arr, j_index_, 1, &any_.f_); 95 | } 96 | break; 97 | case JavaRE2_DOUBLE: { 98 | jdoubleArray j_double_arr = static_cast(j_array_); 99 | env->SetDoubleArrayRegion(j_double_arr, j_index_, 1, &any_.d_); 100 | } 101 | break; 102 | case JavaRE2_STRING: { 103 | re2::StringPiece *s = any_.get_s(); 104 | jobjectArray j_obj_arr = static_cast(j_array_); 105 | jstring j_str = env->NewStringUTF(s->as_string().c_str()); 106 | env->SetObjectArrayElement(j_obj_arr, j_index_, j_str); 107 | } 108 | break; 109 | default: 110 | BOOST_VERIFY(0); 111 | } 112 | } 113 | 114 | RE2::Arg *get_re2_arg(void) { 115 | return &arg_; 116 | } 117 | 118 | ~JavaRE2_Arg(void) { 119 | if (type_ == JavaRE2_STRING) { 120 | any_.destroy_s(); 121 | } 122 | } 123 | }; 124 | 125 | static bool is_int_arr(JNIEnv *env, jclass j_cls) { 126 | jclass j_arr_cls = env->FindClass("[I"); 127 | return env->IsAssignableFrom(j_cls, j_arr_cls); 128 | } 129 | 130 | static bool 
is_long_arr(JNIEnv *env, jclass j_cls) { 131 | jclass j_arr_cls = env->FindClass("[J"); 132 | return env->IsAssignableFrom(j_cls, j_arr_cls); 133 | } 134 | 135 | static bool is_float_arr(JNIEnv *env, jclass j_cls) { 136 | jclass j_arr_cls = env->FindClass("[F"); 137 | return env->IsAssignableFrom(j_cls, j_arr_cls); 138 | } 139 | 140 | static bool is_double_arr(JNIEnv *env, jclass j_cls) { 141 | jclass j_arr_cls = env->FindClass("[D"); 142 | return env->IsAssignableFrom(j_cls, j_arr_cls); 143 | } 144 | 145 | static bool is_string_arr(JNIEnv *env, jclass j_cls) { 146 | jclass j_arr_cls = env->FindClass("[Ljava/lang/String;"); 147 | BOOST_VERIFY(j_arr_cls); 148 | return env->IsAssignableFrom(j_cls, j_arr_cls); 149 | } 150 | 151 | static JavaRE2_AnyType get_type(JNIEnv *env, jobject j_object) { 152 | jclass j_cls = env->GetObjectClass(j_object); 153 | if (is_int_arr(env, j_cls)) { 154 | return JavaRE2_INT; 155 | } 156 | if (is_long_arr(env, j_cls)) { 157 | return JavaRE2_LONG; 158 | } 159 | if (is_float_arr(env, j_cls)) { 160 | return JavaRE2_FLOAT; 161 | } 162 | if (is_double_arr(env, j_cls)) { 163 | return JavaRE2_DOUBLE; 164 | } 165 | if (is_string_arr(env, j_cls)) { 166 | return JavaRE2_STRING; 167 | } 168 | BOOST_VERIFY(!"Unexpected parameter supplied"); // This should not occure, args are checked from Java 169 | } 170 | 171 | static jsize sum_lengths(JNIEnv *env, jobjectArray j_args) { 172 | jsize j_sum = 0; 173 | const jsize j_args_length = env->GetArrayLength(j_args); 174 | for (jsize j_i = 0; j_i < j_args_length; ++j_i) { 175 | jarray j_arr = static_cast(env->GetObjectArrayElement(j_args, j_i)); 176 | j_sum += env->GetArrayLength(j_arr); 177 | } 178 | return j_sum; 179 | } 180 | 181 | template 182 | static bool do_op(JNIEnv *env, const Op &op, jobjectArray j_args) { 183 | struct Buf { 184 | char _[sizeof(JavaRE2_Arg)]; // FIXME: Memory alignment 185 | 186 | JavaRE2_Arg *get_arg(void) { 187 | JavaRE2_Arg *arg = reinterpret_cast(_); 188 | 
BOOST_VERIFY(reinterpret_cast(arg) == reinterpret_cast(_)); 189 | return arg; 190 | } 191 | 192 | JavaRE2_Arg *construct_arg(JavaRE2_AnyType type, jarray j_array, const jsize j_index) { 193 | JavaRE2_Arg *arg = new(_) JavaRE2_Arg(type, j_array, j_index); 194 | BOOST_VERIFY(reinterpret_cast(arg) == reinterpret_cast(get_arg())); 195 | return arg; 196 | } 197 | 198 | void destroy_arg(void) { 199 | get_arg()->~JavaRE2_Arg(); 200 | } 201 | }; 202 | const jsize j_args_len = env->GetArrayLength(j_args); 203 | const jsize j_total_len = sum_lengths(env, j_args); 204 | if (j_total_len > 31) { 205 | BOOST_VERIFY(!"Megical constant from re2 source code exceeded"); // This should not occure, args are checked from Java 206 | } 207 | 208 | Buf buf_args[j_total_len]; 209 | RE2::Arg *args[j_total_len]; 210 | 211 | for (jsize j_i = 0, j_index = 0; j_i < j_args_len; ++j_i) { 212 | jarray j_arr = static_cast( env->GetObjectArrayElement(j_args, j_i) ); 213 | const jsize j_len = env->GetArrayLength(j_arr); 214 | const JavaRE2_AnyType type = get_type(env, j_arr); 215 | for (jsize j_j = 0; j_j < j_len; ++j_j) { 216 | args[j_index] = buf_args[j_index].construct_arg(type, j_arr, j_j)->get_re2_arg(); 217 | ++j_index; 218 | } 219 | } 220 | 221 | const int total_len = static_cast(j_total_len); 222 | BOOST_VERIFY(static_cast(total_len) == j_total_len); 223 | BOOST_VERIFY(total_len > 0 == j_total_len > 0); 224 | bool ret = op(args, j_total_len); 225 | 226 | for (jsize j_i = 0; j_i < j_total_len; ++j_i) { 227 | buf_args[j_i].get_arg()->transfer(env); 228 | buf_args[j_i].destroy_arg(); 229 | } 230 | return ret; 231 | } 232 | 233 | #endif 234 | -------------------------------------------------------------------------------- /src/test/java/com/logentries/re2/GenRegExpr.java: -------------------------------------------------------------------------------- 1 | package com.logentries.re2; 2 | 3 | import java.util.List; 4 | import java.util.Arrays; 5 | import java.util.Collection; 6 | import 
java.util.ArrayList; 7 | import java.util.Stack; 8 | import java.util.Random; 9 | 10 | public class GenRegExpr { 11 | public abstract class Operator { 12 | public abstract int getArity(); 13 | public abstract String call(String ... args); 14 | } 15 | 16 | public abstract class NullaryOperator extends Operator { 17 | public final int getArity() { 18 | return 0; 19 | } 20 | public String call(String ... args) { 21 | assert args.length == 0; 22 | return callNullary(); 23 | } 24 | protected abstract String callNullary(); 25 | } 26 | 27 | public abstract class UnaryOperator extends Operator { 28 | public final int getArity() { 29 | return 1; 30 | } 31 | public String call(String ... args) { 32 | assert args.length == 1; 33 | return callUnary(args[0]); 34 | } 35 | protected abstract String callUnary(final String arg); 36 | } 37 | 38 | public abstract class BinaryOperator extends Operator { 39 | public final int getArity() { 40 | return 2; 41 | } 42 | public String call(String ... args) { 43 | assert args.length == 2; 44 | return callBinary(args[0], args[1]); 45 | } 46 | protected abstract String callBinary(final String leftArg, final String rightArg); 47 | } 48 | 49 | public final class ConstOperator extends NullaryOperator { 50 | private final String m_val; 51 | 52 | public ConstOperator(final String val) { 53 | m_val = val; 54 | } 55 | 56 | protected String callNullary() { 57 | return m_val; 58 | } 59 | 60 | public String toString() { 61 | return "" + '"' + m_val + '"'; 62 | } 63 | } 64 | 65 | public final class OperatorUnaryStar extends UnaryOperator { 66 | protected String callUnary(final String arg) { 67 | return arg + '*'; 68 | } 69 | 70 | public String toString() { 71 | return "*"; 72 | } 73 | } 74 | 75 | public final class OperatorUnaryPlus extends UnaryOperator { 76 | protected String callUnary(final String arg) { 77 | return arg + '+'; 78 | } 79 | 80 | public String toString() { 81 | return "+"; 82 | } 83 | } 84 | 85 | public final class OperatorUnaryQM extends 
UnaryOperator { 86 | protected String callUnary(final String arg) { 87 | return arg + '?'; 88 | } 89 | 90 | public String toString() { 91 | return "?"; 92 | } 93 | } 94 | 95 | public final class OperatorBinaryConcat extends BinaryOperator { 96 | protected String callBinary(final String leftArg, final String rightArg) { 97 | return leftArg + rightArg; 98 | } 99 | 100 | public String toString() { 101 | return "<>"; 102 | } 103 | } 104 | 105 | public final class OperatorBinaryPipe extends BinaryOperator { 106 | protected String callBinary(final String leftArg, final String rightArg) { 107 | return leftArg + '|' + rightArg; 108 | } 109 | 110 | public String toString() { 111 | return "|"; 112 | } 113 | } 114 | 115 | /* Member Variables */ 116 | 117 | Random mRand = new Random(); 118 | 119 | // Operators that are not nullary 120 | private List mOps = Arrays.asList(new OperatorUnaryStar(), 121 | new OperatorUnaryPlus(), 122 | new OperatorUnaryQM(), 123 | new OperatorBinaryConcat(), 124 | new OperatorBinaryPipe() 125 | ); 126 | 127 | // Nullary operators 128 | private List mNullary; // Generated from input 129 | 130 | private int mMaxAtoms; 131 | private int mMaxOps; 132 | 133 | /* Member Functions */ 134 | 135 | public GenRegExpr(final Collection consts, final int maxAtoms, final int maxOps) { 136 | mMaxAtoms = maxAtoms; 137 | mMaxOps = maxOps; 138 | mNullary = new ArrayList(consts.size()); 139 | for (String s: consts) { 140 | mNullary.add(new ConstOperator(s)); 141 | } 142 | } 143 | 144 | protected String group(final String s) { 145 | return "(?:" + s + ")"; 146 | } 147 | 148 | protected String runPostfix(Stack opsStack) { 149 | /* 150 | String ret = "Stack"; 151 | for (Operator item: stack) { 152 | ret += " " + item.toString(); 153 | } 154 | return ret; 155 | */ 156 | Stack valsStack = new Stack(); 157 | for (Operator item: opsStack) { 158 | switch (item.getArity()) { 159 | case 0: 160 | valsStack.push(item.call()); 161 | break; 162 | case 1: 163 | final String arg = 
/**
 * Generates random strings by concatenating symbols drawn uniformly from a fixed alphabet.
 *
 * <p>Each call to {@link #next()} picks a number of symbols below {@code maxLen} and joins that
 * many randomly chosen alphabet entries. Roughly one call in a hundred is allowed to produce the
 * empty string; all other calls produce at least one symbol whenever {@code maxLen > 1}.
 */
public class GenString {
    private List<String> mAlphabet;
    private int mMaxLen;
    private Random mRand = new Random();

    /**
     * @param alphabet symbols to draw from; copied, so later changes to the list have no effect
     * @param maxLen   exclusive upper bound on the number of symbols per generated string
     * @throws IllegalArgumentException if {@code maxLen} is not positive
     */
    public GenString(final List<String> alphabet, final int maxLen) {
        if (maxLen < 1) {
            // Previously this surfaced only later, as an opaque failure inside Random.nextInt.
            throw new IllegalArgumentException("maxLen must be positive: " + maxLen);
        }
        mAlphabet = new ArrayList<String>(alphabet);
        mMaxLen = maxLen;
    }

    /**
     * Returns the next random string.
     *
     * @return a concatenation of between 0 and {@code maxLen - 1} alphabet symbols
     */
    public String next() {
        final int len;
        if (mRand.nextInt(100) == 0 || mMaxLen == 1) {
            // Rare case: zero symbols is allowed. With maxLen == 1 it is the only option,
            // because the non-empty range [1, maxLen - 1] would be empty.
            len = mRand.nextInt(mMaxLen);
        } else {
            len = mRand.nextInt(mMaxLen - 1) + 1;
        }
        final int asize = mAlphabet.size();
        final StringBuilder ret = new StringBuilder();
        for (int i = 0; i < len; ++i) {
            ret.append(mAlphabet.get(mRand.nextInt(asize)));
        }
        return ret.toString();
    }
}
org.junit.Test; 4 | import static org.junit.Assert.assertEquals; 5 | import static org.junit.Assert.assertTrue; 6 | import static org.junit.Assert.assertFalse; 7 | import static org.junit.Assert.assertNotNull; 8 | import static org.junit.Assert.fail; 9 | 10 | public class TestExceptions { 11 | @Test 12 | public void testCorrect() { 13 | try { 14 | assertNotNull(new RE2("Everything Works")); 15 | } catch (RegExprException e) { 16 | fail(); 17 | } 18 | } 19 | 20 | @Test(expected=RegExprException.class) 21 | public void testWrong() throws RegExprException { 22 | try { 23 | RE2 re2 = new RE2("(Nothing Works", new Options().setLogErrors(false)); 24 | System.err.println("re2 = " + re2); 25 | } catch (RegExprException e) { 26 | System.err.println("Exdeption thrown, msg: " + e.getMessage()); 27 | throw e; 28 | } catch (Throwable e) { 29 | e.printStackTrace(); 30 | } 31 | } 32 | } 33 | 34 | -------------------------------------------------------------------------------- /src/test/java/com/logentries/re2/TestMatcherFind.java: -------------------------------------------------------------------------------- 1 | package com.logentries.re2; 2 | 3 | import com.logentries.re2.entity.NamedGroup; 4 | import org.junit.Assert; 5 | import org.junit.Test; 6 | 7 | import java.util.List; 8 | import java.util.Random; 9 | import java.util.regex.MatchResult; 10 | 11 | import static org.junit.Assert.*; 12 | 13 | 14 | public class TestMatcherFind { 15 | final String oneNamedGroup = "(?Pcode)"; 16 | final String twoNamedGroups = "(?Ptest).*co(?Pde)"; 17 | final String nestedNamedGroups = "(?P(?Ptest).*co(?Pde))"; 18 | final String optionalNamedGroup = "(?Phello)?"; 19 | 20 | @Test 21 | public void testFindSimple() throws Exception { 22 | 23 | RE2 regex = new RE2("(www\\.)?dandelion\\.eu"); 24 | 25 | RE2Matcher matcher = regex.matcher("https://dandelion.eu/datatxt"); 26 | assertTrue(matcher.find()); 27 | assertEquals(8, matcher.start()); 28 | assertEquals(20, matcher.end()); 29 | 
assertEquals("dandelion.eu", matcher.group()); 30 | assertEquals(8, matcher.start(0)); 31 | assertEquals(20, matcher.end(0)); 32 | assertEquals("dandelion.eu", matcher.group(0)); 33 | assertEquals(-1, matcher.start(1)); 34 | assertEquals(-1, matcher.end(1)); 35 | assertNull(matcher.group(1)); 36 | 37 | } 38 | @Test 39 | public void testFindNoGroups() throws Exception { 40 | 41 | RE2 regex = new RE2("(www\\.)?dandelion\\.eu"); 42 | 43 | RE2Matcher matcher = regex.matcher("https://www.dandelion.eu/datatxt", false); 44 | assertTrue(matcher.find()); 45 | assertEquals("www.dandelion.eu", matcher.group()); 46 | assertEquals(1, matcher.groupCount()); 47 | } 48 | 49 | @Test 50 | public void testMatchGroups() throws Exception { 51 | RE2 regex = new RE2("(www\\.)?dandelion\\.(eu)"); 52 | RE2Matcher matcher = regex.matcher("€€ https://dandelion.eu/datatxt - www.dandelion.eu/datatxt"); 53 | assertTrue(matcher.findNext()); 54 | assertEquals("dandelion.eu", matcher.group()); 55 | assertNull(matcher.group(1)); 56 | assertEquals("eu", matcher.group(2)); 57 | 58 | assertTrue(matcher.findNext()); 59 | assertEquals("www.dandelion.eu", matcher.group()); 60 | assertEquals("www.",matcher.group(1)); 61 | assertEquals("eu", matcher.group(2)); 62 | 63 | assertFalse(matcher.findNext()); 64 | } 65 | 66 | @Test 67 | public void testGetCaptureGroupNames() throws Exception { 68 | assertEquals(1, new RE2(oneNamedGroup).getCaptureGroupNames().size()); 69 | assertEquals(2, new RE2(twoNamedGroups).getCaptureGroupNames().size()); 70 | assertEquals(3, new RE2(nestedNamedGroups).getCaptureGroupNames().size()); 71 | assertEquals(1, new RE2(optionalNamedGroup).getCaptureGroupNames().size()); 72 | 73 | for (int i = 0; i < 3; i++) { 74 | assertEquals("name"+(i+1), new RE2(nestedNamedGroups).getCaptureGroupNames().get(i)); 75 | } 76 | } 77 | 78 | @Test 79 | public void testSingleNamedCaptureGroupsTest() throws Exception { 80 | String event = "test code best log"; 81 | RE2 regex = new RE2(oneNamedGroup); 82 
| 83 | List names = regex.getCaptureGroupNames(); 84 | List namedCaptureGroups = regex.getNamedCaptureGroups(names, event); 85 | 86 | assertEquals(1, namedCaptureGroups.size()); 87 | assertEquals("name1", namedCaptureGroups.get(0).name); 88 | assertEquals("code", namedCaptureGroups.get(0).captureGroup.matchingText); 89 | } 90 | 91 | @Test 92 | public void testMultipleNamedCaptureGroupsTest() throws Exception { 93 | String event = "test code best log"; 94 | RE2 regex = new RE2(twoNamedGroups); 95 | 96 | List names = regex.getCaptureGroupNames(); 97 | List namedCaptureGroups = regex.getNamedCaptureGroups(names, event); 98 | 99 | assertEquals(2, namedCaptureGroups.size()); 100 | assertEquals("name1", namedCaptureGroups.get(0).name); 101 | assertEquals("test", namedCaptureGroups.get(0).captureGroup.matchingText); 102 | assertEquals("name2", namedCaptureGroups.get(1).name); 103 | assertEquals("de", namedCaptureGroups.get(1).captureGroup.matchingText); 104 | } 105 | 106 | @Test 107 | public void testNestedNamedCaptureGroupsTest() throws Exception { 108 | String event = "test code best log"; 109 | RE2 regex = new RE2(nestedNamedGroups); 110 | 111 | List names = regex.getCaptureGroupNames(); 112 | List namedCaptureGroups = regex.getNamedCaptureGroups(names, event); 113 | 114 | assertEquals(3, namedCaptureGroups.size()); 115 | assertEquals("name1", namedCaptureGroups.get(0).name); 116 | assertEquals("test code", namedCaptureGroups.get(0).captureGroup.matchingText); 117 | assertEquals("name2", namedCaptureGroups.get(1).name); 118 | assertEquals("test", namedCaptureGroups.get(1).captureGroup.matchingText); 119 | assertEquals("name3", namedCaptureGroups.get(2).name); 120 | assertEquals("de", namedCaptureGroups.get(2).captureGroup.matchingText); 121 | } 122 | 123 | @Test 124 | public void testOptionalNamedCaptureGroupsTest() throws Exception { 125 | String event = "hello log"; 126 | RE2 regex = new RE2(optionalNamedGroup); 127 | 128 | List names = regex.getCaptureGroupNames(); 
129 | List namedCaptureGroups = regex.getNamedCaptureGroups(names, event); 130 | 131 | assertEquals(1, namedCaptureGroups.size()); 132 | assertEquals("name1", namedCaptureGroups.get(0).name); 133 | assertEquals("hello", namedCaptureGroups.get(0).captureGroup.matchingText); 134 | 135 | String event2 = "test log"; 136 | RE2 regex2 = new RE2(optionalNamedGroup); 137 | 138 | List names2 = regex.getCaptureGroupNames(); 139 | List namedCaptureGroups2 = regex2.getNamedCaptureGroups(names2, event2); 140 | 141 | assertEquals(0, namedCaptureGroups2.size()); 142 | } 143 | 144 | @Test 145 | public void testFindNext() throws Exception { 146 | RE2 regex = new RE2("(www\\.)?dandelion\\.(eu)"); 147 | RE2Matcher matcher = regex.matcher("€€ https://dandelion.euwww.dandelion.eu"); 148 | assertTrue(matcher.findNext()); 149 | assertEquals("dandelion.eu", matcher.group()); 150 | assertNull(matcher.group(1)); 151 | assertEquals("eu", matcher.group(2)); 152 | 153 | assertTrue(matcher.findNext()); 154 | assertEquals("www.dandelion.eu", matcher.group()); 155 | assertEquals("www.",matcher.group(1)); 156 | assertEquals("eu", matcher.group(2)); 157 | 158 | assertFalse(matcher.findNext()); 159 | } 160 | 161 | @Test(expected = IllegalStateException.class) 162 | public void testFindGroupOverflow() throws Exception { 163 | 164 | RE2 regex = new RE2("(www\\.)?dandelion\\.eu"); 165 | 166 | RE2Matcher matcher = regex.matcher("https://dandelion.eu/datatxt"); 167 | assertTrue(matcher.find()); 168 | matcher.group(2); 169 | 170 | } 171 | 172 | @Test 173 | public void testFindStart() throws Exception { 174 | 175 | RE2 regex = new RE2("(www\\.)?dandelion\\.eu"); 176 | 177 | RE2Matcher matcher = regex.matcher("Datatxt: https://dandelion.eu/datatxt - the named entity extraction tool by Spaziodati"); 178 | assertTrue(matcher.find()); 179 | assertTrue(matcher.find(17)); 180 | assertFalse(matcher.find(18)); 181 | assertFalse(matcher.find(40)); 182 | } 183 | 184 | @Test 185 | public void testFindEnd() throws 
Exception { 186 | 187 | RE2 regex = new RE2("(www\\.)?dandelion\\.eu"); 188 | 189 | RE2Matcher matcher = regex.matcher("Datatxt: https://dandelion.eu/datatxt -"); 190 | assertTrue(matcher.find()); 191 | assertTrue(matcher.find(17,29)); 192 | assertFalse(matcher.find(18,29)); 193 | assertTrue(matcher.find(0, 39)); 194 | } 195 | 196 | @Test 197 | public void testOffsetSpecialChars() throws Exception { 198 | 199 | RE2 regex = new RE2("dandelion\\.eu"); 200 | 201 | String[] input = { 202 | "Dàtàtxt: https://dandelion.eu/datatxt - ", //offset 2 203 | "D€t€t€€: https://dandelion.eu/datatxt - ", //offset 3 204 | "€€€€€€€: https://dandelion.eu/datatxt €€€", //offset 3 205 | }; 206 | 207 | for (String i : input) { 208 | RE2Matcher matcher = regex.matcher(i); 209 | assertTrue(i, matcher.find()); 210 | assertEquals(i, "dandelion.eu", matcher.group()); 211 | assertTrue(i, matcher.find(17)); 212 | assertEquals(i, "dandelion.eu", matcher.group()); 213 | assertFalse(i, matcher.find(18)); 214 | } 215 | 216 | } 217 | @Test 218 | public void testSurrogateChars() throws Exception { 219 | 220 | RE2 regex = new RE2("(www\\.)?dandelion\\.eu"); 221 | 222 | String[] input = { 223 | "D\uD801\uDC28t\uD801\uDC28t\uD801\uDC28€: https://dandelion.eu/datatxt - ", //surrogate 224 | "D\uD83D\uDC3Et\uD83D\uDC3Et\uD83D\uDC3E€: https://dandelion.eu/datatxt - ", //surrogate 225 | }; 226 | 227 | for (String i : input) { 228 | RE2Matcher matcher = regex.matcher(i); 229 | assertTrue(i, matcher.find()); 230 | assertEquals(i, "dandelion.eu", matcher.group()); 231 | assertTrue(i, matcher.find(20)); 232 | assertEquals(i, "dandelion.eu", matcher.group()); 233 | assertFalse(i, matcher.find(21)); 234 | } 235 | 236 | } 237 | 238 | 239 | @Test 240 | public void testEmptyStrings() throws Exception { 241 | RE2 regex = new RE2("(www\\.)?dandelion\\.eu"); 242 | assertFalse(regex.matcher("").find()); 243 | assertFalse(regex.matcher("a").find()); 244 | assertFalse(regex.matcher("€").find()); 245 | } 246 | 247 | 
@Test() 248 | public void testIterator() throws Exception { 249 | int c = 0; 250 | for (MatchResult mr : new RE2("t").matcher("input text")) c++; 251 | assertEquals(3, c); 252 | } 253 | 254 | 255 | 256 | @Test(expected = IllegalStateException.class) 257 | public void testClosed() throws Exception { 258 | RE2Matcher m = new RE2("test").matcher("input text"); 259 | m.close(); 260 | m.find(); 261 | } 262 | 263 | @Test() 264 | public void testTryWith() throws Exception { 265 | RE2 r = new RE2("t"); 266 | try (RE2Matcher m = r.matcher("input text")) { 267 | assertTrue(m.findNext()); 268 | assertTrue(m.findNext()); 269 | assertTrue(m.findNext()); 270 | assertFalse(m.findNext()); 271 | } 272 | } 273 | 274 | 275 | @Test(expected = IllegalStateException.class) 276 | public void testReClosed() throws Exception { 277 | RE2 regex = new RE2("test"); 278 | RE2Matcher m = regex.matcher("input text"); 279 | regex.close(); 280 | m.find(); 281 | } 282 | 283 | @Test 284 | public void testOptionsList() throws Exception { 285 | RE2 regex = new RE2("TGIF?", 286 | Options.CASE_INSENSITIVE, 287 | Options.ENCODING(Encoding.UTF8), 288 | Options.PERL_CLASSES(false) 289 | ); 290 | } 291 | 292 | @Test 293 | public void testMoreGroups() throws Exception { 294 | String pattern = ""; 295 | char c = 'a'; 296 | for (int i=0; i<25; i++) { 297 | char cnext = (char)(c+i); 298 | if (i>0) pattern += "|("+cnext+")"; 299 | else pattern += "("+ cnext +")"; 300 | } 301 | 302 | RE2Matcher matcher = new RE2(pattern).matcher("a very beatiful string"); 303 | assertTrue(matcher.findNext()); //a 304 | assertTrue(matcher.findNext()); //v 305 | assertEquals("v", matcher.group()); 306 | assertEquals("v", matcher.group('v' - 'a' + 1)); 307 | } 308 | 309 | static String rnd(int len) { 310 | Random r = new Random(); 311 | String s = new String(); 312 | for (int i=0; i input = new ArrayList() {{ 22 | add(Range.of(0,10)); 23 | add(Range.of(10,16)); 24 | add(Range.of(10,11)); 25 | add(Range.of(20,23)); 26 | 
add(Range.of(22,23)); 27 | }}; 28 | Set ignore = new HashSet() {{ 29 | add(2); 30 | add(4); 31 | }}; 32 | 33 | ArrayList output = RE2MatcherUnicodeWord.patchGroups(input, ignore); 34 | assertEquals(3, output.size()); 35 | assertEquals(input.get(0), output.get(0)); 36 | // removed one at the beginning 37 | assertEquals(11, output.get(1).start); 38 | assertEquals(16, output.get(1).end); 39 | // removed one at the end 40 | assertEquals(20, output.get(2).start); 41 | assertEquals(22, output.get(2).end); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/test/java/com/logentries/re2/TestRandomExpr.java: -------------------------------------------------------------------------------- 1 | package com.logentries.re2; 2 | 3 | import org.junit.Test; 4 | 5 | import java.util.Arrays; 6 | import java.util.List; 7 | import java.util.regex.Pattern; 8 | 9 | import static org.junit.Assert.assertEquals; 10 | import static org.junit.Assert.fail; 11 | 12 | public class TestRandomExpr { 13 | private static class InterruptibleCharSequence implements CharSequence { 14 | CharSequence inner; 15 | 16 | public InterruptibleCharSequence(CharSequence inner) { 17 | super(); 18 | this.inner = inner; 19 | } 20 | 21 | public char charAt(int index) { 22 | if (Thread.interrupted()) { // clears flag if set 23 | throw new RuntimeException(new InterruptedException()); 24 | } 25 | return inner.charAt(index); 26 | } 27 | 28 | public int length() { 29 | return inner.length(); 30 | } 31 | 32 | public CharSequence subSequence(int start, int end) { 33 | return new InterruptibleCharSequence(inner.subSequence(start, end)); 34 | } 35 | 36 | @Override 37 | public String toString() { 38 | return inner.toString(); 39 | } 40 | } 41 | 42 | private List mAlphabet = Arrays.asList("aaa", "b", "ccc"); 43 | 44 | private GenRegExpr genRegExpr = new GenRegExpr(mAlphabet, 3, 12); 45 | private GenString genString = new GenString(mAlphabet, 15); 46 | 47 | private static 
Boolean applyMatches(final Pattern pattern, final String str) { 48 | class ApplyMatches implements Runnable { 49 | private volatile Boolean res = null; 50 | public Boolean getRes() { 51 | return res; 52 | } 53 | 54 | public void run() { 55 | res = pattern.matcher(new InterruptibleCharSequence(str)).matches(); 56 | } 57 | } 58 | ApplyMatches am = new ApplyMatches(); 59 | Thread thread = new Thread(am); 60 | thread.start(); 61 | try { 62 | thread.join(1500); 63 | } catch (InterruptedException ex) { 64 | } 65 | final Boolean res = am.getRes(); 66 | if (res == null) { 67 | thread.interrupt(); 68 | try { 69 | thread.join(); 70 | } catch (InterruptedException ex) { 71 | } 72 | } 73 | return res; 74 | } 75 | 76 | private static Boolean applyFind(final Pattern pattern, final String str) { 77 | class ApplyFind implements Runnable { 78 | private volatile Boolean res = null; 79 | public Boolean getRes() { 80 | return res; 81 | } 82 | 83 | public void run() { 84 | res = pattern.matcher(new InterruptibleCharSequence(str)).find(); 85 | } 86 | } 87 | ApplyFind af = new ApplyFind(); 88 | Thread thread = new Thread(af); 89 | thread.start(); 90 | try { 91 | thread.join(1500); 92 | } catch (InterruptedException ex) { 93 | } 94 | final Boolean res = af.getRes(); 95 | if (res == null) { 96 | thread.interrupt(); 97 | try { 98 | thread.join(); 99 | } catch (InterruptedException ex) { 100 | } 101 | } 102 | return res; 103 | } 104 | 105 | private void compareOneRandExpr(final int index) { 106 | final String regExprStr = genRegExpr.next(); 107 | System.err.println("Runnig i = " + index + "\t" + regExprStr); 108 | 109 | Pattern pattern = Pattern.compile(regExprStr); 110 | System.err.println("\t+Pattern.compile()"); 111 | RE2 re2 = null; 112 | try { 113 | re2 = new RE2(regExprStr); 114 | } catch (RegExprException e) { 115 | System.err.println("Cannot construct re: [" + regExprStr + "] : " + e.getMessage()); 116 | fail("Unexpected error in RE"); 117 | } 118 | System.err.println("\t+new 
RE2()"); 119 | 120 | for (int i = 0; i < 25; ++i) { 121 | final String str = genString.next(); 122 | System.err.println("\t" + str); 123 | final Boolean matches = applyMatches(pattern, str); 124 | if (matches == null) { 125 | System.err.println("Timeout of matches(.) for re=[" + regExprStr + "] and string=[" + str + "]"); 126 | } 127 | System.err.println("\t\tPattern.matches()"); 128 | final boolean re2_matches = re2.fullMatch(str); 129 | System.err.println("\t\tRE2.matches()"); 130 | final Boolean found = applyFind(pattern, str); 131 | if (found == null) { 132 | System.err.println("Timeout of find(.) for re=[" + regExprStr + "] and string=[" + str + "]"); 133 | } 134 | System.err.println("\t\tPattern.find()"); 135 | final boolean re2_found = re2.partialMatch(str); 136 | System.err.println("\t\tRE2.partialMatch()"); 137 | if ((matches != null && matches != re2_matches) || (found != null && found != re2_found)) { 138 | System.err.println("reg-expr:[" + regExprStr + "]; str:[" + str + "] " + matches + "\t" + re2_matches + "\t" + found + "\t" + re2_found); 139 | } 140 | if (matches != null) { 141 | assertEquals(matches, re2_matches); 142 | } 143 | if (found != null) { 144 | assertEquals(found, re2_found); 145 | } 146 | } 147 | re2.dispoze(); 148 | } 149 | 150 | public void testRandExpr() { 151 | for (int i = 0; i < 200; ++i) { 152 | compareOneRandExpr(i); 153 | } 154 | } 155 | 156 | @Test 157 | public void testRandRE2() { 158 | class Worker implements Runnable { 159 | public void run() { 160 | for (int i = 0; i < 2000; ++i) { 161 | // runOneRandRE2(i); 162 | compareOneRandExpr(i); 163 | } 164 | } 165 | } 166 | 167 | Thread[] ths = new Thread[8]; 168 | for (int i = 0; i < ths.length; ++i) { 169 | (ths[i] = new Thread(new Worker())).start(); 170 | } 171 | for (int i = 0; i < ths.length; ++i) { 172 | try { 173 | ths[i].join(); 174 | } catch (InterruptedException e) { 175 | } 176 | } 177 | } 178 | 179 | public void runOneRandRE2(final int index) { 180 | final String 
regExprStr = genRegExpr.next(); 181 | System.err.println("Runnig i = " + index + "\t" + regExprStr); 182 | 183 | 184 | RE2 re2 = null; 185 | try { 186 | new RE2(regExprStr); 187 | } catch (RegExprException e) { 188 | System.err.println("Cannot construct re: [" + regExprStr + "] : " + e.getMessage()); 189 | fail("Unexpected error in RE"); 190 | } 191 | System.err.println("\t+new RE2()"); 192 | 193 | for (int i = 0; i < 25; ++i) { 194 | final String str = genString.next(); 195 | System.err.println("\t" + str); 196 | final boolean re2_matches = re2.fullMatch(str); 197 | System.err.println("\t\tRE2.matches()"); 198 | final boolean re2_found = re2.partialMatch(str); 199 | System.err.println("\t\tRE2.partialMatch()"); 200 | System.err.println("reg-expr:[" + regExprStr + "]; str:[" + str + "] " + re2_matches + "\t" + re2_found); 201 | } 202 | re2.dispoze(); 203 | } 204 | } 205 | -------------------------------------------------------------------------------- /src/test/java/com/logentries/re2/TestThreads.java: -------------------------------------------------------------------------------- 1 | package com.logentries.re2; 2 | 3 | import org.junit.Test; 4 | import static org.junit.Assert.assertEquals; 5 | import static org.junit.Assert.assertTrue; 6 | import static org.junit.Assert.assertFalse; 7 | import static org.junit.Assert.assertNotNull; 8 | import static org.junit.Assert.fail; 9 | 10 | public class TestThreads { 11 | private RE2 safeNewRE2(final String regExprStr, final Options options) { 12 | try { 13 | if (options == null) { 14 | return new RE2(regExprStr); 15 | } else { 16 | return new RE2(regExprStr, options); 17 | } 18 | } catch (RegExprException e) { 19 | System.err.println("Cannot construct re: [" + regExprStr + "] : " + e.getMessage()); 20 | fail("Unexpected error in RE"); 21 | return null; 22 | } 23 | } 24 | @Test 25 | public void testThreads() { 26 | class Worker implements Runnable { 27 | public void xxx() { 28 | final boolean res1 = RE2.fullMatch("hello", 
"(h.*o)"); 29 | assertTrue(res1); 30 | /* */ 31 | final boolean res2 = RE2.fullMatch("hello", "(h.*x)"); 32 | assertFalse(res2); 33 | /* */ 34 | final RE2 re_x = safeNewRE2("(h.*o)", null); 35 | assertNotNull(re_x); 36 | final boolean res3 = re_x.fullMatch("hello"); 37 | assertTrue(res3); 38 | final boolean res4 = re_x.fullMatch("hellx"); 39 | assertFalse(res4); 40 | re_x.dispoze(); 41 | final RE2 re_y = safeNewRE2("(h.*o)", new Options()); 42 | assertNotNull(re_y); 43 | re_y.dispoze(); 44 | /* */ 45 | int[] out00 = new int[1]; 46 | final boolean res5 = RE2.fullMatch("1256", "(\\d+)", out00); 47 | assertTrue(res5); 48 | assertEquals(1256, out00[0]); 49 | /* */ 50 | int[] out10 = new int[1]; 51 | String[] out11 = new String[1]; 52 | long[] out12 = new long[1]; 53 | final boolean res6 = RE2.fullMatch("1256xsssx136985478256", "(\\d+)x(\\w+)x(\\d+)", out10, out11, out12); 54 | assertTrue(res6); 55 | assertEquals(1256, out10[0]); 56 | assertEquals("sss", out11[0]); 57 | assertEquals(136985478256L, out12[0]); 58 | /* */ 59 | int[] out20 = new int[3]; 60 | String[] out21 = new String[2]; 61 | long[] out22 = new long[4]; 62 | final boolean res7 = RE2.fullMatch("1256-34-567xstring-everythingworks@13-698-547-12345678256", "(\\d+)-(\\d+)-(\\d+)x(\\w+)-(\\w+)@(\\d+)-(\\d+)-(\\d+)-(\\d+)", out20, out21, out22); 63 | assertTrue(res7); 64 | // out20 65 | assertEquals(1256, out20[0]); 66 | assertEquals(34, out20[1]); 67 | assertEquals(567, out20[2]); 68 | // out21 69 | assertEquals("string", out21[0]); 70 | assertEquals("everythingworks", out21[1]); 71 | // out22 72 | assertEquals(13L, out22[0]); 73 | assertEquals(698L, out22[1]); 74 | assertEquals(547L, out22[2]); 75 | assertEquals(12345678256L, out22[3]); 76 | } 77 | public void run() { 78 | for (int i = 0; i < 2000; ++i) { 79 | xxx(); 80 | } 81 | } 82 | } 83 | 84 | Thread[] ths = new Thread[12]; 85 | System.err.println("Running test consisting of " + ths.length + " threads"); 86 | for (int i = 0; i < ths.length; ++i) { 87 | 
(ths[i] = new Thread(new Worker())).start(); 88 | } 89 | for (int i = 0; i < ths.length; ++i) { 90 | try { 91 | ths[i].join(); 92 | } catch (InterruptedException e) { 93 | } 94 | } 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/test/java/com/logentries/re2/TestUnicode.java: -------------------------------------------------------------------------------- 1 | package com.logentries.re2; 2 | 3 | import org.junit.Ignore; 4 | import org.junit.Test; 5 | 6 | import java.util.HashMap; 7 | import java.util.Map; 8 | 9 | import static org.junit.Assert.*; 10 | 11 | /** 12 | * Created by edt on 1/15/15. 13 | */ 14 | public class TestUnicode { 15 | 16 | @Ignore 17 | @Test 18 | public void testReplace ( ) { 19 | Map cases = new HashMap() {{ 20 | put("", ""); 21 | put("\\b", RE2.BOUNDARY_REPLACE); 22 | put("\\w", RE2.WORD_REPLACE); 23 | put("\\W", RE2.NON_WORD_REPLACE); 24 | put("\\b\\b", RE2.BOUNDARY_REPLACE + RE2.BOUNDARY_REPLACE); 25 | }}; 26 | 27 | for ( String s : cases.keySet() ) { 28 | RE2 re = RE2.compile(s); 29 | assertEquals(cases.get(s), re.patchUnicodeWord(s)); 30 | } 31 | } 32 | 33 | @Test 34 | public void testSimpleUnicode () throws Exception { 35 | RE2 regex = new RE2("\\w+", Options.UNICODE_WORD); 36 | 37 | Map cases = new HashMap() {{ 38 | put("pio pio pio", new String[]{"pio", "pio", "pio"}); 39 | put("pio pio perché é ᴘrêché", 40 | new String[]{"pio", "pio", "perché", "é", "ᴘrêché"}); 41 | put("='.12é ᴘrêc2hé///pupp2a perché1 ", 42 | new String[]{"12é", "ᴘrêc2hé", "pupp2a", "perché1"}); 43 | }}; 44 | 45 | 46 | for ( String s : cases.keySet() ) { 47 | RE2Matcher matcher = regex.matcher(s); 48 | String[] matches = cases.get(s); 49 | 50 | for (String match : matches) { 51 | assertTrue(matcher.findNext()); 52 | assertEquals(match, matcher.group()); 53 | } 54 | } 55 | } 56 | 57 | public static class TestWordBoundaries { 58 | @Test 59 | public void testIgnoreMiddle() throws Exception { 60 | RE2 regex = new 
RE2("(\\b\\w+) asd (\\w+\\b)", Options.UNICODE_WORD); 61 | RE2Matcher matcher = regex.matcher("a éé asd éé "); 62 | assertTrue(matcher.findNext()); 63 | 64 | assertEquals("éé asd éé", matcher.group()); 65 | assertEquals("éé", matcher.group(1)); 66 | assertEquals("éé", matcher.group(2)); 67 | 68 | matcher = regex.matcher("ééasdéé"); 69 | assertFalse(matcher.find()); 70 | } 71 | 72 | @Test 73 | public void testIgnoreEnd() throws Exception { 74 | RE2 regex = new RE2("((\\w+)\\b)", Options.UNICODE_WORD); 75 | RE2Matcher matcher = regex.matcher("éaéa"); 76 | assertTrue(matcher.findNext()); 77 | assertEquals("éaéa", matcher.group()); 78 | 79 | matcher = regex.matcher("éaéa.[',"); 80 | assertTrue(matcher.findNext()); 81 | assertEquals("éaéa", matcher.group()); 82 | } 83 | 84 | @Test 85 | public void testIgnoreBeginning() throws Exception { 86 | RE2 regex = new RE2("(\\b\\w+)", Options.UNICODE_WORD); 87 | RE2Matcher matcher = regex.matcher("éaéa"); 88 | assertTrue(matcher.findNext()); 89 | assertEquals("éaéa", matcher.group()); 90 | 91 | matcher = regex.matcher("/,/.,é1é"); 92 | assertTrue(matcher.findNext()); 93 | assertEquals("é1é", matcher.group()); 94 | } 95 | 96 | @Test 97 | public void moreTests () throws Exception { 98 | RE2 regex = new RE2("\\b\\w+\\b", Options.UNICODE_WORD); 99 | 100 | Map cases = new HashMap() {{ 101 | put("pio pio pio", new String[]{"pio", "pio", "pio"}); 102 | put("pio pio perché é ᴘrêché", new String[]{"pio", "pio", "perché", "é", "ᴘrêché"}); 103 | put("='.12é ᴘrêc2hé///pupp2a perché1 ", new String[]{"12é", "ᴘrêc2hé", "pupp2a", "perché1"}); 104 | put(" d1sé=... 
ᴘrêc2hé ", new String[]{"d1sé", "ᴘrêc2hé"}); 105 | }}; 106 | 107 | 108 | for ( String s : cases.keySet() ) { 109 | RE2Matcher matcher = regex.matcher(s); 110 | String[] matches = cases.get(s); 111 | 112 | for (String match : matches) { 113 | assertTrue(matcher.findNext()); 114 | assertEquals(match, matcher.group()); 115 | } 116 | } 117 | } 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /src/test/java/com/logentries/re2/TestUtf8CharOffset.java: -------------------------------------------------------------------------------- 1 | package com.logentries.re2; 2 | 3 | import org.junit.Test; 4 | import org.junit.runner.RunWith; 5 | import org.junit.runners.Parameterized; 6 | 7 | import java.nio.ByteBuffer; 8 | import java.nio.CharBuffer; 9 | import java.nio.charset.Charset; 10 | import java.nio.charset.StandardCharsets; 11 | import java.util.ArrayList; 12 | import java.util.Arrays; 13 | import java.util.Collection; 14 | import java.util.List; 15 | 16 | import static java.util.Arrays.asList; 17 | import static org.hamcrest.CoreMatchers.equalTo; 18 | import static org.junit.Assert.assertEquals; 19 | import static org.junit.Assert.assertThat; 20 | 21 | @RunWith(Parameterized.class) 22 | public class TestUtf8CharOffset { 23 | 24 | @Parameterized.Parameters(name = "{0}") 25 | public static Collection data() { 26 | // input char -> byte byte -> char 27 | 28 | return asList(new Object[][]{{ 29 | "abcd efg", asList(0,1,2,3,4,5,6,7), asList(0,1,2,3,4,5,6,7), 30 | },{ 31 | "abcd èfg", asList(0,1,2,3,4,5,7,8), asList(0,1,2,3,4,5,5,6,7) 32 | },{ 33 | "abcd €fg", asList(0,1,2,3,4,5,8,9), asList(0,1,2,3,4,5,5,5,6,7) 34 | },{ 35 | "abcd €€€", asList(0,1,2,3,4,5,8,11), asList(0,1,2,3,4,5,5,5,6,6,6,7,7,7) 36 | },{ 37 | "àbcd €fg", asList(0,2,3,4,5,6,9,10), asList(0,0,1,2,3,4,5,5,5,6,7) 38 | },{ 39 | "a\uD83D\uDC36cd efg", asList(0,1,1,5,6,7,8,9,10), asList(0,1,1,1,1,3,4,5,6,7,8) 40 | },{ 41 | // but why on earth we have to spend time 
to support chars!!!!! 42 | "\uD83D\uDC36\uD83D\uDC3Ecd efg", asList(0,0,4,4,8,9,10,11,12,13), asList(0,0,0,0,2,2,2,2,4,5,6,7,8,9) 43 | },{ 44 | "\uD83D\uDD0D sp", asList(0,0,4,5,6), asList(0,0,0,0,2,3,4) 45 | },{ 46 | "â€", asList(0,2), asList(0,0,1,1,1) 47 | }}); 48 | } 49 | 50 | @Parameterized.Parameter(value = 0) 51 | public String input; 52 | @Parameterized.Parameter(value = 1) 53 | public List char2byte; 54 | @Parameterized.Parameter(value = 2) 55 | public List byte2char; 56 | 57 | @Test 58 | public void test() throws Exception { 59 | 60 | UTF8CharOffset offset = new UTF8CharOffset(input); 61 | byte[] utf8 = input.getBytes("UTF-8"); 62 | 63 | assertEquals("check test consistency: ", utf8.length, byte2char.size()); 64 | assertEquals("check test consistency: ", input.length(), char2byte.size()); 65 | 66 | List myChar2Byte = new ArrayList<>(); 67 | for (int i=0; i myByte2Char = new ArrayList<>(); 69 | for (int i=0; i