11 |
12 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ZSCReading
2 |
3 | A source code reading app for Android. It assumes the target source code is archived as a single zip file.
4 |
5 | This app first indexes the whole zip contents, then performs regular expression queries.
6 | This app is an Android port of [Code Search](https://github.com/google/codesearch) plus a GUI client.
7 |
8 | ## ZSCReading is built using open source software:
9 |
10 | - io.reactivex.rxjava2:rxjava
11 | - io.reactivex.rxjava2:rxandroid
12 | - [re2j-td](https://github.com/sopel39/re2j-td) we forked to use byte array and run on Android
13 | - JavaPrettify
14 |
15 | Also, we use the codesearch logic.
16 |
--------------------------------------------------------------------------------
/app/.gitignore:
--------------------------------------------------------------------------------
1 | /build
2 |
--------------------------------------------------------------------------------
/app/build.gradle:
--------------------------------------------------------------------------------
// Build script for the app module: an Android application with Kotlin support.
apply plugin: 'com.android.application'
apply plugin: 'kotlin-android'

android {
    compileSdkVersion 27
    buildToolsVersion '28.0.3'
    defaultConfig {
        // Base application id; the debug build type below appends a suffix to it.
        applicationId "com.livejournal.karino2.zipsourcecodereading"
        minSdkVersion 19
        targetSdkVersion 27
        versionCode 3
        versionName "0.3"
        testInstrumentationRunner "android.support.test.runner.AndroidJUnitRunner"
    }
    buildTypes {
        release {
            // Minification is off; the ProGuard file list is still declared.
            minifyEnabled false
            proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro'
        }
        debug {
            // Debug builds get the application id "<applicationId>.debug".
            applicationIdSuffix ".debug"
        }

        // Same settings as release, but signed with the debug signing config.
        debugRelease {
            minifyEnabled false
            proguardFiles getDefaultProguardFile('proguard-android.txt'), 'proguard-rules.pro'
            signingConfig signingConfigs.debug
        }

    }
}

dependencies {
    // Every jar placed in the module's libs/ directory.
    implementation fileTree(include: ['*.jar'], dir: 'libs')
    // Espresso for instrumented tests, without its support-annotations dependency.
    androidTestImplementation('com.android.support.test.espresso:espresso-core:2.2.2', {
        exclude group: 'com.android.support', module: 'support-annotations'
    })
    // $kotlin_version is not defined in this script; it has to come from elsewhere in the build.
    implementation "org.jetbrains.kotlin:kotlin-stdlib-jdk7:$kotlin_version"
    implementation 'com.android.support:appcompat-v7:27.1.1'
    implementation 'com.android.support:recyclerview-v7:27.1.1'
    implementation 'com.android.support:support-compat:27.1.1'
    implementation 'com.android.support.constraint:constraint-layout:1.0.2'
    implementation 'io.reactivex.rxjava2:rxjava:2.1.3'
    implementation 'io.reactivex.rxjava2:rxandroid:2.0.1'
    testImplementation 'junit:junit:4.12'
}
repositories {
    mavenCentral()
}
50 |
--------------------------------------------------------------------------------
/app/libs/JavaPrettify-1.2.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/karino2/ZipSourceCodeReading/fee6b82247e233b0f816e30a4e68e6c9ed0b76d9/app/libs/JavaPrettify-1.2.1.jar
--------------------------------------------------------------------------------
/app/proguard-rules.pro:
--------------------------------------------------------------------------------
1 | # Add project specific ProGuard rules here.
2 | # By default, the flags in this file are appended to flags specified
3 | # in C:\Users\_\AppData\Local\Android\sdk/tools/proguard/proguard-android.txt
4 | # You can edit the include path and order by changing the proguardFiles
5 | # directive in build.gradle.
6 | #
7 | # For more details, see
8 | # http://developer.android.com/guide/developing/tools/proguard.html
9 |
10 | # Add any project specific keep options here:
11 |
12 | # If your project uses WebView with JS, uncomment the following
13 | # and specify the fully qualified class name to the JavaScript interface
14 | # class:
15 | #-keepclassmembers class fqcn.of.javascript.interface.for.webview {
16 | # public *;
17 | #}
18 |
19 | # Uncomment this to preserve the line number information for
20 | # debugging stack traces.
21 | #-keepattributes SourceFile,LineNumberTable
22 |
23 | # If you keep the line number information, uncomment this to
24 | # hide the original source file name.
25 | #-renamesourcefileattribute SourceFile
26 |
--------------------------------------------------------------------------------
/app/src/androidTest/java/com/livejournal/karino2/zipsourcecodereading/ExampleInstrumentedTest.java:
--------------------------------------------------------------------------------
1 | package com.livejournal.karino2.zipsourcecodereading;
2 |
3 | import android.content.Context;
4 | import android.support.test.InstrumentationRegistry;
5 | import android.support.test.runner.AndroidJUnit4;
6 |
7 | import org.junit.Test;
8 | import org.junit.runner.RunWith;
9 |
10 | import static org.junit.Assert.*;
11 |
12 | /**
13 | * Instrumentation test, which will execute on an Android device.
14 | *
15 | * @see Testing documentation
16 | */
17 | @RunWith(AndroidJUnit4.class)
18 | public class ExampleInstrumentedTest {
19 | @Test
20 | public void useAppContext() throws Exception {
21 | // Context of the app under test.
22 | Context appContext = InstrumentationRegistry.getTargetContext();
23 |
24 | assertEquals("karino2.livejournal.com.zipsourcecodereading", appContext.getPackageName());
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/app/src/main/AndroidManifest.xml:
--------------------------------------------------------------------------------
1 |
2 |
4 |
5 |
6 |
7 |
8 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
30 |
32 |
35 |
36 |
37 |
38 |
39 |
--------------------------------------------------------------------------------
/app/src/main/java/com/google/re2j/CharGroup.java:
--------------------------------------------------------------------------------
1 | // GENERATED BY make_perl_groups.pl; DO NOT EDIT.
2 | // make_perl_groups.pl >perl_groups.go
3 |
4 | package com.google.re2j;
5 |
6 | import java.util.HashMap;
7 |
/**
 * A named character class: a sign and a flat table of inclusive {@code lo, hi} range pairs.
 *
 * <p>The same range table is registered twice per class, once with sign {@code +1} for the
 * class itself (for example {@code \d}) and once with sign {@code -1} for its negated form
 * (for example {@code \D}).
 */
class CharGroup {

  final int sign;
  final int[] cls;

  private CharGroup(int sign, int[] cls) {
    this.sign = sign;
    this.cls = cls;
  }

  private static final int[] code1 = { /* \d */
    0x30, 0x39,
  };

  private static final int[] code2 = { /* \s */
    0x9, 0xa,
    0xc, 0xd,
    0x20, 0x20,
  };

  private static final int[] code3 = { /* \w */
    0x30, 0x39,
    0x41, 0x5a,
    0x5f, 0x5f,
    0x61, 0x7a,
  };

  /** Perl-style escapes ({@code \d}, {@code \D}, ...) keyed by their two-character spelling. */
  static final HashMap<String, CharGroup> PERL_GROUPS =
      new HashMap<>();

  static {
    PERL_GROUPS.put("\\d", new CharGroup(+1, code1));
    PERL_GROUPS.put("\\D", new CharGroup(-1, code1));
    PERL_GROUPS.put("\\s", new CharGroup(+1, code2));
    PERL_GROUPS.put("\\S", new CharGroup(-1, code2));
    PERL_GROUPS.put("\\w", new CharGroup(+1, code3));
    PERL_GROUPS.put("\\W", new CharGroup(-1, code3));
  }

  private static final int[] code4 = { /* [:alnum:] */
    0x30, 0x39,
    0x41, 0x5a,
    0x61, 0x7a,
  };

  private static final int[] code5 = { /* [:alpha:] */
    0x41, 0x5a,
    0x61, 0x7a,
  };

  private static final int[] code6 = { /* [:ascii:] */
    0x0, 0x7f,
  };

  private static final int[] code7 = { /* [:blank:] */
    0x9, 0x9,
    0x20, 0x20,
  };

  private static final int[] code8 = { /* [:cntrl:] */
    0x0, 0x1f,
    0x7f, 0x7f,
  };

  private static final int[] code9 = { /* [:digit:] */
    0x30, 0x39,
  };

  private static final int[] code10 = { /* [:graph:] */
    0x21, 0x7e,
  };

  private static final int[] code11 = { /* [:lower:] */
    0x61, 0x7a,
  };

  private static final int[] code12 = { /* [:print:] */
    0x20, 0x7e,
  };

  private static final int[] code13 = { /* [:punct:] */
    0x21, 0x2f,
    0x3a, 0x40,
    0x5b, 0x60,
    0x7b, 0x7e,
  };

  private static final int[] code14 = { /* [:space:] */
    0x9, 0xd,
    0x20, 0x20,
  };

  private static final int[] code15 = { /* [:upper:] */
    0x41, 0x5a,
  };

  private static final int[] code16 = { /* [:word:] */
    0x30, 0x39,
    0x41, 0x5a,
    0x5f, 0x5f,
    0x61, 0x7a,
  };

  private static final int[] code17 = { /* [:xdigit:] */
    0x30, 0x39,
    0x41, 0x46,
    0x61, 0x66,
  };

  /** POSIX bracket classes keyed by name, e.g. {@code [:alpha:]} and its negation {@code [:^alpha:]}. */
  static final HashMap<String, CharGroup> POSIX_GROUPS =
      new HashMap<>();

  static {
    POSIX_GROUPS.put("[:alnum:]", new CharGroup(+1, code4));
    POSIX_GROUPS.put("[:^alnum:]", new CharGroup(-1, code4));
    POSIX_GROUPS.put("[:alpha:]", new CharGroup(+1, code5));
    POSIX_GROUPS.put("[:^alpha:]", new CharGroup(-1, code5));
    POSIX_GROUPS.put("[:ascii:]", new CharGroup(+1, code6));
    POSIX_GROUPS.put("[:^ascii:]", new CharGroup(-1, code6));
    POSIX_GROUPS.put("[:blank:]", new CharGroup(+1, code7));
    POSIX_GROUPS.put("[:^blank:]", new CharGroup(-1, code7));
    POSIX_GROUPS.put("[:cntrl:]", new CharGroup(+1, code8));
    POSIX_GROUPS.put("[:^cntrl:]", new CharGroup(-1, code8));
    POSIX_GROUPS.put("[:digit:]", new CharGroup(+1, code9));
    POSIX_GROUPS.put("[:^digit:]", new CharGroup(-1, code9));
    POSIX_GROUPS.put("[:graph:]", new CharGroup(+1, code10));
    POSIX_GROUPS.put("[:^graph:]", new CharGroup(-1, code10));
    POSIX_GROUPS.put("[:lower:]", new CharGroup(+1, code11));
    POSIX_GROUPS.put("[:^lower:]", new CharGroup(-1, code11));
    POSIX_GROUPS.put("[:print:]", new CharGroup(+1, code12));
    POSIX_GROUPS.put("[:^print:]", new CharGroup(-1, code12));
    POSIX_GROUPS.put("[:punct:]", new CharGroup(+1, code13));
    POSIX_GROUPS.put("[:^punct:]", new CharGroup(-1, code13));
    POSIX_GROUPS.put("[:space:]", new CharGroup(+1, code14));
    POSIX_GROUPS.put("[:^space:]", new CharGroup(-1, code14));
    POSIX_GROUPS.put("[:upper:]", new CharGroup(+1, code15));
    POSIX_GROUPS.put("[:^upper:]", new CharGroup(-1, code15));
    POSIX_GROUPS.put("[:word:]", new CharGroup(+1, code16));
    POSIX_GROUPS.put("[:^word:]", new CharGroup(-1, code16));
    POSIX_GROUPS.put("[:xdigit:]", new CharGroup(+1, code17));
    POSIX_GROUPS.put("[:^xdigit:]", new CharGroup(-1, code17));
  }

}
151 |
--------------------------------------------------------------------------------
/app/src/main/java/com/google/re2j/DFAMachine.java:
--------------------------------------------------------------------------------
1 | // Copyright 2015 The RE2 Authors. All rights reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 |
5 | // Original RE2 source here:
6 | // https://github.com/google/re2/blob/master/re2/dfa.cc
7 |
8 | package com.google.re2j;
9 |
10 | import com.google.re2j.RE2.Anchor;
11 | import com.google.re2j.RE2.MatchKind;
12 |
13 | import java.util.concurrent.ConcurrentHashMap;
14 | import java.util.concurrent.atomic.AtomicInteger;
15 |
16 | import static com.google.re2j.DFA.NO_MATCH;
17 | import static com.google.re2j.RE2.Anchor.ANCHOR_START;
18 | import static com.google.re2j.RE2.MatchKind.FIRST_MATCH;
19 | import static com.google.re2j.RE2.MatchKind.LONGEST_MATCH;
20 |
21 | /**
22 | * A {@link Machine} implementation using a DFA.
23 | */
24 | class DFAMachine implements Machine {
25 |
26 | private static final int MAX_DFA_KEY = 4;
27 |
28 | @SuppressWarnings("unchecked")
29 | private final ConcurrentHashMap[] stateCache = new ConcurrentHashMap[MAX_DFA_KEY];
30 | private final AtomicInteger availableStates;
31 | @SuppressWarnings("unchecked")
32 | private final ThreadLocal[] dfaCache = new ThreadLocal[MAX_DFA_KEY];
33 | private final RE2 re2;
34 |
35 | DFAMachine(RE2 re2, int maximumNumberOfDFAStates) {
36 | this.re2 = re2;
37 | this.availableStates = new AtomicInteger(maximumNumberOfDFAStates);
38 |
39 | for (int i = 0; i < MAX_DFA_KEY; ++i) {
40 | stateCache[i] = new ConcurrentHashMap<>();
41 | }
42 |
43 | setDfaThreadLocal(LONGEST_MATCH, true);
44 | setDfaThreadLocal(LONGEST_MATCH, false);
45 | setDfaThreadLocal(FIRST_MATCH, true);
46 | setDfaThreadLocal(FIRST_MATCH, false);
47 | }
48 |
49 | @Override
50 | public boolean match(MachineInput in, int pos, Anchor anchor, int[] submatches) {
51 | // Don't ask for the location if we won't use it. SearchDFA can do extra optimizations in that case.
52 | boolean wantMatchPosition = true;
53 | if (submatches.length == 0) {
54 | wantMatchPosition = false;
55 | }
56 |
57 | // Use DFA to find exact location of match, filter out non-matches.
58 | int matchStart;
59 | int matchEnd;
60 | switch (anchor) {
61 | case UNANCHORED:
62 | matchEnd = searchDFA(in, pos, in.endPos(), anchor, wantMatchPosition, re2.matchKind, false);
63 | if (matchEnd == NO_MATCH) {
64 | return false;
65 | }
66 |
67 | // Matched. Don't care where
68 | if (!wantMatchPosition) {
69 | return true;
70 | }
71 |
72 | // SearchDFA gives match end position but we don't know where the match started. Run the
73 | // regexp backwards from end position to find the longest possible match -- that's where it started.
74 | matchStart = searchDFA(in, pos, matchEnd, ANCHOR_START, true, LONGEST_MATCH, true);
75 | if (matchStart == NO_MATCH) {
76 | throw new IllegalStateException("reverse DFA did not found a match");
77 | }
78 |
79 | break;
80 | case ANCHOR_BOTH:
81 | case ANCHOR_START:
82 | matchEnd = searchDFA(in, pos, in.endPos(), anchor, wantMatchPosition, re2.matchKind, false);
83 | if (matchEnd == NO_MATCH) {
84 | return false;
85 | }
86 | matchStart = 0;
87 | break;
88 | default:
89 | throw new IllegalStateException("bad anchor");
90 | }
91 |
92 | if (submatches.length == 2) {
93 | submatches[0] = matchStart;
94 | submatches[1] = matchEnd;
95 | } else {
96 | if (!re2.nfaMachine.get().match(in, matchStart, anchor, submatches)) {
97 | throw new IllegalStateException("NFA inconsistency");
98 | }
99 | }
100 |
101 | return true;
102 | }
103 |
104 | private int searchDFA(MachineInput in, int startPos, int endPos, Anchor anchor, boolean wantMatchPosition, MatchKind matchKind, boolean reversed) {
105 | boolean hasCarat = reversed ? anchor.isAnchorEnd() : anchor.isAnchorStart();
106 | if (hasCarat && startPos != 0) {
107 | return NO_MATCH;
108 | }
109 |
110 | // Handle end match by running an anchored longest match and then checking if it covers all of text.
111 | boolean anchored = anchor.isAnchorStart();
112 | boolean endMatch = false;
113 | if (anchor.isAnchorEnd()) {
114 | endMatch = true;
115 | matchKind = LONGEST_MATCH;
116 | }
117 |
118 | // If the caller doesn't care where the match is (just whether one exists),
119 | // then we can stop at the very first match we find, the so-called
120 | // "earliest match".
121 | boolean wantEarliestMatch = false;
122 | if (!wantMatchPosition && !endMatch) {
123 | wantEarliestMatch = true;
124 | matchKind = LONGEST_MATCH;
125 | }
126 |
127 | DFA dfa = getDfa(matchKind, reversed);
128 | int match = dfa.search(in, startPos, endPos, anchored, wantEarliestMatch);
129 |
130 | if (match == NO_MATCH) {
131 | return NO_MATCH;
132 | }
133 |
134 | if (endMatch) {
135 | if ((reversed && match != startPos) || (!reversed && match != endPos)) {
136 | return NO_MATCH;
137 | }
138 | }
139 |
140 | return match;
141 | }
142 |
143 | private DFA getDfa(MatchKind matchKind, boolean reversed) {
144 | return dfaCache[dfaKey(matchKind, reversed)].get();
145 | }
146 |
147 | private int dfaKey(MatchKind matchKind, boolean reversed) {
148 | int longestInt = matchKind == LONGEST_MATCH ? 1 : 0;
149 | int reversedInt = reversed ? 1 : 0;
150 | return longestInt | (reversedInt << 1);
151 | }
152 |
153 | private void setDfaThreadLocal(final MatchKind matchKind, final boolean reversed) {
154 | final int dfaKey = dfaKey(matchKind, reversed);
155 | final Prog prog = reversed ? re2.reverseProg : re2.prog;
156 | dfaCache[dfaKey] = new ThreadLocal() {
157 | @Override
158 | public DFA initialValue() {
159 | return new DFA(prog, matchKind, reversed, stateCache[dfaKey], availableStates);
160 | }
161 | };
162 | }
163 | }
164 |
--------------------------------------------------------------------------------
/app/src/main/java/com/google/re2j/DFAState.java:
--------------------------------------------------------------------------------
1 | // Copyright 2015 The RE2 Authors. All rights reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 |
5 | // Original RE2 source here:
6 | // https://github.com/google/re2/blob/master/re2/dfa.cc
7 |
8 | package com.google.re2j;
9 |
10 | import static com.google.re2j.DFA.FLAG_MATCH;
11 | import static com.google.re2j.DFAState.StateType.DEAD;
12 | import static com.google.re2j.DFAState.StateType.REGULAR;
13 | import static java.lang.System.arraycopy;
14 |
15 | final class DFAState {
16 | public static final DFAState DEAD_STATE = new DFAState(DEAD);
17 |
18 | public enum StateType {
19 | DEAD, // no possible match out of this state
20 | REGULAR // all other states
21 | }
22 |
23 | private final StateType type; // the state type. Lets us create DEAD_STATE and FULL_MATCH_STATE
24 | private final int[] instIndexes; // indexes into prog instructions for this state
25 | private final int flag; // empty width flags
26 | private final DFAState[] next = new DFAState[256]; // Maps bytes to the next state to follow
27 |
28 | public DFAState(int[] instIndexes, int nIndexes, int flag) {
29 | this.type = REGULAR;
30 | this.instIndexes = new int[nIndexes];
31 | arraycopy(instIndexes, 0, this.instIndexes, 0, nIndexes);
32 | this.flag = flag;
33 | }
34 |
35 | private DFAState(StateType type) {
36 | this.type = type;
37 | this.instIndexes = new int[0];
38 | this.flag = 0;
39 | }
40 |
41 | public StateType getType() {
42 | return type;
43 | }
44 |
45 | public int getFlag() {
46 | return flag;
47 | }
48 |
49 | public int[] getInstIndexes() {
50 | return instIndexes;
51 | }
52 |
53 | public boolean isMatch() {
54 | return (flag & FLAG_MATCH) != 0;
55 | }
56 |
57 | public boolean isDead() {
58 | return type == DEAD;
59 | }
60 |
61 | public DFAState getNextState(byte b) {
62 | return next[b & 0xff];
63 | }
64 |
65 | public void setNextState(byte b, DFAState state) {
66 | next[b & 0xff] = state;
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/app/src/main/java/com/google/re2j/DFAStateKey.java:
--------------------------------------------------------------------------------
1 | // Copyright 2015 The RE2 Authors. All rights reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 |
5 | // Original RE2 source here:
6 | // https://github.com/google/re2/blob/master/re2/dfa.cc
7 |
8 | package com.google.re2j;
9 |
10 | import java.util.Arrays;
11 |
12 | import static com.google.re2j.Utils.arrayFirstElementsEqual;
13 |
14 | final class DFAStateKey {
15 | private final int[] instIndexes;
16 | private final int nIndexes;
17 | private final int flag;
18 |
19 | DFAStateKey(int[] instIndexes, int nIndexes, int flag) {
20 | this.instIndexes = instIndexes;
21 | this.nIndexes = nIndexes;
22 | this.flag = flag;
23 | }
24 |
25 | @Override
26 | public boolean equals(Object o) {
27 | if (this == o) return true;
28 | if (o == null || getClass() != o.getClass()) return false;
29 |
30 | DFAStateKey that = (DFAStateKey) o;
31 |
32 | return nIndexes == that.nIndexes && flag == that.flag && arrayFirstElementsEqual(instIndexes, that.instIndexes, nIndexes);
33 | }
34 |
35 | @Override
36 | public int hashCode() {
37 | int result = Arrays.hashCode(instIndexes);
38 | result = 31 * result + nIndexes;
39 | result = 31 * result + flag;
40 | return result;
41 | }
42 | }
--------------------------------------------------------------------------------
/app/src/main/java/com/google/re2j/Inst.java:
--------------------------------------------------------------------------------
1 | // Copyright 2010 The Go Authors. All rights reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 |
5 | // Original Go source here:
6 | // http://code.google.com/p/go/source/browse/src/pkg/regexp/syntax/prog.go
7 |
8 | package com.google.re2j;
9 |
10 | import static com.google.re2j.Inst.Op.BYTE;
11 |
/**
 * A single instruction in the regular expression virtual machine.
 *
 * @see http://swtch.com/~rsc/regexp/regexp2.html
 */
class Inst {

  enum Op {
    ALT,
    ALT_MATCH,
    CAPTURE,
    EMPTY_WIDTH,
    FAIL,
    MATCH,
    NOP,
    BYTE,
    BYTE1
  }

  Op op;
  int out; // all but MATCH, FAIL
  int arg; // ALT, ALT_MATCH, CAPTURE, EMPTY_WIDTH
  byte[] byteRanges; // length==1 => exact match. Otherwise a list of [lo,hi] pairs. hi is *inclusive*.

  Inst(Op op) {
    this.op = op;
  }

  // Returns the opcode with the BYTE1 special case folded into BYTE.
  // Note that the raw value is still available through the "op" field.
  Op op() {
    if (op == Op.BYTE1) {
      return Op.BYTE;
    }
    return op;
  }

  // Reports whether this instruction matches (and consumes) b.
  // Only meaningful for the byte opcodes.
  boolean matchByte(byte b) {
    // A one-element table comes from a literal string: compare the byte itself.
    if (byteRanges.length == 1) {
      return byteRanges[0] == b;
    }

    // Otherwise walk the [lo, hi] pairs, comparing unsigned values.
    final int value = b & 0xff;
    int at = 0;
    while (at < byteRanges.length) {
      int lo = byteRanges[at] & 0xff;
      if (value < lo) {
        return false;
      }
      int hi = byteRanges[at + 1] & 0xff;
      if (value <= hi) {
        return true;
      }
      at += 2;
    }

    return false;
  }

  @Override
  public String toString() {
    final String target = " -> " + out;
    switch (op) {
      case ALT:
        return "alt" + target + ", " + arg;
      case ALT_MATCH:
        return "altmatch" + target + ", " + arg;
      case CAPTURE:
        return "cap " + arg + target;
      case EMPTY_WIDTH:
        return "empty " + arg + target;
      case MATCH:
        return "match";
      case FAIL:
        return "fail";
      case NOP:
        return "nop" + target;
      case BYTE:
        return "byte " + appendBytes() + target;
      case BYTE1:
        return "byte1 " + appendBytes() + target;
      default:
        throw new IllegalStateException("unhandled case in Inst.toString");
    }
  }

  // Renders byteRanges as a single unsigned value, or as "[lo,hi]" pairs joined by ";".
  private String appendBytes() {
    if (byteRanges.length == 1) {
      return new StringBuilder().append(byteRanges[0] & 0xff).toString();
    }

    StringBuilder text = new StringBuilder();
    for (int at = 0; at < byteRanges.length; at += 2) {
      if (at > 0) {
        text.append(";");
      }
      text.append("[");
      text.append(byteRanges[at] & 0xff);
      text.append(",");
      text.append(byteRanges[at + 1] & 0xff);
      text.append("]");
    }
    return text.toString();
  }
}
119 |
--------------------------------------------------------------------------------
/app/src/main/java/com/google/re2j/Machine.java:
--------------------------------------------------------------------------------
1 | // Copyright 2010 The Go Authors. All rights reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 |
5 | // Original Go source here:
6 | // http://code.google.com/p/go/source/browse/src/pkg/regexp/exec.go
7 |
8 | package com.google.re2j;
9 |
10 | import com.google.re2j.RE2.Anchor;
11 |
/**
 * A Machine matches the contents of a {@link MachineInput} against an RE2 instance.
 */
interface Machine {

  /**
   * Runs the machine over the input |in| starting at |pos| with the RE2 Anchor |anchor|.
   * |submatches| contains group positions after a successful match; its length tells the
   * implementation how much position information the caller wants.
   *
   * @param in the input to search
   * @param pos the position in |in| at which the search starts
   * @param anchor the anchoring mode of the search
   * @param submatches output array that receives the group positions
   * @return reports whether a match was found.
   */
  boolean match(MachineInput in, int pos, Anchor anchor, int[] submatches);
}
25 |
--------------------------------------------------------------------------------
/app/src/main/java/com/google/re2j/MachineInput.java:
--------------------------------------------------------------------------------
1 | // Copyright 2010 The Go Authors. All rights reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 |
5 | // Original Go source here:
6 | // http://code.google.com/p/go/source/browse/src/pkg/regexp/regexp.go
7 |
8 | package com.google.re2j;
9 |
10 | import com.livejournal.karino2.zipsourcecodereading.Slice;
11 |
12 |
13 | /**
14 | * MachineInput represents the UTF-8 input text supplied to the Machine. It provides one-character
15 | * lookahead.
16 | */
17 | final class MachineInput {
18 |
19 |
20 | static final byte EOF = -1;
21 |
22 | static MachineInput fromUTF8(Slice slice) {
23 | return new MachineInput(slice);
24 | }
25 |
26 | final Slice slice;
27 | /*
28 | final Object base;
29 | final long address;
30 | */
31 | final int length;
32 |
33 | MachineInput(Slice slice) {
34 | this.slice = slice;
35 | /*
36 | this.base = slice.getBase();
37 | this.address = slice.getAddress();
38 | */
39 | this.length = slice.length();
40 | }
41 |
42 | // Returns the byte at the specified index.
43 | byte getByte(int i) {
44 | if (i >= length) {
45 | return EOF;
46 | }
47 |
48 | if (i < 0) {
49 | throw new IndexOutOfBoundsException("index less than zero (" + i + ")");
50 | }
51 |
52 | return getByteUnchecked(i);
53 | }
54 |
55 | byte getByteUnchecked(int i) {
56 | return slice.getByte(i);
57 | }
58 |
59 | // Returns the index relative to |pos| at which |re2.prefix| is found
60 | // in this input stream, or a negative value if not found.
61 | int index(RE2 re2, int pos) {
62 | int i = Utils.indexOf(slice, re2.prefixUTF8, pos);
63 | return i < 0 ? i : i - pos;
64 | }
65 |
66 | // Returns the end position in the same units as step().
67 | int endPos() {
68 | return length;
69 | }
70 | }
71 |
--------------------------------------------------------------------------------
/app/src/main/java/com/google/re2j/Options.java:
--------------------------------------------------------------------------------
1 | // Copyright 2015 Teradata. All Rights Reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 |
5 | package com.google.re2j;
6 |
7 | import java.io.Serializable;
8 |
9 | import static com.google.re2j.Options.Algorithm.DFA;
10 | import static java.util.Objects.requireNonNull;
11 |
/**
 * Matching options: which algorithm to run, how many DFA states may be created, how often the
 * DFA is retried, and an optional listener for fallback events.
 *
 * <p>Instances are created through {@link #builder()}. Each call to
 * {@link OptionsBuilder#build()} returns an independent snapshot, so continuing to use the
 * builder never changes an instance that was already handed out.
 */
public final class Options implements Serializable {
  public static final Options DEFAULT_OPTIONS = builder().build();

  // Start state + end state
  private static final int MINIMUM_NUMBER_OF_DFA_STATES = 2;
  private static final int DEFAULT_NUMBER_OF_DFA_RETRIES = 5;

  private Algorithm algorithm = Algorithm.DFA;
  private EventsListener eventsListener = null;
  private int maximumNumberOfDFAStates = Integer.MAX_VALUE;
  private int numberOfDFARetries = DEFAULT_NUMBER_OF_DFA_RETRIES;

  public enum Algorithm {
    // Use DFA exclusively, throw an exception when maximum number of DFA states is reached n times.
    // DFA machine is reset each time states cache is full.
    DFA,
    // Use DFA, fallback to NFA when maximum number of DFA states is reached n times. DFA machine
    // is reset each time states cache is full.
    DFA_FALLBACK_TO_NFA,
    // use NFA exclusively
    NFA
  }

  public Algorithm getAlgorithm() {
    return algorithm;
  }

  /** Returns the configured listener, or {@code null} if none was set. */
  public EventsListener getEventsListener() {
    return eventsListener;
  }

  public int getMaximumNumberOfDFAStates() {
    return maximumNumberOfDFAStates;
  }

  public int getNumberOfDFARetries() {
    return numberOfDFARetries;
  }

  @Override
  public boolean equals(Object o) {
    if (this == o) return true;
    if (o == null || getClass() != o.getClass()) return false;

    Options options = (Options) o;

    return maximumNumberOfDFAStates == options.maximumNumberOfDFAStates
        && numberOfDFARetries == options.numberOfDFARetries
        && algorithm == options.algorithm
        && java.util.Objects.equals(eventsListener, options.eventsListener);
  }

  @Override
  public int hashCode() {
    int result = algorithm.hashCode();
    result = 31 * result + (eventsListener != null ? eventsListener.hashCode() : 0);
    result = 31 * result + maximumNumberOfDFAStates;
    result = 31 * result + numberOfDFARetries;
    return result;
  }

  @Override
  public String toString() {
    return "Options{" +
        "algorithm=" + algorithm +
        ", eventsListener=" + eventsListener +
        ", maximumNumberOfDFAStates=" + maximumNumberOfDFAStates +
        ", numberOfDFARetries=" + numberOfDFARetries +
        '}';
  }

  public static OptionsBuilder builder() {
    return new OptionsBuilder();
  }

  /**
   * Interface for RE2J events listening.
   */
  public interface EventsListener {

    /**
     * Called when NFA is being used instead of DFA because too many {@link DFAState}s has been
     * created.
     */
    void fallbackToNFA();
  }

  /** Mutable builder for {@link Options}; every setter returns the builder for chaining. */
  public static final class OptionsBuilder {
    // Working copy that the setters modify; it is never handed out directly.
    private final Options options = new Options();

    /**
     * @throws NullPointerException if {@code algorithm} is null
     */
    public OptionsBuilder setAlgorithm(Algorithm algorithm) {
      options.algorithm = requireNonNull(algorithm);
      return this;
    }

    /**
     * @throws IllegalArgumentException if the value is below the minimum of two states
     */
    public OptionsBuilder setMaximumNumberOfDFAStates(int maximumNumberOfDFAStates) {
      if (maximumNumberOfDFAStates < MINIMUM_NUMBER_OF_DFA_STATES) {
        throw new IllegalArgumentException("maximum number of DFA states must be larger or equal to " + MINIMUM_NUMBER_OF_DFA_STATES);
      }
      options.maximumNumberOfDFAStates = maximumNumberOfDFAStates;
      return this;
    }

    /**
     * @throws IllegalArgumentException if the value is negative
     */
    public OptionsBuilder setNumberOfDFARetries(int numberOfDFARetries) {
      if (numberOfDFARetries < 0) {
        throw new IllegalArgumentException("number of DFA retries cannot be below 0");
      }
      options.numberOfDFARetries = numberOfDFARetries;
      return this;
    }

    /**
     * @throws NullPointerException if {@code eventsListener} is null
     */
    public OptionsBuilder setEventsListener(EventsListener eventsListener) {
      options.eventsListener = requireNonNull(eventsListener);
      return this;
    }

    /**
     * Returns a new {@link Options} holding the values set so far.
     *
     * <p>A fresh instance is created on every call; returning the working copy itself would let
     * later setter calls change options that are already in use.
     */
    public Options build() {
      Options built = new Options();
      built.algorithm = options.algorithm;
      built.eventsListener = options.eventsListener;
      built.maximumNumberOfDFAStates = options.maximumNumberOfDFAStates;
      built.numberOfDFARetries = options.numberOfDFARetries;
      return built;
    }
  }
}
134 |
--------------------------------------------------------------------------------
/app/src/main/java/com/google/re2j/PatternSyntaxException.java:
--------------------------------------------------------------------------------
1 | // Copyright 2010 The Go Authors. All rights reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 |
5 | package com.google.re2j;
6 |
/**
 * Signals that the parser rejected a pattern as invalid.
 *
 * Like {@code java.util.regex.PatternSyntaxException}, this exception is unchecked.
 */
public class PatternSyntaxException extends RuntimeException {

  // Common start of every message produced by this exception.
  private static final String MESSAGE_PREFIX = "error parsing regexp: ";

  private final String error; // the nature of the error
  private final String input; // the partial input at the point of error.

  /**
   * Creates an exception whose message names both the error and the offending input.
   */
  public PatternSyntaxException(String error, String input) {
    super(MESSAGE_PREFIX + error + ": `" + input + "`");
    this.input = input;
    this.error = error;
  }

  /**
   * Creates an exception for an error without input; the pattern is then the empty string.
   */
  public PatternSyntaxException(String error) {
    super(MESSAGE_PREFIX + error);
    this.input = "";
    this.error = error;
  }

  /**
   * Retrieves the error index.
   *
   * @return always -1, because the index of the error is not tracked
   */
  public int getIndex() {
    return -1;
  }

  /**
   * Retrieves the description of the error.
   *
   * @return The description of the error
   */
  public String getDescription() {
    return error;
  }

  /**
   * Retrieves the erroneous regular-expression pattern.
   *
   * @return The erroneous pattern
   */
  public String getPattern() {
    return input;
  }

}
59 |
--------------------------------------------------------------------------------
/app/src/main/java/com/google/re2j/Prog.java:
--------------------------------------------------------------------------------
1 | // Copyright 2010 The Go Authors. All rights reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 |
5 | // Original Go source here:
6 | // http://code.google.com/p/go/source/browse/src/pkg/regexp/syntax/prog.go
7 |
8 | package com.google.re2j;
9 |
10 | import java.io.ByteArrayOutputStream;
11 | import java.io.IOException;
12 | import java.util.ArrayList;
13 | import java.util.List;
14 |
15 | import static com.google.re2j.Inst.Op.BYTE;
16 | import static com.google.re2j.Inst.Op.MATCH;
17 |
18 | /**
19 | * A Prog is a compiled regular expression program.
20 | */
21 | class Prog {
22 |
23 | private final List inst = new ArrayList();
24 | int start; // index of start instruction
25 | int startUnanchored; // index of unanchored start instruction
26 | int numCap = 2; // number of CAPTURE insts in re
27 | // 2 => implicit ( and ) for whole match $0
28 |
29 | // Constructs an empty program.
30 | Prog() {}
31 |
32 | // Returns the instruction at the specified pc.
33 | // Precondition: pc > 0 && pc < numInst().
34 | Inst getInst(int pc) {
35 | return inst.get(pc);
36 | }
37 |
38 | Inst[] getInst() {
39 | return inst.toArray(new Inst[inst.size()]);
40 | }
41 |
42 | // Returns the number of instructions in this program.
43 | int numInst() {
44 | return inst.size();
45 | }
46 |
47 | // Adds a new instruction to this program, with operator |op| and |pc| equal
48 | // to |numInst()|.
49 | void addInst(Inst.Op op) {
50 | inst.add(new Inst(op));
51 | }
52 |
53 | // skipNop() follows any no-op or capturing instructions and returns the
54 | // resulting instruction.
55 | Inst skipNop(int pc) {
56 | Inst i = inst.get(pc);
57 | while (i.op == Inst.Op.NOP || i.op == Inst.Op.CAPTURE) {
58 | i = inst.get(pc);
59 | pc = i.out;
60 | }
61 | return i;
62 | }
63 |
64 | // prefix() returns a pair of a literal slice that all matches for the
65 | // regexp must start with, and a boolean which is true if the prefix is the
66 | // entire match. The slice is returned by appending to |prefix|.
67 | boolean prefix(ByteArrayOutputStream prefix) {
68 | Inst i = skipNop(start);
69 |
70 | // Avoid allocation of buffer if prefix is empty.
71 | if (i.op() != BYTE || i.byteRanges.length != 1) {
72 | return i.op == MATCH; // (append "" to prefix)
73 | }
74 |
75 | int length = 0;
76 | while (i.op() == BYTE && i.byteRanges.length == 1) {
77 | i = skipNop(i.out);
78 | length++;
79 | }
80 |
81 | byte[] bytes = new byte[length];
82 | length = 0;
83 | i = skipNop(start);
84 | while (i.op() == BYTE && i.byteRanges.length == 1) {
85 | bytes[length] = i.byteRanges[0];
86 | i = skipNop(i.out);
87 | length++;
88 | }
89 |
90 | // Have prefix; gather characters.
91 | try {
92 | prefix.write(bytes);
93 | } catch (IOException e) {
94 | throw new RuntimeException("Never reached here.");
95 | }
96 |
97 | return i.op == MATCH;
98 | }
99 |
100 | // startCond() returns the leading empty-width conditions that must be true
101 | // in any match. It returns -1 (all bits set) if no matches are possible.
102 | int startCond() {
103 | int flag = 0; // bitmask of EMPTY_* flags
104 | int pc = start;
105 | loop:
106 | for (;;) {
107 | Inst i = inst.get(pc);
108 | switch (i.op) {
109 | case EMPTY_WIDTH:
110 | flag |= i.arg;
111 | break;
112 | case FAIL:
113 | return -1;
114 | case CAPTURE:
115 | case NOP:
116 | break; // skip
117 | default:
118 | break loop;
119 | }
120 | pc = i.out;
121 | }
122 | return flag;
123 | }
124 |
125 | // --- Patch list ---
126 |
127 | // A patchlist is a list of instruction pointers that need to be filled in
128 | // (patched). Because the pointers haven't been filled in yet, we can reuse
129 | // their storage to hold the list. It's kind of sleazy, but works well in
130 | // practice. See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
131 |
132 | // These aren't really pointers: they're integers, so we can reinterpret them
133 | // this way without using package unsafe. A value l denotes p.inst[l>>1].out
134 | // (l&1==0) or .arg (l&1==1). l == 0 denotes the empty list, okay because we
135 | // start every program with a fail instruction, so we'll never want to point
136 | // at its output link.
137 |
138 | int next(int l) {
139 | Inst i = inst.get(l >> 1);
140 | if ((l & 1) == 0) {
141 | return i.out;
142 | }
143 | return i.arg;
144 | }
145 |
146 | void patch(int l, int val) {
147 | while (l != 0) {
148 | Inst i = inst.get(l >> 1);
149 | if ((l & 1) == 0) {
150 | l = i.out;
151 | i.out = val;
152 | } else {
153 | l = i.arg;
154 | i.arg = val;
155 | }
156 | }
157 | }
158 |
159 | int append(int l1, int l2) {
160 | if (l1 == 0) {
161 | return l2;
162 | }
163 | if (l2 == 0) {
164 | return l1;
165 | }
166 | int last = l1;
167 | for (;;) {
168 | int next = next(last);
169 | if (next == 0) {
170 | break;
171 | }
172 | last = next;
173 | }
174 | Inst i = inst.get(last>>1);
175 | if ((last & 1) == 0) {
176 | i.out = l2;
177 | } else {
178 | i.arg = l2;
179 | }
180 | return l1;
181 | }
182 |
183 | // ---
184 |
185 | @Override
186 | public String toString() {
187 | StringBuilder out = new StringBuilder();
188 | for (int pc = 0; pc < inst.size(); ++pc) {
189 | int len = out.length();
190 | out.append(pc);
191 | if (pc == start) {
192 | out.append('*');
193 | }
194 | if (pc == startUnanchored) {
195 | out.append("@");
196 | }
197 | // Use spaces not tabs since they're not always preserved in
198 | // Google Java source, such as our tests.
199 | out.append(" ".substring(out.length() - len)).
200 | append(inst.get(pc)).append('\n');
201 | }
202 | return out.toString();
203 | }
204 | }
205 |
--------------------------------------------------------------------------------
/app/src/main/java/com/google/re2j/Simplify.java:
--------------------------------------------------------------------------------
1 | // Copyright 2011 The Go Authors. All rights reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 |
5 | // Original Go source here:
6 | // http://code.google.com/p/go/source/browse/src/pkg/regexp/syntax/simplify.go
7 |
8 | package com.google.re2j;
9 |
10 | import java.util.ArrayList;
11 |
12 | class Simplify {
13 |
14 | // Simplify returns a regexp equivalent to re but without counted
15 | // repetitions and with various other simplifications, such as
16 | // rewriting /(?:a+)+/ to /a+/. The resulting regexp will execute
17 | // correctly but its string representation will not produce the same
18 | // parse tree, because capturing parentheses may have been duplicated
19 | // or removed. For example, the simplified form for /(x){1,2}/ is
20 | // /(x)(x)?/ but both parentheses capture as $1. The returned regexp
21 | // may share structure with or be the original.
22 | static Regexp simplify(Regexp re) {
23 | if (re == null) {
24 | return null;
25 | }
26 | switch (re.op) {
27 | case CAPTURE:
28 | case CONCAT:
29 | case ALTERNATE: {
30 | // Simplify children, building new Regexp if children change.
31 | Regexp nre = re;
32 | for (int i = 0; i < re.subs.length; ++i) {
33 | Regexp sub = re.subs[i];
34 | Regexp nsub = simplify(sub);
35 | if (nre == re && nsub != sub) {
36 | // Start a copy.
37 | nre = new Regexp(re); // shallow copy
38 | nre.runes = null;
39 | nre.subs = Parser.subarray(re.subs, 0, re.subs.length); // clone
40 | }
41 | if (nre != re) {
42 | nre.subs[i] = nsub;
43 | }
44 | }
45 | return nre;
46 | }
47 | case STAR:
48 | case PLUS:
49 | case QUEST: {
50 | Regexp sub = simplify(re.subs[0]);
51 | return simplify1(re.op, re.flags, sub, re);
52 | }
53 | case REPEAT: {
54 | // Special special case: x{0} matches the empty string
55 | // and doesn't even need to consider x.
56 | if (re.min == 0 && re.max == 0) {
57 | return new Regexp(Regexp.Op.EMPTY_MATCH);
58 | }
59 |
60 | // The fun begins.
61 | Regexp sub = simplify(re.subs[0]);
62 |
63 | // x{n,} means at least n matches of x.
64 | if (re.max == -1) {
65 | // Special case: x{0,} is x*.
66 | if (re.min == 0) {
67 | return simplify1(Regexp.Op.STAR, re.flags, sub, null);
68 | }
69 |
70 | // Special case: x{1,} is x+.
71 | if (re.min == 1) {
72 | return simplify1(Regexp.Op.PLUS, re.flags, sub, null);
73 | }
74 |
75 | // General case: x{4,} is xxxx+.
76 | Regexp nre = new Regexp(Regexp.Op.CONCAT);
77 | ArrayList subs = new ArrayList();
78 | for (int i = 0; i < re.min - 1; i++) {
79 | subs.add(sub);
80 | }
81 | subs.add(simplify1(Regexp.Op.PLUS, re.flags, sub, null));
82 | nre.subs = subs.toArray(new Regexp[subs.size()]);
83 | return nre;
84 | }
85 |
86 | // Special case x{0} handled above.
87 |
88 | // Special case: x{1} is just x.
89 | if (re.min == 1 && re.max == 1) {
90 | return sub;
91 | }
92 |
93 | // General case: x{n,m} means n copies of x and m copies of x?
94 | // The machine will do less work if we nest the final m copies,
95 | // so that x{2,5} = xx(x(x(x)?)?)?
96 |
97 | // Build leading prefix: xx.
98 | ArrayList prefixSubs = null;
99 | if (re.min > 0) {
100 | prefixSubs = new ArrayList();
101 | for (int i = 0; i < re.min; i++) {
102 | prefixSubs.add(sub);
103 | }
104 | }
105 |
106 | // Build and attach suffix: (x(x(x)?)?)?
107 | if (re.max > re.min) {
108 | Regexp suffix = simplify1(Regexp.Op.QUEST, re.flags, sub, null);
109 | for (int i = re.min + 1; i < re.max; i++) {
110 | Regexp nre2 = new Regexp(Regexp.Op.CONCAT);
111 | nre2.subs = new Regexp[] { sub, suffix };
112 | suffix = simplify1(Regexp.Op.QUEST, re.flags, nre2, null);
113 | }
114 | if (prefixSubs == null) {
115 | return suffix;
116 | }
117 | prefixSubs.add(suffix);
118 | }
119 | if (prefixSubs != null) {
120 | Regexp prefix = new Regexp(Regexp.Op.CONCAT);
121 | prefix.subs = prefixSubs.toArray(new Regexp[prefixSubs.size()]);
122 | return prefix;
123 | }
124 |
125 | // Some degenerate case like min > max or min < max < 0.
126 | // Handle as impossible match.
127 | return new Regexp(Regexp.Op.NO_MATCH);
128 | }
129 | }
130 |
131 | return re;
132 | }
133 |
134 | // simplify1 implements Simplify for the unary OpStar,
135 | // OpPlus, and OpQuest operators. It returns the simple regexp
136 | // equivalent to
137 | //
138 | // Regexp{Op: op, Flags: flags, Sub: {sub}}
139 | //
140 | // under the assumption that sub is already simple, and
141 | // without first allocating that structure. If the regexp
142 | // to be returned turns out to be equivalent to re, simplify1
143 | // returns re instead.
144 | //
145 | // simplify1 is factored out of Simplify because the implementation
146 | // for other operators generates these unary expressions.
147 | // Letting them call simplify1 makes sure the expressions they
148 | // generate are simple.
149 | private static Regexp simplify1(Regexp.Op op, int flags, Regexp sub,
150 | Regexp re) {
151 | // Special case: repeat the empty string as much as
152 | // you want, but it's still the empty string.
153 | if (sub.op == Regexp.Op.EMPTY_MATCH) {
154 | return sub;
155 | }
156 | // The operators are idempotent if the flags match.
157 | if (op == sub.op &&
158 | (flags & RE2.NON_GREEDY) == (sub.flags & RE2.NON_GREEDY)) {
159 | return sub;
160 | }
161 | if (re != null && re.op == op &&
162 | (re.flags & RE2.NON_GREEDY) == (flags & RE2.NON_GREEDY) &&
163 | sub == re.subs[0]) {
164 | return re;
165 | }
166 |
167 | re = new Regexp(op);
168 | re.flags = flags;
169 | re.subs = new Regexp[] { sub };
170 | return re;
171 | }
172 |
173 | private Simplify() {} // uninstantiable
174 |
175 | }
176 |
--------------------------------------------------------------------------------
/app/src/main/java/com/google/re2j/SliceUtils.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Licensed under the Apache License, Version 2.0 (the "License");
3 | * you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software
9 | * distributed under the License is distributed on an "AS IS" BASIS,
10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | * See the License for the specific language governing permissions and
12 | * limitations under the License.
13 | */
14 | package com.google.re2j;
15 |
16 |
/**
 * Holder for helper routines that operate on {@link Slice}.
 *
 * <p>At present the class exposes nothing: its only routine,
 * {@code appendReplacement}, is kept below as disabled source text.
 */
final class SliceUtils {

  // Not meant to be instantiated.
  private SliceUtils() {
  }

  /*
  static void appendReplacement(SliceOutput so, Slice replacement, Matcher matcher) {
    int idx = 0;

    // Handle the following items:
    // 1. ${name};
    // 2. $0, $1, $123 (group 123, if exists; or group 12, if exists; or group 1);
    // 3. \\, \$, \t (literal 't').
    // 4. Anything that doesn't starts with \ or $ is considered regular bytes
    while (idx < replacement.length()) {
      byte nextByte = replacement.getByte(idx);
      if (nextByte == '$') {
        idx++;
        if (idx == replacement.length()) {
          throw new IllegalArgumentException("Illegal replacement sequence: " + replacement.toStringUtf8());
        }
        nextByte = replacement.getByte(idx);
        int backref;
        if (nextByte == '{') { // case 1 in the above comment
          idx++;
          int startCursor = idx;
          while (idx < replacement.length()) {
            nextByte = replacement.getByte(idx);
            if (nextByte == '}') {
              break;
            }
            idx++;
          }
          String groupName = replacement.slice(startCursor, idx - startCursor).toStringUtf8();
          Integer namedGroupIndex = matcher.pattern().re2().namedGroupIndexes.get(groupName);
          if (namedGroupIndex == null) {
            throw new IndexOutOfBoundsException("Illegal replacement sequence: unknown group " + groupName);
          }
          backref = namedGroupIndex;
          idx++;
        } else { // case 2 in the above comment
          backref = nextByte - '0';
          if (backref < 0 || backref > 9) {
            throw new IllegalArgumentException("Illegal replacement sequence: " + replacement.toStringUtf8());
          }
          if (matcher.groupCount() < backref) {
            throw new IndexOutOfBoundsException("Illegal replacement sequence: unknown group " + backref);
          }
          idx++;
          while (idx < replacement.length()) { // Adaptive group number: find largest group num that is not greater than actual number of groups
            int nextDigit = replacement.getByte(idx) - '0';
            if (nextDigit < 0 || nextDigit > 9) {
              break;
            }
            int newBackref = (backref * 10) + nextDigit;
            if (matcher.groupCount() < newBackref) {
              break;
            }
            backref = newBackref;
            idx++;
          }
        }
        Slice group = matcher.group(backref);
        if (group != null) {
          so.writeBytes(group);
        }
      } else { // case 3 and 4 in the above comment
        if (nextByte == '\\') {
          idx++;
          if (idx == replacement.length()) {
            throw new IllegalArgumentException("Illegal replacement sequence: " + replacement.toStringUtf8());
          }
          nextByte = replacement.getByte(idx);
        }
        so.appendByte(nextByte);
        idx++;
      }
    }
  }
  */
}
100 |
--------------------------------------------------------------------------------
/app/src/main/java/com/google/re2j/SparseSet.java:
--------------------------------------------------------------------------------
1 | // Copyright 2015 The RE2 Authors. All rights reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 |
5 | // Original RE2 source here:
6 | // https://github.com/google/re2/blob/master/util/sparse_set.h
7 |
8 | package com.google.re2j;
9 |
/**
 * A set of ints drawn from {@code [0, n)}, stored as a pair of arrays:
 * {@code dense} lists the members in insertion order and {@code sparse[i]}
 * records where {@code i} sits in {@code dense}. Clearing only resets the
 * size, so both arrays may hold stale values that {@link #contains} filters
 * out.
 */
class SparseSet {
  private final int[] dense;  // may contain stale Entries in slots >= size
  private final int[] sparse; // may contain stale but in-bounds values.
  private int size;           // of prefix of |dense| that is logically populated

  /** Creates an empty set with room for the values {@code 0 .. n-1}. */
  SparseSet(int n) {
    this.sparse = new int[n];
    this.dense = new int[n];
  }

  /**
   * Reports whether {@code i} is a member: its recorded slot must be inside
   * the populated prefix and that slot must point back at {@code i}.
   */
  boolean contains(int i) {
    return sparse[i] < size && dense[sparse[i]] == i;
  }

  /** Reports whether the set has no members. */
  boolean isEmpty() {
    return size == 0;
  }

  /**
   * Appends {@code i} to the set. No membership check is made here, so
   * callers that need set semantics must test {@link #contains} first.
   */
  void add(int i) {
    dense[size] = i;
    sparse[i] = size;
    size++;
  }

  /** Empties the set by resetting the size; the arrays are left untouched. */
  void clear() {
    size = 0;
  }

  /**
   * Returns the member stored at position {@code i} in insertion order.
   *
   * @throws IndexOutOfBoundsException if {@code i} is negative or not below
   *     the current size
   */
  int getValueAt(int i) {
    // Reject negative positions too, so that every bad index produces the
    // same descriptive error instead of a raw array exception.
    if (i < 0 || i >= size) {
      throw new IndexOutOfBoundsException(
          "Cannot get index " + i + ". SparseSet is size " + size);
    }
    return dense[i];
  }

  /** Returns the number of members. */
  int getSize() {
    return size;
  }
}
49 |
--------------------------------------------------------------------------------
/app/src/main/java/com/google/re2j/Unicode.java:
--------------------------------------------------------------------------------
1 | // Copyright 2010 The Go Authors. All rights reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 |
5 | // Many of these were derived from the corresponding Go functions in
6 | // http://code.google.com/p/go/source/browse/src/pkg/unicode/letter.go
7 |
8 | package com.google.re2j;
9 |
10 | import static java.nio.charset.StandardCharsets.UTF_8;
11 |
12 | /**
13 | * Utilities for dealing with Unicode better than Java does.
14 | *
15 | * @author adonovan@google.com (Alan Donovan)
16 | */
17 | public class Unicode {
18 |
19 | // Rune and UTF8 sequences are the same.
20 | static final int RUNE_SELF = 0x80;
21 |
22 | // The highest legal rune value.
23 | static final int MAX_RUNE = 0x10FFFF;
24 |
25 | // The highest legal ASCII value.
26 | static final int MAX_ASCII = 0x7f;
27 |
28 | // The highest legal Latin-1 value.
29 | static final int MAX_LATIN1 = 0xFF;
30 |
31 | private static final int MAX_CASE = 3;
32 |
33 | // Represents invalid code points.
34 | private static final int REPLACEMENT_CHAR = 0xFFFD;
35 |
36 | // Minimum and maximum runes involved in folding.
37 | // Checked during test.
38 | static final int MIN_FOLD = 0x0041;
39 | static final int MAX_FOLD = 0x1044f;
40 |
41 | // Maximum bytes per rune
42 | static final int UTF_MAX = 4;
43 |
44 | // is32 uses binary search to test whether rune is in the specified
45 | // slice of 32-bit ranges.
46 | // TODO(adonovan): opt: consider using int[n*3] instead of int[n][3].
47 | private static boolean is32(int[][] ranges, int r) {
48 | // binary search over ranges
49 | for (int lo = 0, hi = ranges.length; lo < hi; ) {
50 | int m = lo + (hi - lo) / 2;
51 | int[] range = ranges[m]; // [lo, hi, stride]
52 | if (range[0] <= r && r <= range[1]) {
53 | return ((r - range[0]) % range[2]) == 0;
54 | }
55 | if (r < range[0]) {
56 | hi = m;
57 | } else {
58 | lo = m + 1;
59 | }
60 | }
61 | return false;
62 | }
63 |
64 | // is tests whether rune is in the specified table of ranges.
65 | private static boolean is(int[][] ranges, int r) {
66 | // common case: rune is ASCII or Latin-1, so use linear search.
67 | if (r <= MAX_LATIN1) {
68 | for (int[] range : ranges) { // range = [lo, hi, stride]
69 | if (r > range[1]) {
70 | continue;
71 | }
72 | if (r < range[0]) {
73 | return false;
74 | }
75 | return ((r - range[0]) % range[2]) == 0;
76 | }
77 | return false;
78 | }
79 | return ranges.length > 0 &&
80 | r >= ranges[0][0] &&
81 | is32(ranges, r);
82 | }
83 |
84 | static byte[] codePointToUtf8(int codePoint) {
85 | return new String(Character.toChars(codePoint)).getBytes(UTF_8);
86 | }
87 |
88 | static int maxRune(int len) {
89 | int b; // number of Rune bits in len-byte UTF-8 sequence (len < UTFmax)
90 | if (len == 1) {
91 | b = 7;
92 | } else {
93 | b = 8 - (len + 1) + 6 * (len - 1);
94 | }
95 | return (1 << b) - 1; // maximum Rune for b bits.
96 | }
97 |
98 | // isUpper reports whether the rune is an upper case letter.
99 | static boolean isUpper(int r) {
100 | // See comment in isGraphic.
101 | if (r <= MAX_LATIN1) {
102 | return Character.isUpperCase((char) r);
103 | }
104 | return is(UnicodeTables.Upper, r);
105 | }
106 |
107 | // isLower reports whether the rune is a lower case letter.
108 | static boolean isLower(int r) {
109 | // See comment in isGraphic.
110 | if (r <= MAX_LATIN1) {
111 | return Character.isLowerCase((char) r);
112 | }
113 | return is(UnicodeTables.Lower, r);
114 | }
115 |
116 | // isTitle reports whether the rune is a title case letter.
117 | static boolean isTitle(int r) {
118 | if (r <= MAX_LATIN1) {
119 | return false;
120 | }
121 | return is(UnicodeTables.Title, r);
122 | }
123 |
124 | // isPrint reports whether the rune is printable (Unicode L/M/N/P/S or ' ').
125 | static boolean isPrint(int r) {
126 | if (r <= MAX_LATIN1) {
127 | return r >= 0x20 && r < 0x7F ||
128 | r >= 0xA1 && r != 0xAD;
129 | }
130 | return is(UnicodeTables.L, r) ||
131 | is(UnicodeTables.M, r) ||
132 | is(UnicodeTables.N, r) ||
133 | is(UnicodeTables.P, r) ||
134 | is(UnicodeTables.S, r);
135 | }
136 |
137 | // A case range is conceptually a record:
138 | // class CaseRange {
139 | // int lo, hi;
140 | // int upper, lower, title;
141 | // }
142 | // but flattened as an int[5].
143 |
144 | // to maps the rune using the specified case mapping.
145 | private static int to(int kase, int r, int[][] caseRange) {
146 | if (kase < 0 || MAX_CASE <= kase) {
147 | return REPLACEMENT_CHAR; // as reasonable an error as any
148 | }
149 | // binary search over ranges
150 | for (int lo = 0, hi = caseRange.length; lo < hi; ) {
151 | int m = lo + (hi - lo) / 2;
152 | int[] cr = caseRange[m]; // cr = [lo, hi, upper, lower, title]
153 | int crlo = cr[0];
154 | int crhi = cr[1];
155 | if (crlo <= r && r <= crhi) {
156 | int delta = cr[2 + kase];
157 | if (delta > MAX_RUNE) {
158 | // In an Upper-Lower sequence, which always starts with
159 | // an UpperCase letter, the real deltas always look like:
160 | // {0, 1, 0} UpperCase (Lower is next)
161 | // {-1, 0, -1} LowerCase (Upper, Title are previous)
162 | // The characters at even offsets from the beginning of the
163 | // sequence are upper case; the ones at odd offsets are lower.
164 | // The correct mapping can be done by clearing or setting the low
165 | // bit in the sequence offset.
166 | // The constants UpperCase and TitleCase are even while LowerCase
167 | // is odd so we take the low bit from kase.
168 | return crlo + (((r - crlo) & ~1) | (kase & 1));
169 | }
170 | return r + delta;
171 | }
172 | if (r < crlo) {
173 | hi = m;
174 | } else {
175 | lo = m + 1;
176 | }
177 | }
178 | return r;
179 | }
180 |
181 | // to maps the rune to the specified case: UpperCase, LowerCase, or TitleCase.
182 | private static int to(int kase, int r) {
183 | return to(kase, r, UnicodeTables.CASE_RANGES);
184 | }
185 |
186 | // toUpper maps the rune to upper case.
187 | static int toUpper(int r) {
188 | if (r <= MAX_ASCII) {
189 | if ('a' <= r && r <= 'z') {
190 | r -= 'a' - 'A';
191 | }
192 | return r;
193 | }
194 | return to(UnicodeTables.UpperCase, r);
195 | }
196 |
197 | // toLower maps the rune to lower case.
198 | static int toLower(int r) {
199 | if (r <= MAX_ASCII) {
200 | if ('A' <= r && r <= 'Z') {
201 | r += 'a' - 'A';
202 | }
203 | return r;
204 | }
205 | return to(UnicodeTables.LowerCase, r);
206 | }
207 |
208 | // simpleFold iterates over Unicode code points equivalent under
209 | // the Unicode-defined simple case folding. Among the code points
210 | // equivalent to rune (including rune itself), SimpleFold returns the
211 | // smallest r >= rune if one exists, or else the smallest r >= 0.
212 | //
213 | // For example:
214 | // SimpleFold('A') = 'a'
215 | // SimpleFold('a') = 'A'
216 | //
217 | // SimpleFold('K') = 'k'
218 | // SimpleFold('k') = '\u212A' (Kelvin symbol, K)
219 | // SimpleFold('\u212A') = 'K'
220 | //
221 | // SimpleFold('1') = '1'
222 | //
223 | // Derived from Go's unicode.SimpleFold.
224 | //
225 | public static int simpleFold(int r) {
226 | // Consult caseOrbit table for special cases.
227 | int lo = 0;
228 | int hi = UnicodeTables.CASE_ORBIT.length;
229 | while (lo < hi) {
230 | int m = lo + (hi - lo) / 2;
231 | if (UnicodeTables.CASE_ORBIT[m][0] < r) {
232 | lo = m + 1;
233 | } else {
234 | hi = m;
235 | }
236 | }
237 | if (lo < UnicodeTables.CASE_ORBIT.length &&
238 | UnicodeTables.CASE_ORBIT[lo][0] == r) {
239 | return UnicodeTables.CASE_ORBIT[lo][1];
240 | }
241 |
242 | // No folding specified. This is a one- or two-element
243 | // equivalence class containing rune and toLower(rune)
244 | // and toUpper(rune) if they are different from rune.
245 | int l = toLower(r);
246 | if (l != r) {
247 | return l;
248 | }
249 | return toUpper(r);
250 | }
251 |
252 | private Unicode() {} // uninstantiable
253 |
254 | }
255 |
--------------------------------------------------------------------------------
/app/src/main/java/com/google/re2j/Utils.java:
--------------------------------------------------------------------------------
1 | // Copyright 2010 The Go Authors. All rights reserved.
2 | // Use of this source code is governed by a BSD-style
3 | // license that can be found in the LICENSE file.
4 |
5 | package com.google.re2j;
6 |
7 |
8 | import com.livejournal.karino2.zipsourcecodereading.Slice;
9 |
10 | import static com.google.re2j.MachineInput.EOF;
11 |
12 | /**
13 | * Various constants and helper utilities.
14 | */
15 | public abstract class Utils {
16 |
17 | static final int[] EMPTY_INTS = {};
18 |
19 | // Returns true iff |c| is an ASCII letter or decimal digit.
20 | static boolean isalnum(int c) {
21 | return '0' <= c && c <= '9' || 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z';
22 | }
23 |
24 | // If |c| is an ASCII hex digit, returns its value, otherwise -1.
25 | static int unhex(int c) {
26 | if ('0' <= c && c <= '9') {
27 | return c - '0';
28 | }
29 | if ('a' <= c && c <= 'f') {
30 | return c - 'a' + 10;
31 | }
32 | if ('A' <= c && c <= 'F') {
33 | return c - 'A' + 10;
34 | }
35 | return -1;
36 | }
37 |
38 | private static final String METACHARACTERS = "\\.+*?()|[]{}^$";
39 |
40 | // Appends a RE2 literal to |out| for rune |rune|,
41 | // with regexp metacharacters escaped.
42 | static void escapeRune(StringBuilder out, int rune) {
43 | if (Unicode.isPrint(rune)) {
44 | if (METACHARACTERS.indexOf((char) rune) >= 0) {
45 | out.append('\\');
46 | }
47 | out.appendCodePoint(rune);
48 | return;
49 | }
50 |
51 | switch (rune) {
52 | case '"': out.append("\\\""); break;
53 | case '\\': out.append("\\\\"); break;
54 | case '\t': out.append("\\t"); break;
55 | case '\n': out.append("\\n"); break;
56 | case '\r': out.append("\\r"); break;
57 | case '\b': out.append("\\b"); break;
58 | case '\f': out.append("\\f"); break;
59 | default: {
60 | String s = Integer.toHexString(rune);
61 | if (rune < 0x100) {
62 | out.append("\\x");
63 | if (s.length() == 1) {
64 | out.append('0');
65 | }
66 | out.append(s);
67 | } else {
68 | out.append("\\x{").append(s).append('}');
69 | }
70 | break;
71 | }
72 | }
73 | }
74 |
75 | // Returns the array of runes in the specified Java UTF-16 string.
76 | static int[] stringToRunes(String str) {
77 | int charlen = str.length();
78 | int runelen = str.codePointCount(0, charlen);
79 | int[] runes = new int[runelen];
80 | int r = 0, c = 0;
81 | while (c < charlen) {
82 | int rune = str.codePointAt(c);
83 | runes[r++] = rune;
84 | c += Character.charCount(rune);
85 | }
86 | return runes;
87 | }
88 |
89 | // Returns the Java UTF-16 string containing the single rune |r|.
90 | public static String runeToString(int r) {
91 | char c = (char) r;
92 | return r == c
93 | ? String.valueOf(c)
94 | : new String(Character.toChars(r));
95 | // fix by karino : new String(Character.toChars(c));
96 | }
97 |
98 | // Returns a new copy of the specified subarray.
99 | static int[] subarray(int[] array, int start, int end) {
100 | int[] r = new int[end - start];
101 | for (int i = start; i < end; ++i) {
102 | r[i - start] = array[i];
103 | }
104 | return r;
105 | }
106 |
107 | // Returns a new copy of the specified subarray.
108 | static byte[] subarray(byte[] array, int start, int end) {
109 | byte[] r = new byte[end - start];
110 | for (int i = start; i < end; ++i) {
111 | r[i - start] = array[i];
112 | }
113 | return r;
114 | }
115 |
116 | // Returns the index of the first occurrence of array |target| within
117 | // array |source| after |fromIndex|, or -1 if not found.
118 | static int indexOf(Slice source, Slice target, int fromIndex) {
119 | if (fromIndex >= source.length()) {
120 | return target.length() == 0 ? source.length() : -1;
121 | }
122 | if (fromIndex < 0) {
123 | fromIndex = 0;
124 | }
125 | if (target.length() == 0) {
126 | return fromIndex;
127 | }
128 |
129 | byte first = target.getByte(0);
130 | for (int i = fromIndex, max = source.length() - target.length(); i <= max;
131 | i++) {
132 | // Look for first byte.
133 | if (source.getByte(i) != first) {
134 | while (++i <= max && source.getByte(i) != first) {}
135 | }
136 |
137 | // Found first byte, now look at the rest of v2.
138 | if (i <= max) {
139 | int j = i + 1;
140 | int end = j + target.length() - 1;
141 | for (int k = 1; j < end && source.getByte(j) == target.getByte(k); j++, k++) {}
142 |
143 | if (j == end) {
144 | return i; // found whole array
145 | }
146 | }
147 | }
148 | return -1;
149 | }
150 |
151 | // isWordByte reports whether b is consider a ``word character''
152 | // during the evaluation of the \b and \B zero-width assertions.
153 | // These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
154 | static boolean isWordByte(byte b) {
155 | return ('A' <= b && b <= 'Z' ||
156 | 'a' <= b && b <= 'z' ||
157 | '0' <= b && b <= '9' ||
158 | b == '_');
159 | }
160 |
161 | static boolean isRuneStart(byte b) {
162 | return (b & 0xC0) != 0x80; // 10xxxxxx
163 | }
164 |
165 | //// EMPTY_* flags
166 |
167 | static final int EMPTY_BEGIN_LINE = 0x01;
168 | static final int EMPTY_END_LINE = 0x02;
169 | static final int EMPTY_BEGIN_TEXT = 0x04;
170 | static final int EMPTY_END_TEXT = 0x08;
171 | static final int EMPTY_WORD_BOUNDARY = 0x10;
172 | static final int EMPTY_NO_WORD_BOUNDARY = 0x20;
173 | static final int EMPTY_ALL = -1; // (impossible)
174 |
175 | // emptyOpContext returns the zero-width assertions satisfied at the position
176 | // between the bytes b1 and b2, a bitmask of EMPTY_* flags.
177 | // Passing b1 == -1 indicates that the position is at the beginning of the text.
178 | // Passing b2 == -1 indicates that the position is at the end of the text.
179 | // TODO(adonovan): move to Machine.
180 | static int emptyOpContext(byte b1, byte b2) {
181 | int op = 0;
182 | if (b1 == EOF) {
183 | op |= EMPTY_BEGIN_TEXT | EMPTY_BEGIN_LINE;
184 | }
185 | if (b1 == '\n') {
186 | op |= EMPTY_BEGIN_LINE;
187 | }
188 | if (b2 == EOF) {
189 | op |= EMPTY_END_TEXT | EMPTY_END_LINE;
190 | }
191 | if (b2 == '\n') {
192 | op |= EMPTY_END_LINE;
193 | }
194 | if (isWordByte(b1) != isWordByte(b2)) {
195 | op |= EMPTY_WORD_BOUNDARY;
196 | } else {
197 | op |= EMPTY_NO_WORD_BOUNDARY;
198 | }
199 | return op;
200 | }
201 |
202 | static boolean arrayFirstElementsEqual(int[] a, int[] a2, int length) {
203 | if (a == a2) {
204 | return true;
205 | }
206 |
207 | if (a == null || a2 == null) {
208 | return false;
209 | }
210 |
211 | if (a.length < length || a2.length < length) {
212 | return false;
213 | }
214 |
215 | for (int i = 0; i < length; i++)
216 | if (a[i] != a2[i])
217 | return false;
218 |
219 | return true;
220 | }
221 |
// Private no-arg constructor: keeps code outside this class from creating instances.
private Utils() {} // uninstantiable
223 |
224 | }
225 |
--------------------------------------------------------------------------------
/app/src/main/java/com/google/re2j/make_perl_groups.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | # Copyright 2008 The Go Authors. All rights reserved.
3 | # Use of this source code is governed by a BSD-style
4 | # license that can be found in the LICENSE file.
5 |
6 | # Modified version of make_perl_groups.pl from RE2/Go:
7 | # code.google.com/p/go/source/browse/src/pkg/regexp/syntax/make_perl_groups.pl
8 | # which is in turn a modified version of RE2/C++ implementation.
9 |
10 | # Generate table entries giving character ranges
11 | # for POSIX/Perl character classes. Rather than
12 | # figure out what the definition is, it is easier to ask
13 | # Perl about each letter from 0-128 and write down
14 | # its answer.
15 |
# POSIX bracket-class names to tabulate, in output order.
@posixclasses = map { "[:" . $_ . ":]" } qw(
  alnum alpha ascii blank cntrl digit graph
  lower print punct space upper word xdigit
);
32 |
# Perl shorthand classes to tabulate: each entry is a backslash plus a letter.
@perlclasses = map { "\\" . $_ } qw(d s w);
38 |
# ComputeClass asks Perl which code points 0..128 match the given class and
# returns the matches as a list of [lo, hi] array refs.
# The loop index jumps from 129 to 256 for one final non-matching pass, which
# closes a range that is still open; such a range therefore ends at 255.
sub ComputeClass($) {
  my ($class) = @_;
  my $regexp = "[$class]";
  my @ranges;
  my $start = -1;
  for (my $i=0; $i<=129; $i++) {
    $i = 256 if $i == 129;
    my $matched = ($i <= 128 && chr($i) =~ $regexp);
    if ($matched) {
      # Open a new range unless one is already in progress.
      $start = $i if $start < 0;
      next;
    }
    # Non-matching code point: close the range in progress, if any.
    push @ranges, [$start, $i-1] if $start >= 0;
    $start = -1;
  }
  return @ranges;
}
59 |
# PrintClass prints the Java int[] table "code<cname>" holding the given
# ranges, and returns (without printing) the Java source for two map entries:
# the class itself (sign +1) and its negation (sign -1).
#
#   $cname    - suffix used to build the table identifier
#   $groupmap - name of the Java map the entries are put into
#   $name     - class name, e.g. "[:alpha:]" or "\d"
#   @ranges   - list of [lo, hi] array refs
#
# The negated name inserts '^' after the first ':' for POSIX-style names and
# upper-cases the letters otherwise (\d -> \D).
sub PrintClass($$@) {
  my ($cname, $groupmap, $name, @ranges) = @_;
  print " private static final int[] code$cname = { /* $name */\n";
  foreach my $range (@ranges) {
    printf "\t0x%x, 0x%x,\n", $range->[0], $range->[1];
  }
  print " };\n\n";
  # Lexical on purpose: an undeclared $negname would become a package global.
  my $negname = $name;
  if ($negname =~ /:/) {
    $negname =~ s/:/:^/;
  } else {
    $negname =~ y/a-z/A-Z/;
  }
  # Double the backslashes so the names survive inside Java string literals.
  $name =~ s/\\/\\\\/g;
  $negname =~ s/\\/\\\\/g;
  return " $groupmap.put(\"$name\", \tnew CharGroup(+1, code$cname));\n" .
  " $groupmap.put(\"$negname\", \tnew CharGroup(-1, code$cname));\n";
}
80 |
81 | my $gen = 0;
82 |
# PrintClasses emits one family of classes: first a code table per class
# (printed by PrintClass), then the declaration of the <CNAME>_GROUPS map and
# a static initializer containing the entries PrintClass returned.
# Returns the number of entries.
sub PrintClasses($@) {
  my ($cname, @classes) = @_;
  my $groupmap = uc($cname) . "_GROUPS";
  my @entries;
  for my $class (@classes) {
    # $gen is the file-level counter that makes each table name unique.
    my @ranges = ComputeClass($class);
    my $entry = PrintClass(++$gen, $groupmap, $class, @ranges);
    push @entries, $entry;
  }
  print " static final HashMap $groupmap =\n",
        " new HashMap();\n",
        "\n",
        " static {\n",
        @entries,
        " }\n";
  return scalar(@entries);
}
101 |
102 | print <perl_groups.go
105 |
106 | package com.google.re2j;
107 |
108 | import java.util.HashMap;
109 |
110 | class CharGroup {
111 |
112 | final int sign;
113 | final int[] cls;
114 |
115 | private CharGroup(int sign, int[] cls) {
116 | this.sign = sign;
117 | this.cls = cls;
118 | }
119 |
120 | EOF
121 |
122 | PrintClasses("perl", @perlclasses);
123 | PrintClasses("posix", @posixclasses);
124 |
125 |
126 | print <UnicodeTables.java
18 | #
19 | # States:
20 | # 0 = toplevel
21 | # 1 = inside Scripts/Categories/Properties definition:
22 | # var Categories = map[string]*RangeTable{
23 | # "Lm": Lm,
24 | # ...
25 | # }
26 | # 2 = inside a range definition:
27 | # var _Carian = &RangeTable{
28 | # ...
29 | # R32: []Range32{
30 | # {0x102a0, 0x102d0, 1},
31 | # ...
32 | # },
33 | # }
34 | # 3 = inside an alias definition:
35 | # var (
36 | # Cc = _Cc; // comment
37 | # ...
38 | # )
39 | # 4 = inside CaseRanges definition:
40 | # var _CaseRanges = []CaseRange{
41 | # {0x0041, 0x005A, d{0, 32, 0}},
42 | # ...
43 | # }
44 | # 5 = inside caseOrbit definition:
45 | # var caseOrbit = []foldPair{
46 | # {0x004B, 0x006B},
47 | # ...
48 | # }
49 |
# Emit the fixed preamble of the generated Java file before any input is read:
# header comment, package declaration, imports, the opening of class
# UnicodeTables, and four integer constants.
BEGIN {
  print "// AUTOGENERATED by make_unicode_tables.awk from the output of"
  print "// go/src/pkg/unicode/maketables.go. Yes it's awful, but frankly"
  print "// it's quicker than porting 1300 more lines of Go."
  # A bare print outputs $0, which is empty in BEGIN, i.e. a blank line.
  print
  print "package com.google.re2j;";
  print
  print "import java.util.HashMap;"
  print "import java.util.Map;"
  print
  print "class UnicodeTables {";

  # Constants used by CASE_RANGES and by Unicode utilities.
  # TODO(adonovan): use Java-style identifiers.
  print " static final int UpperCase = 0;";
  print " static final int LowerCase = 1;";
  print " static final int TitleCase = 2;";
  print " static final int UpperLower = 0x110000;";
}
69 |
70 |
71 | ### State 1
72 |
# Top level: a FoldScript map declared empty on a single line ("... {}").
# Emit a complete factory method returning an empty HashMap and stay in
# state 0, because there are no entries and no closing line to consume.
state == 0 && /^var FoldScript = .*{}/ {
  # Special case for when this map is empty map
  # $2 is the Go variable name (the word after "var").
  print " private static Map " $2 "() {";
  print " return new HashMap();";
  print " }";
  next;
}
# Top level: start of one of the named map definitions. Emit the opening of a
# factory method named after the Go variable ($2) and enter state 1 so the
# following lines are treated as map entries.
state == 0 && /^var (Categories|Scripts|FoldCategory|FoldScript|Properties)/ {
  print " private static Map "$2"() {";
  print " Map map = new HashMap();";
  state = 1;
  next;
}
# Inside a map definition: translate one Go entry of the form
#   "Key": Value,
# into a Java  map.put("Key", Value);  statement.
state == 1 && /.*: .*,/ {
  # awk strings are 1-indexed. Starting at 1 removes exactly the trailing
  # ':' from the key and ',' from the value; a start of 0 is handled
  # differently across awk implementations and can drop an extra character.
  key = substr($1, 1, length($1) - 1);
  value = substr($2, 1, length($2) - 1);
  print " map.put(" key ", " value ");";
  next;
}
# Inside a map definition: a line starting with "}" ends it. Close the
# factory method opened in state 0 and return to the top level.
state == 1 && /^}/ {
  print " return map;"
  print " }";
  state = 0;
  next;
}
98 |
99 |
100 | ### State 2
101 |
# Top level: start of a RangeTable definition. Emit a static int[][] field
# named after the Go variable ($2), initialised by a dedicated make<Name>()
# method, then enter state 2.
state == 0 && /^var .* = &RangeTable{/ {
  # Hack upon hack: javac refuses to compile too-large methods,
  # so we have to split this into smaller pieces.
  print " private static final int[][] " $2 " = make" $2 "();";
  print " private static int[][] make" $2 "() {";
  print " return new int[][] {"
  state = 2;
  next;
}
# Inside a range table: drop the "R16"/"R32" slice header lines ...
state == 2 && / *R(16|32)/ { next; }
# ... and the tab-indented "}," lines that close those slices.
state == 2 && /\t},/ { next; }
# A line starting with "}" ends the table: close the array literal and method.
state == 2 && /^}/ {
  print " };";
  print " }";
  state = 0;
  next;
}
# Every other line inside the table is copied to the output unchanged.
state == 2 { print; }
120 |
121 |
122 | ### State 3
123 |
# Top level: "var (" opens an alias block; enter state 3.
state == 0 && /^var \(/ {
  state = 3;
  next;
}
# Inside an alias block: for each line containing "=", emit a static field
# named by the first field and assigned the third field (the token after "=").
state == 3 && /=/ {
  print " static final int[][] " $1 " = " $3 ";";
}
# A line starting with ")" ends the alias block.
state == 3 && /^)/ {
  state = 0;
  next;
}
135 |
136 | ### State 4
137 |
# Top level: start of the _CaseRanges definition. Open the CASE_RANGES array
# and enter state 4.
state == 0 && /^var _CaseRanges = / {
  print " static final int[][] CASE_RANGES = {";
  state = 4;
  next;
}
# A line starting with "}" ends the definition: close the array.
state == 4 && /^}/ {
  state = 0;
  print " };"
  next;
}
# Entry line: remove the first "d{" and turn the first "}}" into "}", so
# {lo, hi, d{a, b, c}} becomes the flat row {lo, hi, a, b, c}, then print it.
state == 4 {
  sub("d{", "");
  sub("}}", "}");
  print;
}
153 |
154 | ### State 5
155 |
# Top level: start of the caseOrbit definition. Open the CASE_ORBIT array and
# enter state 5.
state == 0 && /^var caseOrbit = / {
  print " static final int[][] CASE_ORBIT = {";
  state = 5;
  next;
}
# A line starting with "}" ends the definition: close the array.
state == 5 && /^}/ {
  state = 0;
  print " };"
  next;
}
# Entry lines are copied to the output unchanged.
state == 5 {
  print;
}
169 |
170 |
# After all input: emit the public map fields, each initialised by calling the
# factory method of the same Go name, then the private constructor and the
# closing brace of the class.
END {
  # Call the functions after all initialization has occurred.
  print " static final Map CATEGORIES = Categories();"
  print " static final Map SCRIPTS = Scripts();"
  print " static final Map PROPERTIES = Properties();"
  print " static final Map FOLD_CATEGORIES = FoldCategory();"
  print " static final Map FOLD_SCRIPT = FoldScript();"
  print ""
  print " private UnicodeTables() {} // uninstantiable";
  print "}"
}
182 |
--------------------------------------------------------------------------------
/app/src/main/java/com/google/re2j/package.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | This package provides an implementation of regular expression
4 | matching based on Russ Cox's linear-time RE2 algorithm.
6 |
7 |
8 | The API presented by {@code com.google.re2j} mimics that of {@code
9 | java.util.regex.Matcher} and {@code java.util.regex.Pattern}.
10 | While not identical, they are similar enough that most users can
11 | switch implementations simply by changing their {@code import}s.
12 |
13 |
14 | The syntax of the regular expressions accepted is the same general
15 | syntax used by Perl, Python, and other languages. More precisely,
16 | it is the syntax accepted by the C++ and Go implementations of RE2
17 | described at http://code.google.com/p/re2/wiki/Syntax,
19 | except for \C (match any byte), which is not
20 | supported because in this implementation, the matcher's input is
21 | conceptually a stream of Unicode code points, not bytes.
22 |
23 |
24 | The current API is rather small and intended for compatibility
25 | with {@code java.util.regex}, but the underlying implementation
26 | supports some additional features, such as the ability to process
27 | input character streams encoded as UTF-8 byte arrays. These may
28 | be exposed in a future release if there is sufficient interest.
29 |