├── .gitignore
├── .idea
├── ant.xml
├── compiler.xml
├── encodings.xml
├── libraries
│ └── Maven__junit_junit_4_7.xml
├── misc.xml
├── modules.xml
├── vcs.xml
└── workspace.xml
├── README.markdown
├── article
├── build.sh
├── conf.py
├── images
│ ├── benchmark.pdf
│ ├── benchmark.png
│ ├── dfa.pdf
│ ├── dfa.png
│ ├── ire-dfa-nfa.pdf
│ ├── ire-dfa-nfa.png
│ ├── ire-fa.pdf
│ ├── ire-fa.png
│ ├── ire-overview.pdf
│ ├── ire-overview.png
│ ├── ire-rope.pdf
│ ├── ire-rope.png
│ ├── memory-overhead.pdf
│ ├── memory-overhead.png
│ ├── nfa-composition.pdf
│ ├── nfa-composition.png
│ ├── nfa-ne-all.pdf
│ ├── nfa-ne-all.png
│ ├── nfa-ne.pdf
│ ├── nfa-ne.png
│ ├── nfa.pdf
│ ├── nfa.png
│ ├── rope-append-sharing.pdf
│ ├── rope-append-sharing.png
│ ├── rope-append.pdf
│ ├── rope-append.png
│ ├── rope-minmax.pdf
│ ├── rope-minmax.png
│ ├── rope-nfa.pdf
│ ├── rope-nfa.png
│ ├── rope-ops.pdf
│ ├── rope-ops.png
│ ├── split-as-concatenation.odg
│ ├── split-as-concatenation.pdf
│ ├── split-as-concatenation.png
│ ├── split-sum-squares.pdf
│ ├── split-sum-squares.png
│ ├── tree-split-pred.pdf
│ └── tree-split-pred.png
├── index.rst
└── themes
│ └── ire
│ ├── static
│ └── ire.css_t
│ └── theme.conf
├── ire.iml
├── lib
└── annotations.jar
├── pom.xml
├── src
├── main
│ └── java
│ │ └── org
│ │ └── jkff
│ │ └── ire
│ │ ├── Compiler.java
│ │ ├── DFAIndexedString.java
│ │ ├── DFAMatcher.java
│ │ ├── DFARopePatternSet.java
│ │ ├── IndexedString.java
│ │ ├── LinearIS.java
│ │ ├── Match.java
│ │ ├── PatternSet.java
│ │ ├── fa
│ │ ├── BiDFA.java
│ │ ├── DFA.java
│ │ ├── IntState.java
│ │ ├── IntTable.java
│ │ ├── MutableTransferFunction.java
│ │ ├── PowerIntState.java
│ │ ├── PowerIntTable.java
│ │ ├── Sequence.java
│ │ ├── State.java
│ │ ├── TransferFunction.java
│ │ └── TransferTable.java
│ │ ├── regex
│ │ ├── Alternative.java
│ │ ├── CharacterClass.java
│ │ ├── Empty.java
│ │ ├── Labeled.java
│ │ ├── OnceOrMore.java
│ │ ├── RegexCompiler.java
│ │ ├── RegexParser.java
│ │ ├── RxNode.java
│ │ └── Sequence.java
│ │ ├── rope
│ │ ├── Rope.java
│ │ ├── RopeBasedIS.java
│ │ └── RopeFactory.java
│ │ └── util
│ │ ├── CoarsestPartition.java
│ │ ├── CollectionFactory.java
│ │ ├── Function.java
│ │ ├── Function2.java
│ │ ├── Pair.java
│ │ ├── Predicate.java
│ │ ├── Reducer.java
│ │ └── WrappedBitSet.java
└── test
│ └── java
│ └── org
│ └── jkff
│ └── ire
│ ├── DFABuilder.java
│ ├── IntegrationTest.java
│ ├── LinearISTest.java
│ ├── NFABuilder.java
│ ├── regex
│ ├── RegexCompilerTest.java
│ └── RegexParserTest.java
│ └── rope
│ ├── RopeBasedISTest.java
│ └── RopeTest.java
└── target
└── ire-0.1.jar
/.gitignore:
--------------------------------------------------------------------------------
1 | out
2 |
--------------------------------------------------------------------------------
/.idea/ant.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/compiler.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
--------------------------------------------------------------------------------
/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/libraries/Maven__junit_junit_4_7.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
32 |
33 |
34 | http://www.w3.org/1999/xhtml
35 |
36 |
37 |
38 |
39 |
40 |
41 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/README.markdown:
--------------------------------------------------------------------------------
1 | What's this
2 | =============
3 | ire is a library for incremental regular expression matching, based on further development of the ideas from Dan Piponi's famous blogpost [Fast Incremental Regular Expression Matching with Monoids](http://blog.sigfpe.com/2009/01/fast-incremental-regular-expression.html).
4 |
5 | * **ire** is for "incremental", "regular", "expressions"
6 | * **incremental** means "fast recomputation of results according to changes of input string" (not pattern)
7 | * **regular** means "regular" - i.e. no backrefs or other Perl magic.
8 |
9 | There is an article about it: http://jkff.info/articles/ire
10 |
11 | How to use it?
12 | ===============
13 |
14 | Add the target/ire-VERSION.jar to your classpath.
15 |
16 | import org.jkff.ire.*;
17 | import org.jkff.ire.regex.RegexCompiler;
18 |
19 | // Compile the regexes
20 | String[] regexes = {...};
21 | PatternSet pat = RegexCompiler.compile(regexes);
22 |
23 | // Index a string (slow)
24 | IndexedString is = pat.match(someString); // or match(someString, blockSize)
25 |
26 | // Get matches (fast)
27 | for(Match m : is.getMatches()) {
28 | int startPos = m.startPos();
29 | int length = m.length();
30 | int whichPattern = m.whichPattern();
31 | }
32 |
33 | // Here's the "incremental" part. Assume 'a' and 'b' are IndexedString's.
34 | // You can cut and recombine string pieces, it will be fast, and getMatches()
35 | // of the resulting strings will be fast.
36 | IndexedString c = a.append(b);
37 | IndexedString sub = is.subSequence(start, end);
38 | Pair p = is.splitBefore(i);
39 |
40 | How to experiment with it?
41 | ==========================
42 | Open the IDEA project (or create a project in your favourite IDE over it - there's just one library dependency in the "lib" folder) and run the "tests" in `org.jkff.ire.IntegrationTest`.
43 |
44 | Do not forget to run the unit tests after changes.
45 |
46 | Ask me (ekirpichov@gmail.com) if you're interested in something.
47 |
48 | How fast is it?
49 | ===============
50 |
51 | It is much faster than `java.util.regex` in the following case:
52 |
53 | * Not too many patterns
54 | * Not too many occurrences of them
55 | * The input strings are very long
56 | * Incremental operations are dominant
57 | * You have a lot of spare memory (the "block size" parameter is not too large)
58 |
59 | It is much slower in most other cases.
60 |
61 | For example, when finding all occurrences of patterns from the ["regex-dna" benchmark](http://shootout.alioth.debian.org/u32q/performance.php?test=regexdna)
62 | in a 500-kb DNA string with a total of 100 occurrences, with a block size of 16 we're getting nearly 5000 occurrences per second with our library and about 500 per second with java.util.regex.
63 |
64 | However, when solving the same problem for a 50kb string with 100 occurrences, with a block size of 256 we have exactly the opposite - 500 vs 6000.
65 |
66 | How does it work?
67 | ==================
68 | Read Dan Piponi's aforementioned blogpost; here are the differences:
69 |
70 | * Instead of fingertrees, we use a "rope" data structure that caches sums of values in an arbitrary monoid. The rope data structure is in the `org.jkff.ire.rope` package. It uses a constant-height 2-3 tree of N..2N-1 array chunks. Append and split operations are quite trivial.
71 | * We not only test for a match, but also find match positions. This is done by 1 split to find the end of the match and another to find the beginning, with some intricacies for overlapping matches. See `org.jkff.ire.DFAMatcher` class.
72 | * We use an NFA instead of a DFA, because we care mostly about the number of states (we have to compose transition tables) and the state blow-up of DFAs is unacceptable. FAs are in the `org.jkff.ire.fa` package.
73 | * We do some optimization of the NFA to further reduce states, see `org.jkff.ire.regex.RegexCompiler`.
74 | * We use a compact representation of the NFA as a boolean matrix represented as a bitset, with fast multiplication, see `org.jkff.ire.fa.PowerIntTable`.
75 |
76 | If you happen to read Russian, then read [an article in fprog.ru](http://fprog.ru/2010/issue6/eugene-kirpichov-incremental-regular-expressions/)
77 |
--------------------------------------------------------------------------------
/article/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | sphinx-build -b html -D math-output=mathjax -a -E . ./_build
4 |
--------------------------------------------------------------------------------
/article/conf.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | #
3 | # This file is execfile()d with the current directory set to its containing dir.
4 | #
5 | # Note that not all possible configuration values are present in this
6 | # autogenerated file.
7 | #
8 | # All configuration values have a default; values that are commented out
9 | # serve to show the default.
10 |
11 | import sys, os
12 |
13 | # If extensions (or modules to document with autodoc) are in another directory,
14 | # add these directories to sys.path here. If the directory is relative to the
15 | # documentation root, use os.path.abspath to make it absolute, like shown here.
16 | #sys.path.insert(0, os.path.abspath('.'))
17 |
18 | # -- General configuration -----------------------------------------------------
19 |
20 | # If your documentation needs a minimal Sphinx version, state it here.
21 | #needs_sphinx = '1.0'
22 |
23 | # Add any Sphinx extension module names here, as strings. They can be extensions
24 | # coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
25 | extensions = ['sphinx.ext.todo', 'sphinx.ext.mathjax']
26 |
27 | # Add any paths that contain templates here, relative to this directory.
28 | templates_path = ['_templates']
29 |
30 | # The suffix of source filenames.
31 | source_suffix = '.rst'
32 |
33 | # The encoding of source files.
34 | #source_encoding = 'utf-8-sig'
35 |
36 | # The master toctree document.
37 | master_doc = 'index'
38 |
39 | # General information about the project.
40 | project = u'ire'
41 | copyright = u'2012, Eugene Kirpichov'
42 |
43 | # The version info for the project you're documenting, acts as replacement for
44 | # |version| and |release|, also used in various other places throughout the
45 | # built documents.
46 | #
47 | # The short X.Y version.
48 | # The full version, including alpha/beta/rc tags.
49 |
50 | # The language for content autogenerated by Sphinx. Refer to documentation
51 | # for a list of supported languages.
52 | #language = None
53 |
54 | # There are two options for replacing |today|: either, you set today to some
55 | # non-false value, then it is used:
56 | #today = ''
57 | # Else, today_fmt is used as the format for a strftime call.
58 | #today_fmt = '%B %d, %Y'
59 |
60 | # List of patterns, relative to source directory, that match files and
61 | # directories to ignore when looking for source files.
62 | exclude_patterns = ['_build']
63 |
64 | # The reST default role (used for this markup: `text`) to use for all documents.
65 | #default_role = None
66 |
67 | # If true, '()' will be appended to :func: etc. cross-reference text.
68 | #add_function_parentheses = True
69 |
70 | # If true, the current module name will be prepended to all description
71 | # unit titles (such as .. function::).
72 | #add_module_names = True
73 |
74 | # If true, sectionauthor and moduleauthor directives will be shown in the
75 | # output. They are ignored by default.
76 | #show_authors = False
77 |
78 | # The name of the Pygments (syntax highlighting) style to use.
79 | pygments_style = 'sphinx'
80 |
81 | # A list of ignored prefixes for module index sorting.
82 | #modindex_common_prefix = []
83 |
84 |
85 | # -- Options for HTML output ---------------------------------------------------
86 |
87 | # The theme to use for HTML and HTML Help pages. See the documentation for
88 | # a list of builtin themes.
89 | html_theme = 'ire'
90 |
91 | # Theme options are theme-specific and customize the look and feel of a theme
92 | # further. For a list of options available for each theme, see the
93 | # documentation.
94 | html_theme_options = { 'sidebarwidth': 300 }
95 |
96 | # Add any paths that contain custom themes here, relative to this directory.
97 | html_theme_path = ['themes']
98 |
99 | # The name for this set of Sphinx documents. If None, it defaults to
100 | # "<project> v<release> documentation".
101 | html_title = 'Incremental regular expressions'
102 |
103 | # A shorter title for the navigation bar. Default is the same as html_title.
104 | #html_short_title = None
105 |
106 | # The name of an image file (relative to this directory) to place at the top
107 | # of the sidebar.
108 | #html_logo = None
109 |
110 | # The name of an image file (within the static path) to use as favicon of the
111 | # docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
112 | # pixels large.
113 | #html_favicon = None
114 |
115 | # Add any paths that contain custom static files (such as style sheets) here,
116 | # relative to this directory. They are copied after the builtin static files,
117 | # so a file named "default.css" will overwrite the builtin "default.css".
118 | html_static_path = ['_static']
119 |
120 | # If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
121 | # using the given strftime format.
122 | #html_last_updated_fmt = '%b %d, %Y'
123 |
124 | # If true, SmartyPants will be used to convert quotes and dashes to
125 | # typographically correct entities.
126 | #html_use_smartypants = True
127 |
128 | # Custom sidebar templates, maps document names to template names.
129 | #html_sidebars = {}
130 |
131 | # Additional templates that should be rendered to pages, maps page names to
132 | # template names.
133 | #html_additional_pages = {}
134 |
135 | # If false, no module index is generated.
136 | #html_domain_indices = True
137 |
138 | # If false, no index is generated.
139 | #html_use_index = True
140 |
141 | # If true, the index is split into individual pages for each letter.
142 | #html_split_index = False
143 |
144 | # If true, links to the reST sources are added to the pages.
145 | #html_show_sourcelink = True
146 |
147 | # If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
148 | #html_show_sphinx = True
149 |
150 | # If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
151 | #html_show_copyright = True
152 |
153 | # If true, an OpenSearch description file will be output, and all pages will
154 | # contain a <link> tag referring to it. The value of this option must be the
155 | # base URL from which the finished HTML is served.
156 | #html_use_opensearch = ''
157 |
158 | # This is the file name suffix for HTML files (e.g. ".xhtml").
159 | #html_file_suffix = None
160 |
161 | # Output file base name for HTML help builder.
162 | htmlhelp_basename = 'iredoc'
163 |
164 |
165 | # -- Options for LaTeX output --------------------------------------------------
166 |
167 | latex_elements = {
168 | # The paper size ('letterpaper' or 'a4paper').
169 | #'papersize': 'letterpaper',
170 |
171 | # The font size ('10pt', '11pt' or '12pt').
172 | #'pointsize': '10pt',
173 |
174 | # Additional stuff for the LaTeX preamble.
175 | #'preamble': '',
176 | }
177 |
178 | # The name of an image file (relative to this directory) to place at the top of
179 | # the title page.
180 | #latex_logo = None
181 |
182 | # For "manual" documents, if this is true, then toplevel headings are parts,
183 | # not chapters.
184 | #latex_use_parts = False
185 |
186 | # If true, show page references after internal links.
187 | #latex_show_pagerefs = False
188 |
189 | # If true, show URL addresses after external links.
190 | #latex_show_urls = False
191 |
192 | # Documents to append as an appendix to all manuals.
193 | #latex_appendices = []
194 |
195 | # If false, no module index is generated.
196 | #latex_domain_indices = True
197 |
198 |
199 | # -- Options for manual page output --------------------------------------------
200 |
201 | # If true, show URL addresses after external links.
202 | #man_show_urls = False
203 |
204 |
205 | # -- Options for Texinfo output ------------------------------------------------
206 |
207 | # Documents to append as an appendix to all manuals.
208 | #texinfo_appendices = []
209 |
210 | # If false, no module index is generated.
211 | #texinfo_domain_indices = True
212 |
213 | # How to display URL addresses: 'footnote', 'no', or 'inline'.
214 | #texinfo_show_urls = 'footnote'
215 |
--------------------------------------------------------------------------------
/article/images/benchmark.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/benchmark.pdf
--------------------------------------------------------------------------------
/article/images/benchmark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/benchmark.png
--------------------------------------------------------------------------------
/article/images/dfa.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/dfa.pdf
--------------------------------------------------------------------------------
/article/images/dfa.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/dfa.png
--------------------------------------------------------------------------------
/article/images/ire-dfa-nfa.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/ire-dfa-nfa.pdf
--------------------------------------------------------------------------------
/article/images/ire-dfa-nfa.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/ire-dfa-nfa.png
--------------------------------------------------------------------------------
/article/images/ire-fa.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/ire-fa.pdf
--------------------------------------------------------------------------------
/article/images/ire-fa.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/ire-fa.png
--------------------------------------------------------------------------------
/article/images/ire-overview.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/ire-overview.pdf
--------------------------------------------------------------------------------
/article/images/ire-overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/ire-overview.png
--------------------------------------------------------------------------------
/article/images/ire-rope.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/ire-rope.pdf
--------------------------------------------------------------------------------
/article/images/ire-rope.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/ire-rope.png
--------------------------------------------------------------------------------
/article/images/memory-overhead.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/memory-overhead.pdf
--------------------------------------------------------------------------------
/article/images/memory-overhead.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/memory-overhead.png
--------------------------------------------------------------------------------
/article/images/nfa-composition.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/nfa-composition.pdf
--------------------------------------------------------------------------------
/article/images/nfa-composition.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/nfa-composition.png
--------------------------------------------------------------------------------
/article/images/nfa-ne-all.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/nfa-ne-all.pdf
--------------------------------------------------------------------------------
/article/images/nfa-ne-all.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/nfa-ne-all.png
--------------------------------------------------------------------------------
/article/images/nfa-ne.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/nfa-ne.pdf
--------------------------------------------------------------------------------
/article/images/nfa-ne.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/nfa-ne.png
--------------------------------------------------------------------------------
/article/images/nfa.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/nfa.pdf
--------------------------------------------------------------------------------
/article/images/nfa.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/nfa.png
--------------------------------------------------------------------------------
/article/images/rope-append-sharing.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/rope-append-sharing.pdf
--------------------------------------------------------------------------------
/article/images/rope-append-sharing.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/rope-append-sharing.png
--------------------------------------------------------------------------------
/article/images/rope-append.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/rope-append.pdf
--------------------------------------------------------------------------------
/article/images/rope-append.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/rope-append.png
--------------------------------------------------------------------------------
/article/images/rope-minmax.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/rope-minmax.pdf
--------------------------------------------------------------------------------
/article/images/rope-minmax.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/rope-minmax.png
--------------------------------------------------------------------------------
/article/images/rope-nfa.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/rope-nfa.pdf
--------------------------------------------------------------------------------
/article/images/rope-nfa.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/rope-nfa.png
--------------------------------------------------------------------------------
/article/images/rope-ops.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/rope-ops.pdf
--------------------------------------------------------------------------------
/article/images/rope-ops.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/rope-ops.png
--------------------------------------------------------------------------------
/article/images/split-as-concatenation.odg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/split-as-concatenation.odg
--------------------------------------------------------------------------------
/article/images/split-as-concatenation.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/split-as-concatenation.pdf
--------------------------------------------------------------------------------
/article/images/split-as-concatenation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/split-as-concatenation.png
--------------------------------------------------------------------------------
/article/images/split-sum-squares.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/split-sum-squares.pdf
--------------------------------------------------------------------------------
/article/images/split-sum-squares.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/split-sum-squares.png
--------------------------------------------------------------------------------
/article/images/tree-split-pred.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/tree-split-pred.pdf
--------------------------------------------------------------------------------
/article/images/tree-split-pred.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/article/images/tree-split-pred.png
--------------------------------------------------------------------------------
/article/themes/ire/static/ire.css_t:
--------------------------------------------------------------------------------
1 | /*
2 | * nature.css_t
3 | * ~~~~~~~~~~~~
4 | *
5 | * Sphinx stylesheet -- nature theme.
6 | *
7 | * :copyright: Copyright 2007-2011 by the Sphinx team, see AUTHORS.
8 | * :license: BSD, see LICENSE for details.
9 | *
10 | */
11 |
12 | @import url("basic.css");
13 |
14 | /* -- page layout ----------------------------------------------------------- */
15 |
16 | body {
17 | font-family: Baskerville, serif;
18 | font-size: 120%;
19 | background-color: #111;
20 | color: #555;
21 | margin: 0;
22 | padding: 0;
23 | }
24 |
25 | div.documentwrapper { /* floated left at full width */
26 | float: left;
27 | width: 100%;
28 | }
29 |
30 | div.bodywrapper {
31 | margin: 0 0 0 {{ theme_sidebarwidth|toint }}px; /* left margin is filled in from the theme's sidebarwidth option when the template is rendered */
32 | }
33 |
34 | hr {
35 | border: 1px solid #B1B4B6;
36 | }
37 |
38 | div.document {
39 | background-color: #eee;
40 | }
41 |
42 | div.body {
43 | background-color: #ffffff;
44 | color: #3E4349;
45 | padding: 0 30px 30px 30px;
46 | width: 900px; /* fixed pixel width, unlike the percentage widths used by the wrappers */
47 | font-size: 0.9em;
48 | }
49 |
50 | div.footer {
51 | color: #555;
52 | width: 100%;
53 | padding: 13px 0;
54 | text-align: center;
55 | font-size: 75%;
56 | }
57 |
58 | div.footer a {
59 | color: #444;
60 | text-decoration: underline;
61 | }
62 |
63 | div.related {
64 | background-color: #6BA81E;
65 | line-height: 32px;
66 | color: #fff;
67 | text-shadow: 0px 1px 0 #444;
68 | font-size: 0.9em;
69 | }
70 |
71 | div.related a {
72 | color: #E2F3CC;
73 | }
74 |
75 | div.sphinxsidebar {
76 | font-size: 0.75em;
77 | line-height: 1.5em;
78 | }
79 |
80 | div.sphinxsidebarwrapper{
81 | padding: 20px 0;
82 | }
83 |
84 | div.sphinxsidebar h3,
85 | div.sphinxsidebar h4 { /* shared heading style; the h4 font size is overridden by the next rule */
86 | font-family: Arial, sans-serif;
87 | color: #222;
88 | font-size: 1.2em;
89 | font-weight: normal;
90 | margin: 0;
91 | padding: 5px 10px;
92 | background-color: #ddd;
93 | text-shadow: 1px 1px 0 white /* last declaration of the rule; written without a trailing semicolon */
94 | }
95 |
96 | div.sphinxsidebar h4{
97 | font-size: 1.1em;
98 | }
99 |
100 | div.sphinxsidebar h3 a {
101 | color: #444;
102 | }
103 |
104 |
105 | div.sphinxsidebar p {
106 | color: #888;
107 | padding: 5px 20px;
108 | }
109 |
110 | div.sphinxsidebar p.topless { /* empty rule: declares no properties */
111 | }
112 |
113 | div.sphinxsidebar ul {
114 | margin: 10px 10px;
115 | padding: 0;
116 | color: #000;
117 | }
118 |
119 | div.sphinxsidebar a {
120 | color: #444;
121 | }
122 |
123 | div.sphinxsidebar input {
124 | border: 1px solid #ccc;
125 | font-family: sans-serif;
126 | font-size: 1em;
127 | }
128 |
129 | div.sphinxsidebar input[type=text]{ /* only text inputs get the left indent */
130 | margin-left: 20px;
131 | }
132 |
133 | /* -- body styles ----------------------------------------------------------- */
134 |
135 | a {
136 | color: #005B81;
137 | text-decoration: none;
138 | }
139 |
140 | a:hover {
141 | color: #E32E00;
142 | text-decoration: underline;
143 | }
144 |
145 | /* Shared heading style; the per-level rules below override font size and background. */
146 | div.body h1,
147 | div.body h2,
148 | div.body h3,
149 | div.body h4,
150 | div.body h5,
151 | div.body h6 {
152 | font-family: Gill Sans, Arial, sans-serif;
153 | background-color: #BED4EB;
154 | font-weight: normal;
155 | color: #212224;
156 | margin: 30px 0px 10px 0px;
157 | padding: 5px 0 5px 10px;
158 | text-shadow: 0px 1px 0 white;
159 | }
160 |
161 | div.body h1 { border-top: 20px solid white; margin-top: 0; font-size: 200%; }
162 | div.body h2 { font-size: 150%; background-color: #C8D5E3; }
163 | div.body h3 { font-size: 120%; background-color: #D8DEE3; }
164 | div.body h4 { font-size: 110%; background-color: #D8DEE3; }
165 | div.body h5 { font-size: 100%; background-color: #D8DEE3; }
166 | div.body h6 { font-size: 100%; background-color: #D8DEE3; }
167 |
168 | a.headerlink {
169 | color: #c60f0f;
170 | font-size: 0.8em;
171 | padding: 0 4px 0 4px;
172 | text-decoration: none;
173 | }
174 |
175 | a.headerlink:hover {
176 | background-color: #c60f0f;
177 | color: white;
178 | }
179 |
180 | div.body p, div.body dd, div.body li {
181 | line-height: 1.5em;
182 | }
183 |
184 | /* Keeps the first paragraph on the same line as the inline admonition title (see p.admonition-title). */
185 | div.admonition p.admonition-title + p {
186 | display: inline;
187 | }
188 |
189 | div.highlight{
190 | background-color: white;
191 | }
192 |
193 | div.note {
194 | background-color: #eee;
195 | border: 1px solid #ccc;
196 | }
197 |
198 | div.seealso {
199 | background-color: #ffc;
200 | border: 1px solid #ff6;
201 | }
202 |
203 | div.topic {
204 | background-color: #eee;
205 | }
206 |
207 | div.warning {
208 | background-color: #ffe4e4;
209 | border: 1px solid #f66;
210 | }
211 |
212 | p.admonition-title {
213 | display: inline;
214 | }
215 |
216 | p.admonition-title:after {
217 | content: ":";
218 | }
219 |
220 | pre {
221 | padding: 10px;
222 | background-color: White;
223 | color: #222;
224 | line-height: 1.2em;
225 | border: 1px solid #C6C9CB;
226 | font-size: 1.1em;
227 | margin: 1.5em 0 1.5em 0;
228 | -webkit-box-shadow: 1px 1px 1px #d8d8d8;
229 | -moz-box-shadow: 1px 1px 1px #d8d8d8;
230 | box-shadow: 1px 1px 1px #d8d8d8; /* standard property; listed after the prefixed forms so it takes precedence where supported */
231 | }
232 |
233 | tt {
234 | background-color: #ecf0f3;
235 | color: #222;
236 | /* padding: 1px 2px; */
237 | font-size: 1.1em;
238 | font-family: monospace;
239 | }
240 |
241 | .viewcode-back {
242 | font-family: Arial, sans-serif;
243 | }
244 |
245 | div.viewcode-block:target {
246 | background-color: #f4debf;
247 | border-top: 1px solid #ac9;
248 | border-bottom: 1px solid #ac9;
249 | }
250 |
--------------------------------------------------------------------------------
/article/themes/ire/theme.conf:
--------------------------------------------------------------------------------
1 | [theme]
2 | inherit = nature
3 | stylesheet = ire.css
4 | pygments_style = tango
5 |
--------------------------------------------------------------------------------
/ire.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
--------------------------------------------------------------------------------
/lib/annotations.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jkff/ire/7fc556f73b0a7f21d059463eb153e6629b90b48a/lib/annotations.jar
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
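1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
2 | xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
3 | <modelVersion>4.0.0</modelVersion>
4 | <groupId>org.jkff</groupId>
5 | <artifactId>ire</artifactId>
6 | <packaging>jar</packaging>
7 | <version>0.1</version>
8 | <name>ire</name>
9 | <url>http://maven.apache.org</url>
10 |
11 | <dependencies>
12 | <dependency>
13 | <groupId>junit</groupId>
14 | <artifactId>junit</artifactId>
15 | <version>4.7</version>
16 | <scope>test</scope>
17 | </dependency>
18 |
19 | <dependency>
20 | <groupId>org.jetbrains</groupId>
21 | <artifactId>annotations</artifactId>
22 | <version>1.0.0</version>
23 | <scope>system</scope>
24 | <systemPath>${basedir}/lib/annotations.jar</systemPath>
25 | </dependency>
26 | </dependencies>
27 | </project>
28 |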
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/Compiler.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire;
2 |
3 | /**
4 | * Turns an array of pattern strings into a single {@link PatternSet}. Created on: 22.07.2010 23:26:38
5 | */
6 | public interface Compiler {
7 | PatternSet compile(String[] patterns); // one PatternSet for the whole array; the pattern syntax is not specified by this interface
8 | }
9 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/DFAIndexedString.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire;
2 |
3 | import org.jkff.ire.fa.TransferFunction;
4 |
5 | /**
6 | * An {@link IndexedString} that also exposes a forward and a backward {@link TransferFunction}. Created on: 21.08.2010 21:03:13
7 | */
8 | public interface DFAIndexedString extends IndexedString {
9 | TransferFunction getForward(); // DFAMatcher applies this to a state to advance it across this whole string
10 | TransferFunction getBackward(); // counterpart that DFAMatcher applies when scanning from the right end
11 | }
12 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/DFAMatcher.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire;
2 |
3 | import org.jkff.ire.fa.BiDFA;
4 | import org.jkff.ire.fa.State;
5 | import org.jkff.ire.util.Function2;
6 | import org.jkff.ire.util.Pair;
7 | import org.jkff.ire.util.Predicate;
8 |
9 | import org.jkff.ire.util.WrappedBitSet;
10 | import java.util.List;
11 |
12 | import static org.jkff.ire.util.CollectionFactory.newArrayList;
13 |
14 | /**
15 | * Created on: 31.07.2010 12:19:28
16 | */
17 | public class DFAMatcher {
18 | @SuppressWarnings("unchecked")
19 | public static
20 | Iterable getMatches(
21 | final BiDFA bidfa, final DFAIndexedString string)
22 | {
23 | final ST initial = bidfa.getForward().getInitialState();
24 |
25 | Function2, IndexedString, SP> addString = new Function2, IndexedString, SP>() {
26 | public SP applyTo(SP sp, IndexedString s) {
27 | return new SP(((DFAIndexedString) s).getForward().next(sp.state), sp.pos+s.length());
28 | }
29 | };
30 |
31 | Function2, Character, SP> addChar = new Function2, Character, SP>() {
32 | public SP applyTo(SP sp, Character c) {
33 | return new SP(bidfa.getForward().transfer(c).next(sp.state), sp.pos+1);
34 | }
35 | };
36 |
37 | List res = newArrayList();
38 |
39 | int shift = 0;
40 |
41 | SP matchStartState = new SP(initial, 0);
42 | IndexedString rem = string;
43 | IndexedString seen = string.subSequence(0,0);
44 |
45 | while(true) {
46 | Pair p = rem.splitAfterRise(
47 | matchStartState, addString, addChar, DFAMatcher.hasForwardMatchAfter(shift));
48 | if(p == null)
49 | break;
50 |
51 | DFAIndexedString matchingPrefix = (DFAIndexedString) p.first;
52 | rem = p.second;
53 | seen = seen.append(matchingPrefix);
54 |
55 | final ST stateAfterMatch = matchingPrefix.getForward().next(matchStartState.state);
56 | WrappedBitSet term = stateAfterMatch.getTerminatedPatterns();
57 |
58 | ST backwardInitial = bidfa.getBackward().getInitialState();
59 |
60 | ST nextMatchStart = stateAfterMatch;
61 |
62 | for(int bit = term.nextSetBit(0); bit >= 0; bit = term.nextSetBit(bit+1)) {
63 | final int bit2 = bit;
64 |
65 | Function2 addStringBack = new Function2() {
66 | public ST applyTo(ST st, IndexedString s) {
67 | return ((DFAIndexedString) s).getBackward().next(st);
68 | }
69 | };
70 |
71 | Function2 addCharBack = new Function2() {
72 | public ST applyTo(ST st, Character c) {
73 | return bidfa.getBackward().transfer(c).next(st);
74 | }
75 | };
76 |
77 | Predicate startsThisMatch = new Predicate() {
78 | public boolean isTrueFor(ST state) {
79 | WrappedBitSet tp = state.getTerminatedPatterns();
80 | return tp!=null && tp.get(bit2);
81 | }
82 | };
83 |
84 | int len = seen.splitAfterBackRise(
85 | backwardInitial, addStringBack, addCharBack, startsThisMatch).second.length();
86 | int startPos = seen.length() - len;
87 | res.add(new Match(bit, startPos, len));
88 |
89 | nextMatchStart = bidfa.getForward().resetTerminatedPattern(nextMatchStart, bit);
90 | }
91 |
92 | matchStartState = new SP(nextMatchStart, matchingPrefix.length() + 1);
93 | }
94 |
95 | return res;
96 | }
97 |
98 | private static Predicate> hasForwardMatchAfter(final int pos) {
99 | return new Predicate>() {
100 | public boolean isTrueFor(SP sp) {
101 | return !sp.state.getTerminatedPatterns().isEmpty() && sp.pos >= pos;
102 | }
103 | };
104 | }
105 |
106 | // State and position.
107 | private static class SP {
108 | ST state;
109 | int pos;
110 |
111 | SP(ST state, int pos) {
112 | this.state = state;
113 | this.pos = pos;
114 | }
115 | }
116 | }
117 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/DFARopePatternSet.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire;
2 |
3 | import org.jkff.ire.fa.BiDFA;
4 | import org.jkff.ire.fa.PowerIntState;
5 | import org.jkff.ire.rope.RopeBasedIS;
6 |
7 | /**
8 | * {@link PatternSet} backed by a single {@link BiDFA}; each match call wraps the input in a {@link RopeBasedIS}. Created on: 01.09.2010 23:44:51
9 | */
10 | public class DFARopePatternSet implements PatternSet {
11 | private BiDFA bidfa; // handed to every RopeBasedIS created below
12 |
13 | public DFARopePatternSet(BiDFA bidfa) {
14 | this.bidfa = bidfa;
15 | }
16 |
17 | public IndexedString match(String s) { // uses the RopeBasedIS constructor that takes no block size
18 | return new RopeBasedIS(bidfa, s);
19 | }
20 |
21 | public IndexedString match(String s, int blockSize) { // blockSize is passed to RopeBasedIS unchanged
22 | return new RopeBasedIS(bidfa, s, blockSize);
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/IndexedString.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire;
2 |
3 | import org.jkff.ire.util.Function2;
4 | import org.jkff.ire.util.Pair;
5 | import org.jkff.ire.util.Predicate;
6 | import org.jetbrains.annotations.Nullable;
7 |
8 | /**
9 | * A {@link CharSequence} that can report pattern matches and can be split and appended. Created on: 22.07.2010 23:20:48
10 | */
11 | public interface IndexedString extends CharSequence {
12 | Iterable getMatches();
13 |
14 | Pair splitBefore(int index); // LinearIS returns (characters before index, characters from index on)
15 |
16 | @Nullable
17 | Pair splitAfterRise(
18 | ST seed,
19 | Function2 addChunk, Function2 addChar,
20 | Predicate toBool); // in LinearIS: fold from seed left to right, split where toBool first holds, null if it never does
21 |
22 | /**
23 | * Like splitAfterRise, but we count from the right end.
24 | * @param addChunk will be given a NON-REVERSED chunk
25 | */
26 | @Nullable
27 | Pair splitAfterBackRise(
28 | T seed,
29 | Function2 addChunk, Function2 addChar,
30 | Predicate toBool);
31 |
32 | IndexedString append(IndexedString s);
33 |
34 | IndexedString subSequence(int start, int end); // narrows the return type of CharSequence.subSequence
35 | }
36 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/LinearIS.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire;
2 |
3 | import org.jkff.ire.fa.BiDFA;
4 | import org.jkff.ire.fa.DFA;
5 | import org.jkff.ire.fa.TransferFunction;
6 | import org.jkff.ire.fa.State;
7 | import org.jkff.ire.util.Pair;
8 | import org.jkff.ire.util.Reducer;
9 | import org.jkff.ire.util.Function2;
10 | import org.jkff.ire.util.Predicate;
11 |
12 | /**
13 | * {@link DFAIndexedString} backed by a plain {@link CharSequence}. The forward and backward
14 | * transfer functions of the whole text are composed eagerly in the public constructor, and the
15 | * split operations walk the text one character at a time.
16 | * Created on: 23.07.2010 9:23:42
17 | */
18 | public class LinearIS implements DFAIndexedString {
19 | private CharSequence cs;
20 | private BiDFA bidfa;
21 | private TransferFunction forward;
22 | private TransferFunction backward;
23 |
24 | /** Wraps {@code cs} and composes its per-character transfer functions in both directions. */
25 | public LinearIS(CharSequence cs, BiDFA bidfa) {
26 | this(cs, bidfa, transferForward(bidfa, cs), transferBackward(bidfa, cs));
27 | }
28 |
29 | private LinearIS(CharSequence cs,
30 | BiDFA bidfa,
31 | TransferFunction forward, TransferFunction backward)
32 | {
33 | this.cs = cs;
34 | this.bidfa = bidfa;
35 | this.forward = forward;
36 | this.backward = backward;
37 | }
38 |
39 | /** @return the composed forward transfer function; {@code null} when the text is empty */
40 | public TransferFunction getForward() {
41 | return forward;
42 | }
43 |
44 | /** @return the composed backward transfer function; {@code null} when the text is empty */
45 | public TransferFunction getBackward() {
46 | return backward;
47 | }
48 |
49 | public Iterable getMatches() {
50 | return DFAMatcher.getMatches(bidfa, this);
51 | }
52 |
53 | public int length() {
54 | return cs.length();
55 | }
56 |
57 | public char charAt(int index) {
58 | return cs.charAt(index);
59 | }
60 |
61 | public String toString() {
62 | return cs.toString();
63 | }
64 |
65 | /** Re-wraps the sub-range; the new instance recomputes its transfer functions from scratch. */
66 | public LinearIS subSequence(int start, int end) {
67 | return new LinearIS(cs.subSequence(start, end), bidfa);
68 | }
69 |
70 | /** Splits into the characters before {@code index} and the characters from {@code index} on. */
71 | public Pair splitBefore(int index) {
72 | return Pair.of(
73 | (IndexedString)new LinearIS(cs.subSequence(0, index), bidfa),
74 | (IndexedString)new LinearIS(cs.subSequence(index, cs.length()), bidfa));
75 | }
76 |
77 | /**
78 | * Folds characters left to right into {@code seed} with {@code addChar} and splits at the first
79 | * boundary where {@code toBool} holds for the accumulated value (possibly before any character
80 | * or after the last one). {@code addChunk} is not used by this implementation.
81 | *
82 | * @return the (prefix, rest) pair, or {@code null} if {@code toBool} never holds
83 | */
84 | public Pair splitAfterRise(
85 | T seed,
86 | Function2 addChunk,
87 | Function2 addChar, Predicate toBool)
88 | {
89 | T t = seed;
90 | for(int i = 0; i < length(); ++i) {
91 | if(toBool.isTrueFor(t))
92 | return splitBefore(i);
93 | t = addChar.applyTo(t, this.charAt(i));
94 | }
95 | if(toBool.isTrueFor(t))
96 | return splitBefore(length());
97 | return null;
98 | }
99 |
100 | /**
101 | * Mirror image of {@link #splitAfterRise}: folds characters right to left and splits at the
102 | * first boundary (counting from the right end) where {@code toBool} holds. {@code addChunk} is
103 | * not used by this implementation.
104 | *
105 | * @return the (rest, suffix) pair, or {@code null} if {@code toBool} never holds
106 | */
107 | public Pair splitAfterBackRise(
108 | T seed,
109 | Function2 addChunk, Function2 addChar,
110 | Predicate toBool)
111 | {
112 | T t = seed;
113 | for(int i = length()-1; i >= 0; --i) {
114 | if(toBool.isTrueFor(t))
115 | return splitBefore(i+1);
116 | t = addChar.applyTo(t, this.charAt(i));
117 | }
118 | // The value accumulated over the whole text must be tested too (as splitAfterRise does after
119 | // its loop); otherwise a suffix that spans the entire string is never reported.
120 | if(toBool.isTrueFor(t))
121 | return splitBefore(0);
122 | return null;
123 | }
124 |
125 | public IndexedString append(IndexedString other) {
126 | return new LinearIS(cs.toString() + other.toString(), bidfa);
127 | }
128 |
129 | /** Composes the forward DFA's per-character transfer functions left to right; null for empty text. */
130 | private static TransferFunction transferForward(
131 | BiDFA bidfa, CharSequence cs)
132 | {
133 | DFA dfa = bidfa.getForward();
134 | Reducer> reducer = dfa.getTransferFunctionsReducer();
135 | TransferFunction res = null;
136 | for(int i = 0; i < cs.length(); ++i) {
137 | res = reducer.compose(res, dfa.transfer(cs.charAt(i)));
138 | }
139 | return res;
140 | }
141 |
142 | /** Composes the backward DFA's per-character transfer functions right to left; null for empty text. */
143 | private static TransferFunction transferBackward(
144 | BiDFA bidfa, CharSequence cs) {
145 | DFA dfa = bidfa.getBackward();
146 | Reducer> reducer = dfa.getTransferFunctionsReducer();
147 | TransferFunction res = null;
148 | for(int i = cs.length() - 1; i >= 0; --i) {
149 | res = reducer.compose(res, dfa.transfer(cs.charAt(i)));
150 | }
151 | return res;
152 | }
153 |
154 | /** Identity transfer function. Not referenced anywhere in this class. */
155 | private static TransferFunction identity() {
156 | return new TransferFunction() {
157 | public T next(T x) {
158 | return x;
159 | }
160 | };
161 | }
162 | }
163 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/Match.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire;
2 |
3 | /**
4 | * One match: which pattern matched, where the match starts and how long it is. Created on: 22.07.2010 23:25:29
5 | */
6 | public class Match {
7 | private int whichPattern; // DFAMatcher passes the bit index taken from the terminated-patterns set
8 | private int startPos; // DFAMatcher computes this as (length of text consumed so far) - length
9 | private int length;
10 |
11 | public Match(int whichPattern, int startPos, int length) {
12 | this.whichPattern = whichPattern;
13 | this.startPos = startPos;
14 | this.length = length;
15 | }
16 |
17 | public int whichPattern() {
18 | return whichPattern;
19 | }
20 |
21 | public int startPos() {
22 | return startPos;
23 | }
24 |
25 | public int length() {
26 | return length;
27 | }
28 |
29 | public String toString() { // renders as whichPattern@(startPos,length)
30 | return "" + whichPattern + "@("+startPos+","+length+")";
31 | }
32 | }
33 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/PatternSet.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire;
2 |
3 | /**
4 | * A set of patterns that can be applied to a string, yielding an {@link IndexedString}. Created on: 22.07.2010 23:24:31
5 | */
6 | public interface PatternSet {
7 | IndexedString match(String s); // DFARopePatternSet returns a RopeBasedIS over s
8 |
9 | IndexedString match(String s, int blockSize); // as above; DFARopePatternSet forwards blockSize to the RopeBasedIS constructor
10 | }
11 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/fa/BiDFA.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire.fa;
2 |
3 | /**
4 | * Holds a forward and a backward {@link DFA}; DFAMatcher scans with the first and locates match starts with the second. Created on: 25.07.2010 13:34:11
5 | */
6 | public class BiDFA {
7 | private DFA forward;
8 | private DFA backward; // RegexCompiler builds this one from the reversed patterns
9 |
10 | public BiDFA(DFA forward, DFA backward) {
11 | this.forward = forward;
12 | this.backward = backward;
13 | }
14 |
15 | public DFA getForward() {
16 | return forward;
17 | }
18 |
19 | public DFA getBackward() {
20 | return backward;
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/fa/DFA.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire.fa;
2 |
3 | import org.jkff.ire.util.Reducer;
4 |
5 | /**
6 | * Bundles a transfer table, an initial state and a reducer for composing transfer functions. Created on: 22.07.2010 23:54:27
7 | */
8 | public abstract class DFA {
9 | private TransferTable transfer;
10 | private S initialState;
11 | private Reducer> transferFunctionsReducer;
12 |
13 | public DFA(TransferTable transfer, S initialState,
14 | Reducer> transferFunctionsReducer)
15 | {
16 | this.transfer = transfer;
17 | this.initialState = initialState;
18 | this.transferFunctionsReducer = transferFunctionsReducer;
19 | }
20 |
21 | public S getInitialState() {
22 | return initialState;
23 | }
24 |
25 | public TransferFunction transfer(C token) { // looks the token up in the transfer table
26 | return transfer.forToken(token);
27 | }
28 |
29 | public Reducer> getTransferFunctionsReducer() {
30 | return transferFunctionsReducer;
31 | }
32 |
33 | public abstract S resetTerminatedPattern(S state, int pattern); // DFAMatcher calls this after recording a match of the given pattern
34 | }
35 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/fa/IntState.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire.fa;
2 |
3 | import org.jkff.ire.util.WrappedBitSet;
4 |
5 | /**
6 | * A state identified by an integer index that carries a bit set of terminated patterns. Created on: 31.07.2010 15:16:46
7 | */
8 | public class IntState implements State {
9 | private int index; // IntTable uses this to index its lookup table
10 | private WrappedBitSet terminatedPatterns; // returned as-is by the getter, not copied
11 |
12 | public IntState(int index, WrappedBitSet terminatedPatterns) {
13 | this.index = index;
14 | this.terminatedPatterns = terminatedPatterns;
15 | }
16 |
17 | public int getIndex() {
18 | return index;
19 | }
20 |
21 | public WrappedBitSet getTerminatedPatterns() {
22 | return terminatedPatterns;
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/fa/IntTable.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire.fa;
2 |
3 | import org.jkff.ire.util.Reducer;
4 |
5 | /**
6 | * Transfer function over {@link IntState}s stored as a lookup table: {@code table[i]} is the index
7 | * of the state that follows the state with index {@code i}.
8 | * Created on: 22.07.2010 23:49:39
9 | */
10 | public class IntTable implements TransferFunction {
11 | private IntState[] states;
12 | private int[] table;
13 |
14 | public IntTable(IntState[] states, int[] table) {
15 | this.states = states;
16 | this.table = table;
17 | }
18 |
19 | /** Composes IntTable-backed transfer functions; {@code null} stands for "nothing to apply". */
20 | public static Reducer> REDUCER = new Reducer>() {
21 | public TransferFunction compose(
22 | TransferFunction a, TransferFunction b)
23 | {
24 | if(a == null)
25 | return b;
26 | if(b == null)
27 | return a;
28 | return ((IntTable)a).followedBy((IntTable)b);
29 | }
30 |
31 | public TransferFunction composeAll(Sequence> ts) {
32 | // Fold from null (which compose treats as neutral) rather than from ts.get(0), so an
33 | // empty sequence yields null instead of asking the sequence for an element it lacks.
34 | TransferFunction res = null;
35 | for(int i = 0; i < ts.length(); ++i) {
36 | res = compose(res, ts.get(i));
37 | }
38 | return res;
39 | }
40 | };
41 |
42 | /** Returns the table that applies this function first and then {@code other}. */
43 | private IntTable followedBy(IntTable other) {
44 | int[] res = new int[table.length];
45 | for(int i = 0; i < res.length; ++i) {
46 | res[i] = other.table[this.table[i]];
47 | }
48 | return new IntTable(states, res);
49 | }
50 |
51 | public IntState next(IntState x) {
52 | return states[table[x.getIndex()]];
53 | }
54 | }
55 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/fa/MutableTransferFunction.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire.fa;
2 |
3 | /**
4 | * A transfer function that can be extended in place with another one. Created on: 09.09.2010 1:08:03
5 | */
6 | public interface MutableTransferFunction {
7 | void followInPlaceBy(TransferFunction other); // presumably mutates this to "this, then other" - TODO confirm, no implementation is visible here
8 | }
9 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/fa/PowerIntState.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire.fa;
2 |
3 | import org.jkff.ire.util.WrappedBitSet;
4 |
5 | /**
6 | * A subset of a fixed array of basis states, stored as a bit set of basis indices. Created on: 01.08.2010 13:20:58
7 | */
8 | public class PowerIntState implements State {
9 | private State[] basis;
10 | private WrappedBitSet subset; // bit i set means basis[i] belongs to this state
11 |
12 | public PowerIntState(State[] basis, WrappedBitSet subset) {
13 | this.basis = basis;
14 | this.subset = subset;
15 | }
16 |
17 | public State[] getBasis() {
18 | return basis;
19 | }
20 |
21 | public WrappedBitSet getSubset() {
22 | return subset;
23 | }
24 |
25 | public WrappedBitSet getTerminatedPatterns() { // union over the member basis states; null when the subset has no bits set
26 | WrappedBitSet res = null;
27 | for(int bit = subset.nextSetBit(0); bit >= 0; bit = subset.nextSetBit(bit+1)) {
28 | if(res == null)
29 | res = basis[bit].getTerminatedPatterns().makeCopy(); // start from a copy; later members are or-ed into it
30 | else
31 | res.or(basis[bit].getTerminatedPatterns());
32 | }
33 | return res;
34 | }
35 |
36 | public String toString() {
37 | return subset.toString();
38 | }
39 | }
40 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/fa/PowerIntTable.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire.fa;
2 |
3 | import org.jkff.ire.util.Reducer;
4 | import org.jkff.ire.util.WrappedBitSet;
5 |
6 | import java.util.Arrays;
7 |
8 | /**
9 | * Created on: 01.08.2010 13:23:02
10 | */
11 | public class PowerIntTable implements TransferFunction {
12 | private final int numStates;
13 | private final int blockSize;
14 | private final long[] words; // numStates blocks of ceil(numStates/64) longs
15 |
16 | public PowerIntTable(WrappedBitSet[] state2next) {
17 | this.numStates = state2next.length;
18 | this.blockSize = (63+numStates) / 64;
19 | this.words = new long[numStates * blockSize];
20 | for(int s = 0; s < numStates; ++s) {
21 | new WrappedBitSet(words, s*blockSize, blockSize, numStates).or(state2next[s]);
22 | }
23 | }
24 |
25 | private PowerIntTable(int numStates, long[] words) {
26 | this.numStates = numStates;
27 | this.blockSize = (63+numStates) / 64;
28 | this.words = words;
29 | }
30 |
31 | public static Reducer> REDUCER = new Reducer>() {
32 | public TransferFunction compose(
33 | TransferFunction a, TransferFunction b)
34 | {
35 | if(a == null)
36 | return b;
37 | if(b == null)
38 | return a;
39 | return ((PowerIntTable)a).followedBy((PowerIntTable) b);
40 | }
41 |
42 | public TransferFunction composeAll(Sequence> ts) {
43 | return PowerIntTable.composeAll(ts);
44 | }
45 | };
46 |
47 | public PowerIntTable followedBy(PowerIntTable other) {
48 | long[] words = new long[this.words.length];
49 | long[] theirWords = other.words;
50 | for(int state = 0; state < numStates; ++state) {
51 | int ourOffset = state * blockSize;
52 | int bit = WrappedBitSet.nextSetBit(this.words, ourOffset, blockSize, 0);
53 | while (bit >= 0) {
54 | for (int i = 0; i < blockSize; ++i) {
55 | words[ourOffset + i] |= theirWords[bit*blockSize + i];
56 | }
57 | bit = WrappedBitSet.nextSetBit(this.words, ourOffset, blockSize, bit + 1);
58 | }
59 | }
60 | return new PowerIntTable(numStates, words);
61 | }
62 |
63 | private static String toString(long[] ws) {
64 | StringBuilder sb = new StringBuilder();
65 | for(long w : ws) sb.append(w).append(" ");
66 | return sb.toString();
67 | }
68 |
69 | public PowerIntState next(PowerIntState st) {
70 | WrappedBitSet s = st.getSubset();
71 | WrappedBitSet res = new WrappedBitSet(s.numBits());
72 | for(int bit = s.nextSetBit(0); bit >= 0; bit = s.nextSetBit(bit+1)) {
73 | res.or(new WrappedBitSet(words, bit*blockSize, blockSize, numStates));
74 | }
75 | return new PowerIntState(st.getBasis(), res);
76 | }
77 |
78 | public static TransferFunction composeAll(Sequence> fs) {
79 | PowerIntTable first = (PowerIntTable) fs.get(0);
80 | int numWords = first.words.length;
81 | long[] curWords = Arrays.copyOf(first.words, numWords);
82 | long[] newWords = new long[numWords];
83 | int numStates = first.numStates;
84 | int blockSize = first.blockSize;
85 |
86 | for (int iF = 1; iF < fs.length(); iF++) {
87 | for(int j = 0; j < numWords; ++j) {
88 | newWords[j] = 0L;
89 | }
90 | long[] nextWords = ((PowerIntTable) fs.get(iF)).words;
91 | for (int state = 0; state < numStates; ++state) {
92 | int ourOffset = state * blockSize;
93 | int bit = WrappedBitSet.nextSetBit(curWords, ourOffset, blockSize, 0);
94 | while (bit >= 0) {
95 | for (int i = 0; i < blockSize; ++i) {
96 | newWords[ourOffset + i] |= nextWords[bit*blockSize + i];
97 | }
98 | bit = WrappedBitSet.nextSetBit(curWords, ourOffset, blockSize, bit + 1);
99 | }
100 | }
101 | long[] tmp = curWords;
102 | curWords = newWords;
103 | newWords = tmp;
104 | }
105 |
106 | return new PowerIntTable(numStates, curWords);
107 | }
108 |
109 | public String toString() {
110 | StringBuilder sb = new StringBuilder();
111 | for(int state = 0; state < numStates; ++state) {
112 | int offset = state * blockSize;
113 | sb.append(state).append(" -> ")
114 | .append(new WrappedBitSet(words, offset, blockSize, numStates).toString())
115 | .append("; ");
116 | }
117 | return sb.toString();
118 | }
119 | }
120 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/fa/Sequence.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire.fa;
2 |
3 | /**
4 | * Minimal read-only indexed sequence: a length plus element access by index. Created on: 09.09.2010 2:13:48
5 | */
6 | public interface Sequence {
7 | int length();
8 | T get(int i); // the reducers in this package read indices 0 .. length()-1
9 | }
10 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/fa/State.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire.fa;
2 |
3 | import org.jkff.ire.util.WrappedBitSet;
4 |
5 | /**
6 | * An automaton state that can report its set of terminated patterns. Created on: 31.07.2010 14:57:34
7 | */
8 | public interface State {
9 | WrappedBitSet getTerminatedPatterns(); // may be null: PowerIntState returns null for an empty subset
10 | }
11 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/fa/TransferFunction.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire.fa;
2 |
3 | /**
4 | * Maps a value to the value that follows it; the implementations in this package map automaton states. Created on: 22.07.2010 23:48:22
5 | */
6 | public interface TransferFunction {
7 | T next(T t); // returns the successor of t under this function
8 | }
9 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/fa/TransferTable.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire.fa;
2 |
3 | /**
4 | * Supplies the {@link TransferFunction} associated with each input token. Created on: 31.07.2010 15:18:23
5 | */
6 | public interface TransferTable {
7 | TransferFunction forToken(C token); // DFA.transfer delegates to this
8 | }
9 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/regex/Alternative.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire.regex;
2 |
3 | /**
4 | * Regex node holding two alternative sub-expressions, a and b. Created on: 01.09.2010 23:42:24
5 | */
6 | public class Alternative implements RxNode {
7 | public final RxNode a;
8 | public final RxNode b;
9 |
10 | public Alternative(RxNode a, RxNode b) { // RegexCompiler chains these to combine several labeled patterns
11 | this.a = a;
12 | this.b = b;
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/regex/CharacterClass.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire.regex;
2 |
3 | /**
4 | * Regex node that accepts or rejects single characters; instances are ANY_CHAR or come from oneOf. Created on: 01.09.2010 23:47:14
5 | */
6 | public abstract class CharacterClass implements RxNode {
7 | public abstract boolean acceptsChar(char c);
8 |
9 | public static CharacterClass ANY_CHAR = new CharacterClass() { // accepts every char. NOTE(review): not final, yet OneOf.intersects compares against it by identity
10 | @Override
11 | public boolean acceptsChar(char c) {
12 | return true;
13 | }
14 |
15 | @Override
16 | public boolean intersects(CharacterClass c) { // reports true for every argument
17 | return true;
18 | }
19 |
20 | public String toString() {
21 | return ".";
22 | }
23 | };
24 |
25 | public static CharacterClass oneOf(final String s) { // class that accepts exactly the characters contained in s
26 | return new OneOf(s);
27 | }
28 |
29 | public abstract boolean intersects(CharacterClass c);
30 |
31 | private static class OneOf extends CharacterClass {
32 | private String s; // the accepted characters
33 |
34 | public OneOf(String s) {
35 | this.s = s;
36 | }
37 |
38 | @Override
39 | public boolean acceptsChar(char c) {
40 | return s.indexOf(c) > -1;
41 | }
42 |
43 | @Override
44 | public boolean intersects(CharacterClass c) {
45 | if(c instanceof OneOf) { // two explicit sets intersect when they share at least one character
46 | OneOf other = (OneOf) c;
47 | for(int i = 0; i < s.length(); ++i) {
48 | char ch = s.charAt(i);
49 | if(other.s.indexOf(ch) != -1) {
50 | return true;
51 | }
52 | }
53 | return false;
54 | } else if(c == ANY_CHAR) { // identity comparison with the shared instance
55 | return true;
56 | } else { // any other CharacterClass subclass is rejected
57 | throw new UnsupportedOperationException();
58 | }
59 | }
60 |
61 | public String toString() {
62 | return "[" + s + "]";
63 | }
64 |
65 | public boolean equals(Object other) { // equal when the other object is a OneOf with an equal string
66 | if(other == this) return true;
67 | if(other == null) return false;
68 | if(!(other instanceof OneOf)) return false;
69 | return s.equals(((OneOf)other).s);
70 | }
71 |
72 | public int hashCode() {
73 | return s.hashCode();
74 | }
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/regex/Empty.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire.regex;
2 |
3 | /**
4 | * Regex node with no fields; RegexCompiler pairs it with OnceOrMore inside an Alternative to build its dotStar node. Created on: 01.09.2010 23:41:03
5 | */
6 | public class Empty implements RxNode {
7 | }
8 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/regex/Labeled.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire.regex;
2 |
3 | /**
4 | * Regex node that tags sub-expression a with a pattern id. Created on: 04.09.2010 12:14:35
5 | */
6 | public class Labeled implements RxNode {
7 | public final RxNode a;
8 | public final int patternId; // RegexCompiler.compileToDFA uses the pattern's index in its input list
9 |
10 | public Labeled(RxNode a, int patternId) {
11 | this.a = a;
12 | this.patternId = patternId;
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/regex/OnceOrMore.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire.regex;
2 |
3 | /**
4 | * Regex node wrapping a single sub-expression a; per its name, a repeated one or more times. Created on: 01.09.2010 23:41:53
5 | */
6 | public class OnceOrMore implements RxNode {
7 | public final RxNode a; // the repeated sub-expression
8 |
9 | public OnceOrMore(RxNode a) {
10 | this.a = a;
11 | }
12 | }
13 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/regex/RegexCompiler.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire.regex;
2 |
3 | import org.jkff.ire.DFARopePatternSet;
4 | import org.jkff.ire.PatternSet;
5 | import org.jkff.ire.fa.*;
6 | import org.jkff.ire.util.CoarsestPartition;
7 | import org.jkff.ire.util.WrappedBitSet;
8 | import org.jkff.ire.util.Pair;
9 |
10 | import java.util.*;
11 | import java.util.concurrent.atomic.AtomicInteger;
12 |
13 | import static org.jkff.ire.util.CollectionFactory.*;
14 |
15 | /**
16 | * Created on: 01.09.2010 23:43:07
17 | */
18 | public class RegexCompiler {
19 | public static PatternSet compile(List roots) { // builds the BiDFA for the parsed patterns and wraps it in a DFARopePatternSet
20 | return new DFARopePatternSet(compileToBiDFA(roots));
21 | }
22 |
23 | public static PatternSet compile(String... regexes) { // parses each regex with RegexParser, then delegates to compile(List)
24 | List roots = newArrayList();
25 | for(String regex : regexes) {
26 | roots.add(RegexParser.parse(regex)); // list order is kept; compileToDFA later uses the index as the pattern id
27 | }
28 | return compile(roots);
29 | }
30 |
31 | static BiDFA compileToBiDFA(List roots) { // forward DFA from "dotStar root dotStar", backward DFA from "reverse(root) dotStar"
32 | List rootsAnywhere = newArrayList();
33 | List reversedRoots = newArrayList();
34 | for(RxNode root : roots) {
35 | Alternative dotStar = new Alternative(new Empty(), new OnceOrMore(CharacterClass.ANY_CHAR)); // Empty, or one or more of any character
36 | rootsAnywhere.add(new Sequence(dotStar, new Sequence(root, dotStar))); // the same dotStar node is used on both sides of root
37 | reversedRoots.add(new Sequence(reverse(root), dotStar)); // reverse() is defined outside the visible part of this file
38 | }
39 | return new BiDFA(compileToDFA(rootsAnywhere), compileToDFA(reversedRoots));
40 | }
41 |
42 | static DFA compileToDFA(List rxNodes) {
43 | if(rxNodes.isEmpty()) {
44 | throw new IllegalArgumentException("Pattern list can't be empty");
45 | }
46 | List labeled = newArrayList();
47 | for(int i = 0; i < rxNodes.size(); ++i) {
48 | labeled.add(new Labeled(rxNodes.get(i), i));
49 | }
50 | RxNode alt = labeled.get(0);
51 | for(int i = 1; i < rxNodes.size(); ++i) {
52 | alt = new Alternative(alt, labeled.get(i));
53 | }
54 |
55 | return toDFA(reduceNFA(toNFA(alt)), rxNodes.size());
56 | }
57 |
58 | static DFA toDFA(NFA nfa, int numPatterns) {
59 | Pair, NFA.Node> opt = optimize(nfa);
60 |
61 | Set allNodes = opt.first;
62 | NFA.Node newInitial = opt.second;
63 |
64 | final int numStates = allNodes.size();
65 |
66 | final Map node2id = newLinkedHashMap();
67 | final NFA.Node[] id2node = allNodes.toArray(new NFA.Node[allNodes.size()]);
68 |
69 | for(int i = 0; i < id2node.length; ++i) {
70 | node2id.put(id2node[i], i);
71 | }
72 |
73 | final State[] basis = new State[numStates];
74 | for(int i = 0; i < numStates; ++i) {
75 | WrappedBitSet terminatedPatterns = new WrappedBitSet(numPatterns);
76 | for(int pat : id2node[i].patternIds) {
77 | terminatedPatterns.set(pat);
78 | }
79 | basis[i] = new IntState(i, terminatedPatterns);
80 | }
81 |
82 | TransferTable transfer = new TransferTable() {
83 | private TransferFunction[] transfer = new TransferFunction[Character.MAX_VALUE+1];
84 |
85 | public TransferFunction forToken(Character token) {
86 | char t = token;
87 | TransferFunction f = transfer[t];
88 | if(f == null) {
89 | transfer[t] = f = computeTransferFor(t);
90 | }
91 | return f;
92 | }
93 |
94 | private TransferFunction computeTransferFor(char token) {
95 | WrappedBitSet[] state2next = new WrappedBitSet[numStates];
96 | for(int i = 0; i < numStates; ++i) {
97 | WrappedBitSet res = new WrappedBitSet(numStates);
98 | NFA.Node node = id2node[i];
99 | for (Pair out : node.out) {
100 | if(out.first.acceptsChar(token)) {
101 | res.set(node2id.get(out.second));
102 | }
103 | }
104 | state2next[i] = res;
105 | }
106 | return new PowerIntTable(state2next);
107 | }
108 | };
109 |
110 | final WrappedBitSet justInitial = new WrappedBitSet(numStates);
111 | justInitial.set(node2id.get(newInitial));
112 | PowerIntState initial = new PowerIntState(basis, justInitial);
113 |
114 | // StringBuilder dot = new StringBuilder();
115 | // dot.append("digraph g {\n");
116 | // for(int i = 0; i < numStates; ++i) {
117 | // WrappedBitSet justThis = new WrappedBitSet(numStates);
118 | // justThis.set(i);
119 | // PowerIntState state = new PowerIntState(basis, justThis);
120 | // dot.append(i + " [shape=" + (state.getTerminatedPatterns().isEmpty() ? "circle" : "square") + "]\n");
121 | // }
122 | // for(int i = 0; i < numStates; ++i) {
123 | // WrappedBitSet justThis = new WrappedBitSet(numStates);
124 | // justThis.set(i);
125 | // PowerIntState state = new PowerIntState(basis, justThis);
126 | // PowerIntState nextState = transfer.forToken('t').next(state);
127 | // WrappedBitSet next = nextState.getSubset();
128 | // for(int bit = next.nextSetBit(0); bit != -1; bit = next.nextSetBit(bit+1)) {
129 | // dot.append(i + " -> " + bit + "\n");
130 | // }
131 | // }
132 | // dot.append("}\n");
133 | // System.out.println(dot);
134 |
135 | return new DFA(transfer, initial, PowerIntTable.REDUCER) {
136 | @Override
137 | public PowerIntState resetTerminatedPattern(PowerIntState state, int pattern) {
138 | WrappedBitSet reset = new WrappedBitSet(basis.length);
139 | reset.or(state.getSubset());
140 | for(int substate = reset.nextSetBit(0); substate != -1; substate = reset.nextSetBit(substate + 1)) {
141 | if(basis[substate].getTerminatedPatterns().get(pattern)) {
142 | reset.clear(substate);
143 | }
144 | }
145 | reset.or(justInitial);
146 | return new PowerIntState(basis, reset);
147 | }
148 | };
149 | }
150 |
151 | private static Pair, NFA.Node> optimize(NFA nfa) {
152 | Pair, NFA.Node> eClosure = computeEClosure(nfa);
153 | Pair, NFA.Node> groupedLeft = groupEquivalentStates(eClosure, true);
154 | Pair, NFA.Node> groupedRight = groupEquivalentStates(groupedLeft, false);
155 |
156 | return groupedRight;
157 | }
158 |
159 | private static Pair, NFA.Node> groupEquivalentStates(
160 | Pair, NFA.Node> nfa, boolean leftNotRight)
161 | {
162 | // See paper "On NFA reductions".
163 | Set nodes = nfa.first;
164 | NFA.Node initial = nfa.second;
165 |
166 | // Nodes terminating different patterns are different.
167 | Map, Integer> patIds2block = newLinkedHashMap();
168 | Map node2block = newLinkedHashMap();
169 | for(NFA.Node node : nodes) {
170 | Integer block = patIds2block.get(node.patternIds);
171 | if(block == null) {
172 | patIds2block.put(node.patternIds, block = patIds2block.size());
173 | }
174 | node2block.put(node, block);
175 | }
176 |
177 | NFA.Node[] id2node = nodes.toArray(new NFA.Node[nodes.size()]);
178 | Map node2id = newLinkedHashMap();
179 | for(int i = 0; i < id2node.length; ++i) {
180 | node2id.put(id2node[i], i);
181 | }
182 |
183 | int[] p = new int[id2node.length];
184 | for(int i = 0; i < id2node.length; ++i) {
185 | p[i] = node2block.get(id2node[i]);
186 | }
187 |
188 | // Instead of iterating over the whole unicode alphabet,
189 | // let us iterate over the distinct labels of the automaton.
190 | Set alphabet = newLinkedHashSet();
191 | for(NFA.Node node : nodes) {
192 | for (Pair out : node.out) {
193 | alphabet.add(out.first);
194 | }
195 | }
196 |
197 | boolean anythingChanged;
198 | do {
199 | anythingChanged = false;
200 | for(CharacterClass c : alphabet) {
201 | List edges = newArrayList();
202 | for(int i = 0; i < id2node.length; ++i) {
203 | NFA.Node node = id2node[i];
204 | for (Pair out : node.out) {
205 | // When splitting by a particular label, say, [agc],
206 | // we should take into account all edges that might
207 | // be triggered by any of the characters accepted
208 | // by this label. For example, a "." edge should be used.
209 | if(out.first.intersects(c)) {
210 | int j = node2id.get(out.second);
211 | edges.add(leftNotRight ? new int[] {i,j} : new int[] {j, i});
212 | }
213 | }
214 | }
215 | int[] newP = CoarsestPartition.coarsestStablePartition(p, edges.toArray(new int[edges.size()][]));
216 | if(!Arrays.equals(p, newP)) {
217 | anythingChanged = true;
218 | p = newP;
219 | }
220 | }
221 | } while(anythingChanged);
222 |
223 | // Group nodes of the nfa according to 'p'.
224 | Map block2newNode = newLinkedHashMap();
225 | Map> block2oldNodeIds = newLinkedHashMap();
226 | for (int i = 0; i < p.length; i++) {
227 | int b = p[i];
228 | if (!block2newNode.containsKey(b)) {
229 | NFA.Node newNode = new NFA.Node();
230 | block2newNode.put(b, newNode);
231 | block2oldNodeIds.put(b, new ArrayList());
232 | }
233 | block2oldNodeIds.get(b).add(i);
234 | }
235 | for(int b : block2newNode.keySet()) {
236 | NFA.Node newNode = block2newNode.get(b);
237 | for(int oldNodeId : block2oldNodeIds.get(b)) {
238 | NFA.Node oldNode = id2node[oldNodeId];
239 | newNode.patternIds.addAll(oldNode.patternIds);
240 | for (Pair out : oldNode.out) {
241 | CharacterClass cc = out.first;
242 | NFA.Node dest = out.second;
243 | NFA.Node newDest = block2newNode.get(p[node2id.get(dest)]);
244 | Pair edge = Pair.of(cc, newDest);
245 | if(!newNode.out.contains(edge))
246 | newNode.out.add(edge);
247 | }
248 | }
249 | }
250 | NFA.Node newInitial = block2newNode.get(p[node2id.get(initial)]);
251 | Set newNodes = new HashSet(block2newNode.values());
252 | return Pair.of(newNodes, newInitial);
253 | }
254 |
255 | private static Pair, NFA.Node> computeEClosure(NFA nfa) {
256 | final Map> node2closure = newLinkedHashMap();
257 |
258 | Set allOldNodes = dfs(nfa.begin, true);
259 | for(NFA.Node node : allOldNodes) {
260 | node2closure.put(node, dfs(node, false));
261 | }
262 |
263 | final Map> newNode2contents = newLinkedHashMap();
264 | final Map, NFA.Node> contents2newNode = newLinkedHashMap();
265 | Set newNodesToVisit = newLinkedHashSet();
266 | Set initialEC = node2closure.get(nfa.begin);
267 | NFA.Node newInitial = new NFA.Node();
268 | for(NFA.Node subNode : initialEC) {
269 | newInitial.patternIds.addAll(subNode.patternIds);
270 | }
271 | newNodesToVisit.add(newInitial);
272 | newNode2contents.put(newInitial, initialEC);
273 | contents2newNode.put(initialEC, newInitial);
274 | while(!newNodesToVisit.isEmpty()) {
275 | NFA.Node newNode = newNodesToVisit.iterator().next();
276 | newNodesToVisit.remove(newNode);
277 | Map> class2dest = newLinkedHashMap();
278 | for(NFA.Node subNode : newNode2contents.get(newNode)) {
279 | for(Pair out : subNode.out) {
280 | if(out.first == null) {
281 | // Skip epsilon transitions: we're operating on epsilon closures
282 | continue;
283 | }
284 | Set dest = class2dest.get(out.first);
285 | if(dest == null) {
286 | class2dest.put(out.first, dest = newLinkedHashSet());
287 | }
288 | dest.addAll(node2closure.get(out.second));
289 | }
290 | }
291 | for(CharacterClass cc : class2dest.keySet()) {
292 | Set dest = class2dest.get(cc);
293 | NFA.Node newDest = contents2newNode.get(dest);
294 | if(newDest == null) {
295 | newDest = new NFA.Node();
296 | for(NFA.Node subNode : dest) {
297 | newDest.patternIds.addAll(subNode.patternIds);
298 | }
299 | newNode2contents.put(newDest, dest);
300 | contents2newNode.put(dest, newDest);
301 | newNodesToVisit.add(newDest);
302 | }
303 | newNode.transition(cc, newDest);
304 | }
305 | }
306 |
307 | return Pair.of(newNode2contents.keySet(), newInitial);
308 | }
309 |
310 | static Set dfs(NFA.Node origin, boolean acceptNonEps) {
311 | Set res = newLinkedHashSet();
312 | Stack toVisit = new Stack();
313 | toVisit.add(origin);
314 | while(!toVisit.isEmpty()) {
315 | NFA.Node node = toVisit.pop();
316 | if(!res.add(node))
317 | continue;
318 | for (Pair out : node.out) {
319 | if(out.first == null || acceptNonEps) {
320 | toVisit.push(out.second);
321 | }
322 | }
323 | }
324 | return res;
325 | }
326 |
327 | static NFA reduceNFA(NFA nfa) {
328 | //
329 | return nfa;
330 | // TODO
331 | }
332 |
333 | static NFA toNFA(RxNode rxNode) {
334 | if(rxNode instanceof Alternative) {
335 | Alternative x = (Alternative) rxNode;
336 | NFA res = new NFA(new NFA.Node(), new NFA.Node());
337 | NFA a = toNFA(x.a), b = toNFA(x.b);
338 | res.begin.transition(null, a.begin);
339 | res.begin.transition(null, b.begin);
340 | a.end.transition(null, res.end);
341 | b.end.transition(null, res.end);
342 | return res;
343 | } else if(rxNode instanceof CharacterClass) {
344 | NFA res = new NFA(new NFA.Node(), new NFA.Node());
345 | res.begin.transition((CharacterClass) rxNode, res.end);
346 | return res;
347 | } else if(rxNode instanceof Empty) {
348 | NFA res = new NFA(new NFA.Node(), new NFA.Node());
349 | res.begin.transition(null, res.end);
350 | return res;
351 | } else if(rxNode instanceof OnceOrMore) {
352 | OnceOrMore x = (OnceOrMore) rxNode;
353 | NFA res = toNFA(x.a);
354 | res.end.transition(null, res.begin);
355 | return res;
356 | } else if(rxNode instanceof Sequence) {
357 | Sequence x = (Sequence) rxNode;
358 | NFA a = toNFA(x.a), b = toNFA(x.b);
359 | NFA res = new NFA(a.begin, b.end);
360 | a.end.transition(null, b.begin);
361 | return res;
362 | } else if(rxNode instanceof Labeled) {
363 | Labeled x = (Labeled) rxNode;
364 | NFA a = toNFA(x.a);
365 | a.end.patternIds.add(x.patternId);
366 | return a;
367 | } else {
368 | throw new UnsupportedOperationException("Unsupported node type " + rxNode.getClass());
369 | }
370 | }
371 |
372 | static class NFA {
373 | final Node begin, end;
374 |
375 | public NFA(Node begin, Node end) {
376 | this.begin = begin;
377 | this.end = end;
378 | }
379 |
380 | static class Node {
381 | static AtomicInteger nextId = new AtomicInteger(0);
382 |
383 | final List> out = newArrayList();
384 | final Set patternIds = newLinkedHashSet();
385 | final int id = nextId.incrementAndGet();
386 |
387 | void transition(CharacterClass cc, Node dest) {
388 | out.add(Pair.of(cc, dest));
389 | }
390 |
391 | public String toString() {
392 | return ""+id;
393 | }
394 |
395 | public boolean equals(Object o) {
396 | return id == ((Node) o).id;
397 | }
398 | public int hashCode() {
399 | return id;
400 | }
401 | }
402 | }
403 |
404 | private static RxNode reverse(RxNode rxNode) {
405 | if(rxNode instanceof Alternative) {
406 | Alternative x = (Alternative) rxNode;
407 | return new Alternative(reverse(x.a), reverse(x.b));
408 | } else if(rxNode instanceof CharacterClass) {
409 | return rxNode;
410 | } else if(rxNode instanceof Empty) {
411 | return rxNode;
412 | } else if(rxNode instanceof OnceOrMore) {
413 | OnceOrMore x = (OnceOrMore) rxNode;
414 | return new OnceOrMore(reverse(x.a));
415 | } else if(rxNode instanceof Sequence) {
416 | Sequence x = (Sequence) rxNode;
417 | // The only interesting case.
418 | return new Sequence(reverse(x.b), reverse(x.a));
419 | } else if(rxNode instanceof Labeled) {
420 | Labeled x = (Labeled) rxNode;
421 | return new Labeled(reverse(x.a), x.patternId);
422 | } else {
423 | throw new UnsupportedOperationException("Unsupported node type " + rxNode.getClass());
424 | }
425 | }
426 | }
427 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/regex/RegexParser.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire.regex;
2 |
3 | /**
4 | * Created on: 04.09.2010 13:12:26
5 | */
6 | public class RegexParser {
7 | public static RxNode parse(String regex) {
8 | return parseAlt(new Tokenizer(regex));
9 | }
10 |
11 | private static boolean expect(Tokenizer t, char c) {
12 | Character p = t.peek();
13 | return p != null && p.charValue() == c;
14 | }
15 |
16 | private static RxNode parseAlt(Tokenizer t) {
17 | RxNode a = parseSequence(t);
18 | if (!expect(t, '|')) {
19 | return a;
20 | }
21 | t.next();
22 | RxNode b = parseAlt(t);
23 | return new Alternative(a, b);
24 | }
25 |
26 | private static RxNode parseSequence(Tokenizer t) {
27 | if(expect(t, '|') || expect(t, ')')) {
28 | return new Empty();
29 | }
30 | RxNode a = parseUnary(t);
31 | if(expect(t, '|') || expect(t, ')') || t.peek() == null) {
32 | return a;
33 | }
34 | RxNode b = parseSequence(t);
35 | return new Sequence(a, b);
36 | }
37 |
38 | private static RxNode parseUnary(Tokenizer t) {
39 | RxNode a = parseAtom(t);
40 | while(true) {
41 | if(expect(t, '+')) {
42 | t.next();
43 | a = new OnceOrMore(a);
44 | } else if(expect(t, '?')) {
45 | t.next();
46 | a = new Alternative(new Empty(), a);
47 | } else if(expect(t, '*')) {
48 | t.next();
49 | a = new Alternative(new Empty(), new OnceOrMore(a));
50 | } else {
51 | return a;
52 | }
53 | }
54 | }
55 |
56 | private static RxNode parseAtom(Tokenizer t) {
57 | if(expect(t, '(')) {
58 | t.next();
59 | return parseParen(t);
60 | } else if(expect(t, '[')) {
61 | t.next();
62 | return parseCharacterRange(t);
63 | } else if(expect(t, '.')) {
64 | t.next();
65 | return CharacterClass.ANY_CHAR;
66 | } else {
67 | return CharacterClass.oneOf(""+parseChar(t));
68 | }
69 | }
70 |
71 | private static RxNode parseParen(Tokenizer t) {
72 | RxNode a = parseAlt(t);
73 | if(!expect(t, ')')) {
74 | throw new IllegalArgumentException("Expected ')', got " + t.peek());
75 | }
76 | t.next();
77 | return a;
78 | }
79 |
80 | private static RxNode parseCharacterRange(Tokenizer t) {
81 | StringBuilder s = new StringBuilder();
82 | Character last = null;
83 | while(!expect(t, ']')) {
84 | Character c = parseChar(t);
85 | if(c != null && c.charValue() == '-' && last != null) {
86 | c = parseChar(t);
87 | for(char i = last; i <= c; ++i) {
88 | s.append(i);
89 | }
90 | } else {
91 | s.append(c);
92 | }
93 | last = c;
94 | }
95 | t.next();
96 | return CharacterClass.oneOf(s.toString());
97 | }
98 |
99 | private static Character parseChar(Tokenizer t) {
100 | if(expect(t, '\\')) {
101 | t.next();
102 | }
103 | return t.next();
104 | }
105 |
106 | private static class Tokenizer {
107 | private String s;
108 | private int pos;
109 |
110 | public Tokenizer(String s) {
111 | this.s = s;
112 | }
113 |
114 | public Character peek() {
115 | return (pos < s.length()) ? s.charAt(pos) : null;
116 | }
117 |
118 | public Character next() {
119 | return (pos < s.length()) ? s.charAt(pos++) : null;
120 | }
121 | }
122 | }
123 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/regex/RxNode.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire.regex;
2 |
/**
 * Marker interface implemented by the nodes of a parsed regular expression tree.
 * It declares no members.
 *
 * Created on: 01.09.2010 21:59:05
 */
public interface RxNode {
}
8 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/regex/Sequence.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire.regex;
2 |
3 | /**
4 | * Created on: 01.09.2010 21:59:15
5 | */
6 | public class Sequence implements RxNode {
7 | public final RxNode a;
8 | public final RxNode b;
9 |
10 | public Sequence(RxNode a, RxNode b) {
11 | this.a = a;
12 | this.b = b;
13 | }
14 | }
15 |
--------------------------------------------------------------------------------
/src/main/java/org/jkff/ire/rope/Rope.java:
--------------------------------------------------------------------------------
1 | package org.jkff.ire.rope;
2 |
3 | import org.jkff.ire.fa.Sequence;
4 | import org.jetbrains.annotations.NotNull;
5 | import org.jkff.ire.util.Function2;
6 | import org.jkff.ire.util.Pair;
7 | import org.jkff.ire.util.Predicate;
8 | import org.jkff.ire.util.Reducer;
9 |
10 | /**
11 | * Created on: 21.08.2010 17:46:38
12 | */
13 | public class Rope {
14 | @NotNull
15 | private RopeFactory factory;
16 |
17 | private final M sum;
18 |
19 | private final Rope a, b, c;
20 | private final int h;
21 | private final int length;
22 |
23 | private final String block;
24 |
/** Creates a fork with two children and a precomputed sum; the factory comes from {@code a}. */
private Rope(Rope<M> a, Rope<M> b, M sum) {
    this(a, b, null, a.factory, null, sum);
}

/** Creates a fork with three children and a precomputed sum; the factory comes from {@code a}. */
private Rope(Rope<M> a, Rope<M> b, Rope<M> c, M sum) {
    this(a, b, c, a.factory, null, sum);
}

/** Creates a block (leaf) node with a precomputed sum. */
private Rope(RopeFactory<M> factory, String block, M sum) {
    this(null, null, null, factory, block, sum);
}

/** Creates a block (leaf) node, computing its sum from the text via {@code sumString}. */
private Rope(RopeFactory<M> factory, String block) {
    this(null, null, null, factory, block, sumString(factory, block));
}
40 |
41 | private static M sumString(RopeFactory factory, final String block) {
42 | return factory.mapReduce(new Sequence() {
43 | public int length() {
44 | return block.length();
45 | }
46 |
47 | public Character get(int i) {
48 | return block.charAt(i);
49 | }
50 | });
51 | }
52 |
53 | private Rope(Rope a, Rope b, Rope c, RopeFactory factory, String block, M sum) {
54 | if (block != null) {
55 | assert a == null : "Block can't have a child: 'a'";
56 | assert b == null : "Block can't have a child: 'b'";
57 | assert c == null : "Block can't have a child: 'c'";
58 | this.a = this.b = this.c = null;
59 | this.h = 0;
60 | this.length = block.length();
61 | this.block = block;
62 | this.factory = factory;
63 | this.sum = sum;
64 | } else {
65 | assert a != null : "Fork must have a child: 'a'";
66 | assert b != null : "Fork must have a child: 'b'";
67 | assert a.h == b.h : "Fork's 'a' and 'b' children must have same height";
68 | this.a = a;
69 | this.b = b;
70 | this.block = null;
71 | this.factory = factory;
72 | this.sum = sum;
73 | if (c == null) {
74 | this.c = null;
75 | this.h = a.h + 1;
76 | this.length = a.length + b.length;
77 | } else {
78 | assert a.h == c.h : "Fork's 'a' and 'c' children must have same height";
79 | this.c = c;
80 | this.h = a.h + 1;
81 | this.length = a.length + b.length + c.length;
82 | }
83 | }
84 | }
85 |
86 | public int length() {
87 | return length;
88 | }
89 |
90 | public char charAt(int index) {
91 | if (block != null)
92 | return block.charAt(index);
93 | if (index < a.length())
94 | return a.charAt(index);
95 | if (index < a.length() + b.length())
96 | return b.charAt(index - a.length());
97 | return c.charAt(index - a.length() - b.length());
98 | }
99 |
100 | public M getSum() {
101 | return sum;
102 | }
103 |
104 | public Rope append(Rope other) {
105 | return append(this, other);
106 | }
107 |
108 | private static Rope append(Rope left, Rope right) {
109 | int blockSize = left.factory.getBlockSize();
110 | Reducer reducer = left.factory.getReducer();
111 |
112 | M sum = reducer.compose(left.sum, right.sum);
113 |
114 | if (left.h == right.h) {
115 | if (left.h > 0)
116 | return new Rope(left, right, sum);
117 | if (!left.isUnderflownBlock() && !right.isUnderflownBlock())
118 | return new Rope(left, right, sum);
119 | String bigBlock = left.block + right.block;
120 | if (bigBlock.length() <= 2 * blockSize - 1)
121 | return new Rope(left.factory, bigBlock, sum);
122 | return new Rope(
123 | new Rope(left.factory, bigBlock.substring(0, blockSize)),
124 | new Rope(left.factory, bigBlock.substring(blockSize, bigBlock.length())),
125 | sum);
126 | } else if (left.h == right.h + 1) {
127 | if (left.c == null)
128 | return new Rope(left.a, left.b, right, sum);
129 | else
130 | return new Rope(
131 | // Optimization opportunity: remember a+b and b+c sums in 3-child nodes
132 | new Rope(left.a, left.b, reducer.compose(left.a.sum, left.b.sum)),
133 | new Rope(left.c, right, reducer.compose(left.c.sum, right.sum)),
134 | sum);
135 | } else if (right.h == left.h + 1) {
136 | if (right.c == null)
137 | return new Rope(left, right.a, right.b, sum);
138 | else
139 | return new Rope(
140 | new Rope(left, right.a, reducer.compose(left.sum, right.a.sum)),
141 | // Optimization opportunity: remember a+b and b+c sums in 3-child nodes
142 | new Rope(right.b, right.c, reducer.compose(right.b.sum, right.c.sum)),
143 | sum);
144 | } else if (left.h > right.h + 1) {
145 | if (left.c == null)
146 | // This would not be well-founded recursion, if not for the two previous cases
147 | // left.b.append(right) may be at most left.a.h+1 high and this will be handled by them.
148 | return left.a.append(left.b.append(right));
149 | else
150 | // etc.
151 | return (left.a.append(left.b)).append(left.c.append(right));
152 | } else { // right.h > left.h + 1
153 | if (right.c == null)
154 | return left.append(right.a).append(right.b);
155 | else
156 | return (left.append(right.a)).append(right.b.append(right.c));
157 | }
158 | }
159 |
160 | public Pair, Rope> splitAfterRise(
161 | S seed,
162 | Function2, S> addChunk, Function2 addChar,
163 | Predicate toBool) {
164 | if (block != null) {
165 | S s = seed;
166 | for (int i = 0; i < block.length(); ++i) {
167 | if (toBool.isTrueFor(s))
168 | return Pair.of(
169 | new Rope(this.factory, block.substring(0, i)),
170 | new Rope(this.factory, block.substring(i, block.length())));
171 | s = addChar.applyTo(s, block.charAt(i));
172 | }
173 | if (toBool.isTrueFor(s))
174 | return Pair.of(this, new Rope(this.factory, ""));
175 | return null;
176 | } else {
177 | if (toBool.isTrueFor(seed))
178 | return Pair.of(new Rope(this.factory, ""), this);
179 | S afterA = addChunk.applyTo(seed, a);
180 | if (toBool.isTrueFor(afterA)) {
181 | Pair, Rope> sa = a.splitAfterRise(seed, addChunk, addChar, toBool);
182 | if(sa == null) {
183 | System.out.println("Oops");
184 | }
185 | return (c == null)
186 | ? Pair.of(sa.first, sa.second.append(b))
187 | : Pair.of(sa.first, sa.second.append(b).append(c));
188 | }
189 | S afterB = addChunk.applyTo(afterA, b);
190 | if (toBool.isTrueFor(afterB)) {
191 | Pair