├── .gitignore
├── History.md
├── LICENSE
├── README.md
├── pack.pl
├── prolog
    ├── regex.pl
    └── regex
    │   ├── engine
    │       └── pp.pl
    │   ├── parser.pl
    │   └── state.pl
└── t
    ├── captures.pl
    ├── examples.pl
    ├── options.pl
    ├── perl_classes.pl
    └── synopsis.pl


/.gitignore:
--------------------------------------------------------------------------------
1 | *~
2 | *.swp
3 | doc/
4 | dev.pl
5 | 


--------------------------------------------------------------------------------
/History.md:
--------------------------------------------------------------------------------
 1 | # v0.3.3 (2017-07-06)
 2 | 
 3 |   * Fix patterns stored in variables
 4 | 
 5 | # v0.3.2 (2017-01-06)
 6 | 
 7 |   * Fix order of captures in certain circumstances
 8 | 
 9 | # v0.3.1 (2015-10-16)
10 | 
11 |   * Fix packaging error
12 | 
13 | # v0.3.0 (2015-10-16)
14 | 
15 |   * Add support for leading `^` syntax
16 |   * Tidy up pack layout
17 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | This is free and unencumbered software released into the public domain.
 2 | 
 3 | Anyone is free to copy, modify, publish, use, compile, sell, or
 4 | distribute this software, either in source code form or as a compiled
 5 | binary, for any purpose, commercial or non-commercial, and by any
 6 | means.
 7 | 
 8 | In jurisdictions that recognize copyright laws, the author or authors
 9 | of this software dedicate any and all copyright interest in the
10 | software to the public domain. We make this dedication for the benefit
11 | of the public at large and to the detriment of our heirs and
12 | successors. We intend this dedication to be an overt act of
13 | relinquishment in perpetuity of all present and future rights to this
14 | software under copyright law.
15 | 
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 | OTHER DEALINGS IN THE SOFTWARE.
23 | 
24 | For more information, please refer to <http://unlicense.org>
25 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Synopsis
 2 | 
 3 |     :- use_module(library(regex)).
 4 |     ?- '99 Bottles of Beer' =~ '[0-9]+ bottles'/i.
 5 |     true.
 6 | 
 7 | # Description
 8 | 
 9 | Regular expression support for Prolog.
10 | 
11 | When Prologers want to match a string against a pattern, they typically write a
12 | DCG.  DCGs are powerful and flexible.  For medium to large patterns, they are
13 | also easier to read and maintain.  However, for small and local patterns the
14 | overhead of writing and naming auxiliary predicates can be too much.  In those
15 | circumstances, one might prefer a regular expression.  This pack makes it
16 | possible.
17 | 
18 | The `=~` operator matches a string (on the left side) against a regular
19 | expression (on the right side).  Either side can be an atom or a list of codes.
20 | The `\~` operator succeeds if the string does _not_ match the pattern.
21 | 
22 | # Syntax Reference
23 | 
24 | This section lists the regular expression syntax accepted by library(regex).
25 | Syntax not listed here is not yet supported.  Patches welcome.
26 | 
27 | ## Single characters
28 | 
 29 |   * `.` - any character except newline (newline too when the `s` flag is set)
30 |   * `[xyz]` - character class
31 |   * `[^xyz]` - negated character class
32 |   * `\d` - Perl character class
33 |   * `\D` - negated Perl character class
34 | 
35 | ## Composites
36 | 
37 |   * `xy` - `x` followed by `y`
38 |   * `x|y` - `x` or `y` (prefer x)
39 | 
40 | ## Repetitions
41 | 
42 |   * `x*` - zero or more `x`, prefer more
43 |   * `x+` - one or more `x`, prefer more
44 |   * `x?` - zero or one `x`, prefer one
45 |   * `x{n,m}` - `n` or `n+1` or ... or `m` `x`, prefer more
46 |   * `x{n,}` - `n` or more `x`, prefer more
47 |   * `x{n}` - exactly `n` `x`
48 | 
49 | ## Grouping
50 | 
51 |   * `(re)` - numbered capturing group
52 |   * `(?<name>re)` - named & numbered capturing group
53 | 
54 | ## Flags
55 | 
56 |   * `i` - case-insensitive (default false)
57 |   * `s` - let `.` match `\n` (default false)
58 | 
59 | ## Empty strings
60 | 
61 |   * `^` - at start of text
62 |   * `$` - at end of text
63 | 
64 | ## Character class elements
65 | 
66 |   * `x` - single character
67 |   * `A-Z` - character range (inclusive)
68 | 
69 | ## Perl character classes
70 | 
71 |   * `\d` - digits (same as `[0-9]`)
72 |   * `\D` - not digits (same as `[^0-9]`)
73 |   * `\s` - whitespace (same as `[\t\n\f\r ]`)
74 |   * `\S` - not whitespace (same as `[^\t\n\f\r ]`)
75 |   * `\w` - word characters (same as `[0-9A-Za-z_]`)
76 |   * `\W` - not word characters (same as `[^0-9A-Za-z_]`)
77 | 
78 | # Acknowledgements
79 | 
80 | Rob Cameron for his
81 | [lecture notes](http://www.cs.sfu.ca/~cameron/Teaching/384/99-3/regexp-plg.html)
82 | on which the original implementation was based.
83 | 
84 | # Installation
85 | 
86 | Using SWI-Prolog 6.3 or later:
87 | 
88 |     ?- pack_install(regex).
89 | 
90 | This module uses [semantic versioning](http://semver.org/).
91 | 
92 | Source code available and pull requests accepted at
93 | http://github.com/mndrix/regex
94 | 


--------------------------------------------------------------------------------
/pack.pl:
--------------------------------------------------------------------------------
 1 | name(regex).
 2 | title('Regular expressions').
 3 | 
 4 | version('0.3.3').
 5 | download('https://github.com/mndrix/regex/archive/v0.3.3.zip').
 6 | 
 7 | author('Michael Hendricks','michael@ndrix.org').
 8 | packager('Michael Hendricks','michael@ndrix.org').
 9 | maintainer('Michael Hendricks','michael@ndrix.org').
10 | home('https://github.com/mndrix/regex').
11 | 


--------------------------------------------------------------------------------
/prolog/regex.pl:
--------------------------------------------------------------------------------
  1 | :- module(regex, [ (=~)/2
  2 |                  , (\~)/2
  3 |                  , op(700,xfx,=~)
  4 |                  , op(700,xfx,\~)
  5 |                  , regex/4
  6 |                  ]).
  7 | :- use_module(library(error), [domain_error/2]).
  8 | :- use_module(library(regex/state), [new_state/3, numbered_captures/2]).
  9 | :- use_module(library(regex/parser), [re//2]).
 10 | :- use_module(library(regex/engine/pp), [engine_match/5]).
 11 | 
 12 | 
 13 | % operators for matching strings against regular expressions.
 14 | % the syntax is the same used by Perl and Haskell, but Prolog
 15 | % doesn't like '!' in operators so I had to use '\' instead.
 16 | :- op(700,xfx,=~).
 17 | :- op(700,xfx,\~).
 18 | 
 19 | 
 20 | %%  =~(+Text, +Pattern) is semidet.
 21 | %
 22 | %   Succeeds when the regular expression Pattern matches Text; only
 23 | %   the first match counts.  Options may be given as Pattern/Options.
 24 | %
 25 | %   When the goal is expanded at load time, a named capture is bound
 26 | %   to the in-scope variable that has the same name.  For example,
 27 | %
 28 | %       "Hi John" =~ "hi (?<Name>[a-z]+)"/i,
 29 | %       Name == "John".
 30 | Text =~ Pattern :-
 31 |     expand_equalstilde(Text =~ Pattern, _NoScope, RegexGoal),
 32 |     RegexGoal.
 33 | 
 34 | % Pattern/Options supplies explicit options; a bare (or still unbound)
 35 | % Pattern is passed through unchanged with an empty option list.
 36 | expand_equalstilde(Text =~ Pattern, Vars, regex(Re,Opts,Text,Vars)) :-
 37 |     nonvar(Pattern),
 38 |     Pattern = Re/Opts,
 39 |     !.
 40 | expand_equalstilde(Text =~ Pattern, Vars, regex(Pattern,[],Text,Vars)).
 41 | 
 42 | % load-time rewrite of `Text =~ Pattern` into regex/4 with the clause's named variables.
 43 | user:goal_expansion(Text =~ Pattern, Goal) :-
 44 |     % expand only in modules, other than this one, that import =~ from regex
 45 |     prolog_load_context(module, Module),
 46 |     Module \== regex,  % we don't want string interpolation ourselves
 47 |     predicate_property(Module:(_=~_),imported_from(regex)),
 48 |     % the 'Name'=Var pairs of the clause being loaded become the Captures argument
 49 |     prolog_load_context(variable_names, Vars),
 50 |     expand_equalstilde(Text =~ Pattern, Vars, Goal).
 51 | 
 52 | 
 53 | %%  \~(+Text, +Pattern) is semidet.
 54 | %
 55 | %   Negated match: true exactly when `Text =~ Pattern` fails.
 56 | Text \~ Pattern :-
 57 |     \+ ( Text =~ Pattern ).
 58 | 
 59 | 
 60 | %%  regex(+Pattern:text,+Options,+Text:text,?Captures:list) is semidet
 61 | %
 62 | %   True if Text matches the regular expression Pattern, as modified
 63 | %   by Options (see below). Captured subgroups are unified with Captures
 64 | %   (see below). A `text` value is an atom, a string or a list of codes.
 65 | %   Throws a domain_error for `regex` if Pattern cannot be parsed.
 66 | %
 67 | %   Options can either be an atom or a list of options. If an atom, it's
 68 | %   split into a list of single character atoms which is used as the
 69 | %   Options value.  This allows one to use `is`, for example, instead of
 70 | %   `[i,s]`.  Acceptable options are:
 71 | %
 72 | %     * `i` - case-insensitive (default false)
 73 | %     * `s` - let `.` match `\n` (default false)
 74 | %
 75 | %   Captures is unified with a list of captured values, with the
 76 | %   leftmost capture first, etc. Each captured value is a list of codes.
 77 | %   For example,
 78 | %
 79 | %       ?- regex('(a+)(b*)', [], 'aaabbbbb', [A,B]).
 80 | %       A = "aaa",
 81 | %       B = "bbbbb".
 82 | %
 83 | %   Named captures are also supported. In that case, Captures must be
 84 | %   a list of pairs like `['A'=A,'B'=B]`, in any order. A named capture
 85 | %   without a key in Captures is not reported; a key that names no
 86 | %   capture in the pattern leaves its value unbound.
 87 | %
 88 | %   A brief word on argument order. Prolog convention prefers to place
 89 | %   an Options argument as the final argument or as the last one before
 90 | %   outputs. However, widely followed regular expression
 91 | %   convention places options immediately after the pattern. I chose to
 92 | %   follow the latter convention. This argument order
 93 | %   benefits higher-order calls like maplist/3 which can do things
 94 | %   like:
 95 | %
 96 | %       ?- maplist(regex('(a+)(b+)', i), [ab, aab, abb], L).
 97 | %       L = [["a", "b"], ["aa", "b"], ["a", "bb"]].
 98 | regex(Pattern,Options,Text,Captures) :-
 99 |     % normalize text representations
100 |     text_codes(Text, T),
101 |     text_codes(Pattern, P0),
102 |     starts_with_caret(P0,P,StartingCaret),
103 | 
104 |     % normalize options and captures into a state value
105 |     new_state(Options, Captures, State0),
106 | 
107 |     % compile Pattern
108 |     ( phrase(re(State0,Re),P) ->
109 |         ( StartingCaret=yes ->
110 |             once(engine_match(Re, State0, State, T, _))
111 |         ; otherwise ->
112 |             once(regex_no_sugar(Re, State0, State, T, _))
113 |         ),
114 |         ( var(Captures) ->
115 |             numbered_captures(State, Captures)
116 |         ; % captures already bound ->
117 |             true
118 |         )
119 |     ; % invalid pattern: report it as written, leading `^` included
120 |         atom_codes(A, P0),
121 |         domain_error(regex, A)
122 |     ).
123 | 
124 | % strip a leading `^` from the pattern codes and report whether one was there
125 | starts_with_caret(Codes, Rest, Caret) :-
126 |     (   Codes = [0'^|Rest], Caret = yes -> true    % ' syntax highlighter
127 |     ;   Rest = Codes, Caret = no ).
128 | 
129 | 
130 | % the heart and soul of regex/4: try to match at the current position;
131 | % failing that, skip one code of the text and try again from there.
132 | regex_no_sugar(Re, State0, State, Text, Rest) :-
133 |     engine_match(Re, State0, State, Text, Rest).
134 | regex_no_sugar(Re, State0, State, [_|Text], Rest) :-
135 |     regex_no_sugar(Re, State0, State, Text, Rest).
136 | 
137 | 
138 | %%  text_codes(+Text, -Codes)
139 | %
140 | %   Codes is the code list for Text, which may be an atom, a string
141 | %   or (returned unchanged) something that is already a code list.
142 | text_codes(Text, Codes) :-
143 |     (   atom(Text) ->
144 |         atom_codes(Text, Codes)
145 |     ;   string(Text) ->
146 |         string_codes(Text, Codes)
147 |     ;   % anything else is assumed to be a code list already
148 |         Codes = Text
149 |     ).
150 | 


--------------------------------------------------------------------------------
/prolog/regex/engine/pp.pl:
--------------------------------------------------------------------------------
 1 | :- module(regex_engine_pp, [engine_match/5]).
 2 | :- use_module(library(regex/state)).
 3 | 
 4 | % regular expression interpreter
 5 | 
 6 | % engine_match(RE, State0, State, S, Unmatched) is true if RE matches
 7 | % a Prefix of S such that append(Prefix, Unmatched, S). State is State0
 8 | % extended with the substrings of Prefix that matched the capturing
 9 | % groups of RE.
10 | 
11 | engine_match(union(Left, Right), State0, State) -->
12 |     (   engine_match(Left, State0, State)    % the left branch is preferred
13 |     ;   engine_match(Right, State0, State)   % tried on failure or backtracking
14 |     ).
15 | 
16 | engine_match(conc(First, Second), State0, State, S, U) :-
17 |     engine_match(First, State0, State1, S, Mid),   % Mid: text left after First
18 |     engine_match(Second, State1, State, Mid, U).
19 | 
20 | % count(RE,Min,Max): match RE at least Min and at most Max times, greedily
21 | engine_match(count(RE,N0,M0), State0, State) -->
22 |     { N0 > 0 },                          % mandatory repetitions remain
23 |     engine_match(RE, State0, State1),    % try for minimum matches
24 |     { succ(N, N0) },                     % one fewer required ...
25 |     { succ(M, M0) },                     % ... and one fewer allowed
26 |     engine_match(count(RE,N,M), State1, State).
27 | engine_match(count(RE,0,M0), State0, State) -->
28 |     { M0 > 0 },                          % optional repetitions remain
29 |     engine_match(RE, State0, State1),    % prefer more matches
30 |     { succ(M, M0) },
31 |     engine_match(count(RE,0,M), State1, State).
32 | engine_match(count(_,0,_), State, State) -->
33 |     { true }.                            % minimum met: stop without consuming
34 | 
35 | % Capturing group: reserve a capture slot, match, then fill the slot
36 | engine_match(group(Inner), State0, State, Text, Rest) :-
37 |     push_capture(Captured, State0, State1),
38 |     engine_match(Inner, State1, State, Text, Rest),
39 |     append(Captured, Rest, Text).   % Captured = what Inner consumed
40 | 
41 | % Named group: the same, except that push_capture/3 is given Name=Value
42 | % so the capture is recorded under Name as well as under its number.
43 | engine_match(named_group(Name,Inner), State0, State, Text, Rest) :-
44 |     push_capture(Name=Captured,State0,State1),
45 |     engine_match(Inner, State1, State, Text, Rest),
46 |     append(Captured, Rest, Text).
47 | 
48 | % `.` takes one code of any kind; a newline is only acceptable
49 | % while single-line mode (the `s` flag) is enabled in State
50 | engine_match(any, State, State) -->
51 |     [Code],
52 |     {   Code = 0'\n ->
53 |         singleline_mode(State)
54 |     ;   % every other code always matches
55 |         true
56 |     }.
57 | 
58 | % a literal code (escaped metacharacters included), compared after case adjustment
59 | engine_match(char(Wanted), State, State, [Code|Rest], Rest) :-
60 |     adjust_case(State, Code, Wanted).
61 | 
62 | % `$` holds only once the text is exhausted
63 | engine_match(eos, State, State, [], []).
64 | 
65 | % [^...]: one code, case adjusted, that no item of Set accepts
66 | engine_match(neg_set(Set), State, State, [Code|Rest], Rest) :-
67 |     adjust_case(State, Code, Adjusted),
68 |     \+ char_set_member(Adjusted, Set).
69 | 
70 | % [...]: one code, case adjusted, that some item of Set accepts
71 | engine_match(pos_set(Set), State, State, [Code|Rest], Rest) :-
72 |     adjust_case(State, Code, Adjusted),
73 |     char_set_member(Adjusted, Set).
74 | 
75 | % char_set_member(+Code, +Set): some char/1 or range/2 item of Set accepts Code
76 | char_set_member(Code, [Item|Items]) :-
77 |     (   Item = char(Code)
78 |     ;   Item = range(Low, High),
79 |         Low =< Code, Code =< High
80 |     ;   char_set_member(Code, Items)
81 |     ).
82 | 


--------------------------------------------------------------------------------
/prolog/regex/parser.pl:
--------------------------------------------------------------------------------
  1 | :- module(regex_parser, [re//2]).
  2 | :- use_module(library(dcg/basics), [integer//1, string//1]).
  3 | :- use_module(library(regex/state), [adjust_case/3]).
  4 | 
  5 | :- set_prolog_flag(double_quotes, string).
  6 | 
  7 | % DCG parser for regular expressions: re(+Opt, -Tree)// parses a whole pattern
  8 | re(Opt, Z) -->
  9 |     basic_re(Opt,W),
 10 |     re_tail(Opt,W,Z).
 11 | 
 12 | % zero or more `|` alternatives, folded left into union/2 terms
 13 | re_tail(Opt, W, Z) -->
 14 |     "|",
 15 |     basic_re(Opt,X),
 16 |     re_tail(Opt,union(W,X), Z).
 17 | re_tail(_Opt, W, W) -->
 18 |     { true }.
 19 | 
 20 | % one alternative: a sequence of one or more (possibly quantified) elements
 21 | basic_re(Opt, Z) -->
 22 |     simple_re(Opt,W),
 23 |     basic_re_tail(Opt,W,Z).
 24 | % further sequence members, folded left into conc/2 terms
 25 | basic_re_tail(Opt, W, Z) -->
 26 |     simple_re(Opt,X),
 27 |     basic_re_tail(Opt,conc(W,X), Z).
 28 | basic_re_tail(_Opt, W, W) -->
 29 |     { true }.
 30 | 
 31 | % one elemental expression followed by an optional quantifier
 32 | simple_re(Opt, Z) -->
 33 |     elemental_re(Opt,W),
 34 |     simple_re_tail(Opt,W,Z).
 35 | 
 36 | simple_re_tail(_Opt, Re, count(Re,0,999_999_999)) -->
 37 |     "*".                    % 999_999_999 serves as "no upper bound"
 38 | simple_re_tail(_Opt, Re, count(Re,1,999_999_999)) -->
 39 |     "+".
 40 | simple_re_tail(_Opt, Re, count(Re,0,1)) -->
 41 |     "?".
 42 | simple_re_tail(_Opt, Re, count(Re,Exact,Exact)) -->
 43 |     % x{n}: exactly n repetitions
 44 |     "{",
 45 |     integer(Exact),
 46 |     { Exact >= 0 },
 47 |     "}".
 48 | simple_re_tail(_Opt, Re, count(Re,Min,999_999_999)) -->
 49 |     % x{n,}: n or more repetitions
 50 |     "{",
 51 |     integer(Min),
 52 |     { Min >= 0 },
 53 |     ",",
 54 |     "}".
 55 | simple_re_tail(_Opt, Re, count(Re,Min,Max)) -->
 56 |     % x{n,m}: between n and m repetitions
 57 |     "{",
 58 |     integer(Min),
 59 |     { Min >= 0 },
 60 |     ",",
 61 |     integer(Max),
 62 |     { Max >= Min },
 63 |     "}".
 64 | simple_re_tail(_Opt, Re, Re) -->
 65 |     [].                     % no quantifier follows
 66 | 
 67 | % elemental_re(+Opt, -Tree)// parses one indivisible unit of a pattern
 68 | elemental_re(_Opt, any) -->
 69 |     ".".
 70 | % no clause accepts an unescaped `^` (a `caret` clause was once sketched
 71 | % here); regex/4 removes a leading `^` before the pattern is parsed
 72 | elemental_re(Opt, group(X)) -->
 73 |     "(",
 74 |     re(Opt, X),
 75 |     ")".
 76 | elemental_re(Opt, named_group(Name, X)) -->
 77 |     "(?<",
 78 |     string(NameCodes),
 79 |     { atom_codes(Name, NameCodes) },
 80 |     ">",
 81 |     re(Opt, X),
 82 |     ")".
 83 | elemental_re(_Opt, eos) -->
 84 |     "$".
 85 | elemental_re(State, char(C)) -->   % plain literal, case adjusted
 86 |     [C0],
 87 |     { \+ re_metachar(C0) },
 88 |     { adjust_case(State, C0, C) }.
 89 | elemental_re(Opt, RE) -->          % \d \D \s \S \w \W
 90 |     "\\",
 91 |     [C],
 92 |     { perl_character_class(C, Opt, RE) }.
 93 | elemental_re(_Opt, char(C)) -->    % backslash-escaped metacharacter
 94 |     "\\",
 95 |     [C],
 96 |     { re_metachar(C) }.
 97 | elemental_re(Opt, neg_set(X)) -->
 98 |     "[^",
 99 |     !,  % don't backtrack into pos_set/1 clause below
100 |     set_items(Opt,X),
101 |     "]".
102 | elemental_re(Opt, pos_set([char(0'-)|X])) -->   % set starting with a literal `-`
103 |     "[-",
104 |     !,  % don't backtrack into pos_set/1 clause below
105 |     set_items(Opt,X),
106 |     "]".
107 | elemental_re(Opt, pos_set(X)) -->
108 |     "[",
109 |     set_items(Opt,X),
110 |     "]".
111 | elemental_re(Opt, pos_set([char(0'-)|X])) -->   % set ending with a literal `-`
112 |     "[",
113 |     set_items(Opt,X),
114 |     "-]".
115 | 
116 | 
117 | % re_metachar(?Code): Code must be backslash-escaped to be matched literally
118 | re_metachar(0'^).
119 | re_metachar(0'\\).
120 | re_metachar(0'|).
121 | re_metachar(0'*).
122 | re_metachar(0'+).
123 | re_metachar(0'.).
124 | re_metachar(0'?).
125 | re_metachar(0'[).
126 | re_metachar(0'$).
127 | re_metachar(0'().
128 | re_metachar(0')).   % note: `{`, `}` and `]` are not listed as metacharacters
129 | 
130 | 
131 | % Perl classes: \d, \w and \s as positive sets; uppercase forms negate them
132 | perl_character_class(0'd, Opt, pos_set(Items)) :-
133 |     string_codes("0-9", Spec),
134 |     phrase(set_items(Opt, Items), Spec).
135 | perl_character_class(0'w, Opt, pos_set(Items)) :-
136 |     string_codes("0-9A-Za-z_", Spec),
137 |     phrase(set_items(Opt, Items), Spec).
138 | perl_character_class(0's, _Opt, pos_set(Blanks)) :-
139 |     Blanks = [ char(0'\t)   % tab
140 |              , char(0'\n)   % newline
141 |              , char(0'\f)   % form feed
142 |              , char(0'\r)   % carriage return
143 |              , char(0' ) ]. % space
144 | perl_character_class(Negated, Opt, neg_set(Items)) :-
145 |     code_type(Plain, lower(Negated)),   % Plain: lowercase counterpart of Negated
146 |     perl_character_class(Plain, Opt, pos_set(Items)).
147 | 
148 | % set_items(+Opt, -Items)// parses the one or more items inside `[...]`
149 | set_items(Opt, [Item1|MoreItems]) -->
150 |     set_item(Opt, Item1),
151 |     set_items(Opt, MoreItems).
152 | set_items(Opt, [Item1]) -->
153 |     set_item(Opt, Item1).
154 | % a set item is a plain code, an escaped set metacharacter or a range A-B
155 | set_item(State, char(C)) -->
156 |     [C0],
157 |     { \+ set_metachar(C0) },
158 |     { adjust_case(State,C0,C) }.
159 | set_item(_Opt, char(C)) -->
160 |     "\\",
161 |     [C],
162 |     { set_metachar(C) }.
163 | set_item(Opt, range(A,B)) -->
164 |     set_item(Opt, char(A)),   % both range ends are parsed as single items
165 |     "-",
166 |     set_item(Opt, char(B)).
167 | 
168 | % codes that set_item//2 accepts as literals only when backslash-escaped
169 | set_metachar(0'\\).
170 | set_metachar(0']).
171 | set_metachar(0'-).
172 | 
173 | 


--------------------------------------------------------------------------------
/prolog/regex/state.pl:
--------------------------------------------------------------------------------
  1 | :- module(regex_state, [ adjust_case/3
  2 |                        , new_state/3
  3 |                        , push_capture/3
  4 |                        , singleline_mode/1
  5 |                        , numbered_captures/2
  6 |                        ]).
  7 | :- use_module(library(apply), [ foldl/4 ]).
  8 | :- use_module(library(assoc)).
  9 | :- use_module(library(record)).
 10 | 
 11 | :- record state(i='-', s='-', capture_count=0, captures).
 12 | 
 13 | %% new_state(+OptionSugar, ?CaptureSugar, -State) is semidet
 14 | %
 15 | %  True if State is an opaque value representing the regular expression
 16 | %  state described by OptionSugar and CaptureSugar. OptionSugar
 17 | %  should be a list or an atom. If it's an atom it should
 18 | %  be something like 'i', 'is', etc. Fails if OptionSugar contains an
 19 | %  unknown option. An unbound CaptureSugar stands for an empty list.
 20 | new_state(OptionSugar, CaptureSugar, State) :-
 21 |     atom(OptionSugar),
 22 |     OptionSugar \== [],  % empty list atom needs no expansion
 23 |     !,
 24 |     atom_chars(OptionSugar, Chars),
 25 |     new_state(Chars, CaptureSugar, State).
 26 | new_state(OptionList, CaptureSugar, State) :-
 27 |     var(CaptureSugar),   % no capture pattern supplied
 28 |     !,
 29 |     new_state(OptionList, [], State).
 30 | new_state(OptionList, CaptureSugar, State) :-
 31 |     default_state(State0),
 32 | 
 33 |     % set all options (fails if set_option//1 has no clause for one of them)
 34 |     foldl(set_option, OptionList, State0, State1),
 35 | 
 36 |     % start from an empty table, then record the caller-supplied captures
 37 |     empty_assoc(Captures),
 38 |     set_captures_of_state(Captures, State1, State2),
 39 |     foldl(push_pattern, CaptureSugar, State2, State3),
 40 | 
 41 |     % next round of captures resumes numbering at the beginning
 42 |     set_capture_count_of_state(0, State3, State).
 43 | 
 44 | 
 45 | %% set_option(+Option, +State0, -State) is semidet
 46 | %
 47 | %  State is State0 with the flag for Option (`i` or `s`) switched to '+'.
 48 | set_option(i, State0, State) :-
 49 |     set_i_of_state('+', State0, State).
 50 | set_option(s, State0, State) :-
 51 |     set_s_of_state('+', State0, State).
 52 | 
 53 | 
 54 | %% push_pattern(+Capture, +State0, -State) is semidet
 55 | %
 56 | %  State is State0 plus one caller-supplied capture: `Name=Value` is
 57 | %  stored under Name only, anything else under the next capture number.
 58 | push_pattern(Capture, State0, State) :-
 59 |     (   var(Capture) ->
 60 |         % unbound: a numbered capture whose value is not yet known
 61 |         push_numbered(Capture, State0, State)
 62 |     ;   Capture = (Name=Value) ->
 63 |         % named: the name itself must be fully instantiated
 64 |         ground(Name),
 65 |         push_named(Name, Value, State0, State)
 66 |     ;   % \+ var(Capture)
 67 |         % \+ Capture=(Name=_)
 68 |         % bound and not a pair: a numbered capture with a known value
 69 |         push_numbered(Capture, State0, State)
 70 |     ).
 71 | 
 72 | 
 73 | %% push_capture(+Capture, +State0, -State) is semidet
 74 | %
 75 | %  State is State0 plus one capture made while matching. `Name=Value`
 76 | %  is recorded twice, under Name and under the next capture number;
 77 | %  any other Capture is recorded under the next capture number only.
 78 | push_capture(Capture, State0, State) :-
 79 |     (   var(Capture) ->
 80 |         % unnamed group whose text is filled in after the match
 81 |         push_numbered(Capture, State0, State)
 82 |     ;   Capture = (Name=Value) ->
 83 |         % named group: store by name first, then by number
 84 |         ground(Name),
 85 |         push_named(Name, Value, State0, State1),
 86 |         push_numbered(Value, State1, State)
 87 |     ;   % \+ var(Capture)
 88 |         % \+ Capture=(Name=_)
 89 |         % already bound value: numbered capture
 90 |         push_numbered(Capture, State0, State)
 91 |     ).
 92 | 
 93 | 
 94 | %% push_numbered(+Value, +State0, -State) is semidet.
 95 | %
 96 | %  State is State0 with Value stored under the current capture count.
 97 | push_numbered(Value, State0, State) :-
 98 |     % the capture count doubles as the key for this capture
 99 |     state_capture_count(State0, Index),
100 |     succ(Index, NextIndex),
101 | 
102 |     % store Value, or unify it with a value already held at Index
103 |     state_captures(State0, Held0),
104 |     insert_pair(Index, Value, Held0, Held),
105 | 
106 |     % write the captures and the advanced count back
107 |     set_captures_of_state(Held, State0, State1),
108 |     set_capture_count_of_state(NextIndex, State1, State).
109 | 
110 | 
111 | %% push_named(+Name, +Value, +State0, -State) is semidet
112 | %
113 | %  State is State0 with Value stored (or unified) under the key Name.
114 | push_named(Name, Value, State0, State) :-
115 |     % current capture table
116 |     state_captures(State0, Held0),
117 | 
118 |     % add Name, or unify Value with what Name already holds
119 |     insert_pair(Name, Value, Held0, Held),
120 | 
121 |     % install the updated table
122 |     set_captures_of_state(Held, State0, State).
123 | 
124 | 
125 | % insert_pair(+Key, ?Value, +Assoc0, -Assoc): unify Value with the value
126 | % already stored under Key, or add Key-Value if Key is absent
127 | insert_pair(Key, Value, Assoc0, Assoc) :-
128 |     (   Assoc = Assoc0,
129 |         get_assoc(Key, Assoc0, Existing) ->
130 |         Value = Existing
131 |     ;   put_assoc(Key, Assoc0, Value, Assoc)
132 |     ).
133 | 
134 | 
135 | %% numbered_captures(+State, -Captures:list) is det.
136 | %
137 | %  Captures lists the values stored under capture numbers 0..Count-1.
138 | numbered_captures(State, List) :-
139 |     state_captures(State, Table),
140 |     state_capture_count(State, Count),
141 |     numbered_captures(0, Count, Table, List).
142 | numbered_captures(Index, Count, Table, List) :-
143 |     (   Index = Count, List = [] -> true      % all numbers visited
144 |     ;   List = [Value|Tail],
145 |         get_assoc(Index, Table, Value),
146 |         succ(Index, Next),
147 |         numbered_captures(Next, Count, Table, Tail) ).
148 | 
149 | 
150 | %% adjust_case(+Options, +Code0, -Code) is det.
151 | %
152 | %  Code is Code0 unchanged unless the `i` flag of the state Options is
153 | %  '+'; in that case Code is obtained from Code0 via code_type/2 with
154 | %  to_lower/1, so pattern and text codes compare case-insensitively.
155 | adjust_case(Options, Code0, Code) :-
156 |     ( state_i(Options, '+') ->
157 |           code_type(Code, to_lower(Code0))   % NOTE(review): Code is unbound here; confirm a single answer
158 |     ; % otherwise ->
159 |           Code = Code0
160 |     ).
161 | 
162 | %%  singleline_mode(+State) is semidet.
163 | %
164 | %   True if the `s` flag of State is '+', i.e. `.` may match a newline.
165 | singleline_mode(State) :-
166 |     state_s(State, '+').
167 | 


--------------------------------------------------------------------------------
/t/captures.pl:
--------------------------------------------------------------------------------
  1 | :- use_module(library(regex)).
  2 | :- use_module(library(when), [when/2]).
  3 | 
  4 | :- set_prolog_flag(double_quotes, codes).
  5 | 
  6 | % codes_even(?Codes, ?Even): Codes spells the even integer Even
  7 | % (helper for the when/2 tests below)
  8 | codes_even(Codes, Even) :-
  9 |     number_codes(Even, Codes),
 10 |     Even mod 2 =:= 0.
 11 | 
 12 | :- use_module(library(tap)).
 13 | 
 14 | % no captures
 15 | regex('\\d+', [], '932', []).
 16 | regex('.*', [], howdy, []).
 17 | 
 18 | 'single capture (entire)' :-
 19 |     regex('(\\d+)', [], '932', ["932"]).
 20 | 'single capture (partial)' :-
 21 |     regex('a (\\d+)rd', [], 'a 3rd', ["3"]).
 22 | 
 23 | 'two captures' :-
 24 |     regex('(\\S+) (\\S+)', [], 'hello world', ["hello", "world"]).
 25 | 
 26 | 
 27 | 'one, explicit named capture' :-
 28 |     regex('Hi (?<Name>\\w+)', i, 'Hi Ed', ['Name'=Name]),
 29 |     Name == "Ed".
 30 | 
 31 | 'two, explicit named captures' :-
 32 |     regex('(?<A>\\d) (?<B>\\d)', [], 'a 1 2 b', ['B'=B,'A'=A]),
 33 |     A == "1",
 34 |     B == "2".
 35 | 
 36 | 'two named captures, only one is used' :-
 37 |     regex('(?<A>\\d) (?<B>\\d)', [], 'a 1 2 b', ['A'=A]),
 38 |     A == "1".
 39 | 
 40 | 'one named capture, two names given' :-
 41 |     regex('(?<A>a)bcd', [], 'abcd', ['B'=B,'A'=A]),
 42 |     A == "a",
 43 |     var(B).
 44 | 
 45 | 'numbered, constrained captures' :-
 46 |     when(ground(A),(number_codes(N,A), 1 =:= N mod 2)),
 47 |     regex("odd: ([0-9]+)", i, "Odd: 77", [A]).
 48 | 
 49 | 'named, constrained captures' :-
 50 |     when(ground(A),(number_codes(N,A), 0 =:= N mod 2)),
 51 |     regex("even: (?<A>[0-9]+)", [], "even: 42", ['A'=A]).
 52 | 
 53 | 'named capture treated as a numbered capture' :-
 54 |     regex('(?<A>a)bcd', [], 'abcd', Captures),
 55 |     Captures == ["a"].
 56 | 
 57 | 'all captures initially unbound' :-
 58 |     regex('hello (\\w+)', i, 'Hello Sue', Captures),
 59 |     Captures == ["Sue"].
 60 | 
 61 | 'single capture initially unbound' :-
 62 |     regex('hello (\\w+)', i, 'Hello Joe', [Whom]),
 63 |     Whom == "Joe".
 64 | 
 65 | 'two captures initially unbound' :-
 66 |     regex('(\\S+) (\\S+)', [], 'hello world', Captures),
 67 |     Captures = ["hello", "world"].
 68 | 
 69 | 'pattern matches but captures fail unification'(fail) :-
 70 |     Whom = "Thomas",
 71 |     regex('hello ([a-z]+)', i, 'Hello Tom', [Whom]).
 72 | 
 73 | 
 74 | 'implicit named captures: one' :-
 75 |     N = _, % avoid singleton warning
 76 |     "num: 42" =~ "num: (?<N>\\d+)$",
 77 |     N == "42".
 78 | 
 79 | 'implicit named captures: two' :-
 80 |     X = _, Y = _, % avoid singleton warnings
 81 |     "hi:hola" =~ "(?<X>\\w+):(?<Y>\\w+)",
 82 |     Y == "hola",
 83 |     X == "hi".
 84 | 
 85 | 'implicit named captures: three' :-
 86 |     N = _, % avoid singleton warning
 87 |     Pattern = "num: (?<N>\\d+)$",
 88 |     "num: 42" =~ Pattern,
 89 |     N == "42".
 90 | 
 91 | 'implicit named captures: extra in-scope variables' :-
 92 |     X = X,  % an extra, in-scope variable
 93 |     Name = _, % avoid singleton warnings
 94 |     "Hi John" =~ "hi (?<Name>[a-z]+)"/i,
 95 |     Name == "John".
 96 | 
 97 | 'implicit named captures: contraints before match OK' :-
 98 |     when( ground(X), codes_even(X,N) ),
 99 |     "even: 42" =~ "even: (?<X>[0-9]+)$",
100 |     N =:= 42.
101 | 
102 | 'implicit named captures: contraints before match FAIL'(fail) :-
103 |     when( ground(X), codes_even(X,_) ),
104 |     "even: 43" =~ "even: (?<X>[0-9]+)$".
105 | 


--------------------------------------------------------------------------------
/t/examples.pl:
--------------------------------------------------------------------------------
 1 | :- use_module(library(regex)).
 2 | 
 3 | valid_email(Email) :-
 4 |     % anchored with ^ and $ so that the whole text must be an address
 5 |     Email =~ '^[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,4}$'/i.
 6 | 
 7 | :- use_module(library(tap)).
 8 | 
 9 | % character ranges
10 | deadbeef =~ '[a-f0-9]+'.
11 | "1973c" =~ '[a-f0-9]+'.
12 | `feed` =~ '[a-f0-9]+'.
13 | 'C' =~ '[A-Z0-9]'.
14 | 'trailing-dash' =~ '[a-zA-Z0-9-]+'.
15 | 'leading-dash' =~ '[-a-zA-Z0-9]+'.
16 | 
17 | % + meta character
18 | abbba =~ 'ab+a'.
19 | 
20 | % * meta character
21 | aa =~ "ab*a".
22 | 
23 | % union
24 | cat =~ 'cat|dog'.
25 | dog =~ 'cat|dog'.
26 | pig \~ 'cat|dog'.
27 | 
28 | % ^ meta character
29 | 'begin with ^' :- begin =~ '^beg'.
30 | 'beggar with ^' :- beggar =~ '^beg'.
31 | 'i beg with ^' :- 'i beg' \~ '^beg'.
32 | 'hello world' =~ world.  % no anchor matches anywhere in string
33 | 'i beg' =~ beg.
34 | 
35 | % $ meta character
36 | dog =~ 'dog$'.
37 | doggie \~ 'dog$'.
38 | 
39 | % . meta character
40 | cat =~ 'c.t'.
41 | cot =~ 'c.t'.
42 | 'c-t' =~ 'c.t'.
43 | 
44 | % ? meta character
45 | https =~ 'https?'.
46 | http =~ 'https?'.
47 | bot =~ 'boo?t'.
48 | boot =~ 'boo?t'.
49 | 
50 | % quantification
51 | http =~ 'ht{2}p'.
52 | bot \~ 'bo{2,}t'.
53 | boot =~ 'bo{2,}t'.
54 | booot =~ 'bo{2,}t'.
55 | boooot =~ 'bo{2,}t'.
56 | bot \~ 'bo{2,4}t'.
57 | boot =~ 'bo{2,4}t'.
58 | booot =~ 'bo{2,4}t'.
59 | boooot =~ 'bo{2,4}t'.
60 | booooot \~ 'bo{2,4}t'.
61 | 
62 | % email
63 | valid_email("michael@ndrix.org").
64 | valid_email("some-one@googlemail.co.uk").
65 | 
66 | 'works with pattern in variable' :-
67 |     Pattern = '[a-z]',
68 |     'a' =~ Pattern.
69 | 
70 | 'works with text in variable' :-
71 |     Text = a,
72 |     Text =~ '[a-z]'.
73 | 


--------------------------------------------------------------------------------
/t/options.pl:
--------------------------------------------------------------------------------
 1 | :- use_module(library(regex)).
 2 | 
 3 | :- use_module(library(tap)).
 4 | 
 5 | % i option succeeds
 6 | 'ABC' =~ abc/i.
 7 | abc =~ abc/i.
 8 | 'ABC' =~ 'ABC'/i.
 9 | abc =~ 'ABC'/i.
10 | 'ABC' =~ aBc/i.
11 | abc =~ aBc/i.
12 | aBc =~ aBC/i.
13 | 
14 | % i option affects character classes too
15 | 'E' =~ '[a-z]'/i.
16 | 'E' =~ '[A-Z]'/i.
17 | 'e' =~ '[a-z]'/i.
18 | 'e' =~ '[A-Z]'/i.
19 | 
20 | % i option doesn't help
21 | foo \~ abc/i.
22 | 
23 | 
24 | % s option (single-line mode)
25 | 'abc\ndef' \~ 'c.d'.
26 | 'abc\ndef' =~ 'c.d'/s.
27 | 


--------------------------------------------------------------------------------
/t/perl_classes.pl:
--------------------------------------------------------------------------------
 1 | :- use_module(library(regex)).
 2 | 
 3 | :- use_module(library(tap)).
 4 | 
 5 | 'one two' =~ '\\s'.
 6 | '^ with \\S' :- 'one two' =~ '^\\S'.
 7 | 
 8 | '^ with \\d' :- '123 main' =~ '^\\d+'.
 9 | '^ with \\D' :- '123 main' \~ '^\\D+'.
10 | 
11 | '123 main' =~ '\\w'.
12 | '^ with \\W' :- '123 main' \~ '^\\W'.
13 | 
14 | '9876' =~ '\\d'.
15 | '9876' \~ '\\D'.
16 | ' \t' \~ '\\w'.
17 | ' \t' =~ '\\W'.
18 | ' \t' =~ '\\s'.
19 | ' \t' \~ '\\S'.
20 | 


--------------------------------------------------------------------------------
/t/synopsis.pl:
--------------------------------------------------------------------------------
1 | :- use_module(library(regex)).
2 | :- use_module(library(tap)).
3 | 
4 | synopsis :-
5 |     '99 Bottles of Beer' =~ '[0-9]+ bottles'/i,
6 |     writeln('Take one down...').
7 | 


--------------------------------------------------------------------------------