├── Code
├── SIMD
│ ├── i-got-nothing.lisp
│ ├── package.lisp
│ ├── one-more-re-nightmare-simd.asd
│ ├── prefix.lisp
│ ├── code-generation.lisp
│ ├── loop.lisp
│ ├── prefix-strategy.lisp
│ └── sbcl-x86-64.lisp
├── Compiler
│ ├── layout.lisp
│ ├── optimize-settings.lisp
│ ├── compilation-strategy.lisp
│ ├── length-inference.lisp
│ └── code-generation.lisp
├── DFA-construction
│ ├── interpret.lisp
│ ├── empty.lisp
│ ├── effects.lisp
│ ├── nullable.lisp
│ ├── derivative-classes.lisp
│ ├── similar.lisp
│ ├── print-dfa.lisp
│ ├── derivative.lisp
│ ├── tag-sets.lisp
│ ├── type.lisp
│ ├── make-dfa.lisp
│ ├── re-types.lisp
│ └── sets.lisp
├── Interface
│ ├── convert-to-bytes.lisp
│ ├── code-cache.lisp
│ ├── lint.lisp
│ ├── syntax.lisp
│ └── interface.lisp
├── package.lisp
└── one-more-re-nightmare.asd
├── Tests
├── package.lisp
├── one-more-re-nightmare-tests.asd
├── regrind.lisp
└── tests.lisp
├── .gitignore
├── Documentation
├── make-html.sh
├── make-pdf.sh
├── one-more-re-nightmare.scrbl
├── README.md
├── tex-prefix.tex
├── mobile-view.css
├── spec-macros.tex
├── bibliography.rkt
├── introduction.scrbl
├── posix.scrbl
├── spec-macros.css
├── interface.scrbl
├── spec-macros.scrbl
└── linting.scrbl
├── LICENSE
└── README.md
/Code/SIMD/i-got-nothing.lisp:
--------------------------------------------------------------------------------
1 | (error "one-more-re-nightmare-simd only supports SBCL on x86-64. Sorry.")
2 |
--------------------------------------------------------------------------------
/Tests/package.lisp:
--------------------------------------------------------------------------------
1 | (defpackage :one-more-re-nightmare-tests
2 | (:use :cl)
3 | (:export #:run-tests #:regrind))
4 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *~
2 | *.fasl
3 | .*
4 | Documentation/*.aux
5 | Documentation/*.log
6 | Documentation/*.out
7 | Documentation/*.pdf
8 | Documentation/*.toc
9 | Documentation/*.ilg
10 | Documentation/*.ind
11 | Documentation/one-more-re-nightmare.tex
12 | Documentation/katex/*
13 | Documentation/one-more-re-nightmare/*
--------------------------------------------------------------------------------
/Code/SIMD/package.lisp:
--------------------------------------------------------------------------------
1 | (defpackage :one-more-re-nightmare.vector-primops
2 | (:use)
3 | (:export #:v-and #:v-or #:v-not #:all-of
4 | #:v-and8 #:v-and32 #:v-or8 #:v-or32 #:v-not8 #:v-not32
5 | #:v32> #:v32= #:v8> #:v8= #:v8-
6 | #:v-broadcast32 #:v-movemask32 #:v-movemask8 #:v-broadcast8
7 | #:v-load32 #:v-load8 #:+v-length+ #:find-first-set))
8 |
--------------------------------------------------------------------------------
/Tests/one-more-re-nightmare-tests.asd:
--------------------------------------------------------------------------------
1 | (asdf:defsystem :one-more-re-nightmare-tests
2 | :author "Hayley Patton"
3 | :description "Tests for a regular expression compiler"
4 | :license "BSD 2-clause"
5 | :depends-on (:parachute :one-more-re-nightmare :lparallel :cl-cpus)
6 | :serial t
7 | :components ((:file "package")
8 | (:file "tests")
9 | (:file "regrind")))
10 |
--------------------------------------------------------------------------------
/Code/Compiler/layout.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | (defstruct layout
4 | "A structure representing the type and accessors for a vector of some sort."
5 | (array-type '(simple-array character 1))
6 | (ref 'aref)
7 | (from-number 'code-char)
8 | (to-number 'char-code)
9 | (less '<)
10 | (less-or-equal '<=)
11 | (equal '=))
12 |
13 | (defvar *default-layout* (make-layout))
14 |
--------------------------------------------------------------------------------
/Documentation/make-html.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | scribble --htmls one-more-re-nightmare.scrbl
4 | for p in one-more-re-nightmare/*.html; do
5 | sed -i -e 's/]*>//' \
6 | -e 's/initial-scale=0.8/initial-scale=1.0/' \
7 | -e "s/{}’/{}'/g" \
8 | "$p"
9 | done
10 | cat spec-macros.css mobile-view.css >> one-more-re-nightmare/scribble-style.css
11 |
--------------------------------------------------------------------------------
/Documentation/make-pdf.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | scribble --prefix tex-prefix.tex --latex one-more-re-nightmare.scrbl
4 | sed -i -e 's/^\\renewrmdefault//g' \
5 | -e 's/^\\packageTxfonts//g' \
6 | -e 's/\\hspace\*{\\fill}//g' \
7 | -e 's/\\HR{}\\\\/\\HR{}/g' \
8 | one-more-re-nightmare.tex
9 | sed -i -e 's/^\\\\//g' one-more-re-nightmare.tex
10 | pdflatex one-more-re-nightmare.tex
11 | makeindex one-more-re-nightmare
12 | pdflatex one-more-re-nightmare.tex
13 |
--------------------------------------------------------------------------------
/Documentation/one-more-re-nightmare.scrbl:
--------------------------------------------------------------------------------
1 | #lang scribble/base
2 |
3 | @require[scribble/core scribble/html-properties "bibliography.rkt"]
4 |
5 | @title{The @emph{one-more-re-nightmare} Manual}
6 | @author{Applied Language}
7 |
8 | @table-of-contents[]
9 |
10 | @include-section["introduction.scrbl"]
11 | @include-section["interface.scrbl"]
12 | @include-section["linting.scrbl"]
13 | @include-section["posix.scrbl"]
14 | @generate-bibliography[]
15 |
16 | @index-section[]
17 |
--------------------------------------------------------------------------------
/Documentation/README.md:
--------------------------------------------------------------------------------
1 | Run `./make-html.sh` to compile the documentation into HTML
2 | documents. Run `./make-pdf.sh` to compile the documentation into a PDF
3 | book.
4 |
5 | The documentation is written using the
6 | [Scribble](https://docs.racket-lang.org/scribble/index.html) markup
7 | language. Note that compiling a PDF book also requires a LaTeX
8 | installation.
9 |
10 | Much of the style of both formats was taken from the [Netfarm
11 | book](https://cal-coop.gitlab.io/netfarm/documentation/).
12 |
--------------------------------------------------------------------------------
/Code/DFA-construction/interpret.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | (defun interpret-regular-expression (expression input)
4 | (loop for char across input
5 | for position from 0
6 | do (when (eq expression (empty-set))
7 | (return-from interpret-regular-expression nil))
8 | (print expression)
9 | (setf expression (derivative expression (symbol-set char))))
10 | (if (eq expression (empty-set))
11 | nil
12 | (nullable expression)))
13 |
--------------------------------------------------------------------------------
/Documentation/tex-prefix.tex:
--------------------------------------------------------------------------------
1 | \documentclass{article}
2 |
3 | \usepackage[utf8]{inputenc}
4 | \usepackage[T1]{fontenc}
5 | \usepackage{amsmath}
6 | \usepackage{pdflscape}
7 | \usepackage{changepage}
8 | \usepackage{ClearSans}
9 | \usepackage{charter}
10 | \usepackage[margin=1.5in,a4paper]{geometry}
11 |
12 | % Set up fonts
13 | \usepackage{titling}
14 | \usepackage{titlesec}
15 | \titleformat*{\section}{\Large\bfseries\sffamily}
16 | \titleformat*{\subsection}{\large\bfseries\sffamily}
17 | \titleformat*{\subsubsection}{\large\sffamily}
18 |
--------------------------------------------------------------------------------
/Code/SIMD/one-more-re-nightmare-simd.asd:
--------------------------------------------------------------------------------
1 | (asdf:defsystem :one-more-re-nightmare-simd
2 | :author "Hayley Patton"
3 | :description "SIMD acceleration for tight loops in DFAs"
4 | :license "BSD 2-clause"
5 | :depends-on (:one-more-re-nightmare)
6 | :serial t
7 | :components ((:file "package")
8 | (:file "code-generation")
9 | (:file "sbcl-x86-64")
10 | #-(and sbcl x86-64)
11 | (:file "i-got-nothing")
12 | (:file "prefix")
13 | (:file "prefix-strategy")
14 | (:file "loop")))
15 |
--------------------------------------------------------------------------------
/Code/DFA-construction/empty.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | (trivia:defun-match re-empty-p (re)
4 | "Is a regular expression basically an empty string?
5 | This is different to NULLABLE, yes, NULLABLE would accept e.g. a* or anything that is a superset of { \"\" }, but this accepts only the empty string (± tags)."
6 | ((or (empty-string) (tag-set _)) t)
7 | ;; The innards of a grep vector. If we start an alternation with an
8 | ;; empty ALPHA, then the earliest scanner in the vector won.
9 | ((either (alpha (or (empty-set) (tag-set _)) _) _) t)
10 | ((alpha r s)
11 | (or (and (re-empty-p s)
12 | (eq (empty-set) r))
13 | (re-empty-p r)))
14 | ((kleene r)
15 | (re-empty-p r))
16 | ((or (join r s) (either r s) (both r s))
17 | (and (re-empty-p r) (re-empty-p s)))
18 | (_ nil))
19 |
20 |
--------------------------------------------------------------------------------
/Code/DFA-construction/effects.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | (trivia:defun-match has-literals-p (re)
4 | ((literal _) t)
5 | ((either r s)
6 | (and (has-literals-p r) (has-literals-p s)))
7 | ((or (join r s) (both r s))
8 | (or (has-literals-p r) (has-literals-p s)))
9 | ((repeat r min _ _)
10 | (and (has-literals-p r) (plusp min)))
11 | ((invert r) (not (has-literals-p r)))
12 | (_ nil))
13 |
14 | (trivia:defun-match effects (re)
15 | ((tag-set s) s)
16 | ((join r s)
17 | (if (has-literals-p r)
18 | (effects r)
19 | (union (effects r) (effects s)
20 | :test #'equal)))
21 | ((or (either r s) (both r s))
22 | (union (effects r) (effects s)
23 | :test #'equal))
24 | ((invert r) (effects r))
25 | ((repeat r _ _ _) (effects r))
26 | ((alpha e _) (effects e))
27 | ((grep v _) (effects v))
28 | (_ '()))
29 |
--------------------------------------------------------------------------------
/Documentation/mobile-view.css:
--------------------------------------------------------------------------------
1 | @media screen and (max-width: 900px) {
2 | /* Make the table of contents appear before the main content. */
3 | .tocsub {
4 | display: none;
5 | }
6 | .tocset {
7 | position: initial;
8 | float: none;
9 | margin-bottom: 2em;
10 | }
11 | .tocview {
12 | background: transparent;
13 | }
14 |
15 | /* We don't need a left margin for the main content now. */
16 | .maincolumn, .tocset {
17 | width: calc(100% - 4em - var(--card-padding));
18 | margin-left: 2em;
19 | margin-right: 2em;
20 | }
21 |
22 | /* Put margin comments inline too. */
23 | .refpara, .refelem, .SAuthorListBox {
24 | position: initial;
25 | float: none;
26 | left: 0;
27 | width: 100%;
28 | margin: 0;
29 | height: initial;
30 | }
31 |
32 | .refcolumn {
33 | position: initial;
34 | width: initial;
35 | margin: initial;
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/Code/Compiler/optimize-settings.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | (defvar *optimize-settings*
4 | ;; SBCL SPEED doesn't seem to change anything, so we'll just try to
5 | ;; make the compiler run faster.
6 | #+sbcl '((speed 0) (safety 0) (compilation-speed 3) (debug 0))
7 | ;; Clozure doesn't bother to inline or type infer with high
8 | ;; COMPILATION-SPEED. Fortunately it's still fast enough.
9 | #+ccl '((speed 3) (safety 0) (compilation-speed 0) (debug 0))
10 | ;; ECL drops in performance when we increase compilation speed, but
11 | ;; the compiler is not noticeably faster. I blame the C compiler.
12 | #+ecl '((speed 3) (safety 0) (compilation-speed 0) (debug 0))
13 | ;; It seems that we get better performance on ABCL with low
14 | ;; compilation speed, but there's a lot of noise still.
15 | #+abcl '((speed 3) (safety 0) (compilation-speed 0) (debug 0))
16 | #-(or sbcl ccl ecl abcl) '((speed 3) (safety 0) (compilation-speed 3) (debug 0)))
17 |
18 | (defmacro with-naughty-compiler-switches (() &body body)
19 | #+sbcl
20 | `(let ((sb-c::*reoptimize-limit* 3))
21 | ,@body)
22 | #-sbcl
23 | `(progn ,@body))
24 |
--------------------------------------------------------------------------------
/Code/Interface/convert-to-bytes.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | #+(or)
4 | (trivia:defun-match string->byte-re (re)
5 | ((literal set)
6 | (etypecase set
7 | (positive-symbol-set
8 | (reduce #'either
9 | (elements set)
10 | :key (lambda (character)
11 | (text (babel:string-to-octets
12 | (string character))))
13 | :initial-value (empty-set)))
14 | (negative-symbol-set
15 | (reduce #'both
16 | (elements set)
17 | :key (lambda (character)
18 | (invert
19 | (text (babel:string-to-octets
20 | (string character)))))
21 | :initial-value (invert (empty-set))))))
22 | ((join r s) (join (string->byte-re r)
23 | (string->byte-re s)))
24 | ((either r s) (either (string->byte-re r)
25 | (string->byte-re s)))
26 | ((invert r) (invert (string->byte-re r)))
27 | ((kleene r) (kleene (string->byte-re r)))
28 | ((both r s) (both (string->byte-re r)
29 | (string->byte-re s)))
30 | ((type string)
31 | (string->byte-re (parse-regular-expression re)))
32 | (_ re))
33 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright 2020-2022 Hayley Patton
2 |
3 | Redistribution and use in source and binary forms, with or without
4 | modification, are permitted provided that the following conditions are
5 | met:
6 |
7 | 1. Redistributions of source code must retain the above copyright
8 | notice, this list of conditions and the following disclaimer.
9 | 2. Redistributions in binary form must reproduce the above copyright
10 | notice, this list of conditions and the following disclaimer in the
11 | documentation and/or other materials provided with the distribution.
12 |
13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
14 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
15 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
16 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
17 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
18 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
19 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
20 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
21 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 |
25 |
--------------------------------------------------------------------------------
/Code/SIMD/prefix.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | (trivia:defun-match %prefix (re)
4 | "Find the constant string prefix of a regular expression."
5 | ((empty-set)
6 | (values '() (empty-string)))
7 | ((literal set)
8 | (if (csum-has-classes-p set)
9 | (values '() (empty-string))
10 | (values `((:literal ,set)) (empty-string))))
11 | ((tag-set tags)
12 | (values `((:tags ,tags)) (empty-string)))
13 | ((join r s)
14 | (multiple-value-bind (p1 s1)
15 | (%prefix r)
16 | (cond
17 | ((eq (empty-string) s1)
18 | ;; Haven't hit something not constant, keep searching.
19 | (multiple-value-bind (p2 s2)
20 | (%prefix s)
21 | (values (append p1 p2) s2)))
22 | (t
23 | (values p1 (join s1 s))))))
24 | ((alpha r _)
25 | (%prefix r))
26 | (_
27 | (values '() re)))
28 |
29 | (defun prefix (re)
30 | (multiple-value-bind (prefix suffix)
31 | (%prefix re)
32 | (values prefix
33 | ;; Glue the tag map back on.
34 | (join (tag-set
35 | (loop for (type tags) in prefix
36 | when (eql type :tags)
37 | append (loop for ((v r) . nil) in tags
38 | collect (cons (list v r) (list v r)))))
39 | suffix))))
40 |
--------------------------------------------------------------------------------
/Code/DFA-construction/nullable.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | (defvar *gensym-assignments?* t)
4 |
5 | (defun cached-nullable* (re)
6 | (if *gensym-assignments?*
7 | (cached-nullable re)
8 | (cached-nullable-no-gensym re)))
9 |
10 | (defun (setf cached-nullable*) (value re)
11 | (if *gensym-assignments?*
12 | (setf (cached-nullable re) value)
13 | (setf (cached-nullable-no-gensym re) value)))
14 |
15 | (defun nullable (re)
16 | "(language-of (nullable RE)) = (language-of (both RE (empty-string)))"
17 | (with-slot-consing (cached-nullable* re)
18 | (trivia:ematch re
19 | ((empty-string) (empty-string))
20 | ((literal _) (empty-set))
21 | ((join r s) (join (nullable r) (nullable s)))
22 | ((either r s) (either (nullable r) (nullable s)))
23 | ((repeat r min _ c) (let ((rn (if c (nullable r) (empty-set))))
24 | (if (plusp min)
25 | (empty-set)
26 | (either rn (empty-string)))))
27 | ((both r s) (both (nullable r) (nullable s)))
28 | ((tag-set s) (tag-set (gensym-position-assignments s)))
29 | ((invert r) (if (eq (nullable r) (empty-set))
30 | (empty-string)
31 | (empty-set)))
32 | ((grep r _) (nullable r))
33 | ((alpha r history) (either (nullable r) history)))))
34 |
--------------------------------------------------------------------------------
/Code/DFA-construction/derivative-classes.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | (defun merge-sets (sets1 sets2)
4 | "Produce a list of every subset of sets1 and sets2."
5 | (let ((sets (make-hash-table :test 'equal)))
6 | (loop for set1 in sets1
7 | do (loop for set2 in sets2
8 | for intersection = (csum-intersection set1 set2)
9 | do (setf (gethash intersection sets) t)))
10 | (alexandria:hash-table-keys sets)))
11 |
12 |
13 | (define-hash-consing-table *derivative-classes*)
14 |
15 | (defun derivative-classes (re)
16 | "Produce a list of the 'classes' (sets) of characters that compiling the regular expression would have to dispatch on."
17 | (with-hash-consing (*derivative-classes* re)
18 | (trivia:ematch re
19 | ((literal set) (list set (csum-complement set)))
20 | ((or (empty-string)
21 | (tag-set _))
22 | (list +universal-set+))
23 | ((join r s)
24 | (if (eq (nullable r) (empty-set))
25 | (derivative-classes r)
26 | (merge-sets (derivative-classes r)
27 | (derivative-classes s))))
28 | ((or (either r s) (both r s)
29 | (grep r s))
30 | (merge-sets (derivative-classes r)
31 | (derivative-classes s)))
32 | ((or (invert r) (repeat r _ _ _))
33 | (derivative-classes r))
34 | ((alpha r _)
35 | (derivative-classes r)))))
36 |
--------------------------------------------------------------------------------
/Code/package.lisp:
--------------------------------------------------------------------------------
1 | (when (find-package "NET.DIDIERVERNA.DECLT")
2 | (loop with request = "Get me off your 'reference manual' generator. "
3 | with nil = (write-line "Get Me Off Your 'Reference Manual' Generator
4 | " *debug-io*)
5 | for (section size) in '(("ABSTRACT" 10) ("1 INTRODUCTION" 25)
6 | (2 5) (2.1 20) (2.2 10) (3 25) (3.1 15)
7 | (3.2 15) (3.3 20) (3.4 15) (4 20)
8 | ("5 SUMMARY" 5))
9 | do (if (stringp section)
10 | (format *debug-io* "~%~%~A~%" section)
11 | (format *debug-io* "~%~%~D ~:@(~A~)~%" section request))
12 | (dotimes (i size)
13 | (write-string request *debug-io*)
14 | (when (and (> size 10) (zerop (random 16)))
15 | (terpri *debug-io*) (terpri *debug-io*))))
16 | #+sbcl (sb-ext:quit)
17 | ;; close enough
18 | #-sbcl (loop collect (make-array 10)))
19 |
20 | (defpackage :one-more-re-nightmare
21 | (:use :cl)
22 | (:export #:compile-regular-expression #:compiled-regular-expression
23 | #:all-matches #:all-string-matches
24 | #:first-match #:first-string-match
25 | #:do-matches
26 | #:*state-limit* #:exceeded-state-limit
27 | #:lint-style-warning
28 | #:not-matchable-style-warning
29 | #:matching-too-much-style-warning))
30 |
--------------------------------------------------------------------------------
/Code/SIMD/code-generation.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | (defvar *bits*)
4 | (defvar *broadcasts*)
5 | (defun find-broadcast (value)
6 | (or (gethash value *broadcasts*)
7 | (setf (gethash value *broadcasts*)
8 | (make-symbol (format nil "BROADCAST-~d" value)))))
9 |
10 | (defun test-from-isum (variable isum)
11 | (translate-scalar-code variable (make-test-form isum variable)))
12 |
13 | (defun code-from-prefix (prefix)
14 | (assert (not (null prefix)) () "Why /even bother/ with a zero-length prefix?")
15 | (let ((tests '())
16 | (loads '())
17 | (assignments '())
18 | (n 0))
19 | (dolist (part prefix)
20 | (trivia:ematch part
21 | ((list :literal isum)
22 | (let ((name (make-symbol (format nil "LOAD-~d" n))))
23 | (trivia:ematch (test-from-isum name isum)
24 | (:always)
25 | (:never (error "This should never happen - the empty set has no prefix!"))
26 | (test
27 | (push `(,name (,(find-op "LOAD") vector (the fixnum (+ ,n start))))
28 | loads)
29 | (push test tests))))
30 | (incf n))
31 | ((list :tags tags)
32 | (push `(let ((position (the fixnum (+ ,n position))))
33 | ,(setf-from-assignments tags))
34 | assignments))))
35 | (values
36 | (one-more-re-nightmare.vector-primops:all-of tests)
37 | loads
38 | `(progn ,@(reverse assignments))
39 | n)))
40 |
--------------------------------------------------------------------------------
/Documentation/spec-macros.tex:
--------------------------------------------------------------------------------
1 | % The "type" part of @defthing.
2 | \newcommand{\FloatRight}[1]{\nobreak\hfill\enspace #1}
3 | \newcommand{\Param}[1]{\mbox{#1}}
4 | % For fancy looking definition section headers.
5 | \newcommand{\SmallCaps}[1]{\textsc{#1}}
6 |
7 | \renewenvironment{SInsetFlow}{%
8 | \hfill\begin{minipage}{\dimexpr\textwidth-2em}%
9 | }{
10 | \end{minipage}\\
11 | }
12 |
13 | \newenvironment{Definitions}{%
14 | \vspace{0.5em}
15 | \begin{minipage}{\textwidth}
16 | \setlength{\parskip}{0.5em}
17 | \noindent{\rule{\linewidth}{0.4pt}}
18 | }{
19 | \end{minipage}
20 | }
21 |
22 | % Definition lines
23 | % Thanks to https://tex.stackexchange.com/questions/312581/how-to-calculate-width-of-remaining-part-of-line
24 | % for more or less showing how to get this layout working.
25 |
26 | \newsavebox{\defunnamebox}
27 | \newlength{\defunnamewidth}
28 |
29 | \newcommand{\DefunName}[1]{%
30 | \sbox{\defunnamebox}{\strut \textbf{#1}}%
31 | \global\defunnamewidth=\wd\defunnamebox
32 | \usebox{\defunnamebox}%
33 | }
34 | \newcommand{\LambdaList}[1]{%
35 | \parbox[t]{\dimexpr\columnwidth-\defunnamewidth-\fboxsep}{
36 | \raggedright\strut\emph{#1}
37 | }%
38 | }
39 |
40 | \newenvironment{Landscape}{%
41 | \begin{landscape}
42 | }{
43 | \end{landscape}
44 | }
45 |
46 | \newenvironment{CenteredContainer}{}{}
47 |
48 | % Okay, not really centering, but I don't know if I really want centering,
49 | % or just a big indent.
50 | \newenvironment{CenteredBlock}{%
51 | \hfill\begin{minipage}{\dimexpr\textwidth-6em}\raggedright
52 | }{
53 | \end{minipage}
54 | }
55 |
56 | \setlength{\parindent}{0pt}
57 |
58 | % Have each section start on a new page.
59 | \let\oldsection\section
60 | \renewcommand\section{\clearpage\oldsection}
61 | \setlength{\parskip}{0.5em}
--------------------------------------------------------------------------------
/Documentation/bibliography.rkt:
--------------------------------------------------------------------------------
1 | #lang racket
2 |
3 | (require scriblib/autobib)
4 | (provide generate-bibliography ~cite
5 | derivatives apl3000 derivatives-reexamined
6 | rte petalisp cox posix)
7 |
8 | (define-cite ~cite citet generate-bibliography)
9 |
10 | (define-syntax-rule (define-bib name title author date rest ...)
11 | (define name
12 | (make-bib #:title title
13 | #:author author
14 | #:date date
15 | rest ...)))
16 |
17 |
18 | (define-bib derivatives
19 | "Derivatives of Regular Expressions"
20 | "Janus A. Brzozowski"
21 | 1964
22 | #:url "https://dl.acm.org/doi/10.1145/321239.321249")
23 |
24 | (define-bib apl3000
25 | "The dynamic incremental compiler of APL\\3000"
26 | "Ronald L. Johnston"
27 | 1979
28 | #:url "http://www.softwarepreservation.org/projects/apl/Papers/DYNAMICINCREMENTAL")
29 |
30 | (define-bib derivatives-reexamined
31 | "Regular-expression derivatives reexamined"
32 | (authors "Scott Owens" "John Reppy" "Aaron Turon")
33 | 2009
34 | #:url "https://www.ccs.neu.edu/home/turon/re-deriv.pdf")
35 |
36 | (define-bib rte
37 | "Type-Checking of Heterogeneous Sequences in Common Lisp"
38 | (authors "Jim Newton" "Akim Demaille" "Didier Verna")
39 | 2016
40 | #:url "https://hal.archives-ouvertes.fr/hal-01380792/document")
41 |
42 | (define-bib petalisp
43 | "Petalisp: A Common Lisp Library for Data Parallel Programming"
44 | "Marco Heisig"
45 | 2018
46 | #:url "https://dl.acm.org/doi/10.5555/3323215.3323216")
47 |
48 | (define-bib cox
49 | "Regular Expression Matching: the Virtual Machine Approach"
50 | "Russ Cox"
51 | 2009
52 | #:url "https://swtch.com/~rsc/regexp/regexp2.html")
53 |
54 | (define-bib posix
55 | "Regular expressions"
56 | "IEEE"
57 | 2018
58 | #:url "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html")
59 |
--------------------------------------------------------------------------------
/Documentation/introduction.scrbl:
--------------------------------------------------------------------------------
1 | #lang scribble/base
2 | @require["spec-macros.scrbl" "bibliography.rkt"]
3 |
4 | @title{Introduction}
5 |
6 | one-more-re-nightmare is a new regular expression compiler implemented
7 | in Common Lisp.
8 |
9 | The aim is to generate code that will perform as well, if not better,
10 | than hand-written searching code. We believe we have achieved this
11 | aim; as finite state machine generation removes redundant tests that
12 | would not typically be removed by a human programmer, compilation is
13 | performed for specific array types, and the compiler can also make use
14 | of SIMD intrinsics for simple loops. We also intend to produce
15 | excellent analysis, to help with writing regular expressions that do
16 | the "right thing".
17 |
18 | @section{Prior work}
19 |
20 | The derivative approach was introduced by the late Janusz Brzozowski
21 | in @~cite[derivatives].
22 |
23 | The name of the library is due to the song @term{One More Red
24 | Nightmare} by the band King Crimson in 1974.
25 |
26 | Ronald Johnston wrote on the APL\3000 compiler in @~cite[apl3000],
27 | which produced specialised code for different array layouts on demand.
28 |
29 | Scott Owens, John Reppy and Aaron Turon showed how to produce finite
30 | state machines that are very close to minimal in size in
31 | @~cite[derivatives-reexamined], using the derivative approach. This
32 | minimality was achieved by using additional rewrite rules, allowing
33 | the machine generation process to reuse more states, instead of
34 | producing more redundant states.
35 |
36 | Jim Newton compiled finite state machines to Common Lisp code in
37 | @~cite[rte], to implement @term{regular type expressions}.
38 |
39 | We were first introduced to using the Common Lisp compiler as a
40 | backend for a @term{just-in-time compiler} by the Petalisp language by
41 | Marco Heisig @~cite[petalisp].
42 |
43 | Gilbert Baumann described how to implement submatching using
44 | derivatives in a currently unpublished paper.
45 |
--------------------------------------------------------------------------------
/Code/one-more-re-nightmare.asd:
--------------------------------------------------------------------------------
1 | (asdf:defsystem :one-more-re-nightmare
2 | :author "Hayley Patton"
3 | :description "A regular expression compiler"
4 | :license "BSD 2-clause"
5 | :depends-on (:trivia :alexandria :babel
6 | :esrap :trivial-indent
7 | :dynamic-mixins :stealth-mixin
8 | :bordeaux-threads)
9 | :serial t
10 | :components ((:file "package")
11 | (:module "DFA-construction"
12 | :components ((:file "type")
13 | (:file "sets")
14 | (:file "re-types")
15 | (:file "nullable")
16 | (:file "tag-sets")
17 | (:file "derivative")
18 | (:file "derivative-classes")
19 | (:file "empty")
20 | (:file "effects")
21 | (:file "similar")
22 | (:file "make-dfa")))
23 | (:module "Compiler"
24 | :components ((:file "layout")
25 | (:file "compilation-strategy")
26 | (:file "length-inference")
27 | (:file "optimize-settings")
28 | (:file "code-generation")))
29 | (:module "Interface"
30 | :components ((:file "syntax")
31 | (:file "convert-to-bytes")
32 | (:file "code-cache")
33 | (:file "lint")
34 | (:file "interface"))))
35 | ;; Trivia emits warnings at compile-time that type-i can't infer
36 | ;; types for some patterns.
37 | :around-compile (lambda (thunk)
38 | (handler-bind ((warning
39 | (lambda (c)
40 | (when (typep c (find-symbol "FAILED-TYPE-INFERENCE" "TYPE-I"))
41 | (muffle-warning)))))
42 | (funcall thunk))))
43 |
--------------------------------------------------------------------------------
/Code/Interface/code-cache.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | (defvar *code-cache*
4 | (make-hash-table :test 'equal))
5 | (defvar *code-lock*
6 | (bt:make-lock "Compiled code cache lock"))
7 |
8 | (defstruct (compiled-regular-expression (:conc-name cre-))
9 | (codes (alexandria:required-argument) :type simple-vector)
10 | original-re)
11 | (deftype re-designator ()
12 | `(or compiled-regular-expression string))
13 |
14 | (defvar *string-types*
15 | (alexandria:map-product #'list
16 | '(simple-array array)
17 | '(character base-char)
18 | '(1)))
19 |
20 | (defun compile-regular-expression (expression)
21 | (make-compiled-regular-expression
22 | :codes (coerce
23 | (loop for type in *string-types*
24 | collect (find-code expression type))
25 | 'vector)
26 | :original-re expression))
27 |
28 | (defmethod print-object ((cre compiled-regular-expression) stream)
29 | (print-unreadable-object (cre stream :type t)
30 | (write-string (cre-original-re cre) stream)))
31 |
32 | (defvar *type-dispatcher*
33 | (compile nil
34 | `(lambda (cre type)
35 | (cond
36 | ,@(loop for type in *string-types*
37 | for index from 0
38 | collect `((eq type ',type) (svref (cre-codes cre) ,index)))))))
39 |
40 | (defun find-code (regular-expression type-specifier)
41 | (when (compiled-regular-expression-p regular-expression)
42 | (return-from find-code
43 | (funcall *type-dispatcher* regular-expression type-specifier)))
44 | (bt:with-lock-held (*code-lock*)
45 | (multiple-value-bind (code present?)
46 | (gethash (list regular-expression type-specifier)
47 | *code-cache*)
48 | (when present?
49 | (return-from find-code code))))
50 | (multiple-value-bind (function groups)
51 | (%compile-regular-expression
52 | regular-expression
53 | :layout (make-layout :array-type type-specifier))
54 | (bt:with-lock-held (*code-lock*)
55 | (setf (gethash (list (copy-seq regular-expression)
56 | type-specifier)
57 | *code-cache*)
58 | (cons function groups)))))
59 |
60 | (defun string-type-of (string)
61 | (loop for type in *string-types*
62 | when (typep string type)
63 | do (return-from string-type-of type))
64 | (error 'type-error
65 | :datum string
66 | :expected-type 'string))
67 |
--------------------------------------------------------------------------------
/Code/SIMD/loop.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | (defclass simd-loop (simd-info) ())
4 |
5 | (defun assignments-idempotent-p (assignments)
6 | "Are the assignments idempotent, i.e. would repeated applications of the assignments, interleaved with incrementing the position, be the same as applying the assignments once at the end?"
7 | ;; We can't form un-idempotent assignments unless we end up writing
8 | ;; a variable we also read. But apparently optimising loops that
9 | ;; carry over variables, i.e. aren't for the start, makes
10 | ;; performance worse. Weird.
11 | #-one-more-re-nightmare::precise-aip
12 | (null assignments)
13 | #+one-more-re-nightmare::precise-aip
14 | (progn
15 | (setf assignments
16 | (loop for assignment in assignments
17 | for (target . source) = assignment
18 | unless (equal target source)
19 | collect assignment))
20 | (null (intersection (mapcar #'car assignments)
21 | (mapcar #'cdr assignments)
22 | :test #'equal))))
23 |
24 | (defmethod transition-code ((strategy simd-loop) previous-state transition)
25 | (let* ((next-state (transition-next-state transition))
26 | (next-expression (state-expression next-state)))
27 | ;; We optimise tight loops like A -> A.
28 | (when (or (re-stopped-p next-expression)
29 | (re-empty-p next-expression)
30 | (not (eq next-state previous-state))
31 | (not (assignments-idempotent-p
32 | (transition-tags-to-set transition)))
33 | (csum-has-classes-p (transition-class transition)))
34 | (return-from transition-code (call-next-method)))
35 | ;; Try to skip to the first character after for which this transition doesn't apply.
36 | (let* ((vector-length (/ one-more-re-nightmare.vector-primops:+v-length+ *bits*)))
37 | (trivia:ematch
38 | (test-from-isum 'loaded
39 | (csum-complement (transition-class transition)))
40 | (:never (call-next-method))
41 | (:always (error "Found a transition that is never taken."))
42 | (test
43 | `(progn
44 | ;; Don't try to read over the END we were given.
45 | (loop
46 | (unless (< (the fixnum (+ ,vector-length position)) end)
47 | (return))
48 | (let* ((loaded (,(find-op "LOAD") vector position))
49 | (test (,(find-op "MOVEMASK") ,test)))
50 | (unless (zerop test)
51 | (incf position (one-more-re-nightmare.vector-primops:find-first-set test))
52 | (return)))
53 | (incf position ,vector-length))
54 | ,(call-next-method)))))))
55 |
--------------------------------------------------------------------------------
/Tests/regrind.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare-tests)
2 |
3 | ;;;; It's regrind time!
4 |
5 | (defvar *start*)
6 | (defvar *end*)
7 |
8 | (defun checked-string-ref (string index)
9 | (assert (and (<= *start* index) (< index *end*)))
10 | (aref string index))
11 |
12 | (defvar *layout*
13 | (one-more-re-nightmare::make-layout
14 | :ref 'checked-string-ref))
15 |
16 | (defvar *remaining-depth* 4)
17 | (defun random-re ()
18 | (macrolet ((terminal ()
19 | ;; A random element of [A-Z].
20 | '(string (code-char (+ 65 (random 26)))))
21 | (recurse (control n)
22 | `(format nil ,control ,@(loop repeat n collect '(random-re)))))
23 | (if (zerop *remaining-depth*)
24 | (terminal)
25 | (let ((*remaining-depth* (1- *remaining-depth*)))
26 | (case (random 8)
27 | (0 (terminal))
28 | (1 (recurse "~a~a" 2))
29 | (2 (recurse "(~a)" 1))
30 | (3 (recurse "«~a»" 1))
31 | (4 (recurse "(~a)|(~a)" 2))
32 | (5 (recurse "(~a)&(~a)" 2))
33 | (6 (recurse "(¬~a)" 1))
34 | (7 (recurse "(~a)*" 1)))))))
35 |
36 | (defun random-haystack ()
37 | (let* ((n (random 80))
38 | (haystack (make-string n)))
39 | (dotimes (i n)
40 | (setf (char haystack i) (code-char (+ 65 (random 26)))))
41 | haystack))
42 |
43 | (defun ensure-lparallel-kernel ()
44 | (when (null lparallel:*kernel*)
45 | (setf lparallel:*kernel*
46 | (lparallel:make-kernel (cpus:get-number-of-processors)))))
47 |
48 | (defun regrind (n &key (depth 4))
49 | (let ((success t))
50 | (ensure-lparallel-kernel)
51 | (lparallel:pdotimes (i n success n)
52 | (let* ((*remaining-depth* depth)
53 | #+sbcl (one-more-re-nightmare::*code-type* :interpreted)
54 | (re (random-re))
55 | (haystack (random-haystack)))
56 | (handler-case
57 | (one-more-re-nightmare::%compile-regular-expression
58 | re
59 | :layout *layout*)
60 | (error (e)
61 | (format t "~&Compiling ~s fails with:~&~a" re e)
62 | (setf success nil))
63 | (:no-error (code registers)
64 | (let ((result (make-array registers))
65 | (*start* 0)
66 | (*end* (length haystack)))
67 | (handler-case
68 | (funcall code haystack 0 (length haystack) result
69 | (lambda ()
70 | (loop for p across result
71 | do (assert (or (null p) (<= 0 p *end*))))))
72 | (error (e)
73 | (format t "~&Matching ~s on the haystack ~s fails with:~&~a" re haystack e)
74 | (setf success nil))
75 | (:no-error (&rest stuff)
76 | (declare (ignore stuff))
77 | (when (zerop (mod i 100))
78 | (write-char #\.)
79 | (finish-output)))))))))))
80 |
--------------------------------------------------------------------------------
/Code/Compiler/compilation-strategy.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | (defvar *compiler-state*)
4 | (defvar *layout*)
5 |
6 | (defclass strategy ()
7 | ()
8 | (:documentation "A compilation strategy describes how potential matches should be searched for."))
9 |
10 | (defgeneric initial-states (strategy expression)
11 | (:documentation "Compute a list of states to start compiling from."))
12 | (defgeneric macros-for-strategy (strategy)
13 | (:documentation "A list of macros (at least using including WIN and RESTART) to use for compilation.")
14 | (:method-combination append))
15 | (defgeneric lambda-list (strategy)
16 | (:documentation "The lambda list of the function to generate."))
17 | (defgeneric start-code (strategy states)
18 | (:documentation "Part of a TAGBODY body used to start running a DFA."))
19 | (defgeneric transition-code (strategy previous-state transition))
20 | (defgeneric declarations (strategy))
21 |
22 | (defclass scan-everything (strategy)
23 | ()
24 | (:documentation "A compilation strategy which runs a regular expression vector over every position."))
25 |
26 | (defclass call-continuation (strategy)
27 | ()
28 | (:documentation "A compilation strategy which calls a continuation when a match is found."))
29 |
30 | (defun make-default-strategy (layout expression)
31 | (declare (ignore layout expression))
32 | (make-instance (dynamic-mixins:mix 'scan-everything 'call-continuation)))
33 |
34 | (defun add-tags (expression)
35 | (join (tag-set '(((start 0) . position)))
36 | (join expression
37 | (tag-set '(((end 0) . position))))))
38 |
39 | (defun make-search-machine (expression)
40 | ;; We add an ALPHA wrapper to store the last end point when we
41 | ;; succeed but have repetition, and a GREP wrapper to make sure we
42 | ;; continue when we fail to match.
43 | (let ((a (alpha (add-tags expression) (empty-set))))
44 | (grep a a)))
45 |
46 | (defmethod initial-states ((strategy scan-everything) expression)
47 | (list (make-search-machine expression)))
48 |
49 | (defmethod macros-for-strategy append ((strategy scan-everything))
50 | '((restart ()
51 | '(go start))))
52 |
53 | (defmethod macros-for-strategy append ((strategy call-continuation))
54 | '((win (&rest variables)
55 | `(progn
56 | ;; We have to compensate for the DFA assigning tags 1 too late
57 | ;; here.
58 | ,@(loop for (nil variable) in variables
59 | for n from 0
60 | collect `(setf (svref result-vector ,n)
61 | ,(if (symbolp variable)
62 | `(1- ,variable)
63 | variable)))
64 | (go win)))))
65 |
66 | (defmethod lambda-list ((strategy call-continuation))
67 | '(vector start end result-vector continuation))
68 |
69 | (defmethod declarations ((strategy call-continuation))
70 | `((,(layout-array-type *layout*) vector)
71 | (alexandria:array-index start end)
72 | (function continuation)
73 | (simple-vector result-vector)))
74 |
--------------------------------------------------------------------------------
/Documentation/posix.scrbl:
--------------------------------------------------------------------------------
1 | #lang scribble/base
2 |
3 | @require["spec-macros.scrbl" "bibliography.rkt"]
4 | @title{Differences between POSIX and PCRE regexen}
5 |
6 | Many regular expression engines implement PCRE-like regular expression
7 | semantics, but one-more-re-nightmare implements POSIX semantics. While
8 | the syntax of POSIX regular expressions is a subset of that of PCRE, the
9 | semantics can differ drastically.
10 |
11 | We use one-more-re-nightmare and cl-ppcre to show the differences in
12 | some experiments; the latter is probably the most commonly used
13 | regular expression engine for Common Lisp.
14 |
15 | @section{The longest match wins}
16 |
17 | When faced with an alternation, PCRE regular expressions prefer the
18 | left-most option, whereas POSIX regular expressions prefer the longest.
19 |
20 | @lisp-code{
21 | CL-USER> (one-more-re-nightmare:first-string-match "ant|antler" "antler")
22 | #("antler")
23 | CL-USER> (cl-ppcre:scan-to-strings "ant|antler" "antler")
24 | "ant"
25 | #()
26 | }
27 |
28 | Of course, the results are the same when the options are sorted
29 | longest to shortest.
30 |
31 | @lisp-code{
32 | CL-USER> (cl-ppcre:scan-to-strings "antler|ant" "antler")
33 | "antler"
34 | #()
35 | }
36 |
37 | @section{The longest submatch wins}
38 |
39 | The results of the longest match rule produces interesting results
40 | when combined with submatches.
41 |
42 | @lisp-code{
43 | CL-USER> (one-more-re-nightmare:first-string-match "«a*»*" "aaa")
44 | #("aaa" "aaa")
45 | CL-USER> (cl-ppcre:scan-to-strings "(a*)*" "aaa")
46 | "aaa"
47 | #("")
48 | }
49 |
50 | POSIX prefers matching everything, whereas PCRE prefers matching the empty
51 | string. The behaviour of PCRE can be fixed by requiring at least one iteration,
52 | by @cl{(a+)*}, but this is not necessary with POSIX.
53 |
54 | @lisp-code{
55 | CL-USER> (cl-ppcre:scan-to-strings "(a+)*" "aaa")
56 | "aaa"
57 | #("aaa")
58 | }
59 |
60 | Another interesting (but more contrived) example is @cl{(a|aa)*}.
61 |
62 | @lisp-code{
63 | CL-USER> (one-more-re-nightmare:first-string-match "«a|aa»*" "aaa")
64 | #("aaa" "a")
65 | CL-USER> (one-more-re-nightmare:first-string-match "«a|aa»*" "aaaa")
66 | #("aaaa" "aa")
67 |
68 | CL-USER> (cl-ppcre:scan-to-strings "(a|aa)*" "aaa")
69 | "aaa"
70 | #("a")
71 | CL-USER> (cl-ppcre:scan-to-strings "(a|aa)*" "aaaa")
72 | "aaaa"
73 | #("a")
74 | }
75 |
76 | Whereas PCRE always takes the left option (@cl{a}), POSIX attempts to
77 | take the right option (@cl{aa}) when possible. The result is that POSIX
78 | produces the submatch @cl{aa} when the string has an even (and
79 | non-zero) number of @cl{a}, and @cl{a} when the string has an odd
80 | number of @cl{a}.
81 |
82 | (We think that the "Pike virtual machine", popularised by a series of
83 | articles by Russ Cox @~cite[cox], cannot implement POSIX submatching
84 | semantics, despite its use in some implementations which claim POSIX
85 | compatibility; but we don't have a real proof yet. The basic idea is
86 | that the Pike machine prefers the shortest match that reaches a
87 | particular state, and thus only the shortest submatch is ever saved.
88 | The GNU libc implementation of regular expressions has the same
89 | behaviour as CL-PPCRE on @cl{(a|aa)*} for example.)
90 |
--------------------------------------------------------------------------------
/Code/DFA-construction/similar.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | (define-condition no-match () ())
4 |
5 | (defvar *environment*)
6 | (defvar *backward-environment*)
7 |
8 | (defun assert-equivalent (from to)
9 | (when (or (and (eql from 'position) (eql to 'position))
10 | (and (eql from 'nil) (eql to 'nil)))
11 | ;; Don't care for POSITION ~ POSITION or NIL ~ NIL
12 | (return-from assert-equivalent))
13 | (multiple-value-bind (old-to present?)
14 | (gethash from *environment*)
15 | (when (and present? (not (equal to old-to)))
16 | (error 'no-match))
17 | (multiple-value-bind (old-from present?)
18 | (gethash to *backward-environment*)
19 | (when (and present? (not (equal from old-from)))
20 | (error 'no-match))
21 | (setf (gethash from *environment*) to
22 | (gethash to *backward-environment*) from))))
23 |
24 | (defun assert-equivalent-sources (from to)
25 | "Ensure that we don't unify POSITION and a variable, or two different variables."
26 | (when (eql from to)
27 | (return-from assert-equivalent-sources))
28 | (unless (and (listp from) (listp to))
29 | (error 'no-match))
30 | (unless (eql (first from) (first to))
31 | (error 'no-match))
32 | (assert-equivalent from to))
33 |
34 | (trivia:defun-match* %similar (from to)
35 | (((both r1 s1) (both r2 s2))
36 | (%similar r1 r2)
37 | (%similar s1 s2))
38 | (((either r1 s1) (either r2 s2))
39 | (%similar r1 r2)
40 | (%similar s1 s2))
41 | (((join r1 s1) (join r2 s2))
42 | (%similar r1 r2)
43 | (%similar s1 s2))
44 | (((repeat r1 min1 max1 c1) (repeat r2 min2 max2 c2))
45 | (unless (and (eql min1 min2) (eql max1 max2) (eql c1 c2))
46 | (error 'no-match))
47 | (%similar r1 r2))
48 | (((invert r1) (invert r2))
49 | (%similar r1 r2))
50 | (((literal set1) (literal set2))
51 | (unless (eq set1 set2)
52 | (error 'no-match)))
53 | (((empty-string) (empty-string)))
54 | (((tag-set set1) (tag-set set2))
55 | (unless (= (length set1) (length set2))
56 | (error 'no-match))
57 | ;; This happens to work nicely as we know that NULLABLE
58 | ;; will never re-order substitutions in TAG-SETs.
59 | (loop for ((v1 r1) . s1) in set1
60 | for ((v2 r2) . s2) in set2
61 | do (unless (eql v1 v2)
62 | (error 'no-match))
63 | (assert-equivalent-sources s1 s2)
64 | (assert-equivalent (list v1 r1)
65 | (list v2 r2))))
66 | (((grep r1 _) (grep r2 _))
67 | ;; We don't actually need to do anything for GREP prototypes; they
68 | ;; never cause effects, and they never change while generating a
69 | ;; DFA. Unifying them just causes unnecessary failures.
70 | (%similar r1 r2))
71 | (((alpha r1 n1) (alpha r2 n2))
72 | ;; Though we do need to unify the history/nullable part of ALPHA.
73 | (%similar r1 r2)
74 | (%similar n1 n2))
75 | ((_ _) (error 'no-match)))
76 |
77 | (defun similar (from to)
78 | (let ((*environment* (make-hash-table :test 'equal))
79 | (*backward-environment* (make-hash-table :test 'equal)))
80 | (handler-case
81 | (values (%similar from to) t)
82 | (no-match ()
83 | nil)
84 | (:no-error (&rest r)
85 | (declare (ignore r))
86 | *environment*))))
87 |
--------------------------------------------------------------------------------
/Code/DFA-construction/print-dfa.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | ;;; The DFA we are printing.
4 | (defvar *dfa*)
5 | (defvar *initial-state*)
6 | (defvar *print-state-names* t)
7 |
8 | (defun generate-dot-for-expression (expression &key (search? t) (name-states? t))
9 | (uiop:with-temporary-file (:pathname p :stream s :keep t :type "dot")
10 | (with-hash-consing-tables ()
11 | (let* ((*tag-gensym-counter* 0)
12 | (*print-state-names* name-states?)
13 | (regex (parse-regular-expression expression))
14 | (initial-state
15 | (if search? (make-search-machine regex) regex))
16 | (dfa (make-dfa-from-expression initial-state)))
17 | (print-dfa dfa initial-state s)))
18 | p))
19 |
20 | (defun print-dfa (dfa initial-state &optional (stream *standard-output*))
21 | (let ((*dfa* dfa)
22 | (*initial-state* (gethash initial-state dfa)))
23 | (cl-dot:print-graph
24 | (cl-dot:generate-graph-from-roots
25 | 'dfa
26 | (list (gethash initial-state dfa))
27 | `(:node (:fontname "Inconsolata" :shape ,(if *print-state-names* :box :circle))
28 | :edge (:fontname "Inconsolata")))
29 | :stream stream)))
30 |
31 | (defmethod cl-dot:graph-object-node ((graph (eql 'dfa)) (state (eql 'nothing)))
32 | (make-instance 'cl-dot:node
33 | :attributes (list :label " "
34 | :color "#00000000"
35 | :fillcolor "#00000000")))
36 |
37 | (defmethod cl-dot:graph-object-node ((graph (eql 'dfa)) state)
38 | ;; We add a newline to state names so that cl-dot will emit the
39 | ;; trailing \l, which is necessary even when there isn't another
40 | ;; line break.
41 | (make-instance 'cl-dot:node
42 | :attributes (append
43 | (if *print-state-names*
44 | (list :label (list :left (format nil "~A~%" (state-expression state))))
45 | '())
46 | (list :style (if (eq (empty-set) (nullable (state-expression state)))
47 | :solid :bold)))))
48 |
49 | (defun trim-assignments-for-show (assignments)
50 | (let ((new-assignments
51 | (loop for assignment in assignments
52 | for (target . source) = assignment
53 | unless (equal target source)
54 | collect assignment)))
55 | (if (null new-assignments)
56 | ""
57 | (tag-set new-assignments))))
58 |
59 | (defmethod cl-dot:graph-object-edges ((graph (eql 'dfa)))
60 | (let ((edges (list (list 'nothing *initial-state*))))
61 | (maphash (lambda (re state)
62 | (declare (ignore re))
63 | (dolist (transition (state-transitions state))
64 | (push (list state (transition-next-state transition)
65 | (list :label
66 | (format nil "~a ~a"
67 | (with-output-to-string (s)
68 | (print-csum
69 | (transition-class transition)
70 | s))
71 | (trim-assignments-for-show
72 | (transition-tags-to-set transition)))))
73 | edges)))
74 | *dfa*)
75 | edges))
76 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # one-more-re-nightmare
2 |
3 | one-more-re-nightmare is a regular expression engine that uses the
4 | technique presented in [Regular-expression derivatives
5 | revisited](https://www.ccs.neu.edu/home/turon/re-deriv.pdf) to
6 | interpret and compile regular expressions. We use a few tricks to make
7 | matching quite fast:
8 |
9 | - We use a deterministic finite automaton to have O(n) runtime.
10 | - We run the Common Lisp compiler to generate machine code, rather
11 | than interpreting a DFA or bytecode, or jumping through closures
12 | (like CL-PPCRE does).
13 | - We generate specialised code for each array type, so everything is
14 | inlined.
15 | - If you use the `one-more-re-nightmare-simd` system on SBCL 2.1.10 or
16 | newer with AVX2, we even use vectorised scanning of constant
17 | prefixes of regular expressions.
18 |
19 | Thanks to Gilbert Baumann for suggesting I use derivatives to compile
20 | regular expressions, and then for informing me of how to handle
21 | submatching properly, and my discrete mathematics teachers for
22 | formally introducing me to finite state machines.
23 |
24 | Please see [the reference
25 | book](https://applied-langua.ge/projects/one-more-re-nightmare/) for
26 | how to use one-more-re-nightmare, or [an
27 | article](https://applied-langua.ge/posts/omrn-compiler.html) on the
28 | history and theory involved.
29 |
30 | While the syntax is admittedly wonky (but somewhat more like how
31 | regular expressions are presented in papers), one-more-re-nightmare
32 | makes its best effort to implement POSIX semantics for matching (as
33 | described in the specification for [how `regcomp`
34 | works](https://pubs.opengroup.org/onlinepubs/9699919799/functions/regexec.html)
35 | and [regular expression
36 | definitions](https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html)). Any
37 | behaviour contrary to POSIX is a bug.
38 |
39 | ## A lousy benchmark
40 |
41 | ```lisp
42 | CL-USER> (let ((s (make-string 1000000 :initial-element #\a)))
43 | (setf (aref s 333333) #\b)
44 | (setf (aref s 555555) #\c)
45 | (the-cost-of-nothing:bench
46 | (all-string-matches "ab|ac" s)))
47 |
48 | CL-USER> (let ((s (make-string 1000000 :initial-element #\a)))
49 | (setf (aref s 333333) #\b)
50 | (setf (aref s 555555) #\c)
51 | (the-cost-of-nothing:bench
52 | (cl-ppcre:all-matches-as-strings "ab|ac" s)))
53 | ```
54 |
55 | Note that, by nature of calling the Common Lisp compiler,
56 | one-more-re-nightmare will take longer to compile a regular
57 | expression, so it is better suited for many matching operations with
58 | few expressions. It does cache compiled expressions when using the
59 | high-level interface, so the initial cost may amortize well over many
60 | calls; and constant regular expression strings are compiled at
61 | compile-time, with no runtime overhead whatsoever.
62 |
63 | | engine | SBCL | Clozure CL | SBCL with AVX2 | ditto, SIMPLE-BASE-STRING |
64 | |------------------|-----------|------------|----------------|---------------------------|
65 | | o-m-r-n | 0.57ms | 2.93ms | 0.18ms | 55µs |
66 | | compilation time | 4.65ms | 3.76ms | 6.82ms | 6.43ms |
67 | | cl-ppcre | 22.8ms | 45.3ms | 22.8ms | 21.6ms |
68 | | break even after | 209kchars | 88.7kchars | 301kchars | 305kchars |
69 |
--------------------------------------------------------------------------------
/Code/DFA-construction/derivative.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | (define-hash-consing-table *derivative*)
4 |
5 | (defun derivative (re set)
6 | "Compute the derivative of a regular expression with regards to the set (i.e. the regular expression should be matched after a character in the set is matched)."
7 | (with-hash-consing (*derivative* (list re set))
8 | (trivia:ematch re
9 | ((or (empty-string) (empty-set) (tag-set _)) (empty-set))
10 | ((literal matching-set)
11 | (if (csum-null-p (csum-intersection matching-set set))
12 | (empty-set)
13 | (empty-string)))
14 | ((join r s)
15 | (let ((r* (derivative r set))
16 | (s* (derivative s set)))
17 | (cond
18 | ((eq r* (empty-set))
19 | ;; Something like [...]A doesn't need gensym'ing.
20 | (let ((*gensym-assignments?* nil))
21 | (join (nullable r) s*)))
22 | ((not (has-tags-p r*))
23 | ;; Ditto for A[...]
24 | (either (join r* s) (join (nullable r) s*)))
25 | (t
26 | (either (join r* (unique-tags s)) (join (nullable r) s*))))))
27 | ((repeat r min max _)
28 | (join (derivative r set)
29 | (repeat (unique-tags r)
30 | (max (1- min) 0)
31 | (if (null max) nil (1- max))
32 | nil)))
33 | ((either r s)
34 | (either (derivative r set) (derivative s set)))
35 | ((both r s)
36 | (both (derivative r set) (derivative s set)))
37 | ((invert r)
38 | (invert (derivative r set)))
39 | ((grep r s)
40 | (let* ((r* (derivative r set))
41 | (n (nullable r*)))
42 | (if (eq n (empty-set))
43 | (grep (either r*
44 | (if (has-tags-p r*)
45 | (unique-tags s)
46 | s))
47 | s)
48 | r*)))
49 | ((alpha r old-tags)
50 | (let* ((r* (derivative r set))
51 | (*gensym-assignments?* nil)
52 | (nullable (nullable r)))
53 | (alpha r*
54 | (either nullable old-tags)))))))
55 |
56 | (defun derivative* (re sequence &key search)
57 | (let ((variables (make-hash-table :test 'equal))
58 | (position 0)
59 | (*tag-gensym-counter* 0))
60 | (with-hash-consing-tables ()
61 | (when (stringp re) (setf re (parse-regular-expression re)))
62 | (when search (setf re (make-search-machine re)))
63 | (flet ((run-effects (effects)
64 | (loop for (target . source) in effects
65 | for value = (case source
66 | ((position) position)
67 | ((nil) 'nil)
68 | (otherwise (gethash source variables :unbound)))
69 | do (setf (gethash target variables) value))))
70 | (map 'nil
71 | (lambda (element)
72 | (let* ((new-re (derivative re (singleton-set (char-code element))))
73 | (effects (remove-if (lambda (x) (equal (car x) (cdr x)))
74 | (effects re))))
75 | (format t "~&~a~& ~:c ~a"
76 | re element effects)
77 | (setf re new-re)
78 | (run-effects effects)
79 | (incf position)))
80 | sequence)
81 | (run-effects (effects re))
82 | (format t "~&~a" (alexandria:hash-table-alist variables))
83 | (values re
84 | (trivia:match (nullable re)
85 | ((tag-set s)
86 | (loop for ((name nil) . source) in s
87 | unless (null (gethash source variables))
88 | collect (cons name (gethash source variables))))
89 | ((empty-string) '())
90 | ((empty-set) '()))
91 | (nullable re))))))
92 |
--------------------------------------------------------------------------------
/Documentation/spec-macros.css:
--------------------------------------------------------------------------------
1 | .FloatRight {
2 | float: right;
3 | }
4 |
5 | .SmallCaps {
6 | font-variant: small-caps;
7 | }
8 |
9 | .Indent {
10 | margin-left: 2em;
11 | }
12 |
13 | .Definitions {
14 | margin-top: 1em;
15 | margin-left: 0;
16 | margin-right: 0;
17 | border-top: 1px black solid;
18 | display: block;
19 | }
20 |
21 | .Landscape {
22 | margin: 0;
23 | }
24 |
25 | .Definitions > *:not(:empty):nth-of-type(1) {
26 | margin-top: 0.25em;
27 | }
28 |
29 | .TagDescription {
30 | float: right;
31 | text-style: italic;
32 | }
33 |
34 | .main {
35 | text-align: justify;
36 | }
37 |
38 | h1, h2, h3, h4, h5 {
39 | font-family: "Open Sans", sans-serif;
40 | text-align: left;
41 | }
42 |
43 | @media only screen {
44 | h4, h5 {
45 | background: #d1c4e9;
46 | }
47 | h3, h4, h5 {
48 | font-size: 120%;
49 | margin-bottom: 0.5em;
50 | }
51 |
52 | body {
53 | background: #eee;
54 | color: #111;
55 | --card-padding: 8pt;
56 | margin: var(--card-padding);
57 | }
58 |
59 | .tocset, .maincolumn, .refcolumn {
60 | padding: var(--card-padding);
61 | background: white;
62 | box-shadow: 0 3px 6px rgba(0,0,0,0.16), 0 3px 6px rgba(0,0,0,0.23);
63 | border-radius: 2px;
64 | }
65 | .maincolumn {
66 | width: min(50em, calc(100vw - 12.5em - 13.5em - 10 * var(--card-padding)));
67 | margin: 0 auto;
68 | }
69 |
70 | .tocview, .tocsub {
71 | background: transparent;
72 | }
73 | .tocviewlink, .tocviewselflink, .tocsubseclink {
74 | color: black;
75 | }
76 | .tocviewselflink {
77 | text-decoration: none;
78 | font-weight: 700;
79 | }
80 |
81 | h4, h5, .navsettop, .navsetbottom {
82 | padding: var(--card-padding);
83 | position: relative;
84 | left: calc(-1 * var(--card-padding));
85 | width: 100%;
86 | }
87 | .navsettop {
88 | top: calc(-1 * var(--card-padding));
89 | margin-bottom: calc(-1 * var(--card-padding));
90 | }
91 |
92 | .navsettop, .navsetbottom {
93 | border: none;
94 | background: #FFCDD2;
95 | }
96 | .navright > a {
97 | color: black;
98 | font-style: italic;
99 | text-decoration: none;
100 | }
101 | .nonavigation {
102 | color: #333;
103 | }
104 |
105 | .refpara {
106 | left: calc(3 * var(--card-padding));
107 | }
108 |
109 | li::marker {
110 | color: #888;
111 | }
112 | }
113 |
114 | .texMathInline, .texMathDisplay {
115 | /* Make the KaTeX output about the same size as the other text. */
116 | font-size: 75%;
117 | }
118 | .texMathDisplay {
119 | /* Attempt to allow scrolling of long mathematical stuff. */
120 | display: block;
121 | overflow-x: auto;
122 | overflow-y: clip;
123 | }
124 |
125 | .CenteredBlock, .CenteredContainer {
126 | margin: 0;
127 | }
128 | .CenteredBlock {
129 | text-align: left;
130 | }
131 |
132 | .CenteredBlock > .SIntrapara, .CenteredBlock > p {
133 | margin: 0;
134 | }
135 |
136 | .CenteredContainer {
137 | display: block;
138 | margin-left: 2em;
139 | margin-bottom: 1em;
140 | overflow-x: auto;
141 | overflow-y: clip;
142 | }
143 |
144 | p:empty {
145 | display: none;
146 | }
147 |
148 | tr {
149 | vertical-align: top;
150 | }
151 |
152 | .stt {
153 | /* Disable italics we inherit from argument lists */
154 | font-style: normal;
155 | /* Don't try to justify code, that looks bad */
156 | text-justify: none;
157 | }
158 |
159 | .refcolumn {
160 | text-align: left;
161 | border: none;
162 | }
163 |
164 | p {
165 | line-height: 1.2;
166 | }
167 |
168 | body {
169 | line-height: 1.2;
170 | hyphens: auto;
171 | }
172 |
173 | blockquote {
174 | margin-top: 0;
175 | margin-bottom: 0;
176 | }
177 |
178 | h2 {
179 | margin-top: 0.5em; /* Foo, who made it 0 in Scribble? */
180 | }
181 |
182 | .AutobibLink {
183 | color: #111;
184 | font-style: italic;
185 | text-decoration: dashed underline;
186 | }
187 |
--------------------------------------------------------------------------------
/Code/Interface/lint.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | (define-condition lint-style-warning (style-warning)
4 | ())
5 |
6 | (define-condition not-matchable-style-warning (lint-style-warning)
7 | ())
8 |
9 | (define-condition expression-not-matchable (not-matchable-style-warning)
10 | ()
11 | (:report "This expression is impossible to match."))
12 |
13 | (define-condition group-not-matchable (not-matchable-style-warning)
14 | ((n :initarg :n :reader warning-group-number)
15 | (string :initarg :string :reader warning-group-string))
16 | (:report
17 | (lambda (c s)
18 | (format s "The ~:R group~%~4T~A~%in this expression is impossible to match."
19 | (warning-group-number c)
20 | (warning-group-string c)))))
21 |
22 | (define-condition matching-too-much-style-warning (lint-style-warning)
23 | ())
24 |
25 | (define-condition expression-matches-everything (matching-too-much-style-warning)
26 | ()
27 | (:report "This expression can match the empty string at every position."))
28 |
29 | (define-condition expression-matches-empty-string (matching-too-much-style-warning)
30 | ()
31 | (:report "This expression will only ever match the empty string at every position."))
32 |
33 | (defun check-liveness (dfa groups group-strings)
34 | (let ((matching? nil)
35 | (matched-groups '()))
36 | (maphash (lambda (re state)
37 | (unless (eq (nullable re) (empty-set))
38 | (setf matching? t))
39 | (loop for ((n nil) . nil) in (state-exit-map state)
40 | ;; Group #1 uses tags #1 and #2 - we'll use
41 | ;; #2 being written to test if #1 is alive.
42 | for group-number = (floor (1+ n) 2)
43 | when (evenp n)
44 | do (pushnew group-number matched-groups)))
45 | dfa)
46 | (if (not matching?)
47 | (warn 'expression-not-matchable)
48 | (loop for n from 1 to groups
49 | unless (member n matched-groups)
50 | do (warn 'group-not-matchable
51 | :n n
52 | :string (aref group-strings n))))))
53 |
54 | (defun check-empty-matches (dfa)
55 | (let ((warned-empty-string? nil)
56 | (expressions (remove (empty-set)
57 | (alexandria:hash-table-alist dfa)
58 | :key #'car)))
59 | ;; Check if this RE only matches the empty string. We can't just
60 | ;; do this syntactically, i.e. (eq expression (empty-string)) as e.g.
61 | ;; "|(a&bc)" confounds rewriting, and would produce a false negative.
62 | ;; Instead, we check that there is only one state that isn't the
63 | ;; empty set, and that it can only transition to the empty set.
64 | ;; Hence the DFA can only ever match the empty string.
65 | (when (alexandria:length= 1 expressions)
66 | (let ((re (car (first expressions))))
67 | (when (and (not (eq (nullable re) (empty-set)))
68 | (null
69 | (remove (empty-set)
70 | (state-transitions (cdr (first expressions)))
71 | :key (alexandria:compose #'state-expression
72 | #'transition-next-state))))
73 | (warn 'expression-matches-empty-string)
74 | (setf warned-empty-string? t))))
75 | (unless (or warned-empty-string? (null expressions))
76 | ;; We will match everywhere if every state is nullable.
77 | (let ((something-wont-match? nil))
78 | (maphash (lambda (re state)
79 | (declare (ignore state))
80 | (when (and (eq (nullable re) (empty-set))
81 | (not (eq re (empty-set))))
82 | (setf something-wont-match? t)))
83 | dfa)
84 | (unless something-wont-match? (warn 'expression-matches-everything))))))
85 |
86 | (defun lint-regular-expression (expression)
87 | (with-hash-consing-tables ()
88 | (multiple-value-bind (expression group-count group-strings)
89 | (parse-regular-expression expression)
90 | (let ((dfa (make-dfa-from-expression expression)))
91 | (check-liveness dfa group-count group-strings)
92 | (check-empty-matches dfa)))))
93 |
--------------------------------------------------------------------------------
/Code/Compiler/length-inference.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | (stealth-mixin:define-stealth-mixin length-inference-info ()
4 | state
5 | ((predecessors :accessor predecessors)
6 | (minimum-length :accessor minimum-length)))
7 |
8 | (defun compute-predecessor-lists (states)
9 | ;; Clear predecessor lists first.
10 | (maphash (lambda (ex state)
11 | (declare (ignore ex))
12 | (setf (predecessors state) '()))
13 | states)
14 | ;; For each transition, add the predecessor to the predecessor list
15 | ;; of the successor.
16 | (maphash (lambda (ex predecessor)
17 | (declare (ignore ex))
18 | (dolist (transition (state-transitions predecessor))
19 | (let ((successor (transition-next-state transition)))
20 | (unless (null successor)
21 | (push predecessor (predecessors successor))))))
22 | states))
23 |
24 | ;;; The following function implements Kildall's algorithm for
25 | ;;; optimisation, as described in
26 | ;;; .
27 | ;;; The end result of this analysis is that we can replace several
28 | ;;; bounds checks, each checking for one more character, with one
29 | ;;; check that checks for multiple characters.
30 |
31 | ;;; To compute the minimum lengths required to match from each state,
32 | ;;; this algorithm first optimistically makes far too high guesses
33 | ;;; about minimum lengths: if a state is not accepting, it guesses
34 | ;;; that the state will never reach an accepting state, and thus has
35 | ;;; an infinite minimum length; else, it correctly determines that the
36 | ;;; minimum length is 0 when the state is accepting. We then propagate
37 | ;;; lengths. A state takes one more character to match than its
38 | ;;; successors (the "transfer" function), and a set of successors
39 | ;;; states has a minimum length that is the minimum of the minimum
40 | ;;; length of each successor (the "confluence" function). We treat
41 | ;;; states that never reach an accepting state as having infinite
42 | ;;; minimum length, so our implementation has to handle some
43 | ;;; additional cases that aren't integer arithmetic:
44 | ;;; X ≠ infinity ⇒ X < infinity
45 | ;;; min(infinity, X) = min(X, infinity) = X for all X
46 | ;;; infinity + 1 = infinity
47 | ;;; Minimum lengths are repeatedly propagated until we don't actually
48 | ;;; lower anything by propagation. Then we have a correct result.
49 |
50 | (defun compute-minimum-lengths (states)
51 | (let ((work-list '()))
52 | (flet ((recompute-predecessors-of (state)
53 | (dolist (pred (predecessors state))
54 | (pushnew pred work-list)))
55 | (confluence (x y)
56 | (cond
57 | ((eql x :infinity) y)
58 | ((eql y :infinity) x)
59 | (t (min x y))))
60 | (transfer (x)
61 | (if (eql x :infinity)
62 | :infinity
63 | (1+ x)))
64 | (lower-p (new old)
65 | (if (eql old :infinity)
66 | (not (eql new :infinity))
67 | (< new old))))
68 | ;; Set the minimum length of every nullable state to be 0, and
69 | ;; the minimum length of every other state to be pretty large.
70 | (maphash (lambda (ex state)
71 | (setf (minimum-length state)
72 | (if (eq (nullable ex) (empty-set))
73 | :infinity
74 | 0))
75 | (recompute-predecessors-of state))
76 | states)
77 | ;; Set each minimum length to be one more than the minimum
78 | ;; length of the successors.
79 | (loop until (null work-list)
80 | do (let* ((state (pop work-list))
81 | (minimum-successors-length
82 | (transfer
83 | (reduce #'confluence
84 | (state-transitions state)
85 | :key (lambda (transition)
86 | (minimum-length (transition-next-state transition)))
87 | :initial-value :infinity))))
88 | (when (lower-p minimum-successors-length (minimum-length state))
89 | (setf (minimum-length state) minimum-successors-length)
90 | (recompute-predecessors-of state)))))))
91 |
92 | (defun state-never-succeeds-p (state)
93 | (eql (minimum-length state) :infinity))
94 |
--------------------------------------------------------------------------------
/Code/DFA-construction/tag-sets.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | (defvar *tag-gensym-counter* 0)
4 |
5 | (defun tag-gensym ()
6 | (incf *tag-gensym-counter*))
7 |
8 | (defun gensym-position-assignments (set)
9 | "Replicate any assignments, turning T_n <- s for all s into T^r_n <- T_n for some arbitrary r"
10 | (loop for (target . source) in set
11 | for (variable nil) = target
12 | collect (cond
13 | ((eql source 'nil)
14 | (if *gensym-assignments?*
15 | (cons (list variable (tag-gensym)) 'nil)
16 | (cons target 'nil)))
17 | ((not *gensym-assignments?*)
18 | (cons target target))
19 | (t
20 | (cons (list variable (tag-gensym)) target)))))
21 |
22 | (defun unique-assignments (set)
23 | "Make assignments unique, turning T_n <- s for all s into T^r_n <- s"
24 | (loop for ((variable nil) . source) in set
25 | collect (cons (list variable (tag-gensym)) source)))
26 |
27 | (defun merge-tag-sets (set1 set2)
28 | (append (loop for assignment in set1
29 | for (target . source) = assignment
30 | unless (find (first target) set2 :key #'caar)
31 | collect assignment)
32 | set2))
33 |
34 | (defun used-tags (re)
35 | (with-slot-consing (cached-used-tags re)
36 | (trivia:match re
37 | ((tag-set s) (mapcar #'cdr s))
38 | ((or (either r s) (both r s) (join r s))
39 | (union (used-tags r) (used-tags s) :test #'equal))
40 | ((or (invert r) (repeat r _ _ _))
41 | (used-tags r))
42 | ((grep vector _) (used-tags vector))
43 | ((alpha r history)
44 | (union (used-tags r) (used-tags history) :test #'equal))
45 | (_ '()))))
46 |
47 | (defun tags (re)
48 | (with-slot-consing (cached-tags re)
49 | (trivia:match re
50 | ((tag-set s) s)
51 | ((or (either r s) (both r s) (join r s))
52 | (union (tags r) (tags s) :test #'equal))
53 | ((or (invert r) (repeat r _ _ _))
54 | (tags r))
55 | ((grep r _) (tags r))
56 | ((alpha r _) (tags r))
57 | (_ '()))))
58 |
59 | (defun keep-used-assignments (new-re assignments)
60 | (loop with used = (used-tags new-re)
61 | for assignment in assignments
62 | for (target . nil) = assignment
63 | when (member target used :test #'equal)
64 | collect assignment))
65 |
66 | (defun remove-tags (re)
67 | (with-slot-consing (cached-removed-tags re)
68 | (trivia:match re
69 | ((tag-set _) (empty-string))
70 | ((either r s) (either (remove-tags r) (remove-tags s)))
71 | ((both r s) (both (remove-tags r) (remove-tags s)))
72 | ((join r s) (join (remove-tags r) (remove-tags s)))
73 | ((repeat r min max c) (repeat (remove-tags r) min max c))
74 | ((invert r) (invert (remove-tags r)))
75 | ((alpha r s)
76 | (either (remove-tags r)
77 | (if (eq s (empty-set))
78 | (empty-set)
79 | (empty-string))))
80 | ((grep r s) (grep (remove-tags r) (remove-tags s)))
81 | (_ re))))
82 |
83 | (defun has-tags-p (re)
84 | (with-slot-consing (cached-has-tags-p re)
85 | (trivia:match re
86 | ((tag-set _) t)
87 | ((or (either r s) (both r s) (join r s) (alpha r s) (grep r s))
88 | (or (has-tags-p r) (has-tags-p s)))
89 | ((or (invert r) (repeat r _ _ _))
90 | (has-tags-p r))
91 | (_ nil))))
92 |
93 | (defvar *allow-alpha* t)
94 | (defun map-tags (f re)
95 | ;; Return the same RE if we have no tags to replace.
96 | (unless (has-tags-p re)
97 | (return-from map-tags re))
98 | (trivia:match re
99 | ((tag-set set) (tag-set (funcall f set)))
100 | ((either r s) (either (map-tags f r) (map-tags f s)))
101 | ((both r s) (both (map-tags f r) (map-tags f s)))
102 | ((join r s) (join (map-tags f r) (map-tags f s)))
103 | ((invert r) (invert (map-tags f r)))
104 | ((repeat r min max c) (repeat (map-tags f r) min max c))
105 | ((alpha r old-tags)
106 | (unless (or *allow-alpha* (eq old-tags (empty-set)))
107 | (error "Can't modify tags with history"))
108 | (alpha (map-tags f r)
109 | (map-tags f old-tags)))
110 | ((grep r s)
111 | (grep (map-tags f r)
112 | (map-tags f s)))
113 | (_ re)))
114 |
115 | (defun unique-tags (re)
116 | (let ((*allow-alpha* nil))
117 | (map-tags #'unique-assignments re)))
118 |
--------------------------------------------------------------------------------
/Code/DFA-construction/type.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | (defvar *table-names* '())
4 |
5 | (defmacro define-hash-consing-table (name)
6 | `(progn
7 | (defvar ,name)
8 | (pushnew ',name *table-names*)
9 | ',name))
10 |
11 | (defconstant +uncomputed+ '+uncomputed+)
12 | (defclass regular-expression ()
13 | ((%nullable :initform +uncomputed+ :accessor cached-nullable)
14 | (%nullable-no-gensym :initform +uncomputed+ :accessor cached-nullable-no-gensym)
15 | (%used-tags :initform +uncomputed+ :accessor cached-used-tags)
16 | (%tags :initform +uncomputed+ :accessor cached-tags)
17 | (%removed-tags :initform +uncomputed+ :accessor cached-removed-tags)
18 | (%has-tags-p :initform +uncomputed+ :accessor cached-has-tags-p)))
19 |
20 | (defmacro define-types (&body types)
21 | (loop for (name . slots) in types
22 | collect (let ((variables (loop for slot in slots collect (gensym (symbol-name slot))))
23 | (internal-creator (alexandria:format-symbol t "%~a" name))
24 | (table-name (alexandria:format-symbol '#:one-more-re-nightmare
25 | "*~A-TABLE*" name)))
26 | `(progn
27 | (defclass ,name (regular-expression) ,slots)
28 | (trivia:defpattern ,name ,variables
29 | (alexandria:with-gensyms (instance-name)
30 | (list 'trivia:guard1 (list instance-name ':type ',name)
31 | (list 'typep instance-name '',name)
32 | ,@(loop for slot in slots
33 | for variable in variables
34 | appending `((list 'slot-value instance-name '',slot) ,variable)))))
35 | (define-hash-consing-table ,table-name)
36 | (defun ,internal-creator ,slots
37 | (or (gethash (list ,@slots) ,table-name)
38 | (let ((instance (make-instance ',name)))
39 | ,@(loop for slot in slots
40 | collect `(setf (slot-value instance ',slot) ,slot))
41 | (setf (gethash (list ,@slots) ,table-name)
42 | instance))))))
43 | into forms
44 | finally (return `(progn ,@forms))))
45 |
46 | (defmacro define-rewrites ((name &rest slots) &key simplify hash-cons printer)
47 | (let ((internal-creator (alexandria:format-symbol t "%~a" name))
48 | (table-name (alexandria:format-symbol '#:one-more-re-nightmare
49 | "*~A-TABLE*" name)))
50 | `(progn
51 | ,@(unless (null printer)
52 | `((defmethod print-object ((instance ,name) stream)
53 | (trivia:ematch instance
54 | ,printer))))
55 | (defun ,name ,slots
56 | (trivia:match (list ,@slots)
57 | ,@(loop for ((nil . pattern) replacement) in simplify
58 | collect `((list ,@pattern) ,replacement))
59 | ,@(loop for ((nil . pattern) (nil . replacement)) in hash-cons
60 | collect `((list ,@pattern)
61 | (or (gethash (list ,@replacement) ,table-name)
62 | (trivia.next:next))))
63 | (_ (,internal-creator ,@slots)))))))
64 | (indent:define-indentation define-type (4 &body))
65 |
66 | (defmacro with-hash-consing ((table key) &body body)
67 | (alexandria:once-only (table key)
68 | (alexandria:with-gensyms (value present?)
69 | `(multiple-value-bind (,value ,present?)
70 | (gethash ,key ,table)
71 | (if ,present?
72 | ,value
73 | (setf (gethash ,key ,table)
74 | (progn ,@body)))))))
75 |
76 | (defmacro with-slot-consing ((accessor object &key (when 't)) &body body)
77 | (alexandria:once-only (object)
78 | (alexandria:with-gensyms (value)
79 | `(let ((,value (,accessor ,object)))
80 | (flet ((compute-the-damn-value ()
81 | ,@body))
82 | (cond
83 | ((not ,when)
84 | (compute-the-damn-value))
85 | ((eq ,value +uncomputed+)
86 | (setf (,accessor ,object)
87 | (compute-the-damn-value)))
88 | (t
89 | ,value)))))))
90 |
91 | (defmacro with-hash-consing-tables (() &body body)
92 | `(let ,(loop for name in *table-names*
93 | collect `(,name (make-hash-table :test 'equal)))
94 | ,@body))
95 |
96 | (defmacro clear-global-tables ()
97 | "Set up global tables for testing."
98 | `(setf ,@(loop for name in *table-names*
99 | append `(,name (make-hash-table :test 'equal)))))
100 |
--------------------------------------------------------------------------------
/Code/SIMD/prefix-strategy.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | (defclass simd-info ()
4 | ((bits :initarg :bits :reader bits)))
5 |
6 | (defclass simd-prefix (simd-info strategy)
7 | ()
8 | (:documentation "Match a prefix of the string using SIMD operations before entering a DFA.
9 | A prefix P of some regular expression R is defined to be a sequence of literals such that P·S = R for some other suffix regular expression S."))
10 |
11 | (defmethod initial-states ((strategy simd-prefix) expression)
12 | (multiple-value-bind (prefix suffix)
13 | (prefix (add-tags expression))
14 | (declare (ignore prefix))
15 | (list (alpha (add-tags expression) (empty-set))
16 | (alpha suffix (empty-set)))))
17 |
18 | (defmethod start-code ((strategy simd-prefix) states)
19 | ;; Note that, by definition, having a prefix implies that the RE
20 | ;; can't produce only zero length matches. Thus we don't have to
21 | ;; worry about that case.
22 | (when (eql (minimum-length (first states)) :infinity)
23 | ;; This can't actually match anything, so return immediately.
24 | (return-from start-code `(start (return))))
25 | (multiple-value-bind (prefix suffix)
26 | (prefix (state-expression (first states)))
27 | (declare (ignore suffix))
28 | (multiple-value-bind (test loads assignments jump-length)
29 | (code-from-prefix prefix)
30 | `(start
31 | ;; Don't try to read over the END we were given.
32 | (when (>= (the fixnum (+ ,jump-length
33 | ,(/ one-more-re-nightmare.vector-primops:+v-length+ *bits*)
34 | start))
35 | end)
36 | (setf position start)
37 | (go ,(find-state-name (first states) :bounds-check)))
38 | ;; Now perform the SIMD test.
39 | (let* (,@loads
40 | (test-results (,(find-op "MOVEMASK") ,test)))
41 | (unless (zerop test-results)
42 | ;; Found a match!
43 | (setf position (+ start (one-more-re-nightmare.vector-primops:find-first-set test-results)))
44 | ;; We increment POSITION by 1 before assigning to act like
45 | ;; the actual DFA, and then do the rest of the "jump"
46 | ;; after assignments.
47 | (incf position)
48 | ,assignments
49 | (incf position ,(1- jump-length))
50 | ;; The same deal as in START-CODE for SCAN-EVERYTHING: we
51 | ;; "inline" succeeding states, so we might need to succeed
52 | ;; rather than go to another state.
53 | ,(let ((expression (state-expression (second states))))
54 | (if (re-empty-p expression)
55 | (let ((effects (effects expression)))
56 | `(progn
57 | ;; Surely there wouldn't be any new
58 | ;; assignments, as PREFIX would strip them
59 | ;; off.
60 | (let ((position (1+ position)))
61 | ,(setf-from-assignments effects))
62 | (setf start (max (1+ start)
63 | (1- ,(find-in-map 'end (state-exit-map (second states))))))
64 | (win ,@(win-locations effects))))
65 | `(go ,(find-state-name (second states) :bounds-check)))))
66 | ;; No match, so just bump and try again.
67 | (incf start ,(/ one-more-re-nightmare.vector-primops:+v-length+ *bits*))
68 | (go start))))))
69 |
70 | (defmethod make-prog-parts :around ((strategy simd-info) expression)
71 | (let ((*broadcasts* (make-hash-table))
72 | (*bits* (bits strategy)))
73 | (multiple-value-bind (variables declarations body)
74 | (call-next-method)
75 | (maphash (lambda (value name)
76 | (push (list name `(,(find-op "BROADCAST") ,value))
77 | variables))
78 | *broadcasts*)
79 | (values variables declarations body))))
80 |
81 | ;; Surely we don't need this macro. Come on.
82 | (defmethod macros-for-strategy append ((strategy simd-prefix))
83 | '((restart ()
84 | '(go start))))
85 |
86 | (defun make-default-strategy (layout expression)
87 | (let ((bits
88 | (alexandria:switch ((layout-array-type layout) :test 'equal)
89 | ('(simple-array character 1) 32)
90 | ('(simple-array base-char 1) 8)
91 | (otherwise nil))))
92 | (cond
93 | ((eql *code-type* :interpreted)
94 | ;; Interpreter doesn't recognise VOPs, so we can't use SIMD.
95 | ;; Not that you'd really want to.
96 | (make-instance (dynamic-mixins:mix 'scan-everything 'call-continuation)))
97 | ((and (> (count :literal (prefix expression) :key #'first) 1)
98 | (not (null bits)))
99 | (make-instance (dynamic-mixins:mix 'simd-loop 'simd-prefix 'call-continuation)
100 | :bits bits))
101 | ((not (null bits))
102 | (make-instance (dynamic-mixins:mix 'simd-loop 'scan-everything 'call-continuation)
103 | :bits bits))
104 | (t
105 | (make-instance (dynamic-mixins:mix 'scan-everything 'call-continuation))))))
106 |
--------------------------------------------------------------------------------
/Tests/tests.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare-tests)
2 |
3 | (parachute:define-test one-more-re-nightmare)
4 |
5 | (defmacro first-match (haystack &body body)
6 | `(progn
7 | ,@(loop for (re result-vector) on body by #'cddr
8 | collect `(parachute:is equalp
9 | ,result-vector
10 | (one-more-re-nightmare:first-match ,re ,haystack)))))
11 |
12 | (defmacro all-string-matches (haystack &body body)
13 | `(progn
14 | ,@(loop for (re result-vector) on body by #'cddr
15 | collect `(parachute:is equalp
16 | ,result-vector
17 | (one-more-re-nightmare:all-string-matches ,re ,haystack)))))
18 |
19 | (parachute:define-test easy-stuff
20 | :parent one-more-re-nightmare
21 | (first-match "Hello world"
22 | "Hello" #(0 5)
23 | "world" #(6 11)
24 | "[a-z]+" #(1 5)
25 | "(h|l)o" #(3 5))
26 | ;; The engine should finish the match at the right spot.
27 | ;; (The first OMRN compiler would not do this correctly.)
28 | (first-match "ababa"
29 | "ab" #(0 2)
30 | "(ab)+" #(0 4))
31 | ;; The engine should also restart just after the actual match.
32 | (all-string-matches "ababc"
33 | "ab" '(#("ab") #("ab"))
34 | "ab(c|)" '(#("ab") #("abc")))
35 | ;; Per HAKMEM item #176.
36 | (all-string-matches "banana"
37 | "ana" '(#("ana")))
38 | (all-string-matches "aaa"
39 | "a+" '(#("aaa"))
40 | ;; Make sure we match the empty string at the end of the haystack..
41 | ;; One program used at university would loop indefinitely, because
42 | ;; it would not advance past the zero-length match at the end.
43 | "a*" '(#("aaa") #(""))))
44 |
45 | (parachute:define-test annoying-submatches
46 | :parent one-more-re-nightmare
47 | ;; Per footnote #14 on page #12 of gilberth's paper.
48 | (first-match "aaaaa"
49 | "«a|aa»+" #(0 5 4 5))
50 | (first-match "aaaaaa"
51 | "«a|aa»+" #(0 6 4 6))
52 | (first-match "aaaaaaaa"
53 | "«a|aa»+" #(0 8 6 8))
54 | ;; Per
55 | (first-match "ab"
56 | ;; This involves clearing registers after each iteration of
57 | ;; repetition due to * or +.
58 | ;; We only should match here:
59 | ;; | |
60 | ;; V V
61 | "«««a*»|b»|b»+" #(0 2 1 2 1 2 nil nil)
62 | "«««a*»|b»|b»*" #(0 2 1 2 1 2 nil nil))
63 | ;; Per :
64 | ;; "For this purpose, a null string shall be considered to be longer
65 | ;; than no match at all."
66 | (first-match ""
67 | "«a*»*" #(0 0 0 0)))
68 |
69 | (parachute:define-test annoying-parsing
70 | :parent one-more-re-nightmare
71 | ;; Per
72 | (all-string-matches "/* abc */ /* def */"
73 | "/\\*¬($*\\*/$*)\\*/" '(#("/* abc */") #("/* def */"))))
74 |
75 | (parachute:define-test annoying-negation
76 | :parent one-more-re-nightmare
77 | ;; We used to erroneously rewrite ¬ε to ø, which is plain wrong.
78 | (all-string-matches "abcde"
79 | "¬a" '(#("abcde") #(""))
80 | "$+&¬a" '(#("abcde"))))
81 |
82 | (defun compiler-macroexpand-1 (form)
83 | (funcall (compiler-macro-function (first form))
84 | form
85 | nil))
86 | (defmacro compiler-macro-warns (form type reason)
87 | ;; I mean, it doesn't really _fail_ if it just signals a
88 | ;; warning. But okay.
89 | `(parachute:fail (compiler-macroexpand-1 ',form) ,type
90 | "~A" ,reason))
91 | (defmacro macro-warns (form type reason)
92 | `(parachute:fail (macroexpand-1 ',form) ,type
93 | "~A" ,reason))
94 |
95 | (deftype full-warning () '(and warning (not style-warning)))
96 |
97 | (parachute:define-test user-interface
98 | :parent one-more-re-nightmare
99 | (compiler-macro-warns
100 | (one-more-re-nightmare:all-matches "a|«a»" s)
101 | one-more-re-nightmare:not-matchable-style-warning
102 | "A style-warning should be generated for dead submatches.")
103 | (compiler-macro-warns
104 | (one-more-re-nightmare:all-matches "a&b" s)
105 | one-more-re-nightmare:not-matchable-style-warning
106 | "A style-warning should be generated for dead REs.")
107 | (compiler-macro-warns
108 | (one-more-re-nightmare:all-matches "" s)
109 | one-more-re-nightmare:matching-too-much-style-warning
110 | "A style-warning should be generated for REs that only match the empty string.")
111 | (compiler-macro-warns
112 | (one-more-re-nightmare:all-matches "a*" s)
113 | one-more-re-nightmare:matching-too-much-style-warning
114 | "A style-warning should be generated for REs that match the empty string.")
115 | (compiler-macro-warns
116 | (one-more-re-nightmare:all-matches "«a" s)
117 | full-warning
118 | "A full warning should be generated for invalid RE syntax.")
119 | (macro-warns
120 | (one-more-re-nightmare:do-matches ((start end s1 e1) "Hello" s)
121 | (print (list s1 e1)))
122 | full-warning
123 | "A full warning should be generated by trying to address registers that don't exist."))
124 |
125 | (parachute:define-test character-classes
126 | :parent one-more-re-nightmare
127 | ;; Per
128 | (all-string-matches "'HI EVERYBODY!!!!!!!!!!' 'try pressing the the Caps Lock key'"
129 | "[[:upper:]][[:lower:]]+" '(#("Caps") #("Lock"))
130 | "[[:punct:]]{3,}" '(#("!!!!!!!!!!'"))
131 | "[[:digit:]]" '()
132 | "[[:digit:]y]" '(#("y") #("y"))
133 | "[¬[:punct:]]+" '(#("HI EVERYBODY") #(" ") #("try pressing the the Caps Lock key"))))
134 |
135 | (defun run-tests ()
136 | (parachute:test 'one-more-re-nightmare))
137 |
--------------------------------------------------------------------------------
/Code/Interface/syntax.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | (esrap:defrule top-level
4 | (or two-expressions empty-string))
5 | (esrap:defrule two-expressions
6 | (and expression top-level)
7 | (:destructure (a b) (join a b)))
8 | (esrap:defrule expression
9 | (or either below-either))
10 | (esrap:defrule below-either
11 | (or both below-both))
12 | (esrap:defrule below-both
13 | (or join below-join))
14 | (esrap:defrule below-join
15 | (or invert
16 | kleene plus maybe repeated
17 | parens match-group character-range universal-set
18 | literal empty-string))
19 |
20 | (defvar *next-group*)
21 | (defun next-group ()
22 | (incf *next-group*))
23 | (defvar *group-strings*)
24 |
25 | (defun parse-regular-expression (string)
26 | (let ((*next-group* 0)
27 | (*group-strings* (make-hash-table)))
28 | (values (esrap:parse 'top-level string)
29 | *next-group*
30 | (coerce (cons string
31 | (loop for group from 1 to *next-group*
32 | for (s . e) = (gethash group *group-strings*)
33 | collect (subseq string s e)))
34 | 'vector))))
35 |
36 | ;;; Parens
37 | (esrap:defrule parens
38 | (and "(" expression ")")
39 | (:destructure (left expression right)
40 | (declare (ignore left right))
41 | expression))
42 |
43 | (esrap:defrule match-group
44 | (and "«" expression "»")
45 | (:around (esrap:&bounds start end)
46 | (let ((group-number (next-group)))
47 | (destructuring-bind (left expressions right)
48 | (esrap:call-transform)
49 | (declare (ignore left right))
50 | (setf (gethash group-number *group-strings*)
51 | (cons start end))
52 | (group expressions group-number)))))
53 |
54 | ;;; Binary operators
55 | (esrap:defrule either
56 | (and below-either "|" (or either below-either))
57 | (:destructure (e1 bar e2)
58 | (declare (ignore bar))
59 | (either e1 e2)))
60 |
61 | (esrap:defrule both
62 | (and below-both "&" (or both below-both))
63 | (:destructure (e1 bar e2)
64 | (declare (ignore bar))
65 | (both e1 e2)))
66 |
67 | (esrap:defrule join
68 | (and below-join (or join below-join))
69 | (:destructure (e1 e2) (join e1 e2)))
70 |
71 | ;;; Repeats
72 | (defun clear-registers (expression)
73 | (join (tag-set
74 | (loop for ((v nil) . nil) in (tags expression)
75 | collect (cons (list v (tag-gensym)) 'nil)))
76 | expression))
77 |
78 | (esrap:defrule kleene
79 | (and below-join "*")
80 | (:destructure (expression star)
81 | (declare (ignore star))
82 | (repeat (clear-registers expression) 0 nil t)))
83 |
84 | (esrap:defrule plus
85 | (and below-join "+")
86 | (:destructure (expression plus)
87 | (declare (ignore plus))
88 | (repeat (clear-registers expression) 1 nil t)))
89 |
90 | (esrap:defrule maybe
91 | (and below-join "?")
92 | (:destructure (expression q)
93 | (declare (ignore q))
94 | (repeat expression 0 1 t)))
95 |
96 | (esrap:defrule repetitions
97 | (and (esrap:? integer) "," (esrap:? integer))
98 | (:destructure (min comma max)
99 | (declare (ignore comma))
100 | (assert (or (null max) (null min) (> max min))
101 | (max min)
102 | "The maximum repetition count ~d cannot be less than the minimum count ~d."
103 | max min)
104 | (cons (or min 0) max)))
105 |
106 | (esrap:defrule repetition
107 | integer
108 | (:lambda (count) (cons count count)))
109 |
110 | (esrap:defrule repeated
111 | (and below-join "{" (or repetitions repetition) "}")
112 | (:destructure (e left counts right)
113 | (declare (ignore left right))
114 | (repeat e (car counts) (cdr counts) t)))
115 |
116 | (esrap:defrule invert
117 | (and (or "¬" "~") below-join)
118 | (:destructure (bar expression)
119 | (declare (ignore bar))
120 | (invert expression)))
121 |
122 | ;;; "Terminals"
123 | (esrap:defrule universal-set
124 | "$"
125 | (:constant (literal +universal-set+)))
126 |
127 | (esrap:defrule integer
128 | (+ (or "0" "1" "2" "3" "4" "5" "6" "7" "8" "9"))
129 | (:lambda (list)
130 | (parse-integer (format nil "~{~A~}" list))))
131 |
132 |
133 | (esrap:defrule escaped-character
134 | (and #\\ character)
135 | (:destructure (backslash char)
136 | (declare (ignore backslash))
137 | char))
138 |
139 | (esrap:defrule special-character
140 | (or "(" ")" "«" "»" "[" "]" "{" "}" "¬" "~" "|" "&" "*" "+" "?" "$"))
141 |
142 | (esrap:defrule literal
143 | (or escaped-character (not special-character))
144 | (:lambda (character) (literal (singleton-set (char-code character)))))
145 |
146 | (esrap:defrule empty-string
147 | ""
148 | (:constant (empty-string)))
149 |
150 | ;;; Character ranges
151 | (esrap:defrule character-range-escaped-constituent
152 | (and #\\ (or #\[ #\] #\- #\^ #\¬))
153 | (:destructure (backslash char)
154 | (declare (ignore backslash))
155 | (char char 0)))
156 |
157 | (esrap:defrule character-range-name-constituent (not (or ":" "[" "]")))
158 |
159 | (esrap:defrule character-range-named
160 | (and "[:" (+ character-range-name-constituent) ":]")
161 | (:destructure (open characters close)
162 | (declare (ignore open close))
163 | (named-range (coerce characters 'string))))
164 |
165 | (esrap:defrule character-range-character
166 | (or character-range-escaped-constituent (not (or "-" "]" "[" "\\"))))
167 |
168 | (esrap:defrule character-range-single
169 | character-range-character
170 | (:lambda (character)
171 | (singleton-set (char-code character))))
172 |
173 | (esrap:defrule character-range-range
174 | (and character-range-character "-" character-range-character)
175 | (:destructure (low dash high)
176 | (declare (ignore dash))
177 | (range (char-code low) (1+ (char-code high)))))
178 |
179 | (esrap:defrule character-range
180 | (and "["
181 | (esrap:? (or "^" "¬"))
182 | (* (or character-range-named
183 | character-range-range
184 | character-range-single))
185 | "]")
186 | (:destructure (left invert ranges right)
187 | (declare (ignore left right))
188 | (let ((sum (reduce #'csum-union ranges
189 | :initial-value +empty-set+)))
190 | (literal (if invert (csum-complement sum) sum)))))
191 |
--------------------------------------------------------------------------------
/Documentation/interface.scrbl:
--------------------------------------------------------------------------------
1 | #lang scribble/base
2 | @require["spec-macros.scrbl" "bibliography.rkt"]
3 |
4 | @title{Interface}
5 |
6 | @section{Syntax}
7 |
8 | We currently use our own syntax for regular expressions, since the
9 | POSIX syntax does not allow for expressing complements or
10 | intersections of regular expressions.
11 |
12 | @centered-block[
13 | @bnf[
14 | @rule["E"
15 | "E | E" "union"
16 | "E & E" "intersection"
17 | "E E" "concatenate"
18 | "¬E" "complement"
19 | "~E" "complement"
20 | "E*" "zero or more repeats"
21 | "E+" "one or more repeats"
22 | "E?" "zero or one repeats"
23 | "E{j,j}" "repeat"
24 | "«E»" "submatch"
25 | "(E)" "change precedence"
26 | "[r]" "character range"
27 | "[¬r]" "complement ranges"
28 | "[^r]" "complement ranges"
29 | "$" "every character"
30 | "c" "literal character"
31 | "" ""]
32 | @rule["r" "" "empty range"
33 | "cr" "single character"
34 | "c-cr" "character range"
35 | "[::]r" "character class"]
36 | @rule["j" "" "bound"
37 | "" "no bound"]
38 | @rule["c" "" ""]
39 | ]
40 | ]
41 |
42 | Rules higher on this list are "looser" than rules lower on the list.
43 | For example, the expression @cl{ab&cd|ef} is equivalent to
44 | @cl{((ab)&(cd))|(ef)}.
45 |
46 | @section{Semantics}
47 |
48 | Matches and submatches are performed using the semantics for
49 | POSIX regular expressions @~cite[posix], with additional rules pertaining
50 | to intersection and complements.
51 |
52 | Submatches on both sides of an intersection operator are matched. For
53 | example,
54 |
55 | @lisp-code{
56 | (first-string-match "«a»b&a«b»" "ab") ; => #("ab" "a" "b")
57 | }
58 |
59 | No submatches in a complement can ever be matched, including in the
60 | complement of a complement. Thus @cl{¬¬A} is not the same as @cl{A}:
61 |
62 | @lisp-code{
63 | (first-match "¬¬«a»" "a") ; => #(0 1 NIL NIL)
64 | (first-match "«a»" "a") ; => #(0 1 0 1)
65 | }
66 |
67 | @section{Matching}
68 |
69 | Note that one-more-re-nightmare can avoid a cache lookup (involving
70 | acquiring a lock and hash table searching) if the regular expression
71 | is a literal string, or a constant variable bound to a string.
72 |
73 | @definitions{
74 | @defun["first-match"]{regular-expression string @param{@&key start end}}
75 |
76 | @defun["first-string-match"]{regular-expression string @param{@&key start end}}
77 |
78 | Find the first match for @cl{regular-expression} in @cl{string}
79 | between @cl{start} and @cl{end}.
80 |
81 | @cl{first-match} returns a simple vector, where each element is a
82 | @concept{register}, or @cl{nil} when there is no match. The first two
83 | registers are always the start and end of the match, and then
84 | subsequent registers are the start and end of each submatch. A
85 | register is either a bounding index of @cl{string} or @cl{nil} when
86 | there is no submatch.
87 |
88 | @cl{first-string-match} either returns a simple vector, every element
89 | of which is a fresh string or @cl{nil} (when there is no submatch), or
90 | @cl{nil} if there is no match.
91 |
92 | @definition-section["Examples"]{
93 | @lisp-code{
94 | (first-match "[0-9]([0-9]| )+" "Phone: 632 3003")
95 | ;; => #(6 15)
96 | (first-string-match "[0-9]([0-9]| )+" "Phone: 632 3003")
97 | ;; => "632 3003"
98 |
99 | (first-match
100 | "«[0-9]+»x«[0-9]+»|«[0-9]+»p"
101 | "Foobar 1920x1080 17-inch display")
102 | ;; => #(7 16 7 11 12 16 NIL NIL)
103 | (first-string-match
104 | "«[0-9]+»x«[0-9]+»|«[0-9]+»p"
105 | "Foobar 1920x1080 17-inch display")
106 | ;; => #("1920x1080" "1920" "1080" NIL)
107 | }
108 | }
109 | }
110 |
111 | @definitions{
112 | @defun["all-matches"]{regular-expression string @param{@&key start end}}
113 |
114 | @defun["all-string-matches"]{regular-expression string @param{@&key start end}}
115 |
116 | Find all matches for @cl{regular-expression} in @cl{string} between
117 | @cl{start} and @cl{end}.
118 |
119 | Both functions return a list of matches; @cl{all-matches} represents
120 | matches as @cl{first-match} does, and @cl{all-string-matches}
121 | represents matches as @cl{first-string-match} does.
122 |
123 | @definition-section["Examples"]{
124 | @lisp-code{
125 | (all-matches
126 | "«[0-9]+»x«[0-9]+»|«[0-9]+»p"
127 | "Foobar 1920x1080 17-inch display or Quux 19-inch 720p display?")
128 | ;; => (#(7 16 7 11 12 16 NIL NIL) #(49 53 NIL NIL NIL NIL 49 52))
129 | (all-string-matches
130 | "«[0-9]+»x«[0-9]+»|«[0-9]+»p"
131 | "Foobar 1920x1080 17-inch display or Quux 19-inch 720p display?")
132 | ;; => (#("1920x1080" "1920" "1080" NIL) #("720p" NIL NIL "720"))
133 | }
134 | }
135 | }
136 |
137 | @definitions{
138 | @defmacro["do-matches"]{((@param{@&rest registers}) regular-expression
139 | string @param{@&key start end})
140 | @param{@&body body}}
141 |
142 | @cl{do-matches} iterates over all matches for @cl{regular-expression}
143 | across @cl{string}. The @cl{registers} variables are bound to the
144 | @term{registers} produced, as described for @cl{first-match}.
145 |
146 | It is possible to provide fewer variables than registers in the
147 | regular expression, but an error will be signalled if there are more
148 | variables than registers.
149 | }
150 |
151 | @section{Compiling}
152 |
153 | The compiler may be run manually, when the regular expression is not
154 | known at compile time, and the code cache takes too long to search.
155 | (The latter can happen if many threads are accessing the code cache,
156 | and the time taken searching is sufficiently short, as lookups grab a
157 | global lock currently.)
158 |
159 | @definitions{
160 | @defclass["compiled-regular-expression"]
161 |
162 | An object representing a compiled regular expression. An instance of
163 | this class can be provided as a regular expression to all the searching
164 | functions, instead of a string.
165 | }
166 |
167 | @definitions{
168 | @defun["compile-regular-expression"]{expression}
169 |
170 | Compile a regular expression, returning an instance of
171 | @cl{compiled-regular-expression}.
172 |
173 | An error is signalled if the expression has invalid syntax.
174 | }
175 |
176 | @definitions{
177 | @defvar["*state-limit*"]{1000}
178 |
179 | The maximum number of states which the compiler will generate.
180 |
181 | @define-condition["exceeded-state-limit"]{error}
182 |
183 | This error type is signalled when the compiler exceeds the limit on
184 | the number of states set by @cl{*state-limit*}.
185 | }
186 |
--------------------------------------------------------------------------------
/Documentation/spec-macros.scrbl:
--------------------------------------------------------------------------------
1 | #lang scribble/base
2 |
3 | @(require scribble/core
4 | scribble/html-properties
5 | scribble/latex-properties
6 | racket/list)
7 | @(provide definition-section definitions
8 | defthing defun defmacro defgeneric defmethod
9 | defaccessor defreader definitarg
10 | defclass defprotoclass defvar
11 | define-condition
12 | cl lisp-code
13 | param &keyword &key &optional &rest &allow-other-keys &body
14 | term concept
15 | bnf rule tag-rule
16 | todo note
17 | image/width
18 | code-template var
19 | landscape centered-block)
20 |
21 | @(define spec-macro-additions
22 | (list (make-tex-addition "spec-macros.tex")))
23 | @(define block-thing-additions
24 | (cons (alt-tag "div") spec-macro-additions))
25 |
26 | @(define float-right-style
27 | (make-style "FloatRight" spec-macro-additions))
28 | @(define sc-style
29 | (make-style "SmallCaps" spec-macro-additions))
30 | @(define definitions-style
31 | (make-style "Definitions" block-thing-additions))
32 | @(define indent-style
33 | (make-style "Indent" spec-macro-additions))
34 | @(define landscape-style
35 | (make-style "Landscape" block-thing-additions))
36 | @(define centered-block-style
37 | (make-style "CenteredBlock" block-thing-additions))
38 | @(define centered-container-style
39 | (make-style "CenteredContainer" block-thing-additions))
40 | @(define param-style
41 | (make-style "Param" spec-macro-additions))
42 | @(define defun-name-style
43 | (make-style "DefunName" (cons (alt-tag "b") spec-macro-additions)))
44 | @(define lambda-list-style
45 | (make-style "LambdaList" (cons (alt-tag "i") spec-macro-additions)))
46 |
47 | @(define @float-right[thing]
48 | (elem thing #:style float-right-style))
49 | @(define @indent[thing]
50 | (nested thing #:style 'inset))
51 | @(define @sc[thing]
52 | (elem thing #:style sc-style))
53 | @(define (landscape . stuff)
54 | (nested stuff #:style landscape-style))
55 | @(define (centered-block . stuff)
56 | (nested (nested stuff #:style centered-block-style)
57 | #:style centered-container-style))
58 |
59 | @(define @defthing[name more type #:index? [index? #t]]
60 | (let ([base (list (elem name #:style defun-name-style)
61 | " " (elem (list more (float-right (italic type)))
62 | #:style lambda-list-style))])
63 | (if index?
64 | (index* (list name) (list (list (string-append name " " type)))
65 | base)
66 | base)))
67 |
68 | @(define (definition-section name . things)
69 | (list (linebreak)
70 | (sc name)
71 | (linebreak)
72 | things))
73 |
74 | @(define (definitions . stuff)
75 | (nested #:style definitions-style
76 | stuff))
77 |
78 | @(define (def-function-thing name arguments type #:index? [index? #t])
79 | (list (defthing name (list " " (italic arguments)) type #:index? index?)
80 | (linebreak)))
81 |
82 | @(define (defun name . arguments)
83 | (def-function-thing name arguments "Function"))
84 | @(define (defmacro name . arguments)
85 | (def-function-thing name arguments "Macro"))
86 | @(define (defgeneric name . arguments)
87 | (def-function-thing name arguments "Generic Function"))
88 | @(define (defmethod name #:qualifier [q ""] . arguments)
89 | (def-function-thing name arguments
90 | (if (string=? q "")
91 | "Method"
92 | (string-append q " Method"))
93 | #:index? #f))
94 | @(define (defaccessor name . arguments)
95 | (def-function-thing name arguments "Accessor"))
96 | @(define (defreader name . arguments)
97 | (def-function-thing name arguments "Reader"))
98 | @(define (definitarg name)
99 | (defthing name "" "Initarg"))
100 |
101 | @(define (interleave v l)
102 | (reverse
103 | (for/fold ([a '()])
104 | ([x (in-list l)])
105 | (list* x v a))))
106 |
107 | @(define (superclass-part superclasses)
108 | (if (or (equal? superclasses "")
109 | (equal? superclasses '())
110 | (equal? superclasses '("")))
111 | ""
112 | (interleave " " (cons "<:" superclasses))))
113 |
114 | @(define (defclass name . superclasses)
115 | (defthing name (superclass-part superclasses) "Class"))
116 |
117 | @(define (define-condition name . supertypes)
118 | (def-function-thing name (superclass-part supertypes) "Condition Type"))
119 |
120 | @(define (defprotoclass name)
121 | (defthing name "" "Protocol Class"))
122 |
123 | @(define (defvar name initial-value)
124 | (defthing name (list "initially " initial-value) "Variable"))
125 |
126 | @(define (&keyword name)
127 | (elem #:style 'tt "&" name))
128 |
129 | @(define (param . stuff) (elem stuff #:style param-style))
130 | @(define &key @&keyword{key})
131 | @(define &rest @&keyword{rest})
132 | @(define &body @&keyword{body})
133 | @(define &optional @&keyword{optional})
134 | @(define &allow-other-keys @&keyword{allow-other-keys})
135 | @(define cl tt)
136 | @(define (lisp-code . stuff)
137 | (nested #:style 'code-inset
138 | (apply verbatim stuff)))
139 |
140 | @(define term italic)
141 | @(define (concept name)
142 | (index name (italic name)))
143 |
144 | @(define (groups x n)
145 | (if (> (length x) n)
146 | (cons (take x n) (groups (drop x n) n))
147 | (list x)))
148 |
149 | @(define (bnf . rules)
150 | (tabular #:sep (hspace 1)
151 | #:column-properties '(right center left right)
152 | #:style (style #f (list (attributes '((style . "width: 100%")))))
153 | (apply append rules)))
154 | @(define (rule name . generates)
155 | (for/list ([rhs (in-list (groups generates 2))]
156 | [n (in-naturals)])
157 | (if (zero? n)
158 | (list* name "::=" rhs)
159 | (list* "" "|" rhs))))
160 | @(define (tag-rule lhs tag rhs)
161 | (rule lhs (list (tt tag) " " rhs)))
162 |
163 | @(define (todo . stuff)
164 | (list (bold "TODO: ") stuff))
165 | @(define (note . stuff)
166 | (list (bold "NOTE: ") stuff))
167 |
168 | @(define (image/width width pathname #:scale [scale 0.5])
169 | (image #:style (style #f
170 | (list
171 | (attributes
172 | `((style . ,(string-append "height: auto; width: " width))))
173 | (command-optional (list (string-append "width=" width)))))
174 | #:scale scale
175 | pathname))
176 |
177 | @(define-syntax code-template
178 | (syntax-rules [var]
179 | [(code-template) '()]
180 | [(code-template (var x) rest ...)
181 | (cons x (code-template rest ...))]
182 | [(code-template x rest ...)
183 | (cons (tt x) (code-template rest ...))]))
184 | @(define (var _) (error "No var outside code-template"))
185 |
--------------------------------------------------------------------------------
/Documentation/linting.scrbl:
--------------------------------------------------------------------------------
1 | #lang scribble/base
2 | @require["spec-macros.scrbl" scribble-math/dollar]
3 |
4 | @title{Linting and warnings}
5 |
6 | one-more-re-nightmare can produce warnings at compile-time for various
7 | mistakes when writing regular expressions. The compiler produces a
8 | finite state machine, which involves traversing every execution path
9 | of the machine, so it can perform analysis with no false positives or
10 | negatives.
11 |
12 | @definitions{
13 | @define-condition["lint-style-warning" "style-warning"]
14 | }
15 |
16 | Linting occurs when regular expressions are provided as literal
17 | strings in the source code. All @term{style warnings} generated are of the
18 | type @cl{lint-style-warning}.
19 |
20 | @section{Unreachability}
21 |
22 | @definitions{
23 | @define-condition["not-matchable-style-warning" "lint-style-warning"]
24 | }
25 |
26 | The following issues generate @term{style warnings} at
27 | compile-time, of type @cl{not-matchable-style-warning}. They do not
28 | indicate that something will go wrong at run-time, but their behaviour
29 | is rarely desirable.
30 |
31 | @subsection{"This expression is impossible to match."}
32 |
33 | @definition-section["Explanation"]{
34 |
35 | The expression will never match any expressions; it is equivalent to
36 | the empty set.
37 |
38 | }
39 |
40 | @definition-section["Examples"]{
41 |
42 | @cl{a&b}: There are no characters that are simultaneously @cl{a} and @cl{b}.
43 |
44 | @cl{¬(a$|$b)&¬(¬(a$)&¬($b))}: There are no strings that match
45 | @cl{¬(a$|$b)} but not @cl{¬(a$)&¬($b)}. In other words, the linter is
46 | used to prove @${\overline{a \lor b} \Rightarrow \overline a \land
47 | \overline b}. While it is a fun idea, we don't recommend using the
48 | linter to check equivalence of Boolean expressions.
49 |
50 | }
51 |
52 | @subsection{"The group in this expression is impossible to match."}
53 |
54 | @definition-section["Explanation"]{
55 |
56 | A submatch in the expression will never match any expressions; it may
57 | either correspond to the empty set, or is "shadowed" by an alternate
58 | expression.
59 |
60 | }
61 |
62 | @definition-section["Examples"]{
63 |
64 | @cl{a|«a»} generates the warning "The first group in this expression
65 | is impossible to match.". The only string that the expression can
66 | match is @cl{a}, and the left-hand side of the @cl{|} operator takes
67 | precedence with POSIX semantics, so the right hand side can never match.
68 |
69 | @cl{a|«b&c»} generates the same warning. There are no characters that
70 | are simultaneously @cl{b} and @cl{c}.
71 |
72 | }
73 |
74 | @section{Matching too much}
75 |
76 | @definitions{
77 | @define-condition["matching-too-much-style-warning" "lint-style-warning"]
78 | }
79 |
80 | Some regular expressions may match at every position, which is usually
81 | a sign of a mistake, as one usually wants to extract something from a
82 | string, and not everything. The following issues generate
83 | @term{style warnings} at compile-time, of type
84 | @cl{matching-too-much-style-warning}.
85 |
86 | @subsection{"This expression matches the empty string at every position."}
87 |
88 | @definition-section["Explanation"]{
89 |
90 | The expression will match at every position, and most matches will
91 | have zero length. Often some @cl{*} repetition needs to be replaced with
92 | some @cl{+} repetition, to ensure matches contain at least one character.
93 |
94 | }
95 |
96 | @definition-section["Examples"]{
97 |
98 | The following code will produce too many matches:
99 |
100 | @lisp-code{
101 | (defun numbers (string)
102 | (one-more-re-nightmare:all-string-matches "[0-9]*" string))
103 | (numbers "Phone: 6323003")
104 | ;; => (#("") #("") #("") #("") #("") #("") #("") #("6323003") #(""))
105 | }
106 |
107 | one-more-re-nightmare generates this warning when the @cl{numbers}
108 | function is submitted. One solution is to replace the @cl{*} repetition
109 | with @cl{+} repetition.
110 |
111 | @lisp-code{
112 | (defun numbers (string)
113 | (one-more-re-nightmare:all-string-matches "[0-9]+" string))
114 | (numbers "Phone: 6323003")
115 | ;; => (#("6323003"))
116 | }
117 |
118 | }
119 |
120 | @subsection{"This expression will only ever match the empty string at
121 | every position."}
122 |
123 | @definition-section["Explanation"]{
124 |
125 | The expression will only match at every position, and all matches will
126 | have zero length.
127 |
128 | }
129 |
130 | @definition-section["Examples"]{
131 |
132 | Using the empty string as a regular expression generates this warning.
133 | Other regular expressions which are not just the empty string can still
134 | generate this warning; @cl{|b&c} will generate this warning, as the
135 | regular expression still can only match the empty string.
136 |
137 | }
138 |
139 | @section{Syntax errors}
140 |
141 | Syntax errors can also be caught at compile-time, signalling full
142 | warnings, as function with invalid syntax will always fail at
143 | run-time.
144 |
145 | @definition-section["Examples"]{
146 |
147 | @cl{(} generates a parsing error. The open-parenthesis should be
148 | matched with a closing @cl{)}.
149 |
150 | }
151 |
152 | @section{Type errors}
153 |
154 | Type errors can be caught at compile-time, signalling full warnings,
155 | as functions with type errors will always fail at run-time.
156 |
157 | @subsection{"This regular expression only produces registers, but
158 | variables were provided."}
159 |
160 | @definition-section["Explanation"]{
161 |
162 | Too many register variables were provided for the regular expression
163 | provided to @cl{do-matches}.
164 |
165 | }
166 |
167 | @definition-section["Examples"]{
168 |
169 | @cl{(one-more-re-nightmare:do-matches ((start end s1 e1) "abcde" x)
170 | (print (list s1 e1)))} generates the warning "This regular expression
171 | only produces two registers, but four variables were provided." There
172 | are no submatches in @cl{abcde}, but the @cl{do-matches} form was
173 | provided the variable names @cl{s1} and @cl{e1} for a submatch.
174 |
175 | }
176 |
177 | @subsection{SBCL reports a type conflict}
178 |
179 | @definition-section["Explanation"]{
180 |
181 | one-more-re-nightmare provides specific types to SBCL for regular
182 | expressions provided as string literals. The SBCL compiler can use
183 | these types to detect errors in code that uses the results produced by
184 | one-more-re-nightmare.
185 |
186 | Specifically, one-more-re-nightmare provides the return type
187 | @code-template{(or null (simple-vector @var{@${2(n+1)}}))} for a call
188 | to @cl{first-match} with a regular expression with @${n} submatches.
189 | one-more-re-nightmare provides the type @cl{alexandria:array-index}
190 | for the first two register variables, and the type @cl{(or null
191 | alexandria:array-index)} for the remaining variables for @cl{do-matches}.
192 |
193 | }
194 |
195 | @definition-section["Examples"]{
196 |
197 | @cl{(svref (first-match "abc" "abc") 2)} generates the warning
198 | "Derived type (INTEGER 2 2) is not a suitable index for (SIMPLE-VECTOR
199 | 2)."
200 |
201 | @cl{(do-matches ((s) "ab|ac" "ab") (print (symbol-name s)))} generates
202 | the warning "Derived type of ... is (VALUES (MOD ...) &OPTIONAL)
203 | conflicting with its asserted type SYMBOL." The variable @cl{s} will
204 | always be bound to an index, and never @cl{nil}, because the first two
205 | registers designate the bounds of the entire match.
206 |
207 | }
208 |
--------------------------------------------------------------------------------
/Code/DFA-construction/make-dfa.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | (defstruct transition
4 | class
5 | next-state
6 | tags-to-set)
7 |
8 | (defclass state ()
9 | ((exit-map :initarg :exit-map :accessor state-exit-map)
10 | (exit-effects :initarg :exit-effects :accessor state-exit-effects)
11 | (expression :initarg :expression :accessor state-expression)
12 | (transitions :initform '() :accessor state-transitions)))
13 |
14 | (defmethod print-object ((state state) stream)
15 | (print-unreadable-object (state stream :type t)
16 | (prin1 (state-expression state) stream)))
17 |
18 | (defun find-similar-state (states state)
19 | "Find another state which we can re-use with some transformation, returning that state and the required transformation."
20 | (flet ((win (other-state substitutions)
21 | (return-from find-similar-state
22 | (values other-state
23 | (loop with used = (used-tags (state-expression other-state))
24 | for ((v1 r1) . (v2 r2))
25 | in (alexandria:hash-table-alist substitutions)
26 | when (member (list v2 r2) used :test #'equal)
27 | collect (cons (list v2 r2) (list v1 r1)))))))
28 | (loop with expression = (state-expression state)
29 | for other-state in states
30 | for other-expression = (state-expression other-state)
31 | for substitutions = (similar expression other-expression)
32 | for used = (used-tags other-expression)
33 | unless (null substitutions)
34 | do (win other-state substitutions))))
35 |
36 | (defun add-transition (class last-state next-state tags-to-set)
37 | (let* ((old-transitions (state-transitions last-state))
38 | (same-transition
39 | (find-if (lambda (transition)
40 | (and
41 | (equal tags-to-set (transition-tags-to-set transition))
42 | (eq next-state (transition-next-state transition))))
43 | old-transitions)))
44 | (cond
45 | ((null same-transition)
46 | (push (make-transition
47 | :class class
48 | :next-state next-state
49 | :tags-to-set tags-to-set)
50 | (state-transitions last-state)))
51 | (t
52 | (setf (transition-class same-transition)
53 | (csum-union (transition-class same-transition)
54 | class))))))
55 |
56 | (trivia:defun-match re-stopped-p (re)
57 | ((alpha (empty-set) _) t)
58 | ((empty-set) t)
59 | (_ nil))
60 |
61 | (defun peephole-optimize (assignments used-tags)
62 | (let ((result '())
63 | (remaining-assignments assignments))
64 | (flet ((substitute-variable (variable replica source)
65 | (setf remaining-assignments
66 | (loop for ((v r) . s) in remaining-assignments
67 | ;; Rewrite {A <- B} C <- A to C <- B
68 | if (equal s (list variable replica))
69 | collect (cons (list v r) source)
70 | else
71 | collect (cons (list v r) s)))))
72 | (loop until (null remaining-assignments)
73 | do (destructuring-bind ((variable replica) . source)
74 | (pop remaining-assignments)
75 | (if (member (list variable replica) used-tags :test #'equal)
76 | (push (cons (list variable replica) source) result)
77 | (substitute-variable variable replica source))))
78 | (reverse result))))
79 |
80 | (defvar *state-limit* 1000)
81 | (define-condition exceeded-state-limit (error)
82 | ()
83 | (:report "Made too many states - either your regular expression is too complicated, or one-more-re-nightmare is broken.
84 | (Either way, you're not going to get this compiled any time soon.)"))
85 |
86 | (defun make-dfa-from-expressions (expressions)
87 | (let ((states (make-hash-table))
88 | (possibly-similar-states (make-hash-table))
89 | (work-list expressions))
90 | (flet ((find-state (expression)
91 | (multiple-value-bind (state present?)
92 | (gethash expression states)
93 | (if present?
94 | (values state nil)
95 | (values (setf (gethash expression states)
96 | (make-instance 'state
97 | :expression expression))
98 | t)))))
99 | (loop
100 | (when (null work-list) (return))
101 | (when (> (hash-table-count states) *state-limit*)
102 | (error 'exceeded-state-limit))
103 | (let* ((expression (pop work-list))
104 | (state (find-state expression)))
105 | (cond
106 | ((or (re-stopped-p expression) (re-empty-p expression))
107 | nil)
108 | (t
109 | (let ((classes (derivative-classes expression)))
110 | (dolist (class classes)
111 | (unless (csum-null-p class)
112 | (let* ((next-expression (derivative expression class))
113 | (tags-to-set (keep-used-assignments
114 | next-expression
115 | (effects expression))))
116 | (multiple-value-bind (next-state new?)
117 | (find-state next-expression)
118 | (multiple-value-bind (other-state transformation)
119 | (find-similar-state
120 | (cons state
121 | (gethash (remove-tags next-expression) possibly-similar-states '()))
122 | next-state)
123 | (cond
124 | ((null other-state)
125 | ;; No state to reuse, so check if we need to process the next state.
126 | (when new?
127 | (pushnew next-expression work-list)))
128 | (t
129 | ;; Reuse this state.
130 | (when new?
131 | (remhash (state-expression next-state) states))
132 | (setf tags-to-set (peephole-optimize
133 | (append tags-to-set transformation)
134 | (used-tags (state-expression other-state)))
135 | next-state other-state)))
136 | (add-transition class
137 | state next-state
138 | tags-to-set)))))))))
139 | (push state (gethash (remove-tags expression)
140 | possibly-similar-states))
141 | (setf (state-exit-map state)
142 | (tags (nullable expression))
143 | (state-exit-effects state)
144 | (keep-used-assignments
145 | (nullable expression)
146 | (effects expression))))))
147 | states))
148 |
149 | (defun make-dfa-from-expression (expression)
150 | (make-dfa-from-expressions (list expression)))
151 |
--------------------------------------------------------------------------------
/Code/DFA-construction/re-types.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | (define-types
4 | (literal set)
5 | (empty-string)
6 | (repeat r min max can-empty)
7 | (tag-set substitutions)
8 | (alpha expression history)
9 | (grep vector prototype)
10 | (either r s)
11 | (both r s)
12 | (invert r)
13 | (join r s))
14 |
15 | (define-rewrites (literal set)
16 | :printer ((literal set)
17 | (print-csum set stream)))
18 |
19 | (defun kleene (r)
20 | (repeat r 0 nil nil))
21 | (trivia:defpattern kleene (r)
22 | `(repeat ,r 0 nil nil))
23 |
24 | (defun empty-set () (literal +empty-set+))
25 | (trivia:defpattern empty-set ()
26 | (alexandria:with-gensyms (set)
27 | `(trivia:guard (literal ,set)
28 | (csum-null-p ,set))))
29 |
30 | (defun universal-set ()
31 | (repeat (literal +universal-set+) 0 nil nil))
32 | (trivia:defpattern universal-set ()
33 | `(kleene (literal ',+universal-set+)))
34 |
35 | (defvar *subscripts* "₀₁₂₃₄₅₆₇₈₉")
36 | (defun subscripts (number)
37 | (map 'string
38 | (lambda (char)
39 | (aref *subscripts* (digit-char-p char)))
40 | (princ-to-string number)))
41 |
42 | (define-rewrites (empty-string)
43 | :printer (_ (write-string "ε" stream)))
44 |
45 | (define-rewrites (repeat r min max can-empty)
46 | :simplify (((repeat _ _ 0 _) (empty-string))
47 | ((repeat r 1 1 _) r)
48 | ((repeat (empty-set) 0 nil _) (empty-string))
49 | ((repeat (repeat r 0 nil _) 0 nil c)
50 | (repeat r 0 nil c)))
51 | :printer ((repeat r min max can-empty)
52 | (format stream "[~a]{~a~a,~a}"
53 | r (if can-empty "^" "")
54 | min
55 | (or max ""))))
56 |
57 | (define-rewrites (tag-set substitutions)
58 | :simplify (((tag-set (list))
59 | (empty-string)))
60 | :printer ((tag-set s)
61 | (format stream "{~{~a ← ~a~^, ~}}"
62 | (loop for ((variable replica) . source) in s
63 | if (zerop replica)
64 | collect variable
65 | else
66 | collect (format nil "~a~a"
67 | variable
68 | (subscripts replica))
69 | collect (case source
70 | ((position) "P")
71 | ((nil) "NIL")
72 | (otherwise
73 | (format nil "~a~a"
74 | (first source)
75 | (subscripts (second source)))))))))
76 | (define-rewrites (alpha expression history)
77 | :simplify (((alpha (empty-set) (empty-set)) (empty-set)))
78 | :printer ((alpha r n)
79 | (format stream "α[~@<~a, ~_~a]~:>" r n)))
80 | (define-rewrites (grep match-vector prototype)
81 | :simplify (((grep r _)
82 | (if (eq (nullable r) (empty-set))
83 | (trivia.next:next)
84 | r))
85 | ((grep (empty-set) _)
86 | (empty-set)))
87 | :printer ((grep r _)
88 | (format stream "γ[~a]" r)))
89 |
90 | (define-rewrites (either r s)
91 | :simplify (((either (either p r) s)
92 | (either p (either r s)))
93 | ((either (empty-set) r) r)
94 | ((either r (empty-set)) r)
95 | ((either (literal s1) (literal s2))
96 | (literal (csum-union s1 s2)))
97 | ((either r (universal-set))
98 | (if (has-tags-p r)
99 | (trivia.next:next) ; Preserve tags then
100 | (universal-set)))
101 | ((either (join (literal s1) p)
102 | (join (literal s2) r))
103 | ;; Try to expose more prefixes.
104 | (if (equal s1 s2)
105 | (join (literal s1) (either p r))
106 | (trivia.next:next)))
107 | ((either r s)
108 | (scan-either-for-duplicates r s)))
109 | :printer ((either r s)
110 | (format stream "~@<(~a) ∪ ~_(~a)~:>" r s)))
111 |
112 | (defun scan-either-for-duplicates (r s)
113 | (labels ((e (r s)
114 | ;; We can't call EITHER, since EITHER calls us, so we
115 | ;; handle simplifying out empty sets here.
116 | (cond
117 | ((eq s (empty-set)) r)
118 | ((eq r (empty-set)) s)
119 | (t (%either r s))))
120 | (scan (rhs)
121 | (trivia:match rhs
122 | ((either lhs next-rhs)
123 | (if (eq (remove-tags lhs) (remove-tags r))
124 | next-rhs
125 | (e lhs (scan next-rhs))))
126 | (_
127 | (if (eq (remove-tags rhs) (remove-tags r))
128 | (empty-set)
129 | rhs)))))
130 | (e r (scan s))))
131 |
132 | (define-rewrites (both r s)
133 | :simplify (((both r s)
134 | (if (eq r s)
135 | r
136 | (trivia.next:next)))
137 | ((both _ (empty-set)) (empty-set))
138 | ((both (empty-set) _) (empty-set))
139 | ((both (tag-set s1) (tag-set s2))
140 | (tag-set (merge-tag-sets s1 s2)))
141 | ((both (tag-set s) (empty-string))
142 | (tag-set s))
143 | ((both (empty-string) (tag-set s))
144 | (tag-set s))
145 | ((both (literal s1) (literal s2))
146 | (literal (csum-intersection s1 s2))))
147 | :printer ((both r s)
148 | (format stream "(~a) ∩ (~a)" r s)))
149 | (define-rewrites (invert r)
150 | :simplify (((invert (invert r)) r)
151 | ((invert s)
152 | (if (has-tags-p s)
153 | (invert (remove-tags s))
154 | (trivia.next:next)))
155 | ((invert (universal-set)) (empty-set)))
156 | :printer ((invert r)
157 | (format stream "¬[~a]" r)))
158 | (define-rewrites (join r s)
159 | :simplify (((join (tag-set s) (either p r))
160 | (either (join (tag-set s) p)
161 | (join (tag-set s) r)))
162 | ((join (empty-set) _) (empty-set))
163 | ((join _ (empty-set)) (empty-set))
164 | ((join (empty-string) r) r)
165 | ((join r (empty-string)) r)
166 | ((join (join (tag-set s1) r) s)
167 | ;; Rotate so that a TAG-SET is always at the start.
168 | (join (tag-set s1) (join r s)))
169 | ((join (tag-set s1) (join (tag-set s2) r))
170 | (join (tag-set (merge-tag-sets s1 s2)) r))
171 | ;; Avoid A*A*
172 | ((join (kleene a) (join (kleene b) c))
173 | (if (eq a b)
174 | (join (kleene a) c)
175 | (trivia.next:next)))
176 | ((join (kleene a) (kleene b))
177 | (if (eq a b)
178 | (kleene a)
179 | (trivia.next:next)))
180 | ((join (tag-set s1) (tag-set s2))
181 | (tag-set (merge-tag-sets s1 s2))))
182 | :printer ((join r s)
183 | (format stream "~a~a" r s)))
184 |
185 | (defun text (vector)
186 | (reduce #'join (map 'vector
187 | (lambda (e)
188 | (literal (singleton-set (char-code e))))
189 | vector)
190 | :initial-value (empty-string)
191 | :from-end t))
192 |
193 | (defun group (r n)
194 | (join (tag-set `(((,(1- (* 2 n)) 0) . position)))
195 | (join r
196 | (tag-set `(((,(* 2 n) 0) . position))))))
197 |
--------------------------------------------------------------------------------
/Code/DFA-construction/sets.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | ;;;; Character sums
4 | ;;; Character sums (csums) represent a set of characters as
5 | ;;; a combination of ranges and "symbolic" classes (such as
6 | ;;; [:alpha:], [:digit:], etc). Their implementation is somewhat
7 | ;;; similar to Gilbert Baumann's "isum" integer sums, used
8 | ;;; in clex2 and earlier versions of one-more-re-nightmare.
9 |
10 | ;;; Class sets
11 | ;; A class set is an element of ℙ(ℙ(classes)) i.e. a set of sets
12 | ;; of character classes that are part of a range.
13 |
14 | ;; This gets normalised to P on SBCL, but it looks pretty.
15 | (eval-when (:compile-toplevel :load-toplevel :execute)
16 | (defun ℙ (x) (expt 2 x)))
17 | (alexandria:define-constant +classes+
18 | '((:alpha alpha-char-p) (:digit digit-char-p) (:lower lower-case-p) (:upper upper-case-p))
19 | :test 'equal)
20 | (defconstant +empty-class-set+ 0)
21 | (defconstant +class-set-bits+ (ℙ (length +classes+)))
22 | (defconstant +universal-class-set+ (1- (ℙ (ℙ (length +classes+)))))
23 | (defun class-set-complement (c) (logxor #xFFFF c))
24 |
25 | ;;; Character sets
26 | ;; A character set is a union of intersections of character ranges
27 | ;; and class sets. A character set has the form ((class-set start end) ...)
28 | ;; with the first start fixed to 0 and the last end fixed to *CODE-LIMIT*.
29 |
30 | (defvar *code-limit* char-code-limit)
31 | (define-symbol-macro +empty-set+ (list (list +empty-class-set+ 0 *code-limit*)))
32 | (define-symbol-macro +universal-set+ (list (list +universal-class-set+ 0 *code-limit*)))
33 | (defun range (start limit)
34 | "The character set for [START, LIMIT)"
35 | (list (list +empty-class-set+ 0 start)
36 | (list +universal-class-set+ start limit)
37 | (list +empty-class-set+ limit *code-limit*)))
38 | (defun singleton-set (x) (range x (1+ x)))
39 | (defun class-set (class)
40 | (let ((p (position class +classes+ :key #'first)))
41 | (when (null p) (error "No class named ~S" class))
42 | (loop for i below +class-set-bits+
43 | when (logbitp p i)
44 | sum (ash 1 i) into class-set
45 | finally (return (list (list class-set 0 *code-limit*))))))
46 | (defun remove-empty-ranges (csum)
47 | (remove 0 csum :key #'first))
48 |
49 | (defun print-csum (csum stream)
50 | (labels ((range (start end)
51 | (if (= (1- end) start)
52 | (string (code-char start))
53 | (format nil "~C-~C" (code-char start) (code-char (1- end))))))
54 | (trivia:match (remove-empty-ranges csum)
55 | ('() (write-string "[]" stream))
56 | ((equal +universal-set+) (write-string "Σ" stream))
57 | ((list (list (= +universal-class-set+) start end))
58 | (if (= (1- end) start)
59 | (write-char (code-char start) stream)
60 | (format stream "[~C-~C]" (code-char start) (code-char (1- end)))))
61 | (pos-parts
62 | (trivia:match (remove-empty-ranges (csum-complement csum))
63 | ((list (list (= +universal-class-set+) start end))
64 | (format stream "[¬~A]" (range start end)))
65 | (neg-parts
66 | (write-char #\[ stream)
67 | (multiple-value-bind (negative? parts)
68 | (if (> (length pos-parts) (length neg-parts))
69 | (values t neg-parts)
70 | (values nil pos-parts))
71 | (when negative? (write-char #\¬ stream))
72 | (loop for (classes start end) in parts
73 | if (= classes +universal-class-set+)
74 | do (write-string (range start end) stream)
75 | else
76 | do (write (list classes start end) :stream stream)))
77 | (write-char #\] stream)))))))
78 |
79 | ;;; Operations on character sets
80 |
81 | (defun coalesce-csum (cset)
82 | "Coalesce adjacent ranges with the same class set in a character set."
83 | (loop until (null cset)
84 | collect (let* ((f (first cset))
85 | (l (member (first f) (rest cset) :key #'first :test #'/=)))
86 | (setf cset l)
87 | (if (null l)
88 | (list (first f) (second f) *code-limit*)
89 | (list (first f) (second f) (second (first l)))))))
90 |
91 | ;; A set table is a list of lists of values, and a list of ranges.
92 | ;; We ensure that ranges line up by only storing the ranges
93 | ;; once.
94 | (defun align-csums (csets)
95 | "Align the ranges in a list of character sets, returning a list of lists of values, and a list of ranges."
96 | (labels ((align (csets values ranges start)
97 | (if (null (first csets))
98 | (values (reverse values) (reverse ranges))
99 | (let ((end (reduce #'min csets :key (lambda (c) (third (first c))))))
100 | ;; Take a step.
101 | (align
102 | (loop for c in csets
103 | collect (if (= (third (first c)) end) (rest c) c))
104 | (cons (loop for c in csets collect (first (first c))) values)
105 | (cons (list start end) ranges)
106 | end)))))
107 | (align csets '() '() 0)))
108 |
109 | (defmacro define-csum-op (name class-op arguments)
110 | `(defun ,name ,arguments
111 | (multiple-value-bind (values ranges)
112 | (align-csums (list ,@arguments))
113 | (coalesce-csum
114 | (mapcar (lambda (v r) (cons (apply #',class-op v) r))
115 | values ranges)))))
116 |
117 | (define-csum-op csum-union logior (a b))
118 | (define-csum-op csum-intersection logand (a b))
119 | (define-csum-op csum-complement class-set-complement (a))
120 | (define-csum-op csum-difference logandc2 (a b))
121 | (defun csum-null-p (csum) (equal csum +empty-set+))
122 |
123 | ;;; Character set dispatch
124 | ;; This could have element type (UNSIGNED-BYTE 4) but that'd take
125 | ;; more effort to decode; so we go with bytes.
126 | ;; Rhetorical question: Are there any other ways to compress this
127 | ;; table?
128 | (alexandria:define-constant +character-class-table+
129 | (let ((table
130 | (make-array char-code-limit
131 | :element-type '(unsigned-byte 8)
132 | :initial-element 0)))
133 | (dotimes (i char-code-limit table)
134 | (let ((character (code-char i)))
135 | (unless (null character)
136 | (loop for (nil predicate) in +classes+
137 | for x = 1 then (ash x 1)
138 | do (when (funcall predicate (code-char i))
139 | (setf (aref table i) (logior x (aref table i)))))))))
140 | :test 'equalp)
141 | (declaim (inline lookup-class))
142 | (defun lookup-class (code)
143 | (aref +character-class-table+ code))
144 |
145 | (defmacro csum-case (var less-than equal &body cases)
146 | (labels ((dispatch-classes (values)
147 | (if (alexandria:length= 1 values)
148 | `(progn ,@(cdar values))
149 | (alexandria:with-gensyms (result)
150 | `(let ((,result (lookup-class ,var)))
151 | (cond
152 | ,@(loop for (class-set . body) in values
153 | collect `(,(if (= +universal-class-set+ class-set)
154 | 't
155 | `(logbitp ,result ,class-set))
156 | ,@body)))))))
157 | (singleton-p (range) (= (1+ (first range)) (second range)))
158 | (middle (list) (butlast (rest list)))
159 | (dispatch-csums (values ranges)
160 | (cond
161 | ((alexandria:length= 1 ranges)
162 | ;; There's only one more range, so dispatch on classes.
163 | (dispatch-classes (first values)))
164 | ;; Detect singleton sets to use = on, e.g. [^ab], a and b.
165 | ((and (equal (first values) (first (last values)))
166 | (every #'singleton-p (middle ranges)))
167 | `(cond
168 | ,@(loop for r in (middle ranges)
169 | for v in (middle values)
170 | collect `((,equal ,var ,(first r)) ,(dispatch-classes v)))
171 | (t ,(dispatch-classes (first values)))))
172 | ;; Bisect and continue dispatching.
173 | (t
174 | (let* ((mid (floor (length values) 2)))
175 | `(if (,less-than ,var ,(first (nth mid ranges)))
176 | ,(dispatch-csums (subseq values 0 mid)
177 | (subseq ranges 0 mid))
178 | ,(dispatch-csums (subseq values mid)
179 | (subseq ranges mid))))))))
180 | (multiple-value-bind (values ranges)
181 | (align-csums
182 | (loop for (csum . body) in cases
183 | collect (loop for (cl s e) in csum
184 | collect `((,cl . ,body) ,s ,e))))
185 | ;; Remove unreachable values from the set table.
186 | (let ((values (mapcar #'remove-empty-ranges values)))
187 | (dispatch-csums values ranges)))))
188 |
189 | (defun csum-has-classes-p (csum)
190 | "Does a character sum use any non-trivial character classes?"
191 | (loop for (c s e) in csum
192 | thereis (and (/= c +empty-class-set+) (/= c +universal-class-set+))))
193 |
194 | (defun make-test-form (csum variable)
195 | "Compute a form which tests if VARIABLE is an element of CSUM, using OR, <= and ="
196 | (cond
197 | ((equal csum +empty-set+) 'nil)
198 | ((equal csum +universal-set+) 't)
199 | (t
200 | `(or ,@(loop for (c s e) in csum
201 | unless (= c +empty-class-set+)
202 | do (assert (= c +universal-class-set+))
203 | and collect (if (= (1+ s) e)
204 | `(= ,s ,variable)
205 | `(<= ,s ,variable ,(1- e))))))))
206 |
207 | ;;; Named sets
208 |
209 | (defun named-range (name)
210 | (labels ((∪ (&rest rest) (reduce #'csum-union rest))
211 | (d (a b) (csum-intersection a (csum-complement b)))
212 | (s (&rest rest) (reduce #'csum-union rest :key (alexandria:compose #'singleton-set #'char-code))))
213 | (alexandria:eswitch (name :test 'string=)
214 | ("alpha" (class-set :alpha))
215 | ("alnum" (∪ (class-set :alpha) (class-set :digit)))
216 | ("blank" (s #\Space #\Tab))
217 | ("cntrl" (∪ (range 0 32) (singleton-set 127)))
218 | ("digit" (class-set :digit))
219 | ("graph" (csum-complement (∪ (named-range "cntrl") (s #\Space))))
220 | ("lower" (class-set :lower))
221 | ("print" (∪ (named-range "graph") (s #\Space)))
222 | ("punct" (d (named-range "graph") (∪ (class-set :alpha) (class-set :digit))))
223 | ("space" (∪ (singleton-set 11) (s #\Space #\Return #\Newline #\Tab)))
224 | ("upper" (class-set :upper))
225 | ("xdigit" (∪ (class-set "digit") (s #\A #\B #\C #\D #\E #\F #\a #\b #\c #\d #\e #\f))))))
226 |
--------------------------------------------------------------------------------
/Code/SIMD/sbcl-x86-64.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | (defun find-op (name)
4 | (or (find-symbol (format nil "V-~a~d" name *bits*)
5 | ':one-more-re-nightmare.vector-primops)
6 | (error "No primop named ~a" name)))
7 |
8 | ;; AVX2 is a pile of stink and only has instructions for signed
9 | ;; comparisons. So, in order to fake an unsigned comparison, we
10 | ;; subtract #x80 from everything.
11 |
12 | (defun find-8-bit-broadcast (n)
13 | (find-broadcast (mod (- n #x80) #x100)))
14 | (defvar *swizzled-name*)
15 | (defun swizzle-8-bits ()
16 | (or *swizzled-name*
17 | (setf *swizzled-name* (make-symbol "SWIZZLE"))))
18 |
19 | (defun translate-scalar-code (variable code)
20 | (let* ((*swizzled-name* nil)
21 | (translated (%translate-scalar-code code)))
22 | (if (null *swizzled-name*)
23 | translated
24 | `(let ((,*swizzled-name*
25 | (one-more-re-nightmare.vector-primops:v8- ,variable ,(find-broadcast #x80))))
26 | ,translated))))
27 |
28 | (trivia:defun-ematch %translate-scalar-code (code)
29 | "Translate some 'scalar' code generated by MAKE-TEST-FORM into a vectorised computation."
30 | ('t :always)
31 | ('nil :never)
32 | ;; All the Boolean operators just map over their arguments.
33 | ((list 'not thing)
34 | `(,(find-op "NOT")
35 | ,(%translate-scalar-code thing)))
36 | ((list* 'or things)
37 | (reduce (lambda (a b) `(,(find-op "OR") ,a ,b))
38 | (mapcar #'%translate-scalar-code things)))
39 | ;; Ditto for = really.
40 | ((list '= value variable)
41 | ;; Note that = works the same if it's signed or not; it's only >
42 | ;; that requires more effort
43 | `(,(ecase *bits*
44 | (32 'one-more-re-nightmare.vector-primops:v32=)
45 | (8 'one-more-re-nightmare.vector-primops:v8=))
46 | ,(find-broadcast value) ,variable))
47 | ;; Generating good code for <= is tricky though. Whoever designed
48 | ;; SSE2 and AVX2 decided that just having = and > were good enough,
49 | ;; so we need an efficient implementation of ≤ from those.
50 | ((list '<= 0 value high)
51 | ;; No lower bounds here. Note that X ≤ N ⇔ N + 1 > X
52 | (ecase *bits*
53 | (32 `(one-more-re-nightmare.vector-primops:v32> ,(find-broadcast (1+ high)) ,value))
54 | (8 `(one-more-re-nightmare.vector-primops:v8> ,(find-8-bit-broadcast (1+ high)) ,value))))
55 | ((list '<= low value high)
56 | ;; Similarly, N ≤ X ⇔ X > N - 1
57 | (ecase *bits*
58 | (32
59 | `(one-more-re-nightmare.vector-primops:v-and32
60 | ;; Similarly, N ≤ X ⇔ X > N - 1
61 | (one-more-re-nightmare.vector-primops:v32> ,value ,(find-broadcast (1- low)))
62 | (one-more-re-nightmare.vector-primops:v32> ,(find-broadcast (1+ high)) ,value)))
63 | (8
64 | `(one-more-re-nightmare.vector-primops:v-and8
65 | (one-more-re-nightmare.vector-primops:v8> ,(swizzle-8-bits)
66 | ,(find-8-bit-broadcast (1- low)))
67 | (one-more-re-nightmare.vector-primops:v8> ,(find-8-bit-broadcast (1+ high))
68 | ,(swizzle-8-bits)))))))
69 |
70 | (defmacro define-boring-vop (name args result &body generator)
71 | `(progn
72 | (sb-vm::define-vop (,name)
73 | (:translate ,name)
74 | (:policy :fast-safe)
75 | (:args ,@(loop for (name nil . rest) in args
76 | collect (cons name rest)))
77 | (:arg-types ,@(mapcar #'second args))
78 | (:results (,(first result) ,@(rest (rest result))))
79 | (:result-types ,(second result))
80 | (:generator 0 ,@generator))))
81 |
82 | (defmacro define-op (name bits args instruction-name)
83 | (let ((primitive-type (ecase bits
84 | (8 'sb-vm::simd-pack-256-ub8)
85 | (32 'sb-vm::simd-pack-256-ub32))))
86 | `(progn
87 | (sb-c:defknown ,name
88 | ,(loop for nil in args collect `(sb-ext:simd-pack-256 (unsigned-byte ,bits)))
89 | (sb-ext:simd-pack-256 (unsigned-byte ,bits))
90 | (sb-c:foldable sb-c:flushable sb-c:movable)
91 | :overwrite-fndb-silently t)
92 | (define-boring-vop ,name
93 | ,(loop for arg in args
94 | collect `(,arg ,primitive-type :scs (sb-vm::int-avx2-reg)))
95 | (result ,primitive-type :scs (sb-vm::int-avx2-reg))
96 | (sb-vm::inst ,instruction-name result ,@args)))))
97 |
98 | (defun one-more-re-nightmare.vector-primops:all-of (variables)
99 | (reduce (lambda (a b) `(,(find-op "AND") ,a ,b))
100 | variables))
101 |
102 | (defconstant one-more-re-nightmare.vector-primops:+v-length+ 256)
103 |
104 | (in-package :sb-vm)
105 |
106 | ;;;; Boolean operations
107 |
108 | (one-more-re-nightmare::define-op
109 | one-more-re-nightmare.vector-primops:v-and8 8 (a b) vpand)
110 | (one-more-re-nightmare::define-op
111 | one-more-re-nightmare.vector-primops:v-and32 32 (a b) vpand)
112 |
113 | (one-more-re-nightmare::define-op
114 | one-more-re-nightmare.vector-primops:v-or8 8 (a b) vpor)
115 | (one-more-re-nightmare::define-op
116 | one-more-re-nightmare.vector-primops:v-or32 32 (a b) vpor)
117 |
118 | (macrolet ((frob (name bits arg-type)
119 | `(progn
120 | (defknown ,name
121 | ((simd-pack-256 (unsigned-byte ,bits)))
122 | (simd-pack-256 (unsigned-byte ,bits))
123 | (foldable flushable movable)
124 | :overwrite-fndb-silently t)
125 | (define-vop (,name)
126 | (:translate ,name)
127 | (:policy :fast-safe)
128 | (:args (value :scs (int-avx2-reg)))
129 | (:arg-types ,arg-type)
130 | (:results (result :scs (int-avx2-reg)))
131 | (:result-types ,arg-type)
132 | (:temporary (:sc int-avx2-reg) ones)
133 | (:generator 0
134 | (inst vpcmpeqd ones ones ones) ; get all 1s
135 | (inst vpxor result ones value)))))) ; 1111... (+) A = ¬A
136 | (frob one-more-re-nightmare.vector-primops:v-not8 8 simd-pack-256-ub8)
137 | (frob one-more-re-nightmare.vector-primops:v-not32 32 simd-pack-256-ub32))
138 |
139 | ;;;; Comparisons
140 |
141 | ;; This is a signed comparison, but as there are fewer than 2³¹
142 | ;; Unicode characters, no one needs to know that.
143 | (one-more-re-nightmare::define-op
144 | one-more-re-nightmare.vector-primops:v32> 32 (a b) vpcmpgtd)
145 |
146 | ;; We do need to know that this is a signed comparison, since we do
147 | ;; want to target (UNSIGNED-BYTE 8) too, and we handle it above.
148 | (one-more-re-nightmare::define-op
149 | one-more-re-nightmare.vector-primops:v8> 8 (a b) vpcmpgtb)
150 |
151 | (one-more-re-nightmare::define-op
152 | one-more-re-nightmare.vector-primops:v32= 32 (a b) vpcmpeqd)
153 |
154 | (one-more-re-nightmare::define-op
155 | one-more-re-nightmare.vector-primops:v8= 8 (a b) vpcmpeqb)
156 |
157 | (one-more-re-nightmare::define-op
158 | one-more-re-nightmare.vector-primops:v8- 8 (a b) vpsubb)
159 |
160 | ;;;; Broadcasts
161 |
162 | (defknown one-more-re-nightmare.vector-primops:v-broadcast32
163 | ((unsigned-byte 32))
164 | (simd-pack-256 (unsigned-byte 32))
165 | ;; Not constant folding, because loading a folded broadcast is
166 | ;; slower than reproducing it again.
167 | (flushable movable)
168 | :overwrite-fndb-silently t)
169 |
170 | (one-more-re-nightmare::define-boring-vop
171 | one-more-re-nightmare.vector-primops:v-broadcast32
172 | ((integer unsigned-num :scs (unsigned-reg)))
173 | (result simd-pack-256-ub32 :scs (int-avx2-reg))
174 | (inst movq result integer)
175 | (inst vpbroadcastd result result))
176 |
177 | (defknown one-more-re-nightmare.vector-primops:v-broadcast8
178 | ((unsigned-byte 8))
179 | (simd-pack-256 (unsigned-byte 8))
180 | (flushable movable)
181 | :overwrite-fndb-silently t)
182 |
183 | (one-more-re-nightmare::define-boring-vop
184 | one-more-re-nightmare.vector-primops:v-broadcast8
185 | ((integer unsigned-num :scs (unsigned-reg)))
186 | (result simd-pack-256-ub8 :scs (int-avx2-reg))
187 | (inst movq result integer)
188 | (inst vpbroadcastb result result))
189 |
190 | ;;;; Move mask
191 |
192 | (defknown one-more-re-nightmare.vector-primops:v-movemask32
193 | ((simd-pack-256 (unsigned-byte 32)))
194 | (unsigned-byte 8)
195 | (flushable movable)
196 | :overwrite-fndb-silently t)
197 |
198 | (one-more-re-nightmare::define-boring-vop
199 | one-more-re-nightmare.vector-primops:v-movemask32
200 | ((pack simd-pack-256-ub32 :scs (int-avx2-reg)))
201 | (result unsigned-num :scs (unsigned-reg))
202 | (inst vmovmskps result pack))
203 |
204 | (defknown one-more-re-nightmare.vector-primops:v-movemask8
205 | ((simd-pack-256 (unsigned-byte 8)))
206 | (unsigned-byte 32)
207 | (flushable movable)
208 | :overwrite-fndb-silently t)
209 |
210 | (one-more-re-nightmare::define-boring-vop
211 | one-more-re-nightmare.vector-primops:v-movemask8
212 | ((pack simd-pack-256-ub8 :scs (int-avx2-reg)))
213 | (result unsigned-num :scs (unsigned-reg))
214 | (inst vpmovmskb result pack))
215 |
216 | ;;;; Load
217 |
218 | (defknown one-more-re-nightmare.vector-primops:v-load32
219 | ((simple-array character 1) sb-int:index)
220 | (simd-pack-256 (unsigned-byte 32))
221 | (foldable flushable movable)
222 | :overwrite-fndb-silently t)
223 |
224 | (one-more-re-nightmare::define-boring-vop
225 | one-more-re-nightmare.vector-primops:v-load32
226 | ((string simple-character-string :scs (descriptor-reg))
227 | (index tagged-num :scs (any-reg)))
228 | (result simd-pack-256-ub32 :scs (int-avx2-reg))
229 | (inst vmovdqu result
230 | (ea (- (* vector-data-offset n-word-bytes)
231 | other-pointer-lowtag)
232 | ;; Characters are 4 bytes, fixnums have a trailing 0 so
233 | ;; just multiply by 2.
234 | string index 2)))
235 |
236 | (defknown one-more-re-nightmare.vector-primops:v-load8
237 | ((simple-array base-char 1) sb-int:index)
238 | (simd-pack-256 (unsigned-byte 8))
239 | (foldable flushable movable)
240 | :overwrite-fndb-silently t)
241 |
242 | (one-more-re-nightmare::define-boring-vop
243 | one-more-re-nightmare.vector-primops:v-load8
244 | ((string simple-base-string :scs (descriptor-reg))
245 | (index unsigned-num :scs (unsigned-reg)))
246 | (result simd-pack-256-ub8 :scs (int-avx2-reg))
247 | (inst vmovdqu result
248 | (ea (- (* vector-data-offset n-word-bytes)
249 | other-pointer-lowtag)
250 | string index 1)))
251 |
252 | ;;;; Find first set
253 |
254 | (defknown one-more-re-nightmare.vector-primops:find-first-set
255 | ((unsigned-byte 64))
256 | (mod 64)
257 | (foldable flushable movable)
258 | :overwrite-fndb-silently t)
259 |
260 | (one-more-re-nightmare::define-boring-vop
261 | one-more-re-nightmare.vector-primops:find-first-set
262 | ((integer unsigned-num :scs (unsigned-reg)))
263 | (result unsigned-num :scs (unsigned-reg))
264 | (inst bsf result integer))
265 |
--------------------------------------------------------------------------------
/Code/Interface/interface.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | (defun re-groups (regular-expression)
4 | (nth-value 1
5 | (with-hash-consing-tables ()
6 | (parse-regular-expression regular-expression))))
7 |
8 | (defmacro collect ((function) &body body)
9 | (alexandria:with-gensyms (list tail)
10 | `(let* ((,list (list 'nil))
11 | (,tail ,list))
12 | (flet ((,function (element)
13 | (let ((new-tail (list element)))
14 | (setf (cdr ,tail) new-tail
15 | ,tail new-tail))))
16 | (declare (dynamic-extent #',function)
17 | (inline ,function))
18 | ,@body
19 | (cdr ,list)))))
20 |
21 | (defmacro with-code (((function size) code) &body body)
22 | `(let ((,function (car ,code))
23 | (,size (cdr ,code)))
24 | ,@body))
25 |
26 | (eval-when (:compile-toplevel :load-toplevel :execute)
27 | (defun constant-safe-to-eval-p (form)
28 | (trivia:match form
29 | ((or (type string) (type symbol) (list 'quote (type string))) t)
30 | (_ nil)))
31 |
32 | (defun try-to-evaluate-constant-re (form)
33 | (if (and (constantp form)
34 | (constant-safe-to-eval-p form))
35 | (let ((result (eval form)))
36 | (if (stringp result)
37 | result
38 | nil))
39 | nil)))
40 |
41 | (define-compiler-macro compile-regular-expression (&whole w expression)
42 | (let ((re (try-to-evaluate-constant-re expression)))
43 | (cond
44 | ((null re) w)
45 | (t
46 | (handler-case
47 | (lint-regular-expression re)
48 | (error (e)
49 | (warn "Error while linting:~%~a" e)
50 | w)
51 | (:no-error (&rest values)
52 | (declare (ignore values))
53 | `(make-compiled-regular-expression
54 | :codes (vector ,@(loop for type in *string-types*
55 | collect `(find-code ,re ',type)))
56 | :original-re ',re)))))))
57 |
58 | (defmacro with-code-for-vector ((function size vector regular-expression bailout-form) &body body)
59 | (alexandria:with-gensyms (result)
60 | `(let ((,result (try-to-evaluate-constant-re ,regular-expression)))
61 | (cond
62 | ((null ,result)
63 | ,bailout-form)
64 | (t
65 | (handler-case
66 | (lint-regular-expression ,result)
67 | (error (e)
68 | (warn "Error while linting:~%~a" e)
69 | ,bailout-form)
70 | (:no-error (&rest values)
71 | (declare (ignore values))
72 | (alexandria:once-only (,vector)
73 | (alexandria:with-gensyms (,function ,size)
74 | `(let ((,,size ,(match-vector-size (re-groups ,result)))
75 | (,,function
76 | (cond
77 | ,@(loop for string-type in *string-types*
78 | collect `((typep ,,vector ',string-type)
79 | (load-time-value (car (find-code ,,result ',string-type)))))
80 | (t (car (find-code ,,result (string-type-of ,,vector)))))))
81 | ,(progn ,@body)))))))))))
82 |
83 | (declaim (inline %all-matches)
84 | (ftype (function * list) %all-matches))
85 | (defun %all-matches (function size vector start end)
86 | (declare (alexandria:array-index start end)
87 | (function function)
88 | (fixnum size))
89 | (assert (and (<= end (length vector))
90 | (<= 0 start end)))
91 | ;; The code function will fill in values as needed.
92 | (let ((match (make-array size)))
93 | (collect (result)
94 | (funcall function vector start end match
95 | (lambda ()
96 | (result (copy-seq match)))))))
97 |
98 | (declaim (ftype (function
99 | (re-designator string
100 | &key (:start alexandria:array-index)
101 | (:end (or alexandria:array-length null)))
102 | list)
103 | all-matches all-string-matches))
104 |
105 | (defun all-matches (regular-expression vector
106 | &key (start 0) (end (length vector)))
107 | "Find every match, as a list of match vectors."
108 | (with-code ((function size)
109 | (find-code regular-expression (string-type-of vector)))
110 | (%all-matches function size vector start (or end (length vector)))))
111 |
112 | (define-compiler-macro all-matches (&whole w
113 | regular-expression vector
114 | &key (start 0)
115 | (end nil end-p))
116 | ;; Grab code at load-time if possible.
117 | (with-code-for-vector (function size vector regular-expression w)
118 | `(%all-matches ,function ,size ,vector
119 | ,start ,(if end-p `(or ,end (length ,vector)) `(length ,vector)))))
120 |
121 | (declaim (inline subsequences))
122 | (defun subsequences (vector match-vector)
123 | (declare (simple-vector match-vector)
124 | (vector vector)
125 | (optimize (speed 3) (safety 0)))
126 | (let* ((sequences (floor (length match-vector) 2))
127 | (string-match-vector (make-array sequences)))
128 | (loop for n below sequences
129 | for start = (aref match-vector (* n 2))
130 | for end = (aref match-vector (1+ (* n 2)))
131 | if (null start)
132 | do (setf (aref string-match-vector n) nil)
133 | else
134 | do (setf (aref string-match-vector n)
135 | (subseq vector start end)))
136 | string-match-vector))
137 |
138 | (defun all-string-matches (regular-expression vector
139 | &key (start 0) (end (length vector)))
140 | "Find every match, as a list of match string vectors."
141 | (mapcar (lambda (match) (subsequences vector match))
142 | (all-matches regular-expression
143 | vector
144 | :start start
145 | :end end)))
146 |
147 | (define-compiler-macro all-string-matches (&whole w
148 | regular-expression vector
149 | &key (start 0)
150 | (end nil end-p))
151 | ;; Grab code at load-time if possible.
152 | (with-code-for-vector (function size vector regular-expression w)
153 | `(mapcar (lambda (match) (subsequences ,vector match))
154 | (%all-matches ,function ,size ,vector
155 | ,start ,(if end-p `(or ,end nil) `(length ,vector))))))
156 |
157 | (declaim (inline %first-match))
158 | (defun %first-match (function size vector start end)
159 | (declare (alexandria:array-index start end)
160 | (fixnum size)
161 | (function function))
162 | (assert (and (<= end (length vector))
163 | (<= 0 start end)))
164 | (let ((tag-vector (make-array size)))
165 | (funcall function vector start end tag-vector
166 | (lambda ()
167 | (return-from %first-match tag-vector)))
168 | nil))
169 |
170 | (declaim (ftype (function
171 | (re-designator string
172 | &key (:start alexandria:array-index)
173 | (:end (or alexandria:array-length null)))
174 | (or null simple-vector))
175 | first-match first-string-match))
176 |
177 | (defun first-match (regular-expression vector
178 | &key (start 0) (end (length vector)))
179 | "Find the first match, returning a match vector, or NIL."
180 | (with-code ((function size)
181 | (find-code regular-expression (string-type-of vector)))
182 | (%first-match function size vector start (or end (length vector)))))
183 |
184 | (define-compiler-macro first-match (&whole w
185 | regular-expression vector
186 | &key (start 0) (end nil end-p))
187 | (with-code-for-vector (function size vector regular-expression w)
188 | `(%first-match ,function ,size ,vector ,start
189 | ,(if end-p `(or ,end nil) `(length ,vector)))))
190 |
191 | (defun first-string-match (regular-expression vector
192 | &key (start 0) (end (length vector)))
193 | "Find the first match, returning a match string vector or NIL"
194 | (let ((results (first-match regular-expression vector
195 | :start start :end end)))
196 | (if (null results)
197 | nil
198 | (subsequences vector results))))
199 |
200 | (define-compiler-macro first-string-match (&whole w
201 | regular-expression vector
202 | &key (start 0) (end nil end-p))
203 | (with-code-for-vector (function size vector regular-expression w)
204 | `(subsequences ,vector
205 | (%first-match ,function ,size ,vector ,start
206 | ,(if end-p `(or ,end (length ,vector)) `(length ,vector))))))
207 |
208 | (defmacro do-matches (((&rest registers) regular-expression vector
209 | &key (start 0) (end nil))
210 | &body body)
211 | "Iterate over every match, binding match registers."
212 | (alexandria:with-gensyms (function size match-vector)
213 | (alexandria:once-only (start end vector)
214 | (labels ((consume (function size known-register-count)
215 | (when (and (not (null known-register-count))
216 | (> (length registers) known-register-count))
217 | (warn "This regular expression only produces ~r register~:p, but ~r variables were provided."
218 | known-register-count
219 | (length registers))
220 | (setf known-register-count nil))
221 | `(progn
222 | (when (null ,end)
223 | (setf ,end (length ,vector)))
224 | (assert (and (<= ,end (length ,vector))
225 | (<= 0 ,start ,end)))
226 | (let ((,match-vector (make-array ,size)))
227 | ,(if (null known-register-count)
228 | `(assert (>= (length ,match-vector)
229 | ,(length registers))
230 | ()
231 | "This regular expression only produces ~r register~:p, but ~r variables were provided."
232 | (length ,match-vector)
233 | ,(length registers))
234 | `(declare (dynamic-extent ,match-vector)))
235 | (funcall ,function ,vector ,start ,end ,match-vector
236 | (lambda ()
237 | (let ,(loop for register in registers
238 | for n from 0
239 | collect `(,register (svref ,match-vector ,n)))
240 | ,(if (>= (length registers) 2)
241 | `(declare ((or null alexandria:array-index) ,@(subseq registers 2))
242 | (alexandria:array-index ,@(subseq registers 0 2)))
243 | `(declare (alexandria:array-index ,@registers)))
244 | ,@body))))))
245 | (fallback ()
246 | (alexandria:once-only (vector)
247 | `(with-code ((,function ,size)
248 | (find-code ,regular-expression (string-type-of ,vector)))
249 | ,(consume function size nil)))))
250 | (with-code-for-vector (function size vector regular-expression (fallback))
251 | (consume function size
252 | (match-vector-size (re-groups regular-expression))))))))
253 |
--------------------------------------------------------------------------------
/Code/Compiler/code-generation.lisp:
--------------------------------------------------------------------------------
1 | (in-package :one-more-re-nightmare)
2 |
3 | (defclass compiler-state ()
4 | ((variable-names :initform (make-hash-table :test 'equal)
5 | :reader variable-names)
6 | (state-names :initform (make-hash-table :test 'equal)
7 | :reader state-names)
8 | (next-state-name :initform 0
9 | :accessor next-state-name)
10 | (variable-map :initarg :variable-map
11 | :reader variable-map)))
12 |
13 | (defun find-variable-name (variable)
14 | (when (member variable '(nil position))
15 | (return-from find-variable-name variable))
16 | (let ((names (variable-names *compiler-state*)))
17 | (multiple-value-bind (name present?)
18 | (gethash variable names)
19 | (if present?
20 | name
21 | (setf (gethash variable names)
22 | (make-symbol (format nil "~{~a.~a~}" variable)))))))
23 |
24 | (defun find-state-name (state &optional (entry-point :bounds-check))
25 | (let ((names (state-names *compiler-state*)))
26 | (multiple-value-bind (name present?)
27 | (gethash (cons state entry-point) names)
28 | (if present?
29 | name
30 | (setf (gethash (cons state entry-point) names)
31 | (incf (next-state-name *compiler-state*)))))))
32 |
33 | (defun match-vector-size (groups)
34 | (* 2 (1+ groups)))
35 |
36 | (defvar *nowhere* (make-broadcast-stream))
37 | (defvar *code-type* :compiled)
38 | (defun %compile-regular-expression (expression
39 | &key (layout *default-layout*)
40 | (strategy #'make-default-strategy))
41 | (let ((*tag-gensym-counter* 0))
42 | (with-hash-consing-tables ()
43 | (multiple-value-bind (expression groups)
44 | (parse-regular-expression expression)
45 | (let ((*layout* layout)
46 | (*error-output* *nowhere*)
47 | (strategy (funcall strategy layout expression)))
48 | (values
49 | (with-naughty-compiler-switches ()
50 | (let ((form (make-lambda-form
51 | expression
52 | strategy
53 | groups)))
54 | (ecase *code-type*
55 | #+sbcl
56 | (:interpreted
57 | (let ((sb-ext:*evaluator-mode* :interpret))
58 | (eval form)))
59 | (:compiled
60 | (compile nil form))
61 | (:literal
62 | form))))
63 | (match-vector-size groups)))))))
64 |
65 | (defun variable-map-from-groups (groups)
66 | (coerce `(start end ,@(alexandria:iota (* groups 2) :start 1))
67 | 'vector))
68 |
69 | (defun make-lambda-form (expression strategy groups)
70 | (let* ((*compiler-state*
71 | (make-instance 'compiler-state
72 | :variable-map (variable-map-from-groups groups)))
73 | (macros (macros-for-strategy strategy)))
74 | (multiple-value-bind (variables declarations body)
75 | (make-prog-parts strategy expression)
76 | `(lambda ,(lambda-list strategy)
77 | (declare (optimize ,@*optimize-settings*)
78 | ,@(declarations strategy)
79 | #+sbcl (sb-ext:muffle-conditions sb-ext:compiler-note style-warning))
80 | (macrolet ,macros
81 | (prog* ,variables
82 | (declare ,@declarations)
83 | ,@body))))))
84 |
85 | (defgeneric make-prog-parts (strategy expression)
86 | (:method (strategy expression)
87 | (let* ((initial-expressions (initial-states strategy expression))
88 | (states (make-dfa-from-expressions initial-expressions)))
89 | (compute-predecessor-lists states)
90 | (compute-minimum-lengths states)
91 | (let* ((body (make-body-from-dfa strategy states))
92 | (initial-states (loop for expression in initial-expressions
93 | collect (gethash expression states)))
94 | (start-code (start-code strategy initial-states))
95 | (variables (alexandria:hash-table-values
96 | (variable-names *compiler-state*))))
97 | (values
98 | `((start start)
99 | (position start)
100 | ,@(loop for variable in variables collect `(,variable 0)))
101 | `((alexandria:array-index start position ,@variables))
102 | (append start-code body))))))
103 |
104 | (defun make-body-from-dfa (strategy states)
105 | (loop for state being the hash-values of states
106 | for expression = (state-expression state)
107 | for nullable = (nullable expression)
108 | ;; We "inline" these states into transitions rather than
109 | ;; emitting code for separate states, because they are simple
110 | ;; enough.
111 | unless (or (re-stopped-p expression)
112 | (re-empty-p expression)
113 | (state-never-succeeds-p state))
114 | append
115 | `(,(find-state-name state :bounds-check)
116 | #+print-traces
117 | (print (list :bounds-checking
118 | position
119 | ,(prin1-to-string expression)
120 | ,(minimum-length state)))
121 | (when (> (the alexandria:array-index
122 | (+ position ,(max (minimum-length state) 1)))
123 | end)
124 | (when (> position end)
125 | (return))
126 | #+print-traces
127 | (print (list :eof ,(prin1-to-string nullable)))
128 | ,(if (eq (empty-set) nullable)
129 | `(return)
130 | ;; We hit EOF and this state is nullable, so
131 | ;; succeed with what we got so far.
132 | `(progn
133 | ;; See below for commentary on why we have to
134 | ;; nudge register values around.
135 | (let ((position (1+ position)))
136 | ,(setf-from-assignments
137 | (state-exit-effects state)))
138 | #+print-traces
139 | (print (list :exit-map ',(state-exit-map state)
140 | (1- ,(find-in-map 'end (state-exit-map state)))
141 | (1+ start)))
142 | (setf start (max (1- ,(find-in-map 'end (state-exit-map state)))
143 | (1+ start)))
144 | (win ,@(win-locations (state-exit-map state))))))
145 | ,(find-state-name state :no-bounds-check)
146 | #+print-traces
147 | (print (list :bounds-ok
148 | position
149 | ,(prin1-to-string expression)
150 | ,(minimum-length state)))
151 | (let ((value (,(layout-to-number *layout*)
152 | (,(layout-ref *layout*) vector position))))
153 | ;; We assign early so that the ADD instruction doesn't
154 | ;; force our Lisp compiler to create a new basic block,
155 | ;; and can just JMP directly to the next state when no tag
156 | ;; assignments are done. This means that every register is
157 | ;; going to have a value that is 1 too high, but we can
158 | ;; just subtract in the (infrequent) places we read
159 | ;; registers.
160 | (incf position)
161 | ,(let ((labels (loop for nil in (state-transitions state)
162 | for n from 0
163 | collect (alexandria:format-symbol nil "TRANSITION-~d" n))))
164 | `(tagbody
165 | (csum-case value
166 | ,(layout-less *layout*)
167 | ,(layout-equal *layout*)
168 | ,@(loop for transition in (state-transitions state)
169 | for label in labels
170 | collect `(,(transition-class transition)
171 | (go ,label))))
172 | ,@(loop for transition in (state-transitions state)
173 | for label in labels
174 | collect label
175 | collect (transition-code strategy state transition))))))))
176 |
177 | (defmethod transition-code (strategy previous-state transition)
178 | (declare (ignore strategy))
179 | (let* ((next-state (transition-next-state transition))
180 | (next-expression (state-expression next-state)))
181 | (cond
182 | ((or (state-never-succeeds-p next-state)
183 | (re-stopped-p next-expression))
184 | (if (eq (nullable (state-expression previous-state)) (empty-set))
185 | `(progn
186 | (setf start (1+ start))
187 | (restart))
188 | ;; Similarly to hitting EOF, if this state is nullable then
189 | ;; we can succeed with what we got.
190 | `(progn
191 | ,(setf-from-assignments
192 | (transition-tags-to-set transition))
193 | (setf start (max (1+ start)
194 | (1- ,(find-in-map 'end (state-exit-map next-state)))))
195 | #+print-traces
196 | (print `(:next ,start ,,(prin1-to-string next-expression)))
197 | (win ,@(win-locations (state-exit-map next-state))))))
198 | ((re-empty-p next-expression)
199 | `(progn
200 | ,(setf-from-assignments
201 | (transition-tags-to-set transition))
202 | ;; These assignments are evaluated as if we were at the
203 | ;; empty state -- we inline the empty state because it is
204 | ;; only a set of assignments and a call to WIN.
205 | (let ((position (1+ position)))
206 | ,(setf-from-assignments
207 | (tags next-expression)))
208 | (setf start position)
209 | (win ,@(win-locations (state-exit-map next-state)))))
210 | (t
211 | (let ((entry-point
212 | (if (and (< (minimum-length next-state)
213 | (minimum-length previous-state))
214 | (plusp (minimum-length next-state)))
215 | :no-bounds-check
216 | :bounds-check)))
217 | `(progn
218 | ,(setf-from-assignments
219 | (transition-tags-to-set transition))
220 | (go ,(find-state-name next-state entry-point))))))))
221 |
222 | (defun win-locations (exit-map)
223 | (loop for variable-name across (variable-map *compiler-state*)
224 | for (variable . source) = (find variable-name exit-map :key #'caar)
225 | if (and (not (null variable))
226 | (not (eql source 'nil)))
227 | collect `(,variable-name ,(find-variable-name source))
228 | else
229 | collect `(,variable-name 'nil)))
230 |
231 | (defun setf-from-assignments (assignments)
232 | `(setf
233 | ,@(loop for (target . source) in assignments
234 | ;; NIL sources are basically just a compile time thing.
235 | unless (or (equal target source)
236 | (eql source 'nil))
237 | collect (find-variable-name target)
238 | and collect (find-variable-name source))))
239 |
240 | (defun find-in-map (variable-name map)
241 | (let ((variable (find variable-name map :key #'caar)))
242 | (if (null variable)
243 | (error "~s not in the map ~s" variable-name map)
244 | (find-variable-name (cdr variable)))))
245 |
246 | (defmethod start-code ((strategy scan-everything) states)
247 | (destructuring-bind (state) states
248 | (let ((expression (state-expression state)))
249 | (cond
250 | ((state-never-succeeds-p state)
251 | ;; Just return immediately if we're told to match nothing.
252 | `(start (return)))
253 | ((re-empty-p expression)
254 | ;; Succeed for every character?
255 | `(start
256 | (cond
257 | ((> position end)
258 | (return))
259 | (t
260 | (incf position)
261 | ,(setf-from-assignments (state-exit-effects state))
262 | (win ,@(win-locations (state-exit-map state)))))))
263 | (t
264 | `(start
265 | (setf position start)
266 | (go ,(find-state-name state :bounds-check))))))))
267 |
268 | (defmethod start-code :around ((strategy call-continuation) states)
269 | ;; Calling the continuation gets its own state as to influence the
270 | ;; register allocator less. This change does actually reduce the
271 | ;; number of spills substantially (at least on SBCL).
272 | (append (call-next-method)
273 | `(win
274 | #+print-traces
275 | (print ':win)
276 | (funcall continuation)
277 | (restart))))
278 |
--------------------------------------------------------------------------------