├── Code ├── SIMD │ ├── i-got-nothing.lisp │ ├── package.lisp │ ├── one-more-re-nightmare-simd.asd │ ├── prefix.lisp │ ├── code-generation.lisp │ ├── loop.lisp │ ├── prefix-strategy.lisp │ └── sbcl-x86-64.lisp ├── Compiler │ ├── layout.lisp │ ├── optimize-settings.lisp │ ├── compilation-strategy.lisp │ ├── length-inference.lisp │ └── code-generation.lisp ├── DFA-construction │ ├── interpret.lisp │ ├── empty.lisp │ ├── effects.lisp │ ├── nullable.lisp │ ├── derivative-classes.lisp │ ├── similar.lisp │ ├── print-dfa.lisp │ ├── derivative.lisp │ ├── tag-sets.lisp │ ├── type.lisp │ ├── make-dfa.lisp │ ├── re-types.lisp │ └── sets.lisp ├── Interface │ ├── convert-to-bytes.lisp │ ├── code-cache.lisp │ ├── lint.lisp │ ├── syntax.lisp │ └── interface.lisp ├── package.lisp └── one-more-re-nightmare.asd ├── Tests ├── package.lisp ├── one-more-re-nightmare-tests.asd ├── regrind.lisp └── tests.lisp ├── .gitignore ├── Documentation ├── make-html.sh ├── make-pdf.sh ├── one-more-re-nightmare.scrbl ├── README.md ├── tex-prefix.tex ├── mobile-view.css ├── spec-macros.tex ├── bibliography.rkt ├── introduction.scrbl ├── posix.scrbl ├── spec-macros.css ├── interface.scrbl ├── spec-macros.scrbl └── linting.scrbl ├── LICENSE └── README.md /Code/SIMD/i-got-nothing.lisp: -------------------------------------------------------------------------------- 1 | (error "one-more-re-nightmare-simd only supports SBCL on x86-64. Sorry.") 2 | -------------------------------------------------------------------------------- /Tests/package.lisp: -------------------------------------------------------------------------------- 1 | (defpackage :one-more-re-nightmare-tests 2 | (:use :cl) 3 | (:export #:run-tests #:regrind)) 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *~ 2 | *.fasl 3 | .* 4 | Documentation/*.aux 5 | Documentation/*.log 6 | Documentation/*.out 7 | Documentation/*.pdf 8 | Documentation/*.toc 9 | Documentation/*.ilg 10 | Documentation/*.ind 11 | Documentation/one-more-re-nightmare.tex 12 | Documentation/katex/* 13 | Documentation/one-more-re-nightmare/* -------------------------------------------------------------------------------- /Code/SIMD/package.lisp: -------------------------------------------------------------------------------- 1 | (defpackage :one-more-re-nightmare.vector-primops 2 | (:use) 3 | (:export #:v-and #:v-or #:v-not #:all-of 4 | #:v-and8 #:v-and32 #:v-or8 #:v-or32 #:v-not8 #:v-not32 5 | #:v32> #:v32= #:v8> #:v8= #:v8- 6 | #:v-broadcast32 #:v-movemask32 #:v-movemask8 #:v-broadcast8 7 | #:v-load32 #:v-load8 #:+v-length+ #:find-first-set)) 8 | -------------------------------------------------------------------------------- /Tests/one-more-re-nightmare-tests.asd: -------------------------------------------------------------------------------- 1 | (asdf:defsystem :one-more-re-nightmare-tests 2 | :author "Hayley Patton" 3 | :description "Tests for a regular expression compiler" 4 | :license "BSD 2-clause" 5 | :depends-on (:parachute :one-more-re-nightmare :lparallel :cl-cpus) 6 | :serial t 7 | :components ((:file "package") 8 | (:file "tests") 9 | (:file "regrind"))) 10 | -------------------------------------------------------------------------------- /Code/Compiler/layout.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | (defstruct layout 4 | "A structure representing the type and accessors for a vector of some sort." 5 | (array-type '(simple-array character 1)) 6 | (ref 'aref) 7 | (from-number 'code-char) 8 | (to-number 'char-code) 9 | (less '<) 10 | (less-or-equal '<=) 11 | (equal '=)) 12 | 13 | (defvar *default-layout* (make-layout)) 14 | -------------------------------------------------------------------------------- /Documentation/make-html.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | scribble --htmls one-more-re-nightmare.scrbl 4 | for p in one-more-re-nightmare/*.html; do 5 | sed -i -e 's/]*>//' \ 6 | -e 's/initial-scale=0.8/initial-scale=1.0/' \ 7 | -e "s/{}’/{}'/g" \ 8 | "$p" 9 | done 10 | cat spec-macros.css mobile-view.css >> one-more-re-nightmare/scribble-style.css 11 | -------------------------------------------------------------------------------- /Documentation/make-pdf.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | scribble --prefix tex-prefix.tex --latex one-more-re-nightmare.scrbl 4 | sed -i -e 's/^\\renewrmdefault//g' \ 5 | -e 's/^\\packageTxfonts//g' \ 6 | -e 's/\\hspace\*{\\fill}//g' \ 7 | -e 's/\\HR{}\\\\/\\HR{}/g' \ 8 | one-more-re-nightmare.tex 9 | sed -i -e 's/^\\\\//g' one-more-re-nightmare.tex 10 | pdflatex one-more-re-nightmare.tex 11 | makeindex one-more-re-nightmare 12 | pdflatex one-more-re-nightmare.tex 13 | -------------------------------------------------------------------------------- /Documentation/one-more-re-nightmare.scrbl: -------------------------------------------------------------------------------- 1 | #lang scribble/base 2 | 3 | @require[scribble/core scribble/html-properties "bibliography.rkt"] 4 | 5 | @title{The @emph{one-more-re-nightmare} Manual} 6 | @author{Applied Language} 7 | 8 | @table-of-contents[] 9 | 10 | @include-section["introduction.scrbl"] 11 | @include-section["interface.scrbl"] 12 | @include-section["linting.scrbl"] 13 | @include-section["posix.scrbl"] 14 | @generate-bibliography[] 15 | 16 | @index-section[] 17 | -------------------------------------------------------------------------------- /Documentation/README.md: -------------------------------------------------------------------------------- 1 | Run `./make-html.sh` to compile the documentation into HTML 2 | documents. Run `./make-pdf.sh` to compile the documentation into a PDF 3 | book. 4 | 5 | The documentation is written using the 6 | [Scribble](https://docs.racket-lang.org/scribble/index.html) markup 7 | language. Note that compiling a PDF book also requires a LaTeX 8 | installation. 9 | 10 | Much of the style of both formats was taken from the [Netfarm 11 | book](https://cal-coop.gitlab.io/netfarm/documentation/). 12 | -------------------------------------------------------------------------------- /Code/DFA-construction/interpret.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | (defun interpret-regular-expression (expression input) 4 | (loop for char across input 5 | for position from 0 6 | do (when (eq expression (empty-set)) 7 | (return-from interpret-regular-expression nil)) 8 | (print expression) 9 | (setf expression (derivative expression (symbol-set char)))) 10 | (if (eq expression (empty-set)) 11 | nil 12 | (nullable expression))) 13 | -------------------------------------------------------------------------------- /Documentation/tex-prefix.tex: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | 3 | \usepackage[utf8]{inputenc} 4 | \usepackage[T1]{fontenc} 5 | \usepackage{amsmath} 6 | \usepackage{pdflscape} 7 | \usepackage{changepage} 8 | \usepackage{ClearSans} 9 | \usepackage{charter} 10 | \usepackage[margin=1.5in,a4paper]{geometry} 11 | 12 | % Set up fonts 13 | \usepackage{titling} 14 | \usepackage{titlesec} 15 | \titleformat*{\section}{\Large\bfseries\sffamily} 16 | \titleformat*{\subsection}{\large\bfseries\sffamily} 17 | \titleformat*{\subsubsection}{\large\sffamily} 18 | -------------------------------------------------------------------------------- /Code/SIMD/one-more-re-nightmare-simd.asd: -------------------------------------------------------------------------------- 1 | (asdf:defsystem :one-more-re-nightmare-simd 2 | :author "Hayley Patton" 3 | :description "SIMD acceleration for tight loops in DFAs" 4 | :license "BSD 2-clause" 5 | :depends-on (:one-more-re-nightmare) 6 | :serial t 7 | :components ((:file "package") 8 | (:file "code-generation") 9 | (:file "sbcl-x86-64") 10 | #-(and sbcl x86-64) 11 | (:file "i-got-nothing") 12 | (:file "prefix") 13 | (:file "prefix-strategy") 14 | (:file "loop"))) 15 | -------------------------------------------------------------------------------- /Code/DFA-construction/empty.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | (trivia:defun-match re-empty-p (re) 4 | "Is a regular expression basically an empty string? 5 | This is different to NULLABLE, yes, NULLABLE would accept e.g. a* or anything that is a superset of { \"\" }, but this accepts only the empty string (± tags)." 6 | ((or (empty-string) (tag-set _)) t) 7 | ;; The innards of a grep vector. If we start an alternation with an 8 | ;; empty ALPHA, then the earliest scanner in the vector won. 9 | ((either (alpha (or (empty-set) (tag-set _)) _) _) t) 10 | ((alpha r s) 11 | (or (and (re-empty-p s) 12 | (eq (empty-set) r)) 13 | (re-empty-p r))) 14 | ((kleene r) 15 | (re-empty-p r)) 16 | ((or (join r s) (either r s) (both r s)) 17 | (and (re-empty-p r) (re-empty-p s))) 18 | (_ nil)) 19 | 20 | -------------------------------------------------------------------------------- /Code/DFA-construction/effects.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | (trivia:defun-match has-literals-p (re) 4 | ((literal _) t) 5 | ((either r s) 6 | (and (has-literals-p r) (has-literals-p s))) 7 | ((or (join r s) (both r s)) 8 | (or (has-literals-p r) (has-literals-p s))) 9 | ((repeat r min _ _) 10 | (and (has-literals-p r) (plusp min))) 11 | ((invert r) (not (has-literals-p r))) 12 | (_ nil)) 13 | 14 | (trivia:defun-match effects (re) 15 | ((tag-set s) s) 16 | ((join r s) 17 | (if (has-literals-p r) 18 | (effects r) 19 | (union (effects r) (effects s) 20 | :test #'equal))) 21 | ((or (either r s) (both r s)) 22 | (union (effects r) (effects s) 23 | :test #'equal)) 24 | ((invert r) (effects r)) 25 | ((repeat r _ _ _) (effects r)) 26 | ((alpha e _) (effects e)) 27 | ((grep v _) (effects v)) 28 | (_ '())) 29 | -------------------------------------------------------------------------------- /Documentation/mobile-view.css: -------------------------------------------------------------------------------- 1 | @media screen and (max-width: 900px) { 2 | /* Make the table of contents appear before the main content. */ 3 | .tocsub { 4 | display: none; 5 | } 6 | .tocset { 7 | position: initial; 8 | float: none; 9 | margin-bottom: 2em; 10 | } 11 | .tocview { 12 | background: transparent; 13 | } 14 | 15 | /* We don't need a left margin for the main content now. */ 16 | .maincolumn, .tocset { 17 | width: calc(100% - 4em - var(--card-padding)); 18 | margin-left: 2em; 19 | margin-right: 2em; 20 | } 21 | 22 | /* Put margin comments inline too. */ 23 | .refpara, .refelem, .SAuthorListBox { 24 | position: initial; 25 | float: none; 26 | left: 0; 27 | width: 100%; 28 | margin: 0; 29 | height: initial; 30 | } 31 | 32 | .refcolumn { 33 | position: initial; 34 | width: initial; 35 | margin: initial; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /Code/Compiler/optimize-settings.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | (defvar *optimize-settings* 4 | ;; SBCL SPEED doesn't seem to change anything, so we'll just try to 5 | ;; make the compiler run faster. 6 | #+sbcl '((speed 0) (safety 0) (compilation-speed 3) (debug 0)) 7 | ;; Clozure doesn't bother to inline or type infer with high 8 | ;; COMPILATION-SPEED. Fortunately it's still fast enough. 9 | #+ccl '((speed 3) (safety 0) (compilation-speed 0) (debug 0)) 10 | ;; ECL drops in performance when we increase compilation speed, but 11 | ;; the compiler is not noticeably faster. I blame the C compiler. 12 | #+ecl '((speed 3) (safety 0) (compilation-speed 0) (debug 0)) 13 | ;; It seems that we get better performance on ABCL with low 14 | ;; compilation speed, but there's a lot of noise still. 15 | #+abcl '((speed 3) (safety 0) (compilation-speed 0) (debug 0)) 16 | #-(or sbcl ccl ecl abcl) '((speed 3) (safety 0) (compilation-speed 3) (debug 0))) 17 | 18 | (defmacro with-naughty-compiler-switches (() &body body) 19 | #+sbcl 20 | `(let ((sb-c::*reoptimize-limit* 3)) 21 | ,@body) 22 | #-sbcl 23 | `(progn ,@body)) 24 | -------------------------------------------------------------------------------- /Code/Interface/convert-to-bytes.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | #+(or) 4 | (trivia:defun-match string->byte-re (re) 5 | ((literal set) 6 | (etypecase set 7 | (positive-symbol-set 8 | (reduce #'either 9 | (elements set) 10 | :key (lambda (character) 11 | (text (babel:string-to-octets 12 | (string character)))) 13 | :initial-value (empty-set))) 14 | (negative-symbol-set 15 | (reduce #'both 16 | (elements set) 17 | :key (lambda (character) 18 | (invert 19 | (text (babel:string-to-octets 20 | (string character))))) 21 | :initial-value (invert (empty-set)))))) 22 | ((join r s) (join (string->byte-re r) 23 | (string->byte-re s))) 24 | ((either r s) (either (string->byte-re r) 25 | (string->byte-re s))) 26 | ((invert r) (invert (string->byte-re r))) 27 | ((kleene r) (kleene (string->byte-re r))) 28 | ((both r s) (both (string->byte-re r) 29 | (string->byte-re s))) 30 | ((type string) 31 | (string->byte-re (parse-regular-expression re))) 32 | (_ re)) 33 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2020-2022 Hayley Patton 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are 5 | met: 6 | 7 | 1. Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 2. Redistributions in binary form must reproduce the above copyright 10 | notice, this list of conditions and the following disclaimer in the 11 | documentation and/or other materials provided with the distribution. 12 | 13 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 14 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 15 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 16 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 17 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 18 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 19 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 20 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 21 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | 25 | -------------------------------------------------------------------------------- /Code/SIMD/prefix.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | (trivia:defun-match %prefix (re) 4 | "Find the constant string prefix of a regular expression." 5 | ((empty-set) 6 | (values '() (empty-string))) 7 | ((literal set) 8 | (if (csum-has-classes-p set) 9 | (values '() (empty-string)) 10 | (values `((:literal ,set)) (empty-string)))) 11 | ((tag-set tags) 12 | (values `((:tags ,tags)) (empty-string))) 13 | ((join r s) 14 | (multiple-value-bind (p1 s1) 15 | (%prefix r) 16 | (cond 17 | ((eq (empty-string) s1) 18 | ;; Haven't hit something not constant, keep searching. 19 | (multiple-value-bind (p2 s2) 20 | (%prefix s) 21 | (values (append p1 p2) s2))) 22 | (t 23 | (values p1 (join s1 s)))))) 24 | ((alpha r _) 25 | (%prefix r)) 26 | (_ 27 | (values '() re))) 28 | 29 | (defun prefix (re) 30 | (multiple-value-bind (prefix suffix) 31 | (%prefix re) 32 | (values prefix 33 | ;; Glue the tag map back on. 34 | (join (tag-set 35 | (loop for (type tags) in prefix 36 | when (eql type :tags) 37 | append (loop for ((v r) . nil) in tags 38 | collect (cons (list v r) (list v r))))) 39 | suffix)))) 40 | -------------------------------------------------------------------------------- /Code/DFA-construction/nullable.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | (defvar *gensym-assignments?* t) 4 | 5 | (defun cached-nullable* (re) 6 | (if *gensym-assignments?* 7 | (cached-nullable re) 8 | (cached-nullable-no-gensym re))) 9 | 10 | (defun (setf cached-nullable*) (value re) 11 | (if *gensym-assignments?* 12 | (setf (cached-nullable re) value) 13 | (setf (cached-nullable-no-gensym re) value))) 14 | 15 | (defun nullable (re) 16 | "(language-of (nullable RE)) = (language-of (both RE (empty-string)))" 17 | (with-slot-consing (cached-nullable* re) 18 | (trivia:ematch re 19 | ((empty-string) (empty-string)) 20 | ((literal _) (empty-set)) 21 | ((join r s) (join (nullable r) (nullable s))) 22 | ((either r s) (either (nullable r) (nullable s))) 23 | ((repeat r min _ c) (let ((rn (if c (nullable r) (empty-set)))) 24 | (if (plusp min) 25 | (empty-set) 26 | (either rn (empty-string))))) 27 | ((both r s) (both (nullable r) (nullable s))) 28 | ((tag-set s) (tag-set (gensym-position-assignments s))) 29 | ((invert r) (if (eq (nullable r) (empty-set)) 30 | (empty-string) 31 | (empty-set))) 32 | ((grep r _) (nullable r)) 33 | ((alpha r history) (either (nullable r) history))))) 34 | -------------------------------------------------------------------------------- /Code/DFA-construction/derivative-classes.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | (defun merge-sets (sets1 sets2) 4 | "Produce a list of every subset of sets1 and sets2." 5 | (let ((sets (make-hash-table :test 'equal))) 6 | (loop for set1 in sets1 7 | do (loop for set2 in sets2 8 | for intersection = (csum-intersection set1 set2) 9 | do (setf (gethash intersection sets) t))) 10 | (alexandria:hash-table-keys sets))) 11 | 12 | 13 | (define-hash-consing-table *derivative-classes*) 14 | 15 | (defun derivative-classes (re) 16 | "Produce a list of the 'classes' (sets) of characters that compiling the regular expression would have to dispatch on." 17 | (with-hash-consing (*derivative-classes* re) 18 | (trivia:ematch re 19 | ((literal set) (list set (csum-complement set))) 20 | ((or (empty-string) 21 | (tag-set _)) 22 | (list +universal-set+)) 23 | ((join r s) 24 | (if (eq (nullable r) (empty-set)) 25 | (derivative-classes r) 26 | (merge-sets (derivative-classes r) 27 | (derivative-classes s)))) 28 | ((or (either r s) (both r s) 29 | (grep r s)) 30 | (merge-sets (derivative-classes r) 31 | (derivative-classes s))) 32 | ((or (invert r) (repeat r _ _ _)) 33 | (derivative-classes r)) 34 | ((alpha r _) 35 | (derivative-classes r))))) 36 | -------------------------------------------------------------------------------- /Code/package.lisp: -------------------------------------------------------------------------------- 1 | (when (find-package "NET.DIDIERVERNA.DECLT") 2 | (loop with request = "Get me off your 'reference manual' generator. " 3 | with nil = (write-line "Get Me Off Your 'Reference Manual' Generator 4 | " *debug-io*) 5 | for (section size) in '(("ABSTRACT" 10) ("1 INTRODUCTION" 25) 6 | (2 5) (2.1 20) (2.2 10) (3 25) (3.1 15) 7 | (3.2 15) (3.3 20) (3.4 15) (4 20) 8 | ("5 SUMMARY" 5)) 9 | do (if (stringp section) 10 | (format *debug-io* "~%~%~A~%" section) 11 | (format *debug-io* "~%~%~D ~:@(~A~)~%" section request)) 12 | (dotimes (i size) 13 | (write-string request *debug-io*) 14 | (when (and (> size 10) (zerop (random 16))) 15 | (terpri *debug-io*) (terpri *debug-io*)))) 16 | #+sbcl (sb-ext:quit) 17 | ;; close enough 18 | #-sbcl (loop collect (make-array 10))) 19 | 20 | (defpackage :one-more-re-nightmare 21 | (:use :cl) 22 | (:export #:compile-regular-expression #:compiled-regular-expression 23 | #:all-matches #:all-string-matches 24 | #:first-match #:first-string-match 25 | #:do-matches 26 | #:*state-limit* #:exceeded-state-limit 27 | #:lint-style-warning 28 | #:not-matchable-style-warning 29 | #:matching-too-much-style-warning)) 30 | -------------------------------------------------------------------------------- /Code/SIMD/code-generation.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | (defvar *bits*) 4 | (defvar *broadcasts*) 5 | (defun find-broadcast (value) 6 | (or (gethash value *broadcasts*) 7 | (setf (gethash value *broadcasts*) 8 | (make-symbol (format nil "BROADCAST-~d" value))))) 9 | 10 | (defun test-from-isum (variable isum) 11 | (translate-scalar-code variable (make-test-form isum variable))) 12 | 13 | (defun code-from-prefix (prefix) 14 | (assert (not (null prefix)) () "Why /even bother/ with a zero-length prefix?") 15 | (let ((tests '()) 16 | (loads '()) 17 | (assignments '()) 18 | (n 0)) 19 | (dolist (part prefix) 20 | (trivia:ematch part 21 | ((list :literal isum) 22 | (let ((name (make-symbol (format nil "LOAD-~d" n)))) 23 | (trivia:ematch (test-from-isum name isum) 24 | (:always) 25 | (:never (error "This should never happen - the empty set has no prefix!")) 26 | (test 27 | (push `(,name (,(find-op "LOAD") vector (the fixnum (+ ,n start)))) 28 | loads) 29 | (push test tests)))) 30 | (incf n)) 31 | ((list :tags tags) 32 | (push `(let ((position (the fixnum (+ ,n position)))) 33 | ,(setf-from-assignments tags)) 34 | assignments)))) 35 | (values 36 | (one-more-re-nightmare.vector-primops:all-of tests) 37 | loads 38 | `(progn ,@(reverse assignments)) 39 | n))) 40 | -------------------------------------------------------------------------------- /Documentation/spec-macros.tex: -------------------------------------------------------------------------------- 1 | % The "type" part of @defthing. 2 | \newcommand{\FloatRight}[1]{\nobreak\hfill\enspace #1} 3 | \newcommand{\Param}[1]{\mbox{#1}} 4 | % For fancy looking definition section headers. 5 | \newcommand{\SmallCaps}[1]{\textsc{#1}} 6 | 7 | \renewenvironment{SInsetFlow}{% 8 | \hfill\begin{minipage}{\dimexpr\textwidth-2em}% 9 | }{ 10 | \end{minipage}\\ 11 | } 12 | 13 | \newenvironment{Definitions}{% 14 | \vspace{0.5em} 15 | \begin{minipage}{\textwidth} 16 | \setlength{\parskip}{0.5em} 17 | \noindent{\rule{\linewidth}{0.4pt}} 18 | }{ 19 | \end{minipage} 20 | } 21 | 22 | % Definition lines 23 | % Thanks to https://tex.stackexchange.com/questions/312581/how-to-calculate-width-of-remaining-part-of-line 24 | % for more or less showing how to get this layout working. 25 | 26 | \newsavebox{\defunnamebox} 27 | \newlength{\defunnamewidth} 28 | 29 | \newcommand{\DefunName}[1]{% 30 | \sbox{\defunnamebox}{\strut \textbf{#1}}% 31 | \global\defunnamewidth=\wd\defunnamebox 32 | \usebox{\defunnamebox}% 33 | } 34 | \newcommand{\LambdaList}[1]{% 35 | \parbox[t]{\dimexpr\columnwidth-\defunnamewidth-\fboxsep}{ 36 | \raggedright\strut\emph{#1} 37 | }% 38 | } 39 | 40 | \newenvironment{Landscape}{% 41 | \begin{landscape} 42 | }{ 43 | \end{landscape} 44 | } 45 | 46 | \newenvironment{CenteredContainer}{}{} 47 | 48 | % Okay, not really centering, but I don't know if I really want centering, 49 | % or just a big indent. 50 | \newenvironment{CenteredBlock}{% 51 | \hfill\begin{minipage}{\dimexpr\textwidth-6em}\raggedright 52 | }{ 53 | \end{minipage} 54 | } 55 | 56 | \setlength{\parindent}{0pt} 57 | 58 | % Have each section start on a new page. 59 | \let\oldsection\section 60 | \renewcommand\section{\clearpage\oldsection} 61 | \setlength{\parskip}{0.5em} -------------------------------------------------------------------------------- /Documentation/bibliography.rkt: -------------------------------------------------------------------------------- 1 | #lang racket 2 | 3 | (require scriblib/autobib) 4 | (provide generate-bibliography ~cite 5 | derivatives apl3000 derivatives-reexamined 6 | rte petalisp cox posix) 7 | 8 | (define-cite ~cite citet generate-bibliography) 9 | 10 | (define-syntax-rule (define-bib name title author date rest ...) 11 | (define name 12 | (make-bib #:title title 13 | #:author author 14 | #:date date 15 | rest ...))) 16 | 17 | 18 | (define-bib derivatives 19 | "Derivatives of Regular Expressions" 20 | "Janus A. Brzozowski" 21 | 1964 22 | #:url "https://dl.acm.org/doi/10.1145/321239.321249") 23 | 24 | (define-bib apl3000 25 | "The dynamic incremental compiler of APL\\3000" 26 | "Ronald L. Johnston" 27 | 1979 28 | #:url "http://www.softwarepreservation.org/projects/apl/Papers/DYNAMICINCREMENTAL") 29 | 30 | (define-bib derivatives-reexamined 31 | "Regular-expression derivatives reexamined" 32 | (authors "Scott Owens" "John Reppy" "Aaron Turon") 33 | 2009 34 | #:url "https://www.ccs.neu.edu/home/turon/re-deriv.pdf") 35 | 36 | (define-bib rte 37 | "Type-Checking of Heterogeneous Sequences in Common Lisp" 38 | (authors "Jim Newton" "Akim Demaille" "Didier Verna") 39 | 2016 40 | #:url "https://hal.archives-ouvertes.fr/hal-01380792/document") 41 | 42 | (define-bib petalisp 43 | "Petalisp: A Common Lisp Library for Data Parallel Programming" 44 | "Marco Heisig" 45 | 2018 46 | #:url "https://dl.acm.org/doi/10.5555/3323215.3323216") 47 | 48 | (define-bib cox 49 | "Regular Expression Matching: the Virtual Machine Approach" 50 | "Russ Cox" 51 | 2009 52 | #:url "https://swtch.com/~rsc/regexp/regexp2.html") 53 | 54 | (define-bib posix 55 | "Regular expressions" 56 | "IEEE" 57 | 2018 58 | #:url "https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html") 59 | -------------------------------------------------------------------------------- /Documentation/introduction.scrbl: -------------------------------------------------------------------------------- 1 | #lang scribble/base 2 | @require["spec-macros.scrbl" "bibliography.rkt"] 3 | 4 | @title{Introduction} 5 | 6 | one-more-re-nightmare is a new regular expression compiler implemented 7 | in Common Lisp. 8 | 9 | The aim is to generate code that will perform as well, if not better, 10 | than hand-written searching code. We believe we have achieved this 11 | aim; as finite state machine generation removes redundant tests that 12 | would not typically be removed by a human programmer, compilation is 13 | performed for specific array types, and the compiler can also make use 14 | of SIMD intrinsics for simple loops. We also intend to produce 15 | excellent analysis, to help with writing regular expressions that do 16 | the "right thing". 17 | 18 | @section{Prior work} 19 | 20 | The derivative approach was introduced by the late Janusz Brzozowski 21 | in @~cite[derivatives]. 22 | 23 | The name of the library is due to the song @term{One More Red 24 | Nightmare} by the band King Crimson in 1974. 25 | 26 | Ronald Johnston wrote on the APL\3000 compiler in @~cite[apl3000], 27 | which produced specialised code for different array layouts on demand. 28 | 29 | Scott Owens, John Reppy and Aaron Turon showed how to produce finite 30 | state machines that are very close to minimal in size in 31 | @~cite[derivatives-reexamined], using the derivative approach. This 32 | minimality was achieved by using additional rewrite rules, allowing 33 | the machine generation process to reuse more states, instead of 34 | producing more redundant states. 35 | 36 | Jim Newton compiled finite state machines to Common Lisp code in 37 | @~cite[rte], to implement @term{regular type expressions}. 38 | 39 | We were first introduced to using the Common Lisp compiler as a 40 | backend for a @term{just-in-time compiler} by the Petalisp language by 41 | Marco Heisig @~cite[petalisp]. 42 | 43 | Gilbert Baumann described how to implement submatching using 44 | derivatives in a currently unpublished paper. 45 | -------------------------------------------------------------------------------- /Code/one-more-re-nightmare.asd: -------------------------------------------------------------------------------- 1 | (asdf:defsystem :one-more-re-nightmare 2 | :author "Hayley Patton" 3 | :description "A regular expression compiler" 4 | :license "BSD 2-clause" 5 | :depends-on (:trivia :alexandria :babel 6 | :esrap :trivial-indent 7 | :dynamic-mixins :stealth-mixin 8 | :bordeaux-threads) 9 | :serial t 10 | :components ((:file "package") 11 | (:module "DFA-construction" 12 | :components ((:file "type") 13 | (:file "sets") 14 | (:file "re-types") 15 | (:file "nullable") 16 | (:file "tag-sets") 17 | (:file "derivative") 18 | (:file "derivative-classes") 19 | (:file "empty") 20 | (:file "effects") 21 | (:file "similar") 22 | (:file "make-dfa"))) 23 | (:module "Compiler" 24 | :components ((:file "layout") 25 | (:file "compilation-strategy") 26 | (:file "length-inference") 27 | (:file "optimize-settings") 28 | (:file "code-generation"))) 29 | (:module "Interface" 30 | :components ((:file "syntax") 31 | (:file "convert-to-bytes") 32 | (:file "code-cache") 33 | (:file "lint") 34 | (:file "interface")))) 35 | ;; Trivia emits warnings at compile-time that type-i can't infer 36 | ;; types for some patterns. 37 | :around-compile (lambda (thunk) 38 | (handler-bind ((warning 39 | (lambda (c) 40 | (when (typep c (find-symbol "FAILED-TYPE-INFERENCE" "TYPE-I")) 41 | (muffle-warning))))) 42 | (funcall thunk)))) 43 | -------------------------------------------------------------------------------- /Code/Interface/code-cache.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | (defvar *code-cache* 4 | (make-hash-table :test 'equal)) 5 | (defvar *code-lock* 6 | (bt:make-lock "Compiled code cache lock")) 7 | 8 | (defstruct (compiled-regular-expression (:conc-name cre-)) 9 | (codes (alexandria:required-argument) :type simple-vector) 10 | original-re) 11 | (deftype re-designator () 12 | `(or compiled-regular-expression string)) 13 | 14 | (defvar *string-types* 15 | (alexandria:map-product #'list 16 | '(simple-array array) 17 | '(character base-char) 18 | '(1))) 19 | 20 | (defun compile-regular-expression (expression) 21 | (make-compiled-regular-expression 22 | :codes (coerce 23 | (loop for type in *string-types* 24 | collect (find-code expression type)) 25 | 'vector) 26 | :original-re expression)) 27 | 28 | (defmethod print-object ((cre compiled-regular-expression) stream) 29 | (print-unreadable-object (cre stream :type t) 30 | (write-string (cre-original-re cre) stream))) 31 | 32 | (defvar *type-dispatcher* 33 | (compile nil 34 | `(lambda (cre type) 35 | (cond 36 | ,@(loop for type in *string-types* 37 | for index from 0 38 | collect `((eq type ',type) (svref (cre-codes cre) ,index))))))) 39 | 40 | (defun find-code (regular-expression type-specifier) 41 | (when (compiled-regular-expression-p regular-expression) 42 | (return-from find-code 43 | (funcall *type-dispatcher* regular-expression type-specifier))) 44 | (bt:with-lock-held (*code-lock*) 45 | (multiple-value-bind (code present?) 46 | (gethash (list regular-expression type-specifier) 47 | *code-cache*) 48 | (when present? 49 | (return-from find-code code)))) 50 | (multiple-value-bind (function groups) 51 | (%compile-regular-expression 52 | regular-expression 53 | :layout (make-layout :array-type type-specifier)) 54 | (bt:with-lock-held (*code-lock*) 55 | (setf (gethash (list (copy-seq regular-expression) 56 | type-specifier) 57 | *code-cache*) 58 | (cons function groups))))) 59 | 60 | (defun string-type-of (string) 61 | (loop for type in *string-types* 62 | when (typep string type) 63 | do (return-from string-type-of type)) 64 | (error 'type-error 65 | :datum string 66 | :expected-type 'string)) 67 | -------------------------------------------------------------------------------- /Code/SIMD/loop.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | (defclass simd-loop (simd-info) ()) 4 | 5 | (defun assignments-idempotent-p (assignments) 6 | "Are the assignments idempotent, i.e. would repeated applications of the assignments, interleaved with incrementing the position, be the same as applying the assignments once at the end?" 7 | ;; We can't form un-idempotent assignments unless we end up writing 8 | ;; a variable we also read. But apparently optimising loops that 9 | ;; carry over variables, i.e. aren't for the start, makes 10 | ;; performance worse. Weird. 11 | #-one-more-re-nightmare::precise-aip 12 | (null assignments) 13 | #+one-more-re-nightmare::precise-aip 14 | (progn 15 | (setf assignments 16 | (loop for assignment in assignments 17 | for (target . source) = assignment 18 | unless (equal target source) 19 | collect assignment)) 20 | (null (intersection (mapcar #'car assignments) 21 | (mapcar #'cdr assignments) 22 | :test #'equal)))) 23 | 24 | (defmethod transition-code ((strategy simd-loop) previous-state transition) 25 | (let* ((next-state (transition-next-state transition)) 26 | (next-expression (state-expression next-state))) 27 | ;; We optimise tight loops like A -> A. 28 | (when (or (re-stopped-p next-expression) 29 | (re-empty-p next-expression) 30 | (not (eq next-state previous-state)) 31 | (not (assignments-idempotent-p 32 | (transition-tags-to-set transition))) 33 | (csum-has-classes-p (transition-class transition))) 34 | (return-from transition-code (call-next-method))) 35 | ;; Try to skip to the first character after for which this transition doesn't apply. 36 | (let* ((vector-length (/ one-more-re-nightmare.vector-primops:+v-length+ *bits*))) 37 | (trivia:ematch 38 | (test-from-isum 'loaded 39 | (csum-complement (transition-class transition))) 40 | (:never (call-next-method)) 41 | (:always (error "Found a transition that is never taken.")) 42 | (test 43 | `(progn 44 | ;; Don't try to read over the END we were given. 45 | (loop 46 | (unless (< (the fixnum (+ ,vector-length position)) end) 47 | (return)) 48 | (let* ((loaded (,(find-op "LOAD") vector position)) 49 | (test (,(find-op "MOVEMASK") ,test))) 50 | (unless (zerop test) 51 | (incf position (one-more-re-nightmare.vector-primops:find-first-set test)) 52 | (return))) 53 | (incf position ,vector-length)) 54 | ,(call-next-method))))))) 55 | -------------------------------------------------------------------------------- /Tests/regrind.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare-tests) 2 | 3 | ;;;; It's regrind time! 4 | 5 | (defvar *start*) 6 | (defvar *end*) 7 | 8 | (defun checked-string-ref (string index) 9 | (assert (and (<= *start* index) (< index *end*))) 10 | (aref string index)) 11 | 12 | (defvar *layout* 13 | (one-more-re-nightmare::make-layout 14 | :ref 'checked-string-ref)) 15 | 16 | (defvar *remaining-depth* 4) 17 | (defun random-re () 18 | (macrolet ((terminal () 19 | ;; A random element of [A-Z]. 20 | '(string (code-char (+ 65 (random 26))))) 21 | (recurse (control n) 22 | `(format nil ,control ,@(loop repeat n collect '(random-re))))) 23 | (if (zerop *remaining-depth*) 24 | (terminal) 25 | (let ((*remaining-depth* (1- *remaining-depth*))) 26 | (case (random 8) 27 | (0 (terminal)) 28 | (1 (recurse "~a~a" 2)) 29 | (2 (recurse "(~a)" 1)) 30 | (3 (recurse "«~a»" 1)) 31 | (4 (recurse "(~a)|(~a)" 2)) 32 | (5 (recurse "(~a)&(~a)" 2)) 33 | (6 (recurse "(¬~a)" 1)) 34 | (7 (recurse "(~a)*" 1))))))) 35 | 36 | (defun random-haystack () 37 | (let* ((n (random 80)) 38 | (haystack (make-string n))) 39 | (dotimes (i n) 40 | (setf (char haystack i) (code-char (+ 65 (random 26))))) 41 | haystack)) 42 | 43 | (defun ensure-lparallel-kernel () 44 | (when (null lparallel:*kernel*) 45 | (setf lparallel:*kernel* 46 | (lparallel:make-kernel (cpus:get-number-of-processors))))) 47 | 48 | (defun regrind (n &key (depth 4)) 49 | (let ((success t)) 50 | (ensure-lparallel-kernel) 51 | (lparallel:pdotimes (i n success n) 52 | (let* ((*remaining-depth* depth) 53 | #+sbcl (one-more-re-nightmare::*code-type* :interpreted) 54 | (re (random-re)) 55 | (haystack (random-haystack))) 56 | (handler-case 57 | (one-more-re-nightmare::%compile-regular-expression 58 | re 59 | :layout *layout*) 60 | (error (e) 61 | (format t "~&Compiling ~s fails with:~&~a" re e) 62 | (setf success nil)) 63 | (:no-error (code registers) 64 | (let ((result (make-array registers)) 65 | (*start* 0) 66 | (*end* (length haystack))) 67 | (handler-case 68 | (funcall code haystack 0 (length haystack) result 69 | (lambda () 70 | (loop for p across result 71 | do (assert (or (null p) (<= 0 p *end*)))))) 72 | (error (e) 73 | (format t "~&Matching ~s on the haystack ~s fails with:~&~a" re haystack e) 74 | (setf success nil)) 75 | (:no-error (&rest stuff) 76 | (declare (ignore stuff)) 77 | (when (zerop (mod i 100)) 78 | (write-char #\.) 79 | (finish-output))))))))))) 80 | -------------------------------------------------------------------------------- /Code/Compiler/compilation-strategy.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | (defvar *compiler-state*) 4 | (defvar *layout*) 5 | 6 | (defclass strategy () 7 | () 8 | (:documentation "A compilation strategy describes how potential matches should be searched for.")) 9 | 10 | (defgeneric initial-states (strategy expression) 11 | (:documentation "Compute a list of states to start compiling from.")) 12 | (defgeneric macros-for-strategy (strategy) 13 | (:documentation "A list of macros (at least using including WIN and RESTART) to use for compilation.") 14 | (:method-combination append)) 15 | (defgeneric lambda-list (strategy) 16 | (:documentation "The lambda list of the function to generate.")) 17 | (defgeneric start-code (strategy states) 18 | (:documentation "Part of a TAGBODY body used to start running a DFA.")) 19 | (defgeneric transition-code (strategy previous-state transition)) 20 | (defgeneric declarations (strategy)) 21 | 22 | (defclass scan-everything (strategy) 23 | () 24 | (:documentation "A compilation strategy which runs a regular expression vector over every position.")) 25 | 26 | (defclass call-continuation (strategy) 27 | () 28 | (:documentation "A compilation strategy which calls a continuation when a match is found.")) 29 | 30 | (defun make-default-strategy (layout expression) 31 | (declare (ignore layout expression)) 32 | (make-instance (dynamic-mixins:mix 'scan-everything 'call-continuation))) 33 | 34 | (defun add-tags (expression) 35 | (join (tag-set '(((start 0) . position))) 36 | (join expression 37 | (tag-set '(((end 0) . position)))))) 38 | 39 | (defun make-search-machine (expression) 40 | ;; We add an ALPHA wrapper to store the last end point when we 41 | ;; succeed but have repetition, and a GREP wrapper to make sure we 42 | ;; continue when we fail to match. 43 | (let ((a (alpha (add-tags expression) (empty-set)))) 44 | (grep a a))) 45 | 46 | (defmethod initial-states ((strategy scan-everything) expression) 47 | (list (make-search-machine expression))) 48 | 49 | (defmethod macros-for-strategy append ((strategy scan-everything)) 50 | '((restart () 51 | '(go start)))) 52 | 53 | (defmethod macros-for-strategy append ((strategy call-continuation)) 54 | '((win (&rest variables) 55 | `(progn 56 | ;; We have to compensate for the DFA assigning tags 1 too late 57 | ;; here. 58 | ,@(loop for (nil variable) in variables 59 | for n from 0 60 | collect `(setf (svref result-vector ,n) 61 | ,(if (symbolp variable) 62 | `(1- ,variable) 63 | variable))) 64 | (go win))))) 65 | 66 | (defmethod lambda-list ((strategy call-continuation)) 67 | '(vector start end result-vector continuation)) 68 | 69 | (defmethod declarations ((strategy call-continuation)) 70 | `((,(layout-array-type *layout*) vector) 71 | (alexandria:array-index start end) 72 | (function continuation) 73 | (simple-vector result-vector))) 74 | -------------------------------------------------------------------------------- /Documentation/posix.scrbl: -------------------------------------------------------------------------------- 1 | #lang scribble/base 2 | 3 | @require["spec-macros.scrbl" "bibliography.rkt"] 4 | @title{Differences between POSIX and PCRE regexen} 5 | 6 | Many regular expression engines implement PCRE-like regular expression 7 | semantics, but one-more-re-nightmare implements POSIX semantics. While 8 | the syntax of POSIX regular expressions is a subset of that of PCRE, the 9 | semantics can differ drastically. 10 | 11 | We use one-more-re-nightmare and cl-ppcre to show the differences in 12 | some experiments; the latter is probably the most commonly used 13 | regular expression engine for Common Lisp. 14 | 15 | @section{The longest match wins} 16 | 17 | When faced with an alternation, PCRE regular expressions prefer the 18 | left-most option, whereas POSIX regular expressions prefer the longest. 19 | 20 | @lisp-code{ 21 | CL-USER> (one-more-re-nightmare:first-string-match "ant|antler" "antler") 22 | #("antler") 23 | CL-USER> (cl-ppcre:scan-to-strings "ant|antler" "antler") 24 | "ant" 25 | #() 26 | } 27 | 28 | Of course, the results are the same when the options are sorted 29 | longest to shortest. 30 | 31 | @lisp-code{ 32 | CL-USER> (cl-ppcre:scan-to-strings "antler|ant" "antler") 33 | "antler" 34 | #() 35 | } 36 | 37 | @section{The longest submatch wins} 38 | 39 | The results of the longest match rule produces interesting results 40 | when combined with submatches. 41 | 42 | @lisp-code{ 43 | CL-USER> (one-more-re-nightmare:first-string-match "«a*»*" "aaa") 44 | #("aaa" "aaa") 45 | CL-USER> (cl-ppcre:scan-to-strings "(a*)*" "aaa") 46 | "aaa" 47 | #("") 48 | } 49 | 50 | POSIX prefers matching everything, whereas PCRE prefers matching the empty 51 | string. The behaviour of PCRE can be fixed by requiring at least one iteration, 52 | by @cl{(a+)*}, but this is not necessary with POSIX. 53 | 54 | @lisp-code{ 55 | CL-USER> (cl-ppcre:scan-to-strings "(a+)*" "aaa") 56 | "aaa" 57 | #("aaa") 58 | } 59 | 60 | Another interesting (but more contrived) example is @cl{(a|aa)*}. 61 | 62 | @lisp-code{ 63 | CL-USER> (one-more-re-nightmare:first-string-match "«a|aa»*" "aaa") 64 | #("aaa" "a") 65 | CL-USER> (one-more-re-nightmare:first-string-match "«a|aa»*" "aaaa") 66 | #("aaaa" "aa") 67 | 68 | CL-USER> (cl-ppcre:scan-to-strings "(a|aa)*" "aaa") 69 | "aaa" 70 | #("a") 71 | CL-USER> (cl-ppcre:scan-to-strings "(a|aa)*" "aaaa") 72 | "aaaa" 73 | #("a") 74 | } 75 | 76 | Whereas PCRE always takes the left option (@cl{a}), POSIX attempts to 77 | take the right option (@cl{aa}) when possible. The result is that POSIX 78 | produces the submatch @cl{aa} when the string has an even (and 79 | non-zero) number of @cl{a}, and @cl{a} when the string has an odd 80 | number of @cl{a}. 81 | 82 | (We think that the "Pike virtual machine", popularised by a series of 83 | articles by Russ Cox @~cite[cox], cannot implement POSIX submatching 84 | semantics, despite its use in some implementations which claim POSIX 85 | compatibility; but we don't have a real proof yet. The basic idea is 86 | that the Pike machine prefers the shortest match that reaches a 87 | particular state, and thus only the shortest submatch is ever saved. 88 | The GNU libc implementation of regular expressions has the same 89 | behaviour as CL-PPCRE on @cl{(a|aa)*} for example.) 90 | -------------------------------------------------------------------------------- /Code/DFA-construction/similar.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | (define-condition no-match () ()) 4 | 5 | (defvar *environment*) 6 | (defvar *backward-environment*) 7 | 8 | (defun assert-equivalent (from to) 9 | (when (or (and (eql from 'position) (eql to 'position)) 10 | (and (eql from 'nil) (eql to 'nil))) 11 | ;; Don't care for POSITION ~ POSITION or NIL ~ NIL 12 | (return-from assert-equivalent)) 13 | (multiple-value-bind (old-to present?) 14 | (gethash from *environment*) 15 | (when (and present? (not (equal to old-to))) 16 | (error 'no-match)) 17 | (multiple-value-bind (old-from present?) 18 | (gethash to *backward-environment*) 19 | (when (and present? (not (equal from old-from))) 20 | (error 'no-match)) 21 | (setf (gethash from *environment*) to 22 | (gethash to *backward-environment*) from)))) 23 | 24 | (defun assert-equivalent-sources (from to) 25 | "Ensure that we don't unify POSITION and a variable, or two different variables." 26 | (when (eql from to) 27 | (return-from assert-equivalent-sources)) 28 | (unless (and (listp from) (listp to)) 29 | (error 'no-match)) 30 | (unless (eql (first from) (first to)) 31 | (error 'no-match)) 32 | (assert-equivalent from to)) 33 | 34 | (trivia:defun-match* %similar (from to) 35 | (((both r1 s1) (both r2 s2)) 36 | (%similar r1 r2) 37 | (%similar s1 s2)) 38 | (((either r1 s1) (either r2 s2)) 39 | (%similar r1 r2) 40 | (%similar s1 s2)) 41 | (((join r1 s1) (join r2 s2)) 42 | (%similar r1 r2) 43 | (%similar s1 s2)) 44 | (((repeat r1 min1 max1 c1) (repeat r2 min2 max2 c2)) 45 | (unless (and (eql min1 min2) (eql max1 max2) (eql c1 c2)) 46 | (error 'no-match)) 47 | (%similar r1 r2)) 48 | (((invert r1) (invert r2)) 49 | (%similar r1 r2)) 50 | (((literal set1) (literal set2)) 51 | (unless (eq set1 set2) 52 | (error 'no-match))) 53 | (((empty-string) (empty-string))) 54 | (((tag-set set1) (tag-set set2)) 55 | (unless (= (length set1) (length set2)) 56 | (error 'no-match)) 57 | ;; This happens to work nicely as we know that NULLABLE 58 | ;; will never re-order substitutions in TAG-SETs. 59 | (loop for ((v1 r1) . s1) in set1 60 | for ((v2 r2) . s2) in set2 61 | do (unless (eql v1 v2) 62 | (error 'no-match)) 63 | (assert-equivalent-sources s1 s2) 64 | (assert-equivalent (list v1 r1) 65 | (list v2 r2)))) 66 | (((grep r1 _) (grep r2 _)) 67 | ;; We don't actually need to do anything for GREP prototypes; they 68 | ;; never cause effects, and they never change while generating a 69 | ;; DFA. Unifying them just causes unnecessary failures. 70 | (%similar r1 r2)) 71 | (((alpha r1 n1) (alpha r2 n2)) 72 | ;; Though we do need to unify the history/nullable part of ALPHA. 73 | (%similar r1 r2) 74 | (%similar n1 n2)) 75 | ((_ _) (error 'no-match))) 76 | 77 | (defun similar (from to) 78 | (let ((*environment* (make-hash-table :test 'equal)) 79 | (*backward-environment* (make-hash-table :test 'equal))) 80 | (handler-case 81 | (values (%similar from to) t) 82 | (no-match () 83 | nil) 84 | (:no-error (&rest r) 85 | (declare (ignore r)) 86 | *environment*)))) 87 | -------------------------------------------------------------------------------- /Code/DFA-construction/print-dfa.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | ;;; The DFA we are printing. 4 | (defvar *dfa*) 5 | (defvar *initial-state*) 6 | (defvar *print-state-names* t) 7 | 8 | (defun generate-dot-for-expression (expression &key (search? t) (name-states? t)) 9 | (uiop:with-temporary-file (:pathname p :stream s :keep t :type "dot") 10 | (with-hash-consing-tables () 11 | (let* ((*tag-gensym-counter* 0) 12 | (*print-state-names* name-states?) 13 | (regex (parse-regular-expression expression)) 14 | (initial-state 15 | (if search? (make-search-machine regex) regex)) 16 | (dfa (make-dfa-from-expression initial-state))) 17 | (print-dfa dfa initial-state s))) 18 | p)) 19 | 20 | (defun print-dfa (dfa initial-state &optional (stream *standard-output*)) 21 | (let ((*dfa* dfa) 22 | (*initial-state* (gethash initial-state dfa))) 23 | (cl-dot:print-graph 24 | (cl-dot:generate-graph-from-roots 25 | 'dfa 26 | (list (gethash initial-state dfa)) 27 | `(:node (:fontname "Inconsolata" :shape ,(if *print-state-names* :box :circle)) 28 | :edge (:fontname "Inconsolata"))) 29 | :stream stream))) 30 | 31 | (defmethod cl-dot:graph-object-node ((graph (eql 'dfa)) (state (eql 'nothing))) 32 | (make-instance 'cl-dot:node 33 | :attributes (list :label " " 34 | :color "#00000000" 35 | :fillcolor "#00000000"))) 36 | 37 | (defmethod cl-dot:graph-object-node ((graph (eql 'dfa)) state) 38 | ;; We add a newline to state names so that cl-dot will emit the 39 | ;; trailing \l, which is necessary even when there isn't another 40 | ;; line break. 41 | (make-instance 'cl-dot:node 42 | :attributes (append 43 | (if *print-state-names* 44 | (list :label (list :left (format nil "~A~%" (state-expression state)))) 45 | '()) 46 | (list :style (if (eq (empty-set) (nullable (state-expression state))) 47 | :solid :bold))))) 48 | 49 | (defun trim-assignments-for-show (assignments) 50 | (let ((new-assignments 51 | (loop for assignment in assignments 52 | for (target . source) = assignment 53 | unless (equal target source) 54 | collect assignment))) 55 | (if (null new-assignments) 56 | "" 57 | (tag-set new-assignments)))) 58 | 59 | (defmethod cl-dot:graph-object-edges ((graph (eql 'dfa))) 60 | (let ((edges (list (list 'nothing *initial-state*)))) 61 | (maphash (lambda (re state) 62 | (declare (ignore re)) 63 | (dolist (transition (state-transitions state)) 64 | (push (list state (transition-next-state transition) 65 | (list :label 66 | (format nil "~a ~a" 67 | (with-output-to-string (s) 68 | (print-csum 69 | (transition-class transition) 70 | s)) 71 | (trim-assignments-for-show 72 | (transition-tags-to-set transition))))) 73 | edges))) 74 | *dfa*) 75 | edges)) 76 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # one-more-re-nightmare 2 | 3 | one-more-re-nightmare is a regular expression engine that uses the 4 | technique presented in [Regular-expression derivatives 5 | revisited](https://www.ccs.neu.edu/home/turon/re-deriv.pdf) to 6 | interpret and compile regular expressions. We use a few tricks to make 7 | matching quite fast: 8 | 9 | - We use a deterministic finite automaton to have O(n) runtime. 10 | - We run the Common Lisp compiler to generate machine code, rather 11 | than interpreting a DFA or bytecode, or jumping through closures 12 | (like CL-PPCRE does). 13 | - We generate specialised code for each array type, so everything is 14 | inlined. 15 | - If you use the `one-more-re-nightmare-simd` system on SBCL 2.1.10 or 16 | newer with AVX2, we even use vectorised scanning of constant 17 | prefixes of regular expressions. 18 | 19 | Thanks to Gilbert Baumann for suggesting I use derivatives to compile 20 | regular expressions, and then for informing me of how to handle 21 | submatching properly, and my discrete mathematics teachers for 22 | formally introducing me to finite state machines. 23 | 24 | Please see [the reference 25 | book](https://applied-langua.ge/projects/one-more-re-nightmare/) for 26 | how to use one-more-re-nightmare, or [an 27 | article](https://applied-langua.ge/posts/omrn-compiler.html) on the 28 | history and theory involved. 29 | 30 | While the syntax is admittedly wonky (but somewhat more like how 31 | regular expressions are presented in papers), one-more-re-nightmare 32 | makes its best effort to implement POSIX semantics for matching (as 33 | described in the specification for [how `regcomp` 34 | works](https://pubs.opengroup.org/onlinepubs/9699919799/functions/regexec.html) 35 | and [regular expression 36 | definitions](https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html)). Any 37 | behaviour contrary to POSIX is a bug. 38 | 39 | ## A lousy benchmark 40 | 41 | ```lisp 42 | CL-USER> (let ((s (make-string 1000000 :initial-element #\a))) 43 | (setf (aref s 333333) #\b) 44 | (setf (aref s 555555) #\c) 45 | (the-cost-of-nothing:bench 46 | (all-string-matches "ab|ac" s))) 47 | 48 | CL-USER> (let ((s (make-string 1000000 :initial-element #\a))) 49 | (setf (aref s 333333) #\b) 50 | (setf (aref s 555555) #\c) 51 | (the-cost-of-nothing:bench 52 | (cl-ppcre:all-matches-as-strings "ab|ac" s))) 53 | ``` 54 | 55 | Note that, by nature of calling the Common Lisp compiler, 56 | one-more-re-nightmare will take longer to compile a regular 57 | expression, so it is better suited for many matching operations with 58 | few expressions. It does cache compiled expressions when using the 59 | high-level interface, so the initial cost may amortize well over many 60 | calls; and constant regular expression strings are compiled at 61 | compile-time, with no runtime overhead whatsoever. 62 | 63 | | engine | SBCL | Clozure CL | SBCL with AVX2 | ditto, SIMPLE-BASE-STRING | 64 | |------------------|-----------|------------|----------------|---------------------------| 65 | | o-m-r-n | 0.57ms | 2.93ms | 0.18ms | 55µs | 66 | | compilation time | 4.65ms | 3.76ms | 6.82ms | 6.43ms | 67 | | cl-ppcre | 22.8ms | 45.3ms | 22.8ms | 21.6ms | 68 | | break even after | 209kchars | 88.7kchars | 301kchars | 305kchars | 69 | -------------------------------------------------------------------------------- /Code/DFA-construction/derivative.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | (define-hash-consing-table *derivative*) 4 | 5 | (defun derivative (re set) 6 | "Compute the derivative of a regular expression with regards to the set (i.e. the regular expression should be matched after a character in the set is matched)." 7 | (with-hash-consing (*derivative* (list re set)) 8 | (trivia:ematch re 9 | ((or (empty-string) (empty-set) (tag-set _)) (empty-set)) 10 | ((literal matching-set) 11 | (if (csum-null-p (csum-intersection matching-set set)) 12 | (empty-set) 13 | (empty-string))) 14 | ((join r s) 15 | (let ((r* (derivative r set)) 16 | (s* (derivative s set))) 17 | (cond 18 | ((eq r* (empty-set)) 19 | ;; Something like [...]A doesn't need gensym'ing. 20 | (let ((*gensym-assignments?* nil)) 21 | (join (nullable r) s*))) 22 | ((not (has-tags-p r*)) 23 | ;; Ditto for A[...] 24 | (either (join r* s) (join (nullable r) s*))) 25 | (t 26 | (either (join r* (unique-tags s)) (join (nullable r) s*)))))) 27 | ((repeat r min max _) 28 | (join (derivative r set) 29 | (repeat (unique-tags r) 30 | (max (1- min) 0) 31 | (if (null max) nil (1- max)) 32 | nil))) 33 | ((either r s) 34 | (either (derivative r set) (derivative s set))) 35 | ((both r s) 36 | (both (derivative r set) (derivative s set))) 37 | ((invert r) 38 | (invert (derivative r set))) 39 | ((grep r s) 40 | (let* ((r* (derivative r set)) 41 | (n (nullable r*))) 42 | (if (eq n (empty-set)) 43 | (grep (either r* 44 | (if (has-tags-p r*) 45 | (unique-tags s) 46 | s)) 47 | s) 48 | r*))) 49 | ((alpha r old-tags) 50 | (let* ((r* (derivative r set)) 51 | (*gensym-assignments?* nil) 52 | (nullable (nullable r))) 53 | (alpha r* 54 | (either nullable old-tags))))))) 55 | 56 | (defun derivative* (re sequence &key search) 57 | (let ((variables (make-hash-table :test 'equal)) 58 | (position 0) 59 | (*tag-gensym-counter* 0)) 60 | (with-hash-consing-tables () 61 | (when (stringp re) (setf re (parse-regular-expression re))) 62 | (when search (setf re (make-search-machine re))) 63 | (flet ((run-effects (effects) 64 | (loop for (target . source) in effects 65 | for value = (case source 66 | ((position) position) 67 | ((nil) 'nil) 68 | (otherwise (gethash source variables :unbound))) 69 | do (setf (gethash target variables) value)))) 70 | (map 'nil 71 | (lambda (element) 72 | (let* ((new-re (derivative re (singleton-set (char-code element)))) 73 | (effects (remove-if (lambda (x) (equal (car x) (cdr x))) 74 | (effects re)))) 75 | (format t "~&~a~& ~:c ~a" 76 | re element effects) 77 | (setf re new-re) 78 | (run-effects effects) 79 | (incf position))) 80 | sequence) 81 | (run-effects (effects re)) 82 | (format t "~&~a" (alexandria:hash-table-alist variables)) 83 | (values re 84 | (trivia:match (nullable re) 85 | ((tag-set s) 86 | (loop for ((name nil) . source) in s 87 | unless (null (gethash source variables)) 88 | collect (cons name (gethash source variables)))) 89 | ((empty-string) '()) 90 | ((empty-set) '())) 91 | (nullable re)))))) 92 | -------------------------------------------------------------------------------- /Documentation/spec-macros.css: -------------------------------------------------------------------------------- 1 | .FloatRight { 2 | float: right; 3 | } 4 | 5 | .SmallCaps { 6 | font-variant: small-caps; 7 | } 8 | 9 | .Indent { 10 | margin-left: 2em; 11 | } 12 | 13 | .Definitions { 14 | margin-top: 1em; 15 | margin-left: 0; 16 | margin-right: 0; 17 | border-top: 1px black solid; 18 | display: block; 19 | } 20 | 21 | .Landscape { 22 | margin: 0; 23 | } 24 | 25 | .Definitions > *:not(:empty):nth-of-type(1) { 26 | margin-top: 0.25em; 27 | } 28 | 29 | .TagDescription { 30 | float: right; 31 | text-style: italic; 32 | } 33 | 34 | .main { 35 | text-align: justify; 36 | } 37 | 38 | h1, h2, h3, h4, h5 { 39 | font-family: "Open Sans", sans-serif; 40 | text-align: left; 41 | } 42 | 43 | @media only screen { 44 | h4, h5 { 45 | background: #d1c4e9; 46 | } 47 | h3, h4, h5 { 48 | font-size: 120%; 49 | margin-bottom: 0.5em; 50 | } 51 | 52 | body { 53 | background: #eee; 54 | color: #111; 55 | --card-padding: 8pt; 56 | margin: var(--card-padding); 57 | } 58 | 59 | .tocset, .maincolumn, .refcolumn { 60 | padding: var(--card-padding); 61 | background: white; 62 | box-shadow: 0 3px 6px rgba(0,0,0,0.16), 0 3px 6px rgba(0,0,0,0.23); 63 | border-radius: 2px; 64 | } 65 | .maincolumn { 66 | width: min(50em, calc(100vw - 12.5em - 13.5em - 10 * var(--card-padding))); 67 | margin: 0 auto; 68 | } 69 | 70 | .tocview, .tocsub { 71 | background: transparent; 72 | } 73 | .tocviewlink, .tocviewselflink, .tocsubseclink { 74 | color: black; 75 | } 76 | .tocviewselflink { 77 | text-decoration: none; 78 | font-weight: 700; 79 | } 80 | 81 | h4, h5, .navsettop, .navsetbottom { 82 | padding: var(--card-padding); 83 | position: relative; 84 | left: calc(-1 * var(--card-padding)); 85 | width: 100%; 86 | } 87 | .navsettop { 88 | top: calc(-1 * var(--card-padding)); 89 | margin-bottom: calc(-1 * var(--card-padding)); 90 | } 91 | 92 | .navsettop, .navsetbottom { 93 | border: none; 94 | background: #FFCDD2; 95 | } 96 | .navright > a { 97 | color: black; 98 | font-style: italic; 99 | text-decoration: none; 100 | } 101 | .nonavigation { 102 | color: #333; 103 | } 104 | 105 | .refpara { 106 | left: calc(3 * var(--card-padding)); 107 | } 108 | 109 | li::marker { 110 | color: #888; 111 | } 112 | } 113 | 114 | .texMathInline, .texMathDisplay { 115 | /* Make the KaTeX output about the same size as the other text. */ 116 | font-size: 75%; 117 | } 118 | .texMathDisplay { 119 | /* Attempt to allow scrolling of long mathematical stuff. */ 120 | display: block; 121 | overflow-x: auto; 122 | overflow-y: clip; 123 | } 124 | 125 | .CenteredBlock, .CenteredContainer { 126 | margin: 0; 127 | } 128 | .CenteredBlock { 129 | text-align: left; 130 | } 131 | 132 | .CenteredBlock > .SIntrapara, .CenteredBlock > p { 133 | margin: 0; 134 | } 135 | 136 | .CenteredContainer { 137 | display: block; 138 | margin-left: 2em; 139 | margin-bottom: 1em; 140 | overflow-x: auto; 141 | overflow-y: clip; 142 | } 143 | 144 | p:empty { 145 | display: none; 146 | } 147 | 148 | tr { 149 | vertical-align: top; 150 | } 151 | 152 | .stt { 153 | /* Disable italics we inherit from argument lists */ 154 | font-style: normal; 155 | /* Don't try to justify code, that looks bad */ 156 | text-justify: none; 157 | } 158 | 159 | .refcolumn { 160 | text-align: left; 161 | border: none; 162 | } 163 | 164 | p { 165 | line-height: 1.2; 166 | } 167 | 168 | body { 169 | line-height: 1.2; 170 | hyphens: auto; 171 | } 172 | 173 | blockquote { 174 | margin-top: 0; 175 | margin-bottom: 0; 176 | } 177 | 178 | h2 { 179 | margin-top: 0.5em; /* Foo, who made it 0 in Scribble? */ 180 | } 181 | 182 | .AutobibLink { 183 | color: #111; 184 | font-style: italic; 185 | text-decoration: dashed underline; 186 | } 187 | -------------------------------------------------------------------------------- /Code/Interface/lint.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | (define-condition lint-style-warning (style-warning) 4 | ()) 5 | 6 | (define-condition not-matchable-style-warning (lint-style-warning) 7 | ()) 8 | 9 | (define-condition expression-not-matchable (not-matchable-style-warning) 10 | () 11 | (:report "This expression is impossible to match.")) 12 | 13 | (define-condition group-not-matchable (not-matchable-style-warning) 14 | ((n :initarg :n :reader warning-group-number) 15 | (string :initarg :string :reader warning-group-string)) 16 | (:report 17 | (lambda (c s) 18 | (format s "The ~:R group~%~4T~A~%in this expression is impossible to match." 19 | (warning-group-number c) 20 | (warning-group-string c))))) 21 | 22 | (define-condition matching-too-much-style-warning (lint-style-warning) 23 | ()) 24 | 25 | (define-condition expression-matches-everything (matching-too-much-style-warning) 26 | () 27 | (:report "This expression can match the empty string at every position.")) 28 | 29 | (define-condition expression-matches-empty-string (matching-too-much-style-warning) 30 | () 31 | (:report "This expression will only ever match the empty string at every position.")) 32 | 33 | (defun check-liveness (dfa groups group-strings) 34 | (let ((matching? nil) 35 | (matched-groups '())) 36 | (maphash (lambda (re state) 37 | (unless (eq (nullable re) (empty-set)) 38 | (setf matching? t)) 39 | (loop for ((n nil) . nil) in (state-exit-map state) 40 | ;; Group #1 uses tags #1 and #2 - we'll use 41 | ;; #2 being written to test if #1 is alive. 42 | for group-number = (floor (1+ n) 2) 43 | when (evenp n) 44 | do (pushnew group-number matched-groups))) 45 | dfa) 46 | (if (not matching?) 47 | (warn 'expression-not-matchable) 48 | (loop for n from 1 to groups 49 | unless (member n matched-groups) 50 | do (warn 'group-not-matchable 51 | :n n 52 | :string (aref group-strings n)))))) 53 | 54 | (defun check-empty-matches (dfa) 55 | (let ((warned-empty-string? nil) 56 | (expressions (remove (empty-set) 57 | (alexandria:hash-table-alist dfa) 58 | :key #'car))) 59 | ;; Check if this RE only matches the empty string. We can't just 60 | ;; do this syntactically, i.e. (eq expression (empty-string)) as e.g. 61 | ;; "|(a&bc)" confounds rewriting, and would produce a false negative. 62 | ;; Instead, we check that there is only one state that isn't the 63 | ;; empty set, and that it can only transition to the empty set. 64 | ;; Hence the DFA can only ever match the empty string. 65 | (when (alexandria:length= 1 expressions) 66 | (let ((re (car (first expressions)))) 67 | (when (and (not (eq (nullable re) (empty-set))) 68 | (null 69 | (remove (empty-set) 70 | (state-transitions (cdr (first expressions))) 71 | :key (alexandria:compose #'state-expression 72 | #'transition-next-state)))) 73 | (warn 'expression-matches-empty-string) 74 | (setf warned-empty-string? t)))) 75 | (unless (or warned-empty-string? (null expressions)) 76 | ;; We will match everywhere if every state is nullable. 77 | (let ((something-wont-match? nil)) 78 | (maphash (lambda (re state) 79 | (declare (ignore state)) 80 | (when (and (eq (nullable re) (empty-set)) 81 | (not (eq re (empty-set)))) 82 | (setf something-wont-match? t))) 83 | dfa) 84 | (unless something-wont-match? (warn 'expression-matches-everything)))))) 85 | 86 | (defun lint-regular-expression (expression) 87 | (with-hash-consing-tables () 88 | (multiple-value-bind (expression group-count group-strings) 89 | (parse-regular-expression expression) 90 | (let ((dfa (make-dfa-from-expression expression))) 91 | (check-liveness dfa group-count group-strings) 92 | (check-empty-matches dfa))))) 93 | -------------------------------------------------------------------------------- /Code/Compiler/length-inference.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | (stealth-mixin:define-stealth-mixin length-inference-info () 4 | state 5 | ((predecessors :accessor predecessors) 6 | (minimum-length :accessor minimum-length))) 7 | 8 | (defun compute-predecessor-lists (states) 9 | ;; Clear predecessor lists first. 10 | (maphash (lambda (ex state) 11 | (declare (ignore ex)) 12 | (setf (predecessors state) '())) 13 | states) 14 | ;; For each transition, add the predecessor to the predecessor list 15 | ;; of the successor. 16 | (maphash (lambda (ex predecessor) 17 | (declare (ignore ex)) 18 | (dolist (transition (state-transitions predecessor)) 19 | (let ((successor (transition-next-state transition))) 20 | (unless (null successor) 21 | (push predecessor (predecessors successor)))))) 22 | states)) 23 | 24 | ;;; The following function implements Kildall's algorithm for 25 | ;;; optimisation, as described in 26 | ;;; . 27 | ;;; The end result of this analysis is that we can replace several 28 | ;;; bounds checks, each checking for one more character, with one 29 | ;;; check that checks for multiple characters. 30 | 31 | ;;; To compute the minimum lengths required to match from each state, 32 | ;;; this algorithm first optimistically makes far too high guesses 33 | ;;; about minimum lengths: if a state is not accepting, it guesses 34 | ;;; that the state will never reach an accepting state, and thus has 35 | ;;; an infinite minimum length; else, it correctly determines that the 36 | ;;; minimum length is 0 when the state is accepting. We then propagate 37 | ;;; lengths. A state takes one more character to match than its 38 | ;;; successors (the "transfer" function), and a set of successors 39 | ;;; states has a minimum length that is the minimum of the minimum 40 | ;;; length of each successor (the "confluence" function). We treat 41 | ;;; states that never reach an accepting state as having infinite 42 | ;;; minimum length, so our implementation has to handle some 43 | ;;; additional cases that aren't integer arithmetic: 44 | ;;; X ≠ infinity ⇒ X < infinity 45 | ;;; min(infinity, X) = min(X, infinity) = X for all X 46 | ;;; infinity + 1 = infinity 47 | ;;; Minimum lengths are repeatedly propagated until we don't actually 48 | ;;; lower anything by propagation. Then we have a correct result. 49 | 50 | (defun compute-minimum-lengths (states) 51 | (let ((work-list '())) 52 | (flet ((recompute-predecessors-of (state) 53 | (dolist (pred (predecessors state)) 54 | (pushnew pred work-list))) 55 | (confluence (x y) 56 | (cond 57 | ((eql x :infinity) y) 58 | ((eql y :infinity) x) 59 | (t (min x y)))) 60 | (transfer (x) 61 | (if (eql x :infinity) 62 | :infinity 63 | (1+ x))) 64 | (lower-p (new old) 65 | (if (eql old :infinity) 66 | (not (eql new :infinity)) 67 | (< new old)))) 68 | ;; Set the minimum length of every nullable state to be 0, and 69 | ;; the minimum length of every other state to be pretty large. 70 | (maphash (lambda (ex state) 71 | (setf (minimum-length state) 72 | (if (eq (nullable ex) (empty-set)) 73 | :infinity 74 | 0)) 75 | (recompute-predecessors-of state)) 76 | states) 77 | ;; Set each minimum length to be one more than the minimum 78 | ;; length of the successors. 79 | (loop until (null work-list) 80 | do (let* ((state (pop work-list)) 81 | (minimum-successors-length 82 | (transfer 83 | (reduce #'confluence 84 | (state-transitions state) 85 | :key (lambda (transition) 86 | (minimum-length (transition-next-state transition))) 87 | :initial-value :infinity)))) 88 | (when (lower-p minimum-successors-length (minimum-length state)) 89 | (setf (minimum-length state) minimum-successors-length) 90 | (recompute-predecessors-of state))))))) 91 | 92 | (defun state-never-succeeds-p (state) 93 | (eql (minimum-length state) :infinity)) 94 | -------------------------------------------------------------------------------- /Code/DFA-construction/tag-sets.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | (defvar *tag-gensym-counter* 0) 4 | 5 | (defun tag-gensym () 6 | (incf *tag-gensym-counter*)) 7 | 8 | (defun gensym-position-assignments (set) 9 | "Replicate any assignments, turning T_n <- s for all s into T^r_n <- T_n for some arbitrary r" 10 | (loop for (target . source) in set 11 | for (variable nil) = target 12 | collect (cond 13 | ((eql source 'nil) 14 | (if *gensym-assignments?* 15 | (cons (list variable (tag-gensym)) 'nil) 16 | (cons target 'nil))) 17 | ((not *gensym-assignments?*) 18 | (cons target target)) 19 | (t 20 | (cons (list variable (tag-gensym)) target))))) 21 | 22 | (defun unique-assignments (set) 23 | "Make assignments unique, turning T_n <- s for all s into T^r_n <- s" 24 | (loop for ((variable nil) . source) in set 25 | collect (cons (list variable (tag-gensym)) source))) 26 | 27 | (defun merge-tag-sets (set1 set2) 28 | (append (loop for assignment in set1 29 | for (target . source) = assignment 30 | unless (find (first target) set2 :key #'caar) 31 | collect assignment) 32 | set2)) 33 | 34 | (defun used-tags (re) 35 | (with-slot-consing (cached-used-tags re) 36 | (trivia:match re 37 | ((tag-set s) (mapcar #'cdr s)) 38 | ((or (either r s) (both r s) (join r s)) 39 | (union (used-tags r) (used-tags s) :test #'equal)) 40 | ((or (invert r) (repeat r _ _ _)) 41 | (used-tags r)) 42 | ((grep vector _) (used-tags vector)) 43 | ((alpha r history) 44 | (union (used-tags r) (used-tags history) :test #'equal)) 45 | (_ '())))) 46 | 47 | (defun tags (re) 48 | (with-slot-consing (cached-tags re) 49 | (trivia:match re 50 | ((tag-set s) s) 51 | ((or (either r s) (both r s) (join r s)) 52 | (union (tags r) (tags s) :test #'equal)) 53 | ((or (invert r) (repeat r _ _ _)) 54 | (tags r)) 55 | ((grep r _) (tags r)) 56 | ((alpha r _) (tags r)) 57 | (_ '())))) 58 | 59 | (defun keep-used-assignments (new-re assignments) 60 | (loop with used = (used-tags new-re) 61 | for assignment in assignments 62 | for (target . nil) = assignment 63 | when (member target used :test #'equal) 64 | collect assignment)) 65 | 66 | (defun remove-tags (re) 67 | (with-slot-consing (cached-removed-tags re) 68 | (trivia:match re 69 | ((tag-set _) (empty-string)) 70 | ((either r s) (either (remove-tags r) (remove-tags s))) 71 | ((both r s) (both (remove-tags r) (remove-tags s))) 72 | ((join r s) (join (remove-tags r) (remove-tags s))) 73 | ((repeat r min max c) (repeat (remove-tags r) min max c)) 74 | ((invert r) (invert (remove-tags r))) 75 | ((alpha r s) 76 | (either (remove-tags r) 77 | (if (eq s (empty-set)) 78 | (empty-set) 79 | (empty-string)))) 80 | ((grep r s) (grep (remove-tags r) (remove-tags s))) 81 | (_ re)))) 82 | 83 | (defun has-tags-p (re) 84 | (with-slot-consing (cached-has-tags-p re) 85 | (trivia:match re 86 | ((tag-set _) t) 87 | ((or (either r s) (both r s) (join r s) (alpha r s) (grep r s)) 88 | (or (has-tags-p r) (has-tags-p s))) 89 | ((or (invert r) (repeat r _ _ _)) 90 | (has-tags-p r)) 91 | (_ nil)))) 92 | 93 | (defvar *allow-alpha* t) 94 | (defun map-tags (f re) 95 | ;; Return the same RE if we have no tags to replace. 96 | (unless (has-tags-p re) 97 | (return-from map-tags re)) 98 | (trivia:match re 99 | ((tag-set set) (tag-set (funcall f set))) 100 | ((either r s) (either (map-tags f r) (map-tags f s))) 101 | ((both r s) (both (map-tags f r) (map-tags f s))) 102 | ((join r s) (join (map-tags f r) (map-tags f s))) 103 | ((invert r) (invert (map-tags f r))) 104 | ((repeat r min max c) (repeat (map-tags f r) min max c)) 105 | ((alpha r old-tags) 106 | (unless (or *allow-alpha* (eq old-tags (empty-set))) 107 | (error "Can't modify tags with history")) 108 | (alpha (map-tags f r) 109 | (map-tags f old-tags))) 110 | ((grep r s) 111 | (grep (map-tags f r) 112 | (map-tags f s))) 113 | (_ re))) 114 | 115 | (defun unique-tags (re) 116 | (let ((*allow-alpha* nil)) 117 | (map-tags #'unique-assignments re))) 118 | -------------------------------------------------------------------------------- /Code/DFA-construction/type.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | (defvar *table-names* '()) 4 | 5 | (defmacro define-hash-consing-table (name) 6 | `(progn 7 | (defvar ,name) 8 | (pushnew ',name *table-names*) 9 | ',name)) 10 | 11 | (defconstant +uncomputed+ '+uncomputed+) 12 | (defclass regular-expression () 13 | ((%nullable :initform +uncomputed+ :accessor cached-nullable) 14 | (%nullable-no-gensym :initform +uncomputed+ :accessor cached-nullable-no-gensym) 15 | (%used-tags :initform +uncomputed+ :accessor cached-used-tags) 16 | (%tags :initform +uncomputed+ :accessor cached-tags) 17 | (%removed-tags :initform +uncomputed+ :accessor cached-removed-tags) 18 | (%has-tags-p :initform +uncomputed+ :accessor cached-has-tags-p))) 19 | 20 | (defmacro define-types (&body types) 21 | (loop for (name . slots) in types 22 | collect (let ((variables (loop for slot in slots collect (gensym (symbol-name slot)))) 23 | (internal-creator (alexandria:format-symbol t "%~a" name)) 24 | (table-name (alexandria:format-symbol '#:one-more-re-nightmare 25 | "*~A-TABLE*" name))) 26 | `(progn 27 | (defclass ,name (regular-expression) ,slots) 28 | (trivia:defpattern ,name ,variables 29 | (alexandria:with-gensyms (instance-name) 30 | (list 'trivia:guard1 (list instance-name ':type ',name) 31 | (list 'typep instance-name '',name) 32 | ,@(loop for slot in slots 33 | for variable in variables 34 | appending `((list 'slot-value instance-name '',slot) ,variable))))) 35 | (define-hash-consing-table ,table-name) 36 | (defun ,internal-creator ,slots 37 | (or (gethash (list ,@slots) ,table-name) 38 | (let ((instance (make-instance ',name))) 39 | ,@(loop for slot in slots 40 | collect `(setf (slot-value instance ',slot) ,slot)) 41 | (setf (gethash (list ,@slots) ,table-name) 42 | instance)))))) 43 | into forms 44 | finally (return `(progn ,@forms)))) 45 | 46 | (defmacro define-rewrites ((name &rest slots) &key simplify hash-cons printer) 47 | (let ((internal-creator (alexandria:format-symbol t "%~a" name)) 48 | (table-name (alexandria:format-symbol '#:one-more-re-nightmare 49 | "*~A-TABLE*" name))) 50 | `(progn 51 | ,@(unless (null printer) 52 | `((defmethod print-object ((instance ,name) stream) 53 | (trivia:ematch instance 54 | ,printer)))) 55 | (defun ,name ,slots 56 | (trivia:match (list ,@slots) 57 | ,@(loop for ((nil . pattern) replacement) in simplify 58 | collect `((list ,@pattern) ,replacement)) 59 | ,@(loop for ((nil . pattern) (nil . replacement)) in hash-cons 60 | collect `((list ,@pattern) 61 | (or (gethash (list ,@replacement) ,table-name) 62 | (trivia.next:next)))) 63 | (_ (,internal-creator ,@slots))))))) 64 | (indent:define-indentation define-type (4 &body)) 65 | 66 | (defmacro with-hash-consing ((table key) &body body) 67 | (alexandria:once-only (table key) 68 | (alexandria:with-gensyms (value present?) 69 | `(multiple-value-bind (,value ,present?) 70 | (gethash ,key ,table) 71 | (if ,present? 72 | ,value 73 | (setf (gethash ,key ,table) 74 | (progn ,@body))))))) 75 | 76 | (defmacro with-slot-consing ((accessor object &key (when 't)) &body body) 77 | (alexandria:once-only (object) 78 | (alexandria:with-gensyms (value) 79 | `(let ((,value (,accessor ,object))) 80 | (flet ((compute-the-damn-value () 81 | ,@body)) 82 | (cond 83 | ((not ,when) 84 | (compute-the-damn-value)) 85 | ((eq ,value +uncomputed+) 86 | (setf (,accessor ,object) 87 | (compute-the-damn-value))) 88 | (t 89 | ,value))))))) 90 | 91 | (defmacro with-hash-consing-tables (() &body body) 92 | `(let ,(loop for name in *table-names* 93 | collect `(,name (make-hash-table :test 'equal))) 94 | ,@body)) 95 | 96 | (defmacro clear-global-tables () 97 | "Set up global tables for testing." 98 | `(setf ,@(loop for name in *table-names* 99 | append `(,name (make-hash-table :test 'equal))))) 100 | -------------------------------------------------------------------------------- /Code/SIMD/prefix-strategy.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | (defclass simd-info () 4 | ((bits :initarg :bits :reader bits))) 5 | 6 | (defclass simd-prefix (simd-info strategy) 7 | () 8 | (:documentation "Match a prefix of the string using SIMD operations before entering a DFA. 9 | A prefix P of some regular expression R is defined to be a sequence of literals such that P·S = R for some other suffix regular expression S.")) 10 | 11 | (defmethod initial-states ((strategy simd-prefix) expression) 12 | (multiple-value-bind (prefix suffix) 13 | (prefix (add-tags expression)) 14 | (declare (ignore prefix)) 15 | (list (alpha (add-tags expression) (empty-set)) 16 | (alpha suffix (empty-set))))) 17 | 18 | (defmethod start-code ((strategy simd-prefix) states) 19 | ;; Note that, by definition, having a prefix implies that the RE 20 | ;; can't produce only zero length matches. Thus we don't have to 21 | ;; worry about that case. 22 | (when (eql (minimum-length (first states)) :infinity) 23 | ;; This can't actually match anything, so return immediately. 24 | (return-from start-code `(start (return)))) 25 | (multiple-value-bind (prefix suffix) 26 | (prefix (state-expression (first states))) 27 | (declare (ignore suffix)) 28 | (multiple-value-bind (test loads assignments jump-length) 29 | (code-from-prefix prefix) 30 | `(start 31 | ;; Don't try to read over the END we were given. 32 | (when (>= (the fixnum (+ ,jump-length 33 | ,(/ one-more-re-nightmare.vector-primops:+v-length+ *bits*) 34 | start)) 35 | end) 36 | (setf position start) 37 | (go ,(find-state-name (first states) :bounds-check))) 38 | ;; Now perform the SIMD test. 39 | (let* (,@loads 40 | (test-results (,(find-op "MOVEMASK") ,test))) 41 | (unless (zerop test-results) 42 | ;; Found a match! 43 | (setf position (+ start (one-more-re-nightmare.vector-primops:find-first-set test-results))) 44 | ;; We increment POSITION by 1 before assigning to act like 45 | ;; the actual DFA, and then do the rest of the "jump" 46 | ;; after assignments. 47 | (incf position) 48 | ,assignments 49 | (incf position ,(1- jump-length)) 50 | ;; The same deal as in START-CODE for SCAN-EVERYTHING: we 51 | ;; "inline" succeeding states, so we might need to succeed 52 | ;; rather than go to another state. 53 | ,(let ((expression (state-expression (second states)))) 54 | (if (re-empty-p expression) 55 | (let ((effects (effects expression))) 56 | `(progn 57 | ;; Surely there wouldn't be any new 58 | ;; assignments, as PREFIX would strip them 59 | ;; off. 60 | (let ((position (1+ position))) 61 | ,(setf-from-assignments effects)) 62 | (setf start (max (1+ start) 63 | (1- ,(find-in-map 'end (state-exit-map (second states)))))) 64 | (win ,@(win-locations effects)))) 65 | `(go ,(find-state-name (second states) :bounds-check))))) 66 | ;; No match, so just bump and try again. 67 | (incf start ,(/ one-more-re-nightmare.vector-primops:+v-length+ *bits*)) 68 | (go start)))))) 69 | 70 | (defmethod make-prog-parts :around ((strategy simd-info) expression) 71 | (let ((*broadcasts* (make-hash-table)) 72 | (*bits* (bits strategy))) 73 | (multiple-value-bind (variables declarations body) 74 | (call-next-method) 75 | (maphash (lambda (value name) 76 | (push (list name `(,(find-op "BROADCAST") ,value)) 77 | variables)) 78 | *broadcasts*) 79 | (values variables declarations body)))) 80 | 81 | ;; Surely we don't need this macro. Come on. 82 | (defmethod macros-for-strategy append ((strategy simd-prefix)) 83 | '((restart () 84 | '(go start)))) 85 | 86 | (defun make-default-strategy (layout expression) 87 | (let ((bits 88 | (alexandria:switch ((layout-array-type layout) :test 'equal) 89 | ('(simple-array character 1) 32) 90 | ('(simple-array base-char 1) 8) 91 | (otherwise nil)))) 92 | (cond 93 | ((eql *code-type* :interpreted) 94 | ;; Interpreter doesn't recognise VOPs, so we can't use SIMD. 95 | ;; Not that you'd really want to. 96 | (make-instance (dynamic-mixins:mix 'scan-everything 'call-continuation))) 97 | ((and (> (count :literal (prefix expression) :key #'first) 1) 98 | (not (null bits))) 99 | (make-instance (dynamic-mixins:mix 'simd-loop 'simd-prefix 'call-continuation) 100 | :bits bits)) 101 | ((not (null bits)) 102 | (make-instance (dynamic-mixins:mix 'simd-loop 'scan-everything 'call-continuation) 103 | :bits bits)) 104 | (t 105 | (make-instance (dynamic-mixins:mix 'scan-everything 'call-continuation)))))) 106 | -------------------------------------------------------------------------------- /Tests/tests.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare-tests) 2 | 3 | (parachute:define-test one-more-re-nightmare) 4 | 5 | (defmacro first-match (haystack &body body) 6 | `(progn 7 | ,@(loop for (re result-vector) on body by #'cddr 8 | collect `(parachute:is equalp 9 | ,result-vector 10 | (one-more-re-nightmare:first-match ,re ,haystack))))) 11 | 12 | (defmacro all-string-matches (haystack &body body) 13 | `(progn 14 | ,@(loop for (re result-vector) on body by #'cddr 15 | collect `(parachute:is equalp 16 | ,result-vector 17 | (one-more-re-nightmare:all-string-matches ,re ,haystack))))) 18 | 19 | (parachute:define-test easy-stuff 20 | :parent one-more-re-nightmare 21 | (first-match "Hello world" 22 | "Hello" #(0 5) 23 | "world" #(6 11) 24 | "[a-z]+" #(1 5) 25 | "(h|l)o" #(3 5)) 26 | ;; The engine should finish the match at the right spot. 27 | ;; (The first OMRN compiler would not do this correctly.) 28 | (first-match "ababa" 29 | "ab" #(0 2) 30 | "(ab)+" #(0 4)) 31 | ;; The engine should also restart just after the actual match. 32 | (all-string-matches "ababc" 33 | "ab" '(#("ab") #("ab")) 34 | "ab(c|)" '(#("ab") #("abc"))) 35 | ;; Per HAKMEM item #176. 36 | (all-string-matches "banana" 37 | "ana" '(#("ana"))) 38 | (all-string-matches "aaa" 39 | "a+" '(#("aaa")) 40 | ;; Make sure we match the empty string at the end of the haystack.. 41 | ;; One program used at university would loop indefinitely, because 42 | ;; it would not advance past the zero-length match at the end. 43 | "a*" '(#("aaa") #("")))) 44 | 45 | (parachute:define-test annoying-submatches 46 | :parent one-more-re-nightmare 47 | ;; Per footnote #14 on page #12 of gilberth's paper. 48 | (first-match "aaaaa" 49 | "«a|aa»+" #(0 5 4 5)) 50 | (first-match "aaaaaa" 51 | "«a|aa»+" #(0 6 4 6)) 52 | (first-match "aaaaaaaa" 53 | "«a|aa»+" #(0 8 6 8)) 54 | ;; Per 55 | (first-match "ab" 56 | ;; This involves clearing registers after each iteration of 57 | ;; repetition due to * or +. 58 | ;; We only should match here: 59 | ;; | | 60 | ;; V V 61 | "«««a*»|b»|b»+" #(0 2 1 2 1 2 nil nil) 62 | "«««a*»|b»|b»*" #(0 2 1 2 1 2 nil nil)) 63 | ;; Per : 64 | ;; "For this purpose, a null string shall be considered to be longer 65 | ;; than no match at all." 66 | (first-match "" 67 | "«a*»*" #(0 0 0 0))) 68 | 69 | (parachute:define-test annoying-parsing 70 | :parent one-more-re-nightmare 71 | ;; Per 72 | (all-string-matches "/* abc */ /* def */" 73 | "/\\*¬($*\\*/$*)\\*/" '(#("/* abc */") #("/* def */")))) 74 | 75 | (parachute:define-test annoying-negation 76 | :parent one-more-re-nightmare 77 | ;; We used to erroneously rewrite ¬ε to ø, which is plain wrong. 78 | (all-string-matches "abcde" 79 | "¬a" '(#("abcde") #("")) 80 | "$+&¬a" '(#("abcde")))) 81 | 82 | (defun compiler-macroexpand-1 (form) 83 | (funcall (compiler-macro-function (first form)) 84 | form 85 | nil)) 86 | (defmacro compiler-macro-warns (form type reason) 87 | ;; I mean, it doesn't really _fail_ if it just signals a 88 | ;; warning. But okay. 89 | `(parachute:fail (compiler-macroexpand-1 ',form) ,type 90 | "~A" ,reason)) 91 | (defmacro macro-warns (form type reason) 92 | `(parachute:fail (macroexpand-1 ',form) ,type 93 | "~A" ,reason)) 94 | 95 | (deftype full-warning () '(and warning (not style-warning))) 96 | 97 | (parachute:define-test user-interface 98 | :parent one-more-re-nightmare 99 | (compiler-macro-warns 100 | (one-more-re-nightmare:all-matches "a|«a»" s) 101 | one-more-re-nightmare:not-matchable-style-warning 102 | "A style-warning should be generated for dead submatches.") 103 | (compiler-macro-warns 104 | (one-more-re-nightmare:all-matches "a&b" s) 105 | one-more-re-nightmare:not-matchable-style-warning 106 | "A style-warning should be generated for dead REs.") 107 | (compiler-macro-warns 108 | (one-more-re-nightmare:all-matches "" s) 109 | one-more-re-nightmare:matching-too-much-style-warning 110 | "A style-warning should be generated for REs that only match the empty string.") 111 | (compiler-macro-warns 112 | (one-more-re-nightmare:all-matches "a*" s) 113 | one-more-re-nightmare:matching-too-much-style-warning 114 | "A style-warning should be generated for REs that match the empty string.") 115 | (compiler-macro-warns 116 | (one-more-re-nightmare:all-matches "«a" s) 117 | full-warning 118 | "A full warning should be generated for invalid RE syntax.") 119 | (macro-warns 120 | (one-more-re-nightmare:do-matches ((start end s1 e1) "Hello" s) 121 | (print (list s1 e1))) 122 | full-warning 123 | "A full warning should be generated by trying to address registers that don't exist.")) 124 | 125 | (parachute:define-test character-classes 126 | :parent one-more-re-nightmare 127 | ;; Per 128 | (all-string-matches "'HI EVERYBODY!!!!!!!!!!' 'try pressing the the Caps Lock key'" 129 | "[[:upper:]][[:lower:]]+" '(#("Caps") #("Lock")) 130 | "[[:punct:]]{3,}" '(#("!!!!!!!!!!'")) 131 | "[[:digit:]]" '() 132 | "[[:digit:]y]" '(#("y") #("y")) 133 | "[¬[:punct:]]+" '(#("HI EVERYBODY") #(" ") #("try pressing the the Caps Lock key")))) 134 | 135 | (defun run-tests () 136 | (parachute:test 'one-more-re-nightmare)) 137 | -------------------------------------------------------------------------------- /Code/Interface/syntax.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | (esrap:defrule top-level 4 | (or two-expressions empty-string)) 5 | (esrap:defrule two-expressions 6 | (and expression top-level) 7 | (:destructure (a b) (join a b))) 8 | (esrap:defrule expression 9 | (or either below-either)) 10 | (esrap:defrule below-either 11 | (or both below-both)) 12 | (esrap:defrule below-both 13 | (or join below-join)) 14 | (esrap:defrule below-join 15 | (or invert 16 | kleene plus maybe repeated 17 | parens match-group character-range universal-set 18 | literal empty-string)) 19 | 20 | (defvar *next-group*) 21 | (defun next-group () 22 | (incf *next-group*)) 23 | (defvar *group-strings*) 24 | 25 | (defun parse-regular-expression (string) 26 | (let ((*next-group* 0) 27 | (*group-strings* (make-hash-table))) 28 | (values (esrap:parse 'top-level string) 29 | *next-group* 30 | (coerce (cons string 31 | (loop for group from 1 to *next-group* 32 | for (s . e) = (gethash group *group-strings*) 33 | collect (subseq string s e))) 34 | 'vector)))) 35 | 36 | ;;; Parens 37 | (esrap:defrule parens 38 | (and "(" expression ")") 39 | (:destructure (left expression right) 40 | (declare (ignore left right)) 41 | expression)) 42 | 43 | (esrap:defrule match-group 44 | (and "«" expression "»") 45 | (:around (esrap:&bounds start end) 46 | (let ((group-number (next-group))) 47 | (destructuring-bind (left expressions right) 48 | (esrap:call-transform) 49 | (declare (ignore left right)) 50 | (setf (gethash group-number *group-strings*) 51 | (cons start end)) 52 | (group expressions group-number))))) 53 | 54 | ;;; Binary operators 55 | (esrap:defrule either 56 | (and below-either "|" (or either below-either)) 57 | (:destructure (e1 bar e2) 58 | (declare (ignore bar)) 59 | (either e1 e2))) 60 | 61 | (esrap:defrule both 62 | (and below-both "&" (or both below-both)) 63 | (:destructure (e1 bar e2) 64 | (declare (ignore bar)) 65 | (both e1 e2))) 66 | 67 | (esrap:defrule join 68 | (and below-join (or join below-join)) 69 | (:destructure (e1 e2) (join e1 e2))) 70 | 71 | ;;; Repeats 72 | (defun clear-registers (expression) 73 | (join (tag-set 74 | (loop for ((v nil) . nil) in (tags expression) 75 | collect (cons (list v (tag-gensym)) 'nil))) 76 | expression)) 77 | 78 | (esrap:defrule kleene 79 | (and below-join "*") 80 | (:destructure (expression star) 81 | (declare (ignore star)) 82 | (repeat (clear-registers expression) 0 nil t))) 83 | 84 | (esrap:defrule plus 85 | (and below-join "+") 86 | (:destructure (expression plus) 87 | (declare (ignore plus)) 88 | (repeat (clear-registers expression) 1 nil t))) 89 | 90 | (esrap:defrule maybe 91 | (and below-join "?") 92 | (:destructure (expression q) 93 | (declare (ignore q)) 94 | (repeat expression 0 1 t))) 95 | 96 | (esrap:defrule repetitions 97 | (and (esrap:? integer) "," (esrap:? integer)) 98 | (:destructure (min comma max) 99 | (declare (ignore comma)) 100 | (assert (or (null max) (null min) (> max min)) 101 | (max min) 102 | "The maximum repetition count ~d cannot be less than the minimum count ~d." 103 | max min) 104 | (cons (or min 0) max))) 105 | 106 | (esrap:defrule repetition 107 | integer 108 | (:lambda (count) (cons count count))) 109 | 110 | (esrap:defrule repeated 111 | (and below-join "{" (or repetitions repetition) "}") 112 | (:destructure (e left counts right) 113 | (declare (ignore left right)) 114 | (repeat e (car counts) (cdr counts) t))) 115 | 116 | (esrap:defrule invert 117 | (and (or "¬" "~") below-join) 118 | (:destructure (bar expression) 119 | (declare (ignore bar)) 120 | (invert expression))) 121 | 122 | ;;; "Terminals" 123 | (esrap:defrule universal-set 124 | "$" 125 | (:constant (literal +universal-set+))) 126 | 127 | (esrap:defrule integer 128 | (+ (or "0" "1" "2" "3" "4" "5" "6" "7" "8" "9")) 129 | (:lambda (list) 130 | (parse-integer (format nil "~{~A~}" list)))) 131 | 132 | 133 | (esrap:defrule escaped-character 134 | (and #\\ character) 135 | (:destructure (backslash char) 136 | (declare (ignore backslash)) 137 | char)) 138 | 139 | (esrap:defrule special-character 140 | (or "(" ")" "«" "»" "[" "]" "{" "}" "¬" "~" "|" "&" "*" "+" "?" "$")) 141 | 142 | (esrap:defrule literal 143 | (or escaped-character (not special-character)) 144 | (:lambda (character) (literal (singleton-set (char-code character))))) 145 | 146 | (esrap:defrule empty-string 147 | "" 148 | (:constant (empty-string))) 149 | 150 | ;;; Character ranges 151 | (esrap:defrule character-range-escaped-constituent 152 | (and #\\ (or #\[ #\] #\- #\^ #\¬)) 153 | (:destructure (backslash char) 154 | (declare (ignore backslash)) 155 | (char char 0))) 156 | 157 | (esrap:defrule character-range-name-constituent (not (or ":" "[" "]"))) 158 | 159 | (esrap:defrule character-range-named 160 | (and "[:" (+ character-range-name-constituent) ":]") 161 | (:destructure (open characters close) 162 | (declare (ignore open close)) 163 | (named-range (coerce characters 'string)))) 164 | 165 | (esrap:defrule character-range-character 166 | (or character-range-escaped-constituent (not (or "-" "]" "[" "\\")))) 167 | 168 | (esrap:defrule character-range-single 169 | character-range-character 170 | (:lambda (character) 171 | (singleton-set (char-code character)))) 172 | 173 | (esrap:defrule character-range-range 174 | (and character-range-character "-" character-range-character) 175 | (:destructure (low dash high) 176 | (declare (ignore dash)) 177 | (range (char-code low) (1+ (char-code high))))) 178 | 179 | (esrap:defrule character-range 180 | (and "[" 181 | (esrap:? (or "^" "¬")) 182 | (* (or character-range-named 183 | character-range-range 184 | character-range-single)) 185 | "]") 186 | (:destructure (left invert ranges right) 187 | (declare (ignore left right)) 188 | (let ((sum (reduce #'csum-union ranges 189 | :initial-value +empty-set+))) 190 | (literal (if invert (csum-complement sum) sum))))) 191 | -------------------------------------------------------------------------------- /Documentation/interface.scrbl: -------------------------------------------------------------------------------- 1 | #lang scribble/base 2 | @require["spec-macros.scrbl" "bibliography.rkt"] 3 | 4 | @title{Interface} 5 | 6 | @section{Syntax} 7 | 8 | We currently use our own syntax for regular expressions, since the 9 | POSIX syntax does not allow for expressing complements or 10 | intersections of regular expressions. 11 | 12 | @centered-block[ 13 | @bnf[ 14 | @rule["E" 15 | "E | E" "union" 16 | "E & E" "intersection" 17 | "E E" "concatenate" 18 | "¬E" "complement" 19 | "~E" "complement" 20 | "E*" "zero or more repeats" 21 | "E+" "one or more repeats" 22 | "E?" "zero or one repeats" 23 | "E{j,j}" "repeat" 24 | "«E»" "submatch" 25 | "(E)" "change precedence" 26 | "[r]" "character range" 27 | "[¬r]" "complement ranges" 28 | "[^r]" "complement ranges" 29 | "$" "every character" 30 | "c" "literal character" 31 | "" ""] 32 | @rule["r" "" "empty range" 33 | "cr" "single character" 34 | "c-cr" "character range" 35 | "[::]r" "character class"] 36 | @rule["j" "" "bound" 37 | "" "no bound"] 38 | @rule["c" "" ""] 39 | ] 40 | ] 41 | 42 | Rules higher on this list are "looser" than rules lower on the list. 43 | For example, the expression @cl{ab&cd|ef} is equivalent to 44 | @cl{((ab)&(cd))|(ef)}. 45 | 46 | @section{Semantics} 47 | 48 | Matches and submatches are performed using the semantics for 49 | POSIX regular expressions @~cite[posix], with additional rules pertaining 50 | to intersection and complements. 51 | 52 | Submatches on both sides of an intersection operator are matched. For 53 | example, 54 | 55 | @lisp-code{ 56 | (first-string-match "«a»b&a«b»" "ab") ; => #("ab" "a" "b") 57 | } 58 | 59 | No submatches in a complement can ever be matched, including in the 60 | complement of a complement. Thus @cl{¬¬A} is not the same as @cl{A}: 61 | 62 | @lisp-code{ 63 | (first-match "¬¬«a»" "a") ; => #(0 1 NIL NIL) 64 | (first-match "«a»" "a") ; => #(0 1 0 1) 65 | } 66 | 67 | @section{Matching} 68 | 69 | Note that one-more-re-nightmare can avoid a cache lookup (involving 70 | acquiring a lock and hash table searching) if the regular expression 71 | is a literal string, or a constant variable bound to a string. 72 | 73 | @definitions{ 74 | @defun["first-match"]{regular-expression string @param{@&key start end}} 75 | 76 | @defun["first-string-match"]{regular-expression string @param{@&key start end}} 77 | 78 | Find the first match for @cl{regular-expression} in @cl{string} 79 | between @cl{start} and @cl{end}. 80 | 81 | @cl{first-match} returns a simple vector, where each element is a 82 | @concept{register}, or @cl{nil} when there is no match. The first two 83 | registers are always the start and end of the match, and then 84 | subsequent registers are the start and end of each submatch. A 85 | register is either a bounding index of @cl{string} or @cl{nil} when 86 | there is no submatch. 87 | 88 | @cl{first-string-match} either returns a simple vector, every element 89 | of which is a fresh string or @cl{nil} (when there is no submatch), or 90 | @cl{nil} if there is no match. 91 | 92 | @definition-section["Examples"]{ 93 | @lisp-code{ 94 | (first-match "[0-9]([0-9]| )+" "Phone: 632 3003") 95 | ;; => #(6 15) 96 | (first-string-match "[0-9]([0-9]| )+" "Phone: 632 3003") 97 | ;; => "632 3003" 98 | 99 | (first-match 100 | "«[0-9]+»x«[0-9]+»|«[0-9]+»p" 101 | "Foobar 1920x1080 17-inch display") 102 | ;; => #(7 16 7 11 12 16 NIL NIL) 103 | (first-string-match 104 | "«[0-9]+»x«[0-9]+»|«[0-9]+»p" 105 | "Foobar 1920x1080 17-inch display") 106 | ;; => #("1920x1080" "1920" "1080" NIL) 107 | } 108 | } 109 | } 110 | 111 | @definitions{ 112 | @defun["all-matches"]{regular-expression string @param{@&key start end}} 113 | 114 | @defun["all-string-matches"]{regular-expression string @param{@&key start end}} 115 | 116 | Find all matches for @cl{regular-expression} in @cl{string} between 117 | @cl{start} and @cl{end}. 118 | 119 | Both functions return a list of matches; @cl{all-matches} represents 120 | matches as @cl{first-match} does, and @cl{all-string-matches} 121 | represents matches as @cl{first-string-match} does. 122 | 123 | @definition-section["Examples"]{ 124 | @lisp-code{ 125 | (all-matches 126 | "«[0-9]+»x«[0-9]+»|«[0-9]+»p" 127 | "Foobar 1920x1080 17-inch display or Quux 19-inch 720p display?") 128 | ;; => (#(7 16 7 11 12 16 NIL NIL) #(49 53 NIL NIL NIL NIL 49 52)) 129 | (all-string-matches 130 | "«[0-9]+»x«[0-9]+»|«[0-9]+»p" 131 | "Foobar 1920x1080 17-inch display or Quux 19-inch 720p display?") 132 | ;; => (#("1920x1080" "1920" "1080" NIL) #("720p" NIL NIL "720")) 133 | } 134 | } 135 | } 136 | 137 | @definitions{ 138 | @defmacro["do-matches"]{((@param{@&rest registers}) regular-expression 139 | string @param{@&key start end}) 140 | @param{@&body body}} 141 | 142 | @cl{do-matches} iterates over all matches for @cl{regular-expression} 143 | across @cl{string}. The @cl{registers} variables are bound to the 144 | @term{registers} produced, as described for @cl{first-match}. 145 | 146 | It is possible to provide fewer variables than registers in the 147 | regular expression, but an error will be signalled if there are more 148 | variables than registers. 149 | } 150 | 151 | @section{Compiling} 152 | 153 | The compiler may be run manually, when the regular expression is not 154 | known at compile time, and the code cache takes too long to search. 155 | (The latter can happen if many threads are accessing the code cache, 156 | and the time taken searching is sufficiently short, as lookups grab a 157 | global lock currently.) 158 | 159 | @definitions{ 160 | @defclass["compiled-regular-expression"] 161 | 162 | An object representing a compiled regular expression. An instance of 163 | this class can be provided as a regular expression to all the searching 164 | functions, instead of a string. 165 | } 166 | 167 | @definitions{ 168 | @defun["compile-regular-expression"]{expression} 169 | 170 | Compile a regular expression, returning an instance of 171 | @cl{compiled-regular-expression}. 172 | 173 | An error is signalled if the expression has invalid syntax. 174 | } 175 | 176 | @definitions{ 177 | @defvar["*state-limit*"]{1000} 178 | 179 | The maximum number of states which the compiler will generate. 180 | 181 | @define-condition["exceeded-state-limit"]{error} 182 | 183 | This error type is signalled when the compiler exceeds the limit on 184 | the number of states set by @cl{*state-limit*}. 185 | } 186 | -------------------------------------------------------------------------------- /Documentation/spec-macros.scrbl: -------------------------------------------------------------------------------- 1 | #lang scribble/base 2 | 3 | @(require scribble/core 4 | scribble/html-properties 5 | scribble/latex-properties 6 | racket/list) 7 | @(provide definition-section definitions 8 | defthing defun defmacro defgeneric defmethod 9 | defaccessor defreader definitarg 10 | defclass defprotoclass defvar 11 | define-condition 12 | cl lisp-code 13 | param &keyword &key &optional &rest &allow-other-keys &body 14 | term concept 15 | bnf rule tag-rule 16 | todo note 17 | image/width 18 | code-template var 19 | landscape centered-block) 20 | 21 | @(define spec-macro-additions 22 | (list (make-tex-addition "spec-macros.tex"))) 23 | @(define block-thing-additions 24 | (cons (alt-tag "div") spec-macro-additions)) 25 | 26 | @(define float-right-style 27 | (make-style "FloatRight" spec-macro-additions)) 28 | @(define sc-style 29 | (make-style "SmallCaps" spec-macro-additions)) 30 | @(define definitions-style 31 | (make-style "Definitions" block-thing-additions)) 32 | @(define indent-style 33 | (make-style "Indent" spec-macro-additions)) 34 | @(define landscape-style 35 | (make-style "Landscape" block-thing-additions)) 36 | @(define centered-block-style 37 | (make-style "CenteredBlock" block-thing-additions)) 38 | @(define centered-container-style 39 | (make-style "CenteredContainer" block-thing-additions)) 40 | @(define param-style 41 | (make-style "Param" spec-macro-additions)) 42 | @(define defun-name-style 43 | (make-style "DefunName" (cons (alt-tag "b") spec-macro-additions))) 44 | @(define lambda-list-style 45 | (make-style "LambdaList" (cons (alt-tag "i") spec-macro-additions))) 46 | 47 | @(define @float-right[thing] 48 | (elem thing #:style float-right-style)) 49 | @(define @indent[thing] 50 | (nested thing #:style 'inset)) 51 | @(define @sc[thing] 52 | (elem thing #:style sc-style)) 53 | @(define (landscape . stuff) 54 | (nested stuff #:style landscape-style)) 55 | @(define (centered-block . stuff) 56 | (nested (nested stuff #:style centered-block-style) 57 | #:style centered-container-style)) 58 | 59 | @(define @defthing[name more type #:index? [index? #t]] 60 | (let ([base (list (elem name #:style defun-name-style) 61 | " " (elem (list more (float-right (italic type))) 62 | #:style lambda-list-style))]) 63 | (if index? 64 | (index* (list name) (list (list (string-append name " " type))) 65 | base) 66 | base))) 67 | 68 | @(define (definition-section name . things) 69 | (list (linebreak) 70 | (sc name) 71 | (linebreak) 72 | things)) 73 | 74 | @(define (definitions . stuff) 75 | (nested #:style definitions-style 76 | stuff)) 77 | 78 | @(define (def-function-thing name arguments type #:index? [index? #t]) 79 | (list (defthing name (list " " (italic arguments)) type #:index? index?) 80 | (linebreak))) 81 | 82 | @(define (defun name . arguments) 83 | (def-function-thing name arguments "Function")) 84 | @(define (defmacro name . arguments) 85 | (def-function-thing name arguments "Macro")) 86 | @(define (defgeneric name . arguments) 87 | (def-function-thing name arguments "Generic Function")) 88 | @(define (defmethod name #:qualifier [q ""] . arguments) 89 | (def-function-thing name arguments 90 | (if (string=? q "") 91 | "Method" 92 | (string-append q " Method")) 93 | #:index? #f)) 94 | @(define (defaccessor name . arguments) 95 | (def-function-thing name arguments "Accessor")) 96 | @(define (defreader name . arguments) 97 | (def-function-thing name arguments "Reader")) 98 | @(define (definitarg name) 99 | (defthing name "" "Initarg")) 100 | 101 | @(define (interleave v l) 102 | (reverse 103 | (for/fold ([a '()]) 104 | ([x (in-list l)]) 105 | (list* x v a)))) 106 | 107 | @(define (superclass-part superclasses) 108 | (if (or (equal? superclasses "") 109 | (equal? superclasses '()) 110 | (equal? superclasses '(""))) 111 | "" 112 | (interleave " " (cons "<:" superclasses)))) 113 | 114 | @(define (defclass name . superclasses) 115 | (defthing name (superclass-part superclasses) "Class")) 116 | 117 | @(define (define-condition name . supertypes) 118 | (def-function-thing name (superclass-part supertypes) "Condition Type")) 119 | 120 | @(define (defprotoclass name) 121 | (defthing name "" "Protocol Class")) 122 | 123 | @(define (defvar name initial-value) 124 | (defthing name (list "initially " initial-value) "Variable")) 125 | 126 | @(define (&keyword name) 127 | (elem #:style 'tt "&" name)) 128 | 129 | @(define (param . stuff) (elem stuff #:style param-style)) 130 | @(define &key @&keyword{key}) 131 | @(define &rest @&keyword{rest}) 132 | @(define &body @&keyword{body}) 133 | @(define &optional @&keyword{optional}) 134 | @(define &allow-other-keys @&keyword{allow-other-keys}) 135 | @(define cl tt) 136 | @(define (lisp-code . stuff) 137 | (nested #:style 'code-inset 138 | (apply verbatim stuff))) 139 | 140 | @(define term italic) 141 | @(define (concept name) 142 | (index name (italic name))) 143 | 144 | @(define (groups x n) 145 | (if (> (length x) n) 146 | (cons (take x n) (groups (drop x n) n)) 147 | (list x))) 148 | 149 | @(define (bnf . rules) 150 | (tabular #:sep (hspace 1) 151 | #:column-properties '(right center left right) 152 | #:style (style #f (list (attributes '((style . "width: 100%"))))) 153 | (apply append rules))) 154 | @(define (rule name . generates) 155 | (for/list ([rhs (in-list (groups generates 2))] 156 | [n (in-naturals)]) 157 | (if (zero? n) 158 | (list* name "::=" rhs) 159 | (list* "" "|" rhs)))) 160 | @(define (tag-rule lhs tag rhs) 161 | (rule lhs (list (tt tag) " " rhs))) 162 | 163 | @(define (todo . stuff) 164 | (list (bold "TODO: ") stuff)) 165 | @(define (note . stuff) 166 | (list (bold "NOTE: ") stuff)) 167 | 168 | @(define (image/width width pathname #:scale [scale 0.5]) 169 | (image #:style (style #f 170 | (list 171 | (attributes 172 | `((style . ,(string-append "height: auto; width: " width)))) 173 | (command-optional (list (string-append "width=" width))))) 174 | #:scale scale 175 | pathname)) 176 | 177 | @(define-syntax code-template 178 | (syntax-rules [var] 179 | [(code-template) '()] 180 | [(code-template (var x) rest ...) 181 | (cons x (code-template rest ...))] 182 | [(code-template x rest ...) 183 | (cons (tt x) (code-template rest ...))])) 184 | @(define (var _) (error "No var outside code-template")) 185 | -------------------------------------------------------------------------------- /Documentation/linting.scrbl: -------------------------------------------------------------------------------- 1 | #lang scribble/base 2 | @require["spec-macros.scrbl" scribble-math/dollar] 3 | 4 | @title{Linting and warnings} 5 | 6 | one-more-re-nightmare can produce warnings at compile-time for various 7 | mistakes when writing regular expressions. The compiler produces a 8 | finite state machine, which involves traversing every execution path 9 | of the machine, so it can perform analysis with no false positives or 10 | negatives. 11 | 12 | @definitions{ 13 | @define-condition["lint-style-warning" "style-warning"] 14 | } 15 | 16 | Linting occurs when regular expressions are provided as literal 17 | strings in the source code. All @term{style warnings} generated are of the 18 | type @cl{lint-style-warning}. 19 | 20 | @section{Unreachability} 21 | 22 | @definitions{ 23 | @define-condition["not-matchable-style-warning" "lint-style-warning"] 24 | } 25 | 26 | The following issues generate @term{style warnings} at 27 | compile-time, of type @cl{not-matchable-style-warning}. They do not 28 | indicate that something will go wrong at run-time, but their behaviour 29 | is rarely desirable. 30 | 31 | @subsection{"This expression is impossible to match."} 32 | 33 | @definition-section["Explanation"]{ 34 | 35 | The expression will never match any expressions; it is equivalent to 36 | the empty set. 37 | 38 | } 39 | 40 | @definition-section["Examples"]{ 41 | 42 | @cl{a&b}: There are no characters that are simultaneously @cl{a} and @cl{b}. 43 | 44 | @cl{¬(a$|$b)&¬(¬(a$)&¬($b))}: There are no strings that match 45 | @cl{¬(a$|$b)} but not @cl{¬(a$)&¬($b)}. In other words, the linter is 46 | used to prove @${\overline{a \lor b} \Rightarrow \overline a \land 47 | \overline b}. While it is a fun idea, we don't recommend using the 48 | linter to check equivalence of Boolean expressions. 49 | 50 | } 51 | 52 | @subsection{"The group in this expression is impossible to match."} 53 | 54 | @definition-section["Explanation"]{ 55 | 56 | A submatch in the expression will never match any expressions; it may 57 | either correspond to the empty set, or is "shadowed" by an alternate 58 | expression. 59 | 60 | } 61 | 62 | @definition-section["Examples"]{ 63 | 64 | @cl{a|«a»} generates the warning "The first group in this expression 65 | is impossible to match.". The only string that the expression can 66 | match is @cl{a}, and the left-hand side of the @cl{|} operator takes 67 | precedence with POSIX semantics, so the right hand side can never match. 68 | 69 | @cl{a|«b&c»} generates the same warning. There are no characters that 70 | are simultaneously @cl{b} and @cl{c}. 71 | 72 | } 73 | 74 | @section{Matching too much} 75 | 76 | @definitions{ 77 | @define-condition["matching-too-much-style-warning" "lint-style-warning"] 78 | } 79 | 80 | Some regular expressions may match at every position, which is usually 81 | a sign of a mistake, as one usually wants to extract something from a 82 | string, and not everything. The following issues generate 83 | @term{style warnings} at compile-time, of type 84 | @cl{matching-too-much-style-warning}. 85 | 86 | @subsection{"This expression matches the empty string at every position."} 87 | 88 | @definition-section["Explanation"]{ 89 | 90 | The expression will match at every position, and most matches will 91 | have zero length. Often some @cl{*} repetition needs to be replaced with 92 | some @cl{+} repetition, to ensure matches contain at least one character. 93 | 94 | } 95 | 96 | @definition-section["Examples"]{ 97 | 98 | The following code will produce too many matches: 99 | 100 | @lisp-code{ 101 | (defun numbers (string) 102 | (one-more-re-nightmare:all-string-matches "[0-9]*" string)) 103 | (numbers "Phone: 6323003") 104 | ;; => (#("") #("") #("") #("") #("") #("") #("") #("6323003") #("")) 105 | } 106 | 107 | one-more-re-nightmare generates this warning when the @cl{numbers} 108 | function is submitted. One solution is to replace the @cl{*} repetition 109 | with @cl{+} repetition. 110 | 111 | @lisp-code{ 112 | (defun numbers (string) 113 | (one-more-re-nightmare:all-string-matches "[0-9]+" string)) 114 | (numbers "Phone: 6323003") 115 | ;; => (#("6323003")) 116 | } 117 | 118 | } 119 | 120 | @subsection{"This expression will only ever match the empty string at 121 | every position."} 122 | 123 | @definition-section["Explanation"]{ 124 | 125 | The expression will only match at every position, and all matches will 126 | have zero length. 127 | 128 | } 129 | 130 | @definition-section["Examples"]{ 131 | 132 | Using the empty string as a regular expression generates this warning. 133 | Other regular expressions which are not just the empty string can still 134 | generate this warning; @cl{|b&c} will generate this warning, as the 135 | regular expression still can only match the empty string. 136 | 137 | } 138 | 139 | @section{Syntax errors} 140 | 141 | Syntax errors can also be caught at compile-time, signalling full 142 | warnings, as function with invalid syntax will always fail at 143 | run-time. 144 | 145 | @definition-section["Examples"]{ 146 | 147 | @cl{(} generates a parsing error. The open-parenthesis should be 148 | matched with a closing @cl{)}. 149 | 150 | } 151 | 152 | @section{Type errors} 153 | 154 | Type errors can be caught at compile-time, signalling full warnings, 155 | as functions with type errors will always fail at run-time. 156 | 157 | @subsection{"This regular expression only produces registers, but 158 | variables were provided."} 159 | 160 | @definition-section["Explanation"]{ 161 | 162 | Too many register variables were provided for the regular expression 163 | provided to @cl{do-matches}. 164 | 165 | } 166 | 167 | @definition-section["Examples"]{ 168 | 169 | @cl{(one-more-re-nightmare:do-matches ((start end s1 e1) "abcde" x) 170 | (print (list s1 e1)))} generates the warning "This regular expression 171 | only produces two registers, but four variables were provided." There 172 | are no submatches in @cl{abcde}, but the @cl{do-matches} form was 173 | provided the variable names @cl{s1} and @cl{e1} for a submatch. 174 | 175 | } 176 | 177 | @subsection{SBCL reports a type conflict} 178 | 179 | @definition-section["Explanation"]{ 180 | 181 | one-more-re-nightmare provides specific types to SBCL for regular 182 | expressions provided as string literals. The SBCL compiler can use 183 | these types to detect errors in code that uses the results produced by 184 | one-more-re-nightmare. 185 | 186 | Specifically, one-more-re-nightmare provides the return type 187 | @code-template{(or null (simple-vector @var{@${2(n+1)}}))} for a call 188 | to @cl{first-match} with a regular expression with @${n} submatches. 189 | one-more-re-nightmare provides the type @cl{alexandria:array-index} 190 | for the first two register variables, and the type @cl{(or null 191 | alexandria:array-index)} for the remaining variables for @cl{do-matches}. 192 | 193 | } 194 | 195 | @definition-section["Examples"]{ 196 | 197 | @cl{(svref (first-match "abc" "abc") 2)} generates the warning 198 | "Derived type (INTEGER 2 2) is not a suitable index for (SIMPLE-VECTOR 199 | 2)." 200 | 201 | @cl{(do-matches ((s) "ab|ac" "ab") (print (symbol-name s)))} generates 202 | the warning "Derived type of ... is (VALUES (MOD ...) &OPTIONAL) 203 | conflicting with its asserted type SYMBOL." The variable @cl{s} will 204 | always be bound to an index, and never @cl{nil}, because the first two 205 | registers designate the bounds of the entire match. 206 | 207 | } 208 | -------------------------------------------------------------------------------- /Code/DFA-construction/make-dfa.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | (defstruct transition 4 | class 5 | next-state 6 | tags-to-set) 7 | 8 | (defclass state () 9 | ((exit-map :initarg :exit-map :accessor state-exit-map) 10 | (exit-effects :initarg :exit-effects :accessor state-exit-effects) 11 | (expression :initarg :expression :accessor state-expression) 12 | (transitions :initform '() :accessor state-transitions))) 13 | 14 | (defmethod print-object ((state state) stream) 15 | (print-unreadable-object (state stream :type t) 16 | (prin1 (state-expression state) stream))) 17 | 18 | (defun find-similar-state (states state) 19 | "Find another state which we can re-use with some transformation, returning that state and the required transformation." 20 | (flet ((win (other-state substitutions) 21 | (return-from find-similar-state 22 | (values other-state 23 | (loop with used = (used-tags (state-expression other-state)) 24 | for ((v1 r1) . (v2 r2)) 25 | in (alexandria:hash-table-alist substitutions) 26 | when (member (list v2 r2) used :test #'equal) 27 | collect (cons (list v2 r2) (list v1 r1))))))) 28 | (loop with expression = (state-expression state) 29 | for other-state in states 30 | for other-expression = (state-expression other-state) 31 | for substitutions = (similar expression other-expression) 32 | for used = (used-tags other-expression) 33 | unless (null substitutions) 34 | do (win other-state substitutions)))) 35 | 36 | (defun add-transition (class last-state next-state tags-to-set) 37 | (let* ((old-transitions (state-transitions last-state)) 38 | (same-transition 39 | (find-if (lambda (transition) 40 | (and 41 | (equal tags-to-set (transition-tags-to-set transition)) 42 | (eq next-state (transition-next-state transition)))) 43 | old-transitions))) 44 | (cond 45 | ((null same-transition) 46 | (push (make-transition 47 | :class class 48 | :next-state next-state 49 | :tags-to-set tags-to-set) 50 | (state-transitions last-state))) 51 | (t 52 | (setf (transition-class same-transition) 53 | (csum-union (transition-class same-transition) 54 | class)))))) 55 | 56 | (trivia:defun-match re-stopped-p (re) 57 | ((alpha (empty-set) _) t) 58 | ((empty-set) t) 59 | (_ nil)) 60 | 61 | (defun peephole-optimize (assignments used-tags) 62 | (let ((result '()) 63 | (remaining-assignments assignments)) 64 | (flet ((substitute-variable (variable replica source) 65 | (setf remaining-assignments 66 | (loop for ((v r) . s) in remaining-assignments 67 | ;; Rewrite {A <- B} C <- A to C <- B 68 | if (equal s (list variable replica)) 69 | collect (cons (list v r) source) 70 | else 71 | collect (cons (list v r) s))))) 72 | (loop until (null remaining-assignments) 73 | do (destructuring-bind ((variable replica) . source) 74 | (pop remaining-assignments) 75 | (if (member (list variable replica) used-tags :test #'equal) 76 | (push (cons (list variable replica) source) result) 77 | (substitute-variable variable replica source)))) 78 | (reverse result)))) 79 | 80 | (defvar *state-limit* 1000) 81 | (define-condition exceeded-state-limit (error) 82 | () 83 | (:report "Made too many states - either your regular expression is too complicated, or one-more-re-nightmare is broken. 84 | (Either way, you're not going to get this compiled any time soon.)")) 85 | 86 | (defun make-dfa-from-expressions (expressions) 87 | (let ((states (make-hash-table)) 88 | (possibly-similar-states (make-hash-table)) 89 | (work-list expressions)) 90 | (flet ((find-state (expression) 91 | (multiple-value-bind (state present?) 92 | (gethash expression states) 93 | (if present? 94 | (values state nil) 95 | (values (setf (gethash expression states) 96 | (make-instance 'state 97 | :expression expression)) 98 | t))))) 99 | (loop 100 | (when (null work-list) (return)) 101 | (when (> (hash-table-count states) *state-limit*) 102 | (error 'exceeded-state-limit)) 103 | (let* ((expression (pop work-list)) 104 | (state (find-state expression))) 105 | (cond 106 | ((or (re-stopped-p expression) (re-empty-p expression)) 107 | nil) 108 | (t 109 | (let ((classes (derivative-classes expression))) 110 | (dolist (class classes) 111 | (unless (csum-null-p class) 112 | (let* ((next-expression (derivative expression class)) 113 | (tags-to-set (keep-used-assignments 114 | next-expression 115 | (effects expression)))) 116 | (multiple-value-bind (next-state new?) 117 | (find-state next-expression) 118 | (multiple-value-bind (other-state transformation) 119 | (find-similar-state 120 | (cons state 121 | (gethash (remove-tags next-expression) possibly-similar-states '())) 122 | next-state) 123 | (cond 124 | ((null other-state) 125 | ;; No state to reuse, so check if we need to process the next state. 126 | (when new? 127 | (pushnew next-expression work-list))) 128 | (t 129 | ;; Reuse this state. 130 | (when new? 131 | (remhash (state-expression next-state) states)) 132 | (setf tags-to-set (peephole-optimize 133 | (append tags-to-set transformation) 134 | (used-tags (state-expression other-state))) 135 | next-state other-state))) 136 | (add-transition class 137 | state next-state 138 | tags-to-set))))))))) 139 | (push state (gethash (remove-tags expression) 140 | possibly-similar-states)) 141 | (setf (state-exit-map state) 142 | (tags (nullable expression)) 143 | (state-exit-effects state) 144 | (keep-used-assignments 145 | (nullable expression) 146 | (effects expression)))))) 147 | states)) 148 | 149 | (defun make-dfa-from-expression (expression) 150 | (make-dfa-from-expressions (list expression))) 151 | -------------------------------------------------------------------------------- /Code/DFA-construction/re-types.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | (define-types 4 | (literal set) 5 | (empty-string) 6 | (repeat r min max can-empty) 7 | (tag-set substitutions) 8 | (alpha expression history) 9 | (grep vector prototype) 10 | (either r s) 11 | (both r s) 12 | (invert r) 13 | (join r s)) 14 | 15 | (define-rewrites (literal set) 16 | :printer ((literal set) 17 | (print-csum set stream))) 18 | 19 | (defun kleene (r) 20 | (repeat r 0 nil nil)) 21 | (trivia:defpattern kleene (r) 22 | `(repeat ,r 0 nil nil)) 23 | 24 | (defun empty-set () (literal +empty-set+)) 25 | (trivia:defpattern empty-set () 26 | (alexandria:with-gensyms (set) 27 | `(trivia:guard (literal ,set) 28 | (csum-null-p ,set)))) 29 | 30 | (defun universal-set () 31 | (repeat (literal +universal-set+) 0 nil nil)) 32 | (trivia:defpattern universal-set () 33 | `(kleene (literal ',+universal-set+))) 34 | 35 | (defvar *subscripts* "₀₁₂₃₄₅₆₇₈₉") 36 | (defun subscripts (number) 37 | (map 'string 38 | (lambda (char) 39 | (aref *subscripts* (digit-char-p char))) 40 | (princ-to-string number))) 41 | 42 | (define-rewrites (empty-string) 43 | :printer (_ (write-string "ε" stream))) 44 | 45 | (define-rewrites (repeat r min max can-empty) 46 | :simplify (((repeat _ _ 0 _) (empty-string)) 47 | ((repeat r 1 1 _) r) 48 | ((repeat (empty-set) 0 nil _) (empty-string)) 49 | ((repeat (repeat r 0 nil _) 0 nil c) 50 | (repeat r 0 nil c))) 51 | :printer ((repeat r min max can-empty) 52 | (format stream "[~a]{~a~a,~a}" 53 | r (if can-empty "^" "") 54 | min 55 | (or max "")))) 56 | 57 | (define-rewrites (tag-set substitutions) 58 | :simplify (((tag-set (list)) 59 | (empty-string))) 60 | :printer ((tag-set s) 61 | (format stream "{~{~a ← ~a~^, ~}}" 62 | (loop for ((variable replica) . source) in s 63 | if (zerop replica) 64 | collect variable 65 | else 66 | collect (format nil "~a~a" 67 | variable 68 | (subscripts replica)) 69 | collect (case source 70 | ((position) "P") 71 | ((nil) "NIL") 72 | (otherwise 73 | (format nil "~a~a" 74 | (first source) 75 | (subscripts (second source))))))))) 76 | (define-rewrites (alpha expression history) 77 | :simplify (((alpha (empty-set) (empty-set)) (empty-set))) 78 | :printer ((alpha r n) 79 | (format stream "α[~@<~a, ~_~a]~:>" r n))) 80 | (define-rewrites (grep match-vector prototype) 81 | :simplify (((grep r _) 82 | (if (eq (nullable r) (empty-set)) 83 | (trivia.next:next) 84 | r)) 85 | ((grep (empty-set) _) 86 | (empty-set))) 87 | :printer ((grep r _) 88 | (format stream "γ[~a]" r))) 89 | 90 | (define-rewrites (either r s) 91 | :simplify (((either (either p r) s) 92 | (either p (either r s))) 93 | ((either (empty-set) r) r) 94 | ((either r (empty-set)) r) 95 | ((either (literal s1) (literal s2)) 96 | (literal (csum-union s1 s2))) 97 | ((either r (universal-set)) 98 | (if (has-tags-p r) 99 | (trivia.next:next) ; Preserve tags then 100 | (universal-set))) 101 | ((either (join (literal s1) p) 102 | (join (literal s2) r)) 103 | ;; Try to expose more prefixes. 104 | (if (equal s1 s2) 105 | (join (literal s1) (either p r)) 106 | (trivia.next:next))) 107 | ((either r s) 108 | (scan-either-for-duplicates r s))) 109 | :printer ((either r s) 110 | (format stream "~@<(~a) ∪ ~_(~a)~:>" r s))) 111 | 112 | (defun scan-either-for-duplicates (r s) 113 | (labels ((e (r s) 114 | ;; We can't call EITHER, since EITHER calls us, so we 115 | ;; handle simplifying out empty sets here. 116 | (cond 117 | ((eq s (empty-set)) r) 118 | ((eq r (empty-set)) s) 119 | (t (%either r s)))) 120 | (scan (rhs) 121 | (trivia:match rhs 122 | ((either lhs next-rhs) 123 | (if (eq (remove-tags lhs) (remove-tags r)) 124 | next-rhs 125 | (e lhs (scan next-rhs)))) 126 | (_ 127 | (if (eq (remove-tags rhs) (remove-tags r)) 128 | (empty-set) 129 | rhs))))) 130 | (e r (scan s)))) 131 | 132 | (define-rewrites (both r s) 133 | :simplify (((both r s) 134 | (if (eq r s) 135 | r 136 | (trivia.next:next))) 137 | ((both _ (empty-set)) (empty-set)) 138 | ((both (empty-set) _) (empty-set)) 139 | ((both (tag-set s1) (tag-set s2)) 140 | (tag-set (merge-tag-sets s1 s2))) 141 | ((both (tag-set s) (empty-string)) 142 | (tag-set s)) 143 | ((both (empty-string) (tag-set s)) 144 | (tag-set s)) 145 | ((both (literal s1) (literal s2)) 146 | (literal (csum-intersection s1 s2)))) 147 | :printer ((both r s) 148 | (format stream "(~a) ∩ (~a)" r s))) 149 | (define-rewrites (invert r) 150 | :simplify (((invert (invert r)) r) 151 | ((invert s) 152 | (if (has-tags-p s) 153 | (invert (remove-tags s)) 154 | (trivia.next:next))) 155 | ((invert (universal-set)) (empty-set))) 156 | :printer ((invert r) 157 | (format stream "¬[~a]" r))) 158 | (define-rewrites (join r s) 159 | :simplify (((join (tag-set s) (either p r)) 160 | (either (join (tag-set s) p) 161 | (join (tag-set s) r))) 162 | ((join (empty-set) _) (empty-set)) 163 | ((join _ (empty-set)) (empty-set)) 164 | ((join (empty-string) r) r) 165 | ((join r (empty-string)) r) 166 | ((join (join (tag-set s1) r) s) 167 | ;; Rotate so that a TAG-SET is always at the start. 168 | (join (tag-set s1) (join r s))) 169 | ((join (tag-set s1) (join (tag-set s2) r)) 170 | (join (tag-set (merge-tag-sets s1 s2)) r)) 171 | ;; Avoid A*A* 172 | ((join (kleene a) (join (kleene b) c)) 173 | (if (eq a b) 174 | (join (kleene a) c) 175 | (trivia.next:next))) 176 | ((join (kleene a) (kleene b)) 177 | (if (eq a b) 178 | (kleene a) 179 | (trivia.next:next))) 180 | ((join (tag-set s1) (tag-set s2)) 181 | (tag-set (merge-tag-sets s1 s2)))) 182 | :printer ((join r s) 183 | (format stream "~a~a" r s))) 184 | 185 | (defun text (vector) 186 | (reduce #'join (map 'vector 187 | (lambda (e) 188 | (literal (singleton-set (char-code e)))) 189 | vector) 190 | :initial-value (empty-string) 191 | :from-end t)) 192 | 193 | (defun group (r n) 194 | (join (tag-set `(((,(1- (* 2 n)) 0) . position))) 195 | (join r 196 | (tag-set `(((,(* 2 n) 0) . position)))))) 197 | -------------------------------------------------------------------------------- /Code/DFA-construction/sets.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | ;;;; Character sums 4 | ;;; Character sums (csums) represent a set of characters as 5 | ;;; a combination of ranges and "symbolic" classes (such as 6 | ;;; [:alpha:], [:digit:], etc). Their implementation is somewhat 7 | ;;; similar to Gilbert Baumann's "isum" integer sums, used 8 | ;;; in clex2 and earlier versions of one-more-re-nightmare. 9 | 10 | ;;; Class sets 11 | ;; A class set is an element of ℙ(ℙ(classes)) i.e. a set of sets 12 | ;; of character classes that are part of a range. 13 | 14 | ;; This gets normalised to P on SBCL, but it looks pretty. 15 | (eval-when (:compile-toplevel :load-toplevel :execute) 16 | (defun ℙ (x) (expt 2 x))) 17 | (alexandria:define-constant +classes+ 18 | '((:alpha alpha-char-p) (:digit digit-char-p) (:lower lower-case-p) (:upper upper-case-p)) 19 | :test 'equal) 20 | (defconstant +empty-class-set+ 0) 21 | (defconstant +class-set-bits+ (ℙ (length +classes+))) 22 | (defconstant +universal-class-set+ (1- (ℙ (ℙ (length +classes+))))) 23 | (defun class-set-complement (c) (logxor #xFFFF c)) 24 | 25 | ;;; Character sets 26 | ;; A character set is a union of intersections of character ranges 27 | ;; and class sets. A character set has the form ((class-set start end) ...) 28 | ;; with the first start fixed to 0 and the last end fixed to *CODE-LIMIT*. 29 | 30 | (defvar *code-limit* char-code-limit) 31 | (define-symbol-macro +empty-set+ (list (list +empty-class-set+ 0 *code-limit*))) 32 | (define-symbol-macro +universal-set+ (list (list +universal-class-set+ 0 *code-limit*))) 33 | (defun range (start limit) 34 | "The character set for [START, LIMIT)" 35 | (list (list +empty-class-set+ 0 start) 36 | (list +universal-class-set+ start limit) 37 | (list +empty-class-set+ limit *code-limit*))) 38 | (defun singleton-set (x) (range x (1+ x))) 39 | (defun class-set (class) 40 | (let ((p (position class +classes+ :key #'first))) 41 | (when (null p) (error "No class named ~S" class)) 42 | (loop for i below +class-set-bits+ 43 | when (logbitp p i) 44 | sum (ash 1 i) into class-set 45 | finally (return (list (list class-set 0 *code-limit*)))))) 46 | (defun remove-empty-ranges (csum) 47 | (remove 0 csum :key #'first)) 48 | 49 | (defun print-csum (csum stream) 50 | (labels ((range (start end) 51 | (if (= (1- end) start) 52 | (string (code-char start)) 53 | (format nil "~C-~C" (code-char start) (code-char (1- end)))))) 54 | (trivia:match (remove-empty-ranges csum) 55 | ('() (write-string "[]" stream)) 56 | ((equal +universal-set+) (write-string "Σ" stream)) 57 | ((list (list (= +universal-class-set+) start end)) 58 | (if (= (1- end) start) 59 | (write-char (code-char start) stream) 60 | (format stream "[~C-~C]" (code-char start) (code-char (1- end))))) 61 | (pos-parts 62 | (trivia:match (remove-empty-ranges (csum-complement csum)) 63 | ((list (list (= +universal-class-set+) start end)) 64 | (format stream "[¬~A]" (range start end))) 65 | (neg-parts 66 | (write-char #\[ stream) 67 | (multiple-value-bind (negative? parts) 68 | (if (> (length pos-parts) (length neg-parts)) 69 | (values t neg-parts) 70 | (values nil pos-parts)) 71 | (when negative? (write-char #\¬ stream)) 72 | (loop for (classes start end) in parts 73 | if (= classes +universal-class-set+) 74 | do (write-string (range start end) stream) 75 | else 76 | do (write (list classes start end) :stream stream))) 77 | (write-char #\] stream))))))) 78 | 79 | ;;; Operations on character sets 80 | 81 | (defun coalesce-csum (cset) 82 | "Coalesce adjacent ranges with the same class set in a character set." 83 | (loop until (null cset) 84 | collect (let* ((f (first cset)) 85 | (l (member (first f) (rest cset) :key #'first :test #'/=))) 86 | (setf cset l) 87 | (if (null l) 88 | (list (first f) (second f) *code-limit*) 89 | (list (first f) (second f) (second (first l))))))) 90 | 91 | ;; A set table is a list of lists of values, and a list of ranges. 92 | ;; We ensure that ranges line up by only storing the ranges 93 | ;; once. 94 | (defun align-csums (csets) 95 | "Align the ranges in a list of character sets, returning a list of lists of values, and a list of ranges." 96 | (labels ((align (csets values ranges start) 97 | (if (null (first csets)) 98 | (values (reverse values) (reverse ranges)) 99 | (let ((end (reduce #'min csets :key (lambda (c) (third (first c)))))) 100 | ;; Take a step. 101 | (align 102 | (loop for c in csets 103 | collect (if (= (third (first c)) end) (rest c) c)) 104 | (cons (loop for c in csets collect (first (first c))) values) 105 | (cons (list start end) ranges) 106 | end))))) 107 | (align csets '() '() 0))) 108 | 109 | (defmacro define-csum-op (name class-op arguments) 110 | `(defun ,name ,arguments 111 | (multiple-value-bind (values ranges) 112 | (align-csums (list ,@arguments)) 113 | (coalesce-csum 114 | (mapcar (lambda (v r) (cons (apply #',class-op v) r)) 115 | values ranges))))) 116 | 117 | (define-csum-op csum-union logior (a b)) 118 | (define-csum-op csum-intersection logand (a b)) 119 | (define-csum-op csum-complement class-set-complement (a)) 120 | (define-csum-op csum-difference logandc2 (a b)) 121 | (defun csum-null-p (csum) (equal csum +empty-set+)) 122 | 123 | ;;; Character set dispatch 124 | ;; This could have element type (UNSIGNED-BYTE 4) but that'd take 125 | ;; more effort to decode; so we go with bytes. 126 | ;; Rhetorical question: Are there any other ways to compress this 127 | ;; table? 128 | (alexandria:define-constant +character-class-table+ 129 | (let ((table 130 | (make-array char-code-limit 131 | :element-type '(unsigned-byte 8) 132 | :initial-element 0))) 133 | (dotimes (i char-code-limit table) 134 | (let ((character (code-char i))) 135 | (unless (null character) 136 | (loop for (nil predicate) in +classes+ 137 | for x = 1 then (ash x 1) 138 | do (when (funcall predicate (code-char i)) 139 | (setf (aref table i) (logior x (aref table i))))))))) 140 | :test 'equalp) 141 | (declaim (inline lookup-class)) 142 | (defun lookup-class (code) 143 | (aref +character-class-table+ code)) 144 | 145 | (defmacro csum-case (var less-than equal &body cases) 146 | (labels ((dispatch-classes (values) 147 | (if (alexandria:length= 1 values) 148 | `(progn ,@(cdar values)) 149 | (alexandria:with-gensyms (result) 150 | `(let ((,result (lookup-class ,var))) 151 | (cond 152 | ,@(loop for (class-set . body) in values 153 | collect `(,(if (= +universal-class-set+ class-set) 154 | 't 155 | `(logbitp ,result ,class-set)) 156 | ,@body))))))) 157 | (singleton-p (range) (= (1+ (first range)) (second range))) 158 | (middle (list) (butlast (rest list))) 159 | (dispatch-csums (values ranges) 160 | (cond 161 | ((alexandria:length= 1 ranges) 162 | ;; There's only one more range, so dispatch on classes. 163 | (dispatch-classes (first values))) 164 | ;; Detect singleton sets to use = on, e.g. [^ab], a and b. 165 | ((and (equal (first values) (first (last values))) 166 | (every #'singleton-p (middle ranges))) 167 | `(cond 168 | ,@(loop for r in (middle ranges) 169 | for v in (middle values) 170 | collect `((,equal ,var ,(first r)) ,(dispatch-classes v))) 171 | (t ,(dispatch-classes (first values))))) 172 | ;; Bisect and continue dispatching. 173 | (t 174 | (let* ((mid (floor (length values) 2))) 175 | `(if (,less-than ,var ,(first (nth mid ranges))) 176 | ,(dispatch-csums (subseq values 0 mid) 177 | (subseq ranges 0 mid)) 178 | ,(dispatch-csums (subseq values mid) 179 | (subseq ranges mid)))))))) 180 | (multiple-value-bind (values ranges) 181 | (align-csums 182 | (loop for (csum . body) in cases 183 | collect (loop for (cl s e) in csum 184 | collect `((,cl . ,body) ,s ,e)))) 185 | ;; Remove unreachable values from the set table. 186 | (let ((values (mapcar #'remove-empty-ranges values))) 187 | (dispatch-csums values ranges))))) 188 | 189 | (defun csum-has-classes-p (csum) 190 | "Does a character sum use any non-trivial character classes?" 191 | (loop for (c s e) in csum 192 | thereis (and (/= c +empty-class-set+) (/= c +universal-class-set+)))) 193 | 194 | (defun make-test-form (csum variable) 195 | "Compute a form which tests if VARIABLE is an element of CSUM, using OR, <= and =" 196 | (cond 197 | ((equal csum +empty-set+) 'nil) 198 | ((equal csum +universal-set+) 't) 199 | (t 200 | `(or ,@(loop for (c s e) in csum 201 | unless (= c +empty-class-set+) 202 | do (assert (= c +universal-class-set+)) 203 | and collect (if (= (1+ s) e) 204 | `(= ,s ,variable) 205 | `(<= ,s ,variable ,(1- e)))))))) 206 | 207 | ;;; Named sets 208 | 209 | (defun named-range (name) 210 | (labels ((∪ (&rest rest) (reduce #'csum-union rest)) 211 | (d (a b) (csum-intersection a (csum-complement b))) 212 | (s (&rest rest) (reduce #'csum-union rest :key (alexandria:compose #'singleton-set #'char-code)))) 213 | (alexandria:eswitch (name :test 'string=) 214 | ("alpha" (class-set :alpha)) 215 | ("alnum" (∪ (class-set :alpha) (class-set :digit))) 216 | ("blank" (s #\Space #\Tab)) 217 | ("cntrl" (∪ (range 0 32) (singleton-set 127))) 218 | ("digit" (class-set :digit)) 219 | ("graph" (csum-complement (∪ (named-range "cntrl") (s #\Space)))) 220 | ("lower" (class-set :lower)) 221 | ("print" (∪ (named-range "graph") (s #\Space))) 222 | ("punct" (d (named-range "graph") (∪ (class-set :alpha) (class-set :digit)))) 223 | ("space" (∪ (singleton-set 11) (s #\Space #\Return #\Newline #\Tab))) 224 | ("upper" (class-set :upper)) 225 | ("xdigit" (∪ (class-set "digit") (s #\A #\B #\C #\D #\E #\F #\a #\b #\c #\d #\e #\f)))))) 226 | -------------------------------------------------------------------------------- /Code/SIMD/sbcl-x86-64.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | (defun find-op (name) 4 | (or (find-symbol (format nil "V-~a~d" name *bits*) 5 | ':one-more-re-nightmare.vector-primops) 6 | (error "No primop named ~a" name))) 7 | 8 | ;; AVX2 is a pile of stink and only has instructions for signed 9 | ;; comparisons. So, in order to fake an unsigned comparison, we 10 | ;; subtract #x80 from everything. 11 | 12 | (defun find-8-bit-broadcast (n) 13 | (find-broadcast (mod (- n #x80) #x100))) 14 | (defvar *swizzled-name*) 15 | (defun swizzle-8-bits () 16 | (or *swizzled-name* 17 | (setf *swizzled-name* (make-symbol "SWIZZLE")))) 18 | 19 | (defun translate-scalar-code (variable code) 20 | (let* ((*swizzled-name* nil) 21 | (translated (%translate-scalar-code code))) 22 | (if (null *swizzled-name*) 23 | translated 24 | `(let ((,*swizzled-name* 25 | (one-more-re-nightmare.vector-primops:v8- ,variable ,(find-broadcast #x80)))) 26 | ,translated)))) 27 | 28 | (trivia:defun-ematch %translate-scalar-code (code) 29 | "Translate some 'scalar' code generated by MAKE-TEST-FORM into a vectorised computation." 30 | ('t :always) 31 | ('nil :never) 32 | ;; All the Boolean operators just map over their arguments. 33 | ((list 'not thing) 34 | `(,(find-op "NOT") 35 | ,(%translate-scalar-code thing))) 36 | ((list* 'or things) 37 | (reduce (lambda (a b) `(,(find-op "OR") ,a ,b)) 38 | (mapcar #'%translate-scalar-code things))) 39 | ;; Ditto for = really. 40 | ((list '= value variable) 41 | ;; Note that = works the same if it's signed or not; it's only > 42 | ;; that requires more effort 43 | `(,(ecase *bits* 44 | (32 'one-more-re-nightmare.vector-primops:v32=) 45 | (8 'one-more-re-nightmare.vector-primops:v8=)) 46 | ,(find-broadcast value) ,variable)) 47 | ;; Generating good code for <= is tricky though. Whoever designed 48 | ;; SSE2 and AVX2 decided that just having = and > were good enough, 49 | ;; so we need an efficient implementation of ≤ from those. 50 | ((list '<= 0 value high) 51 | ;; No lower bounds here. Note that X ≤ N ⇔ N + 1 > X 52 | (ecase *bits* 53 | (32 `(one-more-re-nightmare.vector-primops:v32> ,(find-broadcast (1+ high)) ,value)) 54 | (8 `(one-more-re-nightmare.vector-primops:v8> ,(find-8-bit-broadcast (1+ high)) ,value)))) 55 | ((list '<= low value high) 56 | ;; Similarly, N ≤ X ⇔ X > N - 1 57 | (ecase *bits* 58 | (32 59 | `(one-more-re-nightmare.vector-primops:v-and32 60 | ;; Similarly, N ≤ X ⇔ X > N - 1 61 | (one-more-re-nightmare.vector-primops:v32> ,value ,(find-broadcast (1- low))) 62 | (one-more-re-nightmare.vector-primops:v32> ,(find-broadcast (1+ high)) ,value))) 63 | (8 64 | `(one-more-re-nightmare.vector-primops:v-and8 65 | (one-more-re-nightmare.vector-primops:v8> ,(swizzle-8-bits) 66 | ,(find-8-bit-broadcast (1- low))) 67 | (one-more-re-nightmare.vector-primops:v8> ,(find-8-bit-broadcast (1+ high)) 68 | ,(swizzle-8-bits))))))) 69 | 70 | (defmacro define-boring-vop (name args result &body generator) 71 | `(progn 72 | (sb-vm::define-vop (,name) 73 | (:translate ,name) 74 | (:policy :fast-safe) 75 | (:args ,@(loop for (name nil . rest) in args 76 | collect (cons name rest))) 77 | (:arg-types ,@(mapcar #'second args)) 78 | (:results (,(first result) ,@(rest (rest result)))) 79 | (:result-types ,(second result)) 80 | (:generator 0 ,@generator)))) 81 | 82 | (defmacro define-op (name bits args instruction-name) 83 | (let ((primitive-type (ecase bits 84 | (8 'sb-vm::simd-pack-256-ub8) 85 | (32 'sb-vm::simd-pack-256-ub32)))) 86 | `(progn 87 | (sb-c:defknown ,name 88 | ,(loop for nil in args collect `(sb-ext:simd-pack-256 (unsigned-byte ,bits))) 89 | (sb-ext:simd-pack-256 (unsigned-byte ,bits)) 90 | (sb-c:foldable sb-c:flushable sb-c:movable) 91 | :overwrite-fndb-silently t) 92 | (define-boring-vop ,name 93 | ,(loop for arg in args 94 | collect `(,arg ,primitive-type :scs (sb-vm::int-avx2-reg))) 95 | (result ,primitive-type :scs (sb-vm::int-avx2-reg)) 96 | (sb-vm::inst ,instruction-name result ,@args))))) 97 | 98 | (defun one-more-re-nightmare.vector-primops:all-of (variables) 99 | (reduce (lambda (a b) `(,(find-op "AND") ,a ,b)) 100 | variables)) 101 | 102 | (defconstant one-more-re-nightmare.vector-primops:+v-length+ 256) 103 | 104 | (in-package :sb-vm) 105 | 106 | ;;;; Boolean operations 107 | 108 | (one-more-re-nightmare::define-op 109 | one-more-re-nightmare.vector-primops:v-and8 8 (a b) vpand) 110 | (one-more-re-nightmare::define-op 111 | one-more-re-nightmare.vector-primops:v-and32 32 (a b) vpand) 112 | 113 | (one-more-re-nightmare::define-op 114 | one-more-re-nightmare.vector-primops:v-or8 8 (a b) vpor) 115 | (one-more-re-nightmare::define-op 116 | one-more-re-nightmare.vector-primops:v-or32 32 (a b) vpor) 117 | 118 | (macrolet ((frob (name bits arg-type) 119 | `(progn 120 | (defknown ,name 121 | ((simd-pack-256 (unsigned-byte ,bits))) 122 | (simd-pack-256 (unsigned-byte ,bits)) 123 | (foldable flushable movable) 124 | :overwrite-fndb-silently t) 125 | (define-vop (,name) 126 | (:translate ,name) 127 | (:policy :fast-safe) 128 | (:args (value :scs (int-avx2-reg))) 129 | (:arg-types ,arg-type) 130 | (:results (result :scs (int-avx2-reg))) 131 | (:result-types ,arg-type) 132 | (:temporary (:sc int-avx2-reg) ones) 133 | (:generator 0 134 | (inst vpcmpeqd ones ones ones) ; get all 1s 135 | (inst vpxor result ones value)))))) ; 1111... (+) A = ¬A 136 | (frob one-more-re-nightmare.vector-primops:v-not8 8 simd-pack-256-ub8) 137 | (frob one-more-re-nightmare.vector-primops:v-not32 32 simd-pack-256-ub32)) 138 | 139 | ;;;; Comparisons 140 | 141 | ;; This is a signed comparison, but as there are fewer than 2³¹ 142 | ;; Unicode characters, no one needs to know that. 143 | (one-more-re-nightmare::define-op 144 | one-more-re-nightmare.vector-primops:v32> 32 (a b) vpcmpgtd) 145 | 146 | ;; We do need to know that this is a signed comparison, since we do 147 | ;; want to target (UNSIGNED-BYTE 8) too, and we handle it above. 148 | (one-more-re-nightmare::define-op 149 | one-more-re-nightmare.vector-primops:v8> 8 (a b) vpcmpgtb) 150 | 151 | (one-more-re-nightmare::define-op 152 | one-more-re-nightmare.vector-primops:v32= 32 (a b) vpcmpeqd) 153 | 154 | (one-more-re-nightmare::define-op 155 | one-more-re-nightmare.vector-primops:v8= 8 (a b) vpcmpeqb) 156 | 157 | (one-more-re-nightmare::define-op 158 | one-more-re-nightmare.vector-primops:v8- 8 (a b) vpsubb) 159 | 160 | ;;;; Broadcasts 161 | 162 | (defknown one-more-re-nightmare.vector-primops:v-broadcast32 163 | ((unsigned-byte 32)) 164 | (simd-pack-256 (unsigned-byte 32)) 165 | ;; Not constant folding, because loading a folded broadcast is 166 | ;; slower than reproducing it again. 167 | (flushable movable) 168 | :overwrite-fndb-silently t) 169 | 170 | (one-more-re-nightmare::define-boring-vop 171 | one-more-re-nightmare.vector-primops:v-broadcast32 172 | ((integer unsigned-num :scs (unsigned-reg))) 173 | (result simd-pack-256-ub32 :scs (int-avx2-reg)) 174 | (inst movq result integer) 175 | (inst vpbroadcastd result result)) 176 | 177 | (defknown one-more-re-nightmare.vector-primops:v-broadcast8 178 | ((unsigned-byte 8)) 179 | (simd-pack-256 (unsigned-byte 8)) 180 | (flushable movable) 181 | :overwrite-fndb-silently t) 182 | 183 | (one-more-re-nightmare::define-boring-vop 184 | one-more-re-nightmare.vector-primops:v-broadcast8 185 | ((integer unsigned-num :scs (unsigned-reg))) 186 | (result simd-pack-256-ub8 :scs (int-avx2-reg)) 187 | (inst movq result integer) 188 | (inst vpbroadcastb result result)) 189 | 190 | ;;;; Move mask 191 | 192 | (defknown one-more-re-nightmare.vector-primops:v-movemask32 193 | ((simd-pack-256 (unsigned-byte 32))) 194 | (unsigned-byte 8) 195 | (flushable movable) 196 | :overwrite-fndb-silently t) 197 | 198 | (one-more-re-nightmare::define-boring-vop 199 | one-more-re-nightmare.vector-primops:v-movemask32 200 | ((pack simd-pack-256-ub32 :scs (int-avx2-reg))) 201 | (result unsigned-num :scs (unsigned-reg)) 202 | (inst vmovmskps result pack)) 203 | 204 | (defknown one-more-re-nightmare.vector-primops:v-movemask8 205 | ((simd-pack-256 (unsigned-byte 8))) 206 | (unsigned-byte 32) 207 | (flushable movable) 208 | :overwrite-fndb-silently t) 209 | 210 | (one-more-re-nightmare::define-boring-vop 211 | one-more-re-nightmare.vector-primops:v-movemask8 212 | ((pack simd-pack-256-ub8 :scs (int-avx2-reg))) 213 | (result unsigned-num :scs (unsigned-reg)) 214 | (inst vpmovmskb result pack)) 215 | 216 | ;;;; Load 217 | 218 | (defknown one-more-re-nightmare.vector-primops:v-load32 219 | ((simple-array character 1) sb-int:index) 220 | (simd-pack-256 (unsigned-byte 32)) 221 | (foldable flushable movable) 222 | :overwrite-fndb-silently t) 223 | 224 | (one-more-re-nightmare::define-boring-vop 225 | one-more-re-nightmare.vector-primops:v-load32 226 | ((string simple-character-string :scs (descriptor-reg)) 227 | (index tagged-num :scs (any-reg))) 228 | (result simd-pack-256-ub32 :scs (int-avx2-reg)) 229 | (inst vmovdqu result 230 | (ea (- (* vector-data-offset n-word-bytes) 231 | other-pointer-lowtag) 232 | ;; Characters are 4 bytes, fixnums have a trailing 0 so 233 | ;; just multiply by 2. 234 | string index 2))) 235 | 236 | (defknown one-more-re-nightmare.vector-primops:v-load8 237 | ((simple-array base-char 1) sb-int:index) 238 | (simd-pack-256 (unsigned-byte 8)) 239 | (foldable flushable movable) 240 | :overwrite-fndb-silently t) 241 | 242 | (one-more-re-nightmare::define-boring-vop 243 | one-more-re-nightmare.vector-primops:v-load8 244 | ((string simple-base-string :scs (descriptor-reg)) 245 | (index unsigned-num :scs (unsigned-reg))) 246 | (result simd-pack-256-ub8 :scs (int-avx2-reg)) 247 | (inst vmovdqu result 248 | (ea (- (* vector-data-offset n-word-bytes) 249 | other-pointer-lowtag) 250 | string index 1))) 251 | 252 | ;;;; Find first set 253 | 254 | (defknown one-more-re-nightmare.vector-primops:find-first-set 255 | ((unsigned-byte 64)) 256 | (mod 64) 257 | (foldable flushable movable) 258 | :overwrite-fndb-silently t) 259 | 260 | (one-more-re-nightmare::define-boring-vop 261 | one-more-re-nightmare.vector-primops:find-first-set 262 | ((integer unsigned-num :scs (unsigned-reg))) 263 | (result unsigned-num :scs (unsigned-reg)) 264 | (inst bsf result integer)) 265 | -------------------------------------------------------------------------------- /Code/Interface/interface.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | (defun re-groups (regular-expression) 4 | (nth-value 1 5 | (with-hash-consing-tables () 6 | (parse-regular-expression regular-expression)))) 7 | 8 | (defmacro collect ((function) &body body) 9 | (alexandria:with-gensyms (list tail) 10 | `(let* ((,list (list 'nil)) 11 | (,tail ,list)) 12 | (flet ((,function (element) 13 | (let ((new-tail (list element))) 14 | (setf (cdr ,tail) new-tail 15 | ,tail new-tail)))) 16 | (declare (dynamic-extent #',function) 17 | (inline ,function)) 18 | ,@body 19 | (cdr ,list))))) 20 | 21 | (defmacro with-code (((function size) code) &body body) 22 | `(let ((,function (car ,code)) 23 | (,size (cdr ,code))) 24 | ,@body)) 25 | 26 | (eval-when (:compile-toplevel :load-toplevel :execute) 27 | (defun constant-safe-to-eval-p (form) 28 | (trivia:match form 29 | ((or (type string) (type symbol) (list 'quote (type string))) t) 30 | (_ nil))) 31 | 32 | (defun try-to-evaluate-constant-re (form) 33 | (if (and (constantp form) 34 | (constant-safe-to-eval-p form)) 35 | (let ((result (eval form))) 36 | (if (stringp result) 37 | result 38 | nil)) 39 | nil))) 40 | 41 | (define-compiler-macro compile-regular-expression (&whole w expression) 42 | (let ((re (try-to-evaluate-constant-re expression))) 43 | (cond 44 | ((null re) w) 45 | (t 46 | (handler-case 47 | (lint-regular-expression re) 48 | (error (e) 49 | (warn "Error while linting:~%~a" e) 50 | w) 51 | (:no-error (&rest values) 52 | (declare (ignore values)) 53 | `(make-compiled-regular-expression 54 | :codes (vector ,@(loop for type in *string-types* 55 | collect `(find-code ,re ',type))) 56 | :original-re ',re))))))) 57 | 58 | (defmacro with-code-for-vector ((function size vector regular-expression bailout-form) &body body) 59 | (alexandria:with-gensyms (result) 60 | `(let ((,result (try-to-evaluate-constant-re ,regular-expression))) 61 | (cond 62 | ((null ,result) 63 | ,bailout-form) 64 | (t 65 | (handler-case 66 | (lint-regular-expression ,result) 67 | (error (e) 68 | (warn "Error while linting:~%~a" e) 69 | ,bailout-form) 70 | (:no-error (&rest values) 71 | (declare (ignore values)) 72 | (alexandria:once-only (,vector) 73 | (alexandria:with-gensyms (,function ,size) 74 | `(let ((,,size ,(match-vector-size (re-groups ,result))) 75 | (,,function 76 | (cond 77 | ,@(loop for string-type in *string-types* 78 | collect `((typep ,,vector ',string-type) 79 | (load-time-value (car (find-code ,,result ',string-type))))) 80 | (t (car (find-code ,,result (string-type-of ,,vector))))))) 81 | ,(progn ,@body))))))))))) 82 | 83 | (declaim (inline %all-matches) 84 | (ftype (function * list) %all-matches)) 85 | (defun %all-matches (function size vector start end) 86 | (declare (alexandria:array-index start end) 87 | (function function) 88 | (fixnum size)) 89 | (assert (and (<= end (length vector)) 90 | (<= 0 start end))) 91 | ;; The code function will fill in values as needed. 92 | (let ((match (make-array size))) 93 | (collect (result) 94 | (funcall function vector start end match 95 | (lambda () 96 | (result (copy-seq match))))))) 97 | 98 | (declaim (ftype (function 99 | (re-designator string 100 | &key (:start alexandria:array-index) 101 | (:end (or alexandria:array-length null))) 102 | list) 103 | all-matches all-string-matches)) 104 | 105 | (defun all-matches (regular-expression vector 106 | &key (start 0) (end (length vector))) 107 | "Find every match, as a list of match vectors." 108 | (with-code ((function size) 109 | (find-code regular-expression (string-type-of vector))) 110 | (%all-matches function size vector start (or end (length vector))))) 111 | 112 | (define-compiler-macro all-matches (&whole w 113 | regular-expression vector 114 | &key (start 0) 115 | (end nil end-p)) 116 | ;; Grab code at load-time if possible. 117 | (with-code-for-vector (function size vector regular-expression w) 118 | `(%all-matches ,function ,size ,vector 119 | ,start ,(if end-p `(or ,end (length ,vector)) `(length ,vector))))) 120 | 121 | (declaim (inline subsequences)) 122 | (defun subsequences (vector match-vector) 123 | (declare (simple-vector match-vector) 124 | (vector vector) 125 | (optimize (speed 3) (safety 0))) 126 | (let* ((sequences (floor (length match-vector) 2)) 127 | (string-match-vector (make-array sequences))) 128 | (loop for n below sequences 129 | for start = (aref match-vector (* n 2)) 130 | for end = (aref match-vector (1+ (* n 2))) 131 | if (null start) 132 | do (setf (aref string-match-vector n) nil) 133 | else 134 | do (setf (aref string-match-vector n) 135 | (subseq vector start end))) 136 | string-match-vector)) 137 | 138 | (defun all-string-matches (regular-expression vector 139 | &key (start 0) (end (length vector))) 140 | "Find every match, as a list of match string vectors." 141 | (mapcar (lambda (match) (subsequences vector match)) 142 | (all-matches regular-expression 143 | vector 144 | :start start 145 | :end end))) 146 | 147 | (define-compiler-macro all-string-matches (&whole w 148 | regular-expression vector 149 | &key (start 0) 150 | (end nil end-p)) 151 | ;; Grab code at load-time if possible. 152 | (with-code-for-vector (function size vector regular-expression w) 153 | `(mapcar (lambda (match) (subsequences ,vector match)) 154 | (%all-matches ,function ,size ,vector 155 | ,start ,(if end-p `(or ,end nil) `(length ,vector)))))) 156 | 157 | (declaim (inline %first-match)) 158 | (defun %first-match (function size vector start end) 159 | (declare (alexandria:array-index start end) 160 | (fixnum size) 161 | (function function)) 162 | (assert (and (<= end (length vector)) 163 | (<= 0 start end))) 164 | (let ((tag-vector (make-array size))) 165 | (funcall function vector start end tag-vector 166 | (lambda () 167 | (return-from %first-match tag-vector))) 168 | nil)) 169 | 170 | (declaim (ftype (function 171 | (re-designator string 172 | &key (:start alexandria:array-index) 173 | (:end (or alexandria:array-length null))) 174 | (or null simple-vector)) 175 | first-match first-string-match)) 176 | 177 | (defun first-match (regular-expression vector 178 | &key (start 0) (end (length vector))) 179 | "Find the first match, returning a match vector, or NIL." 180 | (with-code ((function size) 181 | (find-code regular-expression (string-type-of vector))) 182 | (%first-match function size vector start (or end (length vector))))) 183 | 184 | (define-compiler-macro first-match (&whole w 185 | regular-expression vector 186 | &key (start 0) (end nil end-p)) 187 | (with-code-for-vector (function size vector regular-expression w) 188 | `(%first-match ,function ,size ,vector ,start 189 | ,(if end-p `(or ,end nil) `(length ,vector))))) 190 | 191 | (defun first-string-match (regular-expression vector 192 | &key (start 0) (end (length vector))) 193 | "Find the first match, returning a match string vector or NIL" 194 | (let ((results (first-match regular-expression vector 195 | :start start :end end))) 196 | (if (null results) 197 | nil 198 | (subsequences vector results)))) 199 | 200 | (define-compiler-macro first-string-match (&whole w 201 | regular-expression vector 202 | &key (start 0) (end nil end-p)) 203 | (with-code-for-vector (function size vector regular-expression w) 204 | `(subsequences ,vector 205 | (%first-match ,function ,size ,vector ,start 206 | ,(if end-p `(or ,end (length ,vector)) `(length ,vector)))))) 207 | 208 | (defmacro do-matches (((&rest registers) regular-expression vector 209 | &key (start 0) (end nil)) 210 | &body body) 211 | "Iterate over every match, binding match registers." 212 | (alexandria:with-gensyms (function size match-vector) 213 | (alexandria:once-only (start end vector) 214 | (labels ((consume (function size known-register-count) 215 | (when (and (not (null known-register-count)) 216 | (> (length registers) known-register-count)) 217 | (warn "This regular expression only produces ~r register~:p, but ~r variables were provided." 218 | known-register-count 219 | (length registers)) 220 | (setf known-register-count nil)) 221 | `(progn 222 | (when (null ,end) 223 | (setf ,end (length ,vector))) 224 | (assert (and (<= ,end (length ,vector)) 225 | (<= 0 ,start ,end))) 226 | (let ((,match-vector (make-array ,size))) 227 | ,(if (null known-register-count) 228 | `(assert (>= (length ,match-vector) 229 | ,(length registers)) 230 | () 231 | "This regular expression only produces ~r register~:p, but ~r variables were provided." 232 | (length ,match-vector) 233 | ,(length registers)) 234 | `(declare (dynamic-extent ,match-vector))) 235 | (funcall ,function ,vector ,start ,end ,match-vector 236 | (lambda () 237 | (let ,(loop for register in registers 238 | for n from 0 239 | collect `(,register (svref ,match-vector ,n))) 240 | ,(if (>= (length registers) 2) 241 | `(declare ((or null alexandria:array-index) ,@(subseq registers 2)) 242 | (alexandria:array-index ,@(subseq registers 0 2))) 243 | `(declare (alexandria:array-index ,@registers))) 244 | ,@body)))))) 245 | (fallback () 246 | (alexandria:once-only (vector) 247 | `(with-code ((,function ,size) 248 | (find-code ,regular-expression (string-type-of ,vector))) 249 | ,(consume function size nil))))) 250 | (with-code-for-vector (function size vector regular-expression (fallback)) 251 | (consume function size 252 | (match-vector-size (re-groups regular-expression)))))))) 253 | -------------------------------------------------------------------------------- /Code/Compiler/code-generation.lisp: -------------------------------------------------------------------------------- 1 | (in-package :one-more-re-nightmare) 2 | 3 | (defclass compiler-state () 4 | ((variable-names :initform (make-hash-table :test 'equal) 5 | :reader variable-names) 6 | (state-names :initform (make-hash-table :test 'equal) 7 | :reader state-names) 8 | (next-state-name :initform 0 9 | :accessor next-state-name) 10 | (variable-map :initarg :variable-map 11 | :reader variable-map))) 12 | 13 | (defun find-variable-name (variable) 14 | (when (member variable '(nil position)) 15 | (return-from find-variable-name variable)) 16 | (let ((names (variable-names *compiler-state*))) 17 | (multiple-value-bind (name present?) 18 | (gethash variable names) 19 | (if present? 20 | name 21 | (setf (gethash variable names) 22 | (make-symbol (format nil "~{~a.~a~}" variable))))))) 23 | 24 | (defun find-state-name (state &optional (entry-point :bounds-check)) 25 | (let ((names (state-names *compiler-state*))) 26 | (multiple-value-bind (name present?) 27 | (gethash (cons state entry-point) names) 28 | (if present? 29 | name 30 | (setf (gethash (cons state entry-point) names) 31 | (incf (next-state-name *compiler-state*))))))) 32 | 33 | (defun match-vector-size (groups) 34 | (* 2 (1+ groups))) 35 | 36 | (defvar *nowhere* (make-broadcast-stream)) 37 | (defvar *code-type* :compiled) 38 | (defun %compile-regular-expression (expression 39 | &key (layout *default-layout*) 40 | (strategy #'make-default-strategy)) 41 | (let ((*tag-gensym-counter* 0)) 42 | (with-hash-consing-tables () 43 | (multiple-value-bind (expression groups) 44 | (parse-regular-expression expression) 45 | (let ((*layout* layout) 46 | (*error-output* *nowhere*) 47 | (strategy (funcall strategy layout expression))) 48 | (values 49 | (with-naughty-compiler-switches () 50 | (let ((form (make-lambda-form 51 | expression 52 | strategy 53 | groups))) 54 | (ecase *code-type* 55 | #+sbcl 56 | (:interpreted 57 | (let ((sb-ext:*evaluator-mode* :interpret)) 58 | (eval form))) 59 | (:compiled 60 | (compile nil form)) 61 | (:literal 62 | form)))) 63 | (match-vector-size groups))))))) 64 | 65 | (defun variable-map-from-groups (groups) 66 | (coerce `(start end ,@(alexandria:iota (* groups 2) :start 1)) 67 | 'vector)) 68 | 69 | (defun make-lambda-form (expression strategy groups) 70 | (let* ((*compiler-state* 71 | (make-instance 'compiler-state 72 | :variable-map (variable-map-from-groups groups))) 73 | (macros (macros-for-strategy strategy))) 74 | (multiple-value-bind (variables declarations body) 75 | (make-prog-parts strategy expression) 76 | `(lambda ,(lambda-list strategy) 77 | (declare (optimize ,@*optimize-settings*) 78 | ,@(declarations strategy) 79 | #+sbcl (sb-ext:muffle-conditions sb-ext:compiler-note style-warning)) 80 | (macrolet ,macros 81 | (prog* ,variables 82 | (declare ,@declarations) 83 | ,@body)))))) 84 | 85 | (defgeneric make-prog-parts (strategy expression) 86 | (:method (strategy expression) 87 | (let* ((initial-expressions (initial-states strategy expression)) 88 | (states (make-dfa-from-expressions initial-expressions))) 89 | (compute-predecessor-lists states) 90 | (compute-minimum-lengths states) 91 | (let* ((body (make-body-from-dfa strategy states)) 92 | (initial-states (loop for expression in initial-expressions 93 | collect (gethash expression states))) 94 | (start-code (start-code strategy initial-states)) 95 | (variables (alexandria:hash-table-values 96 | (variable-names *compiler-state*)))) 97 | (values 98 | `((start start) 99 | (position start) 100 | ,@(loop for variable in variables collect `(,variable 0))) 101 | `((alexandria:array-index start position ,@variables)) 102 | (append start-code body)))))) 103 | 104 | (defun make-body-from-dfa (strategy states) 105 | (loop for state being the hash-values of states 106 | for expression = (state-expression state) 107 | for nullable = (nullable expression) 108 | ;; We "inline" these states into transitions rather than 109 | ;; emitting code for separate states, because they are simple 110 | ;; enough. 111 | unless (or (re-stopped-p expression) 112 | (re-empty-p expression) 113 | (state-never-succeeds-p state)) 114 | append 115 | `(,(find-state-name state :bounds-check) 116 | #+print-traces 117 | (print (list :bounds-checking 118 | position 119 | ,(prin1-to-string expression) 120 | ,(minimum-length state))) 121 | (when (> (the alexandria:array-index 122 | (+ position ,(max (minimum-length state) 1))) 123 | end) 124 | (when (> position end) 125 | (return)) 126 | #+print-traces 127 | (print (list :eof ,(prin1-to-string nullable))) 128 | ,(if (eq (empty-set) nullable) 129 | `(return) 130 | ;; We hit EOF and this state is nullable, so 131 | ;; succeed with what we got so far. 132 | `(progn 133 | ;; See below for commentary on why we have to 134 | ;; nudge register values around. 135 | (let ((position (1+ position))) 136 | ,(setf-from-assignments 137 | (state-exit-effects state))) 138 | #+print-traces 139 | (print (list :exit-map ',(state-exit-map state) 140 | (1- ,(find-in-map 'end (state-exit-map state))) 141 | (1+ start))) 142 | (setf start (max (1- ,(find-in-map 'end (state-exit-map state))) 143 | (1+ start))) 144 | (win ,@(win-locations (state-exit-map state)))))) 145 | ,(find-state-name state :no-bounds-check) 146 | #+print-traces 147 | (print (list :bounds-ok 148 | position 149 | ,(prin1-to-string expression) 150 | ,(minimum-length state))) 151 | (let ((value (,(layout-to-number *layout*) 152 | (,(layout-ref *layout*) vector position)))) 153 | ;; We assign early so that the ADD instruction doesn't 154 | ;; force our Lisp compiler to create a new basic block, 155 | ;; and can just JMP directly to the next state when no tag 156 | ;; assignments are done. This means that every register is 157 | ;; going to have a value that is 1 too high, but we can 158 | ;; just subtract in the (infrequent) places we read 159 | ;; registers. 160 | (incf position) 161 | ,(let ((labels (loop for nil in (state-transitions state) 162 | for n from 0 163 | collect (alexandria:format-symbol nil "TRANSITION-~d" n)))) 164 | `(tagbody 165 | (csum-case value 166 | ,(layout-less *layout*) 167 | ,(layout-equal *layout*) 168 | ,@(loop for transition in (state-transitions state) 169 | for label in labels 170 | collect `(,(transition-class transition) 171 | (go ,label)))) 172 | ,@(loop for transition in (state-transitions state) 173 | for label in labels 174 | collect label 175 | collect (transition-code strategy state transition)))))))) 176 | 177 | (defmethod transition-code (strategy previous-state transition) 178 | (declare (ignore strategy)) 179 | (let* ((next-state (transition-next-state transition)) 180 | (next-expression (state-expression next-state))) 181 | (cond 182 | ((or (state-never-succeeds-p next-state) 183 | (re-stopped-p next-expression)) 184 | (if (eq (nullable (state-expression previous-state)) (empty-set)) 185 | `(progn 186 | (setf start (1+ start)) 187 | (restart)) 188 | ;; Similarly to hitting EOF, if this state is nullable then 189 | ;; we can succeed with what we got. 190 | `(progn 191 | ,(setf-from-assignments 192 | (transition-tags-to-set transition)) 193 | (setf start (max (1+ start) 194 | (1- ,(find-in-map 'end (state-exit-map next-state))))) 195 | #+print-traces 196 | (print `(:next ,start ,,(prin1-to-string next-expression))) 197 | (win ,@(win-locations (state-exit-map next-state)))))) 198 | ((re-empty-p next-expression) 199 | `(progn 200 | ,(setf-from-assignments 201 | (transition-tags-to-set transition)) 202 | ;; These assignments are evaluated as if we were at the 203 | ;; empty state -- we inline the empty state because it is 204 | ;; only a set of assignments and a call to WIN. 205 | (let ((position (1+ position))) 206 | ,(setf-from-assignments 207 | (tags next-expression))) 208 | (setf start position) 209 | (win ,@(win-locations (state-exit-map next-state))))) 210 | (t 211 | (let ((entry-point 212 | (if (and (< (minimum-length next-state) 213 | (minimum-length previous-state)) 214 | (plusp (minimum-length next-state))) 215 | :no-bounds-check 216 | :bounds-check))) 217 | `(progn 218 | ,(setf-from-assignments 219 | (transition-tags-to-set transition)) 220 | (go ,(find-state-name next-state entry-point)))))))) 221 | 222 | (defun win-locations (exit-map) 223 | (loop for variable-name across (variable-map *compiler-state*) 224 | for (variable . source) = (find variable-name exit-map :key #'caar) 225 | if (and (not (null variable)) 226 | (not (eql source 'nil))) 227 | collect `(,variable-name ,(find-variable-name source)) 228 | else 229 | collect `(,variable-name 'nil))) 230 | 231 | (defun setf-from-assignments (assignments) 232 | `(setf 233 | ,@(loop for (target . source) in assignments 234 | ;; NIL sources are basically just a compile time thing. 235 | unless (or (equal target source) 236 | (eql source 'nil)) 237 | collect (find-variable-name target) 238 | and collect (find-variable-name source)))) 239 | 240 | (defun find-in-map (variable-name map) 241 | (let ((variable (find variable-name map :key #'caar))) 242 | (if (null variable) 243 | (error "~s not in the map ~s" variable-name map) 244 | (find-variable-name (cdr variable))))) 245 | 246 | (defmethod start-code ((strategy scan-everything) states) 247 | (destructuring-bind (state) states 248 | (let ((expression (state-expression state))) 249 | (cond 250 | ((state-never-succeeds-p state) 251 | ;; Just return immediately if we're told to match nothing. 252 | `(start (return))) 253 | ((re-empty-p expression) 254 | ;; Succeed for every character? 255 | `(start 256 | (cond 257 | ((> position end) 258 | (return)) 259 | (t 260 | (incf position) 261 | ,(setf-from-assignments (state-exit-effects state)) 262 | (win ,@(win-locations (state-exit-map state))))))) 263 | (t 264 | `(start 265 | (setf position start) 266 | (go ,(find-state-name state :bounds-check)))))))) 267 | 268 | (defmethod start-code :around ((strategy call-continuation) states) 269 | ;; Calling the continuation gets its own state as to influence the 270 | ;; register allocator less. This change does actually reduce the 271 | ;; number of spills substantially (at least on SBCL). 272 | (append (call-next-method) 273 | `(win 274 | #+print-traces 275 | (print ':win) 276 | (funcall continuation) 277 | (restart)))) 278 | --------------------------------------------------------------------------------