├── doc └── intro.md ├── .gitignore ├── src-java └── com │ └── champbacon │ └── pex │ ├── CharMatcher.java │ ├── ParseAction.java │ ├── PEGMatcher.java │ ├── ValueStackManip.java │ ├── ParsingExpressionGrammar.java │ └── impl │ ├── OpCodes.java │ ├── StackEntry.java │ ├── Matchers.java │ ├── Actions.java │ └── PEGByteCodeVM.java ├── test └── com │ └── champbacon │ └── pex_test.clj ├── project.clj ├── TODO ├── src └── com │ └── champbacon │ ├── pex │ ├── examples │ │ ├── csv.clj │ │ └── json.clj │ └── impl │ │ ├── tree.clj │ │ └── codegen.clj │ └── pex.clj └── README.md /doc/intro.md: -------------------------------------------------------------------------------- 1 | # Introduction to pex 2 | 3 | TODO: write [great documentation](http://jacobian.org/writing/what-to-write/) 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | /classes 3 | /checkouts 4 | pom.xml 5 | pom.xml.asc 6 | *.jar 7 | *.class 8 | /.lein-* 9 | /.nrepl-port 10 | .hgignore 11 | .hg/ 12 | -------------------------------------------------------------------------------- /src-java/com/champbacon/pex/CharMatcher.java: -------------------------------------------------------------------------------- 1 | 2 | package com.champbacon.pex; 3 | 4 | public interface CharMatcher { 5 | public boolean match(int ch); 6 | } 7 | -------------------------------------------------------------------------------- /test/com/champbacon/pex_test.clj: -------------------------------------------------------------------------------- 1 | (ns com.champbacon.pex-test 2 | (:require [clojure.test :refer :all] 3 | [com.champbacon.pex :refer :all])) 4 | 5 | (deftest a-test 6 | (testing "FIXME, I fail." 7 | (is (= 0 1)))) 8 | -------------------------------------------------------------------------------- /src-java/com/champbacon/pex/ParseAction.java: -------------------------------------------------------------------------------- 1 | package com.champbacon.pex; 2 | 3 | public interface ParseAction { 4 | 5 | public void execute(ValueStackManip vm); 6 | // subjectPosition, context, captureList, capturePosition 7 | } 8 | -------------------------------------------------------------------------------- /src-java/com/champbacon/pex/PEGMatcher.java: -------------------------------------------------------------------------------- 1 | 2 | package com.champbacon.pex; 3 | 4 | public interface PEGMatcher { 5 | 6 | public int match(); 7 | public int match(int pos); 8 | 9 | public void reset(); 10 | 11 | public Object[] getCaptures(); 12 | } 13 | -------------------------------------------------------------------------------- /project.clj: -------------------------------------------------------------------------------- 1 | (defproject com.champbacon/pex "0.0.1-SNAPSHOT" 2 | :description "a data-driven parsing library" 3 | :url "http://github.com/ghadishayban/pex" 4 | :license {:name "Eclipse Public License" 5 | :url "http://www.eclipse.org/legal/epl-v10.html"} 6 | :dependencies [[org.clojure/clojure "1.7.0"]] 7 | :java-source-paths ["src-java"]) 8 | -------------------------------------------------------------------------------- /TODO: -------------------------------------------------------------------------------- 1 | * TRACE opcode (Logs rule applications with debug info. Rule name and Position). 2 | * cuts 3 | * string capture uses a slow constructor 4 | * input shouldn't be a char[] 5 | * make json/cast-number less dumb 6 | * maybe a case-match statement for tail recursion 7 | * better documentation 8 | * codegen/emit should be a multimethod 9 | * add CSV example 10 | * reset the VM 11 | * LPEG tree optimizations 12 | -------------------------------------------------------------------------------- /src-java/com/champbacon/pex/ValueStackManip.java: -------------------------------------------------------------------------------- 1 | package com.champbacon.pex; 2 | 3 | public interface ValueStackManip { 4 | public Object getUserParseContext(); 5 | public void setUserParseContext(Object ctx); 6 | 7 | public int getInputPosition(); 8 | public char[] getInput(); 9 | public char getLastMatch(); 10 | 11 | public int getCaptureStart(); 12 | public int getCaptureEnd(); 13 | public void setCaptureEnd(int i); 14 | public Object[] getCurrentCaptures(); 15 | public void push(Object v); 16 | } 17 | -------------------------------------------------------------------------------- /src-java/com/champbacon/pex/ParsingExpressionGrammar.java: -------------------------------------------------------------------------------- 1 | package com.champbacon.pex; 2 | 3 | import com.champbacon.pex.CharMatcher; 4 | import com.champbacon.pex.PEGMatcher; 5 | import com.champbacon.pex.ParseAction; 6 | import com.champbacon.pex.impl.PEGByteCodeVM; 7 | 8 | /** 9 | * Created by ghadi on 11/13/15. 10 | */ 11 | public class ParsingExpressionGrammar { 12 | 13 | public final int[] instructions; 14 | public final CharMatcher[] charMatchers; 15 | public final ParseAction[] actions; 16 | 17 | public ParsingExpressionGrammar(int[] instructions, CharMatcher[] charMatchers, ParseAction[] actions) { 18 | this.instructions = instructions; 19 | this.charMatchers = charMatchers; 20 | this.actions = actions; 21 | } 22 | 23 | public PEGMatcher matcher(char[] input, Object context) { 24 | return new PEGByteCodeVM(this, input, context); 25 | } 26 | 27 | } 28 | -------------------------------------------------------------------------------- /src-java/com/champbacon/pex/impl/OpCodes.java: -------------------------------------------------------------------------------- 1 | package com.champbacon.pex.impl; 2 | 3 | public interface OpCodes { 4 | 5 | final int CALL = 0; 6 | final int RET = 1; 7 | final int CHOICE = 2; 8 | final int COMMIT = 3; 9 | final int PARTIAL_COMMIT = 4; 10 | final int BACK_COMMIT = 5; 11 | final int JUMP = 6; 12 | final int FAIL_TWICE = 7; 13 | final int FAIL = 8; 14 | final int END = 9; 15 | 16 | final int MATCH_CHAR = 10; 17 | final int TEST_CHAR = 11; 18 | final int CHARSET = 12; 19 | final int TEST_CHARSET = 13; 20 | final int ANY = 14; 21 | final int TEST_ANY = 15; 22 | final int SPAN = 16; 23 | 24 | final int BEGIN_CAPTURE = 17; 25 | final int END_CAPTURE = 18; 26 | final int FULL_CAPTURE = 19; 27 | final int BEHIND = 20; 28 | 29 | final int END_OF_INPUT = 21; 30 | 31 | final int ACTION = 22; 32 | // final int PUSH_VALUE = 23; 33 | // final int UPDATE_STACK_TOP = 24; 34 | // SET_VAL 35 | } -------------------------------------------------------------------------------- /src/com/champbacon/pex/examples/csv.clj: -------------------------------------------------------------------------------- 1 | (ns com.champbacon.pex.examples.csv 2 | (:require [com.champbacon.pex :as pex])) 3 | 4 | ;; THIS IS INCOMPLETE AND INCORRECT 5 | 6 | (def CSV '{file [OWS record (* NL record) EOI] 7 | 8 | record [field (* field-delimeter field)] 9 | 10 | field-delimeter "," 11 | field (/ quoted unquoted) 12 | 13 | unquoted (capture (* (class nonquotechars))) 14 | ;; (capture (not \") (* ANY)) 15 | quoted [OWS \" 16 | (capture (* (/ (class quotechars) "\"\""))) 17 | ;; (apply unescape-quotes (* (/ (class quotechars) 18 | ;; "\"\""))) 19 | ;; 20 | 21 | \" OWS 22 | (action unescape-quotes)] 23 | 24 | NL [(? \r) \n] 25 | OWS (* (class :ws))}) 26 | 27 | (def csv-field '{record [field (* sep field) EOI] 28 | field (/ quoted unquoted) 29 | quoted ["\"" [(not "\"") ANY] ] 30 | sep ","}) 31 | 32 | (def csv-macros {:ws (fn [patt] [patt 'whitespace]) 33 | :join (fn [patt sep] [patt (list '* sep patt)])}) -------------------------------------------------------------------------------- /src-java/com/champbacon/pex/impl/StackEntry.java: -------------------------------------------------------------------------------- 1 | package com.champbacon.pex.impl; 2 | 3 | final class StackEntry { 4 | 5 | static final int NO_OPEN_CAPTURE = -1; 6 | static final int IS_CALL = -1; 7 | private int returnAddress; 8 | private int subjectPosition = IS_CALL; 9 | private int captureHeight; 10 | private int currentCaptureBegin = NO_OPEN_CAPTURE; 11 | 12 | public void reset() { 13 | returnAddress = 0; 14 | subjectPosition = IS_CALL; 15 | captureHeight = 0; 16 | currentCaptureBegin = NO_OPEN_CAPTURE; 17 | } 18 | 19 | public int getCaptureHeight() { 20 | return captureHeight; 21 | } 22 | 23 | public void setCaptureHeight(int captureHeight) { 24 | this.captureHeight = captureHeight; 25 | } 26 | 27 | public int getSubjectPosition() { 28 | return subjectPosition; 29 | } 30 | 31 | public void setSubjectPosition(int subjectPosition) { 32 | this.subjectPosition = subjectPosition; 33 | } 34 | 35 | public int getReturnAddress() { 36 | return returnAddress; 37 | } 38 | 39 | public void setReturnAddress(int returnAddress) { 40 | this.returnAddress = returnAddress; 41 | } 42 | 43 | public boolean isCall() { 44 | return subjectPosition == -1; 45 | }; 46 | 47 | public void setCurrentCaptureBegin(int subjectPosition) { 48 | if (currentCaptureBegin == NO_OPEN_CAPTURE) { 49 | currentCaptureBegin = subjectPosition; 50 | } else throw new IllegalStateException("Nested capture within a single rule."); 51 | } 52 | 53 | public void clearOpenCapture() { 54 | currentCaptureBegin = NO_OPEN_CAPTURE; 55 | } 56 | 57 | public int getCurrentCaptureBegin() { 58 | if (currentCaptureBegin != NO_OPEN_CAPTURE) 59 | return currentCaptureBegin; 60 | else throw new IllegalStateException("No open capture."); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src-java/com/champbacon/pex/impl/Matchers.java: -------------------------------------------------------------------------------- 1 | package com.champbacon.pex.impl; 2 | 3 | import com.champbacon.pex.CharMatcher; 4 | 5 | /** 6 | * Created by ghadi on 11/13/15. 7 | * 8 | * TODO Add a mask-based implementation & some combinators 9 | * https://github.com/sirthias/parboiled2/blob/master/parboiled-core/src/main/scala/org/parboiled2/CharPredicate.scala#L145 10 | */ 11 | 12 | public class Matchers { 13 | public static class SingleRangeMatcher implements CharMatcher { 14 | 15 | final int low; 16 | final int high; 17 | 18 | public SingleRangeMatcher(int low, int high) { 19 | if (high <= low) 20 | throw new IllegalArgumentException("low must be <= high"); 21 | 22 | this.low = low; 23 | this.high = high; 24 | } 25 | 26 | public boolean match(int ch) { 27 | if (ch < low) { 28 | return false; 29 | } 30 | 31 | if (ch < high) { 32 | return true; 33 | } 34 | return false; 35 | } 36 | } 37 | 38 | 39 | public static class RangeMatcher implements CharMatcher { 40 | 41 | final int[] chars; 42 | 43 | public RangeMatcher(int[] chars) { 44 | this.chars = chars; 45 | } 46 | 47 | public boolean match(int ch) { 48 | 49 | // Peek at the first few pairs. 50 | // Should handle ASCII well. 51 | for (int j = 0; j < chars.length && j <= 8; j += 2) { 52 | if (ch < chars[j]) { 53 | return false; 54 | } 55 | if (ch < chars[j + 1]) { 56 | return true; 57 | } 58 | } 59 | 60 | // Otherwise binary search. 61 | for (int lo = 0, hi = chars.length / 2; lo < hi; ) { 62 | int m = lo + (hi - lo) / 2; 63 | int c = chars[2 * m]; 64 | if (c <= ch) { 65 | if (ch < chars[2 * m + 1]) { 66 | return true; 67 | } 68 | lo = m + 1; 69 | } else { 70 | hi = m; 71 | } 72 | } 73 | return false; 74 | 75 | } 76 | } 77 | 78 | 79 | } 80 | -------------------------------------------------------------------------------- /src/com/champbacon/pex.clj: -------------------------------------------------------------------------------- 1 | (ns com.champbacon.pex 2 | (:refer-clojure :exclude [compile]) 3 | (:require [com.champbacon.pex.impl.tree :as tree] 4 | [com.champbacon.pex.impl.codegen :as codegen]) 5 | (:import (com.champbacon.pex.impl Matchers$SingleRangeMatcher 6 | Matchers$RangeMatcher 7 | Actions 8 | Actions$PushAction 9 | Actions$UpdateStackTop 10 | Actions$FoldCaptures 11 | Actions$ReplaceCaptures) 12 | (com.champbacon.pex ParsingExpressionGrammar))) 13 | 14 | (defn single-range-matcher 15 | [low high] 16 | (Matchers$SingleRangeMatcher. (int low) (int high))) 17 | 18 | (defn range-matcher 19 | [ranges] 20 | (let [nums (int-array (sequence (comp cat (map int)) (sort-by first ranges)))] 21 | (Matchers$RangeMatcher. nums))) 22 | 23 | (defn update-stack-top 24 | [f] 25 | (Actions$UpdateStackTop. f)) 26 | 27 | (defn push 28 | [val] 29 | (Actions$PushAction. val)) 30 | 31 | (defn replace-captures 32 | "f will be passed array, low-idx, high-idx. 33 | The extent of captures will be replaced with the result of f 34 | high-idx is exclusive" 35 | [f] 36 | (Actions$ReplaceCaptures. f)) 37 | 38 | (def clear-sb Actions/CLEAR_STRING_BUFFER) 39 | (def append-sb Actions/APPEND_STRING_BUFFER) 40 | (def push-sb Actions/PUSH_STRING_BUFFER) 41 | 42 | (defn fold-cap 43 | [rf] 44 | (Actions$FoldCaptures. rf)) 45 | 46 | (defn compile 47 | ([grammar entrypoint matchers] 48 | (compile grammar entrypoint matchers {} {})) 49 | ([grammar entrypoint matchers actions] 50 | (compile grammar entrypoint matchers actions {})) 51 | ([grammar entrypoint matchers actions macros] 52 | (when-not (contains? grammar entrypoint) 53 | (throw (ex-info "Unknown entrypoint" {:grammar grammar 54 | :entrypoint entrypoint}))) 55 | (let [ast (tree/parse-grammar grammar macros)] 56 | (codegen/compile-grammar ast entrypoint matchers actions)))) 57 | 58 | (defn matcher 59 | ([peg input] 60 | (matcher peg input nil)) 61 | ([^ParsingExpressionGrammar peg input user-parse-context] 62 | (.matcher peg input user-parse-context))) 63 | 64 | (defn print-instructions 65 | [insts] 66 | (println "ADDR INST") 67 | (doseq [[idx inst] (map vector (range) insts)] 68 | (printf "%4d %s%n" idx inst))) 69 | -------------------------------------------------------------------------------- /src-java/com/champbacon/pex/impl/Actions.java: -------------------------------------------------------------------------------- 1 | package com.champbacon.pex.impl; 2 | 3 | import clojure.lang.IFn; 4 | import com.champbacon.pex.ParseAction; 5 | import com.champbacon.pex.ValueStackManip; 6 | 7 | /** 8 | * Created by ghadi on 11/13/15. 9 | */ 10 | public class Actions { 11 | public static class PushAction implements ParseAction { 12 | final Object val; 13 | 14 | public PushAction(Object val) { 15 | this.val = val; 16 | } 17 | 18 | public void execute(ValueStackManip vm) { 19 | vm.push(val); 20 | } 21 | } 22 | 23 | public static class UpdateStackTop implements ParseAction { 24 | IFn f; 25 | 26 | public UpdateStackTop(IFn f) { 27 | this.f = f; 28 | } 29 | 30 | public void execute(ValueStackManip vm) { 31 | Object[] captures = vm.getCurrentCaptures(); 32 | 33 | int cur = vm.getCaptureEnd() - 1; 34 | captures[cur] = f.invoke(captures[cur]); 35 | } 36 | } 37 | 38 | public static class FoldCaptures implements ParseAction { 39 | private final IFn f; 40 | 41 | public FoldCaptures(IFn f) { 42 | this.f = f; 43 | } 44 | 45 | public void execute(ValueStackManip vm) { 46 | int low = vm.getCaptureStart(); 47 | int high = vm.getCaptureEnd(); 48 | Object[] caps = vm.getCurrentCaptures(); 49 | 50 | Object ret = f.invoke(); 51 | for(int i = low; i (.indexOf "bfnrt\\/\"" ch) 0))) 70 | :whitespace (reify CharMatcher 71 | (match [_ ch] 72 | (Character/isWhitespace ch)))} 73 | actions {:append-hexdigit (reify ParseAction 74 | (execute [_ vsm] 75 | (let [^StringBuffer sb (.getUserParseContext vsm) 76 | captures (.getCurrentCaptures vsm) 77 | top (.getCaptureEnd vsm) 78 | hex (aget captures top)] 79 | (.append sb (char (Integer/parseInt hex 16))) 80 | (.setCaptureEnd vsm (dec top))))) 81 | :capture-object (pex/replace-captures make-json-object) 82 | :capture-array (pex/fold-cap (fn 83 | ([] (transient [])) 84 | ([res] (persistent! res)) 85 | ([res input] (conj! res input)))) 86 | :append-escape (reify ParseAction 87 | (execute [_ vsm] 88 | (let [^StringBuffer sb (.getUserParseContext vsm) 89 | last-ch (.getLastMatch vsm)] 90 | (.append sb ^char (escapes (char last-ch)))))) 91 | :cast-number (pex/update-stack-top #(Double/valueOf ^String %)) 92 | :push-true (pex/push true) 93 | :push-false (pex/push false) 94 | :push-nil (pex/push nil) 95 | :clear-sb pex/clear-sb 96 | :append-sb pex/append-sb 97 | :push-sb pex/push-sb}] 98 | (pex/compile JSON 'json matchers actions json-macros))) 99 | 100 | (comment 101 | (let [json (json-parser) 102 | ;;input "\"42\"" 103 | input "{\"bar\": [\"this\", 42, {}, [1,2,3], \"foo\"]}" 104 | m (pex/matcher json (.toCharArray input) (StringBuffer.)) 105 | result (.match m 0)] 106 | (first (.getCaptures m)))) -------------------------------------------------------------------------------- /src/com/champbacon/pex/impl/codegen.clj: -------------------------------------------------------------------------------- 1 | (ns com.champbacon.pex.impl.codegen 2 | (:import com.champbacon.pex.impl.OpCodes 3 | (com.champbacon.pex ParsingExpressionGrammar CharMatcher ParseAction))) 4 | 5 | (declare emit) 6 | 7 | (defn label 8 | [env] 9 | ((:next-label env))) 10 | 11 | (defn emit-choice 12 | [env ast] 13 | (let [{:keys [children]} ast 14 | n (count children) 15 | last-pos? (fn [i] (= i (dec n))) 16 | 17 | labels (into [] (take n) (repeatedly #(label env))) 18 | blocks (mapv (partial emit env) children) 19 | 20 | end (label env) 21 | 22 | emit-alternative (fn [idx tree] 23 | (let [header (when-not (zero? idx) 24 | [[:label (labels idx)]]) 25 | choice (when-not (last-pos? idx) 26 | [[:choice (labels (inc idx))]]) 27 | commit (when-not (last-pos? idx) 28 | [[:commit end]])] 29 | (into [] cat 30 | [header 31 | choice 32 | (blocks idx) 33 | commit])))] 34 | (-> (into [] (comp (map-indexed emit-alternative) cat) children) 35 | (conj [:label end])))) 36 | 37 | (defn concat-trees 38 | [env ts] 39 | (into [] (mapcat (partial emit env)) ts)) 40 | 41 | (defn emit-cat 42 | [env ast] 43 | (concat-trees env (:children ast))) 44 | 45 | (defn emit-char 46 | [env ast] 47 | (let [{:keys [codepoint]} ast] 48 | [[:char codepoint]])) 49 | 50 | (defn emit-rep 51 | [env ast] 52 | (let [body (concat-trees env (:children ast)) 53 | head (label env) 54 | cont (label env)] 55 | (into [] cat 56 | [[[:choice cont] 57 | [:label head]] 58 | body 59 | [[:partial-commit head] 60 | [:label cont]]]))) 61 | 62 | (defn emit-call 63 | [env ast] 64 | (let [{:keys [target]} ast] 65 | (when-not (contains? (:non-terminals env) target) 66 | (throw (ex-info "Undefined non-terminal" {:target target}))) 67 | [[:call target]])) 68 | 69 | (defn emit-optional 70 | [env ast] 71 | (let [body (concat-trees env (:children ast)) 72 | cont (label env)] 73 | (-> (into [[:choice cont]] body) 74 | (conj [:commit cont] [:label cont])))) 75 | 76 | (defn emit-not-predicate 77 | [env ast] 78 | (let [body (concat-trees env (:children ast)) 79 | L1 (label env)] 80 | (-> (into [[:choice L1]] body) 81 | (conj [:fail-twice] 82 | [:label L1])))) 83 | 84 | (defn emit-and-predicate 85 | [env ast] 86 | (let [body (concat-trees env (:children ast)) 87 | L1 (label env) 88 | L2 (label env)] 89 | (-> (into [[:choice L1]] body) 90 | (into [[:back-commit L2] 91 | [:label L1] 92 | [:fail] 93 | [:label L2]])))) 94 | 95 | (defn emit-capture 96 | [env ast] 97 | ;; optimize 98 | (let [body (concat-trees env (:children ast))] 99 | (-> (into [[:begin-capture]] body) 100 | (conj [:end-capture])))) 101 | 102 | (defn emit-linked-instruction 103 | [k env ast] 104 | (let [linked-constant (-> ast :args first keyword) 105 | n (or (get-in env [:constants k linked-constant]) 106 | (throw (ex-info "Linked constant not found" ast)))] 107 | [[k n]])) 108 | 109 | (def dispatch {:choice emit-choice 110 | :char emit-char 111 | :cat emit-cat 112 | :rep emit-rep 113 | :open-call emit-call 114 | :optional emit-optional 115 | :true (constantly []) 116 | :fail (constantly [[:fail]]) 117 | :any (constantly [[:any]]) 118 | :end-of-input (constantly [[:end-of-input]]) 119 | :not emit-not-predicate 120 | :and emit-and-predicate 121 | :capture emit-capture 122 | :action (partial emit-linked-instruction :action) 123 | :charset (partial emit-linked-instruction :charset)}) 124 | 125 | (defn emit 126 | [env ast] 127 | (let [f (dispatch (:op ast))] 128 | (when-not f (throw (ex-info "bad ast" ast))) 129 | (f env ast))) 130 | 131 | (defn initial-jump-block 132 | [instrs entrypoint] 133 | (let [preamble [[:call entrypoint] 134 | [:end]]] 135 | (into preamble instrs))) 136 | 137 | (def branching? 138 | #{:commit 139 | :choice 140 | :jump 141 | :call 142 | :back-commit 143 | :partial-commit}) 144 | 145 | (def op->code 146 | (let [m {:call OpCodes/CALL 147 | :return OpCodes/RET 148 | :choice OpCodes/CHOICE 149 | :commit OpCodes/COMMIT 150 | :partial-commit OpCodes/PARTIAL_COMMIT 151 | :back-commit OpCodes/BACK_COMMIT 152 | :jump OpCodes/JUMP 153 | :fail-twice OpCodes/FAIL_TWICE 154 | :fail OpCodes/FAIL 155 | :end OpCodes/END 156 | 157 | :char OpCodes/MATCH_CHAR 158 | :test-char OpCodes/TEST_CHAR 159 | :charset OpCodes/CHARSET 160 | :test-charset OpCodes/TEST_CHARSET 161 | :any OpCodes/ANY 162 | :test-any OpCodes/TEST_ANY 163 | :span OpCodes/SPAN 164 | 165 | :begin-capture OpCodes/BEGIN_CAPTURE 166 | :end-capture OpCodes/END_CAPTURE 167 | :full-capture OpCodes/FULL_CAPTURE 168 | :behind OpCodes/BEHIND 169 | :end-of-input OpCodes/END_OF_INPUT 170 | :action OpCodes/ACTION}] 171 | (fn [kw] 172 | (or (get m kw) 173 | (throw (IllegalArgumentException. (str "No opcode defined " kw))))))) 174 | 175 | (defn link 176 | "Turns all symbolic jumps into relative address jumps" 177 | [instructions] 178 | (let [[insts labels] (reduce (fn [[insts labels] [op arg :as inst]] 179 | (if (= :label op) 180 | [insts (assoc labels arg (count insts))] 181 | [(into insts inst) labels])) 182 | [[] {}] instructions) 183 | 184 | patch-jumps (fn [stream] 185 | (let [n (count stream)] 186 | (loop [i 0 stream stream] 187 | (if (< i n) 188 | (let [op (get stream i)] 189 | (if (and (keyword? op) (branching? op)) 190 | (let [target (inc i)] 191 | (recur (inc target) (update stream target labels))) 192 | (recur (inc i) stream))) 193 | stream))))] 194 | (patch-jumps insts))) 195 | 196 | (defn add-entrypoint 197 | [env code entrypoint] 198 | (let [end (label env)] 199 | (concat [[:call entrypoint] 200 | [:jump end]] 201 | code 202 | [[:label end] 203 | [:end]]))) 204 | 205 | (defn empty-env 206 | [grammar matchers actions] 207 | (let [current-id (atom 0)] 208 | {:non-terminals (set (keys grammar)) 209 | :next-label #(swap! current-id inc) 210 | :matchers (vec (vals matchers)) 211 | :actions (vec (vals actions)) 212 | :constants {:charset (into {} (map vector (keys matchers) (range))) 213 | :action (into {} (map vector (keys actions) (range)))}})) 214 | 215 | (defn transform-instructions 216 | [insts] 217 | (let [->bytecode (fn [i] 218 | (if (keyword? i) 219 | (op->code i) 220 | i))] 221 | (into [] (map ->bytecode) insts))) 222 | 223 | (defn compile-grammar 224 | [grammar entrypoint matchers actions] 225 | (let [env (empty-env grammar matchers actions) 226 | emit-rule (fn [[sym ast]] 227 | (-> (into [[:label sym :call]] 228 | (emit env ast)) 229 | (conj [:return]))) 230 | instructions (into [] (mapcat emit-rule) grammar)] 231 | (ParsingExpressionGrammar. 232 | (-> (add-entrypoint env instructions entrypoint) 233 | (link) 234 | (transform-instructions) 235 | (int-array)) 236 | (into-array CharMatcher (:matchers env)) 237 | (into-array ParseAction (:actions env))))) 238 | -------------------------------------------------------------------------------- /src-java/com/champbacon/pex/impl/PEGByteCodeVM.java: -------------------------------------------------------------------------------- 1 | package com.champbacon.pex.impl; 2 | 3 | import com.champbacon.pex.*; 4 | 5 | public final class PEGByteCodeVM implements PEGMatcher, ValueStackManip { 6 | 7 | private static boolean DEBUG = false; 8 | 9 | public static final int INITIAL_STACK = 16; 10 | public static final int INITIAL_CAPTURES = 4; 11 | 12 | private StackEntry[] stack = new StackEntry[INITIAL_STACK]; 13 | private int stk = 0; 14 | 15 | private Object[] captureStack = new Object[INITIAL_CAPTURES]; 16 | private int captureTop = 0; 17 | 18 | private final int[] instructions; 19 | 20 | private final ParseAction[] actions; 21 | private final CharMatcher[] charMatchers; 22 | 23 | private int pc = 0; 24 | 25 | private int getMatchEnd() { 26 | if (matchFailed) { 27 | return -1; 28 | } 29 | return subjectPointer; 30 | } 31 | 32 | private int subjectPointer; 33 | 34 | private final char[] input; 35 | 36 | public Object getUserParseContext() { 37 | return userParseContext; 38 | } 39 | 40 | public void setUserParseContext(Object userParseContext) { 41 | this.userParseContext = userParseContext; 42 | } 43 | 44 | private Object userParseContext; 45 | 46 | private boolean matchFailed = false; 47 | 48 | public PEGByteCodeVM(ParsingExpressionGrammar peg, 49 | char[] input, 50 | Object userParseContext) { 51 | this.instructions = peg.instructions; 52 | this.charMatchers = peg.charMatchers; 53 | this.actions = peg.actions; 54 | this.input = input; 55 | this.userParseContext = userParseContext; 56 | } 57 | 58 | private final StackEntry ensure1() { 59 | if (stk >= stack.length) doubleStack(); 60 | StackEntry e = stack[stk]; 61 | if (e == null) { 62 | stack[stk] = e = new StackEntry(); 63 | } else { 64 | e.reset(); 65 | } 66 | return e; 67 | } 68 | 69 | private final void doubleStack() { 70 | StackEntry[] newStack = new StackEntry[stack.length << 1]; 71 | System.arraycopy(stack, 0, newStack, 0, stack.length); 72 | stack = newStack; 73 | } 74 | 75 | private final void doubleCaptures() { 76 | Object[] newCaptures = new Object[captureStack.length << 1]; 77 | System.arraycopy(captureStack, 0, newCaptures, 0, captureStack.length); 78 | captureStack = newCaptures; 79 | } 80 | 81 | private void opCall() { 82 | StackEntry e = ensure1(); 83 | 84 | // do not set subjectPosition 85 | e.setCaptureHeight(captureTop); 86 | e.setReturnAddress(pc + 1); 87 | 88 | stk++; 89 | pc = instructions[pc]; 90 | } 91 | 92 | private void opRet() { 93 | stk--; 94 | StackEntry s = stack[stk]; 95 | // captureTop = s.getCaptureHeight(); 96 | pc = s.getReturnAddress(); 97 | } 98 | 99 | private void opChoice() { 100 | StackEntry s = ensure1(); 101 | s.setReturnAddress(instructions[pc]); 102 | s.setCaptureHeight(captureTop); 103 | s.setSubjectPosition(subjectPointer); 104 | 105 | stk++; 106 | pc++; 107 | } 108 | 109 | private void opCommit() { 110 | stk--; 111 | pc = instructions[pc]; 112 | } 113 | 114 | private void opPartialCommit() { 115 | StackEntry s = stack[stk - 1]; 116 | s.setSubjectPosition(subjectPointer); 117 | s.setCaptureHeight(captureTop); 118 | pc = instructions[pc]; 119 | } 120 | 121 | // VALIDATE SEMANTICS 122 | private void opBackCommit() { 123 | stk--; 124 | StackEntry s = stack[stk]; 125 | subjectPointer = s.getSubjectPosition(); 126 | captureTop = s.getCaptureHeight(); 127 | } 128 | 129 | private void opJump() { 130 | pc = instructions[pc]; 131 | } 132 | 133 | private void opFailTwice() { 134 | stk--; 135 | opFail(); 136 | } 137 | 138 | private void opFail() { 139 | // pop off any plain CALL frames 140 | StackEntry s; 141 | do { 142 | stk--; 143 | s = stack[stk]; 144 | } while (s.isCall() && stk > 0); 145 | 146 | if (stk == 0) { 147 | 148 | matchFailed = true; 149 | pc = instructions.length - 1; // jump to the final instruction, always END 150 | return; 151 | } 152 | 153 | subjectPointer = s.getSubjectPosition(); 154 | captureTop = s.getCaptureHeight(); 155 | pc = s.getReturnAddress(); 156 | } 157 | 158 | private void opMatchChar() { 159 | int ch = instructions[pc]; 160 | if (subjectPointer < input.length && input[subjectPointer] == ch) { 161 | pc++; 162 | subjectPointer++; 163 | } else { 164 | opFail(); 165 | } 166 | } 167 | 168 | /* private void opTestChar() { 169 | int ch = instructions[pc]; 170 | if (subjectPointer < input.length && input[subjectPointer] == ch) { 171 | pc++; 172 | subjectPointer++; 173 | } else { 174 | opFail(); 175 | } 176 | } 177 | */ 178 | 179 | private void opAny() { 180 | if (subjectPointer < input.length) { 181 | subjectPointer++; 182 | } else { 183 | opFail(); 184 | } 185 | } 186 | 187 | private void opBeginCapture() { 188 | StackEntry s = stack[stk - 1]; 189 | s.setCurrentCaptureBegin(subjectPointer); 190 | } 191 | 192 | private void opEndCapture() { 193 | StackEntry s = stack[stk - 1]; 194 | 195 | int captureBegin = s.getCurrentCaptureBegin(); 196 | s.clearOpenCapture(); 197 | 198 | String cap = new String(input, captureBegin, subjectPointer - captureBegin); 199 | 200 | push(cap); 201 | } 202 | 203 | private void opAction() { 204 | ParseAction a = actions[instructions[pc]]; 205 | a.execute(this); 206 | pc++; 207 | } 208 | 209 | private void opCharset() { 210 | CharMatcher m = charMatchers[instructions[pc]]; 211 | if (subjectPointer < input.length && m.match(input[subjectPointer])) { 212 | pc++; 213 | subjectPointer++; 214 | } else { 215 | opFail(); 216 | } 217 | } 218 | 219 | private void opEndOfInput() { 220 | if (subjectPointer != input.length) 221 | opFail(); 222 | } 223 | 224 | private void debug() { 225 | if (subjectPointer >= input.length) return; 226 | System.out.printf( 227 | "{:pc %3d :op %2d :subj [\"%s\" %5d] :captop %2d :stk %2d}%n", 228 | pc, 229 | instructions[pc], 230 | input[subjectPointer], subjectPointer, 231 | captureTop, 232 | stk); 233 | } 234 | 235 | private void unimplemented() { 236 | throw new UnsupportedOperationException(); 237 | } 238 | 239 | public int match() { 240 | return match(0); 241 | } 242 | 243 | public int match(int pos) { 244 | subjectPointer = pos; 245 | 246 | vm: 247 | while (true) { 248 | final int op = instructions[pc++]; 249 | 250 | switch(op) { 251 | case OpCodes.CALL: opCall(); break; 252 | case OpCodes.RET: opRet(); break; 253 | 254 | case OpCodes.CHOICE: opChoice(); break; 255 | case OpCodes.COMMIT: opCommit(); break; 256 | 257 | case OpCodes.PARTIAL_COMMIT: opPartialCommit(); break; 258 | case OpCodes.BACK_COMMIT: opBackCommit(); break; 259 | 260 | case OpCodes.JUMP: opJump(); break; 261 | 262 | case OpCodes.FAIL_TWICE: opFailTwice(); break; 263 | case OpCodes.FAIL: opFail(); break; 264 | case OpCodes.END: break vm; 265 | 266 | case OpCodes.MATCH_CHAR: opMatchChar(); break; 267 | case OpCodes.CHARSET: opCharset(); break; 268 | case OpCodes.ANY: opAny(); break; 269 | case OpCodes.TEST_CHAR: unimplemented(); break; 270 | case OpCodes.TEST_CHARSET: unimplemented(); break; 271 | case OpCodes.TEST_ANY: unimplemented(); break; 272 | case OpCodes.SPAN: unimplemented(); break; 273 | 274 | case OpCodes.BEGIN_CAPTURE: opBeginCapture(); break; 275 | case OpCodes.END_CAPTURE: opEndCapture(); break; 276 | case OpCodes.FULL_CAPTURE: unimplemented(); break; 277 | case OpCodes.BEHIND: unimplemented(); break; 278 | case OpCodes.END_OF_INPUT: opEndOfInput(); break; 279 | 280 | case OpCodes.ACTION: opAction(); break; 281 | default: throw new IllegalStateException("unknown instruction: " + op + " at pc " + pc); 282 | } 283 | 284 | 285 | } 286 | 287 | return getMatchEnd(); 288 | 289 | } 290 | 291 | public void reset() { 292 | unimplemented(); 293 | } 294 | 295 | public int getCaptureStart() { 296 | StackEntry s = stack[stk - 1]; 297 | return s.getCaptureHeight(); 298 | } 299 | 300 | public int getCaptureEnd() { 301 | return captureTop; 302 | } 303 | 304 | public void setCaptureEnd(int i) { 305 | captureTop = i; 306 | } 307 | 308 | public Object[] getCurrentCaptures() { 309 | return captureStack; 310 | } 311 | 312 | public char[] getInput() { 313 | return input; 314 | } 315 | 316 | public int getInputPosition() { 317 | return subjectPointer; 318 | } 319 | 320 | public char getLastMatch() { 321 | return input[subjectPointer - 1]; 322 | } 323 | 324 | public void push(Object v) { 325 | if (captureTop >= captureStack.length) doubleCaptures(); 326 | captureStack[captureTop] = v; 327 | captureTop++; 328 | } 329 | 330 | public Object[] getCaptures() { 331 | if (matchFailed) { 332 | return null; 333 | } 334 | Object[] captures = new Object[captureTop]; 335 | System.arraycopy(captureStack, 0, captures, 0, captureTop); 336 | return captures; 337 | } 338 | 339 | } 340 | --------------------------------------------------------------------------------