├── .gitignore ├── LANGUAGES.md ├── LICENSE.md ├── Makefile ├── README.md ├── TOOLS.md ├── cfg2kv-0-re ├── README.md ├── cfg2kv.js └── package.json ├── cfg2kv-1-sm ├── README.md ├── cfg2kv.js └── package.json ├── cfg2kv-2-sm-ast ├── README.md ├── cfg2kv.js └── package.json ├── cfg2kv-3-ls-rdp-ast ├── README.md ├── cfg2kv.js └── package.json ├── cfg2kv-4-pc-ast ├── README.md ├── cfg2kv.js └── package.json ├── cfg2kv-5-peg-ast ├── README.md ├── cfg2kv.js ├── cfg2kv.pegjs └── package.json ├── cfg2kv.js ├── package.json ├── sample.cfg └── sample.kv /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | -------------------------------------------------------------------------------- /LANGUAGES.md: -------------------------------------------------------------------------------- 1 | 2 | Markup Languages 3 | ---------------- 4 | 5 | Markup languages are actually two languages combined: a "host" language 6 | (often just plain text) and the embedded markup language to mark 7 | up various parts of the host language. As the host language can be 8 | arbitrary, markup languages sometimes are hard to parse because they 9 | have to recognize tokens representing the arbitrary plain text. 10 | 11 | ### POD-like 12 | 13 | [Plain Old Document](http://perldoc.perl.org/perlpod.html) (POD) style 14 | markup is a very concise and still flexible enough markup language. 15 | It is often used in variants where it is mixed together with Markdown elements. 16 | 17 | ``` 18 | API Management 19 | -------------- 20 | 21 | Change the API symbol in the global variable namespace under which ComponentJS 22 | is exposed. By default ComponentJS is exposed under the symbol name 23 | C. It is a common convention to change the symbol to C (for 24 | "component system/service") to have a convenient short-hand. 25 | 26 | - ComponentJS.M([P: T]): T 27 | 28 | Change symbol of ComponentJS API to global variable P and return it. 29 | If P is not given, ComponentJS does not occupy any global namespace slot at all -- 30 | then it is required to store the return value and use ComponentJS directly through it. 31 | 32 | | ComponentJS.symbol("cs") /* standard */ 33 | | var cs = ComponentJS.symbol() /* alternative */ 34 | 35 | - ComponentJS.M = { 36 | F: T, F: T, F: T, F: T 37 | } 38 | 39 | Access the ComponentJS implementation version "FC<.>FC<.>F" 40 | and the corresponding release F (in format YYYYMMDD). 41 | 42 | | if (ComponentJS.version.date < 20120101) 43 | | throw new Error("need at least ComponentJS as of 20120101") 44 | ``` 45 | 46 | Pros | Cons 47 | ---------------------------|----------------------------------- 48 | very concise | somewhat "cryptic" in style 49 | easy to parse | 50 | supports custom tags | 51 | 52 | ### Markdown 53 | 54 | [Markdown](https://daringfireball.net/projects/markdown/) is a very 55 | readable markup language for writing documents, inspired by ASCII-based 56 | email texts of the 1990's. 57 | 58 | ```md 59 | Parsing Techniques 60 | ================== 61 | 62 | There are lots of [formal languages](LANGUAGES.md) for various kinds of 63 | practical purposes. But they all have one thing in common: for further 64 | processing them inside a program, they first have to be parsed from 65 | their character string representation. This is the material of a lecture 66 | about various techniques to perform this parsing step. 67 | 68 | Notice: the code is all written in [ECMAScript 69 | 6](http://en.wikipedia.org/wiki/ECMAScript), is on-the-fly transpiled 70 | to ECMAScript 5 and then executed under [Node.js](http://nodejs.org/), 71 | but it actually doesn't matter very much. Equivalent code can be written 72 | in Java or C#, too. The only major point is just that the required 73 | third-party libraries have to be also changed, of course. 74 | 75 | Parsing Input 76 | ------------- 77 | 78 | Let's image a formal language for describing key/value based 79 | configurations in a redundancy-free nested structure. 80 | A sample configuration can be: 81 | ``` 82 | 83 | Pros | Cons 84 | ---------------------------|----------------------------------- 85 | very concise | does not provide custom tags 86 | easy to parse | 87 | 88 | ### LaTeX 89 | 90 | [LaTeX](http://en.wikipedia.org/wiki/LaTeX) is one of the oldest 91 | markup languages. It is based on the underlying TeX macro language 92 | and used for writing documents. 93 | 94 | ```tex 95 | \documentclass{scrartcl} 96 | 97 | \title{Ein Testdokument} 98 | 99 | \begin{document} 100 | 101 | \section{Einleitung} 102 | 103 | Hier kommt die Einleitung. Ihre Überschrift kommt 104 | automatisch in das Inhaltsverzeichnis. 105 | 106 | \subsection{Formeln} 107 | 108 | \LaTeX{} ist auch ohne Formeln sehr nützlich und 109 | einfach zu verwenden. Grafiken, Tabellen, 110 | Querverweise aller Art, Literatur- und 111 | Stichwortverzeichnis sind kein Problem. 112 | 113 | Formeln sind etwas schwieriger, dennoch hier ein 114 | einfaches Beispiel. Zwei von Einsteins 115 | berühmtesten Formeln lauten: 116 | 117 | \begin{align} 118 | E &= mc^2 \\ 119 | m &= \frac{m_0}{\sqrt{1-\frac{v^2}{c^2}}} 120 | \end{align} 121 | 122 | Aber wer keine Formeln schreibt, braucht sich 123 | damit auch nicht zu beschäftigen. 124 | 125 | \end{document} 126 | ``` 127 | 128 | Pros | Cons 129 | ---------------------------|----------------------------------- 130 | supports custom tags | harder to parse 131 | 132 | ### SGML/XML (HTML) 133 | 134 | SGML and XML are the basis for many markup languages. 135 | Most prominently, (X)HTML is based on them to markup webpages. 136 | 137 | ```html 138 | 139 | 140 | Foo 141 | 142 | 143 |

Foo

144 | Bar Baz Quux 145 | 146 | 147 | ``` 148 | 149 | Pros | Cons 150 | ---------------------------------|----------------------------------- 151 | very widespread | much boilerplate, because of very chatty syntax 152 | parsers available out-of-the-box | 153 | 154 | 155 | Configuration Languages 156 | ----------------------- 157 | 158 | ### YAML 159 | 160 | ```yaml 161 | config: 162 | Foo: 163 | bar: "quux" 164 | baz: 42 165 | Bar: 166 | bar: "foo" 167 | baz: 7 168 | ``` 169 | 170 | #### HOCON 171 | 172 | ``` 173 | config { 174 | Foo { 175 | bar = "quux" 176 | baz = 42 177 | }, 178 | Bar { 179 | bar = "foo", 180 | baz = 7 181 | } 182 | } 183 | ``` 184 | 185 | ### INI 186 | 187 | ``` 188 | [config] 189 | 190 | [config.Foo] 191 | bar = "quux" 192 | baz = 42 193 | 194 | [config.Bar] 195 | bar = "foo" 196 | baz = 7 197 | ``` 198 | 199 | #### JSON 200 | 201 | ```json 202 | { 203 | "Foo": { 204 | "bar": "quux", 205 | "baz": 42 206 | }, 207 | "Bar": { 208 | "bar": "foo", 209 | "baz": 7 210 | } 211 | } 212 | ``` 213 | 214 | ### XML 215 | 216 | ```xml 217 | 218 | 219 | quux 220 | 42 221 | 222 | 223 | foo 224 | 7 225 | 226 | 227 | ``` 228 | 229 | 230 | Serialization Languages 231 | ----------------------- 232 | 233 | ### JSON 234 | 235 | ```json 236 | { 237 | "foo": 42, 238 | "bar": [ 7, "baz" ], 239 | "quux": { 240 | "a": 1, 241 | "b": 2 242 | } 243 | } 244 | ``` 245 | 246 | ### YAML 247 | 248 | ```yaml 249 | sample: 250 | foo: 42 251 | bar: 252 | - 7 253 | - "baz" 254 | quux: 255 | a: 1 256 | b: 2 257 | ``` 258 | 259 | ### XML 260 | 261 | ```xml 262 | 263 | 42 264 | 265 | 7 266 | baz 267 | 268 | 269 | 1 270 | 2 271 | 272 | 273 | ``` 274 | 275 | 276 | Query Languages 277 | --------------- 278 | 279 | ### XPath 280 | 281 | ``` 282 | // Foo [ @bar == 'quux' ] 283 | ``` 284 | 285 | ### ASTq 286 | 287 | ```js 288 | astq.query(` 289 | // VariableDeclarator [ 290 | /:id Identifier [ @name ] 291 | && /:init Literal [ @value ] 292 | ] 293 | `) 294 | ``` 295 | 296 | ## CSS Selectors 297 | 298 | ``` 299 | @id .class > *[type="foo"] 300 | ``` 301 | 302 | ### DuckyJS Selectors 303 | 304 | ```js 305 | ducky.select({ 306 | foo: { 307 | bar: { 308 | baz: [ 42, 7, "Quux" ] 309 | } 310 | } 311 | }, 312 | "foo['bar'].baz[2]" 313 | ) 314 | ``` 315 | 316 | ### Neo4J Cypher 317 | 318 | ``` 319 | MATCH 320 | (you {name:"You"}), 321 | (expert)-[:WORKED_WITH]->(db:Database {name:"Neo4j"}), 322 | p = shortestPath((you)-[:FRIEND*..5]-(expert)) 323 | RETURN p, db 324 | ``` 325 | 326 | ### ArangoDB AQL 327 | 328 | ``` 329 | FOR user IN users 330 | SORT user.name, user.gender 331 | RETURN user 332 | ``` 333 | 334 | ### SQL 335 | 336 | ``` 337 | SELECT * FROM users 338 | WHERE active = 1 339 | ORDER BY name, gender; 340 | ``` 341 | 342 | 343 | Pattern Languages 344 | ----------------- 345 | 346 | ### DuckyJS Validation 347 | 348 | ``` 349 | ducky.validate({ 350 | foo: "Foo", 351 | bar: "Bar", 352 | baz: [ 42, 7, "Quux" ] 353 | }, `{ 354 | foo: string, 355 | bar: any, 356 | baz: [ number+, string* ], 357 | quux?: any 358 | }`) 359 | ``` 360 | 361 | ### Regular Expression 362 | 363 | ``` 364 | /[+-]?[0-9]*\.[0-9]+([eE][+-]?[0-9]+)?/.test(str) 365 | ``` 366 | 367 | ### Shell Glob Pattern 368 | 369 | ``` 370 | foo-*.txt 371 | ``` 372 | 373 | 374 | Programming Languages 375 | --------------------- 376 | 377 | ### ECMAScript 378 | 379 | ```js 380 | export default class OSet { 381 | constructor () { 382 | this._items = 0 383 | this._index = {} 384 | this._ring = {} 385 | this._ring.prev = this._ring 386 | this._ring.next = this._ring 387 | return this 388 | } 389 | length () { 390 | return this._items 391 | } 392 | keys () { 393 | return this.each(function (val, key) { 394 | this.push(key) 395 | }, []) 396 | } 397 | values () { 398 | return this.each(function (val /*, key */) { 399 | this.push(val) 400 | }, []) 401 | } 402 | find (predicate, ctx) { 403 | if (arguments < 2) 404 | ctx = this 405 | return this.each(function (val, key, order) { 406 | if (predicate.call(ctx, val, key, order)) 407 | this.push(val) 408 | }, []) 409 | } 410 | each (iterator, ctx) { 411 | if (arguments < 2) 412 | ctx = this 413 | let i = 0 414 | let bucket = this._ring.next 415 | while (bucket !== this._ring) { 416 | iterator.call(ctx, bucket.val, bucket.key, i++) 417 | bucket = bucket.next 418 | } 419 | return ctx 420 | } 421 | [...] 422 | } 423 | ``` 424 | 425 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | 2 | License 3 | ------- 4 | 5 | Copyright (c) 2015 Ralf S. Engelschall (http://engelschall.com/) 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining 8 | a copy of this software and associated documentation files (the 9 | "Software"), to deal in the Software without restriction, including 10 | without limitation the rights to use, copy, modify, merge, publish, 11 | distribute, sublicense, and/or sell copies of the Software, and to 12 | permit persons to whom the Software is furnished to do so, subject to 13 | the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included 16 | in all copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 19 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 21 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 22 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 | 26 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | NPM = npm 3 | NODE = npx babel-node --presets @babel/preset-env 4 | VARIANT = 5 | 6 | VARIANTS = \ 7 | 0-re \ 8 | 1-sm \ 9 | 2-sm-ast \ 10 | 3-ls-rdp-ast \ 11 | 4-pc-ast \ 12 | 5-peg-ast 13 | 14 | all: bootstrap $(VARIANTS) 15 | 16 | bootstrap: 17 | @if [ ! -d node_modules ]; then \ 18 | echo "++ installing required third-party modules (top-level)"; \ 19 | $(NPM) install; \ 20 | fi; \ 21 | for variant in $(VARIANTS); do \ 22 | if [ -f "cfg2kv-$$variant/package.json" -a ! -d "cfg2kv-$$variant/node_modules" ]; then \ 23 | echo "++ installing required third-party modules (cfg2kv-$$variant)"; \ 24 | (cd cfg2kv-$$variant && $(NPM) install); \ 25 | fi; \ 26 | done 27 | 28 | cfg2kv: 29 | @$(NODE) cfg2kv.js $(VARIANT) 30 | 31 | 0-re: 32 | @$(MAKE) $(MFLAGS) cfg2kv VARIANT=$@ 33 | 1-sm: 34 | @$(MAKE) $(MFLAGS) cfg2kv VARIANT=$@ 35 | 2-sm-ast: 36 | @$(MAKE) $(MFLAGS) cfg2kv VARIANT=$@ 37 | 3-ls-rdp-ast: 38 | @$(MAKE) $(MFLAGS) cfg2kv VARIANT=$@ 39 | 4-pc-ast: 40 | @$(MAKE) $(MFLAGS) cfg2kv VARIANT=$@ 41 | 5-peg-ast: 42 | @$(MAKE) $(MFLAGS) cfg2kv VARIANT=$@ 43 | 44 | clean: 45 | -rm -rf node_modules 46 | -rm -rf cfg2kv-*/node_modules 47 | 48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | Parsing Techniques 3 | ================== 4 | 5 | There are lots of [formal languages](LANGUAGES.md) for various kinds of 6 | practical purposes. But they all have one thing in common: for further 7 | processing them inside a program, they first have to be parsed from 8 | their character string representation. This is the material of a lecture 9 | about various techniques to perform this parsing step. 10 | 11 | Notice: the code is all written in [ECMAScript 12 | 6](http://en.wikipedia.org/wiki/ECMAScript), is on-the-fly transpiled 13 | to ECMAScript 5 and then executed under [Node.js](http://nodejs.org/), 14 | but it actually doesn't matter very much. Equivalent code can be written 15 | in Java or C#, too. The only major point is just that the required 16 | third-party libraries have to be also changed, of course. 17 | 18 | Parsing Input 19 | ------------- 20 | 21 | Let's imagine a formal language for describing key/value based 22 | configurations in a redundancy-free nested structure. 23 | A [sample configuration](sample.cfg) can be: 24 | 25 | ``` 26 | foo { 27 | baz = 7 // some comment 28 | bar { 29 | quux = 42 30 | hello = "{hello} = \"world\"!" 31 | } 32 | quux = 3 33 | } 34 | bar = 1 35 | quux = 2 36 | ``` 37 | 38 | This is a very simple formal language, but it already has 39 | some cruxes which can become a hurdle for parsing: 40 | 41 | 1. nested sections 42 | 2. intermixed comments 43 | 3. alternatives (value is either number or string) 44 | 4. string value can contain spaces, quotes and section braces 45 | 46 | Parsing Output 47 | -------------- 48 | 49 | Let's imagine we want to parse configurations in the above format into a 50 | [simple key/value format](sample.kv) where the sections are flattened: 51 | 52 | ``` 53 | foo.bar.quux 42 54 | foo.bar.hello {hello} = "world"! 55 | foo.baz 7 56 | foo.quux 3 57 | bar 1 58 | quux 2 59 | ``` 60 | 61 | Parsing Techniques 62 | ------------------ 63 | 64 | There are various parsing techniques available, each with their pros and 65 | cons. For illustration purposes we've implemented a bunch of them. Each 66 | one can be run by executing `make ` where `` is one of `0-re`, 67 | `1-sm`, `2-sm-ast`, `3-ls-rdp-ast` or `4-peg-ast`. Follow the above 68 | links to their particular source code and documentation. 69 | 70 | - [`cfg2kv-0-re/`](cfg2kv-0-re/):
71 | **Regular Expressions (RE)** 72 | 73 | - [`cfg2kv-1-sm/`](cfg2kv-1-sm/):
74 | **State Machine (SM)** 75 | 76 | - [`cfg2kv-2-sm-ast/`](cfg2kv-2-sm-ast/):
77 | **State Machine (SM), Abstract Syntax Tree (AST)** 78 | 79 | - [`cfg2kv-3-ls-rdp-ast/`](cfg2kv-3-ls-rdp-ast/):
80 | **Lexical Scanner (LS), Recursive Descent Parser (RDP), Abstract Syntax Tree (AST)** 81 | 82 | - [`cfg2kv-4-pc-ast/`](cfg2kv-4-pc-ast/):
83 | **Parser Combinators (PC), Abstract Syntax Tree (AST)** 84 | 85 | - [`cfg2kv-5-peg-ast/`](cfg2kv-4-peg-ast/):
86 | **Parsing Expression Grammar (PEG) Parser, Abstract Syntax Tree (AST)** 87 | 88 | -------------------------------------------------------------------------------- /TOOLS.md: -------------------------------------------------------------------------------- 1 | 2 | Parsing Tools 3 | ============= 4 | 5 | The following is a list of known tools for support text parsing in JavaScript. 6 | Some of them (\*) are provided by the author itself. 7 | 8 | Parser Generators 9 | ----------------- 10 | 11 | Generate parsers from external grammars definitions. 12 | 13 | - [Nearley](https://nearley.js.org/) 14 | - [PEG.js](https://pegjs.org/) 15 | - [ANTLR](https://www.antlr.org/) 16 | - [WayEye](https://waxeye.org/) 17 | - [APG](https://github.com/ldthomas/apg-js2) 18 | - [Jison](http://zaa.ch/jison/) 19 | 20 | Parser Combinators 21 | ------------------ 22 | 23 | Generate parsers via internal combinator functions. 24 | 25 | - [Arcsecond](https://github.com/francisrstokes/arcsecond) 26 | - [Chevrotain](https://github.com/SAP/chevrotain) 27 | - [Bennu](http://bennu-js.com/) 28 | - [ParJS](https://github.com/GregRos/parjs) 29 | - [Parsimmon](https://github.com/jneen/parsimmon) 30 | 31 | Tokenizer/Scanner/Lexer 32 | ----------------------- 33 | 34 | Split a stream of characters into tokens. 35 | 36 | - [Moo](https://github.com/no-context/moo) 37 | - [Tokenizr](https://github.com/rse/tokenizr) * 38 | 39 | String Pattern Matcher 40 | ---------------------- 41 | 42 | Match patterns in strings. 43 | 44 | - [XRegExp](http://xregexp.com/) 45 | - [VerbalExpressions](https://github.com/VerbalExpressions/JSVerbalExpressions)] 46 | - [MicroMatch](https://github.com/micromatch/micromatch) 47 | - [FuzzySet](https://glench.github.io/fuzzyset.js/) 48 | 49 | String Manipulation 50 | ------------------- 51 | 52 | Manipulate strings. 53 | 54 | - [Voca](https://vocajs.com/) 55 | 56 | Object Pattern Matcher 57 | ---------------------- 58 | 59 | Match patterns in object graphs. 60 | 61 | - [DuckyJS](http://duckyjs.com/) * 62 | - [Extraction](http://extraction.js.org/) * 63 | - [paMatcher](http://pamatcher.js.org/) 64 | - [z](https://z-pattern-matching.github.io/) 65 | - [Tailored](https://github.com/elixirscript/tailored) 66 | - [Fuse.js](http://fusejs.io/) 67 | 68 | String Generators 69 | ----------------- 70 | 71 | Generate strings. 72 | 73 | - [Nunjucks](https://mozilla.github.io/nunjucks/) 74 | - [Divertr](https://github.com/rse/divertr) * 75 | - [TextFrame](https://github.com/rse/textframe) * 76 | - [Syntax](https://github.com/rse/syntax/) * 77 | 78 | Abstract Syntax Tree (AST) 79 | -------------------------- 80 | 81 | Manage and operate a tree representing a language structure. 82 | 83 | - [ASTy](https://github.com/rse/asty) * 84 | - [ASTq](https://github.com/rse/astq) * 85 | - [ASTy-ASTq](https://github.com/rse/asty-astq) * 86 | 87 | Parsing Utilities 88 | ----------------- 89 | 90 | Utilities for parsing. 91 | 92 | - [PEG.js OTF](https://github.com/rse/pegjs-otf) * 93 | - [PEG.js Util](https://github.com/rse/pegjs-util) * 94 | - [Source-Code-Error](https://github.com/rse/source-code-error) * 95 | 96 | -------------------------------------------------------------------------------- /cfg2kv-0-re/README.md: -------------------------------------------------------------------------------- 1 | 2 | Variant: **Regular Expressions (RE)**
3 | 4 | Source: [`cfg2kv.js`](cfg2kv.js), [`package.json`](package.json) 5 | 6 | This is the Unix rooky approach. It uses complex [Regular 7 | Expressions](http://en.wikipedia.org/wiki/Regular_expression) to 8 | match sections and property assignments. Section blocks are matched 9 | recursively while carrying forward the current namespace information. 10 | Properties are matched and then immediately lead to the resulting 11 | key/value output. 12 | 13 | **RECOMMENDATION**: Use only for "code once & forget" situations or 14 | if external dependencies are not acceptable, but be aware of the hard 15 | limitations. 16 | 17 | Pros | Cons 18 | ---------------------------|----------------------------------- 19 | very little code | hard to understand 20 | no external dependencies | does not support multiple sections at same level 21 | | fails if non-balanced braces occur in strings 22 | | produces key/values in section-first/properties-second order 23 | | inflexible in output generation 24 | | hard to implement error reporting 25 | | impossible to track line/column 26 | 27 | 28 | -------------------------------------------------------------------------------- /cfg2kv-0-re/cfg2kv.js: -------------------------------------------------------------------------------- 1 | 2 | module.exports = class CFG2KV { 3 | 4 | /* parse configuration format into key/value format */ 5 | cfg2kv (cfg) { 6 | let kv = "" 7 | 8 | /* get rid of all end-of-line comments */ 9 | cfg = cfg.replace(/\/\/[^\r\n]*\r?\n/g, "") 10 | 11 | /* parse a section block */ 12 | const parseBlock = (ns, cfg) => { 13 | /* parse sections */ 14 | cfg = cfg.replace(/([a-zA-Z_][a-zA-Z_0-9]*)\s*\{((?:.|\r?\n)*)\}/, (m, key, block) => { 15 | block = parseBlock(ns.concat(key), block) /* RECURSION */ 16 | return "" 17 | }) 18 | 19 | /* parse property assignments */ 20 | cfg = cfg.replace(/([a-zA-Z_][a-zA-Z_0-9]*)\s*=\s*(?:(\d+)|"((?:\\"|.)*?)")/g, (m, key, val1, val2) => { 21 | let val = val1 22 | if (val === undefined) 23 | val = val2.replace(/\\"/g, "\"") 24 | kv += `${[ ...ns, key ].join(".")} ${val}\n` 25 | return "" 26 | }) 27 | 28 | return cfg 29 | } 30 | 31 | /* bootstrap parsing */ 32 | cfg = parseBlock([], cfg) 33 | 34 | return kv 35 | } 36 | } 37 | 38 | -------------------------------------------------------------------------------- /cfg2kv-0-re/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "cfg2kv-0-re", 3 | "description": "variant: Regular Expresssions (RE)", 4 | "repository": "rse/parsing-techniques", 5 | "license": "MIT", 6 | "dependencies": { 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /cfg2kv-1-sm/README.md: -------------------------------------------------------------------------------- 1 | 2 | Variant: **State Machine (SM)** 3 | 4 | Source: [`cfg2kv.js`](cfg2kv.js), [`package.json`](package.json) 5 | 6 | As Regular Expressions are (unfortunately) not everyone's favorite, 7 | many people would avoid them and code a parser with bare language 8 | features only. This is the next technique. One still needs to cope 9 | with the Cons of technique 0 above, so a [State Machine](http://en.wikipedia.org/wiki/Finite-state_machine) is used to 10 | support multiple sections, arbitrary nesting, more precisely string 11 | parsing and expected output ordering. 12 | 13 | **RECOMMENDATION**: Avoid this approach at all as it is too low-level 14 | and leads to too much boilerplate code. 15 | 16 | Pros | Cons 17 | ---------------------------|----------------------------------- 18 | very much code | hard to understand 19 | no external dependencies | still inflexible in output generation 20 | | hard to implement error reporting 21 | | hard to track line/column 22 | 23 | -------------------------------------------------------------------------------- /cfg2kv-1-sm/cfg2kv.js: -------------------------------------------------------------------------------- 1 | 2 | module.exports = class CFG2KV { 3 | 4 | /* parse configuration format into key/value format */ 5 | cfg2kv (cfg) { 6 | 7 | /* helper function for character class checking */ 8 | const isAlpha = (ch) => { 9 | let cc = ch.charCodeAt(0) 10 | return ( 11 | ("A".charCodeAt(0) <= cc && cc <= "Z".charCodeAt(0)) 12 | || ("a".charCodeAt(0) <= cc && cc <= "z".charCodeAt(0)) 13 | ) 14 | } 15 | const isDigit = (ch) => { 16 | let cc = ch.charCodeAt(0) 17 | return ("0".charCodeAt(0) <= cc && cc <= "9".charCodeAt(0)) 18 | } 19 | 20 | /* helper function for consuming and looking at next characters */ 21 | const consume = (cfg, num) => { 22 | if (num > 0) 23 | cfg = cfg.substr(num) 24 | return cfg 25 | } 26 | const lookahead = (cfg) => { 27 | let ch = cfg.substr(0, 1) 28 | let la = cfg.substr(1, 1) 29 | return { ch, la } 30 | } 31 | 32 | /* iterate over the configuration file content */ 33 | let kv = "" 34 | let ns = [] 35 | let id = "" 36 | let expect = "key" 37 | while (cfg !== "") { 38 | /* extract first and second (look-ahead) character */ 39 | let { ch, la } = lookahead(cfg) 40 | 41 | /* white-space */ 42 | if (ch === " " || ch === "\t" || ch === "\n") 43 | cfg = consume(cfg, 1) 44 | 45 | /* end-of-line comment */ 46 | else if (ch === "/" && la === "/") { 47 | while (cfg !== "" && ch !== "\n") { 48 | cfg = consume(cfg, 1) 49 | ch = lookahead(cfg).ch 50 | } 51 | } 52 | 53 | /* section start end end */ 54 | else if (ch === "{") { 55 | ns.push(id) 56 | cfg = consume(cfg, 1) 57 | expect = "key" 58 | } 59 | else if (ch === "}") { 60 | ns.pop() 61 | cfg = consume(cfg, 1) 62 | } 63 | 64 | /* key */ 65 | else if (expect === "key" && isAlpha(ch)) { 66 | id = "" 67 | while (isAlpha(ch)) { 68 | id += ch 69 | cfg = consume(cfg, 1) 70 | ch = lookahead(cfg).ch 71 | } 72 | expect = "val" 73 | } 74 | 75 | /* key to value punctuation operator */ 76 | else if (ch === "=") { 77 | cfg = consume(cfg, 1) 78 | } 79 | 80 | /* number value */ 81 | else if (expect === "val" && isDigit(ch)) { 82 | let num = "" 83 | while (isDigit(ch)) { 84 | num += ch 85 | cfg = consume(cfg, 1) 86 | ch = lookahead(cfg).ch 87 | } 88 | kv += `${[ ...ns, id].join(".")} ${parseInt(num, 10)}\n` 89 | expect = "key" 90 | } 91 | 92 | /* string value */ 93 | else if (expect === "val" && ch === "\"") { 94 | let str = "" 95 | let escaped = false 96 | cfg = consume(cfg, 1) 97 | ch = lookahead(cfg).ch 98 | while (ch !== "\"" || escaped) { 99 | if (ch === "\\") 100 | escaped = true 101 | else { 102 | escaped = false 103 | str += ch 104 | } 105 | cfg = consume(cfg, 1) 106 | ch = lookahead(cfg).ch 107 | } 108 | cfg = consume(cfg, 1) 109 | ch = lookahead(cfg).ch 110 | kv += `${[ ...ns, id ].join(".")} ${str}\n` 111 | expect = "key" 112 | } 113 | 114 | /* everything else leads to a parsing error */ 115 | else { 116 | throw new Error("unexpected character: \"" + ch + "\" (at \"" + cfg + "\")") 117 | } 118 | } 119 | return kv 120 | } 121 | } 122 | 123 | -------------------------------------------------------------------------------- /cfg2kv-1-sm/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "cfg2kv-1-sm", 3 | "description": "variant: State Machine (SM)", 4 | "repository": "rse/parsing-techniques", 5 | "license": "MIT", 6 | "dependencies": { 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /cfg2kv-2-sm-ast/README.md: -------------------------------------------------------------------------------- 1 | 2 | Variant: **State Machine (SM), Abstract Syntax Tree (AST)** 3 | 4 | Source: [`cfg2kv.js`](cfg2kv.js), [`package.json`](package.json) 5 | 6 | As the direct output generation usually is not recommended (it works 7 | just for the cases where the output ordering directly follows the 8 | input ordering) one usually always wants to use an intermediate 9 | format, a so called [Abstract Syntax Tree](http://en.wikipedia.org/wiki/Abstract_syntax_tree) 10 | (AST). This way two passes are performed: parsing of the input 11 | syntax into the AST and then querying the AST to produce the 12 | output. For our trivial configuration language this looks like 13 | overhead, but for mostly all parsing approaches this is the 14 | recommended way. Additionally, some too much bare metal coding 15 | can be replaced with at least a little bit of Regular Expressions 16 | again. This uses my [ASTy](https://github.com/rse/asty) and 17 | [ASTq](https://github.com/rse/astq) as external libraries. 18 | 19 | **RECOMMENDATION**: Avoid this approach at all as it is also too low-level 20 | and leads to too much boilerplate code. 21 | 22 | Pros | Cons 23 | -----------------------------------|----------------------------------- 24 | still rather much code | still partly hard to understand 25 | very flexible in output generation | external dependencies 26 | | hard to implement error reporting 27 | | hard to track line/column 28 | 29 | -------------------------------------------------------------------------------- /cfg2kv-2-sm-ast/cfg2kv.js: -------------------------------------------------------------------------------- 1 | 2 | import ASTY from "asty-astq" 3 | 4 | module.exports = class CFG2KV { 5 | 6 | /* parse configuration format into key/value format */ 7 | cfg2kv (cfg) { 8 | let ast = this.cfg2ast(cfg) 9 | console.log(ast.dump()) 10 | let kv = this.ast2kv(ast) 11 | return kv 12 | } 13 | 14 | /* parse configuration format into Abstract Syntax Tree (AST) */ 15 | cfg2ast (cfg) { 16 | const asty = new ASTY() 17 | const AST = (type) => asty.create(type) 18 | let ast = AST("Section").pos(1, 1, 0).set({ ns: "" }) 19 | let cursor = ast 20 | 21 | /* helper function for character class checking */ 22 | const isAlpha = (ch) => /^[A-Za-z]$/.test(ch) 23 | const isDigit = (ch) => /^[0-9]$/.test(ch) 24 | 25 | /* helper function for consuming and looking at next characters */ 26 | let line = 1 27 | let column = 1 28 | let offset = 0 29 | const consume = (cfg, num) => { 30 | if (num > 0) { 31 | offset += num 32 | for (let i = 0; i < num; i++) { 33 | let c = cfg.charAt(i) 34 | if (c === "\r") 35 | column = 1 36 | else if (c === "\n") { 37 | line++ 38 | column = 1 39 | } 40 | else if (c === "\t") 41 | column += 8 - (column % 8) 42 | else 43 | column++ 44 | } 45 | cfg = cfg.substr(num) 46 | } 47 | return cfg 48 | } 49 | const lookahead = (cfg) => { 50 | let ch = cfg.substr(0, 1) 51 | let la = cfg.substr(1, 1) 52 | return { ch, la } 53 | } 54 | 55 | /* iterate over the configuration file content */ 56 | let id = "" 57 | let expect = "key" 58 | while (cfg !== "") { 59 | /* extract first and second (look-ahead) character */ 60 | let { ch, la } = lookahead(cfg) 61 | 62 | /* white-space */ 63 | if (ch === " " || ch === "\t" || ch === "\n") 64 | cfg = consume(cfg, 1) 65 | 66 | /* end-of-line comment */ 67 | else if (ch === "/" && la === "/") { 68 | while (cfg !== "" && ch !== "\n") { 69 | cfg = consume(cfg, 1) 70 | ch = lookahead(cfg).ch 71 | } 72 | } 73 | 74 | /* section start end end */ 75 | else if (ch === "{") { 76 | let node = AST("Section").pos(line, column, offset).set({ ns: id }) 77 | cursor.add(node) 78 | cursor = node 79 | cfg = consume(cfg, 1) 80 | expect = "key" 81 | } 82 | else if (ch === "}") { 83 | cursor = cursor.parent() 84 | cfg = consume(cfg, 1) 85 | } 86 | 87 | /* key */ 88 | else if (expect === "key" && isAlpha(ch)) { 89 | id = "" 90 | while (isAlpha(ch)) { 91 | id += ch 92 | cfg = consume(cfg, 1) 93 | ch = lookahead(cfg).ch 94 | } 95 | expect = "val" 96 | } 97 | 98 | /* key to value punctuation operator */ 99 | else if (ch === "=") { 100 | cfg = consume(cfg, 1) 101 | } 102 | 103 | /* number value */ 104 | else if (expect === "val" && isDigit(ch)) { 105 | let num = "" 106 | while (isDigit(ch)) { 107 | num += ch 108 | cfg = consume(cfg, 1) 109 | ch = lookahead(cfg).ch 110 | } 111 | let node = AST("Property").pos(line, column, offset).set({ key: id, val: parseInt(num, 10) }) 112 | cursor.add(node) 113 | expect = "key" 114 | } 115 | 116 | /* string value */ 117 | else if (expect === "val" && ch === "\"") { 118 | let str = "" 119 | let escaped = false 120 | cfg = consume(cfg, 1) 121 | ch = lookahead(cfg).ch 122 | while (ch !== "\"" || escaped) { 123 | if (ch === "\\") 124 | escaped = true 125 | else { 126 | escaped = false 127 | str += ch 128 | } 129 | cfg = consume(cfg, 1) 130 | ch = lookahead(cfg).ch 131 | } 132 | cfg = consume(cfg, 1) 133 | ch = lookahead(cfg).ch 134 | let node = AST("Property").pos(line, column, offset).set({ key: id, val: str }) 135 | cursor.add(node) 136 | expect = "key" 137 | } 138 | 139 | /* everything else leads to a parsing error */ 140 | else { 141 | throw new Error("unexpected character: \"" + ch + "\" (at \"" + cfg + "\")") 142 | } 143 | } 144 | return ast 145 | } 146 | 147 | /* generate key/value format from Abstract Syntax Tree (AST) */ 148 | ast2kv (ast) { 149 | let kv = "" 150 | ast.query("// Property").forEach((p) => { 151 | let ns = p.query("..// Section").reverse().slice(1).map((n) => n.get("ns")) 152 | let key = [ ...ns, p.get("key") ].join(".") 153 | let val = p.get("val") 154 | kv += `${key} ${val}\n` 155 | }) 156 | return kv 157 | } 158 | } 159 | 160 | -------------------------------------------------------------------------------- /cfg2kv-2-sm-ast/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "cfg2kv-2-sm-ast", 3 | "description": "variant: State Machine (SM), Abstract Syntax Tree (AST)", 4 | "repository": "rse/parsing-techniques", 5 | "license": "MIT", 6 | "dependencies": { 7 | "asty-astq": "1.12.0" 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /cfg2kv-3-ls-rdp-ast/README.md: -------------------------------------------------------------------------------- 1 | 2 | Variant: **Lexical Scanner (LS), Recursive Descent Parser (RDP), Abstract Syntax Tree (AST)** 3 | 4 | Source: [`cfg2kv.js`](cfg2kv.js), [`package.json`](package.json) 5 | 6 | One can get rid of the low-level character parsing by splitting the 7 | parsing into two streams: tokenization of characters and parsing the 8 | token structure. For the first we now use a library supporting the 9 | implementation of so-called [Lexical Scanners](http://en.wikipedia.org/wiki/Lexical_analysis). For the second, we now 10 | switch over to [Recursive Descent Parsing](http://en.wikipedia.org/wiki/Recursive_descent_parser). 11 | This uses my [Tokenizr](https://github.com/rse/tokenizr), [ASTy](https://github.com/rse/asty) 12 | and [ASTq](https://github.com/rse/astq) as external libraries. 13 | 14 | **RECOMMENDATION**: Acceptable for simple formal language structures and 15 | limited situations, but usually not worth the effort as it still 16 | causes noticable boilerplate code. 17 | 18 | Pros | Cons 19 | -----------------------------------|----------------------------------- 20 | acceptable amount of code | external dependencies 21 | easier to understand | still somewhat lower-level 22 | very flexible in output generation | hard to implement error reporting 23 | semi-automatic line/column tracking| 24 | 25 | -------------------------------------------------------------------------------- /cfg2kv-3-ls-rdp-ast/cfg2kv.js: -------------------------------------------------------------------------------- 1 | 2 | import ASTY from "asty-astq" 3 | import Tokenizr from "tokenizr" 4 | 5 | module.exports = class CFG2KV { 6 | 7 | /* parse configuration format into key/value format */ 8 | cfg2kv (cfg) { 9 | let ast = this.cfg2ast(cfg) 10 | console.log(ast.dump()) 11 | let kv = this.ast2kv(ast) 12 | return kv 13 | } 14 | 15 | /* parse configuration format into Abstract Syntax Tree (AST) */ 16 | cfg2ast (cfg) { 17 | /* establish abstract syntax tree (AST) node generator */ 18 | let asty = new ASTY() 19 | const AST = (type, ref) => { 20 | let ast = asty.create(type) 21 | if (typeof ref === "object" && ref instanceof Array && ref.length > 0) 22 | ref = ref[0] 23 | if (typeof ref === "object" && ref instanceof Tokenizr.Token) 24 | ast.pos(ref.line, ref.column, ref.pos) 25 | else if (typeof ref === "object" && asty.isA(ref)) 26 | ast.pos(ref.pos().line, ref.pos().column, ref.pos().offset) 27 | return ast 28 | } 29 | 30 | /* establish lexical scanner */ 31 | let lexer = new Tokenizr() 32 | lexer.rule(/[a-zA-Z_][a-zA-Z0-9_]*/, (ctx, m) => { 33 | ctx.accept("id") 34 | }) 35 | lexer.rule(/[+-]?[0-9]+/, (ctx, m) => { 36 | ctx.accept("number", parseInt(m[0])) 37 | }) 38 | lexer.rule(/"((?:\\\"|[^\r\n]+)+)"/, (ctx, m) => { 39 | ctx.accept("string", m[1].replace(/\\"/g, "\"")) 40 | }) 41 | lexer.rule(/\/\/[^\r\n]+\r?\n/, (ctx, m) => { 42 | ctx.ignore() 43 | }) 44 | lexer.rule(/[ \t\r\n]+/, (ctx, m) => { 45 | ctx.ignore() 46 | }) 47 | lexer.rule(/./, (ctx, m) => { 48 | ctx.accept("char") 49 | }) 50 | 51 | /* establish recursive descent parser */ 52 | let parser = { 53 | parseCfg () { 54 | let block = this.parseBlock() 55 | lexer.consume("EOF") 56 | return AST("Section", block).set({ ns: "" }).add(block) 57 | }, 58 | parseBlock () { 59 | let items = [] 60 | for (;;) { 61 | let item = lexer.alternatives( 62 | this.parseProperty.bind(this), 63 | this.parseSection.bind(this), 64 | this.parseEmpty.bind(this)) 65 | if (item === undefined) 66 | break 67 | items.push(item) 68 | } 69 | return items 70 | }, 71 | parseProperty () { 72 | let key = this.parseId() 73 | lexer.consume("char", "=") 74 | let value = lexer.alternatives( 75 | this.parseNumber.bind(this), 76 | this.parseString.bind(this)) 77 | return AST("Property", value).set({ key: key.value, val: value.value }) 78 | }, 79 | parseSection () { 80 | let ns = this.parseId() 81 | lexer.consume("char", "{") 82 | let block = this.parseBlock() 83 | lexer.consume("char", "}") 84 | return AST("Section", ns).set({ ns: ns.value }).add(block) 85 | }, 86 | parseId () { 87 | return lexer.consume("id") 88 | }, 89 | parseNumber () { 90 | return lexer.consume("number") 91 | }, 92 | parseString () { 93 | return lexer.consume("string") 94 | }, 95 | parseEmpty () { 96 | return undefined 97 | } 98 | } 99 | 100 | /* parse syntax character string into abstract syntax tree (AST) */ 101 | let ast 102 | try { 103 | lexer.input(cfg) 104 | ast = parser.parseCfg() 105 | } 106 | catch (ex) { 107 | console.log(ex.toString()) 108 | process.exit(0) 109 | } 110 | return ast 111 | } 112 | 113 | /* generate key/value format from Abstract Syntax Tree (AST) */ 114 | ast2kv (ast) { 115 | let kv = "" 116 | ast.query("// Property").forEach((p) => { 117 | let ns = p.query("..// Section").reverse().slice(1).map((n) => n.get("ns")) 118 | let key = [ ...ns, p.get("key") ].join(".") 119 | let val = p.get("val") 120 | kv += `${key} ${val}\n` 121 | }) 122 | return kv 123 | } 124 | } 125 | 126 | -------------------------------------------------------------------------------- /cfg2kv-3-ls-rdp-ast/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "cfg2kv-3-ls-rdp-ast", 3 | "description": "variant: Lexical Scanner (LS), Recursive Descent Parser (RDP), Abstract Syntax Tree (AST)", 4 | "repository": "rse/parsing-techniques", 5 | "license": "MIT", 6 | "dependencies": { 7 | "tokenizr": "1.4.0", 8 | "asty-astq": "1.12.0" 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /cfg2kv-4-pc-ast/README.md: -------------------------------------------------------------------------------- 1 | 2 | Variant: **Parser Combinators (PC), Abstract Syntax Tree (AST)** 3 | 4 | Source: [`cfg2kv.js`](cfg2kv.js), [`package.json`](package.json) 5 | 6 | To get rid of the separation between scanner and parser, one can use 7 | scannerless parsing. One way is the use Haskel Parsec like Parser Combinators like Arcsecond, which 8 | also provide Regular Expression based parsers. 9 | 10 | **RECOMMENDATION**: Acceptable for simple formal language structures and 11 | limited situations, but usually not worth the effort as it still 12 | causes noticable boilerplate code. 13 | 14 | Pros | Cons 15 | -----------------------------------|----------------------------------- 16 | no separation of scanner and parser| external dependencies 17 | everything is plain host language | hard to implement error reporting 18 | | noticable boilerplate code 19 | 20 | -------------------------------------------------------------------------------- /cfg2kv-4-pc-ast/cfg2kv.js: -------------------------------------------------------------------------------- 1 | 2 | import ASTY from "asty-astq" 3 | import * as AS from "arcsecond" 4 | 5 | module.exports = class CFG2KV { 6 | 7 | /* parse configuration format into key/value format */ 8 | cfg2kv (cfg) { 9 | let ast = this.cfg2ast(cfg) 10 | console.log(ast.dump()) 11 | let kv = this.ast2kv(ast) 12 | return kv 13 | } 14 | 15 | /* parse configuration format into Abstract Syntax Tree (AST) */ 16 | cfg2ast (cfg) { 17 | let asty = new ASTY() 18 | const AST = (type, ref) => { 19 | let ast = asty.create(type) 20 | if (typeof ref === "object" && ref instanceof Array && ref.length > 0) 21 | ref = ref[0] 22 | if (typeof ref === "object" && asty.isA(ref)) 23 | ast.pos(ref.pos().line, ref.pos().column, ref.pos().offset) 24 | return ast 25 | } 26 | 27 | const ruleWS = 28 | AS.regex(/^[ \t\r\n]+/) 29 | const ruleCO = 30 | AS.regex(/^\/\/[^\r\n]*(?:\r?\n|$)/) 31 | const _ = 32 | AS.many(AS.choice([ ruleCO, ruleWS ])) 33 | const ruleString = AS.pipeParsers([ 34 | AS.sequenceOf([ AS.char("\""), AS.many(AS.choice([ AS.str("\\\""), AS.anythingExcept(AS.char("\"")) ])), AS.char("\"") ]), 35 | AS.mapTo((m) => { 36 | let str = m[1].join("") 37 | return str.replace(/\\"/g, "\"") 38 | }) 39 | ]) 40 | const ruleNumber = AS.pipeParsers([ 41 | AS.regex(/^[+-]?[0-9]+/), 42 | AS.mapTo((num) => { 43 | return parseInt(num, 10) 44 | }) 45 | ]) 46 | const ruleId = 47 | AS.regex(/^[a-zA-Z_][a-zA-Z0-9_]*/) 48 | const ruleSection = AS.pipeParsers([ 49 | AS.sequenceOf([ _, ruleId, _, AS.char("{"), _, AS.recursiveParser(() => ruleBlock), _, AS.char("}"), _ ]), 50 | AS.mapTo((m) => { 51 | let [ ns, block ] = [ m[1], m[5] ] 52 | return AST("Section").set({ ns: ns }).add(block) 53 | }) 54 | ]) 55 | const ruleProperty = AS.pipeParsers([ 56 | AS.sequenceOf([ _, ruleId, _, AS.char("="), _, AS.choice([ ruleNumber, ruleString ]), _ ]), 57 | AS.mapTo((m) => { 58 | let [ key, val ] = [ m[1], m[5] ] 59 | return AST("Property").set({ key: key, val: val }) 60 | }) 61 | ]) 62 | const ruleBlock = 63 | AS.many(AS.choice([ ruleProperty, ruleSection ])) 64 | const ruleCfg = AS.pipeParsers([ 65 | ruleBlock, 66 | AS.mapTo((block) => { 67 | return AST("Section").set({ ns: "" }).add(block) 68 | }) 69 | ]) 70 | 71 | let parser = AS.parse(ruleCfg) 72 | let ast = parser(cfg).value 73 | 74 | return ast 75 | } 76 | 77 | /* generate key/value format from Abstract Syntax Tree (AST) */ 78 | ast2kv (ast) { 79 | let kv = "" 80 | ast.query("// Property").forEach((p) => { 81 | let ns = p.query("..// Section").reverse().slice(1).map((n) => n.get("ns")) 82 | let key = [ ...ns, p.get("key") ].join(".") 83 | let val = p.get("val") 84 | kv += `${key} ${val}\n` 85 | }) 86 | return kv 87 | } 88 | } 89 | 90 | -------------------------------------------------------------------------------- /cfg2kv-4-pc-ast/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "cfg2kv-4-pc-ast", 3 | "description": "variant: Parser Combinators (PC), Abstract Syntax Tree (AST)", 4 | "repository": "rse/parsing-techniques", 5 | "license": "MIT", 6 | "dependencies": { 7 | "arcsecond": "1.1.1", 8 | "asty-astq": "1.12.0" 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /cfg2kv-5-peg-ast/README.md: -------------------------------------------------------------------------------- 1 | 2 | Variant: **Parsing Expression Grammar (PEG) Parser, Abstract Syntax Tree (AST)** 3 | 4 | Source: [`cfg2kv.js`](cfg2kv.js), [`cfg2kv.pegjs`](cfg2kv.pegjs), [`package.json`](package.json) 5 | 6 | The structure behind Recursive Descent Parsing is actually an LL(1) 7 | [Context-Free Grammar](http://en.wikipedia.org/wiki/Context-free_grammar). An efficient variant of [LL(1)](http://en.wikipedia.org/wiki/LL_grammar) are 8 | [Parsing Expression Grammars](http://en.wikipedia.org/wiki/Parsing_expression_grammar) 9 | (PEG) where the lexical scanning and syntax parsing can be smartly 10 | interweaved again and where full-blown parser generators exist (which 11 | in the background generate the necessary Recursive Descent code for 12 | us). Hence, we now switch over to a PEG, but still use the flexible AST 13 | approach. This uses [PEG.js](http://pegjs.org) and my [ASTy](https://github.com/rse/asty) 14 | and [ASTq](https://github.com/rse/astq) as external libraries. 15 | 16 | **RECOMMENDATION**: The preferred approach which should be used whenever possible, 17 | as it has to really best Pros/Cons ratio. 18 | 19 | Pros | Cons 20 | -----------------------------------|----------------------------------- 21 | very less code | external dependencies 22 | easier to understand | you have to write an LL(1) grammar 23 | very high-level | 24 | very flexible in output generation | 25 | automatic error generation | 26 | automatic line/column tracking | 27 | 28 | -------------------------------------------------------------------------------- /cfg2kv-5-peg-ast/cfg2kv.js: -------------------------------------------------------------------------------- 1 | 2 | import path from "path" 3 | import ASTY from "asty-astq" 4 | import PEG from "pegjs-otf" 5 | import PEGutil from "pegjs-util" 6 | 7 | module.exports = class CFG2KV { 8 | 9 | /* parse configuration format into key/value format */ 10 | cfg2kv (cfg) { 11 | let ast = this.cfg2ast(cfg) 12 | console.log(ast.dump()) 13 | let kv = this.ast2kv(ast) 14 | return kv 15 | } 16 | 17 | /* parse configuration format into Abstract Syntax Tree (AST) */ 18 | cfg2ast (cfg) { 19 | let asty = new ASTY() 20 | let parser = PEG.generateFromFile(path.join(__dirname, "cfg2kv.pegjs"), { optimize: "speed" }) 21 | let result = PEGutil.parse(parser, cfg, { 22 | makeAST: (line, column, offset, args) => 23 | asty.create.apply(asty, args).pos(line, column, offset) 24 | }) 25 | if (result.error !== null) { 26 | console.error("ERROR: Parsing Failure:\n" + 27 | PEGutil.errorMessage(result.error, true).replace(/^/mg, "ERROR: ")) 28 | process.exit(0) 29 | } 30 | return result.ast 31 | } 32 | 33 | /* generate key/value format from Abstract Syntax Tree (AST) */ 34 | ast2kv (ast) { 35 | let kv = "" 36 | ast.query("// Property").forEach((p) => { 37 | let ns = p.query("..// Section").reverse().slice(1).map((n) => n.get("ns")) 38 | let key = [ ...ns, p.get("key") ].join(".") 39 | let val = p.get("val") 40 | kv += `${key} ${val}\n` 41 | }) 42 | return kv 43 | } 44 | } 45 | 46 | -------------------------------------------------------------------------------- /cfg2kv-5-peg-ast/cfg2kv.pegjs: -------------------------------------------------------------------------------- 1 | 2 | { 3 | var ast = options.util.makeAST(location, options) 4 | } 5 | 6 | cfg 7 | = block:block { 8 | return ast("Section").set({ ns: "" }).add(block) 9 | } 10 | 11 | block 12 | = (property / section)* 13 | 14 | section 15 | = _ ns:id _ "{" _ block:block _ "}" _ { 16 | return ast("Section").set({ ns: ns }).add(block) 17 | } 18 | 19 | property 20 | = _ key:id _ "=" _ val:(number / string) _ { 21 | return ast("Property").set({ key: key, val: val }) 22 | } 23 | 24 | id "identifier" 25 | = $([a-zA-Z_][a-zA-Z0-9_]*) 26 | 27 | number "integer number" 28 | = num:$([+-]? [0-9]+) { 29 | return parseInt(num, 10) 30 | } 31 | 32 | string "quoted string" 33 | = "\"" str:$(("\\\"" / [^"])*) "\"" { 34 | return str.replace(/\\"/g, "\"") 35 | } 36 | 37 | _ "blank" 38 | = (co / ws)* 39 | 40 | co "comment" 41 | = "//" (![\r\n] .)* 42 | 43 | ws "whitespaces" 44 | = [ \t\r\n]+ 45 | 46 | -------------------------------------------------------------------------------- /cfg2kv-5-peg-ast/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "cfg2kv-5-peg-ast", 3 | "description": "variant: Parsing Expression Grammar (PEG) Parser, Abstract Syntax Tree (AST)", 4 | "repository": "rse/parsing-techniques", 5 | "license": "MIT", 6 | "dependencies": { 7 | "asty-astq": "1.12.0", 8 | "pegjs-util": "1.4.15", 9 | "pegjs-otf": "1.2.11", 10 | "pegjs": "0.10.0" 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /cfg2kv.js: -------------------------------------------------------------------------------- 1 | 2 | /* require some standard functionality */ 3 | import fs from "fs" 4 | import path from "path" 5 | 6 | /* read configuration file */ 7 | let cfg = fs.readFileSync("sample.cfg", "utf8") 8 | console.log(cfg) 9 | 10 | /* convert configuration to property format */ 11 | let CFG2KV = require(path.join(__dirname, `cfg2kv-${process.argv[2]}`, "cfg2kv")) 12 | let cfg2kv = new CFG2KV() 13 | let kv = cfg2kv.cfg2kv(cfg) 14 | 15 | /* write property file */ 16 | fs.writeFileSync("sample.kv", kv, "utf8") 17 | console.log(kv) 18 | 19 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "cfg2kv", 3 | "description": "top-level driver", 4 | "repository": "rse/parsing-techniques", 5 | "license": "MIT", 6 | "dependencies": { 7 | "@babel/node": "7.2.2", 8 | "@babel/preset-env": "7.2.3", 9 | "@babel/core": "7.2.2" 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /sample.cfg: -------------------------------------------------------------------------------- 1 | 2 | foo { 3 | baz = 7 // some comment 4 | bar { 5 | quux = 42 6 | hello = "{hello} = \"world\"!" 7 | } 8 | quux = 3 9 | } 10 | bar = 1 11 | quux = 2 12 | 13 | -------------------------------------------------------------------------------- /sample.kv: -------------------------------------------------------------------------------- 1 | foo.baz 7 2 | foo.bar.quux 42 3 | foo.bar.hello {hello} = "world"! 4 | foo.quux 3 5 | bar 1 6 | quux 2 7 | --------------------------------------------------------------------------------