├── .eslintrc.json
├── .github
    └── workflows
    │   └── ci.yml
├── .gitignore
├── .gitmodules
├── GNUmakefile
├── README.md
├── generator
    ├── .gitignore
    ├── dub.sdl
    ├── dub.selections.json
    └── source
    │   ├── ddoc.d
    │   ├── generator.d
    │   ├── grammar.d
    │   ├── parser.d
    │   └── writer.d
├── grammar.js
├── package-lock.json
├── package.json
├── src
    └── scanner.cc
└── test
    ├── corpus
        ├── 2_lex-13_floatliteral.txt
        ├── 2_lex-16_special_token_sequence.txt
        ├── 2_lex-1_source_text.txt
        ├── 2_lex-6_comment.txt
        ├── 2_lex-9_string_literals.txt
        └── 30_iasm-11_gcc.txt
    ├── parse-success-xfail.txt
    ├── parse-success
        ├── dmd
        │   ├── compilable
        │   └── runnable
        └── dmd_asm.d
    ├── repos
        └── README.md
    └── tmp
        └── .gitignore


/.eslintrc.json:
--------------------------------------------------------------------------------
1 | {
2 | 	"env": {
3 | 		"es2017": true
4 | 	}
5 | }
6 | 


--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: test
 2 | on: [ push, pull_request ]
 3 | jobs:
 4 |   test:
 5 |     runs-on: ubuntu-latest
 6 |     steps:
 7 |       - uses: actions/checkout@v2
 8 |         with:
 9 |           submodules: true
10 | 
11 |       - uses: actions/setup-node@v2
12 |         with:
13 |           node-version: 14
14 | 
15 |       - run: make test
16 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # npm install
 2 | /node_modules/
 3 | 
 4 | # tree-sitter generate
 5 | !/src/
 6 | /src/*
 7 | !/src/scanner.cc
 8 | /bindings/
 9 | /Cargo.toml
10 | binding.gyp
11 | 
12 | # tree-sitter build-wasm
13 | /tree-sitter-d.wasm
14 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "dlang.org"]
2 | 	path = generator/dlang.org
3 | 	url = https://github.com/CyberShadow/d-programming-language.org
4 | [submodule "test/repos/dmd"]
5 | 	path = test/repos/dmd
6 | 	url = https://github.com/dlang/dmd.git
7 | 


--------------------------------------------------------------------------------
/GNUmakefile:
--------------------------------------------------------------------------------
 1 | # Configuration
 2 | 
 3 | TREE_SITTER=node_modules/.bin/tree-sitter
 4 | DOCKER_FLAG=--docker
 5 | SO_SUFFIX=.so
 6 | 
 7 | # Constants
 8 | 
 9 | PARSER=src/parser.c
10 | XDG_CACHE_HOME=$(HOME)/.cache
11 | SO=$(XDG_CACHE_HOME)/tree-sitter/lib/d$(SO_SUFFIX)
12 | WASM=tree-sitter-d.wasm
13 | TEST_TS_FILES=$(shell find test/corpus -type f)
14 | TEST_TS_OK=test/tmp/tree-sitter-test.ok
15 | TEST_PARSE_SUCCESS_OK=test/tmp/parse-success.ok
16 | TEST_PARSE_SUCCESS_XFAIL_OK=$(addsuffix .ok,$(subst test/parse-success-xfail/,test/tmp/parse-success-xfail/,$(shell find test/parse-success-xfail -type f)))
17 | 
18 | # Entry points
19 | 
20 | all : compile
21 | parser : $(PARSER)
22 | compile : $(SO)
23 | wasm : $(WASM)
24 | 
25 | test : test-ts test-parse-success
26 | test-ts : $(TEST_TS_OK)
27 | test-parse-success : $(TEST_PARSE_SUCCESS_OK)
28 | 
29 | # Implementation
30 | 
31 | .PHONY : all parser compile wasm test test-ts test-parse-success test-parse-success-xfail web-ui
32 | 
33 | # The default is to use the tree-sitter version which would be
34 | # installed by npm (according to package.json / package-lock.json).
35 | # If it hasn't been installed yet, do so automatically.
36 | node_modules/.bin/tree-sitter :
37 | 	npm install
38 | 
39 | # Build the grammar (grammar.json, parser.c etc.)
40 | $(PARSER) : grammar.js src/scanner.cc $(TREE_SITTER)
41 | 	$(TREE_SITTER) generate
42 | 
43 | # Build a shared object binary from the parser
44 | # This file mainly exists to avoid race conditions / duplicate work
45 | # when running the test targets in parallel.
46 | $(SO) : $(PARSER)
47 | 	@# No explicit "compile" command, so just parse an empty file
48 | 	$(TREE_SITTER) parse -q /dev/null
49 | 
50 | # Build a WASM binary from the parser
51 | # The default is to use Docker, which will ensure that the correct version is used
52 | # (https://github.com/tree-sitter/tree-sitter/pull/1180).
53 | # Run with DOCKER_FLAG= to use the host Emscripten version.
54 | $(WASM) : $(PARSER)
55 | 	$(TREE_SITTER) build-wasm $(DOCKER_FLAG)
56 | 
57 | # Launch web-ui
58 | web-ui : $(WASM)
59 | 	$(TREE_SITTER) web-ui
60 | 
61 | # tree-sitter test suite
62 | $(TEST_TS_OK) : $(TEST_TS_FILES) $(SO)
63 | 	$(TREE_SITTER) test
64 | 	@touch $@
65 | 
66 | # parse-success: parse every *.d / *.di file under test/parse-success; the first field of each line tree-sitter prints must match the sorted, comment-stripped expected list.
67 | 
68 | PARSE_SUCCESS_RESULTS=test/tmp/parse-success-results.txt
69 | PARSE_SUCCESS_XFAIL_IN=test/parse-success-xfail.txt
70 | PARSE_SUCCESS_XFAIL=test/tmp/parse-success-xfail.txt
71 | # The -name tests are parenthesized so that -type f applies to both extensions (-a binds tighter than -o).
72 | $(PARSE_SUCCESS_RESULTS) : $(SO)
73 | 	rm -f $@
74 | 	find -L test/parse-success -type f \( -name '*.d' -o -name '*.di' \) | sort | $(TREE_SITTER) parse -q --paths /dev/stdin | awk '{print $$1}' > $@
75 | 
76 | $(PARSE_SUCCESS_XFAIL) : $(PARSE_SUCCESS_XFAIL_IN)
77 | 	grep '^[^#]' $< | sort > $@
78 | 
79 | $(TEST_PARSE_SUCCESS_OK) : $(PARSE_SUCCESS_RESULTS) $(PARSE_SUCCESS_XFAIL)
80 | 	diff -u $+
81 | 	@touch $@
82 | 
83 | # parse-success-xfail: the .ok stamp is written only when tree-sitter exits non-zero for the input; a successful parse fails the rule.
84 | test/tmp/parse-success-xfail/%.ok : test/parse-success-xfail/% $(SO)
85 | 	if $(TREE_SITTER) parse -q $< ; then exit 1 ; fi
86 | 	@mkdir -p "$$(dirname $@)"
87 | 	@touch $@
88 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | tree-sitter-d
 2 | =============
 3 | 
 4 | This repository hosts a [tree-sitter](https://tree-sitter.github.io/) grammar for the [D programming language](https://dlang.org/).
 5 | 
 6 | About
 7 | -----
 8 | 
 9 | The process of generating the grammar consists of a number of steps. The following lists the full process that the grammar goes through.
10 | 
11 | 1. The origin of the grammar described here is [the official specification of the D programming language](https://dlang.org/spec/spec.html).
12 | 
13 |    Though it can be perused online, we use the source code, which is written in [DDoc](https://dlang.org/spec/ddoc.html) (the D documentation macro processor)
14 |    and is maintained in the [dlang/dlang.org GitHub repository](https://github.com/dlang/dlang.org/tree/master/spec).
15 | 
16 |    The `generator/dlang.org` submodule points to the copy that is used by this repository, which may contain some fixes
17 |    (whether to make it more machine-readable or to more accurately describe the language) which have not been upstreamed yet.
18 | 
19 | 2. The grammar is then consumed by a custom program which attempts to automatically convert it as much as feasible into a tree-sitter grammar.
20 |    This program and its output are located in [the `generated` branch](https://github.com/CyberShadow/tree-sitter-d/tree/generated).
21 | 
22 |    The first step of processing the grammar is to parse it.
23 |    Thus, the grammar specification above is parsed into a DOM representing the document structure, with one node per DDoc macro.
24 | 
25 |    Though the canonical way to consume DDoc documents is to specify a file with custom macro definitions and to run DMD's DDoc macro processor using it,
26 |    the approach used here was to implement a [simple DDoc parser](https://github.com/CyberShadow/tree-sitter-d/blob/master/generator/source/ddoc.d) instead
27 |    (which also helped validate our assumptions about DDoc syntax).
28 | 
29 | 3. The DDoc DOM is then converted to the initial grammar definition, which roughly corresponds to tree-sitter grammar structure.
30 |    The conversion is done in the [parser](https://github.com/CyberShadow/tree-sitter-d/blob/master/generator/source/parser.d) module.
31 | 
32 | 4. After conversion, the grammar passes through a few preprocessing steps.
33 |    These mold the grammar into a shape which is more suitable for typical tree-sitter applications.
34 | 
35 |    Two main preprocessing steps are:
36 | 
37 |    - De-recursion, which converts definitions for lists of things from a recursive definition to one using explicit repetition.  
38 |      (Example: [`ImportList`](https://dlang.org/spec/module.html#ImportList))
39 | 
40 |    - Body extraction, which splits some definitions into two, in which one is the definition "body" containing the operation actually described by the definition's name,
41 |      and the other is a hidden rule which resolves either to the body or to the next operation with higher precedence.  
42 |      (Example: [`OrOrExpression`](https://dlang.org/spec/expression.html#OrOrExpression))
43 | 
44 |    The grammar is then optimized to reduce redundancies manifested during preprocessing.
45 | 
46 | 5. The grammar is now ready to be saved to `grammar.js`, the tree-sitter definition of the grammar.
47 | 
48 |    The latest version of this generated file can be found [in the root of the `generated` branch](https://github.com/CyberShadow/tree-sitter-d/blob/generated/grammar.js).
49 | 
50 | 6. The generated file is not quite ready to be used, and requires some manual fixups.
51 | 
52 |    For this purpose, the `master` branch holds these fixes on top of the `generated` branch (which is merged into `master` regularly).
53 | 
54 |    You can see all manual fixes by [comparing the two branches](https://github.com/CyberShadow/tree-sitter-d/compare/generated..master#diff-919ac210accac9ecc55a76d10a7590e3d85ca3f0e165b52d30f08faee486d0cb).
55 | 
56 |    The `master` branch also hosts the test suite, as well as the [custom scanner](https://github.com/CyberShadow/tree-sitter-d/blob/master/src/scanner.cc),
57 |    which implements D-specific syntax which cannot be described using the declarative tree-sitter grammar, such as nested comments or delimited string literals.
58 | 
59 | 7. From this point, `grammar.js` is ready to be passed on to tree-sitter's build process, so the steps below simply describe how any tree-sitter grammar is compiled.
60 | 
61 |    `tree-sitter-cli` is used to generate the parser C source code from `grammar.js`. If installed via `npm` (i.e. `npm install`), this can be done by running:
62 | 
63 |    ```
64 |    ./node_modules/.bin/tree-sitter generate
65 |    ```
66 | 
67 |    This will populate the `src` directory, as well as create [additional build files](https://github.com/cybershadow/tree-sitter-d/blob/master/.gitignore#L4-L9).
68 | 
69 | 8. Finally, the C source code is compiled into a loadable shared library, which can be directly used by a tree-sitter-enabled application.
70 | 
71 |    This step happens automatically when running `tree-sitter test`.
72 |    Alternatively, invoking `tree-sitter build-wasm` builds a WebAssembly module instead of a native shared object.
73 | 
74 | Contributing
75 | ------------
76 | 
77 | If you would like to help, please have a look at the [list of open issues](https://github.com/CyberShadow/tree-sitter-d/issues).
78 | 
79 | If you spot an error in the grammar or the way it behaves and would like to fix it, the first step would be to identify the correct place to perform the fix.
80 | 
81 | - If the problem is due to an incorrect grammar definition, and the error is also present in [the official specification](https://dlang.org/spec/spec.html),
82 |   then please fix and send a pull request there.
83 | 
84 | - Otherwise, if you believe that the problem is due to a translation error between the official grammar and the generated `grammar.js` file,
85 |   then it may be due to a bug in [the generator program](https://github.com/CyberShadow/tree-sitter-d/tree/generated/generator).
86 | 
87 | - Finally, if the problem is tree-sitter specific or cannot be fixed through the above avenues,
88 |   then the fix should be applied to [`grammar.js` on the master branch](https://github.com/CyberShadow/tree-sitter-d/blob/master/grammar.js).
89 | 
90 | If you are having trouble with anything, please don't hesitate to [open an issue](https://github.com/CyberShadow/tree-sitter-d/issues/new).
91 | 


--------------------------------------------------------------------------------
/generator/.gitignore:
--------------------------------------------------------------------------------
1 | # Dub
2 | /.dub
3 | /generator
4 | 
5 | # rdmd / dmd -i / rund
6 | /source/generator
7 | 


--------------------------------------------------------------------------------
/generator/dub.sdl:
--------------------------------------------------------------------------------
1 | name "generator"
2 | targetType "executable"
3 | dependency "ae" version="==0.0.3058"
4 | 


--------------------------------------------------------------------------------
/generator/dub.selections.json:
--------------------------------------------------------------------------------
1 | {
2 | 	"fileVersion": 1,
3 | 	"versions": {
4 | 		"ae": "0.0.3058"
5 | 	}
6 | }
7 | 


--------------------------------------------------------------------------------
/generator/source/ddoc.d:
--------------------------------------------------------------------------------
  1 | module ddoc;
  2 | 
  3 | import std.algorithm.comparison;
  4 | import std.algorithm.searching;
  5 | import std.ascii;
  6 | import std.exception;
  7 | import std.format;
  8 | import std.functional;
  9 | import std.string;
 10 | 
 11 | import ae.utils.array;
 12 | import ae.utils.meta;
 13 | 
 14 | /// A DDoc DOM node: a tagged union whose active member is selected by `type`.
 15 | struct Node
 16 | {
 17 | 	/// Node type
 18 | 	enum Type
 19 | 	{
 20 | 		text, /// Verbatim inline text
 21 | 		call, /// Macro call
 22 | 		parameter, /// Placeholder for parameter in macro definition
 23 | 	}
 24 | 	Type type; /// ditto
 25 | 
 26 | 	union
 27 | 	{
 28 | 		/// When type == Type.text
 29 | 		string text;
 30 | 
 31 | 		/// When type == Type.call
 32 | 		struct Call
 33 | 		{
 34 | 			string macroName; /// The macro being called
 35 | 			const(Node)[] contents; /// The arguments (comma-separated).
 36 | 
 37 | 			/// Split `contents` into individual arguments at each ',' found in a text node of `contents`.
 38 | 			DDoc[] splitArguments() const
 39 | 			{
 40 | 				auto arguments = contents.split(',');
 41 | 				// Remove the optional space after each , (i.e. at the start of every argument except the first)
 42 | 				foreach (ref ddoc; arguments[1 .. $])
 43 | 					if (ddoc.length && ddoc[0].type == Node.Type.text)
 44 | 						ddoc[0].text.skipOver(" ");
 45 | 				return arguments;
 46 | 			}
 47 | 
 48 | 			/// Expand this macro call using the given definition: `$1`-`$9` become the matching argument, `$0` the whole `contents`; other parameters (e.g. `$+`) throw.
 49 | 			DDoc expand(const(Node)[] definition) const
 50 | 			{
 51 | 				auto arguments = splitArguments();
 52 | 				DDoc visit(const(Node)[] def) // Recursively copies `def`, substituting parameter nodes
 53 | 				{
 54 | 					DDoc result;
 55 | 					foreach (defNode; def)
 56 | 						final switch (defNode.type)
 57 | 						{
 58 | 							case Type.text:
 59 | 								result ~= defNode;
 60 | 								break;
 61 | 							case Type.call:
 62 | 							{
 63 | 								Node node = defNode;
 64 | 								node.call.contents = visit(defNode.call.contents); // Parameters may appear inside nested calls
 65 | 								result ~= node;
 66 | 								break;
 67 | 							}
 68 | 							case Type.parameter:
 69 | 								switch (defNode.parameter)
 70 | 								{
 71 | 									case '1':
 72 | 										..
 73 | 									case '9':
 74 | 										result ~= arguments.get(defNode.parameter - '1'); // '1' maps to index 0
 75 | 										break;
 76 | 									case '0':
 77 | 										result ~= contents;
 78 | 										break;
 79 | 									default:
 80 | 										throw new Exception("Don't understand macro parameter $" ~ defNode.parameter);
 81 | 								}
 82 | 								break;
 83 | 						}
 84 | 					return result;
 85 | 				}
 86 | 				return visit(definition);
 87 | 			}
 88 | 		}
 89 | 		Call call; /// ditto
 90 | 
 91 | 		/// When type == Type.parameter
 92 | 		char parameter;
 93 | 	}
 94 | 
 95 | 	/// Helper getters
 96 | 	bool isText  (string text     ) const { return type == Node.Type.text && this.text           == text     ; }
 97 | 	bool isCallTo(string macroName) const { return type == Node.Type.call && this.call.macroName == macroName; } /// ditto
 98 | 
 99 | 	string getSingleTextChild() const
100 | 	{
101 | 		enforce(
102 | 			type == Type.call &&
103 | 			call.contents.length == 1 &&
104 | 			call.contents[0].type == .Node.Type.text,
105 | 			"Macro does not have a single text child"
106 | 		);
107 | 		return call.contents[0].text;
108 | 	} /// Returns the text of this call's only child; throws unless this is a call with exactly one text child.
109 | 
110 | 	bool isCallToEmpty(string macroName) const { return isCallTo(macroName) && !call.contents.length; } /// True for a call to `macroName` that has no contents.
111 | 
112 |     void toString(scope void delegate(const(char)[]) sink) const
113 | 	{
114 | 		final switch (type)
115 | 		{
116 | 			case Type.text: sink.formattedWrite!"Node(%s, %(%s%))"(type, text.toArray); return;
117 | 			case Type.call: sink.formattedWrite!"%s"(call); return;
118 | 			case Type.parameter: sink.formattedWrite!"%s"(parameter); return;
119 | 		}
120 | 	} /// Writes a textual representation of the active member to `sink`.
121 | }
122 | 
123 | /// A DDoc span is a list of root nodes.
124 | alias DDoc = Node[];
125 | 
126 | private bool isMacroNameChar(char c) { return isAlphaNum(c) || c == '_'; }
127 | // Parse DDoc text from the front of `s` (consuming it) into nodes; a nested (non-topLevel) fragment ends at its unmatched ')'.
128 | private DDoc parseDDocFragment(ref string s, bool topLevel)
129 | {
130 | 	DDoc ddoc;
131 | 	size_t parenDepth; // Number of plain (non-macro) '(' currently open in this fragment
132 | 	bool verbatim; // True while inside a dash-delimited code block, where '$', '(' and ')' are plain text
133 | 	scope (success) enforce(!verbatim, "Unclosed code block");
134 | 
135 | 	while (true)
136 | 	{
137 | 		if (!s.length)
138 | 		{
139 | 			enforce(topLevel, "Unexpected end of file"); // Only the top level may end without a closing ')'
140 | 			return ddoc;
141 | 		}
142 | 
143 | 		switch (s[0])
144 | 		{
145 | 			case '\n':
146 | 				if (s[1 .. $].findSplit("\n")[0].strip.I!(line => line.length >= 3 && line.representation.all!(c => c == '-')))
147 | 					verbatim = !verbatim; // The following line is three or more '-' (ignoring surrounding whitespace)
148 | 				goto default;
149 | 
150 | 			case '$':
151 | 			{
152 | 				if (verbatim) goto default;
153 | 				Node node;
154 | 				if (s.length > 1 && s[1].among('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '+'))
155 | 				{
156 | 					node.type = Node.Type.parameter;
157 | 					node.parameter = s[1];
158 | 					s = s[2 .. $];
159 | 				}
160 | 				else
161 | 				{
162 | 					// enforce(s[1 .. $].startsWith("("), "Expected macro call after $");
163 | 					if (!s[1 .. $].startsWith("(")) goto default; // A '$' not followed by '(' is kept as plain text
164 | 					node.type = Node.Type.call;
165 | 					s = s[2 .. $];
166 | 					auto end = s.representation.countUntil!(not!isMacroNameChar);
167 | 					enforce(end > 0, "Expected macro name after $(");
168 | 					node.call.macroName = s[0 .. end];
169 | 					if (s[end] == ' ') end++; // Skip the space after the macro name
170 | 					s = s[end .. $];
171 | 					node.call.contents = parseDDocFragment(s, false); // Consumes up to and including the call's ')'
172 | 				}
173 | 				ddoc ~= node;
174 | 				break;
175 | 			}
176 | 
177 | 			case '(':
178 | 				if (verbatim) goto default;
179 | 				parenDepth++;
180 | 				goto default;
181 | 
182 | 			case ')':
183 | 				if (verbatim) goto default;
184 | 				if (parenDepth)
185 | 				{
186 | 					parenDepth--; // Closes a plain '(' opened in this fragment; keep it as text
187 | 					goto default;
188 | 				}
189 | 				enforce(!topLevel, "Mismatched )");
190 | 				s = s[1 .. $]; // Consume the ')' that terminates this nested fragment
191 | 				return ddoc;
192 | 
193 | 			default:
194 | 				if (!ddoc.length || ddoc[$-1].type != Node.Type.text) // Append to the trailing text node, creating one if needed
195 | 					ddoc ~= Node(Node.Type.text);
196 | 				ddoc[$-1].text ~= s[0];
197 | 				s = s[1 .. $];
198 | 				break;
199 | 		}
200 | 	}
201 | 
202 | 	assert(false);
203 | }
204 | 
205 | /// A DDoc document.
206 | struct Document
207 | {
208 | 	DDoc contents; /// Document body.
209 | 	DDoc[string] macros; /// Local macro definitions.
210 | }
211 | 
212 | /// Parse into DOM: the text before the first "\nMacros:\n" becomes `contents`, the text after it is handed to `parseMacros`.
213 | Document parseDDoc(string s)
214 | {
215 | 	auto os = s;
216 | 	scope(failure)
217 | 	{
218 | 		import std.stdio : stderr;
219 | 		stderr.writefln("Error at line %d:",
220 | 			1 + os[0 .. $ - s.length].representation.count('\n'), // Newlines in the part of `os` already consumed from `s`
221 | 		);
222 | 	}
223 | 	auto parts = s.findSplit("\nMacros:\n");
224 | 	os = s = parts[0]; // From here on, `s` and `os` cover the document body only
225 | 	Document document;
226 | 	document.contents = parseDDocFragment(s, true);
227 | 	document.macros = parseMacros(parts[2]);
228 | 	return document;
229 | }
230 | 
231 | /// Parse the "Macros:" section of a .dd file, or a .ddoc file. Returns the parsed body of each macro, keyed by macro name.
232 | DDoc[string] parseMacros(string s)
233 | {
234 | 	DDoc[string] macros;
235 | 
236 | 	size_t contentsStartPos, contentsEndPos, nameStartPos; // Offsets into `s`: start of the current body, last newline seen, start of the current line
237 | 	bool maybeInName = true; // True while the current line could still turn out to start with `NAME =`
238 | 	string currentName;
239 | 
240 | 	void flush(size_t endPos) // Store s[contentsStartPos .. endPos] as the body of `currentName`, if a name is pending
241 | 	{
242 | 		auto contents = s[contentsStartPos .. endPos];
243 | 		if (!currentName)
244 | 		{
245 | 			enforce(contents.strip.length == 0, "Macro body without name");
246 | 			return;
247 | 		}
248 | 		macros[currentName] = parseDDocFragment(contents, true);
249 | 		currentName = null;
250 | 	}
251 | 
252 | 	size_t i;
253 | 	scope(failure)
254 | 	{
255 | 		import std.stdio : stderr;
256 | 		stderr.writefln("Error at line %d:",
257 | 			1 + s[0 .. i].representation.count('\n'),
258 | 		);
259 | 	}
260 | 
261 | 	for (; i < s.length; i++)
262 | 	{
263 | 		auto c = s[i];
264 | 		switch (c)
265 | 		{
266 | 			case '=':
267 | 				if (!maybeInName)
268 | 					goto default;
269 | 				maybeInName = false;
270 | 				flush(contentsEndPos); // The previous macro's body ends at the last newline before this line
271 | 				currentName = s[nameStartPos .. i].strip;
272 | 				contentsStartPos = i + 1;
273 | 				break;
274 | 			case '\n':
275 | 				contentsEndPos = i;
276 | 				nameStartPos = i + 1;
277 | 				maybeInName = true;
278 | 				break;
279 | 			default:
280 | 				if (!isWhite(c) && !isMacroNameChar(c)) // Any other character means this line is not a `NAME =` line
281 | 					maybeInName = false;
282 | 				break;
283 | 		}
284 | 	}
285 | 	flush(s.length); // The last macro's body runs to the end of the input
286 | 
287 | 	return macros;
288 | }
289 | 
290 | /// Split `contents` by `delim`, like `std.string.split`. Only characters of the text nodes directly in `contents` are examined, so delimiters inside macro calls do not split.
291 | DDoc[] split(const DDoc contents, char delim)
292 | {
293 | 	DDoc slice(size_t startNodeIndex, size_t startOffset, size_t endNodeIndex, size_t endOffset) // Copy from (start node, offset) up to, but excluding, (end node, offset)
294 | 	{
295 | 		DDoc result;
296 | 		foreach (nodeIndex; startNodeIndex .. endNodeIndex + (endOffset > 0)) // The end node is included only when part of it is wanted
297 | 		{
298 | 			Node node = contents[nodeIndex];
299 | 			if (nodeIndex == endNodeIndex && endOffset > 0)
300 | 			{
301 | 				assert(node.type == Node.Type.text);
302 | 				node.text = node.text[0 .. endOffset];
303 | 			}
304 | 			if (nodeIndex == startNodeIndex && startOffset > 0) // Applied after the end cut, so startOffset is still valid when both fall in the same node
305 | 			{
306 | 				assert(node.type == Node.Type.text);
307 | 				node.text = node.text[startOffset .. $];
308 | 			}
309 | 			result ~= node;
310 | 		}
311 | 		return result;
312 | 	}
313 | 
314 | 	DDoc[] result;
315 | 	size_t startNodeIndex = 0, startOffset = 0; // Where the piece currently being collected begins
316 | 	foreach (nodeIndex, ref node; contents)
317 | 		if (node.type == Node.Type.text)
318 | 			foreach (offset; 0 .. node.text.length)
319 | 				if (node.text[offset] == delim)
320 | 				{
321 | 					result ~= slice(startNodeIndex, startOffset, nodeIndex, offset);
322 | 
323 | 					startNodeIndex = nodeIndex;
324 | 					startOffset = offset;
325 | 					startOffset++; // The next piece starts just after the delimiter
326 | 					if (startOffset == node.text.length) // Delimiter was the node's last character: continue from the next node
327 | 					{
328 | 						startNodeIndex++;
329 | 						startOffset = 0;
330 | 					}
331 | 				}
332 | 	result ~= slice(startNodeIndex, startOffset, contents.length, 0); // Everything after the last delimiter (possibly empty)
333 | 	return result;
334 | }
335 | 
336 | /// Remove whitespace from around `d`, like `std.string.strip`. Only leading and trailing text nodes are trimmed; those left empty are dropped.
337 | inout(DDoc) strip(/*DDoc*/inout(Node)[] d)
338 | {
339 | 	while (d.length && d[0].type == Node.Type.text)
340 | 	{
341 | 		auto s = d[0].text.stripLeft();
342 | 		if (!s.length)
343 | 			d = d[1 .. $]; // Whitespace-only node: drop it and look at the next one
344 | 		else
345 | 		{
346 | 			d = Node(Node.Type.text, s) ~ d[1 .. $]; // Concatenation builds a new array with the trimmed node in front
347 | 			break;
348 | 		}
349 | 	}
350 | 	while (d.length && d[$-1].type == Node.Type.text)
351 | 	{
352 | 		auto s = d[$-1].text.stripRight();
353 | 		if (!s.length)
354 | 			d = d[0 .. $-1]; // Whitespace-only node: drop it and look at the previous one
355 | 		else
356 | 		{
357 | 			d = d[0 .. $-1] ~ Node(Node.Type.text, s);
358 | 			break;
359 | 		}
360 | 	}
361 | 	return d;
362 | }
363 | 
364 | 
365 | /// Converts to a string by replacing basic macros with their characters. Text nodes have `\\` replaced by `\`; any node not handled below throws.
366 | string toString(in Node[] d)
367 | {
368 | 	string s;
369 | 	foreach (ref node; d)
370 | 	{
371 | 		if (node.type == Node.Type.text)
372 | 			s ~= node.text.replace(`\\`, `\`);
373 | 		else
374 | 		if (node.isCallToEmpty("AMP"))
375 | 			s ~= "&";
376 | 		else
377 | 		if (node.isCallToEmpty("LT"))
378 | 			s ~= "<";
379 | 		else
380 | 		if (node.isCallToEmpty("GT"))
381 | 			s ~= ">";
382 | 		else
383 | 		if (node.isCallToEmpty("LPAREN"))
384 | 			s ~= "(";
385 | 		else
386 | 		if (node.isCallTo("RPAREN")) // NOTE(review): unlike the cases above this also matches calls with contents, which are then discarded - confirm intent
387 | 			s ~= ")";
388 | 		else
389 | 		if (node.isCallTo("BACKTICK")) // NOTE(review): same as RPAREN - any contents are ignored
390 | 			s ~= "`";
391 | 		else
392 | 			throw new Exception("Can't stringify: %s".format(node));
393 | 	}
394 | 	return s;
395 | }
396 | 
397 | private alias strip = std.string.strip;
398 | 


--------------------------------------------------------------------------------
/generator/source/generator.d:
--------------------------------------------------------------------------------
  1 | import std.algorithm.comparison;
  2 | import std.algorithm.iteration;
  3 | import std.algorithm.searching;
  4 | import std.array;
  5 | import std.exception;
  6 | import std.file;
  7 | import std.path;
  8 | import std.stdio;
  9 | import std.string;
 10 | 
 11 | import ae.utils.aa;
 12 | import ae.utils.array;
 13 | import ae.utils.funopt;
 14 | import ae.utils.main;
 15 | 
 16 | import ddoc;
 17 | import grammar;
 18 | import parser;
 19 | import writer;
 20 | 
 21 | enum dlangOrgPath = "dlang.org";
 22 | 
 23 | static immutable string[] extras = [
 24 | 	"WhiteSpace",
 25 | 	"EndOfLine",
 26 | 	"Comment",
 27 | 	"SpecialTokenSequence",
 28 | ];
 29 | 
 30 | /// Entry point: reads the spec table of contents, parses the GRAMMAR / GRAMMAR_LEX blocks of the listed files, and writes `../grammar.js`.
 31 | void program()
 32 | {
 33 | 	if (!exists(dlangOrgPath) && exists("../" ~ dlangOrgPath)) // Allow running from one directory below the one containing dlang.org
 34 | 		chdir("..");
 35 | 
 36 | 	auto globalMacros = [
 37 | 		"dlang.org.ddoc",
 38 | 	]
 39 | 		.map!(fn => dlangOrgPath.buildPath(fn))
 40 | 		.map!readText
 41 | 		.map!parseMacros
 42 | 		.array;
 43 | 
 44 | 	string[] files;
 45 | 	void scanTOC(const DDoc ddoc) // Find the ITEMIZE list of $(A name.html, ...) links and collect the file names without ".html"
 46 | 	{
 47 | 		foreach (ref node; ddoc)
 48 | 			if (node.type == Node.Type.call)
 49 | 			{
 50 | 				if (node.call.macroName == "ITEMIZE")
 51 | 					files = node.call.splitArguments()
 52 | 						.map!((DDoc argument) {
 53 | 							argument = argument.strip();
 54 | 							enforce(argument.length == 1);
 55 | 							enforce(argument[0].type == Node.Type.call);
 56 | 							enforce(argument[0].call.macroName == "A");
 57 | 							auto href = argument[0].call.splitArguments()[0].strip;
 58 | 							enforce(href.length == 1);
 59 | 							enforce(href[0].type == Node.Type.text);
 60 | 							enforce(href[0].text.endsWith(".html"));
 61 | 							return href[0].text[0 .. $ - 5];
 62 | 						})
 63 | 						.filter!(name => !name.among("abi")) // Skip mangling definition
 64 | 						.array;
 65 | 				else
 66 | 					scanTOC(node.call.contents);
 67 | 			}
 68 | 	}
 69 | 	scanTOC(dlangOrgPath.buildPath("spec", "spec.dd").readText.parseDDoc.contents);
 70 | 	enforce(files.length, "Failed to parse the table of contents (spec/spec.dd)");
 71 | 
 72 | 	Grammar grammar;
 73 | 	string[][][string] order; // Per file: one list of definition names for each grammar block, in source order
 74 | 
 75 | 	foreach (file; files)
 76 | 	{
 77 | 		scope(failure) stderr.writeln("Error in file " ~ file ~ ":");
 78 | 		auto source = dlangOrgPath.buildPath("spec", file ~ ".dd").readText;
 79 | 		auto ddoc = source.parseDDoc;
 80 | 
 81 | 		if (source.indexOf(`$(GRAMMAR`) < 0)
 82 | 			continue;
 83 | 
 84 | 		void scan(ref const Node node) // Depth-first search for GRAMMAR / GRAMMAR_LEX calls
 85 | 		{
 86 | 			if (node.type != Node.Type.call)
 87 | 				return;
 88 | 			scope(failure) stderr.writefln("Error on line %d:",
 89 | 				1 + source[0 .. source.sliceIndex(node.call.macroName)].representation.count('\n'));
 90 | 
 91 | 			if (node.call.macroName == "GRAMMAR" || node.call.macroName == "GRAMMAR_LEX")
 92 | 			{
 93 | 				enforce(node.call.contents.length &&
 94 | 					node.call.contents[$-1].type == Node.Type.text &&
 95 | 					node.call.contents[$-1].isText("\n"),
 96 | 					"Unexpected text at the end of GRAMMAR node"
 97 | 				);
 98 | 				auto macros = (globalMacros ~ ddoc.macros).fold!merge((DDoc[string]).init);
 99 | 				auto kind = node.call.macroName == "GRAMMAR" ? Grammar.Def.Kind.tokens : Grammar.Def.Kind.chars;
100 | 				auto newDefs = grammar.parse(node.call.contents, file, macros, kind);
101 | 				order[file] ~= newDefs;
102 | 			}
103 | 			else
104 | 				foreach (ref childNode; node.call.contents)
105 | 					scan(childNode);
106 | 		}
107 | 		foreach (ref node; ddoc.contents)
108 | 			scan(node);
109 | 	}
110 | 
111 | 	grammar.defs["AsmStatement"].node = choice([ // Also accept GccAsmStatement wherever AsmStatement is accepted
112 | 		grammar.defs["AsmStatement"].node,
113 | 		reference("GccAsmStatement"),
114 | 	]);
115 | 
116 | 	grammar.analyze(["SourceFile"] ~ extras);
117 | 
118 | 	foreach (defName; ["WhiteSpace", "EndOfLine"])
119 | 		grammar.defs[defName].hidden = true;
120 | 
121 | 	auto writer = Writer("../grammar.js", grammar, extras);
122 | 
123 | 	foreach (file; files)
124 | 	{
125 | 		writer.startFile(file);
126 | 		foreach (section; order.get(file, null))
127 | 		{
128 | 			writer.startSection();
129 | 			foreach (def; section)
130 | 				writer.writeRule(def);
131 | 		}
132 | 	}
133 | 
134 | 	writer.close();
135 | }
136 | 
137 | mixin main!(funopt!program);
138 | 


--------------------------------------------------------------------------------
/generator/source/grammar.d:
--------------------------------------------------------------------------------
   1 | import std.algorithm.comparison;
   2 | import std.algorithm.iteration;
   3 | import std.algorithm.searching;
   4 | import std.algorithm.sorting;
   5 | import std.array;
   6 | import std.exception;
   7 | import std.format;
   8 | import std.functional;
   9 | import std.range;
  10 | import std.sumtype;
  11 | 
  12 | import ae.utils.aa;
  13 | import ae.utils.meta;
  14 | import ae.utils.text;
  15 | 
  16 | import ddoc;
  17 | 
  18 | static this()
  19 | {
  20 | 	if (false)
  21 | 	{
  22 | 		// Never executed (guarded by `if (false)`); presumably the comparison only needs to be compiled.
  23 | 		// Avoid https://issues.dlang.org/show_bug.cgi?id=22010 (or some similar bug)
  24 | 		Grammar.Node node;
  25 | 		auto b = node == node;
  26 | 	}
  27 | }
  28 | 
  29 | struct Grammar
  30 | {
  31 | 	struct RegExp { string regexp; } /// Regular expression, generally with the intent to describe some character set.
  32 | 	struct LiteralChars { string chars; } /// Describes contiguous characters (e.g. number syntax)
  33 | 	struct LiteralToken { string literal; } /// May be surrounded by whitespace/comments
  34 | 	struct Reference { string name; } /// Reference to another definition.
  35 | 	struct Choice { Node[] nodes; } /// Choice of multiple possible nodes.
  36 | 	struct Seq { Node[] nodes; } /// Consecutive sequence of nodes.
  37 | 	// https://issues.dlang.org/show_bug.cgi?id=22010
  38 | 	private mixin template OneNode() { Node[/*1*/] nodes; ref Node node() { assert(nodes.length == 1); return nodes[0]; } }
  39 | 	struct Repeat { mixin OneNode; } /// Zero-or-more occurrences of the given node.
  40 | 	struct Repeat1 { mixin OneNode; } /// One-or-more occurrences of the given node.
  41 | 	struct Optional { mixin OneNode; } /// Zero-or-one occurrences of the given node.
  42 | 	struct SeqChoice { Node[][] nodes; } /// Internal node, superset of Choice, Seq and Optional. `nodes` is a list of choices of sequences.
  43 | 
  44 | 	// https://issues.dlang.org/show_bug.cgi?id=22003
  45 | 	alias NodeValue = SumType!(
  46 | 		RegExp,
  47 | 		LiteralChars,
  48 | 		LiteralToken,
  49 | 		Reference,
  50 | 		Choice,
  51 | 		Repeat,
  52 | 		Repeat1,
  53 | 		Seq,
  54 | 		Optional,
  55 | 		SeqChoice,
  56 | 	);
  57 | 
  58 | 	/// A grammar node: a thin wrapper around `NodeValue`.
  59 | 	struct Node
  60 | 	{
  61 | 		NodeValue value;
  62 | 		alias value this;
  63 | 
  64 | 		/// Format whichever node kind is currently held into `sink`,
  65 | 		/// using that kind's default ("%s") formatting.
  66 | 		void toString(scope void delegate(const(char)[]) sink)
  67 | 		{
  68 | 			value.match!((ref held) => formattedWrite!"%s"(sink, held));
  69 | 		}
  70 | 
  71 | 		/// Same as the overload above, but callable on `const` nodes;
  72 | 		/// the held value is formatted in exactly the same way.
  73 | 		void toString(scope void delegate(const(char)[]) sink) const
  74 | 		{
  75 | 			value.match!((ref held) => formattedWrite!"%s"(sink, held));
  76 | 		}
  77 | 	}
  78 | 
  79 | 	/// A grammar definition.
  80 | 	/// Emitted as `name: $ => ...`
  81 | 	struct Def
  82 | 	{
  83 | 		Node node; /// The root node of this definition's grammar tree.
  84 | 
  85 | 		/// How to emit this definition in the grammar.
  86 | 		enum Kind
  87 | 		{
  88 | 			tokens, /// As a regular rule.
  89 | 			chars, /// As a token(...) rule.
  90 | 		}
  91 | 		Kind kind; /// ditto
  92 | 
  93 | 		bool used; /// Include the definition in the generated grammar.
  94 | 		bool hidden; /// Hide in the tree-sitter AST (by prefixing the name with _).
  95 | 		bool synthetic; /// We made this one up - don't emit a dlang.org link.
  96 | 
  97 | 		string publicName; /// If set, use this name instead of the `defs` key.
  98 | 		string[] tail; /// Names of (synthetic) definitions to also write after this one.
  99 | 
 100 | 		HashSet!string definedIn; /// Pages in which this definition is defined; used to check if GLINK2 links are correct.
 101 | 	}
 102 | 
 103 | 	/// All definitions in the grammar, indexed by their official names.
 104 | 	Def[string] defs;
 105 | 
 106 | 	HashSet!(string[2]) links; /// Set of [page, definition name] pairs; used to check if GLINK2 links are correct.
 107 | 
 108 | 	/// Pre-process the grammar and prepare it for writing by running every pass below, in order. `roots` is forwarded to `scanUsed`.
 109 | 	void analyze(string[] roots)
 110 | 	{
 111 | 		checkReferences(); // every Reference must name an existing definition
 112 | 		checkLinks(); // every GLINK2 link must point at a page defining its target
 113 | 		normalize(); // rewrite all nodes into the SeqChoice / Repeat1 normalized form
 114 | 		optimize(); // simplify each definition's node tree
 115 | 		deRecurse(); // rewrite repetition-style recursion using Repeat1
 116 | 		extractBodies(); // split eligible definitions into a descending part and a "TSBody" rule
 117 | 		checkKinds(); // verify our assumptions about definitions of each kind
 118 | 		scanUsed(roots);
 119 | 		scanHidden();
 120 | 		compile();
 121 | 	}
 122 | 
 123 | 	// Verify that every Reference node names a definition present in `defs`.
 124 | 	private void checkReferences()
 125 | 	{
 126 | 		void visit(Node current)
 127 | 		{
 128 | 			current.match!(
 129 | 				(ref Reference    r) { enforce(r.name in defs, "Unknown reference: " ~ r.name); },
 130 | 				(ref Choice       c) { foreach (child; c.nodes)        visit(child); },
 131 | 				(ref Seq          c) { foreach (child; c.nodes)        visit(child); },
 132 | 				(ref Repeat       c) { foreach (child; c.nodes)        visit(child); },
 133 | 				(ref Repeat1      c) { foreach (child; c.nodes)        visit(child); },
 134 | 				(ref Optional     c) { foreach (child; c.nodes)        visit(child); },
 135 | 				(ref SeqChoice    c) { foreach (child; c.nodes.joiner) visit(child); },
 136 | 				(ref RegExp       leaf) {},
 137 | 				(ref LiteralChars leaf) {},
 138 | 				(ref LiteralToken leaf) {},
 139 | 			);
 140 | 		}
 141 | 		foreach (ref definition; defs)
 142 | 			visit(definition.node);
 143 | 	}
 144 | 
 145 | 	/// Verify each recorded GLINK2 link: the page it points at must be
 146 | 	/// one of the pages in which the linked definition is defined.
 147 | 	private void checkLinks()
 148 | 	{
 149 | 		foreach (link; links.keys.sort)
 150 | 		{
 151 | 			string page = link[0], target = link[1];
 152 | 			enforce(page in defs[target].definedIn, "Broken link to %s: links to page %s but it is defined in page(s) %-(%s, %)".format(target, page, defs[target].definedIn.byKey));
 153 | 		}
 154 | 	}
 155 | 
 156 | 	// Convert rules to an intermediate normalized form, which makes other manipulations easier.
 157 | 	// In the normalized form, only the following nodes are allowed:
 158 | 	// - Leaf nodes (RegExp, LiteralChars, LiteralToken)
 159 | 	// - Reference
 160 | 	// - SeqChoice
 161 | 	// - Repeat1
 162 | 	// Seq, Choice, and Optional are expressed as SeqChoice nodes.
 163 | 	// Repeat is expressed as SeqChoice([[], [Repeat1(...)]]).
 164 | 	private void normalize()
 165 | 	{
 166 | 		void normalizeNode(ref Node node)
 167 | 		{
 168 | 			// Normalize children first, so the step below only ever sees normalized children
 169 | 			node.match!(
 170 | 				(ref RegExp       v) {},
 171 | 				(ref LiteralChars v) {},
 172 | 				(ref LiteralToken v) {},
 173 | 				(ref Reference    v) {},
 174 | 				(ref Choice       v) { v.nodes.each!normalizeNode(); },
 175 | 				(ref Seq          v) { v.nodes.each!normalizeNode(); },
 176 | 				(ref Repeat       v) { v.nodes.each!normalizeNode(); },
 177 | 				(ref Repeat1      v) { v.nodes.each!normalizeNode(); },
 178 | 				(ref Optional     v) { v.nodes.each!normalizeNode(); },
 179 | 				(ref SeqChoice    v) { unexpected(v); },
 180 | 			);
 181 | 
 182 | 			// Then replace this node itself with its normalized equivalent
 183 | 			node = node.match!(
 184 | 				(ref RegExp       v) => node,
 185 | 				(ref LiteralChars v) => node,
 186 | 				(ref LiteralToken v) => node,
 187 | 				(ref Reference    v) => node,
 188 | 				(ref Choice       v) => seqChoice(v.nodes.map!((ref Node node) => node.match!(
 189 | 					(ref RegExp       v) => [[node]],
 190 | 					(ref LiteralChars v) => [[node]],
 191 | 					(ref LiteralToken v) => [[node]],
 192 | 					(ref Reference    v) => [[node]],
 193 | 					(ref SeqChoice    v) => v.nodes,
 194 | 					(ref Repeat1      v) => [[node]],
 195 | 					(ref              _) => unexpected(_).progn(null),
 196 | 				)).join),
 197 | 				(ref Seq          v) => seqChoice([v.nodes]),
 198 | 				(ref Repeat       v) => seqChoice([[], [repeat1(v.node)]]),
 199 | 				(ref Repeat1      v) => node,
 200 | 				(ref Optional     v) => seqChoice([[], v.nodes]),
 201 | 				(ref SeqChoice    v) { unexpected(v); return Node.init; },
 202 | 			);
 203 | 		}
 204 | 
 205 | 		foreach (defName, ref def; defs)
 206 | 			normalizeNode(def.node);
 207 | 	}
 208 | 
 209 | 	// Remove the first empty alternative (the "optional" marker) from
 210 | 	// `choices`, rebinding the argument to a copy without it, and return
 211 | 	// `[[]]` so that the caller can append it to a choice list again later.
 212 | 	// Without an empty alternative, `choices` is untouched and null is returned.
 213 | 	private Node[][] extractOptional(ref Node[][] choices)
 214 | 	{
 215 | 		auto emptyAt = choices.countUntil!(alternative => alternative.length == 0);
 216 | 		if (emptyAt < 0)
 217 | 			return null;
 218 | 		auto before = choices[0 .. emptyAt];
 219 | 		auto after = choices[emptyAt + 1 .. $];
 220 | 		choices = before ~ after;
 221 | 		return [[]];
 222 | 	}
 223 | 
 224 | 	// Optimize the given normalized node in-place: children first, then a series of rewrites of the node itself.
 225 | 	private void optimizeNode(ref Node node)
 226 | 	{
 227 | 		void optimizeNode(ref Node node) { Grammar.optimizeNode(node); } // Local forwarder, used by the `each!optimizeNode` calls below
 228 | 
 229 | 		// Optimize children
 230 | 		node.match!(
 231 | 			(ref RegExp       v) {},
 232 | 			(ref LiteralChars v) {},
 233 | 			(ref LiteralToken v) {},
 234 | 			(ref Reference    v) {},
 235 | 			(ref SeqChoice    v) { v.nodes.joiner.each!optimizeNode(); },
 236 | 			(ref Repeat1      v) { v.nodes       .each!optimizeNode(); },
 237 | 			(ref              _) { unexpected(_); },
 238 | 		);
 239 | 
 240 | 		// Replace unary SeqChoice nodes with their sole contents.
 241 | 		node = node.match!(
 242 | 			(ref SeqChoice    v) => v.nodes.length == 1 && v.nodes[0].length == 1 ? v.nodes[0][0] : node,
 243 | 			(ref              _) => node,
 244 | 		);
 245 | 
 246 | 		// Un-nest single-choice SeqChoice nodes.
 247 | 		node.match!(
 248 | 			(ref SeqChoice    v)
 249 | 			{
 250 | 				foreach (ref choice; v.nodes)
 251 | 					foreach_reverse (i; 0 .. choice.length)
 252 | 						choice[i].match!(
 253 | 							(ref SeqChoice v)
 254 | 							{
 255 | 								if (v.nodes.length == 1) // single-choice
 256 | 									choice = choice[0 .. i] ~ v.nodes[0] ~ choice[i + 1 .. $];
 257 | 							},
 258 | 							(ref _) {}
 259 | 						);
 260 | 			},
 261 | 			(ref              _) {},
 262 | 		);
 263 | 
 264 | 		// Collapse redundantly-optional repetition into non-optional repetition.
 265 | 		// x ( | repeat1(x) ) => repeat1(x)
 266 | 		// ( | repeat1(x) ) x => repeat1(x)
 267 | 		node.match!(
 268 | 			(ref SeqChoice v)
 269 | 			{
 270 | 				foreach (ref choice; v.nodes)
 271 | 					foreach_reverse (i; 0 .. choice.length)
 272 | 					{
 273 | 						if (i >= choice.length)
 274 | 							continue; // Already optimized; cursor is outside new range
 275 | 						choice[i].match!(
 276 | 							(ref SeqChoice sc)
 277 | 							{
 278 | 								auto choices = sc.nodes;
 279 | 								if (!extractOptional(choices))
 280 | 									return;
 281 | 								if (choices.length != 1 || choices[0].length != 1)
 282 | 									return; // Not single-choice (bar optional) or single-length
 283 | 
 284 | 								choices[0][0].match!(
 285 | 									(ref Repeat1 r)
 286 | 									{
 287 | 										// The list of repeating nodes to try to collapse
 288 | 										auto span = r.node.match!(
 289 | 											(ref SeqChoice scSpan) => scSpan.nodes.length == 1 ? scSpan.nodes[0] : r.nodes,
 290 | 											(ref _) => r.nodes,
 291 | 										);
 292 | 
 293 | 										if (choice[0 .. i].endsWith(span))
 294 | 											choice = choice[0 .. i - span.length] ~ choices[0] ~ choice[i + 1 .. $];
 295 | 										else
 296 | 										if (choice[i + 1 .. $].startsWith(span))
 297 | 											choice = choice[0 .. i] ~ choices[0] ~ choice[i + 1 + span.length .. $];
 298 | 									},
 299 | 									(ref _) {}
 300 | 								);
 301 | 							},
 302 | 							(ref _) {},
 303 | 						);
 304 | 					}
 305 | 			},
 306 | 			(ref _) {},
 307 | 		);
 308 | 
 309 | 		// Given a SeqChoice, try to segment all of its choices such that the set
 310 | 		// concatenation of the two sets containing each segment's halves is the exact set
 311 | 		// of the original choices.  This operation is more general than prefix/suffix
 312 | 		// extraction.
 313 | 		// ( a b | a c ) => a ( b | c )
 314 | 		// ( a b | b ) => ( | a ) b
 315 | 		// a x | a y | b x | b y => ( a | b ) ( x | y )
 316 | 		node.match!(
 317 | 			(ref SeqChoice sc)
 318 | 			{
 319 | 				auto choices = sc.nodes;
 320 | 				choices = choices.map!flattenChoices.join;
 321 | 
 322 | 				// Find all choices which have a chance of participating in segmentation.
 323 | 				bool[] choiceViable = choices.map!(choice =>
 324 | 					// A choice is minimally viable if any of its constituent nodes occur
 325 | 					// at least once somewhere else in the choice list.
 326 | 					choice.any!((ref Node node) =>
 327 | 						choices.map!(choice =>
 328 | 							choice.count(node)
 329 | 						).sum > 1
 330 | 					)
 331 | 					|| choice.length == 0 // Edge case
 332 | 				).array;
 333 | 
 334 | 				if (choices.length.iota.filter!(i => choiceViable[i]).walkLength > 15)
 335 | 					return; // Too slow :(
 336 | 
 337 | 				// Precompute all minimally viable cut points for choices.
 338 | 				bool[][] cutPosViable = choices.map!(choice =>
 339 | 					(choice.length + 1).iota.map!(pos =>
 340 | 						pos == 0 || pos == choice.length || // redundant / optimization
 341 | 						choices.count!(choice2 => choice2.startsWith(choice[0 .. pos])) > 1 ||
 342 | 						choices.count!(choice2 => choice2.endsWith  (choice[pos .. $])) > 1
 343 | 					).array
 344 | 				).array;
 345 | 
 346 | 				// How to cut the choice at the given index.
 347 | 				// -1 = doesn't participate in segmentation.
 348 | 				auto cutPos = new sizediff_t[choices.length];
 349 | 
 350 | 				// The two sets, represented by the index of some
 351 | 				// choice which is cut according to it.
 352 | 				auto leftSet  = new size_t[choices.length];
 353 | 				auto rightSet = new size_t[choices.length];
 354 | 				size_t leftSetSize, rightSetSize;
 355 | 
 356 | 				alias leftChoices = () => leftSetSize.iota.map!(setIndex =>
 357 | 					leftSet[setIndex].I!(choiceIndex =>
 358 | 						choices[choiceIndex][0 .. cutPos[choiceIndex]]
 359 | 					)
 360 | 				);
 361 | 				alias rightChoices = () => rightSetSize.iota.map!(setIndex =>
 362 | 					rightSet[setIndex].I!(choiceIndex =>
 363 | 						choices[choiceIndex][cutPos[choiceIndex] .. $]
 364 | 					)
 365 | 				);
 366 | 
 367 | 				// Number of choices which do not participate in
 368 | 				// segmentation.
 369 | 				size_t numExcluded;
 370 | 
 371 | 				// Avoid infinite recursion by only attempting to return (and re-optimize)
 372 | 				// a solution that is better than the status quo.
 373 | 				alias nodeScore = delegate size_t (ref Node node) => node.match!(
 374 | 					(ref SeqChoice sc) => sc.nodes.map!(choice => choice.map!nodeScore.sum).sum,
 375 | 					(ref _) => 1,
 376 | 				);
 377 | 
 378 | 				// Best solution found.
 379 | 				size_t bestScore = nodeScore(node);
 380 | 				Node bestNode;
 381 | 
 382 | 				// Use classic recursive backtracking to iterate
 383 | 				// through all possible valid solutions
 384 | 				void search(size_t choiceIndex)
 385 | 				{
 386 | 					// If the cardinality of the set concatenation exceeds the
 387 | 					// size of the input set, then it certainly contains strings
 388 | 					// which are not part of the input set.
 389 | 					if (leftSetSize * rightSetSize > choices.length - numExcluded)
 390 | 						return;
 391 | 
 392 | 					// Disallowing either set to grow larger than |choices|/2 greatly
 393 | 					// reduces the execution time, but prevents this algorithm from
 394 | 					// performing basic prefix/suffix extraction. Currently we don't need
 395 | 					// the optimization.
 396 | 					version (none)
 397 | 						if (leftSetSize  > (choices.length - numExcluded) / 2 ||
 398 | 							rightSetSize > (choices.length - numExcluded) / 2)
 399 | 							return;
 400 | 
 401 | 					if (choiceIndex < choices.length)
 402 | 					{
 403 | 						auto choice = choices[choiceIndex];
 404 | 
 405 | 						// Try segmenting the choice at every viable point
 406 | 						if (choiceViable[choiceIndex])
 407 | 							foreach_reverse (pos; 0 .. choice.length + 1)
 408 | 							{
 409 | 								if (!cutPosViable[choiceIndex][pos])
 410 | 									continue;
 411 | 
 412 | 								cutPos[choiceIndex] = pos;
 413 | 
 414 | 								auto left = choice[0 .. pos];
 415 | 								auto right = choice[pos .. $];
 416 | 
 417 | 								bool inLeftSet = leftChoices().canFind(left);
 418 | 								bool inRightSet = rightChoices().canFind(right);
 419 | 								if (!inLeftSet)
 420 | 									leftSet[leftSetSize++] = choiceIndex;
 421 | 								if (!inRightSet)
 422 | 									rightSet[rightSetSize++] = choiceIndex;
 423 | 								search(choiceIndex + 1);
 424 | 								if (!inLeftSet)
 425 | 									leftSetSize--;
 426 | 								if (!inRightSet)
 427 | 									rightSetSize--;
 428 | 							}
 429 | 
 430 | 						// Also try excluding this choice from segmentation
 431 | 						cutPos[choiceIndex] = -1;
 432 | 						numExcluded++;
 433 | 						search(choiceIndex + 1);
 434 | 						numExcluded--;
 435 | 					}
 436 | 					else
 437 | 					{
 438 | 						// scope(failure)
 439 | 						// {
 440 | 						// 	import std.stdio;
 441 | 						// 	writeln("Inputs:");
 442 | 						// 	foreach (i, choice; choices)
 443 | 						// 		if (cutPos[i] == -1)
 444 | 						// 			writeln("- ", choice, " (EXCLUDED)");
 445 | 						// 		else
 446 | 						// 			writeln("- ", choice[0 .. cutPos[i]], " | ", choice[cutPos[i] .. $]);
 447 | 						// 	writeln("Left set:");
 448 | 						// 	foreach (choice; leftChoices())
 449 | 						// 		writeln("- ", choice);
 450 | 						// 	writeln("Right set:");
 451 | 						// 	foreach (choice; rightChoices())
 452 | 						// 		writeln("- ", choice);
 453 | 						// 	writefln("Total: %d  Excluded: %d  Segmented: %d", choices.length, numExcluded, choices.length - numExcluded);
 454 | 						// 	writeln();
 455 | 						// 	writeln();
 456 | 						// }
 457 | 
 458 | 						if (numExcluded == choices.length)
 459 | 							return; // Degenerate case - all choices are excluded
 460 | 						if (leftChoices().equal([[]]) || rightChoices().equal([[]]))
 461 | 							return; // Degenerate case - extracting empty prefix/suffix
 462 | 
 463 | 						// The set concatenation (pair-wise concatenation of Cartesian
 464 | 						// product) of the two sets must result in the original full set
 465 | 						// of choices.
 466 | 						if (leftSetSize * rightSetSize + numExcluded != choices.length)
 467 | 							return;
 468 | 
 469 | 						size_t score;
 470 | 						foreach (ci; 0 .. choices.length)
 471 | 							if (cutPos[ci] == -1)
 472 | 								score += choices[ci].length;
 473 | 						foreach (choice; leftChoices())
 474 | 							score += choice.length;
 475 | 						foreach (choice; rightChoices())
 476 | 							score += choice.length;
 477 | 
 478 | 						if (score < bestScore)
 479 | 						{
 480 | 							bestScore = score;
 481 | 
 482 | 							// Excluded choices
 483 | 							auto newChoices = choices.length.iota
 484 | 								.filter!(choiceIndex => cutPos[choiceIndex] == -1)
 485 | 								.map!(choiceIndex => choices[choiceIndex])
 486 | 								.array;
 487 | 							// Container for the two sets
 488 | 							auto container = seqChoice([[
 489 | 								seqChoice(leftChoices().array),
 490 | 								seqChoice(rightChoices().array),
 491 | 							]]);
 492 | 							// Insert the container choice at the first occurrence of a
 493 | 							// refactored choice
 494 | 							auto insertPos = cutPos.countUntil!(pos => pos >= 0);
 495 | 							if (insertPos < 0)
 496 | 								insertPos = 0;
 497 | 							newChoices = newChoices[0 .. insertPos] ~ [container] ~ newChoices[insertPos .. $];
 498 | 							bestNode = seqChoice(newChoices);
 499 | 							assert(nodeScore(bestNode) == score);
 500 | 						}
 501 | 					}
 502 | 				}
 503 | 				search(0);
 504 | 
 505 | 				assert(numExcluded == 0 && leftSetSize == 0 && rightSetSize == 0);
 506 | 
 507 | 				if (bestNode !is Node.init)
 508 | 				{
 509 | 					// Apply solution
 510 | 					node = bestNode;
 511 | 					optimizeNode(node);
 512 | 				}
 513 | 			},
 514 | 			(ref _) {}
 515 | 		);
 516 | 
 517 | 		// Lift the common part (prefix or suffix) out of SeqChoice choices, e.g., transform:
 518 | 		// x | x a | x b | ... => x ( | a | b | ... )
 519 | 		// We do this if at least two choices have a non-empty common prefix or suffix,
 520 | 		// for every such possible prefix / suffix.
 521 | 		node.match!(
 522 | 			(ref SeqChoice scNode)
 523 | 			{
 524 | 				auto choices = scNode.nodes;
 525 | 
 526 | 				if (choices.length < 2)
 527 | 					return; // Must have at least two choices
 528 | 
 529 | 				size_t bestCount;
 530 | 
 531 | 				foreach (pass; [1, 2]) // First pass finds the biggest group size; second pass applies the first grouping of that size
 532 | 					foreach (i1; 0 .. choices.length)
 533 | 						foreach (i2; i1 + 1 .. choices.length)
 534 | 						{
 535 | 							auto choice1 = choices[i1];
 536 | 							auto choice2 = choices[i2];
 537 | 							auto prefix = commonPrefix(choice1      , choice2      )      ;
 538 | 							auto suffix = commonPrefix(choice1.retro, choice2.retro).retro;
 539 | 							if (prefix.length || suffix.length)
 540 | 							{
 541 | 								alias indexIsGrouped = i =>
 542 | 									choices[i].startsWith(prefix) &&
 543 | 									choices[i].endsWith(suffix) &&
 544 | 									choices[i].length >= prefix.length + suffix.length;
 545 | 								auto groupedIndices = choices.length.iota.filter!indexIsGrouped.array;
 546 | 								if (groupedIndices.length < 2)
 547 | 									continue;
 548 | 
 549 | 								if (pass == 1)
 550 | 									bestCount = max(bestCount, groupedIndices.length);
 551 | 								else
 552 | 								if (groupedIndices.length == bestCount)
 553 | 								{
 554 | 									auto remainingIndices = choices.length.iota.filter!(not!indexIsGrouped);
 555 | 									// auto groupedChoices = groupedIndices.map!(i => choices[i]);
 556 | 
 557 | 									auto newChoices = remainingIndices.map!(i => choices[i]).array;
 558 | 									// Insert the new group at the first occurrence of the prefix/suffix
 559 | 									auto insertionPoint = groupedIndices.front;
 560 | 									newChoices =
 561 | 										newChoices[0 .. insertionPoint] ~
 562 | 										chain(
 563 | 											prefix,
 564 | 											seqChoice(
 565 | 												groupedIndices.map!(i => choices[i][prefix.length .. $ - suffix.length]).array
 566 | 											).only,
 567 | 											suffix,
 568 | 										).array.only.array ~
 569 | 										newChoices[insertionPoint .. $];
 570 | 
 571 | 									node = seqChoice(newChoices);
 572 | 									optimizeNode(node);
 573 | 									return;
 574 | 								}
 575 | 							}
 576 | 						}
 577 | 			},
 578 | 			(ref _) {},
 579 | 		);
 580 | 	}
 581 | 
 582 | 	// Run the node optimizer over the root node of every definition,
 583 | 	// simplifying each tree in place ahead of the transformations to follow.
 584 | 	private void optimize()
 585 | 	{
 586 | 		foreach (defName, ref definition; defs)
 587 | 			optimizeNode(definition.node);
 588 | 	}
 589 | 
 590 | 	// Heuristic, based purely on the definition's name, used to decide
 591 | 	// which definitions get de-recursion / are skipped by body extraction.
 592 | 	private bool isPlural(string defName)
 593 | 	{
 594 | 		// Lists of things generally involve repetition.
 595 | 		if (defName.splitByCamelCase.canFind("List"))
 596 | 			return true;
 597 | 
 598 | 		// A name which is the plural of another definition's name (optionally
 599 | 		// with a "Name" suffix on the singular) is almost certainly a repetition.
 600 | 		foreach (pluralSuffix; ["s", "es"])
 601 | 			if (defName.endsWith(pluralSuffix))
 602 | 				foreach (singularSuffix; ["", "Name"])
 603 | 					if (defName[0 .. $ - pluralSuffix.length] ~ singularSuffix in defs)
 604 | 						return true;
 605 | 		return false;
 606 | 	}
 607 | 
 608 | 	// Attempt to rewrite recursive definitions into repetition (repeat1) where appropriate
 609 | 	private void deRecurse()
 610 | 	{
 611 | 		foreach (defName, ref def; defs)
 612 | 		{
 613 | 			// In the D grammar, recursion is used for two purposes:
 614 | 			// - Repetition (e.g. Characters)
 615 | 			// - Nested constructs (e.g. binary expressions)
 616 | 			// We only want to de-recurse the first kind.
 617 | 			bool shouldDeRecurse =
 618 | 
 619 | 				// We must always de-recurse token fragments,
 620 | 				// because we can't use tree-sitter recursion with them.
 621 | 				def.kind == Def.Kind.chars ||
 622 | 
 623 | 				// Lists of things generally involve repetition.
 624 | 				isPlural(defName) ||
 625 | 
 626 | 				// Additional rules.
 627 | 				defName.among(
 628 | 					"ParameterAttributes",
 629 | 					"AsmInstruction",
 630 | 				);
 631 | 
 632 | 			if (shouldDeRecurse)
 633 | 			{
 634 | 				auto x = reference(defName);
 635 | 
 636 | 				// Transform x := a | b | c x into x := ( | ( c )+ ) ( a | b )
 637 | 				def.node.match!(
 638 | 					(ref SeqChoice sc1)
 639 | 					{
 640 | 						auto choices = sc1.nodes;
 641 | 						choices = choices.map!flattenChoices.join;
 642 | 
 643 | 						auto recursiveChoiceIndices = choices.length.iota.filter!(
 644 | 							i => choices[i].canFind(x),
 645 | 						).array;
 646 | 						if (recursiveChoiceIndices.length != 1)
 647 | 							return; // Requires exactly one choice containing the recursion
 648 | 						auto recursiveChoiceIndex = recursiveChoiceIndices.front;
 649 | 						auto recursiveChoice = choices[recursiveChoiceIndex];
 650 | 						if (recursiveChoice.countUntil(x) + 1 != recursiveChoice.length)
 651 | 							return; // The recursive reference must be the last node of its choice
 652 | 
 653 | 						def.node = seqChoice([[
 654 | 							// Recursive part
 655 | 							seqChoice([
 656 | 								[], // Optional (zero-or-more)
 657 | 								[repeat1(seqChoice([
 658 | 									recursiveChoice[0 .. $ - 1]
 659 | 								]))],
 660 | 							]),
 661 | 							// Non-recursive parts
 662 | 							seqChoice(
 663 | 								choices[0 .. recursiveChoiceIndex] ~ choices[recursiveChoiceIndex + 1 .. $],
 664 | 							),
 665 | 						]]);
 666 | 						optimizeNode(def.node);
 667 | 					},
 668 | 					(_) {}
 669 | 				);
 670 | 
 671 | 				// Transform x := y ( | z x ) into x := y ( | ( z y )+ )
 672 | 				def.node.match!(
 673 | 					(ref SeqChoice sc1)
 674 | 					{
 675 | 						if (sc1.nodes.length != 1)
 676 | 							return; // Requires a single choice
 677 | 						if (sc1.nodes[0].length < 2)
 678 | 							return;
 679 | 
 680 | 						auto y = sc1.nodes[0][0 .. $-1];
 681 | 
 682 | 						sc1.nodes[0][$-1].match!(
 683 | 							(ref SeqChoice sc2)
 684 | 							{
 685 | 
 686 | 								auto choices = sc2.nodes;
 687 | 								if (!extractOptional(choices))
 688 | 									return;
 689 | 								if (choices.length != 1)
 690 | 									return;
 691 | 								if (choices[0][$-1] != x)
 692 | 									return;
 693 | 
 694 | 								auto z = choices[0][0 .. $-1];
 695 | 
 696 | 								def.node = seqChoice([
 697 | 									y ~
 698 | 									seqChoice([
 699 | 										[], // optional
 700 | 										[repeat1(
 701 | 											seqChoice([
 702 | 												z ~
 703 | 												y,
 704 | 											])
 705 | 										)],
 706 | 									]),
 707 | 								]);
 708 | 								optimizeNode(def.node);
 709 | 							},
 710 | 							(_) {}
 711 | 						);
 712 | 					},
 713 | 					(_) {}
 714 | 				);
 715 | 
 716 | 				// Transform x := ( | x z ) y into x := ( | ( y z )+ ) y
 717 | 				// Same as above, but in the other direction.
 718 | 				def.node.match!(
 719 | 					(ref SeqChoice sc1)
 720 | 					{
 721 | 						if (sc1.nodes.length != 1)
 722 | 							return; // Requires a single choice
 723 | 						if (sc1.nodes[0].length < 2)
 724 | 							return;
 725 | 
 726 | 						auto y = sc1.nodes[0][1 .. $];
 727 | 
 728 | 						sc1.nodes[0][0].match!(
 729 | 							(ref SeqChoice sc2)
 730 | 							{
 731 | 								auto choices = sc2.nodes;
 732 | 								if (!extractOptional(choices))
 733 | 									return;
 734 | 								if (choices.length != 1)
 735 | 									return;
 736 | 								if (choices[0][0] != x)
 737 | 									return;
 738 | 
 739 | 								auto z = choices[0][1 .. $];
 740 | 
 741 | 								def.node = seqChoice([
 742 | 									seqChoice([
 743 | 										[], // optional
 744 | 										[repeat1(
 745 | 											seqChoice([
 746 | 												y ~
 747 | 												z,
 748 | 											])
 749 | 										)],
 750 | 									]) ~
 751 | 									y,
 752 | 								]);
 753 | 								optimizeNode(def.node);
 754 | 							},
 755 | 							(_) {}
 756 | 						);
 757 | 					},
 758 | 					(_) {}
 759 | 				);
 760 | 			}
 761 | 		}
 762 | 	}
 763 | 
 764 | 	// Expand every nested SeqChoice in `nodes`, recursively, yielding the flat list of
 765 | 	// all possible plain sequences. Some transformations operate on this form.
 766 | 	private static Node[][] flattenChoices(Node[] nodes)
 767 | 	{
 768 | 		foreach (index, ref candidate; nodes)
 769 | 		{
 770 | 			auto expanded = candidate.match!(
 771 | 				(ref SeqChoice nested)
 772 | 				{
 773 | 					assert(nested.nodes.length > 1);
 774 | 					auto head = nodes[0 .. index];
 775 | 					Node[][] combined;
 776 | 					foreach (middle; nested.nodes.map!flattenChoices.joiner)
 777 | 						foreach (tail; flattenChoices(nodes[index + 1 .. $]))
 778 | 							combined ~= head ~ middle ~ tail;
 779 | 					return combined;
 780 | 				},
 781 | 				(ref _) => null,
 782 | 			);
 783 | 			if (expanded) return expanded;
 784 | 		}
 785 | 		return [nodes];
 786 | 	}
 787 | 
 788 | 	// Refactor some definitions into a descending part and an
 789 | 	// implementation part, so that we can hide the descending
 790 | 	// part to avoid excessive nesting in the tree-sitter AST.
 791 | 	// This aims to solve the problem described in
 792 | 	// http://tree-sitter.github.io/tree-sitter/creating-parsers#structuring-rules-well ,
 793 | 	// though using a different approach.
 794 | 	private void extractBodies()
 795 | 	{
 796 | 		foreach (defName; defs.keys)
 797 | 		{
 798 | 			auto def = &defs[defName];
 799 | 
 800 | 			if (def.kind != Def.Kind.tokens)
 801 | 				continue;
 802 | 
 803 | 			// The rule of thumb to decide whether a rule should have its body extracted
 804 | 			// is to see if the rule name makes sense even with just the minimal,
 805 | 			// non-body interpretation of the definition.
 806 | 			// E.g., an AddExpression is expected to always have an addition,
 807 | 			// but an Import is an import even without a ModuleAliasIdentifier.
 808 | 
 809 | 			// The following grammar definitions are eligible for body extraction,
 810 | 			// but it doesn't make sense to do so for them.
 811 | 			// As far as I can see, there is no way to mechanically distinguish these cases
 812 | 			// from the majority of cases where body extraction is desirable.
 813 | 			if (defName.among(
 814 | 					"SourceFile",
 815 | 					"Import",
 816 | 					"Slice", // needs to be de-recursed
 817 | 					"Symbol",
 818 | 					"AssertArguments", // uses AssignExpression
 819 | 				))
 820 | 				continue;
 821 | 
 822 | 			// One way we can decide whether to perform body
 823 | 			// extraction is to check if one of the choices that the
 824 | 			// definition can resolve to is a reference to a very
 825 | 			// generic rule, such as Identifier.  In this case, it is
 826 | 			// generally valuable to preserve this node in the AST, as
 827 | 			// it provides information over the generic rule.
 828 | 			bool wrapsGeneric = def.node.match!(
 829 | 				(ref SeqChoice sc) => sc.nodes.map!flattenChoices.joiner.any!(choice =>
 830 | 					choice.length == 1 && choice[0].match!(
 831 | 						(ref Reference r) => r.name.among(
 832 | 							"Identifier",
 833 | 							"DeclDefs",
 834 | 							"NonVoidInitializer",
 835 | 							// "AssignExpression", // Also used for descending
 836 | 							"BasicType",
 837 | 							"Parameters",
 838 | 							"InOutStatement",
 839 | 							"IntegerLiteral",
 840 | 							"Declaration",
 841 | 							"BlockStatement",
 842 | 							"Type",
 843 | 							"Opcode",
 844 | 						),
 845 | 						(ref _) => false,
 846 | 				)),
 847 | 				(ref _) => false,
 848 | 			);
 849 | 			if (wrapsGeneric)
 850 | 				continue;
 851 | 
 852 | 			// Another heuristic we can use is to check if the name
 853 | 			// suggests repetition.  An example is Packages: it is
 854 | 			// recursive, but unlike e.g. OrOrExpression (which is
 855 | 			// also recursive), we don't want to perform body
 856 | 			// extraction on it.
 857 | 			if (isPlural(defName))
 858 | 				continue;
 859 | 
 860 | 			auto x = reference(defName);
 861 | 
 862 | 			/*
 863 | 				x := y ( | a... | b... ) z
 864 | 				=>
 865 | 				x := y z | x_ts_body
 866 | 				x_ts_body := y ( a... | b... ) z
 867 | 
 868 | 				- y and z are the mandatory descending part (must be references)
 869 | 				- a, b, ... are the implementation part, which we will extract to a separate rule
 870 | 				  These should contain a token or such (i.e. consist of not just all references).
 871 | 			*/
 872 | 			def.node.match!(
 873 | 				(ref SeqChoice sc1)
 874 | 				{
 875 | 					if (sc1.nodes.length != 1)
 876 | 						return; // Requires a single choice
 877 | 
 878 | 					auto optionalIndex =
 879 | 						sc1.nodes[0].countUntil!((ref Node node) => node.match!(
 880 | 							(ref SeqChoice sc2) => sc2.nodes.canFind(null),
 881 | 							(ref _) => false
 882 | 						));
 883 | 					if (optionalIndex < 0)
 884 | 						return;
 885 | 
 886 | 					auto y = sc1.nodes[0][0 .. optionalIndex];
 887 | 					auto z = sc1.nodes[0][optionalIndex + 1 .. $];
 888 | 					auto y_z = y ~ z;
 889 | 					if (y_z.length != 1) // Match logic in scanHidden
 890 | 						return;
 891 | 					bool yzOK = y_z.all!((ref Node node) => node.match!(
 892 | 						(ref Reference v) => true,
 893 | 						(ref           _) => false,
 894 | 					));
 895 | 					if (!yzOK)
 896 | 						return;
 897 | 
 898 | 					auto choices = sc1.nodes[0][optionalIndex].tryMatch!(
 899 | 						(ref SeqChoice sc2) => sc2.nodes,
 900 | 					);
 901 | 					extractOptional(choices).enforce();
 902 | 					alias choicesOK = delegate bool (choices) => choices.all!(choice => choice.any!((ref Node node) => node.match!(
 903 | 						(ref RegExp       v) => true,
 904 | 						(ref LiteralChars v) => true,
 905 | 						(ref LiteralToken v) => true,
 906 | 						(ref SeqChoice    v) => choicesOK(v.nodes),
 907 | 						(ref              _) => false,
 908 | 					)));
 909 | 					if (!choicesOK(choices))
 910 | 						return;
 911 | 
 912 | 					auto bodyName = defName ~ "TSBody";
 913 | 					def.node = seqChoice([
 914 | 						y_z,
 915 | 						[reference(bodyName)],
 916 | 					]);
 917 | 					def.tail ~= bodyName;
 918 | 					def.publicName = "Maybe" ~ (def.publicName ? def.publicName : defName);
 919 | 
 920 | 					Def bodyDef;
 921 | 					bodyDef.node = seqChoice([y ~ seqChoice(choices) ~ z]);
 922 | 					bodyDef.kind = Def.Kind.tokens;
 923 | 					bodyDef.synthetic = true;
 924 | 					bodyDef.publicName = defName;
 925 | 
 926 | 					optimizeNode(def.node);
 927 | 					optimizeNode(bodyDef.node);
 928 | 
 929 | 					defs[bodyName] = bodyDef;
 930 | 				},
 931 | 				(ref _) {}
 932 | 			);
 933 | 
 934 | 			/*
 935 | 			  x := choice(
 936 | 			    // Some choices are references (descending part)
 937 | 				reference(...),
 938 | 				reference(...),
 939 | 
 940 | 				// Some choices are sequences (implementation part)
 941 | 				seq(...),
 942 | 				seq(...),
 943 | 			  )
 944 | 
 945 | 			  =>
 946 | 
 947 | 			  x := choice(
 948 | 				reference(...),
 949 | 				reference(...),
 950 | 				reference(x_ts_body),
 951 | 			  )
 952 | 
 953 | 			  x_ts_body := choice(
 954 | 				seq(...),
 955 | 				seq(...),
 956 | 			  )
 957 | 			*/
 958 | 			def.node.match!(
 959 | 				(ref SeqChoice sc1)
 960 | 				{
 961 | 					auto choices = sc1.nodes;
 962 | 					choices = choices.map!flattenChoices.join;
 963 | 
 964 | 					alias isReference = (Node[] nodes) => nodes.length == 1 && nodes[0].match!(
 965 | 						(ref Reference    v) => true,
 966 | 						(_) => false,
 967 | 					);
 968 | 					auto references = choices.filter!isReference.array;
 969 | 					auto remainder  = choices.filter!(not!isReference).array; // Test emptiness via .length below: an array's truthiness only checks its pointer
 970 | 					if (references.length == 0 || remainder.length == 0)
 971 | 						return; // Need both a descending part and an implementation part
 972 | 
 973 | 					auto bodyName = defName ~ "TSBody";
 974 | 					def.node = seqChoice(
 975 | 						references ~
 976 | 						[reference(bodyName)],
 977 | 					);
 978 | 					def.tail ~= bodyName;
 979 | 					def.publicName = "Maybe" ~ (def.publicName ? def.publicName : defName);
 980 | 
 981 | 					Def bodyDef;
 982 | 					bodyDef.node = seqChoice(
 983 | 						remainder,
 984 | 					);
 985 | 					bodyDef.kind = Def.Kind.tokens;
 986 | 					bodyDef.synthetic = true;
 987 | 					bodyDef.publicName = defName;
 988 | 
 989 | 					optimizeNode(def.node);
 990 | 					optimizeNode(bodyDef.node);
 991 | 
 992 | 					defs[bodyName] = bodyDef;
 993 | 				},
 994 | 				(ref _) {}
 995 | 			);
 996 | 		}
 997 | 	}
 998 | 
 999 | 	// Verify our assertions about definitions of the respective kind: `chars` definitions may only reference `chars` definitions and may not mix tokens with other content; `tokens` definitions may not contain literal chars.
1000 | 	private void checkKinds()
1001 | 	{
1002 | 		foreach (defName, ref def; defs)
1003 | 			final switch (def.kind)
1004 | 			{
1005 | 				case Def.Kind.chars:
1006 | 				{
1007 | 					enum State : ubyte
1008 | 					{
1009 | 						hasChars = 1 << 0,
1010 | 						hasToken = 1 << 1,
1011 | 						recurses = 1 << 2,
1012 | 					}
1013 | 
1014 | 					HashSet!string scanning; // Definitions currently on the checkDef call stack
1015 | 
1016 | 					State checkDef(string defName)
1017 | 					{
1018 | 						scope(failure) { import std.stdio; stderr.writefln("While checking %s:", defName); }
1019 | 						if (defName in scanning)
1020 | 							return State.recurses; // Cycle: this definition is already being checked
1021 | 						scanning.add(defName);
1022 | 						scope(success) scanning.remove(defName);
1023 | 
1024 | 						State concat(State a, State b) // Combine the states of two nodes in sequence; throws if a token is combined with any non-zero state
1025 | 						{
1026 | 							if (((a & State.hasToken) && b != 0) ||
1027 | 								((b & State.hasToken) && a != 0))
1028 | 								throw new Exception("Token / token fragment definition %s contains mixed %s and %s".format(defName, a, b));
1029 | 							return a | b;
1030 | 						}
1031 | 
1032 | 						State scanNode(ref Node node)
1033 | 						{
1034 | 							return node.match!(
1035 | 								(ref RegExp       v) => State.init, // NOTE(review): State.init is the first member (hasChars), not 0 - confirm this is intended
1036 | 								(ref LiteralChars v) => State.hasChars,
1037 | 								(ref LiteralToken v) => State.hasToken,
1038 | 								(ref Reference    v) { enforce(defs[v.name].kind == Def.Kind.chars, "%s of kind %s references %s of kind %s".format(defName, def.kind, v.name, defs[v.name].kind)); return checkDef(v.name); },
1039 | 								(ref Repeat1      v) => v.node.I!scanNode().I!(x => concat(x, x)), // A repetition is checked as the node followed by itself
1040 | 								(ref SeqChoice    v) => v.nodes.map!(choiceSeq => choiceSeq.map!scanNode().fold!concat(State.init)).fold!((a, b) => State(a | b)), // concat within a sequence, plain union across alternatives
1041 | 								(ref              _) { unexpected(_); return State.init; },
1042 | 							);
1043 | 						}
1044 | 						return scanNode(defs[defName].node);
1045 | 					}
1046 | 
1047 | 					checkDef(defName);
1048 | 					break;
1049 | 				}
1050 | 
1051 | 				case Def.Kind.tokens:
1052 | 				{
1053 | 					void scanNode(ref Node node) // Walks the node tree without following references
1054 | 					{
1055 | 						node.match!(
1056 | 							(ref RegExp       v) {},
1057 | 							(ref LiteralChars v) { throw new Exception("Definition %s with kind %s has literal chars: %(%s%)".format(defName, def.kind, [v.chars])); },
1058 | 							(ref LiteralToken v) {},
1059 | 							(ref Reference    v) {},
1060 | 							(ref Repeat1      v) { v.nodes       .each!scanNode(); },
1061 | 							(ref SeqChoice    v) { v.nodes.joiner.each!scanNode(); },
1062 | 							(ref              _) { unexpected(_); },
1063 | 						);
1064 | 					}
1065 | 					scanNode(def.node);
1066 | 					break;
1067 | 				}
1068 | 			}
1069 | 	}
1070 | 
1071 | 	// Starting from `roots`, mark every definition reached through references as used.
1072 | 	// Definitions of kind `chars` are marked but not descended into.
1073 | 	private void scanUsed(string[] roots)
1074 | 	{
1075 | 		void visitDef(string name)
1076 | 		{
1077 | 			auto pDef = &defs[name];
1078 | 			if (pDef.used)
1079 | 				return; // Already visited
1080 | 			pDef.used = true;
1081 | 			if (pDef.kind == Def.Kind.chars)
1082 | 				return; // Referencees will be inlined
1083 | 
1084 | 			void visitNode(ref Node node)
1085 | 			{
1086 | 				node.match!(
1087 | 					(ref Reference    r) { visitDef(r.name); },
1088 | 					(ref Repeat1      r) { foreach (ref child; r.nodes) visitNode(child); },
1089 | 					(ref SeqChoice    c) { foreach (ref sequence; c.nodes) foreach (ref child; sequence) visitNode(child); },
1090 | 					(ref RegExp       _) {},
1091 | 					(ref LiteralChars _) {},
1092 | 					(ref LiteralToken _) {},
1093 | 					(ref              _) { unexpected(_); },
1094 | 				);
1095 | 			}
1096 | 			visitNode(pDef.node);
1097 | 		}
1098 | 
1099 | 		foreach (i; 0 .. roots.length)
1100 | 			visitDef(roots[i]);
1101 | 	}
1102 | 
1103 | 	// Choose which definitions should be hidden (inlined) in the tree-sitter AST.
1104 | 	// In the generated grammar, such definitions' names begin with an underscore.
1105 | 	private void scanHidden()
1106 | 	{
1107 | 		foreach (defName, ref def; defs)
1108 | 		{
1109 | 			if (def.kind == Def.Kind.chars)
1110 | 				continue; // Always represents a token; referencees are inlined
1111 | 
1112 | 			// We make a definition hidden if it always contains at most one other definition.
1113 | 			// Definitions which directly contain tokens are never hidden.
1114 | 
1115 | 			// Exception: nodes which contain only one reference and nothing else
1116 | 			// are implicitly understood to have semantic meaning, and are not hidden.
1117 | 			if (def.node.match!(
1118 | 				(ref Reference    v) => true,
1119 | 				(ref              _) => false,
1120 | 			))
1121 | 				continue;
1122 | 
1123 | 			size_t scanNode(ref Node node) // Weight of a node: a reference counts 1, a token 2; a definition is hidden when its weight is <= 1
1124 | 			{
1125 | 				return node.match!(
1126 | 					(ref RegExp       v) => unexpected(v).progn(0),
1127 | 					(ref LiteralChars v) => unexpected(v).progn(0),
1128 | 					(ref LiteralToken v) => 2, // A token alone already exceeds the threshold
1129 | 					(ref Reference    v) => 1,
1130 | 					(ref Repeat1      v) => v.nodes.map!scanNode().sum() * 2, // Doubled child weight (was `each!scanNode() * 2`, which multiplied each's Flag result)
1131 | 					(ref SeqChoice    v) => v.nodes.map!(choiceSeq => choiceSeq.map!scanNode().sum()).reduce!max, // Sum along a sequence, worst case across alternatives
1132 | 					(ref              _) => unexpected(_).progn(0),
1133 | 				);
1134 | 			}
1135 | 			def.hidden = scanNode(def.node) <= 1;
1136 | 		}
1137 | 	}
1138 | 
1139 | 	// Convert rules from the internal normalized form to the tree-sitter form.
1140 | 	// This replaces SeqChoice nodes with Seq / Choice / Optional (or Repeat for an optional Repeat1).
1141 | 	private void compile()
1142 | 	{
1143 | 		void compileNode(ref Node node) // Rewrites `node` in place, children first
1144 | 		{
1145 | 			// Compile children
1146 | 			node.match!(
1147 | 				(ref RegExp       v) {},
1148 | 				(ref LiteralChars v) {},
1149 | 				(ref LiteralToken v) {},
1150 | 				(ref Reference    v) {},
1151 | 				(ref Choice       v) { unexpected(v); }, // Tree-sitter forms must not be present before compilation
1152 | 				(ref Seq          v) { unexpected(v); },
1153 | 				(ref Repeat       v) { unexpected(v); },
1154 | 				(ref Repeat1      v) { v.nodes       .each!compileNode(); },
1155 | 				(ref Optional     v) { unexpected(v); },
1156 | 				(ref SeqChoice    v) { v.nodes.joiner.each!compileNode(); },
1157 | 			);
1158 | 
1159 | 			// Compile node
1160 | 			node = node.match!(
1161 | 				(ref RegExp       v) => node,
1162 | 				(ref LiteralChars v) => node,
1163 | 				(ref LiteralToken v) => node,
1164 | 				(ref Reference    v) => node,
1165 | 				(ref SeqChoice    v)
1166 | 				{
1167 | 					auto optionalChoice = extractOptional(v.nodes); // NOTE(review): presumably removes an empty alternative from v.nodes and reports whether one was present - confirm
1168 | 
1169 | 					alias maybeSeq = (Node[] nodes) => nodes.length == 1 ? nodes[0] : seq(nodes); // A one-element sequence becomes the bare node
1170 | 
1171 | 					node = v.nodes.length == 1 ? maybeSeq(v.nodes[0]) : choice(v.nodes.map!maybeSeq.array); // A single alternative needs no choice(); `v` must not be used after this assignment
1172 | 
1173 | 					if (optionalChoice)
1174 | 					{
1175 | 						// optional(repeat1(...)) -> repeat(...)
1176 | 						node = node.match!(
1177 | 							(ref Repeat1 v) => repeat(v.node),
1178 | 							(ref         _) => optional(node),
1179 | 						);
1180 | 					}
1181 | 
1182 | 					return node;
1183 | 				},
1184 | 				(ref Repeat1      v) => node,
1185 | 				(ref              _) { unexpected(_); return Node.init; },
1186 | 			);
1187 | 		}
1188 | 
1189 | 		foreach (defName, ref def; defs)
1190 | 			compileNode(def.node);
1191 | 	}
1192 | }
1193 | 
1194 | /// Convenience factory functions: each wraps its argument in the same-named Grammar node type and returns it as a Grammar.Node (repeat, repeat1 and optional store their single child as a one-element array).
1195 | Grammar.Node regexp      (string           regexp ) { return Grammar.Node(Grammar.NodeValue(Grammar.RegExp      ( regexp  ))); }
1196 | Grammar.Node literalChars(string           chars  ) { return Grammar.Node(Grammar.NodeValue(Grammar.LiteralChars( chars   ))); } /// ditto
1197 | Grammar.Node literalToken(string           literal) { return Grammar.Node(Grammar.NodeValue(Grammar.LiteralToken( literal ))); } /// ditto
1198 | Grammar.Node reference   (string           name   ) { return Grammar.Node(Grammar.NodeValue(Grammar.Reference   ( name    ))); } /// ditto
1199 | Grammar.Node choice      (Grammar.Node[]   nodes  ) { return Grammar.Node(Grammar.NodeValue(Grammar.Choice      ( nodes   ))); } /// ditto
1200 | Grammar.Node seq         (Grammar.Node[]   nodes  ) { return Grammar.Node(Grammar.NodeValue(Grammar.Seq         ( nodes   ))); } /// ditto
1201 | Grammar.Node repeat      (Grammar.Node     node   ) { return Grammar.Node(Grammar.NodeValue(Grammar.Repeat      ([node   ]))); } /// ditto
1202 | Grammar.Node repeat1     (Grammar.Node     node   ) { return Grammar.Node(Grammar.NodeValue(Grammar.Repeat1     ([node   ]))); } /// ditto
1203 | Grammar.Node optional    (Grammar.Node     node   ) { return Grammar.Node(Grammar.NodeValue(Grammar.Optional    ([node   ]))); } /// ditto
1204 | Grammar.Node seqChoice   (Grammar.Node[][] nodes  ) { return Grammar.Node(Grammar.NodeValue(Grammar.SeqChoice   ( nodes   ))); } /// ditto
1205 | 
1206 | private void unexpected(T)(auto ref T v) { enum message = "Unexpected " ~ T.stringof; assert(false, message); } // Always fails; used for node types that must not occur at the call site
1207 | 


--------------------------------------------------------------------------------
/generator/source/parser.d:
--------------------------------------------------------------------------------
  1 | module parser;
  2 | 
  3 | import std.algorithm.comparison;
  4 | import std.algorithm.iteration;
  5 | import std.algorithm.searching;
  6 | import std.array;
  7 | import std.conv : to;
  8 | import std.exception;
  9 | import std.string;
 10 | 
 11 | import ddoc;
 12 | import grammar;
 13 | 
 14 | string[] parse(ref Grammar grammar, const DDoc ddoc, string fileName, DDoc[string] macros, Grammar.Def.Kind kind)
 15 | {
 16 | 	alias RegExp = Grammar.RegExp;
 17 | 	alias LiteralChars = Grammar.LiteralChars;
 18 | 	alias LiteralToken = Grammar.LiteralToken;
 19 | 	alias Reference = Grammar.Reference;
 20 | 	alias Optional = Grammar.Optional;
 21 | 	alias Choice = Grammar.Choice;
 22 | 	alias Seq = Grammar.Seq;
 23 | 
 24 | 	alias NodeValue = Grammar.NodeValue;
 25 | 	alias Node = Grammar.Node;
 26 | 	alias Def = Grammar.Def;
 27 | 
 28 | 	static DDoc preprocess(const DDoc ddoc)
 29 | 	{
 30 | 		DDoc output;
 31 | 		foreach (ref item; ddoc)
 32 | 			if (item.type == .Node.Type.call && item.isCallTo("MULTICOLS"))
 33 | 				output ~= item.call.splitArguments()[1]; // Keep only the second argument of MULTICOLS
 34 | 			else
 35 | 			if (item.type == .Node.Type.call)
 36 | 			{
 37 | 				.Node rewritten = item;
 38 | 				rewritten.call.contents = preprocess(item.call.contents); // Recurse into the call's contents
 39 | 				output ~= rewritten;
 40 | 			}
 41 | 			else
 42 | 				output ~= item; // Non-call nodes are copied unchanged
 43 | 		return output;
 44 | 	}
 45 | 
 46 | 	struct ParseContext // State passed to every parseDefinition call
 47 | 	{
 48 | 		string currentName; // Name of the definition whose body lines are being parsed; null between definitions
 49 | 		string file; // Set from parse's fileName argument
 50 | 		DDoc[string] macros; // Macro bodies by name, consulted for calls that have no explicit handler
 51 | 		Def.Kind kind; // Kind passed to parse; $(B ...) literals are only accepted for Kind.chars
 52 | 	}
 53 | 
 54 | 	/*static*/ Node[] parseDefinition(const DDoc line, ref const ParseContext context) // Converts the DDoc nodes of one line into a flat list of grammar nodes; throws on anything unrecognized
 55 | 	{
 56 | 		scope(failure) { import std.stdio : stderr; stderr.writeln("Error with line: ", line); }
 57 | 		Node[] seqNodes;
 58 | 		foreach (ref node; line)
 59 | 		{
 60 | 			if (node.type == .Node.Type.text)
 61 | 				enforce(!node.text.strip.length, "Bare text node (%(%s%)) in grammar: %s".format([node.text], line));
 62 | 			else
 63 | 			if (node.isCallTo("I"))
 64 | 			{
 65 | 				auto text = node.getSingleTextChild();
 66 | 				switch (text)
 67 | 				{
 68 | 					case "any Unicode character":
 69 | 						seqNodes ~= regexp(`/[\s\S]/`);
 70 | 						break;
 71 | 					case "physical end of the file":
 72 | 						seqNodes ~= regexp(`/$/m`); // illustrative
 73 | 						break;
 74 | 					case "Letter":
 75 | 						seqNodes ~= regexp(`/[A-Za-z]/`);
 76 | 						break;
 77 | 					case "UniversalAlpha":
 78 | 						// src/dmd/utf.d
 79 | 						static immutable wchar[2][] ALPHA_TABLE =
 80 | 						[
 81 | 							[0x00AA, 0x00AA],
 82 | 							[0x00B5, 0x00B5],
 83 | 							[0x00B7, 0x00B7],
 84 | 							[0x00BA, 0x00BA],
 85 | 							[0x00C0, 0x00D6],
 86 | 							[0x00D8, 0x00F6],
 87 | 							[0x00F8, 0x01F5],
 88 | 							[0x01FA, 0x0217],
 89 | 							[0x0250, 0x02A8],
 90 | 							[0x02B0, 0x02B8],
 91 | 							[0x02BB, 0x02BB],
 92 | 							[0x02BD, 0x02C1],
 93 | 							[0x02D0, 0x02D1],
 94 | 							[0x02E0, 0x02E4],
 95 | 							[0x037A, 0x037A],
 96 | 							[0x0386, 0x0386],
 97 | 							[0x0388, 0x038A],
 98 | 							[0x038C, 0x038C],
 99 | 							[0x038E, 0x03A1],
100 | 							[0x03A3, 0x03CE],
101 | 							[0x03D0, 0x03D6],
102 | 							[0x03DA, 0x03DA],
103 | 							[0x03DC, 0x03DC],
104 | 							[0x03DE, 0x03DE],
105 | 							[0x03E0, 0x03E0],
106 | 							[0x03E2, 0x03F3],
107 | 							[0x0401, 0x040C],
108 | 							[0x040E, 0x044F],
109 | 							[0x0451, 0x045C],
110 | 							[0x045E, 0x0481],
111 | 							[0x0490, 0x04C4],
112 | 							[0x04C7, 0x04C8],
113 | 							[0x04CB, 0x04CC],
114 | 							[0x04D0, 0x04EB],
115 | 							[0x04EE, 0x04F5],
116 | 							[0x04F8, 0x04F9],
117 | 							[0x0531, 0x0556],
118 | 							[0x0559, 0x0559],
119 | 							[0x0561, 0x0587],
120 | 							[0x05B0, 0x05B9],
121 | 							[0x05BB, 0x05BD],
122 | 							[0x05BF, 0x05BF],
123 | 							[0x05C1, 0x05C2],
124 | 							[0x05D0, 0x05EA],
125 | 							[0x05F0, 0x05F2],
126 | 							[0x0621, 0x063A],
127 | 							[0x0640, 0x0652],
128 | 							[0x0660, 0x0669],
129 | 							[0x0670, 0x06B7],
130 | 							[0x06BA, 0x06BE],
131 | 							[0x06C0, 0x06CE],
132 | 							[0x06D0, 0x06DC],
133 | 							[0x06E5, 0x06E8],
134 | 							[0x06EA, 0x06ED],
135 | 							[0x06F0, 0x06F9],
136 | 							[0x0901, 0x0903],
137 | 							[0x0905, 0x0939],
138 | 							[0x093D, 0x094D],
139 | 							[0x0950, 0x0952],
140 | 							[0x0958, 0x0963],
141 | 							[0x0966, 0x096F],
142 | 							[0x0981, 0x0983],
143 | 							[0x0985, 0x098C],
144 | 							[0x098F, 0x0990],
145 | 							[0x0993, 0x09A8],
146 | 							[0x09AA, 0x09B0],
147 | 							[0x09B2, 0x09B2],
148 | 							[0x09B6, 0x09B9],
149 | 							[0x09BE, 0x09C4],
150 | 							[0x09C7, 0x09C8],
151 | 							[0x09CB, 0x09CD],
152 | 							[0x09DC, 0x09DD],
153 | 							[0x09DF, 0x09E3],
154 | 							[0x09E6, 0x09F1],
155 | 							[0x0A02, 0x0A02],
156 | 							[0x0A05, 0x0A0A],
157 | 							[0x0A0F, 0x0A10],
158 | 							[0x0A13, 0x0A28],
159 | 							[0x0A2A, 0x0A30],
160 | 							[0x0A32, 0x0A33],
161 | 							[0x0A35, 0x0A36],
162 | 							[0x0A38, 0x0A39],
163 | 							[0x0A3E, 0x0A42],
164 | 							[0x0A47, 0x0A48],
165 | 							[0x0A4B, 0x0A4D],
166 | 							[0x0A59, 0x0A5C],
167 | 							[0x0A5E, 0x0A5E],
168 | 							[0x0A66, 0x0A6F],
169 | 							[0x0A74, 0x0A74],
170 | 							[0x0A81, 0x0A83],
171 | 							[0x0A85, 0x0A8B],
172 | 							[0x0A8D, 0x0A8D],
173 | 							[0x0A8F, 0x0A91],
174 | 							[0x0A93, 0x0AA8],
175 | 							[0x0AAA, 0x0AB0],
176 | 							[0x0AB2, 0x0AB3],
177 | 							[0x0AB5, 0x0AB9],
178 | 							[0x0ABD, 0x0AC5],
179 | 							[0x0AC7, 0x0AC9],
180 | 							[0x0ACB, 0x0ACD],
181 | 							[0x0AD0, 0x0AD0],
182 | 							[0x0AE0, 0x0AE0],
183 | 							[0x0AE6, 0x0AEF],
184 | 							[0x0B01, 0x0B03],
185 | 							[0x0B05, 0x0B0C],
186 | 							[0x0B0F, 0x0B10],
187 | 							[0x0B13, 0x0B28],
188 | 							[0x0B2A, 0x0B30],
189 | 							[0x0B32, 0x0B33],
190 | 							[0x0B36, 0x0B39],
191 | 							[0x0B3D, 0x0B43],
192 | 							[0x0B47, 0x0B48],
193 | 							[0x0B4B, 0x0B4D],
194 | 							[0x0B5C, 0x0B5D],
195 | 							[0x0B5F, 0x0B61],
196 | 							[0x0B66, 0x0B6F],
197 | 							[0x0B82, 0x0B83],
198 | 							[0x0B85, 0x0B8A],
199 | 							[0x0B8E, 0x0B90],
200 | 							[0x0B92, 0x0B95],
201 | 							[0x0B99, 0x0B9A],
202 | 							[0x0B9C, 0x0B9C],
203 | 							[0x0B9E, 0x0B9F],
204 | 							[0x0BA3, 0x0BA4],
205 | 							[0x0BA8, 0x0BAA],
206 | 							[0x0BAE, 0x0BB5],
207 | 							[0x0BB7, 0x0BB9],
208 | 							[0x0BBE, 0x0BC2],
209 | 							[0x0BC6, 0x0BC8],
210 | 							[0x0BCA, 0x0BCD],
211 | 							[0x0BE7, 0x0BEF],
212 | 							[0x0C01, 0x0C03],
213 | 							[0x0C05, 0x0C0C],
214 | 							[0x0C0E, 0x0C10],
215 | 							[0x0C12, 0x0C28],
216 | 							[0x0C2A, 0x0C33],
217 | 							[0x0C35, 0x0C39],
218 | 							[0x0C3E, 0x0C44],
219 | 							[0x0C46, 0x0C48],
220 | 							[0x0C4A, 0x0C4D],
221 | 							[0x0C60, 0x0C61],
222 | 							[0x0C66, 0x0C6F],
223 | 							[0x0C82, 0x0C83],
224 | 							[0x0C85, 0x0C8C],
225 | 							[0x0C8E, 0x0C90],
226 | 							[0x0C92, 0x0CA8],
227 | 							[0x0CAA, 0x0CB3],
228 | 							[0x0CB5, 0x0CB9],
229 | 							[0x0CBE, 0x0CC4],
230 | 							[0x0CC6, 0x0CC8],
231 | 							[0x0CCA, 0x0CCD],
232 | 							[0x0CDE, 0x0CDE],
233 | 							[0x0CE0, 0x0CE1],
234 | 							[0x0CE6, 0x0CEF],
235 | 							[0x0D02, 0x0D03],
236 | 							[0x0D05, 0x0D0C],
237 | 							[0x0D0E, 0x0D10],
238 | 							[0x0D12, 0x0D28],
239 | 							[0x0D2A, 0x0D39],
240 | 							[0x0D3E, 0x0D43],
241 | 							[0x0D46, 0x0D48],
242 | 							[0x0D4A, 0x0D4D],
243 | 							[0x0D60, 0x0D61],
244 | 							[0x0D66, 0x0D6F],
245 | 							[0x0E01, 0x0E3A],
246 | 							[0x0E40, 0x0E5B],
247 | 							[0x0E81, 0x0E82],
248 | 							[0x0E84, 0x0E84],
249 | 							[0x0E87, 0x0E88],
250 | 							[0x0E8A, 0x0E8A],
251 | 							[0x0E8D, 0x0E8D],
252 | 							[0x0E94, 0x0E97],
253 | 							[0x0E99, 0x0E9F],
254 | 							[0x0EA1, 0x0EA3],
255 | 							[0x0EA5, 0x0EA5],
256 | 							[0x0EA7, 0x0EA7],
257 | 							[0x0EAA, 0x0EAB],
258 | 							[0x0EAD, 0x0EAE],
259 | 							[0x0EB0, 0x0EB9],
260 | 							[0x0EBB, 0x0EBD],
261 | 							[0x0EC0, 0x0EC4],
262 | 							[0x0EC6, 0x0EC6],
263 | 							[0x0EC8, 0x0ECD],
264 | 							[0x0ED0, 0x0ED9],
265 | 							[0x0EDC, 0x0EDD],
266 | 							[0x0F00, 0x0F00],
267 | 							[0x0F18, 0x0F19],
268 | 							[0x0F20, 0x0F33],
269 | 							[0x0F35, 0x0F35],
270 | 							[0x0F37, 0x0F37],
271 | 							[0x0F39, 0x0F39],
272 | 							[0x0F3E, 0x0F47],
273 | 							[0x0F49, 0x0F69],
274 | 							[0x0F71, 0x0F84],
275 | 							[0x0F86, 0x0F8B],
276 | 							[0x0F90, 0x0F95],
277 | 							[0x0F97, 0x0F97],
278 | 							[0x0F99, 0x0FAD],
279 | 							[0x0FB1, 0x0FB7],
280 | 							[0x0FB9, 0x0FB9],
281 | 							[0x10A0, 0x10C5],
282 | 							[0x10D0, 0x10F6],
283 | 							[0x1E00, 0x1E9B],
284 | 							[0x1EA0, 0x1EF9],
285 | 							[0x1F00, 0x1F15],
286 | 							[0x1F18, 0x1F1D],
287 | 							[0x1F20, 0x1F45],
288 | 							[0x1F48, 0x1F4D],
289 | 							[0x1F50, 0x1F57],
290 | 							[0x1F59, 0x1F59],
291 | 							[0x1F5B, 0x1F5B],
292 | 							[0x1F5D, 0x1F5D],
293 | 							[0x1F5F, 0x1F7D],
294 | 							[0x1F80, 0x1FB4],
295 | 							[0x1FB6, 0x1FBC],
296 | 							[0x1FBE, 0x1FBE],
297 | 							[0x1FC2, 0x1FC4],
298 | 							[0x1FC6, 0x1FCC],
299 | 							[0x1FD0, 0x1FD3],
300 | 							[0x1FD6, 0x1FDB],
301 | 							[0x1FE0, 0x1FEC],
302 | 							[0x1FF2, 0x1FF4],
303 | 							[0x1FF6, 0x1FFC],
304 | 							[0x203F, 0x2040],
305 | 							[0x207F, 0x207F],
306 | 							[0x2102, 0x2102],
307 | 							[0x2107, 0x2107],
308 | 							[0x210A, 0x2113],
309 | 							[0x2115, 0x2115],
310 | 							[0x2118, 0x211D],
311 | 							[0x2124, 0x2124],
312 | 							[0x2126, 0x2126],
313 | 							[0x2128, 0x2128],
314 | 							[0x212A, 0x2131],
315 | 							[0x2133, 0x2138],
316 | 							[0x2160, 0x2182],
317 | 							[0x3005, 0x3007],
318 | 							[0x3021, 0x3029],
319 | 							[0x3041, 0x3093],
320 | 							[0x309B, 0x309C],
321 | 							[0x30A1, 0x30F6],
322 | 							[0x30FB, 0x30FC],
323 | 							[0x3105, 0x312C],
324 | 							[0x4E00, 0x9FA5],
325 | 							[0xAC00, 0xD7A3],
326 | 						];
327 | 						seqNodes ~= regexp(`/[%-(%s%)]/`.format(ALPHA_TABLE.map!(r => // Render the table as one regex character class of \uXXXX items and ranges
328 | 							r[0] == r[1]
329 | 							? `\u%04x`.format(r[0])
330 | 							: `\u%04x-\u%04x`.format(r[0], r[1])
331 | 						)));
332 | 						break;
333 | 					default:
334 | 						throw new Exception("Unknown I: " ~ text);
335 | 				}
336 | 			}
337 | 			else
338 | 			if (node.isCallTo("B"))
339 | 			{
340 | 				auto text = node.call.contents.toString();
341 | 				enforce(context.kind == Def.Kind.chars, `B in GRAMMAR block: ` ~ text);
342 | 				if (text.length == 6 && text.startsWith(`\u`)) // \uXXXX escape: emit the character with that hexadecimal code
343 | 					seqNodes ~= literalChars(wchar(text[2 .. $].to!ushort(16)).to!string);
344 | 				else
345 | 				{
346 | 					// These are to aid fixing usage of $(D ...)/$(B ...) in the spec
347 | 					enforce(text.among(
348 | 						"0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
349 | 						"a", "b", "c", "d", "e", "f",
350 | 						"A", "B", "C", "D", "E", "F",
351 | 						"/*",
352 | 						"*/",
353 | 						"//",
354 | 						"/+",
355 | 						"+/",
356 | 						`r"`,
357 | 						`"`,
358 | 						"`",
359 | 						"'",
360 | 						"c",
361 | 						"w",
362 | 						"d",
363 | 						`q"`,
364 | 						`q"(`, `)"`,
365 | 						`q"[`, `]"`,
366 | 						`q"{`, `}"`,
367 | 						`q"<`, `>"`,
368 | 						`(`, `[`, `<`, `{`,
369 | 						`)`, `]`, `>`, `}`,
370 | 						"L", "u", "U",
371 | 						"Lu", "LU",
372 | 						"uL", "UL",
373 | 						"0b",
374 | 						"0B",
375 | 						"_",
376 | 						".",
377 | 						`\'`,
378 | 						`\"`,
379 | 						`\?`,
380 | 						`\`,
381 | 						`\0`,
382 | 						`\a`,
383 | 						`\b`,
384 | 						`\f`,
385 | 						`\n`,
386 | 						`\r`,
387 | 						`\t`,
388 | 						`\v`,
389 | 						`\x`,
390 | 						`\\`,
391 | 						`\u`,
392 | 						`\U`,
393 | 						`x"`,
394 | 						`e+`,
395 | 						`E+`,
396 | 						`e-`,
397 | 						`E-`,
398 | 						`0x`,
399 | 						`0X`,
400 | 						`p`,
401 | 						`P`,
402 | 						`p+`,
403 | 						`P+`,
404 | 						`p-`,
405 | 						`P-`,
406 | 						`i`,
407 | 						`&`,
408 | 						`;`,
409 | 						`#!`,
410 | 					), "Unknown B: " ~ text);
411 | 					seqNodes ~= literalChars(text);
412 | 				}
413 | 			}
414 | 			else
415 | 			if (node.isCallTo("D"))
416 | 			{
417 | 				// ditto: the checks below aid fixing usage of $(D ...)/$(B ...) in the spec
418 | 				auto text = node.call.contents.toString();
419 | 				enforce(text.length);
420 | 				foreach (word; text.split) // Each whitespace-separated word becomes its own token
421 | 				{
422 | 					enforce(
423 | 						// keywords
424 | 						(word.length >= 2 && word.representation.all!(c => "abcdefghijklmnopqrstuvwxyz_".representation.canFind(c))) ||
425 | 						// traits
426 | 						(["is", "has", "get"].any!(prefix => word.startsWith(prefix)) && word.representation.all!(c => "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".representation.canFind(c))) ||
427 | 						// magic keywords
428 | 						(word.startsWith("__") && word.endsWith("__") && word[2 .. $-2].representation.all!(c => "ABCDEFGHIJKLMNOPQRSTUVWXYZ_".representation.canFind(c))) ||
429 | 						// registers
430 | 						(word.length >= 2 && "ABCDEFGHIJKLMNOPQRSTUVWXYZ".representation.canFind(word[0]) && word.representation.all!(c => "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789()".representation.canFind(c))) ||
431 | 						// other tokens
432 | 						word.among(
433 | 							"/",
434 | 							"/=",
435 | 							".",
436 | 							"..",
437 | 							"...",
438 | 							"&",
439 | 							"&=",
440 | 							"&&",
441 | 							"|",
442 | 							"|=",
443 | 							"||",
444 | 							"-",
445 | 							"-=",
446 | 							"--",
447 | 							"+",
448 | 							"+=",
449 | 							"++",
450 | 							"<",
451 | 							"<=",
452 | 							"<<",
453 | 							"<<=",
454 | 							">",
455 | 							">=",
456 | 							">>=",
457 | 							">>>=",
458 | 							">>",
459 | 							">>>",
460 | 							"!",
461 | 							"!=",
462 | 							"(",
463 | 							")",
464 | 							"[",
465 | 							"]",
466 | 							"{",
467 | 							"}",
468 | 							"?",
469 | 							",",
470 | 							";",
471 | 							":",
472 | 							"$",
473 | 							"=",
474 | 							"==",
475 | 							"*",
476 | 							"*=",
477 | 							"%",
478 | 							"%=",
479 | 							"^",
480 | 							"^=",
481 | 							"^^",
482 | 							"^^=",
483 | 							"~",
484 | 							"~=",
485 | 							"@",
486 | 							"=>",
487 | 							"#",
488 | 
489 | 							`q{`,
490 | 
491 | 							"C",
492 | 							"C++",
493 | 							"D",
494 | 							"Windows",
495 | 							"System",
496 | 							"Objective-C",
497 | 
498 | 							"classInstanceSize", // should have been getClassInstanceSize
499 | 							"allMembers",        // should have been getAllMembers
500 | 							"derivedMembers",    // should have been getDerivedMembers
501 | 							"toType",
502 | 
503 | 							"__LOCAL_SIZE",
504 | 						), "Unknown D: " ~ word);
505 | 					seqNodes ~= literalToken(word);
506 | 				}
507 | 			}
508 | 			else
509 | 			if (node.isCallTo("GLINK") || node.isCallTo("GLINK_LEX"))
510 | 			{
511 | 				auto text = node.getSingleTextChild();
512 | 				enforce(text != context.currentName, "GLINK to %(%s%) should be GSELF".format([text]));
513 | 				seqNodes ~= reference(text);
514 | 				auto file = node.call.macroName == "GLINK_LEX" ? "lex" : context.file; // GLINK_LEX links into "lex"; GLINK links into the current file
515 | 				grammar.links.add([file, text].staticArray);
516 | 			}
517 | 			else
518 | 			if (node.isCallTo("GLINK2"))
519 | 			{
520 | 				auto arguments = node.call.splitArguments();
521 | 				enforce(arguments.length == 2);
522 | 				auto file = arguments[0].toString();
523 | 				enforce(file != context.file, "GLINK2 to the current file should be GLINK");
524 | 				auto text = arguments[1].toString();
525 | 				enforce(text != context.currentName, "GLINK to %(%s%) should be GSELF".format([text]));
526 | 				seqNodes ~= reference(text);
527 | 				grammar.links.add([file, text].staticArray);
528 | 			}
529 | 			else
530 | 			if (node.isCallTo("LINK2") || node.isCallTo("RELATIVE_LINK2"))
531 | 			{
532 | 				auto arguments = node.call.splitArguments();
533 | 				enforce(arguments.length == 2);
534 | 				seqNodes ~= parseDefinition(arguments[1], context); // Only the second argument is parsed; the first is ignored
535 | 			}
536 | 			else
537 | 			if (node.isCallTo("GSELF"))
538 | 			{
539 | 				auto text = node.getSingleTextChild();
540 | 				enforce(text == context.currentName, "GSELF to %(%s%) should be GLINK or to %(%s%)".format([text], [context.currentName]));
541 | 				seqNodes ~= reference(text);
542 | 			}
543 | 			else
544 | 			if (node.isCallTo("OPT"))
545 | 			{
546 | 				enforce(seqNodes.length);
547 | 				seqNodes[$-1] = optional(seqNodes[$-1]); // OPT makes the immediately preceding node optional
548 | 			}
549 | 			else
550 | 			if (node.isCallTo("GDEPRECATED"))
551 | 				seqNodes ~= parseDefinition(node.call.contents, context); // Contents are parsed as if the wrapper were absent
552 | 			else
553 | 			if (node.isCallTo("GRESERVED"))
554 | 				seqNodes ~= parseDefinition(node.call.contents, context);
555 | 			else
556 | 			if (node.isCallToEmpty("CODE_AMP"))
557 | 				seqNodes ~= literalToken("&");
558 | 			else
559 | 			if (node.isCallToEmpty("CODE_LCURL"))
560 | 				seqNodes ~= literalToken("{");
561 | 			else
562 | 			if (node.isCallToEmpty("CODE_RCURL"))
563 | 				seqNodes ~= literalToken("}");
564 | 			else
565 | 			if (node.isCallToEmpty("CODE_PERCENT"))
566 | 				seqNodes ~= literalToken("%");
567 | 			else
568 | 			if (auto pdefinition = node.call.macroName in context.macros) // Known macro: expand it and parse the expansion
569 | 				seqNodes ~= parseDefinition(node.call.expand(*pdefinition), context);
570 | 			else
571 | 				throw new Exception("Unknown macro call (%(%s%)) in grammar".format([node.call.macroName]));
572 | 		}
573 | 		return seqNodes;
574 | 	}
575 | 
576 | 	/// Parse and accumulate definitions from DDoc AST; returns the names of definitions newly added to grammar.defs
577 | 	{
578 | 		ParseContext context;
579 | 		context.file = fileName;
580 | 		context.macros = macros;
581 | 		context.kind = kind;
582 | 
583 | 		Node[] currentDefs; // One seq node per body line of the definition being accumulated
584 | 		string[] newDefs;
585 | 
586 | 		void flush() // Commit the definition accumulated so far, if any, to grammar.defs and reset the accumulator
587 | 		{
588 | 			if (!context.currentName)
589 | 				return;
590 | 
591 | 			auto newDef = Def(choice(currentDefs), kind);
592 | 			grammar.defs.update(context.currentName,
593 | 				{ newDefs ~= context.currentName; return newDef; }, // Not yet present: insert and record the name
594 | 				(ref Def def)
595 | 				{
596 | 					enforce(Def(def.node, def.kind) == newDef, // Already present: node and kind must match exactly
597 | 						"Definition mismatch for " ~ context.currentName);
598 | 				}
599 | 			);
600 | 
601 | 			auto pDef = &grammar.defs[context.currentName];
602 | 			pDef.definedIn.add(fileName);
603 | 
604 | 			context.currentName = null;
605 | 			currentDefs = null;
606 | 		}
607 | 
608 | 		foreach (line; preprocess(ddoc).split('\n'))
609 | 		{
610 | 			if (!line.length || (line.length == 1 && line[0].isText("")))
611 | 			{}  // Empty line
612 | 			else
613 | 			if (line.length == 2 && line[0].isCallTo("GNAME") && line[1].isText(":"))
614 | 			{
615 | 				// Definition header: finish the previous definition and start a new one
616 | 				flush();
617 | 				context.currentName = line[0].getSingleTextChild();
618 | 			}
619 | 			else
620 | 			if (line.length >= 2 && line[0].isText("    "))
621 | 			{
622 | 				// Body line: one alternative of the current definition
623 | 				enforce(context.currentName, "Body line without definition line");
624 | 				currentDefs ~= seq(parseDefinition(line, context));
625 | 			}
626 | 			else
627 | 				throw new Exception(format!"Can't parse grammar from: %s"(line));
628 | 		}
629 | 		flush(); // Commit the last definition of the input
630 | 
631 | 		return newDefs;
632 | 	}
633 | }
634 | 


--------------------------------------------------------------------------------
/generator/source/writer.d:
--------------------------------------------------------------------------------
  1 | module writer;
  2 | 
  3 | import std.algorithm.iteration;
  4 | import std.array;
  5 | import std.stdio;
  6 | import std.string;
  7 | import std.sumtype;
  8 | 
  9 | import ae.utils.aa;
 10 | import ae.utils.text : splitByCamelCase;
 11 | 
 12 | import grammar;
 13 | 
 14 | struct Writer
 15 | {
 16 | 	File f;
 17 | 	Grammar grammar;
 18 | 
 19 | 	this(string fileName, Grammar grammar, const string[] extras) // Opens fileName for writing and emits the grammar preamble, listing `extras` as tree-sitter extras
 20 | 	{
 21 | 		this.grammar = grammar; // Must be set first: convertRuleName below reads this.grammar.defs
 22 | 
 23 | 		f.open(fileName, "wb");
 24 | 
 25 | 		f.writef(q"EOF
 26 | module.exports = grammar({
 27 |   name: 'd',
 28 | 
 29 |   word: $ => $.identifier,
 30 | 
 31 |   extras: $ => [
 32 | %-(    $.%s,
 33 | %|%)  ],
 34 | 
 35 |   rules: {
 36 | EOF", extras.map!(extra => convertRuleName(extra))); // Each extra is written under its converted rule name
 37 | 	}
 38 | 
 39 | 	string currentFile;
 40 | 	bool fileHeaderPending;
 41 | 	bool sectionHeaderPending;
 42 | 
 43 | 	void startFile(string file)
 44 | 	{
 45 | 		// Both headers become pending; writeRule emits them before the next rule it writes.
 46 | 		fileHeaderPending = sectionHeaderPending = true;
 47 | 		currentFile = file;
 48 | 	}
 49 | 
 50 | 	void startSection()
 51 | 	{
 52 | 		// While the file header is still pending, no section header is requested.
 53 | 		sectionHeaderPending = sectionHeaderPending || !fileHeaderPending;
 54 | 	}
 55 | 
 56 | 	void writeRule(string defName) // Writes the rule for defName if it is marked used, preceded by any pending header, then writes its tail rules
 57 | 	{
 58 | 		scope(failure) { import std.stdio : stderr; stderr.writeln("Error while writing rule ", defName); }
 59 | 
 60 | 		auto def = &grammar.defs[defName];
 61 | 		if (!def.used)
 62 | 			return;
 63 | 
 64 | 		if (fileHeaderPending)
 65 | 		{
 66 | 			f.writef(q"EOF
 67 | 
 68 |     // ------------------------------------------------------------------------
 69 |     // https://dlang.org/spec/%s.html
 70 |     // ------------------------------------------------------------------------
 71 | EOF", currentFile);
 72 | 			fileHeaderPending = false;
 73 | 			sectionHeaderPending = false; // The file header also serves as the first section separator
 74 | 		}
 75 | 
 76 | 		if (sectionHeaderPending)
 77 | 		{
 78 | 			f.write(q"EOF
 79 | 
 80 |     // ---
 81 | EOF");
 82 | 			sectionHeaderPending = false;
 83 | 		}
 84 | 
 85 | 		f.writeln();
 86 | 		if (!def.synthetic) // Only non-synthetic rules get a link to the spec
 87 | 			f.writefln("    // https://dlang.org/spec/%s.html#%s",
 88 | 				currentFile,
 89 | 				defName,
 90 | 			);
 91 | 
 92 | 		f.writefln("    %s: $ =>",
 93 | 			convertRuleName(defName));
 94 | 		writeRuleBody(defName);
 95 | 
 96 | 		foreach (tail; def.tail) // Tail rules are written right after the rule that lists them
 97 | 			writeRule(tail);
 98 | 	}
 99 | 
100 | 	void close()
101 | 	{
102 | 		f.write(q"EOF
103 |   }
104 | });
105 | EOF");
106 | 	}
107 | 
108 | private:
109 | 	string convertRuleName(string name)
110 | 	{
111 | 		string publicName = name;
112 | 		if (auto defPublicName = grammar.defs[name].publicName)
113 | 			publicName = defPublicName;
114 | 		return (grammar.defs[name].hidden ? "_" : "") ~ publicName.splitByCamelCase.map!toLower.join("_");
115 | 	}
116 | 
117 | 	void writeRuleBody(string defName)
118 | 	{
119 | 		int indent = 6;
120 | 
121 | 		void line(string s) { f.writeln(" ".replicate(indent), s); }
122 | 		void single(string s) { line(s ~ ","); }
123 | 
124 | 		void list(T)(string fun, T[] children, void delegate(ref T) childWriter)
125 | 		{
126 | 			if (!children.length)
127 | 			{
128 | 				line(fun ~ "(),");
129 | 				return;
130 | 			}
131 | 			line(fun ~ "(");
132 | 			indent += 2;
133 | 			foreach (ref child; children)
134 | 				childWriter(child);
135 | 			indent -= 2;
136 | 			line("),");
137 | 		}
138 | 
139 | 		HashSet!string visiting;
140 | 
141 | 		void writeDef(ref string defName)
142 | 		{
143 | 			if (defName in visiting)
144 | 				return single("/* recursion */");
145 | 			visiting.add(defName);
146 | 			scope(success) visiting.remove(defName);
147 | 
148 | 			auto def = &grammar.defs[defName];
149 | 			if (def.kind == Grammar.Def.Kind.chars)
150 | 				line("// " ~ defName);
151 | 
152 | 			void writeNode(ref Grammar.Node node)
153 | 			{
154 | 				node.match!(
155 | 					(ref Grammar.RegExp       v) => single(v.regexp),
156 | 					(ref Grammar.LiteralChars v) => single(format!"%(%s%)"([v.chars])),
157 | 					(ref Grammar.LiteralToken v) => single(format!"%(%s%)"([v.literal])),
158 | 					// https://issues.dlang.org/show_bug.cgi?id=22016
159 | 					(ref Grammar.Reference    v) { if (def.kind == Grammar.Def.Kind.chars) writeDef(v.name); else single("$." ~ convertRuleName(v.name)); },
160 | 					(ref Grammar.Choice       v) => list("choice"  , v.nodes, &writeNode),
161 | 					(ref Grammar.Seq          v) => list("seq"     , v.nodes, &writeNode),
162 | 					(ref Grammar.Repeat       v) => list("repeat"  , v.nodes, &writeNode),
163 | 					(ref Grammar.Repeat1      v) => list("repeat1" , v.nodes, &writeNode),
164 | 					(ref Grammar.Optional     v) => list("optional", v.nodes, &writeNode),
165 | 					(ref Grammar.SeqChoice    v) { assert(false); },
166 | 				);
167 | 			}
168 | 			writeNode(def.node);
169 | 		}
170 | 
171 | 		auto def = &grammar.defs[defName];
172 | 		final switch (def.kind)
173 | 		{
174 | 			case Grammar.Def.Kind.chars:
175 | 				list("token", [defName], &writeDef);
176 | 				break;
177 | 			case Grammar.Def.Kind.tokens:
178 | 				writeDef(defName);
179 | 				break;
180 | 		}
181 | 	}
182 | }
183 | 
184 | // "


--------------------------------------------------------------------------------
/package-lock.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "tree-sitter-d",
 3 |   "version": "0.0.1",
 4 |   "lockfileVersion": 1,
 5 |   "requires": true,
 6 |   "dependencies": {
 7 |     "nan": {
 8 |       "version": "2.14.2",
 9 |       "resolved": "https://registry.npmjs.org/nan/-/nan-2.14.2.tgz",
10 |       "integrity": "sha512-M2ufzIiINKCuDfBSAUr1vWQ+vuVcA9kqx8JJUsbQi6yf1uGRyb7HfpdfUr5qLXf3B/t8dPvcjhKMmlfnP47EzQ=="
11 |     },
12 |     "tree-sitter-cli": {
13 |       "version": "0.20.0",
14 |       "resolved": "https://registry.npmjs.org/tree-sitter-cli/-/tree-sitter-cli-0.20.0.tgz",
15 |       "integrity": "sha512-4D1qapWbJXZ5rrSUGM5rcw5Vuq/smzn9KbiFRhlON6KeuuXjra+KAtDYVrDgAoLIG4ku+jbEEGrJxCptUGi3dg==",
16 |       "dev": true
17 |     }
18 |   }
19 | }
20 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "tree-sitter-d",
 3 |   "version": "0.0.1",
 4 |   "description": "D grammar for tree-sitter",
 5 |   "main": "bindings/node",
 6 |   "keywords": [
 7 |     "parser",
 8 |     "lexer"
 9 |   ],
10 |   "author": "Vladimir Panteleev and the D Language Foundation",
11 |   "license": "BSL-1.0",
12 |   "dependencies": {
13 |     "nan": "^2.14.2"
14 |   },
15 |   "devDependencies": {
16 |     "tree-sitter-cli": "^0.20.0"
17 |   },
18 |   "scripts": {
19 |     "generate": "tree-sitter generate",
20 |     "test": "tree-sitter test"
21 |   },
22 |   "repository": {
23 |     "type": "git",
24 |     "url": "git+https://github.com/CyberShadow/tree-sitter-d.git"
25 |   },
26 |   "bugs": {
27 |     "url": "https://github.com/CyberShadow/tree-sitter-d/issues"
28 |   },
29 |   "homepage": "https://github.com/CyberShadow/tree-sitter-d#readme",
30 |   "tree-sitter": [
31 |     {
32 |       "scope": "source.d",
33 |       "file-types": [
34 |         "d",
35 |         "di"
36 |       ]
37 |     }
38 |   ]
39 | }
40 | 


--------------------------------------------------------------------------------
/src/scanner.cc:
--------------------------------------------------------------------------------
  1 | #include <tree_sitter/parser.h>
  2 | #include <vector>
  3 | 
  4 | enum TokenType { // External token kinds: used as indices into valid_symbols and as values for lexer->result_symbol.
  5 |   NESTING_BLOCK_COMMENT, // the /+ ... +/ comment form, which may nest
  6 |   DELIMITED_STRING, // the q"..." string form
  7 | }; // NOTE(review): order presumably has to match the externals list of the generated grammar - confirm
  8 | 
  9 | // Returns true for ASCII letters, ASCII digits and '_'. This is only an approximation of the exact definition.
 10 | static bool is_identifier_char(int32_t c) {
 11 |   return
 12 |     (c >= 'a' && c <= 'z') ||
 13 |     (c >= 'A' && c <= 'Z') ||
 14 |     (c >= '0' && c <= '9') || // digits are accepted in any position, including the first
 15 |     c == '_';
 16 | }
 17 | 
 18 | extern "C" {
 19 | 
 20 | void *tree_sitter_d_external_scanner_create() { // Allocates no scanner state; the payload is always NULL.
 21 |   return NULL;
 22 | }
 23 | 
 24 | bool tree_sitter_d_external_scanner_scan(void *payload, TSLexer *lexer, // Tries to scan one external token at the lexer position: a nesting block comment or a delimited string.
 25 |                                          const bool *valid_symbols) { // Returns true with lexer->result_symbol set on success, false otherwise; payload is unused.
 26 |   if (lexer->lookahead == '/' && valid_symbols[NESTING_BLOCK_COMMENT]) {
 27 |     lexer->advance(lexer, false);
 28 |     if (lexer->lookahead != '+') {
 29 |       return false; // a '/' that does not start "/+"
 30 |     }
 31 |     lexer->advance(lexer, false);
 32 | 
 33 |     size_t depth = 1; // number of "/+" openers not yet closed
 34 |     int32_t last = 0; // character consumed at the top of the current iteration
 35 |     while (depth > 0) {
 36 |       last = lexer->lookahead;
 37 |       lexer->advance(lexer, false);
 38 |       if (last == '/' && lexer->lookahead == '+') {
 39 |         depth++;
 40 |         last = 0;
 41 |         lexer->advance(lexer, false); // consume the '+' so it cannot also begin a "+/"
 42 |       } else if (last == '+' && lexer->lookahead == '/') {
 43 |         depth--;
 44 |         last = 0;
 45 |         lexer->advance(lexer, false); // consume the '/' so it cannot also begin a "/+"
 46 |       } else if (lexer->lookahead == 0) {
 47 |         return false; // EOF
 48 |       }
 49 |     }
 50 |     lexer->result_symbol = NESTING_BLOCK_COMMENT;
 51 |     return true;
 52 |   }
 53 | 
 54 |   if (lexer->lookahead == 'q' && valid_symbols[DELIMITED_STRING]) {
 55 |     lexer->advance(lexer, false);
 56 |     if (lexer->lookahead != '"') {
 57 |       return false; // a 'q' that does not start q"
 58 |     }
 59 |     lexer->advance(lexer, false);
 60 |     lexer->result_symbol = DELIMITED_STRING; // set up front; only meaningful on the paths that return true
 61 | 
 62 |     int32_t opener = lexer->lookahead, closer; // closer is assigned only for the four bracket openers
 63 |     switch (opener) {
 64 |       case '(': closer = ')'; break;
 65 |       case '[': closer = ']'; break;
 66 |       case '{': closer = '}'; break;
 67 |       case '<': closer = '>'; break;
 68 |       default:
 69 |       {
 70 |         // Handle the identifier case
 71 |         std::vector<int32_t> delimiter; // terminator to search for: '\n' + identifier + '"'
 72 |         delimiter.push_back('\n');
 73 |         while (lexer->lookahead != '\n') {
 74 |           if (!is_identifier_char(lexer->lookahead))
 75 |             return false; // bad syntax or EOF
 76 |           delimiter.push_back(lexer->lookahead);
 77 |           lexer->advance(lexer, false);
 78 |         }
 79 |         delimiter.push_back('"');
 80 | 
 81 |         size_t delimiter_pos = 0; // number of leading delimiter characters matched so far
 82 |         while (true) {
 83 |           if (delimiter_pos == delimiter.size())
 84 |             return true; // whole terminator consumed; tested before EOF so a string that ends the input is accepted
 85 |           if (lexer->lookahead == 0)
 86 |             return false; // EOF
 87 |           if (lexer->lookahead == delimiter.at(delimiter_pos))
 88 |             delimiter_pos++;
 89 |           else
 90 |             delimiter_pos = lexer->lookahead == delimiter.at(0) ? 1 : 0; // on mismatch, restart; a '\n' may itself begin a new match
 91 |           lexer->advance(lexer, false);
 92 |         }
 93 |       }
 94 |     }
 95 | 
 96 |     // Handle the punctuation case
 97 |     size_t depth = 1; // only the chosen opener/closer pair is counted; other brackets are ordinary characters
 98 |     while (depth > 0) {
 99 |       lexer->advance(lexer, false); // first iteration consumes the opener itself
100 |       if (lexer->lookahead == opener) {
101 |         depth++;
102 |       } else if (lexer->lookahead == closer) {
103 |         depth--;
104 |       } else if (lexer->lookahead == 0) {
105 |         return false; // EOF
106 |       }
107 |     }
108 |     lexer->advance(lexer, false); // last closer
109 |     if (lexer->lookahead != '"')
110 |       return false; // the outermost closer must be followed directly by the closing quote
111 |     lexer->advance(lexer, false); // "
112 |     return true;
113 |   }
114 | 
115 |   return false; // neither external token can start here
116 | }
117 | 
118 | unsigned tree_sitter_d_external_scanner_serialize(void *payload, char *buffer) { // Writes nothing to buffer and reports 0 bytes.
119 |   return 0;
120 | }
121 | 
122 | void tree_sitter_d_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) { // Intentionally empty: ignores all arguments, restores nothing.
123 | }
124 | 
125 | void tree_sitter_d_external_scanner_destroy(void *payload) { // Intentionally empty: the create function returns NULL, so there is nothing to free.
126 | }
127 | 
128 | }
129 | 


--------------------------------------------------------------------------------
/test/corpus/2_lex-13_floatliteral.txt:
--------------------------------------------------------------------------------
 1 | ====================
 2 | FloatLiteral + UFCS
 3 | ====================
 4 | 
 5 | float a = 1.0;
 6 | float a = 1.foo;
 7 | float a = 1. .foo;
 8 | 
 9 | ---
10 | 
11 | (source_file
12 |   (module
13 |     (decl_defs
14 | 
15 |       (var_declarations
16 |         (fundamental_type)
17 |         (declarators
18 |           (declarator_initializer
19 |             (var_declarator
20 |               (identifier))
21 |             (exp_initializer
22 |               (primary_expression
23 |                 (float_literal))))))
24 | 
25 |       (var_declarations
26 |         (fundamental_type)
27 |         (declarators
28 |           (declarator_initializer
29 |             (var_declarator
30 |               (identifier))
31 |             (exp_initializer
32 |               (postfix_expression
33 |                 (primary_expression
34 |                   (integer_literal))
35 |                 (identifier))))))
36 | 
37 |       (var_declarations
38 |         (fundamental_type)
39 |         (declarators
40 |           (declarator_initializer
41 |             (var_declarator
42 |               (identifier))
43 |             (exp_initializer
44 |               (postfix_expression
45 |                 (primary_expression
46 |                   (float_literal))
47 |                 (identifier))))))
48 |     )))
49 | 


--------------------------------------------------------------------------------
/test/corpus/2_lex-16_special_token_sequence.txt:
--------------------------------------------------------------------------------
 1 | ====================
 2 | SpecialTokenSequence
 3 | ====================
 4 | 
 5 | #line 1
 6 | # line 2
 7 | # line 3 "file.d"
 8 | 
 9 | ---
10 | 
11 | (source_file
12 |   (special_token_sequence (integer_literal))
13 |   (special_token_sequence (integer_literal))
14 |   (special_token_sequence (integer_literal) (filespec)))
15 | 


--------------------------------------------------------------------------------
/test/corpus/2_lex-1_source_text.txt:
--------------------------------------------------------------------------------
 1 | ====================
 2 | the empty file
 3 | ====================
 4 | 
 5 | ---
 6 | 
 7 | (source_file)
 8 | 
 9 | ====================
10 | Shebang
11 | ====================
12 | 
13 | #!/usr/bin/dmd -run
14 | 
15 | ---
16 | 
17 | (source_file
18 |   (shebang))
19 | 


--------------------------------------------------------------------------------
/test/corpus/2_lex-6_comment.txt:
--------------------------------------------------------------------------------
 1 | ====================
 2 | LineComment
 3 | ====================
 4 | 
 5 | // this is a comment
 6 | 
 7 | ---
 8 | 
 9 | (source_file
10 |   (line_comment))
11 | 
12 | ====================
13 | BlockComment
14 | ====================
15 | 
16 | /* this is a comment */
17 | 
18 | /* multi
19 |    line
20 |    comment */
21 | 
22 | /* no nesting /* */
23 | 
24 | /* no interior line comments // */
25 | 
26 | ---
27 | 
28 | (source_file
29 |   (block_comment)
30 |   (block_comment)
31 |   (block_comment)
32 |   (block_comment))
33 | 
34 | ====================
35 | BlockComment 2
36 | ====================
37 | 
38 | int i;
39 | 
40 | /***************************************************/
41 | 
42 | ---
43 | 
44 | (source_file
45 |   (module
46 |     (decl_defs
47 |       (var_declarations
48 |         (fundamental_type)
49 |         (declarators
50 |           (var_declarator
51 |             (identifier))))))
52 |   (block_comment))
53 | 
54 | ====================
55 | NestingBlockComment
56 | ====================
57 | 
58 | /+ this is a comment +/
59 | 
60 | /+ /+ nesting! +/ +/
61 | 
62 | /+ /* +/
63 | /+ */ +/
64 | /+ // +/
65 | 
66 | /+ /+/ +/ +/
67 | /+ /+ +/+ +/
68 | 
69 | int/+ +/a;
70 | 
71 | ---
72 | 
73 | (source_file
74 |   (nesting_block_comment)
75 |   (nesting_block_comment)
76 |   (nesting_block_comment)
77 |   (nesting_block_comment)
78 |   (nesting_block_comment)
79 |   (nesting_block_comment)
80 |   (nesting_block_comment)
81 | 
82 |   (module
83 | 	 (decl_defs
84 |     (var_declarations
85 |      (fundamental_type)
86 |      (nesting_block_comment)
87 |      (declarators
88 |       (var_declarator
89 |        (identifier)))))))
90 | 


--------------------------------------------------------------------------------
/test/corpus/2_lex-9_string_literals.txt:
--------------------------------------------------------------------------------
  1 | ====================
  2 | WysiwygString
  3 | ====================
  4 | 
  5 | x!r"Hello, world!" y;
  6 | x!r"`\" y;
  7 | x!r"
  8 | " y;
  9 | 
 10 | ---
 11 | 
 12 | (source_file
 13 |   (module
 14 |     (decl_defs
 15 | 
 16 |       (var_declarations
 17 |         (qualified_identifier
 18 |           (template_instance
 19 |             (identifier)
 20 |             (template_arguments
 21 |               (template_single_argument
 22 |                 (wysiwyg_string)))))
 23 |         (declarators
 24 |           (var_declarator
 25 |             (identifier))))
 26 | 
 27 |       (var_declarations
 28 |         (qualified_identifier
 29 |           (template_instance
 30 |             (identifier)
 31 |             (template_arguments
 32 |               (template_single_argument
 33 |                 (wysiwyg_string)))))
 34 |         (declarators
 35 |           (var_declarator
 36 |             (identifier))))
 37 | 
 38 |       (var_declarations
 39 |         (qualified_identifier
 40 |           (template_instance
 41 |             (identifier)
 42 |             (template_arguments
 43 |               (template_single_argument
 44 |                 (wysiwyg_string)))))
 45 |         (declarators
 46 |           (var_declarator
 47 |             (identifier)))))))
 48 | 
 49 | ====================
 50 | AlternateWysiwygString
 51 | ====================
 52 | 
 53 | x!`Hello, world!` y;
 54 | x!`\` y;
 55 | x!`
 56 | ` y;
 57 | 
 58 | ---
 59 | 
 60 | (source_file
 61 |   (module
 62 |     (decl_defs
 63 | 
 64 |       (var_declarations
 65 |         (qualified_identifier
 66 |           (template_instance
 67 |             (identifier)
 68 |             (template_arguments
 69 |               (template_single_argument
 70 |                 (alternate_wysiwyg_string)))))
 71 |         (declarators
 72 |           (var_declarator
 73 |             (identifier))))
 74 | 
 75 |       (var_declarations
 76 |         (qualified_identifier
 77 |           (template_instance
 78 |             (identifier)
 79 |             (template_arguments
 80 |               (template_single_argument
 81 |                 (alternate_wysiwyg_string)))))
 82 |         (declarators
 83 |           (var_declarator
 84 |             (identifier))))
 85 | 
 86 |       (var_declarations
 87 |         (qualified_identifier
 88 |           (template_instance
 89 |             (identifier)
 90 |             (template_arguments
 91 |               (template_single_argument
 92 |                 (alternate_wysiwyg_string)))))
 93 |         (declarators
 94 |           (var_declarator
 95 |             (identifier)))))))
 96 | 
 97 | ====================
 98 | DoubleQuotedString
 99 | ====================
100 | 
101 | x!"Hello, world!" y;
102 | x!"\"" y;
103 | x!"
104 | " y;
105 | 
106 | ---
107 | 
108 | (source_file
109 |   (module
110 |     (decl_defs
111 | 
112 |       (var_declarations
113 |         (qualified_identifier
114 |           (template_instance
115 |             (identifier)
116 |             (template_arguments
117 |               (template_single_argument
118 |                 (double_quoted_string)))))
119 |         (declarators
120 |           (var_declarator
121 |             (identifier))))
122 | 
123 |       (var_declarations
124 |         (qualified_identifier
125 |           (template_instance
126 |             (identifier)
127 |             (template_arguments
128 |               (template_single_argument
129 |                 (double_quoted_string)))))
130 |         (declarators
131 |           (var_declarator
132 |             (identifier))))
133 | 
134 |       (var_declarations
135 |         (qualified_identifier
136 |           (template_instance
137 |             (identifier)
138 |             (template_arguments
139 |               (template_single_argument
140 |                 (double_quoted_string)))))
141 |         (declarators
142 |           (var_declarator
143 |             (identifier)))))))
144 | 
145 | ====================
146 | DelimitedString
147 | ====================
148 | 
149 | x!q"EOF
150 | Hello, world!
151 | Not the end: EOF"
152 | EOF: also not the end
153 | Also not the end:
154 | EOF
155 | The real end:
156 | EOF" y;
157 | 
158 | x!q"( ( [ ) < { )" y;
159 | x!q"[ [ ( ] < { ]" y;
160 | x!q"{ { [ } < ) }" y;
161 | x!q"< < ( > [ { >" y;
162 | 
163 | ---
164 | 
165 | (source_file
166 |   (module
167 |     (decl_defs
168 | 
169 |       (var_declarations
170 |         (qualified_identifier
171 |           (template_instance
172 |             (identifier)
173 |             (template_arguments
174 |               (template_single_argument
175 |                 (delimited_string)))))
176 |         (declarators
177 |           (var_declarator
178 |             (identifier))))
179 | 
180 |       (var_declarations
181 |         (qualified_identifier
182 |           (template_instance
183 |             (identifier)
184 |             (template_arguments
185 |               (template_single_argument
186 |                 (delimited_string)))))
187 |         (declarators
188 |           (var_declarator
189 |             (identifier))))
190 | 
191 |       (var_declarations
192 |         (qualified_identifier
193 |           (template_instance
194 |             (identifier)
195 |             (template_arguments
196 |               (template_single_argument
197 |                 (delimited_string)))))
198 |         (declarators
199 |           (var_declarator
200 |             (identifier))))
201 | 
202 |       (var_declarations
203 |         (qualified_identifier
204 |           (template_instance
205 |             (identifier)
206 |             (template_arguments
207 |               (template_single_argument
208 |                 (delimited_string)))))
209 |         (declarators
210 |           (var_declarator
211 |             (identifier))))
212 | 
213 |       (var_declarations
214 |         (qualified_identifier
215 |           (template_instance
216 |             (identifier)
217 |             (template_arguments
218 |               (template_single_argument
219 |                 (delimited_string)))))
220 |         (declarators
221 |           (var_declarator
222 |             (identifier)))))))
223 | 
224 | 


--------------------------------------------------------------------------------
/test/corpus/30_iasm-11_gcc.txt:
--------------------------------------------------------------------------------
 1 | ====================
 2 | GccExtAsmInstruction
 3 | ====================
 4 | 
 5 | void cpuid()
 6 | {
 7 |     uint u;
 8 |     asm { "cpuid" : "=eax" (u) : "eax" (0x8000_0000) : "ebx", "ecx", "edx"; }
 9 | }
10 | 
11 | ---
12 | 
13 | (source_file
14 |  (module
15 |   (decl_defs
16 |    (func_declaration
17 |     (fundamental_type)
18 |     (func_declarator
19 |      (identifier)
20 |      (func_declarator_suffix
21 |       (parameters)))
22 | 
23 |     (specified_function_body
24 |      (block_statement
25 |       (statement_list
26 |        (declaration_statement
27 |         (var_declarations
28 |          (fundamental_type)
29 |          (declarators
30 |           (var_declarator
31 |            (identifier)))))
32 | 
33 |        (gcc_asm_statement
34 |         (gcc_asm_instruction_list
35 |          (gcc_ext_asm_instruction
36 |           (primary_expression
37 |            (string_literals
38 |             (double_quoted_string)))
39 | 
40 |           (gcc_asm_operands
41 |            (double_quoted_string)
42 |            (primary_expression
43 |             (identifier)))
44 | 
45 |           (gcc_asm_operands
46 |            (double_quoted_string)
47 |            (primary_expression
48 |             (integer_literal)))
49 | 
50 |           (gcc_asm_clobbers
51 |            (double_quoted_string)
52 |            (gcc_asm_clobbers
53 |             (double_quoted_string)
54 |             (double_quoted_string)))))))))))))
55 | 


--------------------------------------------------------------------------------
/test/parse-success-xfail.txt:
--------------------------------------------------------------------------------
 1 | # TODO
 2 | test/parse-success/dmd/runnable/complex.d
 3 | test/parse-success/dmd/runnable/helloUTF16BE.d
 4 | test/parse-success/dmd/runnable/helloUTF16.d
 5 | test/parse-success/dmd/runnable/sdtor.d
 6 | test/parse-success/dmd/runnable/test15.d
 7 | test/parse-success/dmd/runnable/testcontracts.d
 8 | test/parse-success/dmd/runnable/testUTF32.d
 9 | test/parse-success/dmd/runnable/uda.d
10 | 


--------------------------------------------------------------------------------
/test/parse-success/dmd/compilable:
--------------------------------------------------------------------------------
1 | ../../repos/dmd/test/compilable


--------------------------------------------------------------------------------
/test/parse-success/dmd/runnable:
--------------------------------------------------------------------------------
1 | ../../repos/dmd/test/runnable


--------------------------------------------------------------------------------
/test/parse-success/dmd_asm.d:
--------------------------------------------------------------------------------
1 | void fun()
2 | {
3 | 	asm
4 | 	{
5 | 		int 80;
6 | 	}
7 | }
8 | 


--------------------------------------------------------------------------------
/test/repos/README.md:
--------------------------------------------------------------------------------
1 | D projects which we use for testing the parser are referenced as submodules here.
2 | 


--------------------------------------------------------------------------------
/test/tmp/.gitignore:
--------------------------------------------------------------------------------
1 | *
2 | !/.gitignore
3 | 


--------------------------------------------------------------------------------