├── .gitignore
├── CHANGELOG.md
├── Cargo.lock
├── Cargo.toml
├── LICENSE
├── README.md
├── README.tpl
├── benches
    ├── full_parse_reflexives.rs
    └── reflexives.fgr
├── cli
    ├── Cargo.toml
    └── src
    │   └── main.rs
├── examples
    ├── asl-wordorder.fgr
    ├── dative-shift.fgr
    ├── no-features.fgr
    └── reflexives.fgr
├── rustfmt.toml
└── src
    ├── earley.rs
    ├── featurestructure
        ├── mod.rs
        ├── node.rs
        └── serialized.rs
    ├── fgr
        ├── mod.rs
        └── parse_grammar.rs
    ├── forest.rs
    ├── lib.rs
    ├── rules.rs
    ├── syntree.rs
    └── utils.rs


/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | **/*.rs.bk
3 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Changelog
 2 | 
 3 | ## 0.1.2
 4 | 
 5 | Changed all uses of `Rc` into `Arc`, for multi-threaded use.
 6 | 
 7 | ## 0.1.1
 8 | 
 9 | - Added `From<NodeRef>` implementation for `HashMap<String, String>` that gives
10 |   an easier way to work with the DAG, if you don't care about forwarding
11 |   relationships.
12 | 
13 | ## 0.1.0
14 | 
15 | - Initial release.
16 | 


--------------------------------------------------------------------------------
/Cargo.lock:
--------------------------------------------------------------------------------
  1 | # This file is automatically @generated by Cargo.
  2 | # It is not intended for manual editing.
  3 | version = 4
  4 | 
  5 | [[package]]
  6 | name = "aho-corasick"
  7 | version = "1.1.3"
  8 | source = "registry+https://github.com/rust-lang/crates.io-index"
  9 | checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916"
 10 | dependencies = [
 11 |  "memchr",
 12 | ]
 13 | 
 14 | [[package]]
 15 | name = "anes"
 16 | version = "0.1.6"
 17 | source = "registry+https://github.com/rust-lang/crates.io-index"
 18 | checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299"
 19 | 
 20 | [[package]]
 21 | name = "anstyle"
 22 | version = "1.0.10"
 23 | source = "registry+https://github.com/rust-lang/crates.io-index"
 24 | checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9"
 25 | 
 26 | [[package]]
 27 | name = "autocfg"
 28 | version = "1.4.0"
 29 | source = "registry+https://github.com/rust-lang/crates.io-index"
 30 | checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
 31 | 
 32 | [[package]]
 33 | name = "bumpalo"
 34 | version = "3.17.0"
 35 | source = "registry+https://github.com/rust-lang/crates.io-index"
 36 | checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf"
 37 | 
 38 | [[package]]
 39 | name = "cast"
 40 | version = "0.3.0"
 41 | source = "registry+https://github.com/rust-lang/crates.io-index"
 42 | checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
 43 | 
 44 | [[package]]
 45 | name = "cfg-if"
 46 | version = "1.0.0"
 47 | source = "registry+https://github.com/rust-lang/crates.io-index"
 48 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 49 | 
 50 | [[package]]
 51 | name = "ciborium"
 52 | version = "0.2.2"
 53 | source = "registry+https://github.com/rust-lang/crates.io-index"
 54 | checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e"
 55 | dependencies = [
 56 |  "ciborium-io",
 57 |  "ciborium-ll",
 58 |  "serde",
 59 | ]
 60 | 
 61 | [[package]]
 62 | name = "ciborium-io"
 63 | version = "0.2.2"
 64 | source = "registry+https://github.com/rust-lang/crates.io-index"
 65 | checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757"
 66 | 
 67 | [[package]]
 68 | name = "ciborium-ll"
 69 | version = "0.2.2"
 70 | source = "registry+https://github.com/rust-lang/crates.io-index"
 71 | checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9"
 72 | dependencies = [
 73 |  "ciborium-io",
 74 |  "half",
 75 | ]
 76 | 
 77 | [[package]]
 78 | name = "clap"
 79 | version = "4.5.32"
 80 | source = "registry+https://github.com/rust-lang/crates.io-index"
 81 | checksum = "6088f3ae8c3608d19260cd7445411865a485688711b78b5be70d78cd96136f83"
 82 | dependencies = [
 83 |  "clap_builder",
 84 | ]
 85 | 
 86 | [[package]]
 87 | name = "clap_builder"
 88 | version = "4.5.32"
 89 | source = "registry+https://github.com/rust-lang/crates.io-index"
 90 | checksum = "22a7ef7f676155edfb82daa97f99441f3ebf4a58d5e32f295a56259f1b6facc8"
 91 | dependencies = [
 92 |  "anstyle",
 93 |  "clap_lex",
 94 | ]
 95 | 
 96 | [[package]]
 97 | name = "clap_lex"
 98 | version = "0.7.4"
 99 | source = "registry+https://github.com/rust-lang/crates.io-index"
100 | checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6"
101 | 
102 | [[package]]
103 | name = "cli"
104 | version = "0.2.0"
105 | dependencies = [
106 |  "tracing-subscriber",
107 |  "treebender",
108 | ]
109 | 
110 | [[package]]
111 | name = "criterion"
112 | version = "0.5.1"
113 | source = "registry+https://github.com/rust-lang/crates.io-index"
114 | checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f"
115 | dependencies = [
116 |  "anes",
117 |  "cast",
118 |  "ciborium",
119 |  "clap",
120 |  "criterion-plot",
121 |  "is-terminal",
122 |  "itertools",
123 |  "num-traits",
124 |  "once_cell",
125 |  "oorandom",
126 |  "plotters",
127 |  "rayon",
128 |  "regex",
129 |  "serde",
130 |  "serde_derive",
131 |  "serde_json",
132 |  "tinytemplate",
133 |  "walkdir",
134 | ]
135 | 
136 | [[package]]
137 | name = "criterion-plot"
138 | version = "0.5.0"
139 | source = "registry+https://github.com/rust-lang/crates.io-index"
140 | checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1"
141 | dependencies = [
142 |  "cast",
143 |  "itertools",
144 | ]
145 | 
146 | [[package]]
147 | name = "crossbeam-deque"
148 | version = "0.8.6"
149 | source = "registry+https://github.com/rust-lang/crates.io-index"
150 | checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
151 | dependencies = [
152 |  "crossbeam-epoch",
153 |  "crossbeam-utils",
154 | ]
155 | 
156 | [[package]]
157 | name = "crossbeam-epoch"
158 | version = "0.9.18"
159 | source = "registry+https://github.com/rust-lang/crates.io-index"
160 | checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
161 | dependencies = [
162 |  "crossbeam-utils",
163 | ]
164 | 
165 | [[package]]
166 | name = "crossbeam-utils"
167 | version = "0.8.21"
168 | source = "registry+https://github.com/rust-lang/crates.io-index"
169 | checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
170 | 
171 | [[package]]
172 | name = "crunchy"
173 | version = "0.2.3"
174 | source = "registry+https://github.com/rust-lang/crates.io-index"
175 | checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929"
176 | 
177 | [[package]]
178 | name = "either"
179 | version = "1.15.0"
180 | source = "registry+https://github.com/rust-lang/crates.io-index"
181 | checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
182 | 
183 | [[package]]
184 | name = "half"
185 | version = "2.4.1"
186 | source = "registry+https://github.com/rust-lang/crates.io-index"
187 | checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888"
188 | dependencies = [
189 |  "cfg-if",
190 |  "crunchy",
191 | ]
192 | 
193 | [[package]]
194 | name = "hermit-abi"
195 | version = "0.5.0"
196 | source = "registry+https://github.com/rust-lang/crates.io-index"
197 | checksum = "fbd780fe5cc30f81464441920d82ac8740e2e46b29a6fad543ddd075229ce37e"
198 | 
199 | [[package]]
200 | name = "is-terminal"
201 | version = "0.4.16"
202 | source = "registry+https://github.com/rust-lang/crates.io-index"
203 | checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9"
204 | dependencies = [
205 |  "hermit-abi",
206 |  "libc",
207 |  "windows-sys",
208 | ]
209 | 
210 | [[package]]
211 | name = "itertools"
212 | version = "0.10.5"
213 | source = "registry+https://github.com/rust-lang/crates.io-index"
214 | checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473"
215 | dependencies = [
216 |  "either",
217 | ]
218 | 
219 | [[package]]
220 | name = "itoa"
221 | version = "1.0.15"
222 | source = "registry+https://github.com/rust-lang/crates.io-index"
223 | checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
224 | 
225 | [[package]]
226 | name = "js-sys"
227 | version = "0.3.77"
228 | source = "registry+https://github.com/rust-lang/crates.io-index"
229 | checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f"
230 | dependencies = [
231 |  "once_cell",
232 |  "wasm-bindgen",
233 | ]
234 | 
235 | [[package]]
236 | name = "lazy_static"
237 | version = "1.5.0"
238 | source = "registry+https://github.com/rust-lang/crates.io-index"
239 | checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
240 | 
241 | [[package]]
242 | name = "libc"
243 | version = "0.2.170"
244 | source = "registry+https://github.com/rust-lang/crates.io-index"
245 | checksum = "875b3680cb2f8f71bdcf9a30f38d48282f5d3c95cbf9b3fa57269bb5d5c06828"
246 | 
247 | [[package]]
248 | name = "log"
249 | version = "0.4.26"
250 | source = "registry+https://github.com/rust-lang/crates.io-index"
251 | checksum = "30bde2b3dc3671ae49d8e2e9f044c7c005836e7a023ee57cffa25ab82764bb9e"
252 | 
253 | [[package]]
254 | name = "matchers"
255 | version = "0.1.0"
256 | source = "registry+https://github.com/rust-lang/crates.io-index"
257 | checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558"
258 | dependencies = [
259 |  "regex-automata 0.1.10",
260 | ]
261 | 
262 | [[package]]
263 | name = "memchr"
264 | version = "2.7.4"
265 | source = "registry+https://github.com/rust-lang/crates.io-index"
266 | checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
267 | 
268 | [[package]]
269 | name = "nu-ansi-term"
270 | version = "0.46.0"
271 | source = "registry+https://github.com/rust-lang/crates.io-index"
272 | checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84"
273 | dependencies = [
274 |  "overload",
275 |  "winapi",
276 | ]
277 | 
278 | [[package]]
279 | name = "num-traits"
280 | version = "0.2.19"
281 | source = "registry+https://github.com/rust-lang/crates.io-index"
282 | checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
283 | dependencies = [
284 |  "autocfg",
285 | ]
286 | 
287 | [[package]]
288 | name = "once_cell"
289 | version = "1.21.0"
290 | source = "registry+https://github.com/rust-lang/crates.io-index"
291 | checksum = "cde51589ab56b20a6f686b2c68f7a0bd6add753d697abf720d63f8db3ab7b1ad"
292 | 
293 | [[package]]
294 | name = "oorandom"
295 | version = "11.1.5"
296 | source = "registry+https://github.com/rust-lang/crates.io-index"
297 | checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e"
298 | 
299 | [[package]]
300 | name = "overload"
301 | version = "0.1.1"
302 | source = "registry+https://github.com/rust-lang/crates.io-index"
303 | checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
304 | 
305 | [[package]]
306 | name = "pin-project-lite"
307 | version = "0.2.16"
308 | source = "registry+https://github.com/rust-lang/crates.io-index"
309 | checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b"
310 | 
311 | [[package]]
312 | name = "plotters"
313 | version = "0.3.7"
314 | source = "registry+https://github.com/rust-lang/crates.io-index"
315 | checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747"
316 | dependencies = [
317 |  "num-traits",
318 |  "plotters-backend",
319 |  "plotters-svg",
320 |  "wasm-bindgen",
321 |  "web-sys",
322 | ]
323 | 
324 | [[package]]
325 | name = "plotters-backend"
326 | version = "0.3.7"
327 | source = "registry+https://github.com/rust-lang/crates.io-index"
328 | checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a"
329 | 
330 | [[package]]
331 | name = "plotters-svg"
332 | version = "0.3.7"
333 | source = "registry+https://github.com/rust-lang/crates.io-index"
334 | checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670"
335 | dependencies = [
336 |  "plotters-backend",
337 | ]
338 | 
339 | [[package]]
340 | name = "proc-macro2"
341 | version = "1.0.94"
342 | source = "registry+https://github.com/rust-lang/crates.io-index"
343 | checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84"
344 | dependencies = [
345 |  "unicode-ident",
346 | ]
347 | 
348 | [[package]]
349 | name = "quote"
350 | version = "1.0.39"
351 | source = "registry+https://github.com/rust-lang/crates.io-index"
352 | checksum = "c1f1914ce909e1658d9907913b4b91947430c7d9be598b15a1912935b8c04801"
353 | dependencies = [
354 |  "proc-macro2",
355 | ]
356 | 
357 | [[package]]
358 | name = "rayon"
359 | version = "1.10.0"
360 | source = "registry+https://github.com/rust-lang/crates.io-index"
361 | checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
362 | dependencies = [
363 |  "either",
364 |  "rayon-core",
365 | ]
366 | 
367 | [[package]]
368 | name = "rayon-core"
369 | version = "1.12.1"
370 | source = "registry+https://github.com/rust-lang/crates.io-index"
371 | checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
372 | dependencies = [
373 |  "crossbeam-deque",
374 |  "crossbeam-utils",
375 | ]
376 | 
377 | [[package]]
378 | name = "regex"
379 | version = "1.11.1"
380 | source = "registry+https://github.com/rust-lang/crates.io-index"
381 | checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
382 | dependencies = [
383 |  "aho-corasick",
384 |  "memchr",
385 |  "regex-automata 0.4.9",
386 |  "regex-syntax 0.8.5",
387 | ]
388 | 
389 | [[package]]
390 | name = "regex-automata"
391 | version = "0.1.10"
392 | source = "registry+https://github.com/rust-lang/crates.io-index"
393 | checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
394 | dependencies = [
395 |  "regex-syntax 0.6.29",
396 | ]
397 | 
398 | [[package]]
399 | name = "regex-automata"
400 | version = "0.4.9"
401 | source = "registry+https://github.com/rust-lang/crates.io-index"
402 | checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
403 | dependencies = [
404 |  "aho-corasick",
405 |  "memchr",
406 |  "regex-syntax 0.8.5",
407 | ]
408 | 
409 | [[package]]
410 | name = "regex-syntax"
411 | version = "0.6.29"
412 | source = "registry+https://github.com/rust-lang/crates.io-index"
413 | checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
414 | 
415 | [[package]]
416 | name = "regex-syntax"
417 | version = "0.8.5"
418 | source = "registry+https://github.com/rust-lang/crates.io-index"
419 | checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
420 | 
421 | [[package]]
422 | name = "rustversion"
423 | version = "1.0.20"
424 | source = "registry+https://github.com/rust-lang/crates.io-index"
425 | checksum = "eded382c5f5f786b989652c49544c4877d9f015cc22e145a5ea8ea66c2921cd2"
426 | 
427 | [[package]]
428 | name = "ryu"
429 | version = "1.0.20"
430 | source = "registry+https://github.com/rust-lang/crates.io-index"
431 | checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f"
432 | 
433 | [[package]]
434 | name = "same-file"
435 | version = "1.0.6"
436 | source = "registry+https://github.com/rust-lang/crates.io-index"
437 | checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502"
438 | dependencies = [
439 |  "winapi-util",
440 | ]
441 | 
442 | [[package]]
443 | name = "serde"
444 | version = "1.0.219"
445 | source = "registry+https://github.com/rust-lang/crates.io-index"
446 | checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6"
447 | dependencies = [
448 |  "serde_derive",
449 | ]
450 | 
451 | [[package]]
452 | name = "serde_derive"
453 | version = "1.0.219"
454 | source = "registry+https://github.com/rust-lang/crates.io-index"
455 | checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00"
456 | dependencies = [
457 |  "proc-macro2",
458 |  "quote",
459 |  "syn",
460 | ]
461 | 
462 | [[package]]
463 | name = "serde_json"
464 | version = "1.0.140"
465 | source = "registry+https://github.com/rust-lang/crates.io-index"
466 | checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373"
467 | dependencies = [
468 |  "itoa",
469 |  "memchr",
470 |  "ryu",
471 |  "serde",
472 | ]
473 | 
474 | [[package]]
475 | name = "sharded-slab"
476 | version = "0.1.7"
477 | source = "registry+https://github.com/rust-lang/crates.io-index"
478 | checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6"
479 | dependencies = [
480 |  "lazy_static",
481 | ]
482 | 
483 | [[package]]
484 | name = "smallvec"
485 | version = "1.14.0"
486 | source = "registry+https://github.com/rust-lang/crates.io-index"
487 | checksum = "7fcf8323ef1faaee30a44a340193b1ac6814fd9b7b4e88e9d4519a3e4abe1cfd"
488 | 
489 | [[package]]
490 | name = "syn"
491 | version = "2.0.100"
492 | source = "registry+https://github.com/rust-lang/crates.io-index"
493 | checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0"
494 | dependencies = [
495 |  "proc-macro2",
496 |  "quote",
497 |  "unicode-ident",
498 | ]
499 | 
500 | [[package]]
501 | name = "thread_local"
502 | version = "1.1.8"
503 | source = "registry+https://github.com/rust-lang/crates.io-index"
504 | checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c"
505 | dependencies = [
506 |  "cfg-if",
507 |  "once_cell",
508 | ]
509 | 
510 | [[package]]
511 | name = "tinytemplate"
512 | version = "1.2.1"
513 | source = "registry+https://github.com/rust-lang/crates.io-index"
514 | checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc"
515 | dependencies = [
516 |  "serde",
517 |  "serde_json",
518 | ]
519 | 
520 | [[package]]
521 | name = "tracing"
522 | version = "0.1.41"
523 | source = "registry+https://github.com/rust-lang/crates.io-index"
524 | checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0"
525 | dependencies = [
526 |  "pin-project-lite",
527 |  "tracing-attributes",
528 |  "tracing-core",
529 | ]
530 | 
531 | [[package]]
532 | name = "tracing-attributes"
533 | version = "0.1.28"
534 | source = "registry+https://github.com/rust-lang/crates.io-index"
535 | checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d"
536 | dependencies = [
537 |  "proc-macro2",
538 |  "quote",
539 |  "syn",
540 | ]
541 | 
542 | [[package]]
543 | name = "tracing-core"
544 | version = "0.1.33"
545 | source = "registry+https://github.com/rust-lang/crates.io-index"
546 | checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c"
547 | dependencies = [
548 |  "once_cell",
549 |  "valuable",
550 | ]
551 | 
552 | [[package]]
553 | name = "tracing-log"
554 | version = "0.2.0"
555 | source = "registry+https://github.com/rust-lang/crates.io-index"
556 | checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3"
557 | dependencies = [
558 |  "log",
559 |  "once_cell",
560 |  "tracing-core",
561 | ]
562 | 
563 | [[package]]
564 | name = "tracing-subscriber"
565 | version = "0.3.19"
566 | source = "registry+https://github.com/rust-lang/crates.io-index"
567 | checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008"
568 | dependencies = [
569 |  "matchers",
570 |  "nu-ansi-term",
571 |  "once_cell",
572 |  "regex",
573 |  "sharded-slab",
574 |  "smallvec",
575 |  "thread_local",
576 |  "tracing",
577 |  "tracing-core",
578 |  "tracing-log",
579 | ]
580 | 
581 | [[package]]
582 | name = "treebender"
583 | version = "0.2.0"
584 | dependencies = [
585 |  "criterion",
586 |  "lazy_static",
587 |  "regex",
588 |  "tracing",
589 | ]
590 | 
591 | [[package]]
592 | name = "unicode-ident"
593 | version = "1.0.18"
594 | source = "registry+https://github.com/rust-lang/crates.io-index"
595 | checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
596 | 
597 | [[package]]
598 | name = "valuable"
599 | version = "0.1.1"
600 | source = "registry+https://github.com/rust-lang/crates.io-index"
601 | checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65"
602 | 
603 | [[package]]
604 | name = "walkdir"
605 | version = "2.5.0"
606 | source = "registry+https://github.com/rust-lang/crates.io-index"
607 | checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b"
608 | dependencies = [
609 |  "same-file",
610 |  "winapi-util",
611 | ]
612 | 
613 | [[package]]
614 | name = "wasm-bindgen"
615 | version = "0.2.100"
616 | source = "registry+https://github.com/rust-lang/crates.io-index"
617 | checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5"
618 | dependencies = [
619 |  "cfg-if",
620 |  "once_cell",
621 |  "rustversion",
622 |  "wasm-bindgen-macro",
623 | ]
624 | 
625 | [[package]]
626 | name = "wasm-bindgen-backend"
627 | version = "0.2.100"
628 | source = "registry+https://github.com/rust-lang/crates.io-index"
629 | checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6"
630 | dependencies = [
631 |  "bumpalo",
632 |  "log",
633 |  "proc-macro2",
634 |  "quote",
635 |  "syn",
636 |  "wasm-bindgen-shared",
637 | ]
638 | 
639 | [[package]]
640 | name = "wasm-bindgen-macro"
641 | version = "0.2.100"
642 | source = "registry+https://github.com/rust-lang/crates.io-index"
643 | checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407"
644 | dependencies = [
645 |  "quote",
646 |  "wasm-bindgen-macro-support",
647 | ]
648 | 
649 | [[package]]
650 | name = "wasm-bindgen-macro-support"
651 | version = "0.2.100"
652 | source = "registry+https://github.com/rust-lang/crates.io-index"
653 | checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de"
654 | dependencies = [
655 |  "proc-macro2",
656 |  "quote",
657 |  "syn",
658 |  "wasm-bindgen-backend",
659 |  "wasm-bindgen-shared",
660 | ]
661 | 
662 | [[package]]
663 | name = "wasm-bindgen-shared"
664 | version = "0.2.100"
665 | source = "registry+https://github.com/rust-lang/crates.io-index"
666 | checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d"
667 | dependencies = [
668 |  "unicode-ident",
669 | ]
670 | 
671 | [[package]]
672 | name = "web-sys"
673 | version = "0.3.77"
674 | source = "registry+https://github.com/rust-lang/crates.io-index"
675 | checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2"
676 | dependencies = [
677 |  "js-sys",
678 |  "wasm-bindgen",
679 | ]
680 | 
681 | [[package]]
682 | name = "winapi"
683 | version = "0.3.9"
684 | source = "registry+https://github.com/rust-lang/crates.io-index"
685 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
686 | dependencies = [
687 |  "winapi-i686-pc-windows-gnu",
688 |  "winapi-x86_64-pc-windows-gnu",
689 | ]
690 | 
691 | [[package]]
692 | name = "winapi-i686-pc-windows-gnu"
693 | version = "0.4.0"
694 | source = "registry+https://github.com/rust-lang/crates.io-index"
695 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
696 | 
697 | [[package]]
698 | name = "winapi-util"
699 | version = "0.1.9"
700 | source = "registry+https://github.com/rust-lang/crates.io-index"
701 | checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb"
702 | dependencies = [
703 |  "windows-sys",
704 | ]
705 | 
706 | [[package]]
707 | name = "winapi-x86_64-pc-windows-gnu"
708 | version = "0.4.0"
709 | source = "registry+https://github.com/rust-lang/crates.io-index"
710 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
711 | 
712 | [[package]]
713 | name = "windows-sys"
714 | version = "0.59.0"
715 | source = "registry+https://github.com/rust-lang/crates.io-index"
716 | checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
717 | dependencies = [
718 |  "windows-targets",
719 | ]
720 | 
721 | [[package]]
722 | name = "windows-targets"
723 | version = "0.52.6"
724 | source = "registry+https://github.com/rust-lang/crates.io-index"
725 | checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
726 | dependencies = [
727 |  "windows_aarch64_gnullvm",
728 |  "windows_aarch64_msvc",
729 |  "windows_i686_gnu",
730 |  "windows_i686_gnullvm",
731 |  "windows_i686_msvc",
732 |  "windows_x86_64_gnu",
733 |  "windows_x86_64_gnullvm",
734 |  "windows_x86_64_msvc",
735 | ]
736 | 
737 | [[package]]
738 | name = "windows_aarch64_gnullvm"
739 | version = "0.52.6"
740 | source = "registry+https://github.com/rust-lang/crates.io-index"
741 | checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
742 | 
743 | [[package]]
744 | name = "windows_aarch64_msvc"
745 | version = "0.52.6"
746 | source = "registry+https://github.com/rust-lang/crates.io-index"
747 | checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
748 | 
749 | [[package]]
750 | name = "windows_i686_gnu"
751 | version = "0.52.6"
752 | source = "registry+https://github.com/rust-lang/crates.io-index"
753 | checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
754 | 
755 | [[package]]
756 | name = "windows_i686_gnullvm"
757 | version = "0.52.6"
758 | source = "registry+https://github.com/rust-lang/crates.io-index"
759 | checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
760 | 
761 | [[package]]
762 | name = "windows_i686_msvc"
763 | version = "0.52.6"
764 | source = "registry+https://github.com/rust-lang/crates.io-index"
765 | checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
766 | 
767 | [[package]]
768 | name = "windows_x86_64_gnu"
769 | version = "0.52.6"
770 | source = "registry+https://github.com/rust-lang/crates.io-index"
771 | checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
772 | 
773 | [[package]]
774 | name = "windows_x86_64_gnullvm"
775 | version = "0.52.6"
776 | source = "registry+https://github.com/rust-lang/crates.io-index"
777 | checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
778 | 
779 | [[package]]
780 | name = "windows_x86_64_msvc"
781 | version = "0.52.6"
782 | source = "registry+https://github.com/rust-lang/crates.io-index"
783 | checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
784 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "treebender"
 3 | version = "0.2.0"
 4 | authors = ["Theia Vogel <theia@vgel.me>"]
 5 | edition = "2024"
 6 | description = "An HDPSG inspired symbolic NLP library for Rust"
 7 | repository = "https://github.com/vgel/treebender"
 8 | license = "MIT"
 9 | keywords = ["nlp", "parsing", "earley", "syntax", "hdpsg"]
10 | categories = ["science", "text-processing"]
11 | 
12 | [badges]
13 | maintenance = { status = "experimental" }
14 | 
15 | [workspace]
16 | resolver = "2"
17 | members = ["cli"]
18 | 
19 | [dependencies]
20 | regex = "1"
21 | lazy_static = "1"
22 | tracing = "0.1.41"
23 | 
24 | [dev-dependencies]
25 | criterion = "0.5"
26 | 
27 | [[bench]]
28 | name = "full_parse_reflexives"
29 | harness = false
30 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Theia Vogel
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | [![Crates.io](https://img.shields.io/crates/v/treebender.svg)](https://crates.io/crates/treebender)
  2 | ![Maintenance](https://img.shields.io/badge/maintenance-experimental-blue.svg)
  3 | 
  4 | # Treebender
  5 | 
  6 | A symbolic natural language parsing library for Rust, inspired by
  7 | [HPSG](https://en.wikipedia.org/wiki/Head-driven_phrase_structure_grammar).
  8 | 
  9 | ## What is this?
 10 | This is a library for parsing natural or constructed languages into syntax trees
 11 | and feature structures. There are no machine learning or probabilistic models;
 12 | everything is hand-crafted and deterministic.
 13 | 
 14 | You can find out more about the motivations of this project in
 15 | [this blog post](https://vgel.me/posts/symbolic-linguistics-part1/).
 16 | 
 17 | ### But what are you using it for?
 18 | I'm using this to parse a constructed language for my upcoming xenolinguistics
 19 | game, [Themengi](https://vgel.me/themengi/).
 20 | 
 21 | ## Motivation
 22 | Using a simple 80-line grammar, introduced in the tutorial below, we can parse
 23 | a simple subset of English, checking reflexive pronoun binding, case, and
 24 | number agreement.
 25 | 
 26 | ```
 27 | $ cargo run --bin cli examples/reflexives.fgr
 28 | > she likes himself
 29 | Parsed 0 trees
 30 | 
 31 | > her likes herself
 32 | Parsed 0 trees
 33 | 
 34 | > she like herself
 35 | Parsed 0 trees
 36 | 
 37 | > she likes herself
 38 | Parsed 1 tree
 39 | (0..3: S
 40 |   (0..1: N (0..1: she))
 41 |   (1..2: TV (1..2: likes))
 42 |   (2..3: N (2..3: herself)))
 43 | [
 44 |   child-2: [
 45 |     case: acc
 46 |     pron: ref
 47 |     needs_pron: #0 she
 48 |     num: sg
 49 |     child-0: [ word: herself ]
 50 |   ]
 51 |   child-1: [
 52 |     tense: nonpast
 53 |     child-0: [ word: likes ]
 54 |     num: #1 sg
 55 |   ]
 56 |   child-0: [
 57 |     child-0: [ word: she ]
 58 |     case: nom
 59 |     pron: #0
 60 |     num: #1
 61 |   ]
 62 | ]
 63 | ```
 64 | 
 65 | Low resource language? Low problem! No need to train on gigabytes of text, just
 66 | write a grammar using your brain. Let's hypothesize that in
 67 | American Sign Language, topicalized nouns (expressed with raised eyebrows)
 68 | must appear first in the sentence. We can write a small grammar (18 lines),
 69 | and plug in some sentences:
 70 | 
 71 | ```
 72 | $ cargo run --bin cli examples/asl-wordorder.fgr -n
 73 | > boy sit
 74 | Parsed 1 tree
 75 | (0..2: S
 76 |   (0..1: NP ((0..1: N (0..1: boy))))
 77 |   (1..2: IV (1..2: sit)))
 78 | 
 79 | > boy throw ball
 80 | Parsed 1 tree
 81 | (0..3: S
 82 |   (0..1: NP ((0..1: N (0..1: boy))))
 83 |   (1..2: TV (1..2: throw))
 84 |   (2..3: NP ((2..3: N (2..3: ball)))))
 85 | 
 86 | > ball nm-raised-eyebrows boy throw
 87 | Parsed 1 tree
 88 | (0..4: S
 89 |   (0..2: NP
 90 |     (0..1: N (0..1: ball))
 91 |     (1..2: Topic (1..2: nm-raised-eyebrows)))
 92 |   (2..3: NP ((2..3: N (2..3: boy))))
 93 |   (3..4: TV (3..4: throw)))
 94 | 
 95 | > boy throw ball nm-raised-eyebrows
 96 | Parsed 0 trees
 97 | ```
 98 | 
 99 | ## Tutorial
100 | As an example, let's say we want to build a parser for English reflexive
101 | pronouns (himself, herself, themselves, themself, itself). We'll also support
102 | number ("He likes X" vs. "They like X") and simple embedded clauses
103 | ("He said that they like X").
104 | 
105 | Grammar files are written in a custom language, similar to BNF, called
106 | Feature GRammar (.fgr). There's a VSCode syntax highlighting extension for these
107 | files available as [`fgr-syntax`](https://marketplace.visualstudio.com/items?itemName=vgel.fgr-syntax).
108 | 
109 | We'll start by defining our lexicon. The lexicon is the set of terminal symbols
110 | (symbols in the actual input) that the grammar will match. Terminal symbols must
111 | start with a lowercase letter, and non-terminal symbols must start with an
112 | uppercase letter.
113 | 
114 | ```fgr
115 | // pronouns
116 | N -> he
117 | N -> him
118 | N -> himself
119 | N -> she
120 | N -> her
121 | N -> herself
122 | N -> they
123 | N -> them
124 | N -> themselves
125 | N -> themself
126 | 
127 | // names, lowercase as they are terminals
128 | N -> mary
129 | N -> sue
130 | N -> takeshi
131 | N -> robert
132 | 
133 | // complementizer
134 | Comp -> that
135 | 
136 | // verbs -- intransitive, transitive, and clausal
137 | IV -> falls
138 | IV -> fall
139 | IV -> fell
140 | 
141 | TV -> likes
142 | TV -> like
143 | TV -> liked
144 | 
145 | CV -> says
146 | CV -> say
147 | CV -> said
148 | ```
149 | 
150 | Next, we can add our sentence rules (they must be added at the top, as the first
151 | rule in the file is assumed to be the top-level rule):
152 | 
153 | ```fgr
154 | // sentence rules
155 | S -> N IV
156 | S -> N TV N
157 | S -> N CV Comp S
158 | 
159 | // ... previous lexicon ...
160 | ```
161 | 
162 | Assuming this file is saved as `examples/no-features.fgr` (which it is :wink:),
163 | we can test this file with the built-in CLI:
164 | 
165 | ```
166 | $ cargo run --bin cli examples/no-features.fgr
167 | > he falls
168 | Parsed 1 tree
169 | (0..2: S
170 |   (0..1: N (0..1: he))
171 |   (1..2: IV (1..2: falls)))
172 | [
173 |   child-1: [ child-0: [ word: falls ] ]
174 |   child-0: [ child-0: [ word: he ] ]
175 | ]
176 | 
177 | > he falls her
178 | Parsed 0 trees
179 | 
180 | > he likes her
181 | Parsed 1 tree
182 | (0..3: S
183 |   (0..1: N (0..1: he))
184 |   (1..2: TV (1..2: likes))
185 |   (2..3: N (2..3: her)))
186 | [
187 |   child-2: [ child-0: [ word: her ] ]
188 |   child-1: [ child-0: [ word: likes ] ]
189 |   child-0: [ child-0: [ word: he ] ]
190 | ]
191 | 
192 | > he likes
193 | Parsed 0 trees
194 | 
195 | > he said that he likes her
196 | Parsed 1 tree
197 | (0..6: S
198 |   (0..1: N (0..1: he))
199 |   (1..2: CV (1..2: said))
200 |   (2..3: Comp (2..3: that))
201 |   (3..6: S
202 |     (3..4: N (3..4: he))
203 |     (4..5: TV (4..5: likes))
204 |     (5..6: N (5..6: her))))
205 | [
206 |   child-0: [ child-0: [ word: he ] ]
207 |   child-2: [ child-0: [ word: that ] ]
208 |   child-1: [ child-0: [ word: said ] ]
209 |   child-3: [
210 |     child-2: [ child-0: [ word: her ] ]
211 |     child-1: [ child-0: [ word: likes ] ]
212 |     child-0: [ child-0: [ word: he ] ]
213 |   ]
214 | ]
215 | 
216 | > he said that he
217 | Parsed 0 trees
218 | ```
219 | 
220 | This grammar already parses some correct sentences, and blocks some trivially
221 | incorrect ones. However, it doesn't care about number, case, or reflexives
222 | right now:
223 | 
224 | ```
225 | > she likes himself  // unbound reflexive pronoun
226 | Parsed 1 tree
227 | (0..3: S
228 |   (0..1: N (0..1: she))
229 |   (1..2: TV (1..2: likes))
230 |   (2..3: N (2..3: himself)))
231 | [
232 |   child-0: [ child-0: [ word: she ] ]
233 |   child-2: [ child-0: [ word: himself ] ]
234 |   child-1: [ child-0: [ word: likes ] ]
235 | ]
236 | 
237 | > him like her  // incorrect case on the subject pronoun, should be nominative
238 |                 // (he) instead of accusative (him)
239 | Parsed 1 tree
240 | (0..3: S
241 |   (0..1: N (0..1: him))
242 |   (1..2: TV (1..2: like))
243 |   (2..3: N (2..3: her)))
244 | [
245 |   child-0: [ child-0: [ word: him ] ]
246 |   child-1: [ child-0: [ word: like ] ]
247 |   child-2: [ child-0: [ word: her ] ]
248 | ]
249 | 
250 | > he like her  // incorrect verb number agreement
251 | Parsed 1 tree
252 | (0..3: S
253 |   (0..1: N (0..1: he))
254 |   (1..2: TV (1..2: like))
255 |   (2..3: N (2..3: her)))
256 | [
257 |   child-2: [ child-0: [ word: her ] ]
258 |   child-1: [ child-0: [ word: like ] ]
259 |   child-0: [ child-0: [ word: he ] ]
260 | ]
261 | ```
262 | 
263 | To fix this, we need to add *features* to our lexicon, and restrict the sentence
264 | rules based on features.
265 | 
266 | Features are added with square brackets, and are key: value pairs separated by
267 | commas. `**top**` is a special feature value, which basically means
268 | "unspecified" -- we'll come back to it later. Features that are unspecified are
269 | also assumed to have a `**top**` value, but sometimes explicitly stating top is
270 | clearer.
271 | 
272 | ```fgr
273 | /// Pronouns
274 | // The added features are:
275 | // * num: sg or pl, whether this noun wants a singular verb (likes) or
276 | //   a plural verb (like). note this is grammatical number, so for example
277 | //   singular they takes plural agreement ("they like X", not *"they likes X")
278 | // * case: nom or acc, whether this noun is nominative or accusative case.
279 | //   nominative case goes in the subject, and accusative in the object.
280 | //   e.g., "he fell" and "she likes him", not *"him fell" and *"her likes he"
281 | // * pron: he, she, they, or ref -- what type of pronoun this is
282 | // * needs_pron: for reflexives, the `pron` value the antecedent must have
283 | //   (e.g. `he` for "himself"); unspecified for non-reflexives.
284 | N[ num: sg, case: nom, pron: he ]                    -> he
285 | N[ num: sg, case: acc, pron: he ]                    -> him
286 | N[ num: sg, case: acc, pron: ref, needs_pron: he ]   -> himself
287 | N[ num: sg, case: nom, pron: she ]                   -> she
288 | N[ num: sg, case: acc, pron: she ]                   -> her
289 | N[ num: sg, case: acc, pron: ref, needs_pron: she]   -> herself
290 | N[ num: pl, case: nom, pron: they ]                  -> they
291 | N[ num: pl, case: acc, pron: they ]                  -> them
292 | N[ num: pl, case: acc, pron: ref, needs_pron: they ] -> themselves
293 | N[ num: sg, case: acc, pron: ref, needs_pron: they ] -> themself
294 | 
295 | // Names
296 | // The added features are:
297 | // * num: sg, as people are singular ("mary likes her" / *"mary like her")
298 | // * case: **top**, as names can be both subjects and objects
299 | //   ("mary likes her" / "she likes mary")
300 | // * pron: whichever pronoun the person uses for reflexive agreement
301 | //   mary    pron: she  => mary likes herself
302 | //   sue     pron: they => sue likes themself
303 | //   takeshi pron: he   => takeshi likes himself
304 | N[ num: sg, case: **top**, pron: she ]  -> mary
305 | N[ num: sg, case: **top**, pron: they ] -> sue
306 | N[ num: sg, case: **top**, pron: he ]   -> takeshi
307 | N[ num: sg, case: **top**, pron: he ]   -> robert
308 | 
309 | // Complementizer doesn't need features
310 | Comp -> that
311 | 
312 | // Verbs -- intransitive, transitive, and clausal
313 | // The added features are:
314 | // * num: sg, pl, or **top** -- to match the noun numbers.
315 | //   **top** will match either sg or pl, as past-tense verbs in English
316 | //   don't agree in number: "he fell" and "they fell" are both fine
317 | // * tense: past or nonpast -- this won't be used for agreement, but will be
318 | //   copied into the final feature structure, and the client code could do
319 | //   something with it
320 | IV[ num:      sg, tense: nonpast ] -> falls
321 | IV[ num:      pl, tense: nonpast ] -> fall
322 | IV[ num: **top**, tense: past ]    -> fell
323 | 
324 | TV[ num:      sg, tense: nonpast ] -> likes
325 | TV[ num:      pl, tense: nonpast ] -> like
326 | TV[ num: **top**, tense: past ]    -> liked
327 | 
328 | CV[ num:      sg, tense: nonpast ] -> says
329 | CV[ num:      pl, tense: nonpast ] -> say
330 | CV[ num: **top**, tense: past ]    -> said
331 | ```
332 | 
333 | Now that our lexicon is updated with features, we can update our sentence rules
334 | to constrain parsing based on those features. This uses two new concepts:
335 | tags and unification. Tags allow features to be associated between nodes in a
336 | rule, and unification determines whether the associated features are
337 | compatible. The rules for unification are:
338 | 
339 | 1. A string feature can unify with a string feature with the same value
340 | 2. A **top** feature can unify with anything, and the nodes are merged
341 | 3. A complex feature ([ ... ] structure) is recursively unified with another
342 |    complex feature.
343 | 
344 | If unification fails anywhere, the parse is aborted and the tree is discarded.
345 | This allows the programmer to discard trees if features don't match.
346 | 
347 | ```fgr
348 | // Sentence rules
349 | // Intransitive verb:
350 | // * Subject must be nominative case
351 | // * Subject and verb must agree in number (copied through #1)
352 | S -> N[ case: nom, num: #1 ] IV[ num: #1 ]
353 | // Transitive verb:
354 | // * Subject must be nominative case
355 | // * Subject and verb must agree in number (copied through #2)
356 | // * If there's a reflexive in the object position, make sure its `needs_pron`
357 | //   feature matches the subject's `pron` feature. If the object isn't a
358 | //   reflexive, then its `needs_pron` feature will implicitly be `**top**`, so
359 | //   will unify with anything.
360 | S -> N[ case: nom, pron: #1, num: #2 ] TV[ num: #2 ] N[ case: acc, needs_pron: #1 ]
361 | // Clausal verb:
362 | // * Subject must be nominative case
363 | // * Subject and verb must agree in number (copied through #1)
364 | // * Reflexives can't cross clause boundaries (*"He said that she likes himself"),
365 | //   so we can ignore reflexives and delegate to inner clause rule
366 | S -> N[ case: nom, num: #1 ] CV[ num: #1 ] Comp S
367 | ```
368 | 
369 | Now that we have this augmented grammar (available as `examples/reflexives.fgr`),
370 | we can try it out and see that it rejects illicit sentences that were previously
371 | accepted, while still accepting valid ones:
372 | 
373 | ```
374 | > he fell
375 | Parsed 1 tree
376 | (0..2: S
377 |   (0..1: N (0..1: he))
378 |   (1..2: IV (1..2: fell)))
379 | [
380 |   child-1: [
381 |     child-0: [ word: fell ]
382 |     num: #0 sg
383 |     tense: past
384 |   ]
385 |   child-0: [
386 |     pron: he
387 |     case: nom
388 |     num: #0
389 |     child-0: [ word: he ]
390 |   ]
391 | ]
392 | 
393 | > he like him
394 | Parsed 0 trees
395 | 
396 | > he likes himself
397 | Parsed 1 tree
398 | (0..3: S
399 |   (0..1: N (0..1: he))
400 |   (1..2: TV (1..2: likes))
401 |   (2..3: N (2..3: himself)))
402 | [
403 |   child-1: [
404 |     num: #0 sg
405 |     child-0: [ word: likes ]
406 |     tense: nonpast
407 |   ]
408 |   child-2: [
409 |     needs_pron: #1 he
410 |     num: sg
411 |     child-0: [ word: himself ]
412 |     pron: ref
413 |     case: acc
414 |   ]
415 |   child-0: [
416 |     child-0: [ word: he ]
417 |     pron: #1
418 |     num: #0
419 |     case: nom
420 |   ]
421 | ]
422 | 
423 | > he likes herself
424 | Parsed 0 trees
425 | 
426 | > mary likes herself
427 | Parsed 1 tree
428 | (0..3: S
429 |   (0..1: N (0..1: mary))
430 |   (1..2: TV (1..2: likes))
431 |   (2..3: N (2..3: herself)))
432 | [
433 |   child-0: [
434 |     pron: #0 she
435 |     num: #1 sg
436 |     case: nom
437 |     child-0: [ word: mary ]
438 |   ]
439 |   child-1: [
440 |     tense: nonpast
441 |     child-0: [ word: likes ]
442 |     num: #1
443 |   ]
444 |   child-2: [
445 |     child-0: [ word: herself ]
446 |     num: sg
447 |     pron: ref
448 |     case: acc
449 |     needs_pron: #0
450 |   ]
451 | ]
452 | 
453 | > mary likes themself
454 | Parsed 0 trees
455 | 
456 | > sue likes themself
457 | Parsed 1 tree
458 | (0..3: S
459 |   (0..1: N (0..1: sue))
460 |   (1..2: TV (1..2: likes))
461 |   (2..3: N (2..3: themself)))
462 | [
463 |   child-0: [
464 |     pron: #0 they
465 |     child-0: [ word: sue ]
466 |     case: nom
467 |     num: #1 sg
468 |   ]
469 |   child-1: [
470 |     tense: nonpast
471 |     num: #1
472 |     child-0: [ word: likes ]
473 |   ]
474 |   child-2: [
475 |     needs_pron: #0
476 |     case: acc
477 |     pron: ref
478 |     child-0: [ word: themself ]
479 |     num: sg
480 |   ]
481 | ]
482 | 
483 | > sue likes himself
484 | Parsed 0 trees
485 | ```
486 | 
487 | If this is interesting to you and you want to learn more, you can check out
488 | [my blog series](https://vgel.me/posts/symbolic-linguistics-part1/),
489 | the excellent textbook [Syntactic Theory: A Formal Introduction (2nd ed.)](https://web.stanford.edu/group/cslipublications/cslipublications/site/1575864002.shtml),
490 | and the [DELPH-IN project](http://www.delph-in.net/wiki/index.php/Home), whose
491 | work on the LKB inspired this simplified version.
492 | 
493 | ## Using from code
494 | I need to write this section in more detail, but if you're comfortable with Rust,
495 | I suggest looking through the codebase. It's not perfect -- it started as one of
496 | my first Rust projects (after migrating through F# -> TypeScript -> C in search
497 | of the right performance/ergonomics tradeoff), and it could use more tests,
498 | but overall it's not too bad.
499 | 
500 | Basically, the processing pipeline is:
501 | 
502 | 1. Make a `Grammar` struct
503 |    * `Grammar` is defined in `rules.rs`.
504 |    * The easiest way to make a `Grammar` is `Grammar::read_from_file`, which is
505 |      mostly a hand-written recursive descent parser in `fgr/parse_grammar.rs`.
506 |      Yes, I recognize the irony here.
507 | 2. It takes input (in `Grammar::parse`, which does everything for you, or
508 |    `Grammar::parse_chart`, which just does the chart)
509 | 3. The input is first chart-parsed in `earley.rs`
510 | 4. Then, a forest is built from the chart, in `forest.rs`, using an algorithm
511 |     I found in a very useful blog series I forget the URL for, because the
512 |     algorithms in the academic literature for this are... weird.
513 | 5. Finally, the feature unification is used to prune the forest down to only
514 |    valid trees. It would be more efficient to do this during parsing, but meh.
515 | 
516 | The most interesting thing you can do via code and not via the CLI is probably
517 | getting at the raw feature DAG, as that would let you do things like pronoun
518 | coreference. The DAG code is in `src/featurestructure/`, and should be fairly
519 | approachable -- there's a lot of Rust ceremony around `Rc<RefCell<...>>`
520 | because using an arena allocation crate seemed ~~too har~~like overkill, but
521 | that is somewhat mitigated by the `NodeRef` type alias. Hit me up at
522 | https://vgel.me/contact if you need help with anything here!
523 | 
524 | ## License
525 | 
526 | Licensed under the [MIT license](https://opensource.org/licenses/MIT).
527 | 
528 | ### Contribution
529 | 
530 | Unless you explicitly state otherwise, any contribution intentionally
531 | submitted for inclusion in the work shall be licensed as above, without any
532 | additional terms or conditions.
533 | 


--------------------------------------------------------------------------------
/README.tpl:
--------------------------------------------------------------------------------
 1 | [![Crates.io](https://img.shields.io/crates/v/treebender.svg)](https://crates.io/crates/treebender)
 2 | {{badges}}
 3 | 
 4 | # Treebender
 5 | 
 6 | {{readme}}
 7 | 
 8 | ## License
 9 | 
10 | Licensed under the [MIT license](https://opensource.org/licenses/MIT).
11 | 
12 | ### Contribution
13 | 
14 | Unless you explicitly state otherwise, any contribution intentionally
15 | submitted for inclusion in the work shall be licensed as above, without any
16 | additional terms or conditions.


--------------------------------------------------------------------------------
/benches/full_parse_reflexives.rs:
--------------------------------------------------------------------------------
 1 | use criterion::{Criterion, black_box, criterion_group, criterion_main};
 2 | 
 3 | use treebender::Grammar;
 4 | 
 5 | const GRAMMAR_SRC: &str = include_str!("./reflexives.fgr");
 6 | 
 7 | /// Runs a full parse of `input` with `g` and returns how many trees it produced.
 8 | fn parse(g: &Grammar, input: &[&str]) -> usize {
 9 |   let trees = g.parse(input);
10 |   trees.len()
11 | }
12 | 
13 | /// Benchmarks full parses of a short transitive sentence and of a longer
14 | /// sentence with an embedded clause and a reflexive, both against the
15 | /// grammar embedded from `reflexives.fgr`.
16 | fn criterion_benchmark(c: &mut Criterion) {
17 |   let grammar: Grammar = GRAMMAR_SRC.parse().unwrap();
18 |   let cases = [
19 |     ("parse simple", "mary likes sue"),
20 |     ("parse complex reflexive", "mary said that she likes herself"),
21 |   ];
22 | 
23 |   for &(name, sentence) in cases.iter() {
24 |     let tokens: Vec<&str> = sentence.split(' ').collect();
25 |     c.bench_function(name, |b| {
26 |       b.iter(|| parse(black_box(&grammar), black_box(&tokens)))
27 |     });
28 |   }
29 | }
30 | 
31 | criterion_group!(benches, criterion_benchmark);
32 | criterion_main!(benches);
33 | 


--------------------------------------------------------------------------------
/benches/reflexives.fgr:
--------------------------------------------------------------------------------
 1 | // Sentence rules
 2 | // Intransitive:
 3 | // * Subject must be nominative case
 4 | // * Subject and verb must agree in number (copied through #1)
 5 | S -> N[ case: nom, num: #1 ] IV[ num: #1 ]
 6 | // Transitive:
 7 | // * Subject must be nominative case
 8 | // * Subject and verb must agree in number (copied through #2)
 9 | // * If there's a reflexive in the object position, make sure its `needs_pron`
10 | //   feature matches the subject's `pron` feature. If the object isn't a
11 | //   reflexive, then its `needs_pron` feature will implicitly be `**top**`, so
12 | //   will unify with anything.
13 | S -> N[ case: nom, pron: #1, num: #2 ] TV[ num: #2 ] N[ case: acc, needs_pron: #1 ]
14 | // Clausal:
15 | // * Subject must be nominative case
16 | // * Subject and verb must agree in number (copied through #1)
17 | // * Reflexives can't cross clause boundaries (*"He said that she likes himself"),
18 | //   so we can ignore reflexives and delegate to inner clause rule
19 | S -> N[ case: nom, num: #1 ] CV[ num: #1 ] Comp S
20 | 
21 | // Pronouns
22 | // The added features are:
23 | // * num: sg or pl, whether this noun wants a singular verb (likes) or
24 | //   a plural verb (like). note this is grammatical number, so for example
25 | //   singular they takes plural agreement ("they like X", not *"they likes X")
26 | // * case: nom or acc, whether this noun is nominative or accusative case.
27 | //   nominative case goes in the subject, and accusative in the object.
28 | //   e.g., "he fell" and "she likes him", not *"him fell" and *"her likes he"
29 | // * pron: he, she, they, or ref -- what type of pronoun this is
30 | // * needs_pron: for reflexives, the `pron` value the antecedent must have
31 | //   (e.g. `he` for "himself"); unspecified for non-reflexives.
32 | N[ num: sg, case: nom, pron: he ]                    -> he
33 | N[ num: sg, case: acc, pron: he ]                    -> him
34 | N[ num: sg, case: acc, pron: ref, needs_pron: he ]   -> himself
35 | N[ num: sg, case: nom, pron: she ]                   -> she
36 | N[ num: sg, case: acc, pron: she ]                   -> her
37 | N[ num: sg, case: acc, pron: ref, needs_pron: she]   -> herself
38 | N[ num: pl, case: nom, pron: they ]                  -> they
39 | N[ num: pl, case: acc, pron: they ]                  -> them
40 | N[ num: pl, case: acc, pron: ref, needs_pron: they ] -> themselves
41 | N[ num: sg, case: acc, pron: ref, needs_pron: they ] -> themself
42 | 
43 | // Names
44 | // The added features are:
45 | // * num: sg, as people are singular ("mary likes her" / *"mary like her")
46 | // * case: **top**, as names can be both subjects and objects
47 | //   ("mary likes her" / "she likes mary")
48 | // * pron: whichever pronoun the person uses for reflexive agreement
49 | //   mary    pron: she  => mary likes herself
50 | //   sue     pron: they => sue likes themself
51 | //   takeshi pron: he   => takeshi likes himself
52 | N[ num: sg, case: **top**, pron: she ]  -> mary
53 | N[ num: sg, case: **top**, pron: they ] -> sue
54 | N[ num: sg, case: **top**, pron: he ]   -> takeshi
55 | N[ num: sg, case: **top**, pron: he ]   -> robert
56 | 
57 | // Complementizer doesn't need features
58 | Comp -> that
59 | 
60 | // Verbs -- intransitive, transitive, and clausal
61 | // The added features are:
62 | // * num: sg, pl, or **top** -- to match the noun numbers.
63 | //   **top** will match either sg or pl, as past-tense verbs in English
64 | //   don't agree in number: "he fell" and "they fell" are both fine
65 | // * tense: past or nonpast -- this won't be used for agreement, but will be
66 | //   copied into the final feature structure, and the client code could do
67 | //   something with it
68 | IV[ num:      sg, tense: nonpast ] -> falls
69 | IV[ num:      pl, tense: nonpast ] -> fall
70 | IV[ num: **top**, tense: past ]    -> fell
71 | 
72 | TV[ num:      sg, tense: nonpast ] -> likes
73 | TV[ num:      pl, tense: nonpast ] -> like
74 | TV[ num: **top**, tense: past ]    -> liked
75 | 
76 | CV[ num:      sg, tense: nonpast ] -> says
77 | CV[ num:      pl, tense: nonpast ] -> say
78 | CV[ num: **top**, tense: past ]    -> said
79 | 


--------------------------------------------------------------------------------
/cli/Cargo.toml:
--------------------------------------------------------------------------------
1 | [package]
2 | name = "cli"
3 | version = "0.2.0"
4 | authors = ["Theia Vogel <theia@vgel.me>"]
5 | 
6 | [dependencies]
7 | tracing-subscriber = { version = "0.3.19", features = ["env-filter"] }
8 | treebender = { path = "../" }
9 | 


--------------------------------------------------------------------------------
/cli/src/main.rs:
--------------------------------------------------------------------------------
  1 | extern crate tracing_subscriber;
  2 | extern crate treebender;
  3 | 
  4 | use std::env;
  5 | use std::io;
  6 | use std::io::Write;
  7 | use std::process;
  8 | 
  9 | use tracing_subscriber::EnvFilter;
 10 | 
 11 | use treebender::rules::Grammar;
 12 | use treebender::Err;
 13 | 
 14 | /// Builds the help text printed for `-h`/`--help` and appended to argument
 15 | /// errors, substituting `prog_name` into the usage line.
 16 | fn usage(prog_name: &str) -> String {
 17 |   format!(
 18 |     r"Usage: {} FILE [options]
 19 | 
 20 | Options:
 21 |   -h, --help    Print this message
 22 |   -c, --chart   Print the parse chart (defaults to not printing)
 23 |   -n, --no-fs   Don't print feature structures (defaults to printing)",
 24 |     prog_name
 25 |   )
 26 | }
 27 | 
 28 | /// Parses one space-separated `sentence` with `g` and prints the outcome to
 29 | /// stdout: the parse chart when `print_chart` is set, the number of trees, then
 30 | /// each tree, followed by its feature structure when `print_fs` is set.
 31 | ///
 32 | /// Currently always returns `Ok(())`.
 33 | fn parse(g: &Grammar, sentence: &str, print_chart: bool, print_fs: bool) -> Result<(), Err> {
 34 |   let sentence = sentence.split(' ').collect::<Vec<_>>();
 35 | 
 36 |   // The chart is only needed for display, so build it only on request rather
 37 |   // than on every sentence.
 38 |   if print_chart {
 39 |     println!("chart:\n{}\n", g.parse_chart(&sentence));
 40 |   }
 41 | 
 42 |   let trees = g.parse(&sentence);
 43 | 
 44 |   println!(
 45 |     "Parsed {} tree{}",
 46 |     trees.len(),
 47 |     if trees.len() == 1 { "" } else { "s" }
 48 |   );
 49 | 
 50 |   for (t, idx, arena) in trees {
 51 |     println!("{}", t);
 52 |     if print_fs {
 53 |       println!("{}", arena.display(idx));
 54 |     }
 55 |     println!();
 56 |   }
 57 | 
 58 |   Ok(())
 59 | }
 60 | 
 61 | /// Command-line options accepted by the CLI.
 62 | struct Args {
 63 |   /// Path of the grammar file to load.
 64 |   filename: String,
 65 |   /// Print each tree's feature structure (on unless `-n`/`--no-fs` is given).
 66 |   print_fs: bool,
 67 |   /// Print the parse chart (off unless `-c`/`--chart` is given).
 68 |   print_chart: bool,
 69 | }
 70 | 
 71 | impl Args {
 72 |   /// Formats an argument error as the message followed by the usage text.
 73 |   fn make_error_message(msg: &str, prog_name: impl AsRef<str>) -> String {
 74 |     format!("argument error: {}.\n\n{}", msg, usage(prog_name.as_ref()))
 75 |   }
 76 | 
 77 |   /// Interprets the raw argument vector `v` (program name first).
 78 |   ///
 79 |   /// Returns an error message (including the usage text) when the vector is
 80 |   /// empty, when nothing follows the program name, when more than one
 81 |   /// argument is not a recognized flag, or when no filename is given.
 82 |   /// `-h`/`--help` prints the usage text to stderr and exits with status 0.
 83 |   fn parse(v: Vec<String>) -> Result<Self, String> {
 84 |     let args_len = v.len();
 85 |     let mut iter = v.into_iter();
 86 |     let prog_name = match iter.next() {
 87 |       Some(name) => name,
 88 |       None => {
 89 |         return Err(Self::make_error_message(
 90 |           "bad argument vector",
 91 |           "treebender",
 92 |         ));
 93 |       }
 94 |     };
 95 | 
 96 |     if args_len < 2 {
 97 |       return Err(Self::make_error_message("not enough arguments", prog_name));
 98 |     }
 99 | 
100 |     let mut filename: Option<String> = None;
101 |     let mut print_fs = true; // default to printing feature structures
102 |     let mut print_chart = false; // default to *not* printing the chart
103 | 
104 |     for o in iter {
105 |       match o.as_str() {
106 |         "-h" | "--help" => {
107 |           eprintln!("{}", usage(&prog_name));
108 |           process::exit(0);
109 |         }
110 |         "-n" | "--no-fs" => print_fs = false,
111 |         "-c" | "--chart" => print_chart = true,
112 |         // The first unrecognized argument is the grammar file; a second is an error.
113 |         _ if filename.is_none() => filename = Some(o),
114 |         _ => return Err(Self::make_error_message("invalid arguments", prog_name)),
115 |       }
116 |     }
117 | 
118 |     match filename {
119 |       Some(filename) => Ok(Self {
120 |         filename,
121 |         print_fs,
122 |         print_chart,
123 |       }),
124 |       None => Err(Self::make_error_message("missing filename", prog_name)),
125 |     }
126 |   }
127 | }
128 | 
129 | /// Loads the grammar named on the command line, then prompts for sentences on
130 | /// stdin, lowercasing and parsing each line until EOF.
131 | ///
132 | /// Argument errors are printed to stderr and exit the process with status 255.
133 | fn main() -> Result<(), Err> {
134 |   let opts = match Args::parse(env::args().collect()) {
135 |     Ok(opts) => opts,
136 |     Err(msg) => {
137 |       eprintln!("{}", msg);
138 |       process::exit(255);
139 |     }
140 |   };
141 | 
142 |   tracing_subscriber::fmt()
143 |     .with_env_filter(EnvFilter::from_default_env())
144 |     .with_writer(std::io::stderr)
145 |     .init();
146 | 
147 |   let g: Grammar = Grammar::read_from_file(&opts.filename)?;
148 | 
149 |   let mut input = String::new();
150 |   loop {
151 |     print!("> ");
152 |     io::stdout().flush()?;
153 | 
154 |     // `read_line` reports the number of bytes read; zero means EOF (ctrl+d).
155 |     if io::stdin().read_line(&mut input)? == 0 {
156 |       return Ok(());
157 |     }
158 | 
159 |     input.make_ascii_lowercase();
160 |     parse(&g, input.trim(), opts.print_chart, opts.print_fs)?;
161 |     // The buffer is reused across iterations, so empty it before the next read.
162 |     input.clear();
163 |   }
164 | }
165 | 


--------------------------------------------------------------------------------
/examples/asl-wordorder.fgr:
--------------------------------------------------------------------------------
 1 | // *very* basic / incomplete grammar for ASL topicalization
 2 | // allow all word orders, but topicalized elements must come first
 3 | 
 4 | S -> NP IV
 5 | S -> IV NP[ topicalized: n ]
 6 | S -> NP TV NP[ topicalized: n ]
 7 | S -> NP NP[ topicalized: n ] TV
 8 | S -> TV NP[ topicalized: n ] NP[ topicalized: n ]
 9 | 
10 | NP -> N
11 | NP[ topicalized: y ] -> N Topic
12 | Topic -> nm-raised-eyebrows
13 | 
14 | N -> boy
15 | N -> ball
16 | 
17 | IV -> sit
18 | TV -> throw


--------------------------------------------------------------------------------
/examples/dative-shift.fgr:
--------------------------------------------------------------------------------
 1 | S[roles: #1] -> Arg[st: #2] V[roles: #1, sts.s: #2, sts.do: #3, sts.io: #4] Arg[st: #3] Arg[st: #4]
 2 | 
 3 | Vbare[
 4 |   sts.s.case:  nom, sts.s.arg:  #1 **top**,
 5 |   sts.do.case: acc, sts.do.arg: #2 **top**,
 6 |   sts.io.case: acc, sts.io.arg: #3 **top**,
 7 |   roles.agent: #1, roles.recipient: #2, roles.theme: #3
 8 | ] -> gave
 9 | // go go gadget dative shifter (swap direct and indirect object, assign dative)
10 | Vdative[
11 |   roles: #1, sts.s: #2, sts.do: #3, sts.io.case: dat, sts.io.arg: #4
12 | ] -> Vbare[
13 |   roles: #1, sts.s: #2, sts.io: #3, sts.do.arg: #4 
14 | ]
15 | V[ sts: #1, roles: #2 ] -> Vbare[ sts: #1, roles: #2 ]
16 | V[ sts: #1, roles: #2 ] -> Vdative[ sts: #1, roles: #2 ]
17 | 
18 | Arg[ st: #1 ] -> PP[ st: #1 ]
19 | Arg[ st: #1 ] -> NP[ st: #1 ]
20 | PP[ st.case: dat, st.arg: #1     ] -> to NP[ st.case: acc, st.arg: #1 ]
21 | NP[ st.case: nom, st.arg: i      ] -> i
22 | NP[ st.case: acc, st.arg: i      ] -> me
23 | NP[ st.case: nom, st.arg: she    ] -> she
24 | NP[ st.case: acc, st.arg: she    ] -> her
25 | NP[ st.case: nom, st.arg: apples ] -> apples
26 | NP[ st.case: acc, st.arg: apples ] -> apples
27 | 


--------------------------------------------------------------------------------
/examples/no-features.fgr:
--------------------------------------------------------------------------------
 1 | // sentence rules
 2 | S -> N IV
 3 | S -> N TV N
 4 | S -> N CV Comp S
 5 | 
 6 | // pronouns
 7 | N -> he
 8 | N -> him
 9 | N -> himself
10 | N -> she
11 | N -> her
12 | N -> herself
13 | N -> they
14 | N -> them
15 | N -> themselves
16 | N -> themself
17 | 
18 | // names, lowercase as they are terminals
19 | N -> mary
20 | N -> sue
21 | N -> takeshi
22 | N -> robert
23 | 
24 | // complementizer
25 | Comp -> that
26 | 
27 | // verbs -- intransitive, transitive, and clausal
28 | IV -> falls
29 | IV -> fall
30 | IV -> fell
31 | 
32 | TV -> likes
33 | TV -> like
34 | TV -> liked
35 | 
36 | CV -> says
37 | CV -> say
38 | CV -> said


--------------------------------------------------------------------------------
/examples/reflexives.fgr:
--------------------------------------------------------------------------------
 1 | // Sentence rules
 2 | // Intransitive:
 3 | // * Subject must be nominative case
 4 | // * Subject and verb must agree in number (copied through #1)
 5 | S -> N[ case: nom, num: #1 ] IV[ num: #1 ]
 6 | // Transitive:
 7 | // * Subject must be nominative case
 8 | // * Subject and verb must agree in number (copied through #2)
 9 | // * If there's a reflexive in the object position, make sure its `needs_pron`
10 | //   feature matches the subject's `pron` feature. If the object isn't a
11 | //   reflexive, then its `needs_pron` feature will implicitly be `**top**`, so
12 | //   will unify with anything.
13 | S -> N[ case: nom, pron: #1, num: #2 ] TV[ num: #2 ] N[ case: acc, needs_pron: #1 ]
14 | // Clausal:
15 | // * Subject must be nominative case
16 | // * Subject and verb must agree in number (copied through #1)
17 | // * Reflexives can't cross clause boundaries (*"He said that she likes himself"),
18 | //   so we can ignore reflexives and delegate to inner clause rule
19 | S -> N[ case: nom, num: #1 ] CV[ num: #1 ] Comp S
20 | 
21 | // Pronouns
22 | // The added features are:
23 | // * num: sg or pl, whether this noun wants a singular verb (likes) or
24 | //   a plural verb (like). note this is grammatical number, so for example
25 | //   singular they takes plural agreement ("they like X", not *"they likes X")
26 | // * case: nom or acc, whether this noun is nominative or accusative case.
27 | //   nominative case goes in the subject, and accusative in the object.
28 | //   e.g., "he fell" and "she likes him", not *"him fell" and *"her likes he"
29 | // * pron: he, she, they, or ref -- what type of pronoun this is
30 | // * needs_pron: for reflexives, the `pron` value the antecedent must have
31 | //   (e.g. `he` for "himself"); unspecified for non-reflexives.
32 | N[ num: sg, case: nom, pron: he ]                    -> he
33 | N[ num: sg, case: acc, pron: he ]                    -> him
34 | N[ num: sg, case: acc, pron: ref, needs_pron: he ]   -> himself
35 | N[ num: sg, case: nom, pron: she ]                   -> she
36 | N[ num: sg, case: acc, pron: she ]                   -> her
37 | N[ num: sg, case: acc, pron: ref, needs_pron: she]   -> herself
38 | N[ num: pl, case: nom, pron: they ]                  -> they
39 | N[ num: pl, case: acc, pron: they ]                  -> them
40 | N[ num: pl, case: acc, pron: ref, needs_pron: they ] -> themselves
41 | N[ num: sg, case: acc, pron: ref, needs_pron: they ] -> themself
42 | 
43 | // Names
44 | // The added features are:
45 | // * num: sg, as people are singular ("mary likes her" / *"mary like her")
46 | // * case: **top**, as names can be both subjects and objects
47 | //   ("mary likes her" / "she likes mary")
48 | // * pron: whichever pronoun the person uses for reflexive agreement
49 | //   mary    pron: she  => mary likes herself
50 | //   sue     pron: they => sue likes themself
51 | //   takeshi pron: he   => takeshi likes himself
52 | N[ num: sg, case: **top**, pron: she ]  -> mary
53 | N[ num: sg, case: **top**, pron: they ] -> sue
54 | N[ num: sg, case: **top**, pron: he ]   -> takeshi
55 | N[ num: sg, case: **top**, pron: he ]   -> robert
56 | 
57 | // Complementizer doesn't need features
58 | Comp -> that
59 | 
60 | // Verbs -- intransitive, transitive, and clausal
61 | // The added features are:
62 | // * num: sg, pl, or **top** -- to match the noun numbers.
63 | //   **top** will match either sg or pl, as past-tense verbs in English
64 | //   don't agree in number: "he fell" and "they fell" are both fine
65 | // * tense: past or nonpast -- this won't be used for agreement, but will be
66 | //   copied into the final feature structure, and the client code could do
67 | //   something with it
68 | IV[ num:      sg, tense: nonpast ] -> falls
69 | IV[ num:      pl, tense: nonpast ] -> fall
70 | IV[ num: **top**, tense: past ]    -> fell
71 | 
72 | TV[ num:      sg, tense: nonpast ] -> likes
73 | TV[ num:      pl, tense: nonpast ] -> like
74 | TV[ num: **top**, tense: past ]    -> liked
75 | 
76 | CV[ num:      sg, tense: nonpast ] -> says
77 | CV[ num:      pl, tense: nonpast ] -> say
78 | CV[ num: **top**, tense: past ]    -> said
79 | 


--------------------------------------------------------------------------------
/rustfmt.toml:
--------------------------------------------------------------------------------
1 | tab_spaces = 2


--------------------------------------------------------------------------------
/src/earley.rs:
--------------------------------------------------------------------------------
  1 | use std::fmt;
  2 | use std::sync::Arc;
  3 | 
  4 | use crate::rules::{Grammar, Production, Rule};
  5 | 
/// A grammar rule paired with a dot position marking how much of it has been matched.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LR0 {
  /// The rule this item is tracking.
  pub rule: Arc<Rule>,
  /// Index of the next production to match; the item is finished once this reaches `rule.len()`.
  pub pos: usize,
}
 11 | 
 12 | impl LR0 {
 13 |   pub fn new(rule: &Arc<Rule>) -> Self {
 14 |     Self {
 15 |       rule: rule.clone(),
 16 |       pos: 0,
 17 |     }
 18 |   }
 19 | 
 20 |   pub fn is_active(&self) -> bool {
 21 |     self.pos < self.rule.len()
 22 |   }
 23 | 
 24 |   pub fn advance(&self) -> Self {
 25 |     assert!(self.is_active());
 26 |     Self {
 27 |       rule: self.rule.clone(),
 28 |       pos: self.pos + 1,
 29 |     }
 30 |   }
 31 | 
 32 |   pub fn next_production(&self) -> Option<&Production> {
 33 |     self.rule.productions.get(self.pos)
 34 |   }
 35 | }
 36 | 
 37 | impl fmt::Display for LR0 {
 38 |   fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 39 |     write!(f, "{} →", self.rule.symbol)?;
 40 |     for idx in 0..self.rule.len() {
 41 |       if idx == self.pos {
 42 |         write!(f, " ・")?;
 43 |       }
 44 |       write!(f, " {}", self.rule.productions[idx])?;
 45 |     }
 46 |     if !self.is_active() {
 47 |       write!(f, " ・")?;
 48 |     }
 49 |     Ok(())
 50 |   }
 51 | }
 52 | 
/// An entry in the parse chart: a dotted rule plus the chart position it started at.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct State {
  /// The dotted rule being tracked.
  pub lr0: LR0,
  /// Chart index at which this rule was first predicted.
  pub origin: usize,
}
 58 | 
 59 | impl State {
 60 |   pub fn new(lr0: LR0, origin: usize) -> Self {
 61 |     Self { lr0, origin }
 62 |   }
 63 | 
 64 |   pub fn advance(&self) -> Self {
 65 |     Self::new(self.lr0.advance(), self.origin)
 66 |   }
 67 | }
 68 | 
/// The parse chart: one list of states per chart position.
/// `parse_chart` creates it with `input.len() + 1` positions.
#[derive(Debug)]
pub struct Chart(Vec<Vec<State>>);
 71 | 
 72 | impl Chart {
 73 |   pub fn new(length: usize) -> Self {
 74 |     Self(vec![Vec::new(); length])
 75 |   }
 76 | 
 77 |   pub fn len(&self) -> usize {
 78 |     self.0.len()
 79 |   }
 80 | 
 81 |   pub fn is_empty(&self) -> bool {
 82 |     self.len() == 0
 83 |   }
 84 | 
 85 |   pub fn len_at(&self, k: usize) -> usize {
 86 |     self.0[k].len()
 87 |   }
 88 | 
 89 |   pub fn has(&self, k: usize, state: &State) -> bool {
 90 |     self.0[k].contains(state)
 91 |   }
 92 | 
 93 |   pub fn add(&mut self, k: usize, state: State) {
 94 |     if !self.has(k, &state) {
 95 |       self.0[k].push(state);
 96 |     }
 97 |   }
 98 | 
 99 |   /// Get an owned state so that passing around &mut chart is more ergonomic
100 |   /// The clone is fairly cheap, only an rc + 2 usize, State would be copy if not
101 |   /// for the Arc<Rule>
102 |   fn get_state(&self, k: usize, idx: usize) -> State {
103 |     self.0[k][idx].clone()
104 |   }
105 | }
106 | 
107 | impl IntoIterator for Chart {
108 |   type Item = (usize, Vec<State>);
109 |   type IntoIter = std::iter::Enumerate<std::vec::IntoIter<Vec<State>>>;
110 | 
111 |   fn into_iter(self) -> Self::IntoIter {
112 |     self.0.into_iter().enumerate()
113 |   }
114 | }
115 | 
116 | impl fmt::Display for Chart {
117 |   fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
118 |     for k in 0..self.len() {
119 |       writeln!(f, "State {}:", k)?;
120 |       for state in self.0[k].iter() {
121 |         writeln!(f, "  {}..{}: {}", state.origin, k, state.lr0)?;
122 |       }
123 |     }
124 |     Ok(())
125 |   }
126 | }
127 | 
128 | pub fn parse_chart(g: &Grammar, input: &[&str]) -> Chart {
129 |   let mut chart = Chart::new(input.len() + 1);
130 | 
131 |   for rule in g.rules.get(&g.start).expect("grammar missing start rules") {
132 |     chart.add(0, State::new(LR0::new(rule), 0));
133 |   }
134 | 
135 |   for k in 0..chart.len() {
136 |     // need to use while loop because the number of states at k can expand during the loop
137 |     let mut idx = 0;
138 |     while idx < chart.len_at(k) {
139 |       let state = chart.get_state(k, idx);
140 |       idx += 1;
141 | 
142 |       if let Some(production) = state.lr0.next_production() {
143 |         if production.is_nonterminal() {
144 |           predictor(g, &mut chart, k, &state);
145 |         } else {
146 |           scanner(&mut chart, k, &state, input);
147 |         }
148 |       } else {
149 |         completer(&mut chart, k, &state);
150 |       }
151 |     }
152 |   }
153 | 
154 |   chart
155 | }
156 | 
157 | fn completer(chart: &mut Chart, k: usize, state: &State) {
158 |   assert!(!state.lr0.is_active(), "tried to complete active state");
159 | 
160 |   // lr0 has been completed, now look for states in the chart that are waiting for its symbol
161 |   for idx in 0..chart.len_at(state.origin) {
162 |     let other = chart.get_state(state.origin, idx);
163 | 
164 |     if let Some(np) = other.lr0.next_production() {
165 |       if np.symbol == state.lr0.rule.symbol {
166 |         // found one, advance its dot and add the new state to the chart *at k*,
167 |         // because it's now waiting on a token there
168 |         chart.add(k, other.advance())
169 |       }
170 |     }
171 |   }
172 | }
173 | 
174 | fn predictor(g: &Grammar, chart: &mut Chart, k: usize, state: &State) {
175 |   assert!(state.lr0.is_active(), "tried to predict non-active state");
176 |   assert!(
177 |     state.lr0.next_production().unwrap().is_nonterminal(),
178 |     "tried to predict a terminal"
179 |   );
180 | 
181 |   // this lr0 is waiting for the next production
182 |   // let's hypothesize that one of the rules that can build this production will
183 |   // succeed at its current position
184 |   let needed_symbol = &state.lr0.next_production().unwrap().symbol;
185 |   for wanted_rule in g
186 |     .rules
187 |     .get(needed_symbol)
188 |     .unwrap_or_else(|| panic!("missing rules for production {}", needed_symbol))
189 |   {
190 |     chart.add(k, State::new(LR0::new(wanted_rule), k));
191 | 
192 |     if g.is_nullable(needed_symbol) {
193 |       // automatically complete `state` early, because we know
194 |       // it will be completable anyways, because its next_production may be produced
195 |       // by empty input. If we don't do this, nullable rules won't be completed
196 |       // correctly, because complete() won't run after predict() without a new symbol.
197 |       chart.add(k, state.advance());
198 |     }
199 |   }
200 | }
201 | 
202 | fn scanner(chart: &mut Chart, k: usize, state: &State, input: &[&str]) {
203 |   assert!(state.lr0.is_active(), "tried to scan non-active state");
204 |   assert!(
205 |     state.lr0.next_production().unwrap().is_terminal(),
206 |     "tried to scan a nonterminal"
207 |   );
208 | 
209 |   let needed_symbol = &state.lr0.next_production().unwrap().symbol;
210 |   if k < input.len() && input[k] == needed_symbol {
211 |     // advance the state to consume this token, and add to state k + 1, where
212 |     // it will look for the next token
213 |     chart.add(k + 1, state.advance());
214 |   }
215 | }
216 | 


--------------------------------------------------------------------------------
/src/featurestructure/mod.rs:
--------------------------------------------------------------------------------
 1 | mod node;
 2 | mod serialized;
 3 | 
 4 | pub use node::{Feature, NodeArena, NodeIdx};
 5 | pub use serialized::SerializedNode;
 6 | 
 7 | #[cfg(test)]
 8 | mod tests {
 9 |   use super::*;
10 | 
  /// Smoke test: builds a feature structure from dotted paths, two of which
  /// (`a.b` and `e`) share tag "1", and prints it. There are no assertions; the
  /// test only checks that construction returns `Ok`.
  #[test]
  fn test_construct_fs() {
    let mut arena = NodeArena::new();

    let features = vec![
      Feature {
        path: "a.b".to_string(),
        tag: Some("1".to_string()),
        value: arena.alloc_top(),
      },
      Feature {
        path: "a.b.c".to_string(),
        tag: None,
        value: arena.alloc_str("foo".to_string()),
      },
      Feature {
        path: "a.b.d".to_string(),
        tag: None,
        value: arena.alloc_str("bar".to_string()),
      },
      Feature {
        path: "e".to_string(),
        tag: Some("1".to_string()),
        value: arena.alloc_top(),
      },
    ];

    let root = arena.alloc_from_features(features).unwrap();

    println!("{}", arena.display(root));
  }
42 | 
  /// Checks that a value unified into one tagged path shows up at every path
  /// sharing that tag, and that both unified structures serialize identically.
  #[test]
  fn test_unify_tags() {
    let mut arena = NodeArena::new();

    // `a.b` and `c` share tag "1", so they refer to the same (still top) value
    let features1 = vec![
      Feature {
        path: "a.b".to_string(),
        tag: Some("1".to_string()),
        value: arena.alloc_top(),
      },
      Feature {
        path: "c".to_string(),
        tag: Some("1".to_string()),
        value: arena.alloc_top(),
      },
    ];

    let fs1 = arena.alloc_from_features(features1).unwrap();

    let features2 = vec![Feature {
      path: "c".to_string(),
      tag: None,
      value: arena.alloc_str("foo".to_string()),
    }];

    let fs2 = arena.alloc_from_features(features2).unwrap();

    // everything is **top** so goes away
    assert!(SerializedNode::from_node(&arena, fs1).is_none());

    let gold = SerializedNode::Edged(vec![("c".into(), "foo".into())].into_iter().collect());

    assert!(SerializedNode::from_node(&arena, fs2) == Some(gold));

    arena.unify(fs1, fs2).unwrap();

    // after unification, "foo" is visible through both `c` and the tagged `a.b`
    let gold = SerializedNode::Edged(
      vec![
        (
          "a".into(),
          SerializedNode::Edged(vec![("b".into(), "foo".into())].into_iter().collect()),
        ),
        ("c".into(), "foo".into()),
      ]
      .into_iter()
      .collect(),
    );

    assert!(SerializedNode::from_node(&arena, fs1) == Some(gold.clone()));
    assert!(SerializedNode::from_node(&arena, fs2) == Some(gold));
  }
94 | }
95 | 


--------------------------------------------------------------------------------
/src/featurestructure/node.rs:
--------------------------------------------------------------------------------
  1 | use std::collections::HashMap;
  2 | use std::fmt;
  3 | 
  4 | use crate::utils::Err;
  5 | 
/// Index type for the node arena: the position of a node in the arena's node vector.
#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)]
pub struct NodeIdx(pub u32);
  9 | 
/// Unpacked representation of a feature, that `NodeArena::alloc_from_features` can turn into a Node
#[derive(Debug)]
pub struct Feature {
  /// Dotted path where each segment will be a node: "a.b.c" -> [a: [b: [c: ...]]]
  pub path: String,
  /// Unique string that will link features into a reentrant node, or None
  pub tag: Option<String>,
  /// What will end up at `path`. Will be unified with any other feature values with the same tag.
  pub value: NodeIdx,
}
 20 | 
/// A node in the feature structure graph
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Node {
  /// Top can unify with anything
  Top,
  /// A string-valued feature, such as "nom" in [case: nom]. Unifies only with
  /// `Str` nodes holding an equal string.
  Str(String),
  /// An arc-containing node: a map from arc label to the index of the node it points at
  Edged(HashMap<String, NodeIdx>),
  /// A node that has been forwarded to another node through unification.
  /// Before using a node, it should be dereferenced to resolve its forward
  Forwarded(NodeIdx),
}
 34 | 
 35 | impl Node {
 36 |   fn new_str(s: String) -> Self {
 37 |     Self::Str(s)
 38 |   }
 39 | 
 40 |   fn new_edged() -> Self {
 41 |     Self::Edged(HashMap::new())
 42 |   }
 43 | 
 44 |   fn is_top(&self) -> bool {
 45 |     matches!(self, Self::Top)
 46 |   }
 47 | 
 48 |   fn str(&self) -> Option<&str> {
 49 |     match self {
 50 |       Self::Str(s) => Some(s),
 51 |       _ => None,
 52 |     }
 53 |   }
 54 | 
 55 |   fn is_str(&self) -> bool {
 56 |     self.str().is_some()
 57 |   }
 58 | 
 59 |   fn edged(&self) -> Option<&HashMap<String, NodeIdx>> {
 60 |     match self {
 61 |       Self::Edged(v) => Some(v),
 62 |       _ => None,
 63 |     }
 64 |   }
 65 | 
 66 |   fn edged_mut(&mut self) -> Option<&mut HashMap<String, NodeIdx>> {
 67 |     match self {
 68 |       Self::Edged(v) => Some(v),
 69 |       _ => None,
 70 |     }
 71 |   }
 72 | 
 73 |   fn is_edged(&self) -> bool {
 74 |     self.edged().is_some()
 75 |   }
 76 | }
 77 | 
/// An arena that stores all nodes and provides methods to operate on them
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct NodeArena {
  /// Backing storage; a `NodeIdx` is a position in this vector.
  nodes: Vec<Node>,
}
 83 | 
 84 | impl NodeArena {
 85 |   pub fn new() -> Self {
 86 |     Default::default()
 87 |   }
 88 | 
 89 |   pub fn alloc(&mut self, node: Node) -> NodeIdx {
 90 |     let idx = self.nodes.len() as u32;
 91 |     self.nodes.push(node);
 92 |     NodeIdx(idx)
 93 |   }
 94 | 
 95 |   pub fn replace(&mut self, idx: NodeIdx, node: Node) -> Node {
 96 |     std::mem::replace(&mut self.nodes[idx.0 as usize], node)
 97 |   }
 98 | 
 99 |   pub fn alloc_top(&mut self) -> NodeIdx {
100 |     self.alloc(Node::Top)
101 |   }
102 | 
103 |   pub fn alloc_str(&mut self, s: String) -> NodeIdx {
104 |     self.alloc(Node::new_str(s))
105 |   }
106 | 
107 |   pub fn alloc_edged(&mut self) -> NodeIdx {
108 |     self.alloc(Node::new_edged())
109 |   }
110 | 
111 |   /// Recursively make a copy of a node
112 |   pub fn clone(&mut self, n: NodeIdx) -> NodeIdx {
113 |     let mut seen = HashMap::new();
114 |     self._clone(n, &mut seen)
115 |   }
116 | 
117 |   fn _clone(&mut self, n: NodeIdx, seen: &mut HashMap<NodeIdx, NodeIdx>) -> NodeIdx {
118 |     if let Some(new_idx) = seen.get(&n) {
119 |       return *new_idx;
120 |     }
121 | 
122 |     // TODO this `node` clone is ugly, we need it because we're adding new nodes in here, which might move the vec,
123 |     // invalidating the iteration over the hashmap ref in edged. ideally we'd be using a proper arena here, so we
124 |     // could guarantee the node doesn't move, so we didn't need this clone. need to fix that.
125 | 
126 |     // we can't dereference here because we need to preserve the DAG structure
127 |     let node = self.get(n).clone();
128 |     let new = match node {
129 |       Node::Top => self.alloc_top(),
130 |       Node::Str(s) => self.alloc_str(s),
131 |       Node::Edged(old_arcs) => {
132 |         let mut arcs = HashMap::<String, NodeIdx>::new();
133 |         for (label, value) in old_arcs.into_iter() {
134 |           arcs.insert(label, self._clone(value, seen));
135 |         }
136 |         self.alloc(Node::Edged(arcs))
137 |       }
138 |       Node::Forwarded(target) => {
139 |         let new = self._clone(target, seen);
140 |         self.alloc(Node::Forwarded(new))
141 |       }
142 |     };
143 | 
144 |     seen.insert(n, new);
145 |     new
146 |   }
147 | 
148 |   /// Display a NodeIdx
149 |   pub fn display(&self, idx: NodeIdx) -> NodeDisplay {
150 |     NodeDisplay { arena: self, idx }
151 |   }
152 | 
153 |   /// Creates a Node from a list of (name, noderef) features. Names CANNOT be dotted!
154 |   pub fn alloc_from_edges<I>(&mut self, edges: I) -> Result<NodeIdx, Err>
155 |   where
156 |     I: IntoIterator<Item = (String, NodeIdx)>,
157 |   {
158 |     let node = self.alloc_edged();
159 | 
160 |     for (label, target) in edges {
161 |       assert!(
162 |         !label.contains('.'),
163 |         "new_with_edges cannot take dotted paths!"
164 |       );
165 | 
166 |       self.push_edge(node, label, target)?; // error if unification failure
167 |     }
168 | 
169 |     Ok(node)
170 |   }
171 | 
172 |   pub fn alloc_from_features<I>(&mut self, paths: I) -> Result<NodeIdx, Err>
173 |   where
174 |     I: IntoIterator<Item = Feature>,
175 |   {
176 |     let root = self.alloc_edged();
177 | 
178 |     let mut tags: HashMap<String, NodeIdx> = HashMap::new();
179 |     for Feature { value, tag, path } in paths {
180 |       if let Some(tag) = tag {
181 |         if tags.contains_key(&tag) {
182 |           let tagged = tags[&tag];
183 |           self.unify(value, tagged)?;
184 |         } else {
185 |           tags.insert(tag.to_string(), value);
186 |         }
187 |       }
188 | 
189 |       let mut current = root;
190 |       let mut parts = path.split('.').peekable();
191 |       loop {
192 |         let next = parts.next().expect("shouldn't be empty b/c path.len() > 0");
193 |         let is_last = parts.peek().is_none();
194 | 
195 |         if is_last {
196 |           self.push_edge(current, next.to_string(), value)?;
197 |           break;
198 |         } else {
199 |           let new = self.alloc_edged();
200 |           self.push_edge(current, next.to_string(), new)?;
201 |           current = new;
202 |         }
203 |       }
204 |     }
205 | 
206 |     Ok(root)
207 |   }
208 | 
  /// Get the node stored at `idx` as-is (forwards are NOT followed).
  /// Assumes valid, panics on OOB
  pub fn get(&self, idx: NodeIdx) -> &Node {
    self.nodes.get(idx.0 as usize).expect("Invalid NodeIdx")
  }

  /// Mutably get the node stored at `idx` as-is (forwards are NOT followed).
  /// Assumes valid, panics on OOB
  pub fn get_mut(&mut self, idx: NodeIdx) -> &mut Node {
    self.nodes.get_mut(idx.0 as usize).expect("Invalid NodeIdx")
  }

  /// Overwrites the node at `target` with a forward to `to`. Panics on OOB.
  pub fn forward_to(&mut self, target: NodeIdx, to: NodeIdx) {
    self.nodes[target.0 as usize] = Node::Forwarded(to);
  }

  /// Whether the node stored at `n` is `Top`. Does not follow forwards.
  pub fn is_top(&self, n: NodeIdx) -> bool {
    self.get(n).is_top()
  }

  /// Whether the node stored at `n` is `Str`. Does not follow forwards.
  pub fn is_str(&self, n: NodeIdx) -> bool {
    self.get(n).is_str()
  }

  /// Whether the node stored at `n` is `Edged`. Does not follow forwards.
  pub fn is_edged(&self, n: NodeIdx) -> bool {
    self.get(n).is_edged()
  }

  /// String held by the node stored at `n`, if it is a `Str`.
  fn str(&self, n: NodeIdx) -> Option<&str> {
    self.get(n).str()
  }

  /// Arc map of the node stored at `n`, if it is `Edged`.
  fn edged(&self, n: NodeIdx) -> Option<&HashMap<String, NodeIdx>> {
    self.get(n).edged()
  }

  /// Mutable arc map of the node stored at `n`, if it is `Edged`.
  fn edged_mut(&mut self, n: NodeIdx) -> Option<&mut HashMap<String, NodeIdx>> {
    self.get_mut(n).edged_mut()
  }
246 | 
247 |   #[allow(clippy::map_entry)]
248 |   fn push_edge(&mut self, parent: NodeIdx, label: String, target: NodeIdx) -> Result<(), Err> {
249 |     let node = self.get_mut(parent);
250 | 
251 |     if node.is_top() {
252 |       *node = Node::new_edged();
253 |     }
254 | 
255 |     if let Some(arcs) = node.edged_mut() {
256 |       if arcs.contains_key(&label) {
257 |         let existing = arcs[&label];
258 |         self.unify(existing, target)?;
259 |       } else {
260 |         arcs.insert(label, target);
261 |       }
262 |       return Ok(());
263 |     }
264 | 
265 |     Err(format!("unification failure: {}", label).into())
266 |   }
267 | 
268 |   pub fn dereference(&self, mut idx: NodeIdx) -> NodeIdx {
269 |     while let Node::Forwarded(r) = self.get(idx) {
270 |       idx = *r;
271 |     }
272 |     idx
273 |   }
274 | 
275 |   /// Unify two feature structures within this arena. Both may be mutated.
276 |   pub fn unify(&mut self, n1: NodeIdx, n2: NodeIdx) -> Result<(), Err> {
277 |     let n1 = self.dereference(n1);
278 |     let n2 = self.dereference(n2);
279 | 
280 |     // if same node, already unified
281 |     if n1 == n2 {
282 |       return Ok(());
283 |     }
284 | 
285 |     // If either is top, forward to the other
286 |     if self.is_top(n1) {
287 |       self.forward_to(n1, n2);
288 |       return Ok(());
289 |     } else if self.is_top(n2) {
290 |       self.forward_to(n2, n1);
291 |       return Ok(());
292 |     }
293 | 
294 |     // try to unify string values
295 |     if self.is_str(n1) && self.is_str(n2) {
296 |       let n1_str = self.str(n1).unwrap();
297 |       let n2_str = self.str(n2).unwrap();
298 | 
299 |       if n1_str == n2_str {
300 |         self.forward_to(n1, n2);
301 |         return Ok(());
302 |       } else {
303 |         return Err(format!("unification failure: {n1_str} & {n2_str}").into());
304 |       }
305 |     }
306 | 
307 |     // if both are edged, unify their contents
308 |     if self.is_edged(n1) && self.is_edged(n2) {
309 |       let n1 = self.replace(n1, Node::Forwarded(n2));
310 |       let n1arcs = n1.edged().unwrap();
311 | 
312 |       for (label, value) in n1arcs.iter() {
313 |         if self.edged(n2).unwrap().contains_key(label) {
314 |           // shared arc
315 |           let other = self.edged(n2).unwrap().get(label).unwrap();
316 |           self.unify(*value, *other)?;
317 |         } else {
318 |           // complement arc
319 |           self.edged_mut(n2).unwrap().insert(label.clone(), *value);
320 |         }
321 |       }
322 | 
323 |       return Ok(());
324 |     }
325 | 
326 |     Err(
327 |       format!(
328 |         "unification failure: {:?} & {:?}",
329 |         self.get(n1),
330 |         self.get(n2)
331 |       )
332 |       .into(),
333 |     )
334 |   }
335 | }
336 | 
/// Helper struct for displaying a node: pairs a node index with the arena it lives in
#[derive(Clone)]
pub struct NodeDisplay<'a> {
  /// Arena used to resolve `idx` and the nodes reachable from it
  pub arena: &'a NodeArena,
  /// The node to display
  pub idx: NodeIdx,
}
343 | 
344 | impl fmt::Display for NodeDisplay<'_> {
345 |   fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
346 |     let mut counts = HashMap::new();
347 |     count_in_pointers(self, &mut counts);
348 |     let mut has_printed = HashMap::new();
349 |     format_node(self, &counts, &mut has_printed, 0, f)
350 |   }
351 | }
352 | 
353 | // for fmt::Display impl
354 | #[allow(clippy::map_entry)]
355 | fn count_in_pointers(n: &NodeDisplay, seen: &mut HashMap<NodeIdx, usize>) {
356 |   let nref = n.arena.dereference(n.idx);
357 |   if seen.contains_key(&nref) {
358 |     seen.entry(nref).and_modify(|cnt| *cnt += 1);
359 |   } else {
360 |     seen.insert(nref, 1);
361 |     if let Some(arcs) = n.arena.edged(nref) {
362 |       for value in arcs.values() {
363 |         count_in_pointers(
364 |           &NodeDisplay {
365 |             arena: n.arena,
366 |             idx: *value,
367 |           },
368 |           seen,
369 |         );
370 |       }
371 |     }
372 |   }
373 | }
374 | 
// for fmt::Display impl
// Recursively writes the node `nd` to `f`.
// * `counts` holds the in-pointer count of every reachable node (see
//   `count_in_pointers`); nodes with more than one in-pointer get a `#N` tag.
// * `has_printed` maps already-printed shared nodes to their tag number.
// * `indent` is the column at which a multi-arc node's closing bracket goes;
//   its arcs are written two columns further in.
fn format_node(
  nd: &NodeDisplay,
  counts: &HashMap<NodeIdx, usize>,
  has_printed: &mut HashMap<NodeIdx, usize>,
  indent: usize,
  f: &mut fmt::Formatter<'_>,
) -> fmt::Result {
  let arena = nd.arena;
  let idx = arena.dereference(nd.idx);

  // a shared node that was already printed is referred to by its tag only
  if counts[&idx] > 1 && has_printed.contains_key(&idx) {
    return write!(f, "#{}", has_printed[&idx]);
  }

  // first time a shared node is printed: assign the next tag number and
  // print it in front of the node's contents
  if counts[&idx] > 1 {
    let id = has_printed.len();
    has_printed.insert(idx, id);
    write!(f, "#{} ", id)?;
  }

  let r = nd.arena.get(idx);
  match r {
    Node::Top => write!(f, "**top**"),
    Node::Str(s) => write!(f, "{}", s),
    Node::Edged(arcs) => {
      if arcs.is_empty() {
        write!(f, "[]")
      } else if arcs.len() == 1 {
        // single arc: keep everything on one line
        let (label, value) = arcs.iter().next().unwrap();
        write!(f, "[ {}: ", label)?;
        // NOTE(review): the child is formatted with indent 0 rather than
        // `indent`, so a multi-arc child nested here is not aligned with the
        // surrounding structure -- confirm whether that is intended.
        format_node(
          &NodeDisplay { arena, idx: *value },
          counts,
          has_printed,
          0,
          f,
        )?;
        write!(f, " ]")
      } else {
        // several arcs: one arc per line, indented two columns further
        writeln!(f, "[")?;
        for (label, value) in arcs.iter() {
          write!(f, "{:indent$}{}: ", "", label, indent = indent + 2)?;
          format_node(
            &NodeDisplay { arena, idx: *value },
            counts,
            has_printed,
            indent + 2,
            f,
          )?;
          writeln!(f)?;
        }
        write!(f, "{:indent$}]", "", indent = indent)
      }
    }
    // `idx` was dereferenced above, so a forward here means a broken invariant
    Node::Forwarded(_) => panic!("unexpected forward"),
  }
}
433 | 


--------------------------------------------------------------------------------
/src/featurestructure/serialized.rs:
--------------------------------------------------------------------------------
  1 | use std::collections::HashMap;
  2 | 
  3 | use super::node::{Node, NodeArena, NodeIdx};
  4 | 
/// A noderef that's been serialized into a tree structure. Nodes with multiple
/// in-pointers are duplicated.
/// IMPORTANT: **top** is /stripped out/. All top features will not be present in
/// the serialized tree.
#[derive(Debug, Clone)]
pub enum SerializedNode {
  /// A string-valued leaf
  Str(String),
  /// An inner node: a map from arc label to serialized child
  Edged(HashMap<String, SerializedNode>),
}
 14 | 
 15 | impl SerializedNode {
 16 |   pub fn as_str(&self) -> Option<&str> {
 17 |     match self {
 18 |       Self::Str(s) => Some(s.as_str()),
 19 |       _ => None,
 20 |     }
 21 |   }
 22 | 
 23 |   pub fn into_str(self) -> Option<String> {
 24 |     match self {
 25 |       Self::Str(s) => Some(s),
 26 |       _ => None,
 27 |     }
 28 |   }
 29 | 
 30 |   pub fn as_edged(&self) -> Option<&HashMap<String, SerializedNode>> {
 31 |     match self {
 32 |       Self::Edged(map) => Some(map),
 33 |       _ => None,
 34 |     }
 35 |   }
 36 | 
 37 |   pub fn into_edged(self) -> Option<HashMap<String, SerializedNode>> {
 38 |     match self {
 39 |       Self::Edged(map) => Some(map),
 40 |       _ => None,
 41 |     }
 42 |   }
 43 | 
 44 |   pub fn get_path(&self, path: &[&str]) -> Option<&SerializedNode> {
 45 |     let mut node = self;
 46 |     let mut path = path;
 47 |     while !path.is_empty() {
 48 |       node = node.as_edged()?.get(path[0])?;
 49 |       path = &path[1..];
 50 |     }
 51 |     Some(node)
 52 |   }
 53 | 
 54 |   pub fn get_path_str(&self, path: &[&str]) -> Option<&str> {
 55 |     self.get_path(path).and_then(Self::as_str)
 56 |   }
 57 | 
 58 |   /// Create a SerializedNode from a NodeArena and NodeIdx
 59 |   pub fn from_node(arena: &NodeArena, idx: NodeIdx) -> Option<Self> {
 60 |     let idx = arena.dereference(idx);
 61 |     match arena.get(idx) {
 62 |       Node::Forwarded(_) => panic!("unexpected forward after dereference"),
 63 |       Node::Top => None,
 64 |       Node::Str(s) => Some(SerializedNode::Str(s.to_string())),
 65 |       Node::Edged(edges) => {
 66 |         let mut map: HashMap<String, SerializedNode> = HashMap::new();
 67 |         for (k, v) in edges.iter() {
 68 |           let value = Self::from_node(arena, *v);
 69 |           if let Some(value) = value {
 70 |             map.insert(k.to_string(), value);
 71 |           }
 72 |         }
 73 |         if map.is_empty() {
 74 |           None
 75 |         } else {
 76 |           Some(SerializedNode::Edged(map))
 77 |         }
 78 |       }
 79 |     }
 80 |   }
 81 | }
 82 | 
 83 | impl From<&str> for SerializedNode {
 84 |   fn from(s: &str) -> Self {
 85 |     s.to_string().into()
 86 |   }
 87 | }
 88 | 
 89 | impl From<String> for SerializedNode {
 90 |   fn from(s: String) -> Self {
 91 |     Self::Str(s)
 92 |   }
 93 | }
 94 | 
 95 | impl From<HashMap<String, SerializedNode>> for SerializedNode {
 96 |   fn from(hm: HashMap<String, SerializedNode>) -> Self {
 97 |     Self::Edged(hm)
 98 |   }
 99 | }
100 | 
101 | impl PartialEq for SerializedNode {
102 |   fn eq(&self, other: &Self) -> bool {
103 |     match (&self, &other) {
104 |       (SerializedNode::Str(s1), SerializedNode::Str(s2)) => s1 == s2,
105 |       (SerializedNode::Str(_), SerializedNode::Edged(_))
106 |       | (SerializedNode::Edged(_), SerializedNode::Str(_)) => false,
107 |       (SerializedNode::Edged(m1), &SerializedNode::Edged(m2)) => {
108 |         if m1.len() != m2.len() {
109 |           return false;
110 |         }
111 | 
112 |         m1.iter().all(|(k, v)| m2.get(k) == Some(v))
113 |       }
114 |     }
115 |   }
116 | }
117 | 


--------------------------------------------------------------------------------
/src/fgr/mod.rs:
--------------------------------------------------------------------------------
 1 | pub mod parse_grammar;
 2 | 
 3 | pub use parse_grammar::*;
 4 | 
#[cfg(test)]
mod tests {
  use crate::Grammar;

  /// Expands to `(filename, file contents)` for a file in the repository's
  /// `examples/` directory; the contents are embedded at compile time.
  macro_rules! example_file {
    ($filename:expr) => {
      (
        $filename,
        include_str!(concat!("../../examples/", $filename)),
      )
    };
  }

  /// Every bundled example grammar must parse into a `Grammar` without error.
  #[test]
  fn smoke_test_examples() {
    let examples = [
      example_file!("asl-wordorder.fgr"),
      example_file!("dative-shift.fgr"),
      example_file!("no-features.fgr"),
      example_file!("reflexives.fgr"),
    ];

    for (filename, src) in examples {
      assert!(src.parse::<Grammar>().is_ok(), "failed to parse {filename}");
    }
  }
}
32 | 


--------------------------------------------------------------------------------
/src/fgr/parse_grammar.rs:
--------------------------------------------------------------------------------
  1 | /// Simple recursive-descent parsing of grammar files
  2 | use std::str::FromStr;
  3 | 
  4 | use regex::Regex;
  5 | 
  6 | use crate::featurestructure::{Feature, NodeArena, NodeIdx};
  7 | use crate::rules::{Grammar, Production, Rule};
  8 | use crate::utils::Err;
  9 | 
/// The literal that spells the top value (the value that unifies with anything)
/// in grammar files, e.g. `case: **top**`.
pub const TOP_STR: &str = "**top**";
 11 | 
 12 | /// Parses a str into a tuple of (rules, nonterminals)
 13 | /// Errors if the grammar doesn't parse or is malformed
 14 | impl FromStr for Grammar {
 15 |   type Err = Err;
 16 | 
 17 |   /// Parses a grammar from a string. Assumes the first rule's symbol
 18 |   /// is the start symbol.
 19 |   fn from_str(s: &str) -> Result<Self, Self::Err> {
 20 |     let mut arena = NodeArena::new();
 21 |     let (rules, s) = parse_rules(s, &mut arena)?;
 22 |     assert!(s.is_empty());
 23 | 
 24 |     if rules.is_empty() {
 25 |       Err("empty ruleset".into())
 26 |     } else {
 27 |       Self::new(rules, arena)
 28 |     }
 29 |   }
 30 | }
 31 | 
 32 | type Infallible<'a, T> = (T, &'a str);
 33 | type ParseResult<'a, T> = Result<(T, &'a str), Err>;
 34 | 
 35 | /// helper macro for initializing a regex with lazy_static!
 36 | macro_rules! regex_static {
 37 |   ($name:ident, $pattern:expr) => {
 38 |     lazy_static! {
 39 |       static ref $name: Regex = Regex::new($pattern).unwrap();
 40 |     }
 41 |   };
 42 | }
 43 | 
 44 | /// Try to consume a regex, returning None if it doesn't match
 45 | fn optional_re<'a>(re: &'static Regex, s: &'a str) -> Infallible<'a, Option<&'a str>> {
 46 |   if let Some(caps) = re.captures(s) {
 47 |     let m = caps.get(0).unwrap();
 48 |     if m.start() > 0 {
 49 |       return (None, s);
 50 |     }
 51 |     let (_, rest) = s.split_at(m.end());
 52 |     (Some(m.as_str()), rest)
 53 |   } else {
 54 |     (None, s)
 55 |   }
 56 | }
 57 | 
 58 | /// Try to consume a regex, failing if it doesn't match
 59 | fn needed_re<'a>(re: &'static Regex, s: &'a str) -> ParseResult<'a, &'a str> {
 60 |   if let (Some(c), rest) = optional_re(re, s) {
 61 |     Ok((c, rest))
 62 |   } else {
 63 |     Err(format!("couldn't match {} at {}", re, s).into())
 64 |   }
 65 | }
 66 | 
 67 | /// Try to consume a char, returning None if it doesn't match
 68 | fn optional_char(c: char, s: &str) -> Infallible<Option<char>> {
 69 |   let mut iter = s.char_indices().peekable();
 70 |   if let Some((_, c1)) = iter.next() {
 71 |     if c == c1 {
 72 |       let rest = if let Some((idx, _)) = iter.peek() {
 73 |         s.split_at(*idx).1
 74 |       } else {
 75 |         ""
 76 |       };
 77 |       return (Some(c), rest);
 78 |     }
 79 |   }
 80 |   (None, s)
 81 | }
 82 | 
 83 | /// Try to consume a char, failing if it doesn't match
 84 | fn needed_char(c: char, s: &str) -> ParseResult<char> {
 85 |   if let (Some(c), rest) = optional_char(c, s) {
 86 |     Ok((c, rest))
 87 |   } else {
 88 |     Err(format!("couldn't match {} at {}", c, s).into())
 89 |   }
 90 | }
 91 | 
 92 | /// Tries to skip 1 or more \s characters and comments
 93 | fn skip_whitespace(s: &str) -> &str {
 94 |   regex_static!(WHITESPACE_OR_COMMENT, r"\s*(//.*?\n\s*)*");
 95 |   optional_re(&WHITESPACE_OR_COMMENT, s).1
 96 | }
 97 | 
 98 | // Tries to skip 1 or more non-newline whitespace characters
 99 | fn skip_whitespace_nonnewline(s: &str) -> &str {
100 |   regex_static!(WHITESPACE_NONNEWLINE, r"[\s&&[^\n]]*");
101 |   optional_re(&WHITESPACE_NONNEWLINE, s).1
102 | }
103 | 
104 | /// Tries to parse a name made of letters, numbers, - and _
105 | fn parse_name(s: &str) -> ParseResult<&str> {
106 |   regex_static!(NAME, r"[a-zA-Z0-9\-_]+");
107 |   needed_re(&NAME, s).map_err(|err| format!("name: {}", err).into())
108 | }
109 | 
110 | /// Tries to parse a name made of dotted segments (foo.bar.c.d)
111 | fn parse_dotted(s: &str) -> ParseResult<&str> {
112 |   regex_static!(DOTTED, r"[a-zA-Z0-9\-_]+(\.[a-zA-Z0-9\-_]+)*");
113 |   needed_re(&DOTTED, s).map_err(|e| format!("dotted name: {}", e).into())
114 | }
115 | 
116 | /// Parses an optional #tag
117 | fn parse_tag(s: &str) -> ParseResult<Option<String>> {
118 |   let (hash, s) = optional_char('#', s);
119 |   if hash.is_none() {
120 |     Ok((None, s))
121 |   } else {
122 |     let s = skip_whitespace(s);
123 |     let (name, s) = parse_name(s).map_err(|e| -> Err { format!("tag: {}", e).into() })?;
124 |     Ok((Some(name.to_string()), s))
125 |   }
126 | }
127 | 
128 | /// Parses a value with an optional tag: #tag value
129 | fn parse_feature_value<'a>(
130 |   s: &'a str,
131 |   arena: &mut NodeArena,
132 | ) -> ParseResult<'a, (Option<String>, NodeIdx)> {
133 |   regex_static!(VALUE, r"[a-zA-Z0-9\-_\*]+");
134 |   let (tag, s) = parse_tag(s)?;
135 |   let s = skip_whitespace(s);
136 |   let (name, s) = optional_re(&VALUE, s);
137 |   let value = if let Some(name) = name {
138 |     if name == TOP_STR {
139 |       arena.alloc_top()
140 |     } else {
141 |       arena.alloc_str(name.to_string())
142 |     }
143 |   } else if tag.is_some() {
144 |     arena.alloc_top()
145 |   } else {
146 |     return Err(format!("feature needs tag or value at {}", s).into());
147 |   };
148 |   Ok(((tag, value), s))
149 | }
150 | 
151 | fn parse_feature<'a>(s: &'a str, arena: &mut NodeArena) -> ParseResult<'a, Feature> {
152 |   let (name, s) = parse_dotted(s).map_err(|e| format!("feature name: {}", e))?;
153 |   let s = skip_whitespace(s);
154 |   let (_, s) = needed_char(':', s)?;
155 |   let s = skip_whitespace(s);
156 |   let (value, s) = parse_feature_value(s, arena).map_err(|e| format!("feature value: {}", e))?;
157 |   let s = skip_whitespace(s);
158 |   let (_, s) = optional_char(',', s);
159 | 
160 |   Ok((
161 |     Feature {
162 |       path: name.to_string(),
163 |       tag: value.0,
164 |       value: value.1,
165 |     },
166 |     s,
167 |   ))
168 | }
169 | 
170 | fn parse_featurestructure<'a>(s: &'a str, arena: &mut NodeArena) -> ParseResult<'a, Vec<Feature>> {
171 |   let mut pairs = Vec::new();
172 |   let mut rem = needed_char('[', s)?.1;
173 |   loop {
174 |     rem = skip_whitespace(rem);
175 |     if let (Some(_), rem) = optional_char(']', rem) {
176 |       return Ok((pairs, rem));
177 |     }
178 |     let (feature, s) = parse_feature(rem, arena)?;
179 |     pairs.push(feature);
180 |     rem = s;
181 |   }
182 | }
183 | 
184 | fn parse_production<'a>(
185 |   s: &'a str,
186 |   arena: &mut NodeArena,
187 | ) -> ParseResult<'a, (Production, Vec<Feature>)> {
188 |   let (name, s) = parse_name(s).map_err(|e| -> Err { format!("symbol: {}", e).into() })?;
189 |   let s = skip_whitespace_nonnewline(s);
190 |   let (features, s) = if s.starts_with('[') {
191 |     parse_featurestructure(s, arena)?
192 |   } else {
193 |     (Vec::new(), s)
194 |   };
195 | 
196 |   if name.chars().next().unwrap().is_uppercase() {
197 |     Ok(((Production::new_nonterminal(name.to_string()), features), s))
198 |   } else if !features.is_empty() {
199 |     Err(format!("terminal (lower-case) cannot have features: {} {}", name, s).into())
200 |   } else {
201 |     // annotate terminals with their matching string
202 |     Ok((
203 |       (
204 |         Production::new_terminal(name.to_string()),
205 |         vec![Feature {
206 |           path: "word".to_string(),
207 |           tag: None,
208 |           value: arena.alloc_str(name.to_string()),
209 |         }],
210 |       ),
211 |       s,
212 |     ))
213 |   }
214 | }
215 | 
216 | fn parse_nonterminal<'a>(
217 |   s: &'a str,
218 |   arena: &mut NodeArena,
219 | ) -> ParseResult<'a, (String, Vec<Feature>)> {
220 |   let ((prod, features), s) = parse_production(s, arena)?;
221 |   if prod.is_nonterminal() {
222 |     Ok(((prod.symbol, features), s))
223 |   } else {
224 |     Err(format!("expected nonterminal, got terminal {}: {}", prod.symbol, s).into())
225 |   }
226 | }
227 | 
228 | /// Symbol, productions, terminated by final newline
229 | fn parse_rule<'a>(s: &'a str, arena: &mut NodeArena) -> ParseResult<'a, Rule> {
230 |   #![allow(clippy::trivial_regex)]
231 |   regex_static!(ARROW, "->");
232 | 
233 |   let ((symbol, features), s) =
234 |     parse_nonterminal(s, arena).map_err(|e| -> Err { format!("rule symbol: {}", e).into() })?;
235 |   let s = skip_whitespace(s);
236 |   let (_, s) = needed_re(&ARROW, s).map_err(|e| -> Err { format!("rule arrow: {}", e).into() })?;
237 | 
238 |   let mut prods_features = Vec::new();
239 |   let mut rem = s;
240 |   loop {
241 |     rem = skip_whitespace_nonnewline(rem);
242 | 
243 |     let try_newline = skip_whitespace(rem);
244 |     if rem.is_empty() || try_newline != rem {
245 |       // end of line, exit loop
246 |       rem = try_newline;
247 |       break;
248 |     }
249 | 
250 |     let (prod, s) = parse_production(rem, arena)
251 |       .map_err(|e| -> Err { format!("rule production: {}", e).into() })?;
252 |     prods_features.push(prod);
253 |     rem = s;
254 |   }
255 | 
256 |   let (features, productions) = adopt_child_features(features, prods_features);
257 |   let features = arena.alloc_from_features(features)?;
258 | 
259 |   Ok((
260 |     Rule {
261 |       symbol,
262 |       features,
263 |       productions,
264 |     },
265 |     rem,
266 |   ))
267 | }
268 | 
269 | /// We want rules to be able to access their child features, and to be able to
270 | /// unify between them
271 | /// So we have the rule symbol "adopt" the features of its children, copying the
272 | /// child features into child-0.(...), child-1.(...), etc.
273 | ///
274 | /// We could try to implement this when constructing the rule, but it's easier
275 | /// to do as a simple AST transform.
276 | fn adopt_child_features(
277 |   mut rule_features: Vec<Feature>,
278 |   prods_features: Vec<(Production, Vec<Feature>)>,
279 | ) -> (Vec<Feature>, Vec<Production>) {
280 |   let mut productions = Vec::with_capacity(prods_features.len());
281 | 
282 |   for (idx, (prod, features)) in prods_features.into_iter().enumerate() {
283 |     productions.push(prod);
284 |     let prefix = format!("child-{}.", idx);
285 |     for feature in features.into_iter() {
286 |       rule_features.push(Feature {
287 |         path: prefix.clone() + &feature.path,
288 |         tag: feature.tag,
289 |         value: feature.value,
290 |       });
291 |     }
292 |   }
293 | 
294 |   (rule_features, productions)
295 | }
296 | 
297 | fn parse_rules<'a>(s: &'a str, arena: &mut NodeArena) -> ParseResult<'a, Vec<Rule>> {
298 |   let mut rules = Vec::new();
299 |   let mut rem = s;
300 |   loop {
301 |     rem = skip_whitespace(rem);
302 |     if rem.is_empty() {
303 |       return Ok((rules, rem));
304 |     }
305 |     let (rule, s) = parse_rule(rem, arena)?;
306 |     rules.push(rule);
307 |     rem = s;
308 |   }
309 | }
310 | 


--------------------------------------------------------------------------------
/src/forest.rs:
--------------------------------------------------------------------------------
  1 | use std::fmt;
  2 | use std::sync::Arc;
  3 | 
  4 | use crate::earley::Chart;
  5 | use crate::rules::{Grammar, Rule};
  6 | use crate::syntree::{Constituent, SynTree, Word};
  7 | use crate::utils::combinations;
  8 | 
  9 | #[derive(Debug, Clone, PartialEq, Eq)]
 10 | pub struct ForestState {
 11 |   rule: Arc<Rule>,
 12 |   span: (usize, usize),
 13 | }
 14 | 
 15 | impl ForestState {
 16 |   pub fn new(rule: &Arc<Rule>, start: usize, end: usize) -> Self {
 17 |     Self {
 18 |       rule: rule.clone(),
 19 |       span: (start, end),
 20 |     }
 21 |   }
 22 | }
 23 | 
 24 | impl fmt::Display for ForestState {
 25 |   fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 26 |     write!(f, "{}..{}: {}", self.span.0, self.span.1, self.rule)
 27 |   }
 28 | }
 29 | 
 30 | impl From<&ForestState> for Constituent<Arc<Rule>> {
 31 |   fn from(fs: &ForestState) -> Self {
 32 |     Self {
 33 |       value: fs.rule.clone(),
 34 |       span: fs.span,
 35 |     }
 36 |   }
 37 | }
 38 | 
 39 | #[derive(Debug, Clone, PartialEq, Eq)]
 40 | pub struct Forest(Vec<Vec<ForestState>>);
 41 | 
 42 | impl Forest {
 43 |   pub fn len(&self) -> usize {
 44 |     self.0.len()
 45 |   }
 46 | 
 47 |   pub fn is_empty(&self) -> bool {
 48 |     self.len() == 0
 49 |   }
 50 | 
 51 |   /// Checks if a subtree has already been completed by make_trees(),
 52 |   /// or if it is a leaf and doesn't need to be completed
 53 |   fn subtree_is_complete(node: &SynTree<Arc<Rule>, String>) -> bool {
 54 |     if let Some((cons, children)) = node.get_branch() {
 55 |       cons.value.productions.len() == children.len()
 56 |     } else {
 57 |       // is a leaf
 58 |       true
 59 |     }
 60 |   }
 61 | 
 62 |   /// Takes a rule and search span, and returns a vec of all possible sequences
 63 |   /// of trees that correspond to the rule's productions.
 64 |   /// So for the situation:
 65 |   /// ```text
 66 |   /// g := '''
 67 |   ///   S -> x
 68 |   ///   S -> S S
 69 |   /// '''
 70 |   /// chart := parse(g, "x x x")
 71 |   /// chart.extend_out(g, S -> S S, start = 0, end = 3)
 72 |   /// ```
 73 |   /// , which, recall, has a chart that looks like:
 74 |   ///
 75 |   /// ```text
 76 |   /// 0..1: S -> x
 77 |   /// 0..2: S -> S S
 78 |   /// 0..3: S -> S S
 79 |   /// 1..2: S -> x
 80 |   /// 1..3: S -> S S
 81 |   /// 2..3: S -> x
 82 |   /// ```
 83 |   ///
 84 |   /// You'd get
 85 |   ///
 86 |   /// ```text
 87 |   /// [[(S -> x, 0..1), (S -> S S, (), 1..3)],
 88 |   ///  [(S -> S S, (), 0..2), (S -> x, 2..3)]]
 89 |   /// ```
 90 |   fn extend_out(
 91 |     &self,
 92 |     rule: &Rule,
 93 |     prod_idx: usize,
 94 |     search_start: usize,
 95 |     search_end: usize,
 96 |   ) -> Vec<Vec<SynTree<Arc<Rule>, String>>> {
 97 |     if prod_idx == rule.len() && search_start == search_end {
 98 |       // base case, we consumed the whole rule and the whole span together.
 99 |       // provide a single empty sequence as a base for prepending onto as we unwind the stack
100 |       return vec![Vec::new()];
101 |     } else if prod_idx == rule.len() || search_start == search_end {
102 |       // we either ran out of productions before consuming everything, or ran out of stuff to consume before
103 |       // satisfying all the productions. bail with 0 possible sequences.
104 |       return Vec::new();
105 |     }
106 | 
107 |     let next_production = &rule.productions[prod_idx];
108 |     if next_production.is_nonterminal() {
109 |       let wanted_symbol = &next_production.symbol;
110 |       // look for potential next states to produce this production at the search start
111 |       self.0[search_start]
112 |         .iter()
113 |         // only consider states that are contained within the search range, and have our wanted symbol
114 |         .filter(|s| s.span.1 <= search_end && wanted_symbol == &s.rule.symbol)
115 |         .flat_map(|state| {
116 |           // recursively find possible sequences that start directly after this state
117 |           // TODO: this is probably easily amenable to some dynamic programming to reduce repeated work
118 |           self
119 |             .extend_out(rule, prod_idx + 1, state.span.1, search_end)
120 |             .into_iter()
121 |             // if there are any, prepend an uncompleted tree headed by this state onto the sequence and throw it on the pile
122 |             .map(move |mut seq| {
123 |               seq.insert(0, SynTree::Branch(state.into(), Vec::new()));
124 |               seq
125 |             })
126 |         })
127 |         .collect()
128 |     } else {
129 |       // similar to the nonterminal case, but we don't have to search for multiple potential states --
130 |       // all terminals with the same symbol_str are identical.
131 |       let leaf = SynTree::Leaf(Word {
132 |         value: next_production.symbol.to_string(),
133 |         span: (search_start, search_start + 1),
134 |       });
135 | 
136 |       // recursively find possible sequences, like before
137 |       self
138 |         .extend_out(rule, prod_idx + 1, search_start + 1, search_end)
139 |         .into_iter()
140 |         .map(move |mut seq| {
141 |           // prepend our new leaf to them
142 |           seq.insert(0, leaf.clone());
143 |           seq
144 |         })
145 |         .collect()
146 |     }
147 |   }
148 | 
149 |   /// Takes a possibly-uncompleted tree, and returns all possible trees it describes.
150 |   /// An uncompleted tree is a non-nullable constituent with 0 children. It needs to be passed
151 |   /// into extend_out, and then glued onto
152 |   fn make_trees(&self, tree: SynTree<Arc<Rule>, String>) -> Vec<SynTree<Arc<Rule>, String>> {
153 |     if Self::subtree_is_complete(&tree) {
154 |       vec![tree]
155 |     } else {
156 |       let (cons, _) = tree.get_branch().unwrap();
157 |       self
158 |         .extend_out(&cons.value, 0, cons.span.0, cons.span.1)
159 |         .into_iter()
160 |         .flat_map(|children| {
161 |           let child_sets = children
162 |             .into_iter()
163 |             .map(|child| self.make_trees(child))
164 |             .collect::<Vec<_>>();
165 |           combinations(&child_sets)
166 |             .into_iter()
167 |             .map(|set| SynTree::Branch(cons.clone(), set))
168 |         })
169 |         .collect::<Vec<_>>()
170 |     }
171 |   }
172 | 
173 |   pub fn trees(&self, g: &Grammar) -> Vec<SynTree<Arc<Rule>, String>> {
174 |     if self.is_empty() {
175 |       Vec::new()
176 |     } else {
177 |       // seed our search with all LR0s that started at position 0, span to
178 |       // the end of the string, and are named by the grammar's start symbol
179 |       let root_states = self.0[0]
180 |         .iter()
181 |         .filter(|state| state.span.1 == self.len() && state.rule.symbol == g.start)
182 |         .map(|state| SynTree::Branch(state.into(), Vec::new()));
183 |       // use make_trees to generate all possible filled-in trees from each seed tree
184 |       root_states.fold(
185 |         Vec::<SynTree<Arc<Rule>, String>>::new(),
186 |         |mut prev, tree| {
187 |           let mut trees = self.make_trees(tree);
188 |           prev.append(&mut trees);
189 |           prev
190 |         },
191 |       )
192 |     }
193 |   }
194 | }
195 | 
196 | impl From<Chart> for Forest {
197 |   fn from(chart: Chart) -> Self {
198 |     // the new chart will be indexed by origin location, and no rule can have
199 |     // its origin at the end of the string, so len is chart.len - 1
200 |     let mut v = vec![Vec::new(); chart.len() - 1];
201 | 
202 |     for (k, states) in chart.into_iter() {
203 |       for state in states {
204 |         // exclude unfinished rules that can't contribute to a tree
205 |         if !state.lr0.is_active() {
206 |           v.get_mut(state.origin)
207 |             .expect("origin > input len")
208 |             .push(ForestState::new(&state.lr0.rule, state.origin, k));
209 |         }
210 |       }
211 |     }
212 | 
213 |     Self(v)
214 |   }
215 | }
216 | 
217 | impl fmt::Display for Forest {
218 |   fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
219 |     for k in 0..self.len() {
220 |       writeln!(f, "Origin {}:", k)?;
221 |       for fs in self.0[k].iter() {
222 |         writeln!(f, "  {}", fs)?;
223 |       }
224 |     }
225 | 
226 |     Ok(())
227 |   }
228 | }
229 | 
/// The forest built from the chart of "x x x" lists, per origin, exactly the
/// completed rules and their spans
#[test]
fn test_parse_chart() {
  let src = r#"
    S -> x
    S -> S S
  "#;
  let g: Grammar = src.parse().unwrap();

  // the two S rules are told apart by how many productions they have
  let s_rules = g.rules.get("S").unwrap();
  let rule_of_len = |len: usize| s_rules.iter().find(|r| r.len() == len).unwrap();
  let rule1 = rule_of_len(1);
  let rule2 = rule_of_len(2);

  let forest: Forest = crate::earley::parse_chart(&g, &["x", "x", "x"]).into();

  let expected = Forest(vec![
    vec![
      ForestState::new(rule1, 0, 1),
      ForestState::new(rule2, 0, 2),
      ForestState::new(rule2, 0, 3),
    ],
    vec![ForestState::new(rule1, 1, 2), ForestState::new(rule2, 1, 3)],
    vec![ForestState::new(rule1, 2, 3)],
  ]);
  assert_eq!(forest, expected);

  println!("{}", forest);
}
268 | 
#[test]
fn test_tree_generation() {
  // guards against the tree ambiguity problem of naive earley forest processing.
  // the correct algorithm finds 2 trees:
  //  (S (S x) (S (S x) (S x)))           -> [x][xx]
  //  (S (S (S x) (S x)) (S x))           -> [xx][x]
  // a naive algorithm finds 2 more, spurious ones:
  //  (S (S x) (S x))                     -> [x][x]
  //  (S (S (S x) (S x)) (S (S x) (S x))) -> [xx][xx]

  let src = r#"
      S -> x
      S -> S S
    "#;
  let g: Grammar = src.parse().unwrap();

  let forest: Forest = crate::earley::parse_chart(&g, &["x", "x", "x"]).into();
  let trees = forest.trees(&g);

  trees.iter().for_each(|tree| println!("{}\n", tree));

  assert_eq!(trees.len(), 2);
}
295 | 


--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
  1 | /*!
  2 | A symbolic natural language parsing library for Rust, inspired by
  3 | [HPSG](https://en.wikipedia.org/wiki/Head-driven_phrase_structure_grammar).
  4 | 
  5 | # What is this?
  6 | This is a library for parsing natural or constructed languages into syntax trees
  7 | and feature structures. There's no machine learning or probabilistic models,
  8 | everything is hand-crafted and deterministic.
  9 | 
 10 | You can find out more about the motivations of this project in
 11 | [this blog post](https://vgel.me/posts/symbolic-linguistics-part1/).
 12 | 
 13 | ## But what are you using it for?
 14 | I'm using this to parse a constructed language for my upcoming xenolinguistics
 15 | game, [Themengi](https://vgel.me/themengi/).
 16 | 
 17 | # Motivation
 18 | Using a simple 80-line grammar, introduced in the tutorial below, we can parse
 19 | a simple subset of English, checking reflexive pronoun binding, case, and
 20 | number agreement.
 21 | 
 22 | ```text
 23 | $ cargo run --bin cli examples/reflexives.fgr
 24 | > she likes himself
 25 | Parsed 0 trees
 26 | 
 27 | > her likes herself
 28 | Parsed 0 trees
 29 | 
 30 | > she like herself
 31 | Parsed 0 trees
 32 | 
 33 | > she likes herself
 34 | Parsed 1 tree
 35 | (0..3: S
 36 |   (0..1: N (0..1: she))
 37 |   (1..2: TV (1..2: likes))
 38 |   (2..3: N (2..3: herself)))
 39 | [
 40 |   child-2: [
 41 |     case: acc
 42 |     pron: ref
 43 |     needs_pron: #0 she
 44 |     num: sg
 45 |     child-0: [ word: herself ]
 46 |   ]
 47 |   child-1: [
 48 |     tense: nonpast
 49 |     child-0: [ word: likes ]
 50 |     num: #1 sg
 51 |   ]
 52 |   child-0: [
 53 |     child-0: [ word: she ]
 54 |     case: nom
 55 |     pron: #0
 56 |     num: #1
 57 |   ]
 58 | ]
 59 | ```
 60 | 
 61 | Low resource language? Low problem! No need to train on gigabytes of text, just
 62 | write a grammar using your brain. Let's hypothesize that in
 63 | American Sign Language, topicalized nouns (expressed with raised eyebrows)
 64 | must appear first in the sentence. We can write a small grammar (18 lines),
 65 | and plug in some sentences:
 66 | 
 67 | ```text
 68 | $ cargo run --bin cli examples/asl-wordorder.fgr -n
 69 | > boy sit
 70 | Parsed 1 tree
 71 | (0..2: S
 72 |   (0..1: NP ((0..1: N (0..1: boy))))
 73 |   (1..2: IV (1..2: sit)))
 74 | 
 75 | > boy throw ball
 76 | Parsed 1 tree
 77 | (0..3: S
 78 |   (0..1: NP ((0..1: N (0..1: boy))))
 79 |   (1..2: TV (1..2: throw))
 80 |   (2..3: NP ((2..3: N (2..3: ball)))))
 81 | 
 82 | > ball nm-raised-eyebrows boy throw
 83 | Parsed 1 tree
 84 | (0..4: S
 85 |   (0..2: NP
 86 |     (0..1: N (0..1: ball))
 87 |     (1..2: Topic (1..2: nm-raised-eyebrows)))
 88 |   (2..3: NP ((2..3: N (2..3: boy))))
 89 |   (3..4: TV (3..4: throw)))
 90 | 
 91 | > boy throw ball nm-raised-eyebrows
 92 | Parsed 0 trees
 93 | ```
 94 | 
 95 | # Tutorial
 96 | As an example, let's say we want to build a parser for English reflexive
 97 | pronouns (himself, herself, themselves, themself, itself). We'll also support
 98 | number ("He likes X" vs. "They like X") and simple embedded clauses
 99 | ("He said that they like X").
100 | 
101 | Grammar files are written in a custom language, similar to BNF, called
102 | Feature GRammar (.fgr). There's a VSCode syntax highlighting extension for these
103 | files available as [`fgr-syntax`](https://marketplace.visualstudio.com/items?itemName=vgel.fgr-syntax).
104 | 
105 | We'll start by defining our lexicon. The lexicon is the set of terminal symbols
106 | (symbols in the actual input) that the grammar will match. Terminal symbols must
107 | start with a lowercase letter, and non-terminal symbols must start with an
108 | uppercase letter.
109 | 
110 | ```fgr
111 | // pronouns
112 | N -> he
113 | N -> him
114 | N -> himself
115 | N -> she
116 | N -> her
117 | N -> herself
118 | N -> they
119 | N -> them
120 | N -> themselves
121 | N -> themself
122 | 
123 | // names, lowercase as they are terminals
124 | N -> mary
125 | N -> sue
126 | N -> takeshi
127 | N -> robert
128 | 
129 | // complementizer
130 | Comp -> that
131 | 
132 | // verbs -- intransitive, transitive, and clausal
133 | IV -> falls
134 | IV -> fall
135 | IV -> fell
136 | 
137 | TV -> likes
138 | TV -> like
139 | TV -> liked
140 | 
141 | CV -> says
142 | CV -> say
143 | CV -> said
144 | ```
145 | 
146 | Next, we can add our sentence rules (they must be added at the top, as the first
147 | rule in the file is assumed to be the top-level rule):
148 | 
149 | ```fgr
150 | // sentence rules
151 | S -> N IV
152 | S -> N TV N
153 | S -> N CV Comp S
154 | 
155 | // ... previous lexicon ...
156 | ```
157 | 
158 | Assuming this file is saved as `examples/no-features.fgr` (which it is :wink:),
159 | we can test this file with the built-in CLI:
160 | 
161 | ```text
162 | $ cargo run --bin cli examples/no-features.fgr
163 | > he falls
164 | Parsed 1 tree
165 | (0..2: S
166 |   (0..1: N (0..1: he))
167 |   (1..2: IV (1..2: falls)))
168 | [
169 |   child-1: [ child-0: [ word: falls ] ]
170 |   child-0: [ child-0: [ word: he ] ]
171 | ]
172 | 
173 | > he falls her
174 | Parsed 0 trees
175 | 
176 | > he likes her
177 | Parsed 1 tree
178 | (0..3: S
179 |   (0..1: N (0..1: he))
180 |   (1..2: TV (1..2: likes))
181 |   (2..3: N (2..3: her)))
182 | [
183 |   child-2: [ child-0: [ word: her ] ]
184 |   child-1: [ child-0: [ word: likes ] ]
185 |   child-0: [ child-0: [ word: he ] ]
186 | ]
187 | 
188 | > he likes
189 | Parsed 0 trees
190 | 
191 | > he said that he likes her
192 | Parsed 1 tree
193 | (0..6: S
194 |   (0..1: N (0..1: he))
195 |   (1..2: CV (1..2: said))
196 |   (2..3: Comp (2..3: that))
197 |   (3..6: S
198 |     (3..4: N (3..4: he))
199 |     (4..5: TV (4..5: likes))
200 |     (5..6: N (5..6: her))))
201 | [
202 |   child-0: [ child-0: [ word: he ] ]
203 |   child-2: [ child-0: [ word: that ] ]
204 |   child-1: [ child-0: [ word: said ] ]
205 |   child-3: [
206 |     child-2: [ child-0: [ word: her ] ]
207 |     child-1: [ child-0: [ word: likes ] ]
208 |     child-0: [ child-0: [ word: he ] ]
209 |   ]
210 | ]
211 | 
212 | > he said that he
213 | Parsed 0 trees
214 | ```
215 | 
216 | This grammar already parses some correct sentences, and blocks some trivially
217 | incorrect ones. However, it doesn't care about number, case, or reflexives
218 | right now:
219 | 
220 | ```text
221 | > she likes himself  // unbound reflexive pronoun
222 | Parsed 1 tree
223 | (0..3: S
224 |   (0..1: N (0..1: she))
225 |   (1..2: TV (1..2: likes))
226 |   (2..3: N (2..3: himself)))
227 | [
228 |   child-0: [ child-0: [ word: she ] ]
229 |   child-2: [ child-0: [ word: himself ] ]
230 |   child-1: [ child-0: [ word: likes ] ]
231 | ]
232 | 
233 | > him like her  // incorrect case on the subject pronoun, should be nominative
234 |                 // (he) instead of accusative (him)
235 | Parsed 1 tree
236 | (0..3: S
237 |   (0..1: N (0..1: him))
238 |   (1..2: TV (1..2: like))
239 |   (2..3: N (2..3: her)))
240 | [
241 |   child-0: [ child-0: [ word: him ] ]
242 |   child-1: [ child-0: [ word: like ] ]
243 |   child-2: [ child-0: [ word: her ] ]
244 | ]
245 | 
246 | > he like her  // incorrect verb number agreement
247 | Parsed 1 tree
248 | (0..3: S
249 |   (0..1: N (0..1: he))
250 |   (1..2: TV (1..2: like))
251 |   (2..3: N (2..3: her)))
252 | [
253 |   child-2: [ child-0: [ word: her ] ]
254 |   child-1: [ child-0: [ word: like ] ]
255 |   child-0: [ child-0: [ word: he ] ]
256 | ]
257 | ```
258 | 
259 | To fix this, we need to add *features* to our lexicon, and restrict the sentence
260 | rules based on features.
261 | 
262 | Features are added with square brackets, and are key: value pairs separated by
263 | commas. `**top**` is a special feature value, which basically means
264 | "unspecified" -- we'll come back to it later. Features that are unspecified are
265 | also assumed to have a `**top**` value, but sometimes explicitly stating top is
266 | more clear.
267 | 
268 | ```fgr
269 | /// Pronouns
270 | // The added features are:
271 | // * num: sg or pl, whether this noun wants a singular verb (likes) or
272 | //   a plural verb (like). note this is grammatical number, so for example
273 | //   singular they takes plural agreement ("they like X", not *"they likes X")
274 | // * case: nom or acc, whether this noun is nominative or accusative case.
275 | //   nominative case goes in the subject, and accusative in the object.
276 | //   e.g., "he fell" and "she likes him", not *"him fell" and *"her likes he"
277 | // * pron: he, she, they, or ref -- what type of pronoun this is
278 | // * needs_pron: whether this is a reflexive that needs to bind to another
279 | //   pronoun.
280 | N[ num: sg, case: nom, pron: he ]                    -> he
281 | N[ num: sg, case: acc, pron: he ]                    -> him
282 | N[ num: sg, case: acc, pron: ref, needs_pron: he ]   -> himself
283 | N[ num: sg, case: nom, pron: she ]                   -> she
284 | N[ num: sg, case: acc, pron: she ]                   -> her
285 | N[ num: sg, case: acc, pron: ref, needs_pron: she]   -> herself
286 | N[ num: pl, case: nom, pron: they ]                  -> they
287 | N[ num: pl, case: acc, pron: they ]                  -> them
288 | N[ num: pl, case: acc, pron: ref, needs_pron: they ] -> themselves
289 | N[ num: sg, case: acc, pron: ref, needs_pron: they ] -> themself
290 | 
291 | // Names
292 | // The added features are:
293 | // * num: sg, as people are singular ("mary likes her" / *"mary like her")
294 | // * case: **top**, as names can be both subjects and objects
295 | //   ("mary likes her" / "she likes mary")
296 | // * pron: whichever pronoun the person uses for reflexive agreement
297 | //   mary    pron: she  => mary likes herself
298 | //   sue     pron: they => sue likes themself
299 | //   takeshi pron: he   => takeshi likes himself
300 | N[ num: sg, case: **top**, pron: she ]  -> mary
301 | N[ num: sg, case: **top**, pron: they ] -> sue
302 | N[ num: sg, case: **top**, pron: he ]   -> takeshi
303 | N[ num: sg, case: **top**, pron: he ]   -> robert
304 | 
305 | // Complementizer doesn't need features
306 | Comp -> that
307 | 
308 | // Verbs -- intransitive, transitive, and clausal
309 | // The added features are:
310 | // * num: sg, pl, or **top** -- to match the noun numbers.
311 | //   **top** will match either sg or pl, as past-tense verbs in English
312 | //   don't agree in number: "he fell" and "they fell" are both fine
313 | // * tense: past or nonpast -- this won't be used for agreement, but will be
314 | //   copied into the final feature structure, and the client code could do
315 | //   something with it
316 | IV[ num:      sg, tense: nonpast ] -> falls
317 | IV[ num:      pl, tense: nonpast ] -> fall
318 | IV[ num: **top**, tense: past ]    -> fell
319 | 
320 | TV[ num:      sg, tense: nonpast ] -> likes
321 | TV[ num:      pl, tense: nonpast ] -> like
322 | TV[ num: **top**, tense: past ]    -> liked
323 | 
324 | CV[ num:      sg, tense: nonpast ] -> says
325 | CV[ num:      pl, tense: nonpast ] -> say
326 | CV[ num: **top**, tense: past ]    -> said
327 | ```
328 | 
329 | Now that our lexicon is updated with features, we can update our sentence rules
330 | to constrain parsing based on those features. This uses two new features,
331 | tags and unification. Tags allow features to be associated between nodes in a
332 | rule, and unification controls how those features are compatible. The rules for
333 | unification are:
334 | 
335 | 1. A string feature can unify with a string feature with the same value
336 | 2. A **top** feature can unify with anything, and the nodes are merged
337 | 3. A complex feature ([ ... ] structure) is recursively unified with another
338 |    complex feature.
339 | 
340 | If unification fails anywhere, the parse is aborted and the tree is discarded.
341 | This allows the programmer to discard trees if features don't match.
342 | 
343 | ```fgr
344 | // Sentence rules
345 | // Intransitive verb:
346 | // * Subject must be nominative case
347 | // * Subject and verb must agree in number (copied through #1)
348 | S -> N[ case: nom, num: #1 ] IV[ num: #1 ]
349 | // Transitive verb:
350 | // * Subject must be nominative case
351 | // * Subject and verb must agree in number (copied through #2)
352 | // * If there's a reflexive in the object position, make sure its `needs_pron`
353 | //   feature matches the subject's `pron` feature. If the object isn't a
354 | //   reflexive, then its `needs_pron` feature will implicitly be `**top**`, so
355 | //   will unify with anything.
356 | S -> N[ case: nom, pron: #1, num: #2 ] TV[ num: #2 ] N[ case: acc, needs_pron: #1 ]
357 | // Clausal verb:
358 | // * Subject must be nominative case
359 | // * Subject and verb must agree in number (copied through #1)
360 | // * Reflexives can't cross clause boundaries (*"He said that she likes himself"),
361 | //   so we can ignore reflexives and delegate to inner clause rule
362 | S -> N[ case: nom, num: #1 ] CV[ num: #1 ] Comp S
363 | ```
364 | 
365 | Now that we have this augmented grammar (available as `examples/reflexives.fgr`),
366 | we can try it out and see that it rejects illicit sentences that were previously
367 | accepted, while still accepting valid ones:
368 | 
369 | ```text
370 | > he fell
371 | Parsed 1 tree
372 | (0..2: S
373 |   (0..1: N (0..1: he))
374 |   (1..2: IV (1..2: fell)))
375 | [
376 |   child-1: [
377 |     child-0: [ word: fell ]
378 |     num: #0 sg
379 |     tense: past
380 |   ]
381 |   child-0: [
382 |     pron: he
383 |     case: nom
384 |     num: #0
385 |     child-0: [ word: he ]
386 |   ]
387 | ]
388 | 
389 | > he like him
390 | Parsed 0 trees
391 | 
392 | > he likes himself
393 | Parsed 1 tree
394 | (0..3: S
395 |   (0..1: N (0..1: he))
396 |   (1..2: TV (1..2: likes))
397 |   (2..3: N (2..3: himself)))
398 | [
399 |   child-1: [
400 |     num: #0 sg
401 |     child-0: [ word: likes ]
402 |     tense: nonpast
403 |   ]
404 |   child-2: [
405 |     needs_pron: #1 he
406 |     num: sg
407 |     child-0: [ word: himself ]
408 |     pron: ref
409 |     case: acc
410 |   ]
411 |   child-0: [
412 |     child-0: [ word: he ]
413 |     pron: #1
414 |     num: #0
415 |     case: nom
416 |   ]
417 | ]
418 | 
419 | > he likes herself
420 | Parsed 0 trees
421 | 
422 | > mary likes herself
423 | Parsed 1 tree
424 | (0..3: S
425 |   (0..1: N (0..1: mary))
426 |   (1..2: TV (1..2: likes))
427 |   (2..3: N (2..3: herself)))
428 | [
429 |   child-0: [
430 |     pron: #0 she
431 |     num: #1 sg
432 |     case: nom
433 |     child-0: [ word: mary ]
434 |   ]
435 |   child-1: [
436 |     tense: nonpast
437 |     child-0: [ word: likes ]
438 |     num: #1
439 |   ]
440 |   child-2: [
441 |     child-0: [ word: herself ]
442 |     num: sg
443 |     pron: ref
444 |     case: acc
445 |     needs_pron: #0
446 |   ]
447 | ]
448 | 
449 | > mary likes themself
450 | Parsed 0 trees
451 | 
452 | > sue likes themself
453 | Parsed 1 tree
454 | (0..3: S
455 |   (0..1: N (0..1: sue))
456 |   (1..2: TV (1..2: likes))
457 |   (2..3: N (2..3: themself)))
458 | [
459 |   child-0: [
460 |     pron: #0 they
461 |     child-0: [ word: sue ]
462 |     case: nom
463 |     num: #1 sg
464 |   ]
465 |   child-1: [
466 |     tense: nonpast
467 |     num: #1
468 |     child-0: [ word: likes ]
469 |   ]
470 |   child-2: [
471 |     needs_pron: #0
472 |     case: acc
473 |     pron: ref
474 |     child-0: [ word: themself ]
475 |     num: sg
476 |   ]
477 | ]
478 | 
479 | > sue likes himself
480 | Parsed 0 trees
481 | ```
482 | 
483 | If this is interesting to you and you want to learn more, you can check out
484 | [my blog series](https://vgel.me/posts/symbolic-linguistics-part1/),
485 | the excellent textbook [Syntactic Theory: A Formal Introduction (2nd ed.)](https://web.stanford.edu/group/cslipublications/cslipublications/site/1575864002.shtml),
486 | and the [DELPH-IN project](http://www.delph-in.net/wiki/index.php/Home), whose
487 | work on the LKB inspired this simplified version.
488 | 
489 | # Using from code
490 | I need to write this section in more detail, but if you're comfortable with Rust,
491 | I suggest looking through the codebase. It's not perfect, it started as one of
492 | my first Rust projects (after migrating through F# -> TypeScript -> C in search
493 | of the right performance/ergonomics tradeoff), and it could use more tests,
494 | but overall it's not too bad.
495 | 
496 | Basically, the processing pipeline is:
497 | 
498 | 1. Make a `Grammar` struct
499 |   * `Grammar` is defined in `rules.rs`.
500 |   * The easiest way to make a `Grammar` is `Grammar::read_from_file`, which uses
501 |     a mostly hand-written recursive descent parser in `fgr/parse_grammar.rs`. Yes,
502 |     I recognize the irony here.
503 | 2. It takes input (in `Grammar::parse`, which does everything for you, or
504 |    `Grammar::parse_chart`, which just does the chart)
505 | 3. The input is first chart-parsed in `earley.rs`
506 | 4. Then, a forest is built from the chart, in `forest.rs`, using an algorithm
507 |     I found in a very useful blog series I forget the URL for, because the
508 |     algorithms in the academic literature for this are... weird.
509 | 5. Finally, the feature unification is used to prune the forest down to only
510 |    valid trees. It would be more efficient to do this during parsing, but meh.
511 | 
512 | The most interesting thing you can do via code and not via the CLI is probably
513 | getting at the raw feature DAG, as that would let you do things like pronoun
514 | coreference. The DAG code is in the `featurestructure` module, and should be fairly
515 | approachable -- there's a lot of Rust ceremony around `Rc<RefCell<...>>`
516 | because using an arena allocation crate seemed ~~too har~~like overkill, but
517 | that is somewhat mitigated by the `NodeRef` type alias. Hit me up at
518 | https://vgel.me/contact if you need help with anything here!
519 | */
520 | 
521 | #[macro_use]
522 | extern crate lazy_static;
523 | 
524 | pub mod earley;
525 | pub mod featurestructure;
526 | pub mod fgr;
527 | pub mod forest;
528 | pub mod rules;
529 | pub mod syntree;
530 | pub mod utils;
531 | 
532 | use std::fs;
533 | use std::path;
534 | use std::sync::Arc;
535 | 
536 | use tracing::{debug, trace};
537 | 
538 | pub use crate::earley::{Chart, parse_chart};
539 | pub use crate::featurestructure::{NodeArena, NodeIdx};
540 | pub use crate::forest::Forest;
541 | pub use crate::rules::{Grammar, Rule};
542 | pub use crate::syntree::{Constituent, SynTree};
543 | pub use crate::utils::Err;
544 | 
545 | impl Grammar {
546 |   pub fn parse_chart(&self, input: &[&str]) -> Chart {
547 |     parse_chart(self, input)
548 |   }
549 | 
550 |   pub fn parse_forest(&self, input: &[&str]) -> Forest {
551 |     Forest::from(self.parse_chart(input))
552 |   }
553 | 
554 |   pub fn unify_tree(
555 |     tree: SynTree<Arc<Rule>, String>,
556 |     arena: &mut NodeArena,
557 |   ) -> Result<(SynTree<String, String>, NodeIdx), Err> {
558 |     match tree {
559 |       SynTree::Leaf(w) => Ok((SynTree::Leaf(w), arena.alloc_top())),
560 |       SynTree::Branch(cons, children) => {
561 |         let features = arena.clone(cons.value.features);
562 | 
563 |         let mut bare_children = Vec::with_capacity(children.len());
564 |         for (idx, child) in children.into_iter().enumerate() {
565 |           let (child_tree, child_features) = Self::unify_tree(child, arena)?;
566 |           bare_children.push(child_tree);
567 | 
568 |           let to_unify =
569 |             arena.alloc_from_edges(vec![(format!("child-{}", idx), child_features)])?;
570 | 
571 |           trace!("unifying {} with child-{}", cons.value.symbol, idx);
572 |           trace!(
573 |             "{} features: {}",
574 |             cons.value.symbol,
575 |             arena.display(cons.value.features)
576 |           );
577 |           arena.unify(features, to_unify)?;
578 |         }
579 | 
580 |         let bare_self = SynTree::Branch(
581 |           Constituent {
582 |             span: cons.span,
583 |             value: cons.value.symbol.clone(),
584 |           },
585 |           bare_children,
586 |         );
587 | 
588 |         Ok((bare_self, features))
589 |       }
590 |     }
591 |   }
592 | 
593 |   pub fn parse(&self, input: &[&str]) -> Vec<(SynTree<String, String>, NodeIdx, NodeArena)> {
594 |     let forest = self.parse_forest(input);
595 |     let trees = forest.trees(self);
596 | 
597 |     let mut results = Vec::new();
598 | 
599 |     for tree in trees {
600 |       // TODO might be able to share arena between parse attempts
601 |       let mut arena = self.create_parse_arena();
602 |       match Self::unify_tree(tree, &mut arena) {
603 |         Ok((syn_tree, idx)) => results.push((syn_tree, idx, arena)),
604 |         Err(e) => debug!("{e}"),
605 |       }
606 |     }
607 | 
608 |     results
609 |   }
610 | 
611 |   pub fn read_from_file<P: AsRef<path::Path>>(path: P) -> Result<Self, Err> {
612 |     fs::read_to_string(path)?.parse()
613 |   }
614 | }
615 | 
// Checks that tag unification (`pron: #1` / `needs_pron: #1`) accepts
// sentences whose reflexive agrees with the subject and rejects the rest.
#[test]
fn test_unification_blocking() {
  let grammar_src = r#"
    S -> N[ case: nom, pron: #1 ] TV N[ case: acc, needs_pron: #1 ]
    TV -> likes
    N[ case: nom, pron: she ] -> she
    N[ case: nom, pron: he ] -> he
    N[ case: acc, pron: he ] -> him
    N[ case: acc, pron: ref, needs_pron: he ] -> himself
  "#;
  let g: Grammar = grammar_src.parse().unwrap();

  // (sentence, expected number of surviving parse trees)
  let cases: [([&str; 3], usize); 6] = [
    (["he", "likes", "himself"], 1),
    (["he", "likes", "him"], 1),
    (["she", "likes", "him"], 1),
    (["himself", "likes", "himself"], 0),
    (["she", "likes", "himself"], 0),
    (["himself", "likes", "him"], 0),
  ];

  for (sentence, expected) in cases.iter() {
    assert_eq!(g.parse(sentence).len(), *expected);
  }
}
637 | 
// Parses sentences with the dative-shift example grammar, which is loaded
// from `examples/` relative to the working directory of the test run.
#[test]
fn test_complex() {
  let source = std::fs::read_to_string("examples/dative-shift.fgr").unwrap();
  let g: Grammar = source.parse().unwrap();

  let tree_count = |sentence: &[&str]| g.parse(sentence).len();

  assert_eq!(tree_count(&["i", "gave", "her", "apples"]), 1);
  assert_eq!(tree_count(&["i", "gave", "apples", "to", "her"]), 1);
  assert_eq!(tree_count(&["i", "gave", "to", "her", "apples"]), 0);
}
649 | 


--------------------------------------------------------------------------------
/src/rules.rs:
--------------------------------------------------------------------------------
  1 | use std::collections::{HashMap, HashSet};
  2 | use std::fmt;
  3 | use std::sync::Arc;
  4 | 
  5 | use crate::featurestructure::{NodeArena, NodeIdx};
  6 | use crate::utils::Err;
  7 | 
  8 | #[derive(Debug, Copy, Clone, PartialEq, Eq)]
  9 | pub enum ProductionKind {
 10 |   Terminal,
 11 |   Nonterminal,
 12 | }
 13 | 
 14 | #[derive(Debug, Clone, PartialEq, Eq)]
 15 | pub struct Production {
 16 |   pub kind: ProductionKind,
 17 |   pub symbol: String,
 18 | }
 19 | 
 20 | impl Production {
 21 |   pub fn new_terminal(symbol: String) -> Self {
 22 |     Self {
 23 |       kind: ProductionKind::Terminal,
 24 |       symbol,
 25 |     }
 26 |   }
 27 | 
 28 |   pub fn new_nonterminal(symbol: String) -> Self {
 29 |     Self {
 30 |       kind: ProductionKind::Nonterminal,
 31 |       symbol,
 32 |     }
 33 |   }
 34 | 
 35 |   pub fn is_terminal(&self) -> bool {
 36 |     self.kind == ProductionKind::Terminal
 37 |   }
 38 | 
 39 |   pub fn is_nonterminal(&self) -> bool {
 40 |     self.kind == ProductionKind::Nonterminal
 41 |   }
 42 | }
 43 | 
 44 | impl fmt::Display for Production {
 45 |   fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 46 |     write!(f, "{}", self.symbol)
 47 |   }
 48 | }
 49 | 
 50 | #[derive(Debug, PartialEq, Eq)]
 51 | pub struct Rule {
 52 |   pub symbol: String,
 53 |   pub features: NodeIdx,
 54 |   pub productions: Vec<Production>,
 55 | }
 56 | 
 57 | impl Rule {
 58 |   pub fn len(&self) -> usize {
 59 |     self.productions.len()
 60 |   }
 61 | 
 62 |   pub fn is_empty(&self) -> bool {
 63 |     self.len() == 0
 64 |   }
 65 | }
 66 | 
 67 | impl std::fmt::Display for Rule {
 68 |   fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
 69 |     // Note: can't display features here without an arena reference
 70 |     write!(f, "{} ->", self.symbol)?;
 71 |     for p in self.productions.iter() {
 72 |       write!(f, " {}", p)?;
 73 |     }
 74 |     Ok(())
 75 |   }
 76 | }
 77 | 
 78 | #[derive(Debug)]
 79 | pub struct Grammar {
 80 |   pub start: String,
 81 |   pub rules: HashMap<String, Vec<Arc<Rule>>>,
 82 |   pub arena: NodeArena,
 83 |   nullables: HashSet<String>,
 84 |   nonterminals: HashSet<String>,
 85 | }
 86 | 
 87 | impl std::fmt::Display for Grammar {
 88 |   fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
 89 |     writeln!(f, "//** start: {}", self.start)?;
 90 |     write!(f, "//** nonterminals:")?;
 91 |     for nt in self.nonterminals.iter() {
 92 |       write!(f, " {}", nt)?;
 93 |     }
 94 |     writeln!(f)?;
 95 | 
 96 |     write!(f, "//** nullables:")?;
 97 |     for nt in self.nullables.iter() {
 98 |       write!(f, " {}", nt)?;
 99 |     }
100 |     writeln!(f)?;
101 | 
102 |     for rule in self.rules.values().flatten() {
103 |       writeln!(f, "{}\n", rule)?;
104 |     }
105 | 
106 |     Ok(())
107 |   }
108 | }
109 | 
110 | impl Grammar {
111 |   pub fn new(rules: Vec<Rule>, arena: NodeArena) -> Result<Self, Err> {
112 |     assert!(!rules.is_empty());
113 | 
114 |     let nonterminals: HashSet<String> = rules.iter().map(|r| r.symbol.clone()).collect();
115 |     let start = rules[0].symbol.clone();
116 | 
117 |     for r in rules.iter() {
118 |       for p in r.productions.iter() {
119 |         if p.is_nonterminal() && !nonterminals.contains(&p.symbol) {
120 |           return Err(format!("missing rules for nonterminal {}", p.symbol).into());
121 |         }
122 |       }
123 |     }
124 | 
125 |     let rules: HashMap<String, Vec<Arc<Rule>>> =
126 |       rules.into_iter().fold(HashMap::new(), |mut map, rule| {
127 |         map
128 |           .entry(rule.symbol.clone())
129 |           .or_default()
130 |           .push(Arc::new(rule));
131 |         map
132 |       });
133 | 
134 |     let nullables = Self::find_nullables(&rules);
135 | 
136 |     Ok(Self {
137 |       start,
138 |       rules,
139 |       arena,
140 |       nonterminals,
141 |       nullables,
142 |     })
143 |   }
144 | 
145 |   // Create a fresh arena for parsing, with a clone of the grammar's arena
146 |   pub fn create_parse_arena(&self) -> NodeArena {
147 |     self.arena.clone()
148 |   }
149 | 
150 |   pub fn is_nullable(&self, s: &str) -> bool {
151 |     self.nullables.contains(s)
152 |   }
153 | }
154 | 
155 | impl Grammar {
156 |   fn rule_is_nullable(nullables: &HashSet<String>, rule: &Rule) -> bool {
157 |     rule.is_empty()
158 |       || rule
159 |         .productions
160 |         .iter()
161 |         .all(|p| p.is_nonterminal() && nullables.contains(&p.symbol))
162 |   }
163 | 
164 |   fn find_nullables(rules: &HashMap<String, Vec<Arc<Rule>>>) -> HashSet<String> {
165 |     let mut nullables: HashSet<String> = HashSet::new();
166 | 
167 |     let mut last_length = 1;
168 |     while last_length != nullables.len() {
169 |       last_length = nullables.len();
170 |       for r in rules.values().flatten() {
171 |         if !nullables.contains(&r.symbol) && Self::rule_is_nullable(&nullables, r) {
172 |           nullables.insert(r.symbol.clone());
173 |         }
174 |       }
175 |     }
176 | 
177 |     nullables
178 |   }
179 | }
180 | 
// Parses a small grammar and checks the derived nonterminal set and the
// number of rules stored per left-hand symbol.
#[test]
fn test_parse_grammar() {
  let source = r#"
       S -> N[ case: nom, num: #1 ] IV[ num: #1 ]
       S -> N[ case: nom, pron: #1, num: #2 ] TV[ num: #2 ] N[ case: acc, needs_pron: #1 ]
       S -> N[ case: nom, num: #1 ] CV[ num: #num ] Comp S

       N[ num: sg, pron: she ]     -> mary
       IV[ num: top, tense: past ] -> fell
       TV[ num: top, tense: past ] -> kissed
       CV[ num: top, tense: past ] -> said
       Comp -> that
     "#;
  let g: Grammar = source.parse().unwrap();

  let expected_nonterminals: HashSet<String> = ["S", "N", "IV", "TV", "CV", "Comp"]
    .iter()
    .copied()
    .map(String::from)
    .collect();
  assert_eq!(expected_nonterminals, g.nonterminals);
  assert_eq!(g.rules.len(), 6);

  // (left-hand symbol, number of rules expected for it)
  let rule_counts = [
    ("S", 3),
    ("N", 1),
    ("IV", 1),
    ("TV", 1),
    ("CV", 1),
    ("Comp", 1),
  ];
  for (symbol, count) in rule_counts.iter() {
    assert_eq!(g.rules.get(*symbol).unwrap().len(), *count);
  }

  // Terminals never get a rule entry of their own.
  for terminal in ["that", "mary"].iter() {
    assert!(!g.rules.contains_key(*terminal));
  }
}
213 | 
// `D` has an empty right-hand side and `B` consists only of `D`s, so exactly
// those two symbols must be reported as nullable.
#[test]
fn test_find_nullables() {
  let source = r#"
      S -> A B
      A -> c
      B -> D D
      D ->
    "#;
  let g = source.parse::<Grammar>().unwrap();

  let expected: HashSet<String> = ["B", "D"].iter().copied().map(String::from).collect();
  assert_eq!(g.nullables, expected);
}
228 | 


--------------------------------------------------------------------------------
/src/syntree.rs:
--------------------------------------------------------------------------------
  1 | use std::fmt;
  2 | 
  3 | #[derive(Debug, Clone, PartialEq, Eq)]
  4 | pub struct Constituent<T> {
  5 |   pub value: T,
  6 |   pub span: (usize, usize),
  7 | }
  8 | 
  9 | impl<T> fmt::Display for Constituent<T>
 10 | where
 11 |   T: fmt::Display,
 12 | {
 13 |   fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 14 |     write!(f, "{}..{}: {}", self.span.0, self.span.1, self.value)
 15 |   }
 16 | }
 17 | 
 18 | #[derive(Debug, Clone, PartialEq, Eq)]
 19 | pub struct Word<U> {
 20 |   pub value: U,
 21 |   pub span: (usize, usize),
 22 | }
 23 | 
 24 | impl<U> fmt::Display for Word<U>
 25 | where
 26 |   U: fmt::Display,
 27 | {
 28 |   fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
 29 |     write!(f, "{}..{}: {}", self.span.0, self.span.1, self.value)
 30 |   }
 31 | }
 32 | 
 33 | #[derive(Debug, PartialEq, Clone)]
 34 | pub enum SynTree<T, U> {
 35 |   Branch(Constituent<T>, Vec<SynTree<T, U>>),
 36 |   Leaf(Word<U>),
 37 | }
 38 | 
 39 | impl<T, U> SynTree<T, U> {
 40 |   pub fn is_leaf(&self) -> bool {
 41 |     matches!(self, Self::Leaf(_))
 42 |   }
 43 | 
 44 |   pub fn is_branch(&self) -> bool {
 45 |     matches!(self, Self::Branch(_, _))
 46 |   }
 47 | 
 48 |   pub fn get_leaf(&self) -> Option<&Word<U>> {
 49 |     match self {
 50 |       Self::Leaf(w) => Some(w),
 51 |       _ => None,
 52 |     }
 53 |   }
 54 | 
 55 |   #[allow(clippy::type_complexity)] // TODO
 56 |   pub fn get_branch(&self) -> Option<(&Constituent<T>, &Vec<SynTree<T, U>>)> {
 57 |     match self {
 58 |       Self::Branch(c, cs) => Some((c, cs)),
 59 |       _ => None,
 60 |     }
 61 |   }
 62 | 
 63 |   #[allow(clippy::type_complexity)] // TODO
 64 |   pub fn into_branch(self) -> Option<(Constituent<T>, Vec<SynTree<T, U>>)> {
 65 |     match self {
 66 |       Self::Branch(c, cs) => Some((c, cs)),
 67 |       _ => None,
 68 |     }
 69 |   }
 70 | 
 71 |   pub fn map<V, W>(
 72 |     &self,
 73 |     map_branch: fn(&Constituent<T>) -> V,
 74 |     map_leaf: fn(&Word<U>) -> W,
 75 |   ) -> SynTree<V, W> {
 76 |     match self {
 77 |       Self::Branch(t, children) => {
 78 |         let children = children
 79 |           .iter()
 80 |           .map(|c| c.map(map_branch, map_leaf))
 81 |           .collect::<Vec<_>>();
 82 |         SynTree::Branch(
 83 |           Constituent {
 84 |             span: t.span,
 85 |             value: map_branch(t),
 86 |           },
 87 |           children,
 88 |         )
 89 |       }
 90 |       Self::Leaf(u) => SynTree::Leaf(Word {
 91 |         span: u.span,
 92 |         value: map_leaf(u),
 93 |       }),
 94 |     }
 95 |   }
 96 | }
 97 | 
 98 | impl<T, U> fmt::Display for SynTree<T, U>
 99 | where
100 |   T: fmt::Display,
101 |   U: fmt::Display,
102 | {
103 |   fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
104 |     match self {
105 |       Self::Leaf(t) => write!(f, "{}", t),
106 |       Self::Branch(t, ts) => {
107 |         write!(f, "({}", t)?;
108 |         if ts.len() == 1 {
109 |           write!(f, " ({}))", ts[0])
110 |         } else {
111 |           for t in ts.iter() {
112 |             // TODO: is there a nice way to do this that doesn't allocate a String?
113 |             let fmt = format!("{}", t);
114 |             for line in fmt.lines() {
115 |               write!(f, "\n  {}", line)?;
116 |             }
117 |           }
118 |           write!(f, ")")
119 |         }
120 |       }
121 |     }
122 |   }
123 | }
124 | 


--------------------------------------------------------------------------------
/src/utils.rs:
--------------------------------------------------------------------------------
 1 | use std::error::Error;
 2 | 
/// Boxed, type-erased error (`Box<dyn Error + 'static>`), used as the error
/// type of the crate's fallible functions (`Result<_, Err>`).
pub type Err = Box<dyn Error + 'static>;
 5 | 
 6 | /// Takes a list where each element is a set of choices, and returns all the possible sets
 7 | /// generated. Will clone the elements.
 8 | ///
 9 | /// ```
10 | /// let v = vec![
11 | ///   vec![1],
12 | ///   vec![2, 3],
13 | ///   vec![4],
14 | ///   vec![5, 6, 7],
15 | /// ];
16 | ///
17 | /// assert_eq!(treebender::utils::combinations(&v), vec![
18 | ///   vec![1, 2, 4, 5],
19 | ///   vec![1, 3, 4, 5],
20 | ///   vec![1, 2, 4, 6],
21 | ///   vec![1, 3, 4, 6],
22 | ///   vec![1, 2, 4, 7],
23 | ///   vec![1, 3, 4, 7],
24 | /// ]);
25 | /// ```
26 | pub fn combinations<T>(list: &[Vec<T>]) -> Vec<Vec<T>>
27 | where
28 |   T: Clone,
29 | {
30 |   if list.is_empty() {
31 |     Vec::new()
32 |   } else if list.len() == 1 {
33 |     list[0].iter().map(|e| vec![e.clone()]).collect()
34 |   } else {
35 |     let (head, tail) = list.split_at(1);
36 |     let head = &head[0];
37 | 
38 |     combinations(tail)
39 |       .into_iter()
40 |       .flat_map(|subseq| {
41 |         // prepend every element of the head to every possible subseq
42 |         head.iter().map(move |v| {
43 |           let mut newseq = subseq.clone();
44 |           newseq.insert(0, v.clone());
45 |           newseq
46 |         })
47 |       })
48 |       .collect()
49 |   }
50 | }
51 | 


--------------------------------------------------------------------------------