├── .gitignore ├── CHANGELOG.md ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── README.tpl ├── benches ├── full_parse_reflexives.rs └── reflexives.fgr ├── cli ├── Cargo.toml └── src │ └── main.rs ├── examples ├── asl-wordorder.fgr ├── dative-shift.fgr ├── no-features.fgr └── reflexives.fgr ├── rustfmt.toml └── src ├── earley.rs ├── featurestructure ├── mod.rs ├── node.rs └── serialized.rs ├── fgr ├── mod.rs └── parse_grammar.rs ├── forest.rs ├── lib.rs ├── rules.rs ├── syntree.rs └── utils.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | **/*.rs.bk 3 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.1.2 4 | 5 | Changed all uses of `Rc` into `Arc`, for multi-threaded use. 6 | 7 | ## 0.1.1 8 | 9 | - Added `From` implementation for `HashMap` that gives 10 | an easier way to work with the DAG, if you don't care about forwarding 11 | relationships. 12 | 13 | ## 0.1.0 14 | 15 | - Initial release. 16 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 4 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "1.1.3" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "anes" 16 | version = "0.1.6" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" 19 | 20 | [[package]] 21 | name = "anstyle" 22 | version = "1.0.10" 23 | source = "registry+https://github.com/rust-lang/crates.io-index" 24 | checksum = "55cc3b69f167a1ef2e161439aa98aed94e6028e5f9a59be9a6ffb47aef1651f9" 25 | 26 | [[package]] 27 | name = "autocfg" 28 | version = "1.4.0" 29 | source = "registry+https://github.com/rust-lang/crates.io-index" 30 | checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" 31 | 32 | [[package]] 33 | name = "bumpalo" 34 | version = "3.17.0" 35 | source = "registry+https://github.com/rust-lang/crates.io-index" 36 | checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" 37 | 38 | [[package]] 39 | name = "cast" 40 | version = "0.3.0" 41 | source = "registry+https://github.com/rust-lang/crates.io-index" 42 | checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" 43 | 44 | [[package]] 45 | name = "cfg-if" 46 | version = "1.0.0" 47 | source = "registry+https://github.com/rust-lang/crates.io-index" 48 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 49 | 50 | [[package]] 51 | name = "ciborium" 52 | version = "0.2.2" 53 | source = "registry+https://github.com/rust-lang/crates.io-index" 54 | checksum = "42e69ffd6f0917f5c029256a24d0161db17cea3997d185db0d35926308770f0e" 55 | dependencies = [ 56 | "ciborium-io", 57 | "ciborium-ll", 58 | "serde", 59 | ] 60 | 61 | [[package]] 62 | name = "ciborium-io" 63 | version = "0.2.2" 64 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 65 | checksum = "05afea1e0a06c9be33d539b876f1ce3692f4afea2cb41f740e7743225ed1c757" 66 | 67 | [[package]] 68 | name = "ciborium-ll" 69 | version = "0.2.2" 70 | source = "registry+https://github.com/rust-lang/crates.io-index" 71 | checksum = "57663b653d948a338bfb3eeba9bb2fd5fcfaecb9e199e87e1eda4d9e8b240fd9" 72 | dependencies = [ 73 | "ciborium-io", 74 | "half", 75 | ] 76 | 77 | [[package]] 78 | name = "clap" 79 | version = "4.5.32" 80 | source = "registry+https://github.com/rust-lang/crates.io-index" 81 | checksum = "6088f3ae8c3608d19260cd7445411865a485688711b78b5be70d78cd96136f83" 82 | dependencies = [ 83 | "clap_builder", 84 | ] 85 | 86 | [[package]] 87 | name = "clap_builder" 88 | version = "4.5.32" 89 | source = "registry+https://github.com/rust-lang/crates.io-index" 90 | checksum = "22a7ef7f676155edfb82daa97f99441f3ebf4a58d5e32f295a56259f1b6facc8" 91 | dependencies = [ 92 | "anstyle", 93 | "clap_lex", 94 | ] 95 | 96 | [[package]] 97 | name = "clap_lex" 98 | version = "0.7.4" 99 | source = "registry+https://github.com/rust-lang/crates.io-index" 100 | checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" 101 | 102 | [[package]] 103 | name = "cli" 104 | version = "0.2.0" 105 | dependencies = [ 106 | "tracing-subscriber", 107 | "treebender", 108 | ] 109 | 110 | [[package]] 111 | name = "criterion" 112 | version = "0.5.1" 113 | source = "registry+https://github.com/rust-lang/crates.io-index" 114 | checksum = "f2b12d017a929603d80db1831cd3a24082f8137ce19c69e6447f54f5fc8d692f" 115 | dependencies = [ 116 | "anes", 117 | "cast", 118 | "ciborium", 119 | "clap", 120 | "criterion-plot", 121 | "is-terminal", 122 | "itertools", 123 | "num-traits", 124 | "once_cell", 125 | "oorandom", 126 | "plotters", 127 | "rayon", 128 | "regex", 129 | "serde", 130 | "serde_derive", 131 | "serde_json", 132 | "tinytemplate", 133 | "walkdir", 134 | ] 135 | 136 | [[package]] 137 | name = "criterion-plot" 138 | 
version = "0.5.0" 139 | source = "registry+https://github.com/rust-lang/crates.io-index" 140 | checksum = "6b50826342786a51a89e2da3a28f1c32b06e387201bc2d19791f622c673706b1" 141 | dependencies = [ 142 | "cast", 143 | "itertools", 144 | ] 145 | 146 | [[package]] 147 | name = "crossbeam-deque" 148 | version = "0.8.6" 149 | source = "registry+https://github.com/rust-lang/crates.io-index" 150 | checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" 151 | dependencies = [ 152 | "crossbeam-epoch", 153 | "crossbeam-utils", 154 | ] 155 | 156 | [[package]] 157 | name = "crossbeam-epoch" 158 | version = "0.9.18" 159 | source = "registry+https://github.com/rust-lang/crates.io-index" 160 | checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" 161 | dependencies = [ 162 | "crossbeam-utils", 163 | ] 164 | 165 | [[package]] 166 | name = "crossbeam-utils" 167 | version = "0.8.21" 168 | source = "registry+https://github.com/rust-lang/crates.io-index" 169 | checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" 170 | 171 | [[package]] 172 | name = "crunchy" 173 | version = "0.2.3" 174 | source = "registry+https://github.com/rust-lang/crates.io-index" 175 | checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" 176 | 177 | [[package]] 178 | name = "either" 179 | version = "1.15.0" 180 | source = "registry+https://github.com/rust-lang/crates.io-index" 181 | checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" 182 | 183 | [[package]] 184 | name = "half" 185 | version = "2.4.1" 186 | source = "registry+https://github.com/rust-lang/crates.io-index" 187 | checksum = "6dd08c532ae367adf81c312a4580bc67f1d0fe8bc9c460520283f4c0ff277888" 188 | dependencies = [ 189 | "cfg-if", 190 | "crunchy", 191 | ] 192 | 193 | [[package]] 194 | name = "hermit-abi" 195 | version = "0.5.0" 196 | source = "registry+https://github.com/rust-lang/crates.io-index" 197 | checksum = 
"fbd780fe5cc30f81464441920d82ac8740e2e46b29a6fad543ddd075229ce37e" 198 | 199 | [[package]] 200 | name = "is-terminal" 201 | version = "0.4.16" 202 | source = "registry+https://github.com/rust-lang/crates.io-index" 203 | checksum = "e04d7f318608d35d4b61ddd75cbdaee86b023ebe2bd5a66ee0915f0bf93095a9" 204 | dependencies = [ 205 | "hermit-abi", 206 | "libc", 207 | "windows-sys", 208 | ] 209 | 210 | [[package]] 211 | name = "itertools" 212 | version = "0.10.5" 213 | source = "registry+https://github.com/rust-lang/crates.io-index" 214 | checksum = "b0fd2260e829bddf4cb6ea802289de2f86d6a7a690192fbe91b3f46e0f2c8473" 215 | dependencies = [ 216 | "either", 217 | ] 218 | 219 | [[package]] 220 | name = "itoa" 221 | version = "1.0.15" 222 | source = "registry+https://github.com/rust-lang/crates.io-index" 223 | checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" 224 | 225 | [[package]] 226 | name = "js-sys" 227 | version = "0.3.77" 228 | source = "registry+https://github.com/rust-lang/crates.io-index" 229 | checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" 230 | dependencies = [ 231 | "once_cell", 232 | "wasm-bindgen", 233 | ] 234 | 235 | [[package]] 236 | name = "lazy_static" 237 | version = "1.5.0" 238 | source = "registry+https://github.com/rust-lang/crates.io-index" 239 | checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" 240 | 241 | [[package]] 242 | name = "libc" 243 | version = "0.2.170" 244 | source = "registry+https://github.com/rust-lang/crates.io-index" 245 | checksum = "875b3680cb2f8f71bdcf9a30f38d48282f5d3c95cbf9b3fa57269bb5d5c06828" 246 | 247 | [[package]] 248 | name = "log" 249 | version = "0.4.26" 250 | source = "registry+https://github.com/rust-lang/crates.io-index" 251 | checksum = "30bde2b3dc3671ae49d8e2e9f044c7c005836e7a023ee57cffa25ab82764bb9e" 252 | 253 | [[package]] 254 | name = "matchers" 255 | version = "0.1.0" 256 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 257 | checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558" 258 | dependencies = [ 259 | "regex-automata 0.1.10", 260 | ] 261 | 262 | [[package]] 263 | name = "memchr" 264 | version = "2.7.4" 265 | source = "registry+https://github.com/rust-lang/crates.io-index" 266 | checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" 267 | 268 | [[package]] 269 | name = "nu-ansi-term" 270 | version = "0.46.0" 271 | source = "registry+https://github.com/rust-lang/crates.io-index" 272 | checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" 273 | dependencies = [ 274 | "overload", 275 | "winapi", 276 | ] 277 | 278 | [[package]] 279 | name = "num-traits" 280 | version = "0.2.19" 281 | source = "registry+https://github.com/rust-lang/crates.io-index" 282 | checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" 283 | dependencies = [ 284 | "autocfg", 285 | ] 286 | 287 | [[package]] 288 | name = "once_cell" 289 | version = "1.21.0" 290 | source = "registry+https://github.com/rust-lang/crates.io-index" 291 | checksum = "cde51589ab56b20a6f686b2c68f7a0bd6add753d697abf720d63f8db3ab7b1ad" 292 | 293 | [[package]] 294 | name = "oorandom" 295 | version = "11.1.5" 296 | source = "registry+https://github.com/rust-lang/crates.io-index" 297 | checksum = "d6790f58c7ff633d8771f42965289203411a5e5c68388703c06e14f24770b41e" 298 | 299 | [[package]] 300 | name = "overload" 301 | version = "0.1.1" 302 | source = "registry+https://github.com/rust-lang/crates.io-index" 303 | checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" 304 | 305 | [[package]] 306 | name = "pin-project-lite" 307 | version = "0.2.16" 308 | source = "registry+https://github.com/rust-lang/crates.io-index" 309 | checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" 310 | 311 | [[package]] 312 | name = "plotters" 313 | version = "0.3.7" 314 | 
source = "registry+https://github.com/rust-lang/crates.io-index" 315 | checksum = "5aeb6f403d7a4911efb1e33402027fc44f29b5bf6def3effcc22d7bb75f2b747" 316 | dependencies = [ 317 | "num-traits", 318 | "plotters-backend", 319 | "plotters-svg", 320 | "wasm-bindgen", 321 | "web-sys", 322 | ] 323 | 324 | [[package]] 325 | name = "plotters-backend" 326 | version = "0.3.7" 327 | source = "registry+https://github.com/rust-lang/crates.io-index" 328 | checksum = "df42e13c12958a16b3f7f4386b9ab1f3e7933914ecea48da7139435263a4172a" 329 | 330 | [[package]] 331 | name = "plotters-svg" 332 | version = "0.3.7" 333 | source = "registry+https://github.com/rust-lang/crates.io-index" 334 | checksum = "51bae2ac328883f7acdfea3d66a7c35751187f870bc81f94563733a154d7a670" 335 | dependencies = [ 336 | "plotters-backend", 337 | ] 338 | 339 | [[package]] 340 | name = "proc-macro2" 341 | version = "1.0.94" 342 | source = "registry+https://github.com/rust-lang/crates.io-index" 343 | checksum = "a31971752e70b8b2686d7e46ec17fb38dad4051d94024c88df49b667caea9c84" 344 | dependencies = [ 345 | "unicode-ident", 346 | ] 347 | 348 | [[package]] 349 | name = "quote" 350 | version = "1.0.39" 351 | source = "registry+https://github.com/rust-lang/crates.io-index" 352 | checksum = "c1f1914ce909e1658d9907913b4b91947430c7d9be598b15a1912935b8c04801" 353 | dependencies = [ 354 | "proc-macro2", 355 | ] 356 | 357 | [[package]] 358 | name = "rayon" 359 | version = "1.10.0" 360 | source = "registry+https://github.com/rust-lang/crates.io-index" 361 | checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" 362 | dependencies = [ 363 | "either", 364 | "rayon-core", 365 | ] 366 | 367 | [[package]] 368 | name = "rayon-core" 369 | version = "1.12.1" 370 | source = "registry+https://github.com/rust-lang/crates.io-index" 371 | checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" 372 | dependencies = [ 373 | "crossbeam-deque", 374 | "crossbeam-utils", 375 | ] 376 | 377 | [[package]] 
378 | name = "regex" 379 | version = "1.11.1" 380 | source = "registry+https://github.com/rust-lang/crates.io-index" 381 | checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" 382 | dependencies = [ 383 | "aho-corasick", 384 | "memchr", 385 | "regex-automata 0.4.9", 386 | "regex-syntax 0.8.5", 387 | ] 388 | 389 | [[package]] 390 | name = "regex-automata" 391 | version = "0.1.10" 392 | source = "registry+https://github.com/rust-lang/crates.io-index" 393 | checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" 394 | dependencies = [ 395 | "regex-syntax 0.6.29", 396 | ] 397 | 398 | [[package]] 399 | name = "regex-automata" 400 | version = "0.4.9" 401 | source = "registry+https://github.com/rust-lang/crates.io-index" 402 | checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" 403 | dependencies = [ 404 | "aho-corasick", 405 | "memchr", 406 | "regex-syntax 0.8.5", 407 | ] 408 | 409 | [[package]] 410 | name = "regex-syntax" 411 | version = "0.6.29" 412 | source = "registry+https://github.com/rust-lang/crates.io-index" 413 | checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" 414 | 415 | [[package]] 416 | name = "regex-syntax" 417 | version = "0.8.5" 418 | source = "registry+https://github.com/rust-lang/crates.io-index" 419 | checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" 420 | 421 | [[package]] 422 | name = "rustversion" 423 | version = "1.0.20" 424 | source = "registry+https://github.com/rust-lang/crates.io-index" 425 | checksum = "eded382c5f5f786b989652c49544c4877d9f015cc22e145a5ea8ea66c2921cd2" 426 | 427 | [[package]] 428 | name = "ryu" 429 | version = "1.0.20" 430 | source = "registry+https://github.com/rust-lang/crates.io-index" 431 | checksum = "28d3b2b1366ec20994f1fd18c3c594f05c5dd4bc44d8bb0c1c632c8d6829481f" 432 | 433 | [[package]] 434 | name = "same-file" 435 | version = "1.0.6" 436 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 437 | checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" 438 | dependencies = [ 439 | "winapi-util", 440 | ] 441 | 442 | [[package]] 443 | name = "serde" 444 | version = "1.0.219" 445 | source = "registry+https://github.com/rust-lang/crates.io-index" 446 | checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" 447 | dependencies = [ 448 | "serde_derive", 449 | ] 450 | 451 | [[package]] 452 | name = "serde_derive" 453 | version = "1.0.219" 454 | source = "registry+https://github.com/rust-lang/crates.io-index" 455 | checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" 456 | dependencies = [ 457 | "proc-macro2", 458 | "quote", 459 | "syn", 460 | ] 461 | 462 | [[package]] 463 | name = "serde_json" 464 | version = "1.0.140" 465 | source = "registry+https://github.com/rust-lang/crates.io-index" 466 | checksum = "20068b6e96dc6c9bd23e01df8827e6c7e1f2fddd43c21810382803c136b99373" 467 | dependencies = [ 468 | "itoa", 469 | "memchr", 470 | "ryu", 471 | "serde", 472 | ] 473 | 474 | [[package]] 475 | name = "sharded-slab" 476 | version = "0.1.7" 477 | source = "registry+https://github.com/rust-lang/crates.io-index" 478 | checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" 479 | dependencies = [ 480 | "lazy_static", 481 | ] 482 | 483 | [[package]] 484 | name = "smallvec" 485 | version = "1.14.0" 486 | source = "registry+https://github.com/rust-lang/crates.io-index" 487 | checksum = "7fcf8323ef1faaee30a44a340193b1ac6814fd9b7b4e88e9d4519a3e4abe1cfd" 488 | 489 | [[package]] 490 | name = "syn" 491 | version = "2.0.100" 492 | source = "registry+https://github.com/rust-lang/crates.io-index" 493 | checksum = "b09a44accad81e1ba1cd74a32461ba89dee89095ba17b32f5d03683b1b1fc2a0" 494 | dependencies = [ 495 | "proc-macro2", 496 | "quote", 497 | "unicode-ident", 498 | ] 499 | 500 | [[package]] 501 | name = "thread_local" 502 | version = 
"1.1.8" 503 | source = "registry+https://github.com/rust-lang/crates.io-index" 504 | checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c" 505 | dependencies = [ 506 | "cfg-if", 507 | "once_cell", 508 | ] 509 | 510 | [[package]] 511 | name = "tinytemplate" 512 | version = "1.2.1" 513 | source = "registry+https://github.com/rust-lang/crates.io-index" 514 | checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" 515 | dependencies = [ 516 | "serde", 517 | "serde_json", 518 | ] 519 | 520 | [[package]] 521 | name = "tracing" 522 | version = "0.1.41" 523 | source = "registry+https://github.com/rust-lang/crates.io-index" 524 | checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" 525 | dependencies = [ 526 | "pin-project-lite", 527 | "tracing-attributes", 528 | "tracing-core", 529 | ] 530 | 531 | [[package]] 532 | name = "tracing-attributes" 533 | version = "0.1.28" 534 | source = "registry+https://github.com/rust-lang/crates.io-index" 535 | checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" 536 | dependencies = [ 537 | "proc-macro2", 538 | "quote", 539 | "syn", 540 | ] 541 | 542 | [[package]] 543 | name = "tracing-core" 544 | version = "0.1.33" 545 | source = "registry+https://github.com/rust-lang/crates.io-index" 546 | checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" 547 | dependencies = [ 548 | "once_cell", 549 | "valuable", 550 | ] 551 | 552 | [[package]] 553 | name = "tracing-log" 554 | version = "0.2.0" 555 | source = "registry+https://github.com/rust-lang/crates.io-index" 556 | checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" 557 | dependencies = [ 558 | "log", 559 | "once_cell", 560 | "tracing-core", 561 | ] 562 | 563 | [[package]] 564 | name = "tracing-subscriber" 565 | version = "0.3.19" 566 | source = "registry+https://github.com/rust-lang/crates.io-index" 567 | checksum = 
"e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008" 568 | dependencies = [ 569 | "matchers", 570 | "nu-ansi-term", 571 | "once_cell", 572 | "regex", 573 | "sharded-slab", 574 | "smallvec", 575 | "thread_local", 576 | "tracing", 577 | "tracing-core", 578 | "tracing-log", 579 | ] 580 | 581 | [[package]] 582 | name = "treebender" 583 | version = "0.2.0" 584 | dependencies = [ 585 | "criterion", 586 | "lazy_static", 587 | "regex", 588 | "tracing", 589 | ] 590 | 591 | [[package]] 592 | name = "unicode-ident" 593 | version = "1.0.18" 594 | source = "registry+https://github.com/rust-lang/crates.io-index" 595 | checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" 596 | 597 | [[package]] 598 | name = "valuable" 599 | version = "0.1.1" 600 | source = "registry+https://github.com/rust-lang/crates.io-index" 601 | checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" 602 | 603 | [[package]] 604 | name = "walkdir" 605 | version = "2.5.0" 606 | source = "registry+https://github.com/rust-lang/crates.io-index" 607 | checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" 608 | dependencies = [ 609 | "same-file", 610 | "winapi-util", 611 | ] 612 | 613 | [[package]] 614 | name = "wasm-bindgen" 615 | version = "0.2.100" 616 | source = "registry+https://github.com/rust-lang/crates.io-index" 617 | checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" 618 | dependencies = [ 619 | "cfg-if", 620 | "once_cell", 621 | "rustversion", 622 | "wasm-bindgen-macro", 623 | ] 624 | 625 | [[package]] 626 | name = "wasm-bindgen-backend" 627 | version = "0.2.100" 628 | source = "registry+https://github.com/rust-lang/crates.io-index" 629 | checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" 630 | dependencies = [ 631 | "bumpalo", 632 | "log", 633 | "proc-macro2", 634 | "quote", 635 | "syn", 636 | "wasm-bindgen-shared", 637 | ] 638 | 639 | [[package]] 640 | name = 
"wasm-bindgen-macro" 641 | version = "0.2.100" 642 | source = "registry+https://github.com/rust-lang/crates.io-index" 643 | checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" 644 | dependencies = [ 645 | "quote", 646 | "wasm-bindgen-macro-support", 647 | ] 648 | 649 | [[package]] 650 | name = "wasm-bindgen-macro-support" 651 | version = "0.2.100" 652 | source = "registry+https://github.com/rust-lang/crates.io-index" 653 | checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" 654 | dependencies = [ 655 | "proc-macro2", 656 | "quote", 657 | "syn", 658 | "wasm-bindgen-backend", 659 | "wasm-bindgen-shared", 660 | ] 661 | 662 | [[package]] 663 | name = "wasm-bindgen-shared" 664 | version = "0.2.100" 665 | source = "registry+https://github.com/rust-lang/crates.io-index" 666 | checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" 667 | dependencies = [ 668 | "unicode-ident", 669 | ] 670 | 671 | [[package]] 672 | name = "web-sys" 673 | version = "0.3.77" 674 | source = "registry+https://github.com/rust-lang/crates.io-index" 675 | checksum = "33b6dd2ef9186f1f2072e409e99cd22a975331a6b3591b12c764e0e55c60d5d2" 676 | dependencies = [ 677 | "js-sys", 678 | "wasm-bindgen", 679 | ] 680 | 681 | [[package]] 682 | name = "winapi" 683 | version = "0.3.9" 684 | source = "registry+https://github.com/rust-lang/crates.io-index" 685 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 686 | dependencies = [ 687 | "winapi-i686-pc-windows-gnu", 688 | "winapi-x86_64-pc-windows-gnu", 689 | ] 690 | 691 | [[package]] 692 | name = "winapi-i686-pc-windows-gnu" 693 | version = "0.4.0" 694 | source = "registry+https://github.com/rust-lang/crates.io-index" 695 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 696 | 697 | [[package]] 698 | name = "winapi-util" 699 | version = "0.1.9" 700 | source = "registry+https://github.com/rust-lang/crates.io-index" 701 | checksum = 
"cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" 702 | dependencies = [ 703 | "windows-sys", 704 | ] 705 | 706 | [[package]] 707 | name = "winapi-x86_64-pc-windows-gnu" 708 | version = "0.4.0" 709 | source = "registry+https://github.com/rust-lang/crates.io-index" 710 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 711 | 712 | [[package]] 713 | name = "windows-sys" 714 | version = "0.59.0" 715 | source = "registry+https://github.com/rust-lang/crates.io-index" 716 | checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" 717 | dependencies = [ 718 | "windows-targets", 719 | ] 720 | 721 | [[package]] 722 | name = "windows-targets" 723 | version = "0.52.6" 724 | source = "registry+https://github.com/rust-lang/crates.io-index" 725 | checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" 726 | dependencies = [ 727 | "windows_aarch64_gnullvm", 728 | "windows_aarch64_msvc", 729 | "windows_i686_gnu", 730 | "windows_i686_gnullvm", 731 | "windows_i686_msvc", 732 | "windows_x86_64_gnu", 733 | "windows_x86_64_gnullvm", 734 | "windows_x86_64_msvc", 735 | ] 736 | 737 | [[package]] 738 | name = "windows_aarch64_gnullvm" 739 | version = "0.52.6" 740 | source = "registry+https://github.com/rust-lang/crates.io-index" 741 | checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" 742 | 743 | [[package]] 744 | name = "windows_aarch64_msvc" 745 | version = "0.52.6" 746 | source = "registry+https://github.com/rust-lang/crates.io-index" 747 | checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" 748 | 749 | [[package]] 750 | name = "windows_i686_gnu" 751 | version = "0.52.6" 752 | source = "registry+https://github.com/rust-lang/crates.io-index" 753 | checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" 754 | 755 | [[package]] 756 | name = "windows_i686_gnullvm" 757 | version = "0.52.6" 758 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 759 | checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" 760 | 761 | [[package]] 762 | name = "windows_i686_msvc" 763 | version = "0.52.6" 764 | source = "registry+https://github.com/rust-lang/crates.io-index" 765 | checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" 766 | 767 | [[package]] 768 | name = "windows_x86_64_gnu" 769 | version = "0.52.6" 770 | source = "registry+https://github.com/rust-lang/crates.io-index" 771 | checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" 772 | 773 | [[package]] 774 | name = "windows_x86_64_gnullvm" 775 | version = "0.52.6" 776 | source = "registry+https://github.com/rust-lang/crates.io-index" 777 | checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" 778 | 779 | [[package]] 780 | name = "windows_x86_64_msvc" 781 | version = "0.52.6" 782 | source = "registry+https://github.com/rust-lang/crates.io-index" 783 | checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" 784 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "treebender" 3 | version = "0.2.0" 4 | authors = ["Theia Vogel "] 5 | edition = "2024" 6 | description = "An HDPSG inspired symbolic NLP library for Rust" 7 | repository = "https://github.com/vgel/treebender" 8 | license = "MIT" 9 | keywords = ["nlp", "parsing", "earley", "syntax", "hdpsg"] 10 | categories = ["science", "text-processing"] 11 | 12 | [badges] 13 | maintenance = { status = "experimental" } 14 | 15 | [workspace] 16 | resolver = "2" 17 | members = ["cli"] 18 | 19 | [dependencies] 20 | regex = "1" 21 | lazy_static = "1" 22 | tracing = "0.1.41" 23 | 24 | [dev-dependencies] 25 | criterion = "0.5" 26 | 27 | [[bench]] 28 | name = "full_parse_reflexives" 29 | harness = false 30 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Theia Vogel 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Crates.io](https://img.shields.io/crates/v/treebender.svg)](https://crates.io/crates/treebender) 2 | ![Maintenance](https://img.shields.io/badge/maintenance-experimental-blue.svg) 3 | 4 | # Treebender 5 | 6 | A symbolic natural language parsing library for Rust, inspired by 7 | [HPSG](https://en.wikipedia.org/wiki/Head-driven_phrase_structure_grammar). 8 | 9 | ## What is this? 
10 | This is a library for parsing natural or constructed languages into syntax trees 11 | and feature structures. There are no machine learning or probabilistic models; 12 | everything is hand-crafted and deterministic. 13 | 14 | You can find out more about the motivations of this project in 15 | [this blog post](https://vgel.me/posts/symbolic-linguistics-part1/). 16 | 17 | ### But what are you using it for? 18 | I'm using this to parse a constructed language for my upcoming xenolinguistics 19 | game, [Themengi](https://vgel.me/themengi/). 20 | 21 | ## Motivation 22 | Using a simple 80-line grammar, introduced in the tutorial below, we can parse 23 | a simple subset of English, checking reflexive pronoun binding, case, and 24 | number agreement. 25 | 26 | ``` 27 | $ cargo run --bin cli examples/reflexives.fgr 28 | > she likes himself 29 | Parsed 0 trees 30 | 31 | > her likes herself 32 | Parsed 0 trees 33 | 34 | > she like herself 35 | Parsed 0 trees 36 | 37 | > she likes herself 38 | Parsed 1 tree 39 | (0..3: S 40 | (0..1: N (0..1: she)) 41 | (1..2: TV (1..2: likes)) 42 | (2..3: N (2..3: herself))) 43 | [ 44 | child-2: [ 45 | case: acc 46 | pron: ref 47 | needs_pron: #0 she 48 | num: sg 49 | child-0: [ word: herself ] 50 | ] 51 | child-1: [ 52 | tense: nonpast 53 | child-0: [ word: likes ] 54 | num: #1 sg 55 | ] 56 | child-0: [ 57 | child-0: [ word: she ] 58 | case: nom 59 | pron: #0 60 | num: #1 61 | ] 62 | ] 63 | ``` 64 | 65 | Low resource language? Low problem! No need to train on gigabytes of text, just 66 | write a grammar using your brain. Let's hypothesize that in 67 | American Sign Language, topicalized nouns (expressed with raised eyebrows) 68 | must appear first in the sentence. 
We can write a small grammar (18 lines), 69 | and plug in some sentences: 70 | 71 | ``` 72 | $ cargo run --bin cli examples/asl-wordorder.fgr -n 73 | > boy sit 74 | Parsed 1 tree 75 | (0..2: S 76 | (0..1: NP ((0..1: N (0..1: boy)))) 77 | (1..2: IV (1..2: sit))) 78 | 79 | > boy throw ball 80 | Parsed 1 tree 81 | (0..3: S 82 | (0..1: NP ((0..1: N (0..1: boy)))) 83 | (1..2: TV (1..2: throw)) 84 | (2..3: NP ((2..3: N (2..3: ball))))) 85 | 86 | > ball nm-raised-eyebrows boy throw 87 | Parsed 1 tree 88 | (0..4: S 89 | (0..2: NP 90 | (0..1: N (0..1: ball)) 91 | (1..2: Topic (1..2: nm-raised-eyebrows))) 92 | (2..3: NP ((2..3: N (2..3: boy)))) 93 | (3..4: TV (3..4: throw))) 94 | 95 | > boy throw ball nm-raised-eyebrows 96 | Parsed 0 trees 97 | ``` 98 | 99 | ## Tutorial 100 | As an example, let's say we want to build a parser for English reflexive 101 | pronouns (himself, herself, themselves, themself, itself). We'll also support 102 | number ("He likes X" vs. "They like X") and simple embedded clauses 103 | ("He said that they like X"). 104 | 105 | Grammar files are written in a custom language, similar to BNF, called 106 | Feature GRammar (.fgr). There's a VSCode syntax highlighting extension for these 107 | files available as [`fgr-syntax`](https://marketplace.visualstudio.com/items?itemName=vgel.fgr-syntax). 108 | 109 | We'll start by defining our lexicon. The lexicon is the set of terminal symbols 110 | (symbols in the actual input) that the grammar will match. Terminal symbols must 111 | start with a lowercase letter, and non-terminal symbols must start with an 112 | uppercase letter. 
113 | 114 | ```fgr 115 | // pronouns 116 | N -> he 117 | N -> him 118 | N -> himself 119 | N -> she 120 | N -> her 121 | N -> herself 122 | N -> they 123 | N -> them 124 | N -> themselves 125 | N -> themself 126 | 127 | // names, lowercase as they are terminals 128 | N -> mary 129 | N -> sue 130 | N -> takeshi 131 | N -> robert 132 | 133 | // complementizer 134 | Comp -> that 135 | 136 | // verbs -- intransitive, transitive, and clausal 137 | IV -> falls 138 | IV -> fall 139 | IV -> fell 140 | 141 | TV -> likes 142 | TV -> like 143 | TV -> liked 144 | 145 | CV -> says 146 | CV -> say 147 | CV -> said 148 | ``` 149 | 150 | Next, we can add our sentence rules (they must be added at the top, as the first 151 | rule in the file is assumed to be the top-level rule): 152 | 153 | ```fgr 154 | // sentence rules 155 | S -> N IV 156 | S -> N TV N 157 | S -> N CV Comp S 158 | 159 | // ... previous lexicon ... 160 | ``` 161 | 162 | Assuming this file is saved as `examples/no-features.fgr` (which it is :wink:), 163 | we can test this file with the built-in CLI: 164 | 165 | ``` 166 | $ cargo run --bin cli examples/no-features.fgr 167 | > he falls 168 | Parsed 1 tree 169 | (0..2: S 170 | (0..1: N (0..1: he)) 171 | (1..2: IV (1..2: falls))) 172 | [ 173 | child-1: [ child-0: [ word: falls ] ] 174 | child-0: [ child-0: [ word: he ] ] 175 | ] 176 | 177 | > he falls her 178 | Parsed 0 trees 179 | 180 | > he likes her 181 | Parsed 1 tree 182 | (0..3: S 183 | (0..1: N (0..1: he)) 184 | (1..2: TV (1..2: likes)) 185 | (2..3: N (2..3: her))) 186 | [ 187 | child-2: [ child-0: [ word: her ] ] 188 | child-1: [ child-0: [ word: likes ] ] 189 | child-0: [ child-0: [ word: he ] ] 190 | ] 191 | 192 | > he likes 193 | Parsed 0 trees 194 | 195 | > he said that he likes her 196 | Parsed 1 tree 197 | (0..6: S 198 | (0..1: N (0..1: he)) 199 | (1..2: CV (1..2: said)) 200 | (2..3: Comp (2..3: that)) 201 | (3..6: S 202 | (3..4: N (3..4: he)) 203 | (4..5: TV (4..5: likes)) 204 | (5..6: N (5..6: her)))) 
205 | [ 206 | child-0: [ child-0: [ word: he ] ] 207 | child-2: [ child-0: [ word: that ] ] 208 | child-1: [ child-0: [ word: said ] ] 209 | child-3: [ 210 | child-2: [ child-0: [ word: her ] ] 211 | child-1: [ child-0: [ word: likes ] ] 212 | child-0: [ child-0: [ word: he ] ] 213 | ] 214 | ] 215 | 216 | > he said that he 217 | Parsed 0 trees 218 | ``` 219 | 220 | This grammar already parses some correct sentences, and blocks some trivially 221 | incorrect ones. However, it doesn't care about number, case, or reflexives 222 | right now: 223 | 224 | ``` 225 | > she likes himself // unbound reflexive pronoun 226 | Parsed 1 tree 227 | (0..3: S 228 | (0..1: N (0..1: she)) 229 | (1..2: TV (1..2: likes)) 230 | (2..3: N (2..3: himself))) 231 | [ 232 | child-0: [ child-0: [ word: she ] ] 233 | child-2: [ child-0: [ word: himself ] ] 234 | child-1: [ child-0: [ word: likes ] ] 235 | ] 236 | 237 | > him like her // incorrect case on the subject pronoun, should be nominative 238 | // (he) instead of accusative (him) 239 | Parsed 1 tree 240 | (0..3: S 241 | (0..1: N (0..1: him)) 242 | (1..2: TV (1..2: like)) 243 | (2..3: N (2..3: her))) 244 | [ 245 | child-0: [ child-0: [ word: him ] ] 246 | child-1: [ child-0: [ word: like ] ] 247 | child-2: [ child-0: [ word: her ] ] 248 | ] 249 | 250 | > he like her // incorrect verb number agreement 251 | Parsed 1 tree 252 | (0..3: S 253 | (0..1: N (0..1: he)) 254 | (1..2: TV (1..2: like)) 255 | (2..3: N (2..3: her))) 256 | [ 257 | child-2: [ child-0: [ word: her ] ] 258 | child-1: [ child-0: [ word: like ] ] 259 | child-0: [ child-0: [ word: he ] ] 260 | ] 261 | ``` 262 | 263 | To fix this, we need to add *features* to our lexicon, and restrict the sentence 264 | rules based on features. 265 | 266 | Features are added with square brackets, and are key: value pairs separated by 267 | commas. `**top**` is a special feature value, which basically means 268 | "unspecified" -- we'll come back to it later. 
Features that are unspecified are 269 | also assumed to have a `**top**` value, but sometimes explicitly stating top is 270 | more clear. 271 | 272 | ```fgr 273 | /// Pronouns 274 | // The added features are: 275 | // * num: sg or pl, whether this noun wants a singular verb (likes) or 276 | // a plural verb (like). note this is grammatical number, so for example 277 | // singular they takes plural agreement ("they like X", not *"they likes X") 278 | // * case: nom or acc, whether this noun is nominative or accusative case. 279 | // nominative case goes in the subject, and accusative in the object. 280 | // e.g., "he fell" and "she likes him", not *"him fell" and *"her likes he" 281 | // * pron: he, she, they, or ref -- what type of pronoun this is 282 | // * needs_pron: whether this is a reflexive that needs to bind to another 283 | // pronoun. 284 | N[ num: sg, case: nom, pron: he ] -> he 285 | N[ num: sg, case: acc, pron: he ] -> him 286 | N[ num: sg, case: acc, pron: ref, needs_pron: he ] -> himself 287 | N[ num: sg, case: nom, pron: she ] -> she 288 | N[ num: sg, case: acc, pron: she ] -> her 289 | N[ num: sg, case: acc, pron: ref, needs_pron: she] -> herself 290 | N[ num: pl, case: nom, pron: they ] -> they 291 | N[ num: pl, case: acc, pron: they ] -> them 292 | N[ num: pl, case: acc, pron: ref, needs_pron: they ] -> themselves 293 | N[ num: sg, case: acc, pron: ref, needs_pron: they ] -> themself 294 | 295 | // Names 296 | // The added features are: 297 | // * num: sg, as people are singular ("mary likes her" / *"mary like her") 298 | // * case: **top**, as names can be both subjects and objects 299 | // ("mary likes her" / "she likes mary") 300 | // * pron: whichever pronoun the person uses for reflexive agreement 301 | // mary pron: she => mary likes herself 302 | // sue pron: they => sue likes themself 303 | // takeshi pron: he => takeshi likes himself 304 | N[ num: sg, case: **top**, pron: she ] -> mary 305 | N[ num: sg, case: **top**, pron: they ] -> sue 
306 | N[ num: sg, case: **top**, pron: he ] -> takeshi 307 | N[ num: sg, case: **top**, pron: he ] -> robert 308 | 309 | // Complementizer doesn't need features 310 | Comp -> that 311 | 312 | // Verbs -- intransitive, transitive, and clausal 313 | // The added features are: 314 | // * num: sg, pl, or **top** -- to match the noun numbers. 315 | // **top** will match either sg or pl, as past-tense verbs in English 316 | // don't agree in number: "he fell" and "they fell" are both fine 317 | // * tense: past or nonpast -- this won't be used for agreement, but will be 318 | // copied into the final feature structure, and the client code could do 319 | // something with it 320 | IV[ num: sg, tense: nonpast ] -> falls 321 | IV[ num: pl, tense: nonpast ] -> fall 322 | IV[ num: **top**, tense: past ] -> fell 323 | 324 | TV[ num: sg, tense: nonpast ] -> likes 325 | TV[ num: pl, tense: nonpast ] -> like 326 | TV[ num: **top**, tense: past ] -> liked 327 | 328 | CV[ num: sg, tense: nonpast ] -> says 329 | CV[ num: pl, tense: nonpast ] -> say 330 | CV[ num: **top**, tense: past ] -> said 331 | ``` 332 | 333 | Now that our lexicon is updated with features, we can update our sentence rules 334 | to constrain parsing based on those features. This uses two new features, 335 | tags and unification. Tags allow features to be associated between nodes in a 336 | rule, and unification controls how those features are compatible. The rules for 337 | unification are: 338 | 339 | 1. A string feature can unify with a string feature with the same value 340 | 2. A **top** feature can unify with anything, and the nodes are merged 341 | 3. A complex feature ([ ... ] structure) is recursively unified with another 342 | complex feature. 343 | 344 | If unification fails anywhere, the parse is aborted and the tree is discarded. 345 | This allows the programmer to discard trees if features don't match. 
346 | 347 | ```fgr 348 | // Sentence rules 349 | // Intransitive verb: 350 | // * Subject must be nominative case 351 | // * Subject and verb must agree in number (copied through #1) 352 | S -> N[ case: nom, num: #1 ] IV[ num: #1 ] 353 | // Transitive verb: 354 | // * Subject must be nominative case 355 | // * Subject and verb must agree in number (copied through #2) 356 | // * If there's a reflexive in the object position, make sure its `needs_pron` 357 | // feature matches the subject's `pron` feature. If the object isn't a 358 | // reflexive, then its `needs_pron` feature will implicitly be `**top**`, so 359 | // will unify with anything. 360 | S -> N[ case: nom, pron: #1, num: #2 ] TV[ num: #2 ] N[ case: acc, needs_pron: #1 ] 361 | // Clausal verb: 362 | // * Subject must be nominative case 363 | // * Subject and verb must agree in number (copied through #1) 364 | // * Reflexives can't cross clause boundaries (*"He said that she likes himself"), 365 | // so we can ignore reflexives and delegate to inner clause rule 366 | S -> N[ case: nom, num: #1 ] CV[ num: #1 ] Comp S 367 | ``` 368 | 369 | Now that we have this augmented grammar (available as `examples/reflexives.fgr`), 370 | we can try it out and see that it rejects illicit sentences that were previously 371 | accepted, while still accepting valid ones: 372 | 373 | ``` 374 | > he fell 375 | Parsed 1 tree 376 | (0..2: S 377 | (0..1: N (0..1: he)) 378 | (1..2: IV (1..2: fell))) 379 | [ 380 | child-1: [ 381 | child-0: [ word: fell ] 382 | num: #0 sg 383 | tense: past 384 | ] 385 | child-0: [ 386 | pron: he 387 | case: nom 388 | num: #0 389 | child-0: [ word: he ] 390 | ] 391 | ] 392 | 393 | > he like him 394 | Parsed 0 trees 395 | 396 | > he likes himself 397 | Parsed 1 tree 398 | (0..3: S 399 | (0..1: N (0..1: he)) 400 | (1..2: TV (1..2: likes)) 401 | (2..3: N (2..3: himself))) 402 | [ 403 | child-1: [ 404 | num: #0 sg 405 | child-0: [ word: likes ] 406 | tense: nonpast 407 | ] 408 | child-2: [ 409 | 
needs_pron: #1 he 410 | num: sg 411 | child-0: [ word: himself ] 412 | pron: ref 413 | case: acc 414 | ] 415 | child-0: [ 416 | child-0: [ word: he ] 417 | pron: #1 418 | num: #0 419 | case: nom 420 | ] 421 | ] 422 | 423 | > he likes herself 424 | Parsed 0 trees 425 | 426 | > mary likes herself 427 | Parsed 1 tree 428 | (0..3: S 429 | (0..1: N (0..1: mary)) 430 | (1..2: TV (1..2: likes)) 431 | (2..3: N (2..3: herself))) 432 | [ 433 | child-0: [ 434 | pron: #0 she 435 | num: #1 sg 436 | case: nom 437 | child-0: [ word: mary ] 438 | ] 439 | child-1: [ 440 | tense: nonpast 441 | child-0: [ word: likes ] 442 | num: #1 443 | ] 444 | child-2: [ 445 | child-0: [ word: herself ] 446 | num: sg 447 | pron: ref 448 | case: acc 449 | needs_pron: #0 450 | ] 451 | ] 452 | 453 | > mary likes themself 454 | Parsed 0 trees 455 | 456 | > sue likes themself 457 | Parsed 1 tree 458 | (0..3: S 459 | (0..1: N (0..1: sue)) 460 | (1..2: TV (1..2: likes)) 461 | (2..3: N (2..3: themself))) 462 | [ 463 | child-0: [ 464 | pron: #0 they 465 | child-0: [ word: sue ] 466 | case: nom 467 | num: #1 sg 468 | ] 469 | child-1: [ 470 | tense: nonpast 471 | num: #1 472 | child-0: [ word: likes ] 473 | ] 474 | child-2: [ 475 | needs_pron: #0 476 | case: acc 477 | pron: ref 478 | child-0: [ word: themself ] 479 | num: sg 480 | ] 481 | ] 482 | 483 | > sue likes himself 484 | Parsed 0 trees 485 | ``` 486 | 487 | If this is interesting to you and you want to learn more, you can check out 488 | [my blog series](https://vgel.me/posts/symbolic-linguistics-part1/), 489 | the excellent textbook [Syntactic Theory: A Formal Introduction (2nd ed.)](https://web.stanford.edu/group/cslipublications/cslipublications/site/1575864002.shtml), 490 | and the [DELPH-IN project](http://www.delph-in.net/wiki/index.php/Home), whose 491 | work on the LKB inspired this simplified version. 
492 | 493 | ## Using from code 494 | I need to write this section in more detail, but if you're comfortable with Rust, 495 | I suggest looking through the codebase. It's not perfect, it started as one of 496 | my first Rust projects (after migrating through F# -> TypeScript -> C in search 497 | of the right performance/ergonomics tradeoff), and it could use more tests, 498 | but overall it's not too bad. 499 | 500 | Basically, the processing pipeline is: 501 | 502 | 1. Make a `Grammar` struct 503 | * `Grammar` is defined in `rules.rs`. 504 | * The easiest way to make a `Grammar` is `Grammar::read_from_file`, which is 505 | mostly a hand-written recursive descent parser in `parse_grammar.rs`. Yes, 506 | I recognize the irony here. 507 | 2. It takes input (in `Grammar::parse`, which does everything for you, or 508 | `Grammar::parse_chart`, which just does the chart) 509 | 3. The input is first chart-parsed in `earley.rs` 510 | 4. Then, a forest is built from the chart, in `forest.rs`, using an algorithm 511 | I found in a very useful blog series I forget the URL for, because the 512 | algorithms in the academic literature for this are... weird. 513 | 5. Finally, the feature unification is used to prune the forest down to only 514 | valid trees. It would be more efficient to do this during parsing, but meh. 515 | 516 | The most interesting thing you can do via code and not via the CLI is probably 517 | getting at the raw feature DAG, as that would let you do things like pronoun 518 | coreference. The DAG code is in `featurestructure.rs`, and should be fairly 519 | approachable -- there's a lot of Rust ceremony around `Rc<RefCell<Node>>` 520 | because using an arena allocation crate seemed ~~too har~~like overkill, but 521 | that is somewhat mitigated by the `NodeRef` type alias. Hit me up at 522 | https://vgel.me/contact if you need help with anything here! 523 | 524 | ## License 525 | 526 | Licensed under the [MIT license](https://opensource.org/licenses/MIT).
527 | 528 | ### Contribution 529 | 530 | Unless you explicitly state otherwise, any contribution intentionally 531 | submitted for inclusion in the work shall be licensed as above, without any 532 | additional terms or conditions. 533 | -------------------------------------------------------------------------------- /README.tpl: -------------------------------------------------------------------------------- 1 | [![Crates.io](https://img.shields.io/crates/v/treebender.svg)](https://crates.io/crates/treebender) 2 | {{badges}} 3 | 4 | # Treebender 5 | 6 | {{readme}} 7 | 8 | ## License 9 | 10 | Licensed under the [MIT license](https://opensource.org/licenses/MIT). 11 | 12 | ### Contribution 13 | 14 | Unless you explicitly state otherwise, any contribution intentionally 15 | submitted for inclusion in the work shall be licensed as above, without any 16 | additional terms or conditions. -------------------------------------------------------------------------------- /benches/full_parse_reflexives.rs: -------------------------------------------------------------------------------- 1 | use criterion::{Criterion, black_box, criterion_group, criterion_main}; 2 | 3 | use treebender::Grammar; 4 | 5 | const GRAMMAR_SRC: &str = include_str!("./reflexives.fgr"); 6 | 7 | fn parse(g: &Grammar, input: &[&str]) -> usize { 8 | g.parse(input).len() 9 | } 10 | 11 | fn criterion_benchmark(c: &mut Criterion) { 12 | let grammar = GRAMMAR_SRC.parse::<Grammar>().unwrap(); 13 | let simple_input = "mary likes sue".split(' ').collect::<Vec<_>>(); 14 | let complex_input = "mary said that she likes herself" 15 | .split(' ') 16 | .collect::<Vec<_>>(); 17 | 18 | c.bench_function("parse simple", |b| { 19 | b.iter(|| parse(black_box(&grammar), black_box(&simple_input))) 20 | }); 21 | 22 | c.bench_function("parse complex reflexive", |b| { 23 | b.iter(|| parse(black_box(&grammar), black_box(&complex_input))) 24 | }); 25 | } 26 | 27 | criterion_group!(benches, criterion_benchmark); 28 | criterion_main!(benches); 29 |
-------------------------------------------------------------------------------- /benches/reflexives.fgr: -------------------------------------------------------------------------------- 1 | // Sentence rules 2 | // Intransitive: 3 | // * Subject must be nominative case 4 | // * Subject and verb must agree in number (copied through #1) 5 | S -> N[ case: nom, num: #1 ] IV[ num: #1 ] 6 | // Transitive: 7 | // * Subject must be nominative case 8 | // * Subject and verb must agree in number (copied through #2) 9 | // * If there's a reflexive in the object position, make sure its `needs_pron` 10 | // feature matches the subject's `pron` feature. If the object isn't a 11 | // reflexive, then its `needs_pron` feature will implicitly be `**top**`, so 12 | // will unify with anything. 13 | S -> N[ case: nom, pron: #1, num: #2 ] TV[ num: #2 ] N[ case: acc, needs_pron: #1 ] 14 | // Clausal: 15 | // * Subject must be nominative case 16 | // * Subject and verb must agree in number (copied through #1) 17 | // * Reflexives can't cross clause boundaries (*"He said that she likes himself"), 18 | // so we can ignore reflexives and delegate to inner clause rule 19 | S -> N[ case: nom, num: #1 ] CV[ num: #1 ] Comp S 20 | 21 | // Pronouns 22 | // The added features are: 23 | // * num: sg or pl, whether this noun wants a singular verb (likes) or 24 | // a plural verb (like). note this is grammatical number, so for example 25 | // singular they takes plural agreement ("they like X", not *"they likes X") 26 | // * case: nom or acc, whether this noun is nominative or accusative case. 27 | // nominative case goes in the subject, and accusative in the object. 28 | // e.g., "he fell" and "she likes him", not *"him fell" and *"her likes he" 29 | // * pron: he, she, they, or ref -- what type of pronoun this is 30 | // * needs_pron: whether this is a reflexive that needs to bind to another 31 | // pronoun. 
32 | N[ num: sg, case: nom, pron: he ] -> he 33 | N[ num: sg, case: acc, pron: he ] -> him 34 | N[ num: sg, case: acc, pron: ref, needs_pron: he ] -> himself 35 | N[ num: sg, case: nom, pron: she ] -> she 36 | N[ num: sg, case: acc, pron: she ] -> her 37 | N[ num: sg, case: acc, pron: ref, needs_pron: she] -> herself 38 | N[ num: pl, case: nom, pron: they ] -> they 39 | N[ num: pl, case: acc, pron: they ] -> them 40 | N[ num: pl, case: acc, pron: ref, needs_pron: they ] -> themselves 41 | N[ num: sg, case: acc, pron: ref, needs_pron: they ] -> themself 42 | 43 | // Names 44 | // The added features are: 45 | // * num: sg, as people are singular ("mary likes her" / *"mary like her") 46 | // * case: **top**, as names can be both subjects and objects 47 | // ("mary likes her" / "she likes mary") 48 | // * pron: whichever pronoun the person uses for reflexive agreement 49 | // mary pron: she => mary likes herself 50 | // sue pron: they => sue likes themself 51 | // takeshi pron: he => takeshi likes himself 52 | N[ num: sg, case: **top**, pron: she ] -> mary 53 | N[ num: sg, case: **top**, pron: they ] -> sue 54 | N[ num: sg, case: **top**, pron: he ] -> takeshi 55 | N[ num: sg, case: **top**, pron: he ] -> robert 56 | 57 | // Complementizer doesn't need features 58 | Comp -> that 59 | 60 | // Verbs -- intransitive, transitive, and clausal 61 | // The added features are: 62 | // * num: sg, pl, or **top** -- to match the noun numbers. 
63 | // **top** will match either sg or pl, as past-tense verbs in English 64 | // don't agree in number: "he fell" and "they fell" are both fine 65 | // * tense: past or nonpast -- this won't be used for agreement, but will be 66 | // copied into the final feature structure, and the client code could do 67 | // something with it 68 | IV[ num: sg, tense: nonpast ] -> falls 69 | IV[ num: pl, tense: nonpast ] -> fall 70 | IV[ num: **top**, tense: past ] -> fell 71 | 72 | TV[ num: sg, tense: nonpast ] -> likes 73 | TV[ num: pl, tense: nonpast ] -> like 74 | TV[ num: **top**, tense: past ] -> liked 75 | 76 | CV[ num: sg, tense: nonpast ] -> says 77 | CV[ num: pl, tense: nonpast ] -> say 78 | CV[ num: **top**, tense: past ] -> said 79 | -------------------------------------------------------------------------------- /cli/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "cli" 3 | version = "0.2.0" 4 | authors = ["Theia Vogel "] 5 | 6 | [dependencies] 7 | tracing-subscriber = { version = "0.3.19", features = ["env-filter"] } 8 | treebender = { path = "../" } 9 | -------------------------------------------------------------------------------- /cli/src/main.rs: -------------------------------------------------------------------------------- 1 | extern crate tracing_subscriber; 2 | extern crate treebender; 3 | 4 | use std::env; 5 | use std::io; 6 | use std::io::Write; 7 | use std::process; 8 | 9 | use tracing_subscriber::EnvFilter; 10 | 11 | use treebender::rules::Grammar; 12 | use treebender::Err; 13 | 14 | fn usage(prog_name: &str) -> String { 15 | format!( 16 | r"Usage: {} FILE [options] 17 | 18 | Options: 19 | -h, --help Print this message 20 | -c, --chart Print the parse chart (defaults to not printing) 21 | -n, --no-fs Don't print feature structures (defaults to printing)", 22 | prog_name 23 | ) 24 | } 25 | 26 | fn parse(g: &Grammar, sentence: &str, print_chart: bool, print_fs: bool) -> Result<(), 
Err> { 27 | let sentence = sentence.split(' ').collect::<Vec<_>>(); 28 | 29 | let chart = g.parse_chart(&sentence); 30 | 31 | if print_chart { 32 | println!("chart:\n{}\n", chart); 33 | } 34 | 35 | let trees = g.parse(&sentence); 36 | 37 | println!( 38 | "Parsed {} tree{}", 39 | trees.len(), 40 | if trees.len() == 1 { "" } else { "s" } 41 | ); 42 | 43 | for (t, idx, arena) in trees { 44 | println!("{}", t); 45 | if print_fs { 46 | println!("{}", arena.display(idx)); 47 | } 48 | println!(); 49 | } 50 | 51 | Ok(()) 52 | } 53 | 54 | struct Args { 55 | filename: String, 56 | print_fs: bool, 57 | print_chart: bool, 58 | } 59 | 60 | impl Args { 61 | fn make_error_message(msg: &str, prog_name: impl AsRef<str>) -> String { 62 | format!("argument error: {}.\n\n{}", msg, usage(prog_name.as_ref())) 63 | } 64 | 65 | fn parse(v: Vec<String>) -> Result<Self, String> { 66 | if v.is_empty() { 67 | return Err(Self::make_error_message( 68 | "bad argument vector", 69 | "treebender", 70 | )); 71 | } 72 | 73 | let args_len = v.len(); 74 | let mut iter = v.into_iter(); 75 | let prog_name = iter.next().unwrap(); 76 | 77 | if args_len < 2 { 78 | return Err(Self::make_error_message("not enough arguments", prog_name)); 79 | } 80 | 81 | let mut filename: Option<String> = None; 82 | let mut print_fs = true; // default to printing feature structures 83 | let mut print_chart = false; // default to *not* printing the chart 84 | 85 | for o in iter { 86 | if o == "-h" || o == "--help" { 87 | eprintln!("{}", usage(&prog_name)); 88 | process::exit(0); 89 | } else if o == "-n" || o == "--no-fs" { 90 | print_fs = false; 91 | } else if o == "-c" || o == "--chart" { 92 | print_chart = true; 93 | } else if filename.is_none() { 94 | filename = Some(o); 95 | } else { 96 | return Err(Self::make_error_message("invalid arguments", prog_name)); 97 | } 98 | } 99 | 100 | if let Some(filename) = filename { 101 | Ok(Self { 102 | filename, 103 | print_fs, 104 | print_chart, 105 | }) 106 | } else { 107 | Err(Self::make_error_message("missing filename",
prog_name)) 108 | } 109 | } 110 | } 111 | 112 | fn main() -> Result<(), Err> { 113 | let opts = match Args::parse(env::args().collect()) { 114 | Ok(opts) => opts, 115 | Err(msg) => { 116 | eprintln!("{}", msg); 117 | process::exit(255); 118 | } 119 | }; 120 | 121 | tracing_subscriber::fmt() 122 | .with_env_filter(EnvFilter::from_default_env()) 123 | .with_writer(std::io::stderr) 124 | .init(); 125 | 126 | let g: Grammar = Grammar::read_from_file(&opts.filename)?; 127 | 128 | let mut input = String::new(); 129 | loop { 130 | print!("> "); 131 | io::stdout().flush()?; 132 | 133 | match io::stdin().read_line(&mut input) { 134 | Ok(_) => { 135 | if input.is_empty() { 136 | // ctrl+d 137 | return Ok(()); 138 | } 139 | input.make_ascii_lowercase(); 140 | parse(&g, input.trim(), opts.print_chart, opts.print_fs)?; 141 | input.clear(); 142 | } 143 | Err(error) => return Err(error.into()), 144 | } 145 | } 146 | } 147 | -------------------------------------------------------------------------------- /examples/asl-wordorder.fgr: -------------------------------------------------------------------------------- 1 | // *very* basic / incomplete grammar for ASL topicalization 2 | // allow all word orders, but topicalized elements must come first 3 | 4 | S -> NP IV 5 | S -> IV NP[ topicalized: n ] 6 | S -> NP TV NP[ topicalized: n ] 7 | S -> NP NP[ topicalized: n ] TV 8 | S -> TV NP[ topicalized: n ] NP[ topicalized: n ] 9 | 10 | NP -> N 11 | NP[ topicalized: y ] -> N Topic 12 | Topic -> nm-raised-eyebrows 13 | 14 | N -> boy 15 | N -> ball 16 | 17 | IV -> sit 18 | TV -> throw -------------------------------------------------------------------------------- /examples/dative-shift.fgr: -------------------------------------------------------------------------------- 1 | S[roles: #1] -> Arg[st: #2] V[roles: #1, sts.s: #2, sts.do: #3, sts.io: #4] Arg[st: #3] Arg[st: #4] 2 | 3 | Vbare[ 4 | sts.s.case: nom, sts.s.arg: #1 **top**, 5 | sts.do.case: acc, sts.do.arg: #2 **top**, 6 | 
sts.io.case: acc, sts.io.arg: #3 **top**, 7 | roles.agent: #1, roles.recipient: #2, roles.theme: #3 8 | ] -> gave 9 | // go go gadget dative shifter (swap direct and indirect object, assign dative) 10 | Vdative[ 11 | roles: #1, sts.s: #2, sts.do: #3, sts.io.case: dat, sts.io.arg: #4 12 | ] -> Vbare[ 13 | roles: #1, sts.s: #2, sts.io: #3, sts.do.arg: #4 14 | ] 15 | V[ sts: #1, roles: #2 ] -> Vbare[ sts: #1, roles: #2 ] 16 | V[ sts: #1, roles: #2 ] -> Vdative[ sts: #1, roles: #2 ] 17 | 18 | Arg[ st: #1 ] -> PP[ st: #1 ] 19 | Arg[ st: #1 ] -> NP[ st: #1 ] 20 | PP[ st.case: dat, st.arg: #1 ] -> to NP[ st.case: acc, st.arg: #1 ] 21 | NP[ st.case: nom, st.arg: i ] -> i 22 | NP[ st.case: acc, st.arg: i ] -> me 23 | NP[ st.case: nom, st.arg: she ] -> she 24 | NP[ st.case: acc, st.arg: she ] -> her 25 | NP[ st.case: nom, st.arg: apples ] -> apples 26 | NP[ st.case: acc, st.arg: apples ] -> apples 27 | -------------------------------------------------------------------------------- /examples/no-features.fgr: -------------------------------------------------------------------------------- 1 | // sentence rules 2 | S -> N IV 3 | S -> N TV N 4 | S -> N CV Comp S 5 | 6 | // pronouns 7 | N -> he 8 | N -> him 9 | N -> himself 10 | N -> she 11 | N -> her 12 | N -> herself 13 | N -> they 14 | N -> them 15 | N -> themselves 16 | N -> themself 17 | 18 | // names, lowercase as they are terminals 19 | N -> mary 20 | N -> sue 21 | N -> takeshi 22 | N -> robert 23 | 24 | // complementizer 25 | Comp -> that 26 | 27 | // verbs -- intransitive, transitive, and clausal 28 | IV -> falls 29 | IV -> fall 30 | IV -> fell 31 | 32 | TV -> likes 33 | TV -> like 34 | TV -> liked 35 | 36 | CV -> says 37 | CV -> say 38 | CV -> said -------------------------------------------------------------------------------- /examples/reflexives.fgr: -------------------------------------------------------------------------------- 1 | // Sentence rules 2 | // Intransitive: 3 | // * Subject must be nominative case 4 | 
// * Subject and verb must agree in number (copied through #1) 5 | S -> N[ case: nom, num: #1 ] IV[ num: #1 ] 6 | // Transitive: 7 | // * Subject must be nominative case 8 | // * Subject and verb must agree in number (copied through #2) 9 | // * If there's a reflexive in the object position, make sure its `needs_pron` 10 | // feature matches the subject's `pron` feature. If the object isn't a 11 | // reflexive, then its `needs_pron` feature will implicitly be `**top**`, so 12 | // will unify with anything. 13 | S -> N[ case: nom, pron: #1, num: #2 ] TV[ num: #2 ] N[ case: acc, needs_pron: #1 ] 14 | // Clausal: 15 | // * Subject must be nominative case 16 | // * Subject and verb must agree in number (copied through #1) 17 | // * Reflexives can't cross clause boundaries (*"He said that she likes himself"), 18 | // so we can ignore reflexives and delegate to inner clause rule 19 | S -> N[ case: nom, num: #1 ] CV[ num: #1 ] Comp S 20 | 21 | // Pronouns 22 | // The added features are: 23 | // * num: sg or pl, whether this noun wants a singular verb (likes) or 24 | // a plural verb (like). note this is grammatical number, so for example 25 | // singular they takes plural agreement ("they like X", not *"they likes X") 26 | // * case: nom or acc, whether this noun is nominative or accusative case. 27 | // nominative case goes in the subject, and accusative in the object. 28 | // e.g., "he fell" and "she likes him", not *"him fell" and *"her likes he" 29 | // * pron: he, she, they, or ref -- what type of pronoun this is 30 | // * needs_pron: whether this is a reflexive that needs to bind to another 31 | // pronoun. 
32 | N[ num: sg, case: nom, pron: he ] -> he 33 | N[ num: sg, case: acc, pron: he ] -> him 34 | N[ num: sg, case: acc, pron: ref, needs_pron: he ] -> himself 35 | N[ num: sg, case: nom, pron: she ] -> she 36 | N[ num: sg, case: acc, pron: she ] -> her 37 | N[ num: sg, case: acc, pron: ref, needs_pron: she] -> herself 38 | N[ num: pl, case: nom, pron: they ] -> they 39 | N[ num: pl, case: acc, pron: they ] -> them 40 | N[ num: pl, case: acc, pron: ref, needs_pron: they ] -> themselves 41 | N[ num: sg, case: acc, pron: ref, needs_pron: they ] -> themself 42 | 43 | // Names 44 | // The added features are: 45 | // * num: sg, as people are singular ("mary likes her" / *"mary like her") 46 | // * case: **top**, as names can be both subjects and objects 47 | // ("mary likes her" / "she likes mary") 48 | // * pron: whichever pronoun the person uses for reflexive agreement 49 | // mary pron: she => mary likes herself 50 | // sue pron: they => sue likes themself 51 | // takeshi pron: he => takeshi likes himself 52 | N[ num: sg, case: **top**, pron: she ] -> mary 53 | N[ num: sg, case: **top**, pron: they ] -> sue 54 | N[ num: sg, case: **top**, pron: he ] -> takeshi 55 | N[ num: sg, case: **top**, pron: he ] -> robert 56 | 57 | // Complementizer doesn't need features 58 | Comp -> that 59 | 60 | // Verbs -- intransitive, transitive, and clausal 61 | // The added features are: 62 | // * num: sg, pl, or **top** -- to match the noun numbers. 
63 | // **top** will match either sg or pl, as past-tense verbs in English 64 | // don't agree in number: "he fell" and "they fell" are both fine 65 | // * tense: past or nonpast -- this won't be used for agreement, but will be 66 | // copied into the final feature structure, and the client code could do 67 | // something with it 68 | IV[ num: sg, tense: nonpast ] -> falls 69 | IV[ num: pl, tense: nonpast ] -> fall 70 | IV[ num: **top**, tense: past ] -> fell 71 | 72 | TV[ num: sg, tense: nonpast ] -> likes 73 | TV[ num: pl, tense: nonpast ] -> like 74 | TV[ num: **top**, tense: past ] -> liked 75 | 76 | CV[ num: sg, tense: nonpast ] -> says 77 | CV[ num: pl, tense: nonpast ] -> say 78 | CV[ num: **top**, tense: past ] -> said 79 | -------------------------------------------------------------------------------- /rustfmt.toml: -------------------------------------------------------------------------------- 1 | tab_spaces = 2 -------------------------------------------------------------------------------- /src/earley.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | use std::sync::Arc; 3 | 4 | use crate::rules::{Grammar, Production, Rule}; 5 | 6 | #[derive(Debug, Clone, PartialEq, Eq)] 7 | pub struct LR0 { 8 | pub rule: Arc, 9 | pub pos: usize, 10 | } 11 | 12 | impl LR0 { 13 | pub fn new(rule: &Arc) -> Self { 14 | Self { 15 | rule: rule.clone(), 16 | pos: 0, 17 | } 18 | } 19 | 20 | pub fn is_active(&self) -> bool { 21 | self.pos < self.rule.len() 22 | } 23 | 24 | pub fn advance(&self) -> Self { 25 | assert!(self.is_active()); 26 | Self { 27 | rule: self.rule.clone(), 28 | pos: self.pos + 1, 29 | } 30 | } 31 | 32 | pub fn next_production(&self) -> Option<&Production> { 33 | self.rule.productions.get(self.pos) 34 | } 35 | } 36 | 37 | impl fmt::Display for LR0 { 38 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 39 | write!(f, "{} →", self.rule.symbol)?; 40 | for idx in 0..self.rule.len() { 41 | if 
idx == self.pos { 42 | write!(f, " ・")?; 43 | } 44 | write!(f, " {}", self.rule.productions[idx])?; 45 | } 46 | if !self.is_active() { 47 | write!(f, " ・")?; 48 | } 49 | Ok(()) 50 | } 51 | } 52 | 53 | #[derive(Debug, Clone, PartialEq, Eq)] 54 | pub struct State { 55 | pub lr0: LR0, 56 | pub origin: usize, 57 | } 58 | 59 | impl State { 60 | pub fn new(lr0: LR0, origin: usize) -> Self { 61 | Self { lr0, origin } 62 | } 63 | 64 | pub fn advance(&self) -> Self { 65 | Self::new(self.lr0.advance(), self.origin) 66 | } 67 | } 68 | 69 | #[derive(Debug)] 70 | pub struct Chart(Vec>); 71 | 72 | impl Chart { 73 | pub fn new(length: usize) -> Self { 74 | Self(vec![Vec::new(); length]) 75 | } 76 | 77 | pub fn len(&self) -> usize { 78 | self.0.len() 79 | } 80 | 81 | pub fn is_empty(&self) -> bool { 82 | self.len() == 0 83 | } 84 | 85 | pub fn len_at(&self, k: usize) -> usize { 86 | self.0[k].len() 87 | } 88 | 89 | pub fn has(&self, k: usize, state: &State) -> bool { 90 | self.0[k].contains(state) 91 | } 92 | 93 | pub fn add(&mut self, k: usize, state: State) { 94 | if !self.has(k, &state) { 95 | self.0[k].push(state); 96 | } 97 | } 98 | 99 | /// Get an owned state so that passing around &mut chart is more ergonomic 100 | /// The clone is fairly cheap, only an rc + 2 usize, State would be copy if not 101 | /// for the Arc 102 | fn get_state(&self, k: usize, idx: usize) -> State { 103 | self.0[k][idx].clone() 104 | } 105 | } 106 | 107 | impl IntoIterator for Chart { 108 | type Item = (usize, Vec); 109 | type IntoIter = std::iter::Enumerate>>; 110 | 111 | fn into_iter(self) -> Self::IntoIter { 112 | self.0.into_iter().enumerate() 113 | } 114 | } 115 | 116 | impl fmt::Display for Chart { 117 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 118 | for k in 0..self.len() { 119 | writeln!(f, "State {}:", k)?; 120 | for state in self.0[k].iter() { 121 | writeln!(f, " {}..{}: {}", state.origin, k, state.lr0)?; 122 | } 123 | } 124 | Ok(()) 125 | } 126 | } 127 | 128 | pub fn 
parse_chart(g: &Grammar, input: &[&str]) -> Chart { 129 | let mut chart = Chart::new(input.len() + 1); 130 | 131 | for rule in g.rules.get(&g.start).expect("grammar missing start rules") { 132 | chart.add(0, State::new(LR0::new(rule), 0)); 133 | } 134 | 135 | for k in 0..chart.len() { 136 | // need to use while loop because the number of states at k can expand during the loop 137 | let mut idx = 0; 138 | while idx < chart.len_at(k) { 139 | let state = chart.get_state(k, idx); 140 | idx += 1; 141 | 142 | if let Some(production) = state.lr0.next_production() { 143 | if production.is_nonterminal() { 144 | predictor(g, &mut chart, k, &state); 145 | } else { 146 | scanner(&mut chart, k, &state, input); 147 | } 148 | } else { 149 | completer(&mut chart, k, &state); 150 | } 151 | } 152 | } 153 | 154 | chart 155 | } 156 | 157 | fn completer(chart: &mut Chart, k: usize, state: &State) { 158 | assert!(!state.lr0.is_active(), "tried to complete active state"); 159 | 160 | // lr0 has been completed, now look for states in the chart that are waiting for its symbol 161 | for idx in 0..chart.len_at(state.origin) { 162 | let other = chart.get_state(state.origin, idx); 163 | 164 | if let Some(np) = other.lr0.next_production() { 165 | if np.symbol == state.lr0.rule.symbol { 166 | // found one, advance its dot and add the new state to the chart *at k*, 167 | // because it's now waiting on a token there 168 | chart.add(k, other.advance()) 169 | } 170 | } 171 | } 172 | } 173 | 174 | fn predictor(g: &Grammar, chart: &mut Chart, k: usize, state: &State) { 175 | assert!(state.lr0.is_active(), "tried to predict non-active state"); 176 | assert!( 177 | state.lr0.next_production().unwrap().is_nonterminal(), 178 | "tried to predict a terminal" 179 | ); 180 | 181 | // this lr0 is waiting for the next production 182 | // let's hypothesize that one of the rules that can build this production will 183 | // succeed at its current position 184 | let needed_symbol = 
&state.lr0.next_production().unwrap().symbol; 185 | for wanted_rule in g 186 | .rules 187 | .get(needed_symbol) 188 | .unwrap_or_else(|| panic!("missing rules for production {}", needed_symbol)) 189 | { 190 | chart.add(k, State::new(LR0::new(wanted_rule), k)); 191 | 192 | if g.is_nullable(needed_symbol) { 193 | // automatically complete `state` early, because we know 194 | // it will be completable anyways, because its next_production may be produced 195 | // by empty input. If we don't do this, nullable rules won't be completed 196 | // correctly, because complete() won't run after predict() without a new symbol. 197 | chart.add(k, state.advance()); 198 | } 199 | } 200 | } 201 | 202 | fn scanner(chart: &mut Chart, k: usize, state: &State, input: &[&str]) { 203 | assert!(state.lr0.is_active(), "tried to scan non-active state"); 204 | assert!( 205 | state.lr0.next_production().unwrap().is_terminal(), 206 | "tried to scan a nonterminal" 207 | ); 208 | 209 | let needed_symbol = &state.lr0.next_production().unwrap().symbol; 210 | if k < input.len() && input[k] == needed_symbol { 211 | // advance the state to consume this token, and add to state k + 1, where 212 | // it will look for the next token 213 | chart.add(k + 1, state.advance()); 214 | } 215 | } 216 | -------------------------------------------------------------------------------- /src/featurestructure/mod.rs: -------------------------------------------------------------------------------- 1 | mod node; 2 | mod serialized; 3 | 4 | pub use node::{Feature, NodeArena, NodeIdx}; 5 | pub use serialized::SerializedNode; 6 | 7 | #[cfg(test)] 8 | mod tests { 9 | use super::*; 10 | 11 | #[test] 12 | fn test_construct_fs() { 13 | let mut arena = NodeArena::new(); 14 | 15 | let features = vec![ 16 | Feature { 17 | path: "a.b".to_string(), 18 | tag: Some("1".to_string()), 19 | value: arena.alloc_top(), 20 | }, 21 | Feature { 22 | path: "a.b.c".to_string(), 23 | tag: None, 24 | value: arena.alloc_str("foo".to_string()), 25 
| }, 26 | Feature { 27 | path: "a.b.d".to_string(), 28 | tag: None, 29 | value: arena.alloc_str("bar".to_string()), 30 | }, 31 | Feature { 32 | path: "e".to_string(), 33 | tag: Some("1".to_string()), 34 | value: arena.alloc_top(), 35 | }, 36 | ]; 37 | 38 | let root = arena.alloc_from_features(features).unwrap(); 39 | 40 | println!("{}", arena.display(root)); 41 | } 42 | 43 | #[test] 44 | fn test_unify_tags() { 45 | let mut arena = NodeArena::new(); 46 | 47 | let features1 = vec![ 48 | Feature { 49 | path: "a.b".to_string(), 50 | tag: Some("1".to_string()), 51 | value: arena.alloc_top(), 52 | }, 53 | Feature { 54 | path: "c".to_string(), 55 | tag: Some("1".to_string()), 56 | value: arena.alloc_top(), 57 | }, 58 | ]; 59 | 60 | let fs1 = arena.alloc_from_features(features1).unwrap(); 61 | 62 | let features2 = vec![Feature { 63 | path: "c".to_string(), 64 | tag: None, 65 | value: arena.alloc_str("foo".to_string()), 66 | }]; 67 | 68 | let fs2 = arena.alloc_from_features(features2).unwrap(); 69 | 70 | // everything is **top** so goes away 71 | assert!(SerializedNode::from_node(&arena, fs1).is_none()); 72 | 73 | let gold = SerializedNode::Edged(vec![("c".into(), "foo".into())].into_iter().collect()); 74 | 75 | assert!(SerializedNode::from_node(&arena, fs2) == Some(gold)); 76 | 77 | arena.unify(fs1, fs2).unwrap(); 78 | 79 | let gold = SerializedNode::Edged( 80 | vec![ 81 | ( 82 | "a".into(), 83 | SerializedNode::Edged(vec![("b".into(), "foo".into())].into_iter().collect()), 84 | ), 85 | ("c".into(), "foo".into()), 86 | ] 87 | .into_iter() 88 | .collect(), 89 | ); 90 | 91 | assert!(SerializedNode::from_node(&arena, fs1) == Some(gold.clone())); 92 | assert!(SerializedNode::from_node(&arena, fs2) == Some(gold)); 93 | } 94 | } 95 | -------------------------------------------------------------------------------- /src/featurestructure/node.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::fmt; 3 | 
4 | use crate::utils::Err; 5 | 6 | /// Index type for the node arena 7 | #[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] 8 | pub struct NodeIdx(pub u32); 9 | 10 | /// Unpacked representation of a feature, that NodeArena::new_from_paths can turn into a Node 11 | #[derive(Debug)] 12 | pub struct Feature { 13 | /// Dotted path where each segment will be a node: "a.b.c" -> [a: [b: [c: ...]]] 14 | pub path: String, 15 | /// Unique string that will link features into a reentrant node, or None 16 | pub tag: Option, 17 | /// What will end up at `path`. Will be unified with any other feature values with the same tag. 18 | pub value: NodeIdx, 19 | } 20 | 21 | /// A node in the feature structure graph 22 | #[derive(Debug, Clone, PartialEq, Eq)] 23 | pub enum Node { 24 | /// Top can unify with anything 25 | Top, 26 | /// A string-valued feature, such as "nom" in [case: nom]. Unifies with eq. Str nodes 27 | Str(String), 28 | /// An arc-containing node with arcs to other NodeIdxs 29 | Edged(HashMap), 30 | /// A node that has been forwarded to another node through unification. 
31 | /// Before using a node, it should be dereferenced to resolve its forward 32 | Forwarded(NodeIdx), 33 | } 34 | 35 | impl Node { 36 | fn new_str(s: String) -> Self { 37 | Self::Str(s) 38 | } 39 | 40 | fn new_edged() -> Self { 41 | Self::Edged(HashMap::new()) 42 | } 43 | 44 | fn is_top(&self) -> bool { 45 | matches!(self, Self::Top) 46 | } 47 | 48 | fn str(&self) -> Option<&str> { 49 | match self { 50 | Self::Str(s) => Some(s), 51 | _ => None, 52 | } 53 | } 54 | 55 | fn is_str(&self) -> bool { 56 | self.str().is_some() 57 | } 58 | 59 | fn edged(&self) -> Option<&HashMap> { 60 | match self { 61 | Self::Edged(v) => Some(v), 62 | _ => None, 63 | } 64 | } 65 | 66 | fn edged_mut(&mut self) -> Option<&mut HashMap> { 67 | match self { 68 | Self::Edged(v) => Some(v), 69 | _ => None, 70 | } 71 | } 72 | 73 | fn is_edged(&self) -> bool { 74 | self.edged().is_some() 75 | } 76 | } 77 | 78 | /// An arena that stores all nodes and provides methods to operate on them 79 | #[derive(Debug, Default, Clone, PartialEq, Eq)] 80 | pub struct NodeArena { 81 | nodes: Vec, 82 | } 83 | 84 | impl NodeArena { 85 | pub fn new() -> Self { 86 | Default::default() 87 | } 88 | 89 | pub fn alloc(&mut self, node: Node) -> NodeIdx { 90 | let idx = self.nodes.len() as u32; 91 | self.nodes.push(node); 92 | NodeIdx(idx) 93 | } 94 | 95 | pub fn replace(&mut self, idx: NodeIdx, node: Node) -> Node { 96 | std::mem::replace(&mut self.nodes[idx.0 as usize], node) 97 | } 98 | 99 | pub fn alloc_top(&mut self) -> NodeIdx { 100 | self.alloc(Node::Top) 101 | } 102 | 103 | pub fn alloc_str(&mut self, s: String) -> NodeIdx { 104 | self.alloc(Node::new_str(s)) 105 | } 106 | 107 | pub fn alloc_edged(&mut self) -> NodeIdx { 108 | self.alloc(Node::new_edged()) 109 | } 110 | 111 | /// Recursively make a copy of a node 112 | pub fn clone(&mut self, n: NodeIdx) -> NodeIdx { 113 | let mut seen = HashMap::new(); 114 | self._clone(n, &mut seen) 115 | } 116 | 117 | fn _clone(&mut self, n: NodeIdx, seen: &mut HashMap) -> 
NodeIdx { 118 | if let Some(new_idx) = seen.get(&n) { 119 | return *new_idx; 120 | } 121 | 122 | // TODO this `node` clone is ugly, we need it because we're adding new nodes in here, which might move the vec, 123 | // invalidating the iteration over the hashmap ref in edged. ideally we'd be using a proper arena here, so we 124 | // could guarantee the node doesn't move, so we didn't need this clone. need to fix that. 125 | 126 | // we can't dereference here because we need to preserve the DAG structure 127 | let node = self.get(n).clone(); 128 | let new = match node { 129 | Node::Top => self.alloc_top(), 130 | Node::Str(s) => self.alloc_str(s), 131 | Node::Edged(old_arcs) => { 132 | let mut arcs = HashMap::::new(); 133 | for (label, value) in old_arcs.into_iter() { 134 | arcs.insert(label, self._clone(value, seen)); 135 | } 136 | self.alloc(Node::Edged(arcs)) 137 | } 138 | Node::Forwarded(target) => { 139 | let new = self._clone(target, seen); 140 | self.alloc(Node::Forwarded(new)) 141 | } 142 | }; 143 | 144 | seen.insert(n, new); 145 | new 146 | } 147 | 148 | /// Display a NodeIdx 149 | pub fn display(&self, idx: NodeIdx) -> NodeDisplay { 150 | NodeDisplay { arena: self, idx } 151 | } 152 | 153 | /// Creates a Node from a list of (name, noderef) features. Names CANNOT be dotted! 154 | pub fn alloc_from_edges(&mut self, edges: I) -> Result 155 | where 156 | I: IntoIterator, 157 | { 158 | let node = self.alloc_edged(); 159 | 160 | for (label, target) in edges { 161 | assert!( 162 | !label.contains('.'), 163 | "new_with_edges cannot take dotted paths!" 
164 | ); 165 | 166 | self.push_edge(node, label, target)?; // error if unification failure 167 | } 168 | 169 | Ok(node) 170 | } 171 | 172 | pub fn alloc_from_features(&mut self, paths: I) -> Result 173 | where 174 | I: IntoIterator, 175 | { 176 | let root = self.alloc_edged(); 177 | 178 | let mut tags: HashMap = HashMap::new(); 179 | for Feature { value, tag, path } in paths { 180 | if let Some(tag) = tag { 181 | if tags.contains_key(&tag) { 182 | let tagged = tags[&tag]; 183 | self.unify(value, tagged)?; 184 | } else { 185 | tags.insert(tag.to_string(), value); 186 | } 187 | } 188 | 189 | let mut current = root; 190 | let mut parts = path.split('.').peekable(); 191 | loop { 192 | let next = parts.next().expect("shouldn't be empty b/c path.len() > 0"); 193 | let is_last = parts.peek().is_none(); 194 | 195 | if is_last { 196 | self.push_edge(current, next.to_string(), value)?; 197 | break; 198 | } else { 199 | let new = self.alloc_edged(); 200 | self.push_edge(current, next.to_string(), new)?; 201 | current = new; 202 | } 203 | } 204 | } 205 | 206 | Ok(root) 207 | } 208 | 209 | /// Get an idx. Assumes valid, panics on OOB 210 | pub fn get(&self, idx: NodeIdx) -> &Node { 211 | self.nodes.get(idx.0 as usize).expect("Invalid NodeIdx") 212 | } 213 | 214 | /// Mutably get an idx. 
Assumes valid, panics on OOB 215 | pub fn get_mut(&mut self, idx: NodeIdx) -> &mut Node { 216 | self.nodes.get_mut(idx.0 as usize).expect("Invalid NodeIdx") 217 | } 218 | 219 | pub fn forward_to(&mut self, target: NodeIdx, to: NodeIdx) { 220 | self.nodes[target.0 as usize] = Node::Forwarded(to); 221 | } 222 | 223 | pub fn is_top(&self, n: NodeIdx) -> bool { 224 | self.get(n).is_top() 225 | } 226 | 227 | pub fn is_str(&self, n: NodeIdx) -> bool { 228 | self.get(n).is_str() 229 | } 230 | 231 | pub fn is_edged(&self, n: NodeIdx) -> bool { 232 | self.get(n).is_edged() 233 | } 234 | 235 | fn str(&self, n: NodeIdx) -> Option<&str> { 236 | self.get(n).str() 237 | } 238 | 239 | fn edged(&self, n: NodeIdx) -> Option<&HashMap> { 240 | self.get(n).edged() 241 | } 242 | 243 | fn edged_mut(&mut self, n: NodeIdx) -> Option<&mut HashMap> { 244 | self.get_mut(n).edged_mut() 245 | } 246 | 247 | #[allow(clippy::map_entry)] 248 | fn push_edge(&mut self, parent: NodeIdx, label: String, target: NodeIdx) -> Result<(), Err> { 249 | let node = self.get_mut(parent); 250 | 251 | if node.is_top() { 252 | *node = Node::new_edged(); 253 | } 254 | 255 | if let Some(arcs) = node.edged_mut() { 256 | if arcs.contains_key(&label) { 257 | let existing = arcs[&label]; 258 | self.unify(existing, target)?; 259 | } else { 260 | arcs.insert(label, target); 261 | } 262 | return Ok(()); 263 | } 264 | 265 | Err(format!("unification failure: {}", label).into()) 266 | } 267 | 268 | pub fn dereference(&self, mut idx: NodeIdx) -> NodeIdx { 269 | while let Node::Forwarded(r) = self.get(idx) { 270 | idx = *r; 271 | } 272 | idx 273 | } 274 | 275 | /// Unify two feature structures within this arena. Both may be mutated. 
276 | pub fn unify(&mut self, n1: NodeIdx, n2: NodeIdx) -> Result<(), Err> { 277 | let n1 = self.dereference(n1); 278 | let n2 = self.dereference(n2); 279 | 280 | // if same node, already unified 281 | if n1 == n2 { 282 | return Ok(()); 283 | } 284 | 285 | // If either is top, forward to the other 286 | if self.is_top(n1) { 287 | self.forward_to(n1, n2); 288 | return Ok(()); 289 | } else if self.is_top(n2) { 290 | self.forward_to(n2, n1); 291 | return Ok(()); 292 | } 293 | 294 | // try to unify string values 295 | if self.is_str(n1) && self.is_str(n2) { 296 | let n1_str = self.str(n1).unwrap(); 297 | let n2_str = self.str(n2).unwrap(); 298 | 299 | if n1_str == n2_str { 300 | self.forward_to(n1, n2); 301 | return Ok(()); 302 | } else { 303 | return Err(format!("unification failure: {n1_str} & {n2_str}").into()); 304 | } 305 | } 306 | 307 | // if both are edged, unify their contents 308 | if self.is_edged(n1) && self.is_edged(n2) { 309 | let n1 = self.replace(n1, Node::Forwarded(n2)); 310 | let n1arcs = n1.edged().unwrap(); 311 | 312 | for (label, value) in n1arcs.iter() { 313 | if self.edged(n2).unwrap().contains_key(label) { 314 | // shared arc 315 | let other = self.edged(n2).unwrap().get(label).unwrap(); 316 | self.unify(*value, *other)?; 317 | } else { 318 | // complement arc 319 | self.edged_mut(n2).unwrap().insert(label.clone(), *value); 320 | } 321 | } 322 | 323 | return Ok(()); 324 | } 325 | 326 | Err( 327 | format!( 328 | "unification failure: {:?} & {:?}", 329 | self.get(n1), 330 | self.get(n2) 331 | ) 332 | .into(), 333 | ) 334 | } 335 | } 336 | 337 | /// Helper struct for displaying a node 338 | #[derive(Clone)] 339 | pub struct NodeDisplay<'a> { 340 | pub arena: &'a NodeArena, 341 | pub idx: NodeIdx, 342 | } 343 | 344 | impl fmt::Display for NodeDisplay<'_> { 345 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 346 | let mut counts = HashMap::new(); 347 | count_in_pointers(self, &mut counts); 348 | let mut has_printed = HashMap::new(); 349 | 
format_node(self, &counts, &mut has_printed, 0, f) 350 | } 351 | } 352 | 353 | // for fmt::Display impl 354 | #[allow(clippy::map_entry)] 355 | fn count_in_pointers(n: &NodeDisplay, seen: &mut HashMap) { 356 | let nref = n.arena.dereference(n.idx); 357 | if seen.contains_key(&nref) { 358 | seen.entry(nref).and_modify(|cnt| *cnt += 1); 359 | } else { 360 | seen.insert(nref, 1); 361 | if let Some(arcs) = n.arena.edged(nref) { 362 | for value in arcs.values() { 363 | count_in_pointers( 364 | &NodeDisplay { 365 | arena: n.arena, 366 | idx: *value, 367 | }, 368 | seen, 369 | ); 370 | } 371 | } 372 | } 373 | } 374 | 375 | // for fmt::Display impl 376 | fn format_node( 377 | nd: &NodeDisplay, 378 | counts: &HashMap, 379 | has_printed: &mut HashMap, 380 | indent: usize, 381 | f: &mut fmt::Formatter<'_>, 382 | ) -> fmt::Result { 383 | let arena = nd.arena; 384 | let idx = arena.dereference(nd.idx); 385 | 386 | if counts[&idx] > 1 && has_printed.contains_key(&idx) { 387 | return write!(f, "#{}", has_printed[&idx]); 388 | } 389 | 390 | if counts[&idx] > 1 { 391 | let id = has_printed.len(); 392 | has_printed.insert(idx, id); 393 | write!(f, "#{} ", id)?; 394 | } 395 | 396 | let r = nd.arena.get(idx); 397 | match r { 398 | Node::Top => write!(f, "**top**"), 399 | Node::Str(s) => write!(f, "{}", s), 400 | Node::Edged(arcs) => { 401 | if arcs.is_empty() { 402 | write!(f, "[]") 403 | } else if arcs.len() == 1 { 404 | let (label, value) = arcs.iter().next().unwrap(); 405 | write!(f, "[ {}: ", label)?; 406 | format_node( 407 | &NodeDisplay { arena, idx: *value }, 408 | counts, 409 | has_printed, 410 | 0, 411 | f, 412 | )?; 413 | write!(f, " ]") 414 | } else { 415 | writeln!(f, "[")?; 416 | for (label, value) in arcs.iter() { 417 | write!(f, "{:indent$}{}: ", "", label, indent = indent + 2)?; 418 | format_node( 419 | &NodeDisplay { arena, idx: *value }, 420 | counts, 421 | has_printed, 422 | indent + 2, 423 | f, 424 | )?; 425 | writeln!(f)?; 426 | } 427 | write!(f, "{:indent$}]", 
"", indent = indent) 428 | } 429 | } 430 | Node::Forwarded(_) => panic!("unexpected forward"), 431 | } 432 | } 433 | -------------------------------------------------------------------------------- /src/featurestructure/serialized.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | 3 | use super::node::{Node, NodeArena, NodeIdx}; 4 | 5 | /// A noderef that's been serialized into a tree structure. Nodes with multiple 6 | /// in-pointers are duplicated. 7 | /// IMPORTANT: **top** is /stripped out/. All top features will not be present in 8 | /// the serialized tree. 9 | #[derive(Debug, Clone)] 10 | pub enum SerializedNode { 11 | Str(String), 12 | Edged(HashMap), 13 | } 14 | 15 | impl SerializedNode { 16 | pub fn as_str(&self) -> Option<&str> { 17 | match self { 18 | Self::Str(s) => Some(s.as_str()), 19 | _ => None, 20 | } 21 | } 22 | 23 | pub fn into_str(self) -> Option { 24 | match self { 25 | Self::Str(s) => Some(s), 26 | _ => None, 27 | } 28 | } 29 | 30 | pub fn as_edged(&self) -> Option<&HashMap> { 31 | match self { 32 | Self::Edged(map) => Some(map), 33 | _ => None, 34 | } 35 | } 36 | 37 | pub fn into_edged(self) -> Option> { 38 | match self { 39 | Self::Edged(map) => Some(map), 40 | _ => None, 41 | } 42 | } 43 | 44 | pub fn get_path(&self, path: &[&str]) -> Option<&SerializedNode> { 45 | let mut node = self; 46 | let mut path = path; 47 | while !path.is_empty() { 48 | node = node.as_edged()?.get(path[0])?; 49 | path = &path[1..]; 50 | } 51 | Some(node) 52 | } 53 | 54 | pub fn get_path_str(&self, path: &[&str]) -> Option<&str> { 55 | self.get_path(path).and_then(Self::as_str) 56 | } 57 | 58 | /// Create a SerializedNode from a NodeArena and NodeIdx 59 | pub fn from_node(arena: &NodeArena, idx: NodeIdx) -> Option { 60 | let idx = arena.dereference(idx); 61 | match arena.get(idx) { 62 | Node::Forwarded(_) => panic!("unexpected forward after dereference"), 63 | Node::Top => None, 64 | 
Node::Str(s) => Some(SerializedNode::Str(s.to_string())), 65 | Node::Edged(edges) => { 66 | let mut map: HashMap = HashMap::new(); 67 | for (k, v) in edges.iter() { 68 | let value = Self::from_node(arena, *v); 69 | if let Some(value) = value { 70 | map.insert(k.to_string(), value); 71 | } 72 | } 73 | if map.is_empty() { 74 | None 75 | } else { 76 | Some(SerializedNode::Edged(map)) 77 | } 78 | } 79 | } 80 | } 81 | } 82 | 83 | impl From<&str> for SerializedNode { 84 | fn from(s: &str) -> Self { 85 | s.to_string().into() 86 | } 87 | } 88 | 89 | impl From for SerializedNode { 90 | fn from(s: String) -> Self { 91 | Self::Str(s) 92 | } 93 | } 94 | 95 | impl From> for SerializedNode { 96 | fn from(hm: HashMap) -> Self { 97 | Self::Edged(hm) 98 | } 99 | } 100 | 101 | impl PartialEq for SerializedNode { 102 | fn eq(&self, other: &Self) -> bool { 103 | match (&self, &other) { 104 | (SerializedNode::Str(s1), SerializedNode::Str(s2)) => s1 == s2, 105 | (SerializedNode::Str(_), SerializedNode::Edged(_)) 106 | | (SerializedNode::Edged(_), SerializedNode::Str(_)) => false, 107 | (SerializedNode::Edged(m1), &SerializedNode::Edged(m2)) => { 108 | if m1.len() != m2.len() { 109 | return false; 110 | } 111 | 112 | m1.iter().all(|(k, v)| m2.get(k) == Some(v)) 113 | } 114 | } 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /src/fgr/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod parse_grammar; 2 | 3 | pub use parse_grammar::*; 4 | 5 | #[cfg(test)] 6 | mod tests { 7 | use crate::Grammar; 8 | 9 | macro_rules! 
example_file { 10 | ($filename:expr) => { 11 | ( 12 | $filename, 13 | include_str!(concat!("../../examples/", $filename)), 14 | ) 15 | }; 16 | } 17 | 18 | #[test] 19 | fn smoke_test_examples() { 20 | let examples = [ 21 | example_file!("asl-wordorder.fgr"), 22 | example_file!("dative-shift.fgr"), 23 | example_file!("no-features.fgr"), 24 | example_file!("reflexives.fgr"), 25 | ]; 26 | 27 | for (filename, src) in examples { 28 | assert!(src.parse::().is_ok(), "failed to parse {filename}"); 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/fgr/parse_grammar.rs: -------------------------------------------------------------------------------- 1 | /// Simple recursive-descent parsing of grammar files 2 | use std::str::FromStr; 3 | 4 | use regex::Regex; 5 | 6 | use crate::featurestructure::{Feature, NodeArena, NodeIdx}; 7 | use crate::rules::{Grammar, Production, Rule}; 8 | use crate::utils::Err; 9 | 10 | pub const TOP_STR: &str = "**top**"; 11 | 12 | /// Parses a str into a tuple of (rules, nonterminals) 13 | /// Errors if the grammar doesn't parse or is malformed 14 | impl FromStr for Grammar { 15 | type Err = Err; 16 | 17 | /// Parses a grammar from a string. Assumes the first rule's symbol 18 | /// is the start symbol. 19 | fn from_str(s: &str) -> Result { 20 | let mut arena = NodeArena::new(); 21 | let (rules, s) = parse_rules(s, &mut arena)?; 22 | assert!(s.is_empty()); 23 | 24 | if rules.is_empty() { 25 | Err("empty ruleset".into()) 26 | } else { 27 | Self::new(rules, arena) 28 | } 29 | } 30 | } 31 | 32 | type Infallible<'a, T> = (T, &'a str); 33 | type ParseResult<'a, T> = Result<(T, &'a str), Err>; 34 | 35 | /// helper macro for initializing a regex with lazy_static! 36 | macro_rules! regex_static { 37 | ($name:ident, $pattern:expr) => { 38 | lazy_static! 
{ 39 | static ref $name: Regex = Regex::new($pattern).unwrap(); 40 | } 41 | }; 42 | } 43 | 44 | /// Try to consume a regex, returning None if it doesn't match 45 | fn optional_re<'a>(re: &'static Regex, s: &'a str) -> Infallible<'a, Option<&'a str>> { 46 | if let Some(caps) = re.captures(s) { 47 | let m = caps.get(0).unwrap(); 48 | if m.start() > 0 { 49 | return (None, s); 50 | } 51 | let (_, rest) = s.split_at(m.end()); 52 | (Some(m.as_str()), rest) 53 | } else { 54 | (None, s) 55 | } 56 | } 57 | 58 | /// Try to consume a regex, failing if it doesn't match 59 | fn needed_re<'a>(re: &'static Regex, s: &'a str) -> ParseResult<'a, &'a str> { 60 | if let (Some(c), rest) = optional_re(re, s) { 61 | Ok((c, rest)) 62 | } else { 63 | Err(format!("couldn't match {} at {}", re, s).into()) 64 | } 65 | } 66 | 67 | /// Try to consume a char, returning None if it doesn't match 68 | fn optional_char(c: char, s: &str) -> Infallible> { 69 | let mut iter = s.char_indices().peekable(); 70 | if let Some((_, c1)) = iter.next() { 71 | if c == c1 { 72 | let rest = if let Some((idx, _)) = iter.peek() { 73 | s.split_at(*idx).1 74 | } else { 75 | "" 76 | }; 77 | return (Some(c), rest); 78 | } 79 | } 80 | (None, s) 81 | } 82 | 83 | /// Try to consume a char, failing if it doesn't match 84 | fn needed_char(c: char, s: &str) -> ParseResult { 85 | if let (Some(c), rest) = optional_char(c, s) { 86 | Ok((c, rest)) 87 | } else { 88 | Err(format!("couldn't match {} at {}", c, s).into()) 89 | } 90 | } 91 | 92 | /// Tries to skip 1 or more \s characters and comments 93 | fn skip_whitespace(s: &str) -> &str { 94 | regex_static!(WHITESPACE_OR_COMMENT, r"\s*(//.*?\n\s*)*"); 95 | optional_re(&WHITESPACE_OR_COMMENT, s).1 96 | } 97 | 98 | // Tries to skip 1 or more non-newline whitespace characters 99 | fn skip_whitespace_nonnewline(s: &str) -> &str { 100 | regex_static!(WHITESPACE_NONNEWLINE, r"[\s&&[^\n]]*"); 101 | optional_re(&WHITESPACE_NONNEWLINE, s).1 102 | } 103 | 104 | /// Tries to parse a name 
made of letters, numbers, - and _ 105 | fn parse_name(s: &str) -> ParseResult<&str> { 106 | regex_static!(NAME, r"[a-zA-Z0-9\-_]+"); 107 | needed_re(&NAME, s).map_err(|err| format!("name: {}", err).into()) 108 | } 109 | 110 | /// Tries to parse a name made of dotted segments (foo.bar.c.d) 111 | fn parse_dotted(s: &str) -> ParseResult<&str> { 112 | regex_static!(DOTTED, r"[a-zA-Z0-9\-_]+(\.[a-zA-Z0-9\-_]+)*"); 113 | needed_re(&DOTTED, s).map_err(|e| format!("dotted name: {}", e).into()) 114 | } 115 | 116 | /// Parses an optional #tag 117 | fn parse_tag(s: &str) -> ParseResult> { 118 | let (hash, s) = optional_char('#', s); 119 | if hash.is_none() { 120 | Ok((None, s)) 121 | } else { 122 | let s = skip_whitespace(s); 123 | let (name, s) = parse_name(s).map_err(|e| -> Err { format!("tag: {}", e).into() })?; 124 | Ok((Some(name.to_string()), s)) 125 | } 126 | } 127 | 128 | /// Parses a value with an optional tag: #tag value 129 | fn parse_feature_value<'a>( 130 | s: &'a str, 131 | arena: &mut NodeArena, 132 | ) -> ParseResult<'a, (Option, NodeIdx)> { 133 | regex_static!(VALUE, r"[a-zA-Z0-9\-_\*]+"); 134 | let (tag, s) = parse_tag(s)?; 135 | let s = skip_whitespace(s); 136 | let (name, s) = optional_re(&VALUE, s); 137 | let value = if let Some(name) = name { 138 | if name == TOP_STR { 139 | arena.alloc_top() 140 | } else { 141 | arena.alloc_str(name.to_string()) 142 | } 143 | } else if tag.is_some() { 144 | arena.alloc_top() 145 | } else { 146 | return Err(format!("feature needs tag or value at {}", s).into()); 147 | }; 148 | Ok(((tag, value), s)) 149 | } 150 | 151 | fn parse_feature<'a>(s: &'a str, arena: &mut NodeArena) -> ParseResult<'a, Feature> { 152 | let (name, s) = parse_dotted(s).map_err(|e| format!("feature name: {}", e))?; 153 | let s = skip_whitespace(s); 154 | let (_, s) = needed_char(':', s)?; 155 | let s = skip_whitespace(s); 156 | let (value, s) = parse_feature_value(s, arena).map_err(|e| format!("feature value: {}", e))?; 157 | let s = 
skip_whitespace(s); 158 | let (_, s) = optional_char(',', s); 159 | 160 | Ok(( 161 | Feature { 162 | path: name.to_string(), 163 | tag: value.0, 164 | value: value.1, 165 | }, 166 | s, 167 | )) 168 | } 169 | 170 | fn parse_featurestructure<'a>(s: &'a str, arena: &mut NodeArena) -> ParseResult<'a, Vec> { 171 | let mut pairs = Vec::new(); 172 | let mut rem = needed_char('[', s)?.1; 173 | loop { 174 | rem = skip_whitespace(rem); 175 | if let (Some(_), rem) = optional_char(']', rem) { 176 | return Ok((pairs, rem)); 177 | } 178 | let (feature, s) = parse_feature(rem, arena)?; 179 | pairs.push(feature); 180 | rem = s; 181 | } 182 | } 183 | 184 | fn parse_production<'a>( 185 | s: &'a str, 186 | arena: &mut NodeArena, 187 | ) -> ParseResult<'a, (Production, Vec)> { 188 | let (name, s) = parse_name(s).map_err(|e| -> Err { format!("symbol: {}", e).into() })?; 189 | let s = skip_whitespace_nonnewline(s); 190 | let (features, s) = if s.starts_with('[') { 191 | parse_featurestructure(s, arena)? 192 | } else { 193 | (Vec::new(), s) 194 | }; 195 | 196 | if name.chars().next().unwrap().is_uppercase() { 197 | Ok(((Production::new_nonterminal(name.to_string()), features), s)) 198 | } else if !features.is_empty() { 199 | Err(format!("terminal (lower-case) cannot have features: {} {}", name, s).into()) 200 | } else { 201 | // annotate terminals with their matching string 202 | Ok(( 203 | ( 204 | Production::new_terminal(name.to_string()), 205 | vec![Feature { 206 | path: "word".to_string(), 207 | tag: None, 208 | value: arena.alloc_str(name.to_string()), 209 | }], 210 | ), 211 | s, 212 | )) 213 | } 214 | } 215 | 216 | fn parse_nonterminal<'a>( 217 | s: &'a str, 218 | arena: &mut NodeArena, 219 | ) -> ParseResult<'a, (String, Vec)> { 220 | let ((prod, features), s) = parse_production(s, arena)?; 221 | if prod.is_nonterminal() { 222 | Ok(((prod.symbol, features), s)) 223 | } else { 224 | Err(format!("expected nonterminal, got terminal {}: {}", prod.symbol, s).into()) 225 | } 226 | } 227 
| 228 | /// Symbol, productions, terminated by final newline 229 | fn parse_rule<'a>(s: &'a str, arena: &mut NodeArena) -> ParseResult<'a, Rule> { 230 | #![allow(clippy::trivial_regex)] 231 | regex_static!(ARROW, "->"); 232 | 233 | let ((symbol, features), s) = 234 | parse_nonterminal(s, arena).map_err(|e| -> Err { format!("rule symbol: {}", e).into() })?; 235 | let s = skip_whitespace(s); 236 | let (_, s) = needed_re(&ARROW, s).map_err(|e| -> Err { format!("rule arrow: {}", e).into() })?; 237 | 238 | let mut prods_features = Vec::new(); 239 | let mut rem = s; 240 | loop { 241 | rem = skip_whitespace_nonnewline(rem); 242 | 243 | let try_newline = skip_whitespace(rem); 244 | if rem.is_empty() || try_newline != rem { 245 | // end of line, exit loop 246 | rem = try_newline; 247 | break; 248 | } 249 | 250 | let (prod, s) = parse_production(rem, arena) 251 | .map_err(|e| -> Err { format!("rule production: {}", e).into() })?; 252 | prods_features.push(prod); 253 | rem = s; 254 | } 255 | 256 | let (features, productions) = adopt_child_features(features, prods_features); 257 | let features = arena.alloc_from_features(features)?; 258 | 259 | Ok(( 260 | Rule { 261 | symbol, 262 | features, 263 | productions, 264 | }, 265 | rem, 266 | )) 267 | } 268 | 269 | /// We want rules to be able to access their child features, and to be able to 270 | /// unify between them 271 | /// So we have the rule symbol "adopt" the features of its children, copying the 272 | /// child features into child-0.(...), child-1.(...), etc. 273 | /// 274 | /// We could try to implement this when constructing the rule, but it's easier 275 | /// to do as a simple AST transform. 
276 | fn adopt_child_features( 277 | mut rule_features: Vec, 278 | prods_features: Vec<(Production, Vec)>, 279 | ) -> (Vec, Vec) { 280 | let mut productions = Vec::with_capacity(prods_features.len()); 281 | 282 | for (idx, (prod, features)) in prods_features.into_iter().enumerate() { 283 | productions.push(prod); 284 | let prefix = format!("child-{}.", idx); 285 | for feature in features.into_iter() { 286 | rule_features.push(Feature { 287 | path: prefix.clone() + &feature.path, 288 | tag: feature.tag, 289 | value: feature.value, 290 | }); 291 | } 292 | } 293 | 294 | (rule_features, productions) 295 | } 296 | 297 | fn parse_rules<'a>(s: &'a str, arena: &mut NodeArena) -> ParseResult<'a, Vec> { 298 | let mut rules = Vec::new(); 299 | let mut rem = s; 300 | loop { 301 | rem = skip_whitespace(rem); 302 | if rem.is_empty() { 303 | return Ok((rules, rem)); 304 | } 305 | let (rule, s) = parse_rule(rem, arena)?; 306 | rules.push(rule); 307 | rem = s; 308 | } 309 | } 310 | -------------------------------------------------------------------------------- /src/forest.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | use std::sync::Arc; 3 | 4 | use crate::earley::Chart; 5 | use crate::rules::{Grammar, Rule}; 6 | use crate::syntree::{Constituent, SynTree, Word}; 7 | use crate::utils::combinations; 8 | 9 | #[derive(Debug, Clone, PartialEq, Eq)] 10 | pub struct ForestState { 11 | rule: Arc, 12 | span: (usize, usize), 13 | } 14 | 15 | impl ForestState { 16 | pub fn new(rule: &Arc, start: usize, end: usize) -> Self { 17 | Self { 18 | rule: rule.clone(), 19 | span: (start, end), 20 | } 21 | } 22 | } 23 | 24 | impl fmt::Display for ForestState { 25 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 26 | write!(f, "{}..{}: {}", self.span.0, self.span.1, self.rule) 27 | } 28 | } 29 | 30 | impl From<&ForestState> for Constituent> { 31 | fn from(fs: &ForestState) -> Self { 32 | Self { 33 | value: fs.rule.clone(), 34 | 
span: fs.span, 35 | } 36 | } 37 | } 38 | 39 | #[derive(Debug, Clone, PartialEq, Eq)] 40 | pub struct Forest(Vec>); 41 | 42 | impl Forest { 43 | pub fn len(&self) -> usize { 44 | self.0.len() 45 | } 46 | 47 | pub fn is_empty(&self) -> bool { 48 | self.len() == 0 49 | } 50 | 51 | /// Checks if a subtree has already been completed by make_trees(), 52 | /// or if it is a leaf and doesn't need to be completed 53 | fn subtree_is_complete(node: &SynTree, String>) -> bool { 54 | if let Some((cons, children)) = node.get_branch() { 55 | cons.value.productions.len() == children.len() 56 | } else { 57 | // is a leaf 58 | true 59 | } 60 | } 61 | 62 | /// Takes a rule and search span, and returns a vec of all possible sequences 63 | /// of trees that correspond to the rule's productions. 64 | /// So for the situation: 65 | /// ```text 66 | /// g := ''' 67 | /// S -> x 68 | /// S -> S S 69 | /// ''' 70 | /// chart := parse(g, "x x x") 71 | /// chart.extend_out(g, S -> S S, start = 0, end = 3) 72 | /// ``` 73 | /// , which, recall, has a chart that looks like: 74 | /// 75 | /// ```text 76 | /// 0..1: S -> x 77 | /// 0..2: S -> S S 78 | /// 0..3: S -> S S 79 | /// 1..2: S -> x 80 | /// 1..3: S -> S S 81 | /// 2..3: S -> x 82 | /// ``` 83 | /// 84 | /// You'd get 85 | /// 86 | /// ```text 87 | /// [[(S -> x, 0..1), (S -> S S, (), 1..3)], 88 | /// [(S -> S S, (), 0..2), (S -> x, 2..3)]] 89 | /// ``` 90 | fn extend_out( 91 | &self, 92 | rule: &Rule, 93 | prod_idx: usize, 94 | search_start: usize, 95 | search_end: usize, 96 | ) -> Vec, String>>> { 97 | if prod_idx == rule.len() && search_start == search_end { 98 | // base case, we consumed the whole rule and the whole span together. 
99 | // provide a single empty sequence as a base for prepending onto as we unwind the stack 100 | return vec![Vec::new()]; 101 | } else if prod_idx == rule.len() || search_start == search_end { 102 | // we either ran out of productions before consuming everything, or ran out of stuff to consume before 103 | // satisfying all the productions. bail with 0 possible sequences. 104 | return Vec::new(); 105 | } 106 | 107 | let next_production = &rule.productions[prod_idx]; 108 | if next_production.is_nonterminal() { 109 | let wanted_symbol = &next_production.symbol; 110 | // look for potential next states to produce this production at the search start 111 | self.0[search_start] 112 | .iter() 113 | // only consider states that are contained within the search range, and have our wanted symbol 114 | .filter(|s| s.span.1 <= search_end && wanted_symbol == &s.rule.symbol) 115 | .flat_map(|state| { 116 | // recursively find possible sequences that start directly after this state 117 | // TODO: this is probably easily amenable to some dynamic programming to reduce repeated work 118 | self 119 | .extend_out(rule, prod_idx + 1, state.span.1, search_end) 120 | .into_iter() 121 | // if there are any, prepend an uncompleted tree headed by this state onto the sequence and throw it on the pile 122 | .map(move |mut seq| { 123 | seq.insert(0, SynTree::Branch(state.into(), Vec::new())); 124 | seq 125 | }) 126 | }) 127 | .collect() 128 | } else { 129 | // similar to the nonterminal case, but we don't have to search for multiple potential states -- 130 | // all terminals with the same symbol_str are identical. 
131 | let leaf = SynTree::Leaf(Word { 132 | value: next_production.symbol.to_string(), 133 | span: (search_start, search_start + 1), 134 | }); 135 | 136 | // recursively find possible sequences, like before 137 | self 138 | .extend_out(rule, prod_idx + 1, search_start + 1, search_end) 139 | .into_iter() 140 | .map(move |mut seq| { 141 | // prepend our new leaf to them 142 | seq.insert(0, leaf.clone()); 143 | seq 144 | }) 145 | .collect() 146 | } 147 | } 148 | 149 | /// Takes a possibly-uncompleted tree, and returns all possible trees it describes. 150 | /// An uncompleted tree is a non-nullable constituent with 0 children. It needs to be passed 151 | /// into extend_out, and then glued onto 152 | fn make_trees(&self, tree: SynTree, String>) -> Vec, String>> { 153 | if Self::subtree_is_complete(&tree) { 154 | vec![tree] 155 | } else { 156 | let (cons, _) = tree.get_branch().unwrap(); 157 | self 158 | .extend_out(&cons.value, 0, cons.span.0, cons.span.1) 159 | .into_iter() 160 | .flat_map(|children| { 161 | let child_sets = children 162 | .into_iter() 163 | .map(|child| self.make_trees(child)) 164 | .collect::>(); 165 | combinations(&child_sets) 166 | .into_iter() 167 | .map(|set| SynTree::Branch(cons.clone(), set)) 168 | }) 169 | .collect::>() 170 | } 171 | } 172 | 173 | pub fn trees(&self, g: &Grammar) -> Vec, String>> { 174 | if self.is_empty() { 175 | Vec::new() 176 | } else { 177 | // seed our search with all LR0s that started at position 0, span to 178 | // the end of the string, and are named by the grammar's start symbol 179 | let root_states = self.0[0] 180 | .iter() 181 | .filter(|state| state.span.1 == self.len() && state.rule.symbol == g.start) 182 | .map(|state| SynTree::Branch(state.into(), Vec::new())); 183 | // use make_trees to generate all possible filled-in trees from each seed tree 184 | root_states.fold( 185 | Vec::, String>>::new(), 186 | |mut prev, tree| { 187 | let mut trees = self.make_trees(tree); 188 | prev.append(&mut trees); 189 | prev 
190 | }, 191 | ) 192 | } 193 | } 194 | } 195 | 196 | impl From for Forest { 197 | fn from(chart: Chart) -> Self { 198 | // the new chart will be indexed by origin location, and no rule can have 199 | // its origin at the end of the string, so len is chart.len - 1 200 | let mut v = vec![Vec::new(); chart.len() - 1]; 201 | 202 | for (k, states) in chart.into_iter() { 203 | for state in states { 204 | // exclude unfinished rules that can't contribute to a tree 205 | if !state.lr0.is_active() { 206 | v.get_mut(state.origin) 207 | .expect("origin > input len") 208 | .push(ForestState::new(&state.lr0.rule, state.origin, k)); 209 | } 210 | } 211 | } 212 | 213 | Self(v) 214 | } 215 | } 216 | 217 | impl fmt::Display for Forest { 218 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 219 | for k in 0..self.len() { 220 | writeln!(f, "Origin {}:", k)?; 221 | for fs in self.0[k].iter() { 222 | writeln!(f, " {}", fs)?; 223 | } 224 | } 225 | 226 | Ok(()) 227 | } 228 | } 229 | 230 | #[test] 231 | fn test_parse_chart() { 232 | let g: Grammar = r#" 233 | S -> x 234 | S -> S S 235 | "# 236 | .parse() 237 | .unwrap(); 238 | 239 | let get_rule_with_len = |len: usize| { 240 | g.rules 241 | .get("S") 242 | .unwrap() 243 | .iter() 244 | .find(|r| r.len() == len) 245 | .unwrap() 246 | }; 247 | 248 | let rule1 = get_rule_with_len(1); 249 | let rule2 = get_rule_with_len(2); 250 | 251 | let forest: Forest = crate::earley::parse_chart(&g, &["x", "x", "x"]).into(); 252 | 253 | assert_eq!( 254 | forest, 255 | Forest(vec![ 256 | vec![ 257 | ForestState::new(rule1, 0, 1), 258 | ForestState::new(rule2, 0, 2), 259 | ForestState::new(rule2, 0, 3), 260 | ], 261 | vec![ForestState::new(rule1, 1, 2), ForestState::new(rule2, 1, 3),], 262 | vec![ForestState::new(rule1, 2, 3)], 263 | ]) 264 | ); 265 | 266 | println!("{}", forest); 267 | } 268 | 269 | #[test] 270 | fn test_tree_generation() { 271 | // test the tree ambiguity problem that naive earley forest processing has 272 | // correct 
algorithm finds 2 trees: 273 | // (S (S x) (S (S x) (S x))) -> [x][xx] 274 | // (S (S (S x) (S x)) (S x)) -> [xx][x] 275 | // naive algorithm finds 2 addl. spurious trees: 276 | // (S (S x) (S x)) -> [x][x] 277 | // (S (S (S x) (S x)) (S (S x) (S x))) -> [xx][xx] 278 | 279 | let g = r#" 280 | S -> x 281 | S -> S S 282 | "# 283 | .parse() 284 | .unwrap(); 285 | 286 | let forest: Forest = crate::earley::parse_chart(&g, &["x", "x", "x"]).into(); 287 | let trees = forest.trees(&g); 288 | 289 | for tree in trees.iter() { 290 | println!("{}\n", tree); 291 | } 292 | 293 | assert_eq!(trees.len(), 2); 294 | } 295 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | /*! 2 | A symbolic natural language parsing library for Rust, inspired by 3 | [HDPSG](https://en.wikipedia.org/wiki/Head-driven_phrase_structure_grammar). 4 | 5 | # What is this? 6 | This is a library for parsing natural or constructed languages into syntax trees 7 | and feature structures. There's no machine learning or probabilistic models, 8 | everything is hand-crafted and deterministic. 9 | 10 | You can find out more about the motivations of this project in 11 | [this blog post](https://vgel.me/posts/symbolic-linguistics-part1/). 12 | 13 | ## But what are you using it for? 14 | I'm using this to parse a constructed language for my upcoming xenolinguistics 15 | game, [Themengi](https://vgel.me/themengi/). 16 | 17 | # Motivation 18 | Using a simple 80-line grammar, introduced in the tutorial below, we can parse 19 | a simple subset of English, checking reflexive pronoun binding, case, and 20 | number agreement. 
21 | 22 | ```text 23 | $ cargo run --bin cli examples/reflexives.fgr 24 | > she likes himself 25 | Parsed 0 trees 26 | 27 | > her likes herself 28 | Parsed 0 trees 29 | 30 | > she like herself 31 | Parsed 0 trees 32 | 33 | > she likes herself 34 | Parsed 1 tree 35 | (0..3: S 36 | (0..1: N (0..1: she)) 37 | (1..2: TV (1..2: likes)) 38 | (2..3: N (2..3: herself))) 39 | [ 40 | child-2: [ 41 | case: acc 42 | pron: ref 43 | needs_pron: #0 she 44 | num: sg 45 | child-0: [ word: herself ] 46 | ] 47 | child-1: [ 48 | tense: nonpast 49 | child-0: [ word: likes ] 50 | num: #1 sg 51 | ] 52 | child-0: [ 53 | child-0: [ word: she ] 54 | case: nom 55 | pron: #0 56 | num: #1 57 | ] 58 | ] 59 | ``` 60 | 61 | Low resource language? Low problem! No need to train on gigabytes of text, just 62 | write a grammar using your brain. Let's hypothesize that in 63 | American Sign Language, topicalized nouns (expressed with raised eyebrows) 64 | must appear first in the sentence. We can write a small grammar (18 lines), 65 | and plug in some sentences: 66 | 67 | ```text 68 | $ cargo run --bin cli examples/asl-wordorder.fgr -n 69 | > boy sit 70 | Parsed 1 tree 71 | (0..2: S 72 | (0..1: NP ((0..1: N (0..1: boy)))) 73 | (1..2: IV (1..2: sit))) 74 | 75 | > boy throw ball 76 | Parsed 1 tree 77 | (0..3: S 78 | (0..1: NP ((0..1: N (0..1: boy)))) 79 | (1..2: TV (1..2: throw)) 80 | (2..3: NP ((2..3: N (2..3: ball))))) 81 | 82 | > ball nm-raised-eyebrows boy throw 83 | Parsed 1 tree 84 | (0..4: S 85 | (0..2: NP 86 | (0..1: N (0..1: ball)) 87 | (1..2: Topic (1..2: nm-raised-eyebrows))) 88 | (2..3: NP ((2..3: N (2..3: boy)))) 89 | (3..4: TV (3..4: throw))) 90 | 91 | > boy throw ball nm-raised-eyebrows 92 | Parsed 0 trees 93 | ``` 94 | 95 | # Tutorial 96 | As an example, let's say we want to build a parser for English reflexive 97 | pronouns (himself, herself, themselves, themself, itself). We'll also support 98 | number ("He likes X" v.s. 
"They like X") and simple embedded clauses 99 | ("He said that they like X"). 100 | 101 | Grammar files are written in a custom language, similar to BNF, called 102 | Feature GRammar (.fgr). There's a VSCode syntax highlighting extension for these 103 | files available as [`fgr-syntax`](https://marketplace.visualstudio.com/items?itemName=vgel.fgr-syntax). 104 | 105 | We'll start by defining our lexicon. The lexicon is the set of terminal symbols 106 | (symbols in the actual input) that the grammar will match. Terminal symbols must 107 | start with a lowercase letter, and non-terminal symbols must start with an 108 | uppercase letter. 109 | 110 | ```fgr 111 | // pronouns 112 | N -> he 113 | N -> him 114 | N -> himself 115 | N -> she 116 | N -> her 117 | N -> herself 118 | N -> they 119 | N -> them 120 | N -> themselves 121 | N -> themself 122 | 123 | // names, lowercase as they are terminals 124 | N -> mary 125 | N -> sue 126 | N -> takeshi 127 | N -> robert 128 | 129 | // complementizer 130 | Comp -> that 131 | 132 | // verbs -- intransitive, transitive, and clausal 133 | IV -> falls 134 | IV -> fall 135 | IV -> fell 136 | 137 | TV -> likes 138 | TV -> like 139 | TV -> liked 140 | 141 | CV -> says 142 | CV -> say 143 | CV -> said 144 | ``` 145 | 146 | Next, we can add our sentence rules (they must be added at the top, as the first 147 | rule in the file is assumed to be the top-level rule): 148 | 149 | ```fgr 150 | // sentence rules 151 | S -> N IV 152 | S -> N TV N 153 | S -> N CV Comp S 154 | 155 | // ... previous lexicon ... 
156 | ``` 157 | 158 | Assuming this file is saved as `examples/no-features.fgr` (which it is :wink:), 159 | we can test this file with the built-in CLI: 160 | 161 | ```text 162 | $ cargo run --bin cli examples/no-features.fgr 163 | > he falls 164 | Parsed 1 tree 165 | (0..2: S 166 | (0..1: N (0..1: he)) 167 | (1..2: IV (1..2: falls))) 168 | [ 169 | child-1: [ child-0: [ word: falls ] ] 170 | child-0: [ child-0: [ word: he ] ] 171 | ] 172 | 173 | > he falls her 174 | Parsed 0 trees 175 | 176 | > he likes her 177 | Parsed 1 tree 178 | (0..3: S 179 | (0..1: N (0..1: he)) 180 | (1..2: TV (1..2: likes)) 181 | (2..3: N (2..3: her))) 182 | [ 183 | child-2: [ child-0: [ word: her ] ] 184 | child-1: [ child-0: [ word: likes ] ] 185 | child-0: [ child-0: [ word: he ] ] 186 | ] 187 | 188 | > he likes 189 | Parsed 0 trees 190 | 191 | > he said that he likes her 192 | Parsed 1 tree 193 | (0..6: S 194 | (0..1: N (0..1: he)) 195 | (1..2: CV (1..2: said)) 196 | (2..3: Comp (2..3: that)) 197 | (3..6: S 198 | (3..4: N (3..4: he)) 199 | (4..5: TV (4..5: likes)) 200 | (5..6: N (5..6: her)))) 201 | [ 202 | child-0: [ child-0: [ word: he ] ] 203 | child-2: [ child-0: [ word: that ] ] 204 | child-1: [ child-0: [ word: said ] ] 205 | child-3: [ 206 | child-2: [ child-0: [ word: her ] ] 207 | child-1: [ child-0: [ word: likes ] ] 208 | child-0: [ child-0: [ word: he ] ] 209 | ] 210 | ] 211 | 212 | > he said that he 213 | Parsed 0 trees 214 | ``` 215 | 216 | This grammar already parses some correct sentences, and blocks some trivially 217 | incorrect ones. 
However, it doesn't care about number, case, or reflexives 218 | right now: 219 | 220 | ```text 221 | > she likes himself // unbound reflexive pronoun 222 | Parsed 1 tree 223 | (0..3: S 224 | (0..1: N (0..1: she)) 225 | (1..2: TV (1..2: likes)) 226 | (2..3: N (2..3: himself))) 227 | [ 228 | child-0: [ child-0: [ word: she ] ] 229 | child-2: [ child-0: [ word: himself ] ] 230 | child-1: [ child-0: [ word: likes ] ] 231 | ] 232 | 233 | > him like her // incorrect case on the subject pronoun, should be nominative 234 | // (he) instead of accusative (him) 235 | Parsed 1 tree 236 | (0..3: S 237 | (0..1: N (0..1: him)) 238 | (1..2: TV (1..2: like)) 239 | (2..3: N (2..3: her))) 240 | [ 241 | child-0: [ child-0: [ word: him ] ] 242 | child-1: [ child-0: [ word: like ] ] 243 | child-2: [ child-0: [ word: her ] ] 244 | ] 245 | 246 | > he like her // incorrect verb number agreement 247 | Parsed 1 tree 248 | (0..3: S 249 | (0..1: N (0..1: he)) 250 | (1..2: TV (1..2: like)) 251 | (2..3: N (2..3: her))) 252 | [ 253 | child-2: [ child-0: [ word: her ] ] 254 | child-1: [ child-0: [ word: like ] ] 255 | child-0: [ child-0: [ word: he ] ] 256 | ] 257 | ``` 258 | 259 | To fix this, we need to add *features* to our lexicon, and restrict the sentence 260 | rules based on features. 261 | 262 | Features are added with square brackets, and are key: value pairs separated by 263 | commas. `**top**` is a special feature value, which basically means 264 | "unspecified" -- we'll come back to it later. Features that are unspecified are 265 | also assumed to have a `**top**` value, but sometimes explicitly stating top is 266 | more clear. 267 | 268 | ```fgr 269 | /// Pronouns 270 | // The added features are: 271 | // * num: sg or pl, whether this noun wants a singular verb (likes) or 272 | // a plural verb (like). 
note this is grammatical number, so for example 273 | // singular they takes plural agreement ("they like X", not *"they likes X") 274 | // * case: nom or acc, whether this noun is nominative or accusative case. 275 | // nominative case goes in the subject, and accusative in the object. 276 | // e.g., "he fell" and "she likes him", not *"him fell" and *"her likes he" 277 | // * pron: he, she, they, or ref -- what type of pronoun this is 278 | // * needs_pron: whether this is a reflexive that needs to bind to another 279 | // pronoun. 280 | N[ num: sg, case: nom, pron: he ] -> he 281 | N[ num: sg, case: acc, pron: he ] -> him 282 | N[ num: sg, case: acc, pron: ref, needs_pron: he ] -> himself 283 | N[ num: sg, case: nom, pron: she ] -> she 284 | N[ num: sg, case: acc, pron: she ] -> her 285 | N[ num: sg, case: acc, pron: ref, needs_pron: she] -> herself 286 | N[ num: pl, case: nom, pron: they ] -> they 287 | N[ num: pl, case: acc, pron: they ] -> them 288 | N[ num: pl, case: acc, pron: ref, needs_pron: they ] -> themselves 289 | N[ num: sg, case: acc, pron: ref, needs_pron: they ] -> themself 290 | 291 | // Names 292 | // The added features are: 293 | // * num: sg, as people are singular ("mary likes her" / *"mary like her") 294 | // * case: **top**, as names can be both subjects and objects 295 | // ("mary likes her" / "she likes mary") 296 | // * pron: whichever pronoun the person uses for reflexive agreement 297 | // mary pron: she => mary likes herself 298 | // sue pron: they => sue likes themself 299 | // takeshi pron: he => takeshi likes himself 300 | N[ num: sg, case: **top**, pron: she ] -> mary 301 | N[ num: sg, case: **top**, pron: they ] -> sue 302 | N[ num: sg, case: **top**, pron: he ] -> takeshi 303 | N[ num: sg, case: **top**, pron: he ] -> robert 304 | 305 | // Complementizer doesn't need features 306 | Comp -> that 307 | 308 | // Verbs -- intransitive, transitive, and clausal 309 | // The added features are: 310 | // * num: sg, pl, or **top** -- to 
match the noun numbers. 311 | // **top** will match either sg or pl, as past-tense verbs in English 312 | // don't agree in number: "he fell" and "they fell" are both fine 313 | // * tense: past or nonpast -- this won't be used for agreement, but will be 314 | // copied into the final feature structure, and the client code could do 315 | // something with it 316 | IV[ num: sg, tense: nonpast ] -> falls 317 | IV[ num: pl, tense: nonpast ] -> fall 318 | IV[ num: **top**, tense: past ] -> fell 319 | 320 | TV[ num: sg, tense: nonpast ] -> likes 321 | TV[ num: pl, tense: nonpast ] -> like 322 | TV[ num: **top**, tense: past ] -> liked 323 | 324 | CV[ num: sg, tense: nonpast ] -> says 325 | CV[ num: pl, tense: nonpast ] -> say 326 | CV[ num: **top**, tense: past ] -> said 327 | ``` 328 | 329 | Now that our lexicon is updated with features, we can update our sentence rules 330 | to constrain parsing based on those features. This uses two new concepts, 331 | tags and unification. Tags allow features to be associated between nodes in a 332 | rule, and unification controls how those features are compatible. The rules for 333 | unification are: 334 | 335 | 1. A string feature can unify with a string feature with the same value 336 | 2. A `**top**` feature can unify with anything, and the nodes are merged 337 | 3. A complex feature ([ ... ] structure) is recursively unified with another 338 | complex feature. 339 | 340 | If unification fails anywhere, the parse is aborted and the tree is discarded. 341 | This allows the programmer to discard trees if features don't match.
342 | 343 | ```fgr 344 | // Sentence rules 345 | // Intransitive verb: 346 | // * Subject must be nominative case 347 | // * Subject and verb must agree in number (copied through #1) 348 | S -> N[ case: nom, num: #1 ] IV[ num: #1 ] 349 | // Transitive verb: 350 | // * Subject must be nominative case 351 | // * Subject and verb must agree in number (copied through #2) 352 | // * If there's a reflexive in the object position, make sure its `needs_pron` 353 | // feature matches the subject's `pron` feature. If the object isn't a 354 | // reflexive, then its `needs_pron` feature will implicitly be `**top**`, so 355 | // will unify with anything. 356 | S -> N[ case: nom, pron: #1, num: #2 ] TV[ num: #2 ] N[ case: acc, needs_pron: #1 ] 357 | // Clausal verb: 358 | // * Subject must be nominative case 359 | // * Subject and verb must agree in number (copied through #1) 360 | // * Reflexives can't cross clause boundaries (*"He said that she likes himself"), 361 | // so we can ignore reflexives and delegate to inner clause rule 362 | S -> N[ case: nom, num: #1 ] CV[ num: #1 ] Comp S 363 | ``` 364 | 365 | Now that we have this augmented grammar (available as `examples/reflexives.fgr`), 366 | we can try it out and see that it rejects illicit sentences that were previously 367 | accepted, while still accepting valid ones: 368 | 369 | ```text 370 | > he fell 371 | Parsed 1 tree 372 | (0..2: S 373 | (0..1: N (0..1: he)) 374 | (1..2: IV (1..2: fell))) 375 | [ 376 | child-1: [ 377 | child-0: [ word: fell ] 378 | num: #0 sg 379 | tense: past 380 | ] 381 | child-0: [ 382 | pron: he 383 | case: nom 384 | num: #0 385 | child-0: [ word: he ] 386 | ] 387 | ] 388 | 389 | > he like him 390 | Parsed 0 trees 391 | 392 | > he likes himself 393 | Parsed 1 tree 394 | (0..3: S 395 | (0..1: N (0..1: he)) 396 | (1..2: TV (1..2: likes)) 397 | (2..3: N (2..3: himself))) 398 | [ 399 | child-1: [ 400 | num: #0 sg 401 | child-0: [ word: likes ] 402 | tense: nonpast 403 | ] 404 | child-2: [ 405 | 
needs_pron: #1 he 406 | num: sg 407 | child-0: [ word: himself ] 408 | pron: ref 409 | case: acc 410 | ] 411 | child-0: [ 412 | child-0: [ word: he ] 413 | pron: #1 414 | num: #0 415 | case: nom 416 | ] 417 | ] 418 | 419 | > he likes herself 420 | Parsed 0 trees 421 | 422 | > mary likes herself 423 | Parsed 1 tree 424 | (0..3: S 425 | (0..1: N (0..1: mary)) 426 | (1..2: TV (1..2: likes)) 427 | (2..3: N (2..3: herself))) 428 | [ 429 | child-0: [ 430 | pron: #0 she 431 | num: #1 sg 432 | case: nom 433 | child-0: [ word: mary ] 434 | ] 435 | child-1: [ 436 | tense: nonpast 437 | child-0: [ word: likes ] 438 | num: #1 439 | ] 440 | child-2: [ 441 | child-0: [ word: herself ] 442 | num: sg 443 | pron: ref 444 | case: acc 445 | needs_pron: #0 446 | ] 447 | ] 448 | 449 | > mary likes themself 450 | Parsed 0 trees 451 | 452 | > sue likes themself 453 | Parsed 1 tree 454 | (0..3: S 455 | (0..1: N (0..1: sue)) 456 | (1..2: TV (1..2: likes)) 457 | (2..3: N (2..3: themself))) 458 | [ 459 | child-0: [ 460 | pron: #0 they 461 | child-0: [ word: sue ] 462 | case: nom 463 | num: #1 sg 464 | ] 465 | child-1: [ 466 | tense: nonpast 467 | num: #1 468 | child-0: [ word: likes ] 469 | ] 470 | child-2: [ 471 | needs_pron: #0 472 | case: acc 473 | pron: ref 474 | child-0: [ word: themself ] 475 | num: sg 476 | ] 477 | ] 478 | 479 | > sue likes himself 480 | Parsed 0 trees 481 | ``` 482 | 483 | If this is interesting to you and you want to learn more, you can check out 484 | [my blog series](https://vgel.me/posts/symbolic-linguistics-part1/), 485 | the excellent textbook [Syntactic Theory: A Formal Introduction (2nd ed.)](https://web.stanford.edu/group/cslipublications/cslipublications/site/1575864002.shtml), 486 | and the [DELPH-IN project](http://www.delph-in.net/wiki/index.php/Home), whose 487 | work on the LKB inspired this simplified version. 
488 | 489 | # Using from code 490 | I need to write this section in more detail, but if you're comfortable with Rust, 491 | I suggest looking through the codebase. It's not perfect: it started as one of 492 | my first Rust projects (after migrating through F# -> TypeScript -> C in search 493 | of the right performance/ergonomics tradeoff), and it could use more tests, 494 | but overall it's not too bad. 495 | 496 | Basically, the processing pipeline is: 497 | 498 | 1. Make a `Grammar` struct 499 | * `Grammar` is defined in `rules.rs`. 500 | * The easiest way to make a `Grammar` is `Grammar::read_from_file`, which reads 501 | the file and runs the hand-written recursive descent parser in `fgr/parse_grammar.rs`. Yes, 502 | I recognize the irony here. 503 | 2. It takes input (in `Grammar::parse`, which does everything for you, or 504 | `Grammar::parse_chart`, which just does the chart) 505 | 3. The input is first chart-parsed in `earley.rs` 506 | 4. Then, a forest is built from the chart, in `forest.rs`, using an algorithm 507 | I found in a very useful blog series I forget the URL for, because the 508 | algorithms in the academic literature for this are... weird. 509 | 5. Finally, the feature unification is used to prune the forest down to only 510 | valid trees. It would be more efficient to do this during parsing, but meh. 511 | 512 | The most interesting thing you can do via code and not via the CLI is probably 513 | getting at the raw feature DAG, as that would let you do things like pronoun 514 | coreference. The DAG code is in the `featurestructure` module, and should be fairly 515 | approachable -- there's a lot of Rust ceremony around `Rc<RefCell<...>>` 516 | because using an arena allocation crate seemed ~~too har~~like overkill, but 517 | that is somewhat mitigated by the `NodeRef` type alias. Hit me up at 518 | https://vgel.me/contact if you need help with anything here!
519 | */ 520 | 521 | #[macro_use] 522 | extern crate lazy_static; 523 | 524 | pub mod earley; 525 | pub mod featurestructure; 526 | pub mod fgr; 527 | pub mod forest; 528 | pub mod rules; 529 | pub mod syntree; 530 | pub mod utils; 531 | 532 | use std::fs; 533 | use std::path; 534 | use std::sync::Arc; 535 | 536 | use tracing::{debug, trace}; 537 | 538 | pub use crate::earley::{Chart, parse_chart}; 539 | pub use crate::featurestructure::{NodeArena, NodeIdx}; 540 | pub use crate::forest::Forest; 541 | pub use crate::rules::{Grammar, Rule}; 542 | pub use crate::syntree::{Constituent, SynTree}; 543 | pub use crate::utils::Err; 544 | 545 | impl Grammar { 546 | pub fn parse_chart(&self, input: &[&str]) -> Chart { 547 | parse_chart(self, input) 548 | } 549 | 550 | pub fn parse_forest(&self, input: &[&str]) -> Forest { 551 | Forest::from(self.parse_chart(input)) 552 | } 553 | 554 | pub fn unify_tree( 555 | tree: SynTree, String>, 556 | arena: &mut NodeArena, 557 | ) -> Result<(SynTree, NodeIdx), Err> { 558 | match tree { 559 | SynTree::Leaf(w) => Ok((SynTree::Leaf(w), arena.alloc_top())), 560 | SynTree::Branch(cons, children) => { 561 | let features = arena.clone(cons.value.features); 562 | 563 | let mut bare_children = Vec::with_capacity(children.len()); 564 | for (idx, child) in children.into_iter().enumerate() { 565 | let (child_tree, child_features) = Self::unify_tree(child, arena)?; 566 | bare_children.push(child_tree); 567 | 568 | let to_unify = 569 | arena.alloc_from_edges(vec![(format!("child-{}", idx), child_features)])?; 570 | 571 | trace!("unifying {} with child-{}", cons.value.symbol, idx); 572 | trace!( 573 | "{} features: {}", 574 | cons.value.symbol, 575 | arena.display(cons.value.features) 576 | ); 577 | arena.unify(features, to_unify)?; 578 | } 579 | 580 | let bare_self = SynTree::Branch( 581 | Constituent { 582 | span: cons.span, 583 | value: cons.value.symbol.clone(), 584 | }, 585 | bare_children, 586 | ); 587 | 588 | Ok((bare_self, features)) 589 | } 
590 | } 591 | } 592 | 593 | pub fn parse(&self, input: &[&str]) -> Vec<(SynTree, NodeIdx, NodeArena)> { 594 | let forest = self.parse_forest(input); 595 | let trees = forest.trees(self); 596 | 597 | let mut results = Vec::new(); 598 | 599 | for tree in trees { 600 | // TODO might be able to share arena between parse attempts 601 | let mut arena = self.create_parse_arena(); 602 | match Self::unify_tree(tree, &mut arena) { 603 | Ok((syn_tree, idx)) => results.push((syn_tree, idx, arena)), 604 | Err(e) => debug!("{e}"), 605 | } 606 | } 607 | 608 | results 609 | } 610 | 611 | pub fn read_from_file>(path: P) -> Result { 612 | fs::read_to_string(path)?.parse() 613 | } 614 | } 615 | 616 | #[test] 617 | fn test_unification_blocking() { 618 | let g: Grammar = r#" 619 | S -> N[ case: nom, pron: #1 ] TV N[ case: acc, needs_pron: #1 ] 620 | TV -> likes 621 | N[ case: nom, pron: she ] -> she 622 | N[ case: nom, pron: he ] -> he 623 | N[ case: acc, pron: he ] -> him 624 | N[ case: acc, pron: ref, needs_pron: he ] -> himself 625 | "# 626 | .parse() 627 | .unwrap(); 628 | 629 | assert_eq!(g.parse(&["he", "likes", "himself"]).len(), 1); 630 | assert_eq!(g.parse(&["he", "likes", "him"]).len(), 1); 631 | assert_eq!(g.parse(&["she", "likes", "him"]).len(), 1); 632 | 633 | assert_eq!(g.parse(&["himself", "likes", "himself"]).len(), 0); 634 | assert_eq!(g.parse(&["she", "likes", "himself"]).len(), 0); 635 | assert_eq!(g.parse(&["himself", "likes", "him"]).len(), 0); 636 | } 637 | 638 | #[test] 639 | fn test_complex() { 640 | let g: Grammar = std::fs::read_to_string("examples/dative-shift.fgr") 641 | .unwrap() 642 | .parse() 643 | .unwrap(); 644 | 645 | assert_eq!(g.parse(&["i", "gave", "her", "apples"]).len(), 1); 646 | assert_eq!(g.parse(&["i", "gave", "apples", "to", "her"]).len(), 1); 647 | assert_eq!(g.parse(&["i", "gave", "to", "her", "apples"]).len(), 0); 648 | } 649 | -------------------------------------------------------------------------------- /src/rules.rs: 
-------------------------------------------------------------------------------- 1 | use std::collections::{HashMap, HashSet}; 2 | use std::fmt; 3 | use std::sync::Arc; 4 | 5 | use crate::featurestructure::{NodeArena, NodeIdx}; 6 | use crate::utils::Err; 7 | 8 | #[derive(Debug, Copy, Clone, PartialEq, Eq)] 9 | pub enum ProductionKind { 10 | Terminal, 11 | Nonterminal, 12 | } 13 | 14 | #[derive(Debug, Clone, PartialEq, Eq)] 15 | pub struct Production { 16 | pub kind: ProductionKind, 17 | pub symbol: String, 18 | } 19 | 20 | impl Production { 21 | pub fn new_terminal(symbol: String) -> Self { 22 | Self { 23 | kind: ProductionKind::Terminal, 24 | symbol, 25 | } 26 | } 27 | 28 | pub fn new_nonterminal(symbol: String) -> Self { 29 | Self { 30 | kind: ProductionKind::Nonterminal, 31 | symbol, 32 | } 33 | } 34 | 35 | pub fn is_terminal(&self) -> bool { 36 | self.kind == ProductionKind::Terminal 37 | } 38 | 39 | pub fn is_nonterminal(&self) -> bool { 40 | self.kind == ProductionKind::Nonterminal 41 | } 42 | } 43 | 44 | impl fmt::Display for Production { 45 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 46 | write!(f, "{}", self.symbol) 47 | } 48 | } 49 | 50 | #[derive(Debug, PartialEq, Eq)] 51 | pub struct Rule { 52 | pub symbol: String, 53 | pub features: NodeIdx, 54 | pub productions: Vec, 55 | } 56 | 57 | impl Rule { 58 | pub fn len(&self) -> usize { 59 | self.productions.len() 60 | } 61 | 62 | pub fn is_empty(&self) -> bool { 63 | self.len() == 0 64 | } 65 | } 66 | 67 | impl std::fmt::Display for Rule { 68 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 69 | // Note: can't display features here without an arena reference 70 | write!(f, "{} ->", self.symbol)?; 71 | for p in self.productions.iter() { 72 | write!(f, " {}", p)?; 73 | } 74 | Ok(()) 75 | } 76 | } 77 | 78 | #[derive(Debug)] 79 | pub struct Grammar { 80 | pub start: String, 81 | pub rules: HashMap>>, 82 | pub arena: NodeArena, 83 | nullables: HashSet, 84 | nonterminals: 
HashSet, 85 | } 86 | 87 | impl std::fmt::Display for Grammar { 88 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 89 | writeln!(f, "//** start: {}", self.start)?; 90 | write!(f, "//** nonterminals:")?; 91 | for nt in self.nonterminals.iter() { 92 | write!(f, " {}", nt)?; 93 | } 94 | writeln!(f)?; 95 | 96 | write!(f, "//** nullables:")?; 97 | for nt in self.nullables.iter() { 98 | write!(f, " {}", nt)?; 99 | } 100 | writeln!(f)?; 101 | 102 | for rule in self.rules.values().flatten() { 103 | writeln!(f, "{}\n", rule)?; 104 | } 105 | 106 | Ok(()) 107 | } 108 | } 109 | 110 | impl Grammar { 111 | pub fn new(rules: Vec, arena: NodeArena) -> Result { 112 | assert!(!rules.is_empty()); 113 | 114 | let nonterminals: HashSet = rules.iter().map(|r| r.symbol.clone()).collect(); 115 | let start = rules[0].symbol.clone(); 116 | 117 | for r in rules.iter() { 118 | for p in r.productions.iter() { 119 | if p.is_nonterminal() && !nonterminals.contains(&p.symbol) { 120 | return Err(format!("missing rules for nonterminal {}", p.symbol).into()); 121 | } 122 | } 123 | } 124 | 125 | let rules: HashMap>> = 126 | rules.into_iter().fold(HashMap::new(), |mut map, rule| { 127 | map 128 | .entry(rule.symbol.clone()) 129 | .or_default() 130 | .push(Arc::new(rule)); 131 | map 132 | }); 133 | 134 | let nullables = Self::find_nullables(&rules); 135 | 136 | Ok(Self { 137 | start, 138 | rules, 139 | arena, 140 | nonterminals, 141 | nullables, 142 | }) 143 | } 144 | 145 | // Create a fresh arena for parsing, with a clone of the grammar's arena 146 | pub fn create_parse_arena(&self) -> NodeArena { 147 | self.arena.clone() 148 | } 149 | 150 | pub fn is_nullable(&self, s: &str) -> bool { 151 | self.nullables.contains(s) 152 | } 153 | } 154 | 155 | impl Grammar { 156 | fn rule_is_nullable(nullables: &HashSet, rule: &Rule) -> bool { 157 | rule.is_empty() 158 | || rule 159 | .productions 160 | .iter() 161 | .all(|p| p.is_nonterminal() && nullables.contains(&p.symbol)) 162 | } 163 | 
164 | fn find_nullables(rules: &HashMap>>) -> HashSet { 165 | let mut nullables: HashSet = HashSet::new(); 166 | 167 | let mut last_length = 1; 168 | while last_length != nullables.len() { 169 | last_length = nullables.len(); 170 | for r in rules.values().flatten() { 171 | if !nullables.contains(&r.symbol) && Self::rule_is_nullable(&nullables, r) { 172 | nullables.insert(r.symbol.clone()); 173 | } 174 | } 175 | } 176 | 177 | nullables 178 | } 179 | } 180 | 181 | #[test] 182 | fn test_parse_grammar() { 183 | let g: Grammar = r#" 184 | S -> N[ case: nom, num: #1 ] IV[ num: #1 ] 185 | S -> N[ case: nom, pron: #1, num: #2 ] TV[ num: #2 ] N[ case: acc, needs_pron: #1 ] 186 | S -> N[ case: nom, num: #1 ] CV[ num: #num ] Comp S 187 | 188 | N[ num: sg, pron: she ] -> mary 189 | IV[ num: top, tense: past ] -> fell 190 | TV[ num: top, tense: past ] -> kissed 191 | CV[ num: top, tense: past ] -> said 192 | Comp -> that 193 | "# 194 | .parse() 195 | .unwrap(); 196 | 197 | let nonterminals: HashSet = ["S", "N", "IV", "TV", "CV", "Comp"] 198 | .iter() 199 | .map(|&s| s.to_string()) 200 | .collect(); 201 | assert_eq!(nonterminals, g.nonterminals); 202 | assert_eq!(g.rules.len(), 6); 203 | 204 | assert_eq!(g.rules.get("S").unwrap().len(), 3); 205 | assert_eq!(g.rules.get("N").unwrap().len(), 1); 206 | assert_eq!(g.rules.get("IV").unwrap().len(), 1); 207 | assert_eq!(g.rules.get("TV").unwrap().len(), 1); 208 | assert_eq!(g.rules.get("CV").unwrap().len(), 1); 209 | assert_eq!(g.rules.get("Comp").unwrap().len(), 1); 210 | assert!(!g.rules.contains_key("that")); 211 | assert!(!g.rules.contains_key("mary")); 212 | } 213 | 214 | #[test] 215 | fn test_find_nullables() { 216 | let g: Grammar = r#" 217 | S -> A B 218 | A -> c 219 | B -> D D 220 | D -> 221 | "# 222 | .parse() 223 | .unwrap(); 224 | 225 | let nl: HashSet = ["B", "D"].iter().map(|&s| s.to_string()).collect(); 226 | assert_eq!(g.nullables, nl); 227 | } 228 | 
-------------------------------------------------------------------------------- /src/syntree.rs: -------------------------------------------------------------------------------- 1 | use std::fmt; 2 | 3 | #[derive(Debug, Clone, PartialEq, Eq)] 4 | pub struct Constituent { 5 | pub value: T, 6 | pub span: (usize, usize), 7 | } 8 | 9 | impl fmt::Display for Constituent 10 | where 11 | T: fmt::Display, 12 | { 13 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 14 | write!(f, "{}..{}: {}", self.span.0, self.span.1, self.value) 15 | } 16 | } 17 | 18 | #[derive(Debug, Clone, PartialEq, Eq)] 19 | pub struct Word { 20 | pub value: U, 21 | pub span: (usize, usize), 22 | } 23 | 24 | impl fmt::Display for Word 25 | where 26 | U: fmt::Display, 27 | { 28 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 29 | write!(f, "{}..{}: {}", self.span.0, self.span.1, self.value) 30 | } 31 | } 32 | 33 | #[derive(Debug, PartialEq, Clone)] 34 | pub enum SynTree { 35 | Branch(Constituent, Vec>), 36 | Leaf(Word), 37 | } 38 | 39 | impl SynTree { 40 | pub fn is_leaf(&self) -> bool { 41 | matches!(self, Self::Leaf(_)) 42 | } 43 | 44 | pub fn is_branch(&self) -> bool { 45 | matches!(self, Self::Branch(_, _)) 46 | } 47 | 48 | pub fn get_leaf(&self) -> Option<&Word> { 49 | match self { 50 | Self::Leaf(w) => Some(w), 51 | _ => None, 52 | } 53 | } 54 | 55 | #[allow(clippy::type_complexity)] // TODO 56 | pub fn get_branch(&self) -> Option<(&Constituent, &Vec>)> { 57 | match self { 58 | Self::Branch(c, cs) => Some((c, cs)), 59 | _ => None, 60 | } 61 | } 62 | 63 | #[allow(clippy::type_complexity)] // TODO 64 | pub fn into_branch(self) -> Option<(Constituent, Vec>)> { 65 | match self { 66 | Self::Branch(c, cs) => Some((c, cs)), 67 | _ => None, 68 | } 69 | } 70 | 71 | pub fn map( 72 | &self, 73 | map_branch: fn(&Constituent) -> V, 74 | map_leaf: fn(&Word) -> W, 75 | ) -> SynTree { 76 | match self { 77 | Self::Branch(t, children) => { 78 | let children = children 79 | .iter() 80 | 
.map(|c| c.map(map_branch, map_leaf)) 81 | .collect::>(); 82 | SynTree::Branch( 83 | Constituent { 84 | span: t.span, 85 | value: map_branch(t), 86 | }, 87 | children, 88 | ) 89 | } 90 | Self::Leaf(u) => SynTree::Leaf(Word { 91 | span: u.span, 92 | value: map_leaf(u), 93 | }), 94 | } 95 | } 96 | } 97 | 98 | impl fmt::Display for SynTree 99 | where 100 | T: fmt::Display, 101 | U: fmt::Display, 102 | { 103 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 104 | match self { 105 | Self::Leaf(t) => write!(f, "{}", t), 106 | Self::Branch(t, ts) => { 107 | write!(f, "({}", t)?; 108 | if ts.len() == 1 { 109 | write!(f, " ({}))", ts[0]) 110 | } else { 111 | for t in ts.iter() { 112 | // TODO: is there a nice way to do this that doesn't allocate a String? 113 | let fmt = format!("{}", t); 114 | for line in fmt.lines() { 115 | write!(f, "\n {}", line)?; 116 | } 117 | } 118 | write!(f, ")") 119 | } 120 | } 121 | } 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /src/utils.rs: -------------------------------------------------------------------------------- 1 | use std::error::Error; 2 | 3 | /// Boxed static error type 4 | pub type Err = Box; 5 | 6 | /// Takes a list where each element is a set of choices, and returns all the possible sets 7 | /// generated. Will clone the elements. 
///
/// The first choice set varies fastest and the last varies slowest. An empty `list`, or a
/// `list` containing an empty choice set, yields no combinations.
///
/// ```
/// let v = vec![
///   vec![1],
///   vec![2, 3],
///   vec![4],
///   vec![5, 6, 7],
/// ];
///
/// assert_eq!(treebender::utils::combinations(&v), vec![
///   vec![1, 2, 4, 5],
///   vec![1, 3, 4, 5],
///   vec![1, 2, 4, 6],
///   vec![1, 3, 4, 6],
///   vec![1, 2, 4, 7],
///   vec![1, 3, 4, 7],
/// ]);
/// ```
pub fn combinations<T>(list: &[Vec<T>]) -> Vec<Vec<T>>
where
    T: Clone,
{
    let Some((last, rest)) = list.split_last() else {
        return Vec::new();
    };

    // Each sequence is assembled back-to-front (last choice set first) so that growing it is
    // a cheap `push`; a front insert would shift the whole vector for every combination.
    let mut reversed: Vec<Vec<T>> = last
        .iter()
        .map(|choice| {
            let mut seq = Vec::with_capacity(list.len());
            seq.push(choice.clone());
            seq
        })
        .collect();

    for choices in rest.iter().rev() {
        reversed = reversed
            .into_iter()
            .flat_map(|partial| {
                // Pair every element of this choice set with every partial sequence; the
                // choice set is the inner loop, so earlier sets vary fastest in the output.
                choices.iter().map(move |choice| {
                    let mut seq = Vec::with_capacity(partial.len() + 1);
                    seq.extend_from_slice(&partial);
                    seq.push(choice.clone());
                    seq
                })
            })
            .collect();
    }

    // Restore the caller-visible order: element `i` comes from `list[i]`.
    for seq in &mut reversed {
        seq.reverse();
    }

    reversed
}