├── .gitignore ├── CONTRIBUTING.md ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md └── src ├── index.html ├── index.js ├── lexer.rs ├── main.rs ├── model.rs ├── server.rs └── snowball ├── algorithms ├── english_stemmer.rs └── mod.rs ├── among.rs ├── mod.rs └── snowball_env.rs /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | I have very limited resources in terms of handling feedback on my projects, sorry. So here are the limitations to keep in mind: 2 | 3 | - I don't look into reported Issues. 4 | - I only look into small PRs that suggest 5 | - bug fixes, 6 | - documentation fixes. 7 | - I do not look into PRs that 8 | - implement new features, 9 | - refactor/cleanup the code. 10 | - What qualifies as a bug, a feature, or refactoring is entirely up to my interpretation. 11 | 12 | Sorry for any inconvenience. If you want to steer the project in a particular direction in terms of features, feel free to fork it, I don't mind. Just make sure you have fun while developing it! This is like the whole point! 13 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "anyhow" 7 | version = "1.0.70" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "7de8ce5e0f9f8d88245311066a578d72b7af3e7088f32783804676302df237e4" 10 | 11 | [[package]] 12 | name = "ascii" 13 | version = "1.1.0" 14 | source = "registry+https://github.com/rust-lang/crates.io-index" 15 | checksum = "d92bec98840b8f03a5ff5413de5293bfcd8bf96467cf5452609f939ec6f5de16" 16 | 17 | [[package]] 18 | name = "autocfg" 19 | version = "1.1.0" 20 | source = "registry+https://github.com/rust-lang/crates.io-index" 21 | checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" 22 | 23 | [[package]] 24 | name = "bitflags" 25 | version = "1.3.2" 26 | source = "registry+https://github.com/rust-lang/crates.io-index" 27 | checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" 28 | 29 | [[package]] 30 | name = "cairo-rs" 31 | version = "0.17.0" 32 | source = "registry+https://github.com/rust-lang/crates.io-index" 33 | checksum = "a8af54f5d48af1226928adc1f57edd22f5df1349e7da1fc96ae15cf43db0e871" 34 | dependencies = [ 35 | "bitflags", 36 | "cairo-sys-rs", 37 | "glib", 38 | "libc", 39 | "once_cell", 40 | "thiserror", 41 | ] 42 | 43 | [[package]] 44 | name = "cairo-sys-rs" 45 | version = "0.17.0" 46 | source = "registry+https://github.com/rust-lang/crates.io-index" 47 | checksum = "f55382a01d30e5e53f185eee269124f5e21ab526595b872751278dfbb463594e" 48 | dependencies = [ 49 | "glib-sys", 50 | "libc", 51 | "system-deps", 52 | ] 53 | 54 | [[package]] 55 | name = "cfg-expr" 56 | version = "0.14.0" 57 | source = "registry+https://github.com/rust-lang/crates.io-index" 58 | checksum = "a35b255461940a32985c627ce82900867c61db1659764d3675ea81963f72a4c6" 59 | dependencies = [ 60 | "smallvec", 61 | ] 62 | 63 | [[package]] 64 | name = "cfg-if" 65 | version = "1.0.0" 66 | source = "registry+https://github.com/rust-lang/crates.io-index" 67 | checksum = 
"baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 68 | 69 | [[package]] 70 | name = "chunked_transfer" 71 | version = "1.4.1" 72 | source = "registry+https://github.com/rust-lang/crates.io-index" 73 | checksum = "cca491388666e04d7248af3f60f0c40cfb0991c72205595d7c396e3510207d1a" 74 | 75 | [[package]] 76 | name = "futures-channel" 77 | version = "0.3.28" 78 | source = "registry+https://github.com/rust-lang/crates.io-index" 79 | checksum = "955518d47e09b25bbebc7a18df10b81f0c766eaf4c4f1cccef2fca5f2a4fb5f2" 80 | dependencies = [ 81 | "futures-core", 82 | ] 83 | 84 | [[package]] 85 | name = "futures-core" 86 | version = "0.3.28" 87 | source = "registry+https://github.com/rust-lang/crates.io-index" 88 | checksum = "4bca583b7e26f571124fe5b7561d49cb2868d79116cfa0eefce955557c6fee8c" 89 | 90 | [[package]] 91 | name = "futures-executor" 92 | version = "0.3.27" 93 | source = "registry+https://github.com/rust-lang/crates.io-index" 94 | checksum = "1997dd9df74cdac935c76252744c1ed5794fac083242ea4fe77ef3ed60ba0f83" 95 | dependencies = [ 96 | "futures-core", 97 | "futures-task", 98 | "futures-util", 99 | ] 100 | 101 | [[package]] 102 | name = "futures-io" 103 | version = "0.3.28" 104 | source = "registry+https://github.com/rust-lang/crates.io-index" 105 | checksum = "4fff74096e71ed47f8e023204cfd0aa1289cd54ae5430a9523be060cdb849964" 106 | 107 | [[package]] 108 | name = "futures-macro" 109 | version = "0.3.27" 110 | source = "registry+https://github.com/rust-lang/crates.io-index" 111 | checksum = "3eb14ed937631bd8b8b8977f2c198443447a8355b6e3ca599f38c975e5a963b6" 112 | dependencies = [ 113 | "proc-macro2", 114 | "quote", 115 | "syn", 116 | ] 117 | 118 | [[package]] 119 | name = "futures-task" 120 | version = "0.3.28" 121 | source = "registry+https://github.com/rust-lang/crates.io-index" 122 | checksum = "76d3d132be6c0e6aa1534069c705a74a5997a356c0dc2f86a47765e5617c5b65" 123 | 124 | [[package]] 125 | name = "futures-util" 126 | version = "0.3.27" 127 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 128 | checksum = "3ef6b17e481503ec85211fed8f39d1970f128935ca1f814cd32ac4a6842e84ab" 129 | dependencies = [ 130 | "futures-core", 131 | "futures-macro", 132 | "futures-task", 133 | "pin-project-lite", 134 | "pin-utils", 135 | "slab", 136 | ] 137 | 138 | [[package]] 139 | name = "gio" 140 | version = "0.17.4" 141 | source = "registry+https://github.com/rust-lang/crates.io-index" 142 | checksum = "2261a3b4e922ec676d1c27ac466218c38cf5dcb49a759129e54bb5046e442125" 143 | dependencies = [ 144 | "bitflags", 145 | "futures-channel", 146 | "futures-core", 147 | "futures-io", 148 | "futures-util", 149 | "gio-sys", 150 | "glib", 151 | "libc", 152 | "once_cell", 153 | "pin-project-lite", 154 | "smallvec", 155 | "thiserror", 156 | ] 157 | 158 | [[package]] 159 | name = "gio-sys" 160 | version = "0.17.4" 161 | source = "registry+https://github.com/rust-lang/crates.io-index" 162 | checksum = "6b1d43b0d7968b48455244ecafe41192871257f5740aa6b095eb19db78e362a5" 163 | dependencies = [ 164 | "glib-sys", 165 | "gobject-sys", 166 | "libc", 167 | "system-deps", 168 | "winapi", 169 | ] 170 | 171 | [[package]] 172 | name = "glib" 173 | version = "0.17.5" 174 | source = "registry+https://github.com/rust-lang/crates.io-index" 175 | checksum = "cfb53061756195d76969292c2d2e329e01259276524a9bae6c9b73af62854773" 176 | dependencies = [ 177 | "bitflags", 178 | "futures-channel", 179 | "futures-core", 180 | "futures-executor", 181 | "futures-task", 182 | "futures-util", 183 | "gio-sys", 184 | "glib-macros", 185 | "glib-sys", 186 | "gobject-sys", 187 | "libc", 188 | "memchr", 189 | "once_cell", 190 | "smallvec", 191 | "thiserror", 192 | ] 193 | 194 | [[package]] 195 | name = "glib-macros" 196 | version = "0.17.6" 197 | source = "registry+https://github.com/rust-lang/crates.io-index" 198 | checksum = "32e73a9790e243f6d55d8e302426419f6084a1de7a84cd07f7268300408a19de" 199 | dependencies = [ 200 | "anyhow", 201 | "heck", 202 | "proc-macro-crate", 203 
| "proc-macro-error", 204 | "proc-macro2", 205 | "quote", 206 | "syn", 207 | ] 208 | 209 | [[package]] 210 | name = "glib-sys" 211 | version = "0.17.4" 212 | source = "registry+https://github.com/rust-lang/crates.io-index" 213 | checksum = "49f00ad0a1bf548e61adfff15d83430941d9e1bb620e334f779edd1c745680a5" 214 | dependencies = [ 215 | "libc", 216 | "system-deps", 217 | ] 218 | 219 | [[package]] 220 | name = "gobject-sys" 221 | version = "0.17.4" 222 | source = "registry+https://github.com/rust-lang/crates.io-index" 223 | checksum = "15e75b0000a64632b2d8ca3cf856af9308e3a970844f6e9659bd197f026793d0" 224 | dependencies = [ 225 | "glib-sys", 226 | "libc", 227 | "system-deps", 228 | ] 229 | 230 | [[package]] 231 | name = "hashbrown" 232 | version = "0.12.3" 233 | source = "registry+https://github.com/rust-lang/crates.io-index" 234 | checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" 235 | 236 | [[package]] 237 | name = "heck" 238 | version = "0.4.1" 239 | source = "registry+https://github.com/rust-lang/crates.io-index" 240 | checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" 241 | 242 | [[package]] 243 | name = "httpdate" 244 | version = "1.0.2" 245 | source = "registry+https://github.com/rust-lang/crates.io-index" 246 | checksum = "c4a1e36c821dbe04574f602848a19f742f4fb3c98d40449f11bcad18d6b17421" 247 | 248 | [[package]] 249 | name = "indexmap" 250 | version = "1.9.3" 251 | source = "registry+https://github.com/rust-lang/crates.io-index" 252 | checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" 253 | dependencies = [ 254 | "autocfg", 255 | "hashbrown", 256 | ] 257 | 258 | [[package]] 259 | name = "itoa" 260 | version = "1.0.5" 261 | source = "registry+https://github.com/rust-lang/crates.io-index" 262 | checksum = "fad582f4b9e86b6caa621cabeb0963332d92eea04729ab12892c2533951e6440" 263 | 264 | [[package]] 265 | name = "libc" 266 | version = "0.2.140" 267 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 268 | checksum = "99227334921fae1a979cf0bfdfcc6b3e5ce376ef57e16fb6fb3ea2ed6095f80c" 269 | 270 | [[package]] 271 | name = "log" 272 | version = "0.4.17" 273 | source = "registry+https://github.com/rust-lang/crates.io-index" 274 | checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" 275 | dependencies = [ 276 | "cfg-if", 277 | ] 278 | 279 | [[package]] 280 | name = "memchr" 281 | version = "2.5.0" 282 | source = "registry+https://github.com/rust-lang/crates.io-index" 283 | checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" 284 | 285 | [[package]] 286 | name = "once_cell" 287 | version = "1.17.1" 288 | source = "registry+https://github.com/rust-lang/crates.io-index" 289 | checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3" 290 | 291 | [[package]] 292 | name = "pin-project-lite" 293 | version = "0.2.9" 294 | source = "registry+https://github.com/rust-lang/crates.io-index" 295 | checksum = "e0a7ae3ac2f1173085d398531c705756c94a4c56843785df85a60c1a0afac116" 296 | 297 | [[package]] 298 | name = "pin-utils" 299 | version = "0.1.0" 300 | source = "registry+https://github.com/rust-lang/crates.io-index" 301 | checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" 302 | 303 | [[package]] 304 | name = "pkg-config" 305 | version = "0.3.26" 306 | source = "registry+https://github.com/rust-lang/crates.io-index" 307 | checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" 308 | 309 | [[package]] 310 | name = "poppler-rs" 311 | version = "0.21.0" 312 | source = "registry+https://github.com/rust-lang/crates.io-index" 313 | checksum = "ee1ec912c55fee25056d29dbe119c5f3b83ec521760f6381f01f3bd033ad7203" 314 | dependencies = [ 315 | "bitflags", 316 | "cairo-rs", 317 | "gio", 318 | "glib", 319 | "libc", 320 | "once_cell", 321 | "poppler-sys-rs", 322 | ] 323 | 324 | [[package]] 325 | name = "poppler-sys-rs" 326 | 
version = "0.21.0" 327 | source = "registry+https://github.com/rust-lang/crates.io-index" 328 | checksum = "e7bee91b998f39990a8600149c5b62a113e13ea5eabe1e577985756f45cf5e28" 329 | dependencies = [ 330 | "cairo-sys-rs", 331 | "gio-sys", 332 | "glib-sys", 333 | "gobject-sys", 334 | "libc", 335 | "system-deps", 336 | ] 337 | 338 | [[package]] 339 | name = "proc-macro-crate" 340 | version = "1.3.1" 341 | source = "registry+https://github.com/rust-lang/crates.io-index" 342 | checksum = "7f4c021e1093a56626774e81216a4ce732a735e5bad4868a03f3ed65ca0c3919" 343 | dependencies = [ 344 | "once_cell", 345 | "toml_edit", 346 | ] 347 | 348 | [[package]] 349 | name = "proc-macro-error" 350 | version = "1.0.4" 351 | source = "registry+https://github.com/rust-lang/crates.io-index" 352 | checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c" 353 | dependencies = [ 354 | "proc-macro-error-attr", 355 | "proc-macro2", 356 | "quote", 357 | "syn", 358 | "version_check", 359 | ] 360 | 361 | [[package]] 362 | name = "proc-macro-error-attr" 363 | version = "1.0.4" 364 | source = "registry+https://github.com/rust-lang/crates.io-index" 365 | checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869" 366 | dependencies = [ 367 | "proc-macro2", 368 | "quote", 369 | "version_check", 370 | ] 371 | 372 | [[package]] 373 | name = "proc-macro2" 374 | version = "1.0.50" 375 | source = "registry+https://github.com/rust-lang/crates.io-index" 376 | checksum = "6ef7d57beacfaf2d8aee5937dab7b7f28de3cb8b1828479bb5de2a7106f2bae2" 377 | dependencies = [ 378 | "unicode-ident", 379 | ] 380 | 381 | [[package]] 382 | name = "quote" 383 | version = "1.0.23" 384 | source = "registry+https://github.com/rust-lang/crates.io-index" 385 | checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b" 386 | dependencies = [ 387 | "proc-macro2", 388 | ] 389 | 390 | [[package]] 391 | name = "ryu" 392 | version = "1.0.12" 393 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 394 | checksum = "7b4b9743ed687d4b4bcedf9ff5eaa7398495ae14e61cba0a295704edbc7decde" 395 | 396 | [[package]] 397 | name = "serde" 398 | version = "1.0.152" 399 | source = "registry+https://github.com/rust-lang/crates.io-index" 400 | checksum = "bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb" 401 | dependencies = [ 402 | "serde_derive", 403 | ] 404 | 405 | [[package]] 406 | name = "serde_derive" 407 | version = "1.0.152" 408 | source = "registry+https://github.com/rust-lang/crates.io-index" 409 | checksum = "af487d118eecd09402d70a5d72551860e788df87b464af30e5ea6a38c75c541e" 410 | dependencies = [ 411 | "proc-macro2", 412 | "quote", 413 | "syn", 414 | ] 415 | 416 | [[package]] 417 | name = "serde_json" 418 | version = "1.0.91" 419 | source = "registry+https://github.com/rust-lang/crates.io-index" 420 | checksum = "877c235533714907a8c2464236f5c4b2a17262ef1bd71f38f35ea592c8da6883" 421 | dependencies = [ 422 | "itoa", 423 | "ryu", 424 | "serde", 425 | ] 426 | 427 | [[package]] 428 | name = "serde_spanned" 429 | version = "0.6.1" 430 | source = "registry+https://github.com/rust-lang/crates.io-index" 431 | checksum = "0efd8caf556a6cebd3b285caf480045fcc1ac04f6bd786b09a6f11af30c4fcf4" 432 | dependencies = [ 433 | "serde", 434 | ] 435 | 436 | [[package]] 437 | name = "seroost" 438 | version = "0.1.0" 439 | dependencies = [ 440 | "poppler-rs", 441 | "serde", 442 | "serde_json", 443 | "tiny_http", 444 | "xml-rs", 445 | ] 446 | 447 | [[package]] 448 | name = "slab" 449 | version = "0.4.8" 450 | source = "registry+https://github.com/rust-lang/crates.io-index" 451 | checksum = "6528351c9bc8ab22353f9d776db39a20288e8d6c37ef8cfe3317cf875eecfc2d" 452 | dependencies = [ 453 | "autocfg", 454 | ] 455 | 456 | [[package]] 457 | name = "smallvec" 458 | version = "1.10.0" 459 | source = "registry+https://github.com/rust-lang/crates.io-index" 460 | checksum = 
"a507befe795404456341dfab10cef66ead4c041f62b8b11bbb92bffe5d0953e0" 461 | 462 | [[package]] 463 | name = "syn" 464 | version = "1.0.107" 465 | source = "registry+https://github.com/rust-lang/crates.io-index" 466 | checksum = "1f4064b5b16e03ae50984a5a8ed5d4f8803e6bc1fd170a3cda91a1be4b18e3f5" 467 | dependencies = [ 468 | "proc-macro2", 469 | "quote", 470 | "unicode-ident", 471 | ] 472 | 473 | [[package]] 474 | name = "system-deps" 475 | version = "6.0.4" 476 | source = "registry+https://github.com/rust-lang/crates.io-index" 477 | checksum = "555fc8147af6256f3931a36bb83ad0023240ce9cf2b319dec8236fd1f220b05f" 478 | dependencies = [ 479 | "cfg-expr", 480 | "heck", 481 | "pkg-config", 482 | "toml", 483 | "version-compare", 484 | ] 485 | 486 | [[package]] 487 | name = "thiserror" 488 | version = "1.0.39" 489 | source = "registry+https://github.com/rust-lang/crates.io-index" 490 | checksum = "a5ab016db510546d856297882807df8da66a16fb8c4101cb8b30054b0d5b2d9c" 491 | dependencies = [ 492 | "thiserror-impl", 493 | ] 494 | 495 | [[package]] 496 | name = "thiserror-impl" 497 | version = "1.0.39" 498 | source = "registry+https://github.com/rust-lang/crates.io-index" 499 | checksum = "5420d42e90af0c38c3290abcca25b9b3bdf379fc9f55c528f53a269d9c9a267e" 500 | dependencies = [ 501 | "proc-macro2", 502 | "quote", 503 | "syn", 504 | ] 505 | 506 | [[package]] 507 | name = "tiny_http" 508 | version = "0.12.0" 509 | source = "registry+https://github.com/rust-lang/crates.io-index" 510 | checksum = "389915df6413a2e74fb181895f933386023c71110878cd0825588928e64cdc82" 511 | dependencies = [ 512 | "ascii", 513 | "chunked_transfer", 514 | "httpdate", 515 | "log", 516 | ] 517 | 518 | [[package]] 519 | name = "toml" 520 | version = "0.7.3" 521 | source = "registry+https://github.com/rust-lang/crates.io-index" 522 | checksum = "b403acf6f2bb0859c93c7f0d967cb4a75a7ac552100f9322faf64dc047669b21" 523 | dependencies = [ 524 | "serde", 525 | "serde_spanned", 526 | "toml_datetime", 527 | "toml_edit", 528 | ] 
529 | 530 | [[package]] 531 | name = "toml_datetime" 532 | version = "0.6.1" 533 | source = "registry+https://github.com/rust-lang/crates.io-index" 534 | checksum = "3ab8ed2edee10b50132aed5f331333428b011c99402b5a534154ed15746f9622" 535 | dependencies = [ 536 | "serde", 537 | ] 538 | 539 | [[package]] 540 | name = "toml_edit" 541 | version = "0.19.8" 542 | source = "registry+https://github.com/rust-lang/crates.io-index" 543 | checksum = "239410c8609e8125456927e6707163a3b1fdb40561e4b803bc041f466ccfdc13" 544 | dependencies = [ 545 | "indexmap", 546 | "serde", 547 | "serde_spanned", 548 | "toml_datetime", 549 | "winnow", 550 | ] 551 | 552 | [[package]] 553 | name = "unicode-ident" 554 | version = "1.0.6" 555 | source = "registry+https://github.com/rust-lang/crates.io-index" 556 | checksum = "84a22b9f218b40614adcb3f4ff08b703773ad44fa9423e4e0d346d5db86e4ebc" 557 | 558 | [[package]] 559 | name = "version-compare" 560 | version = "0.1.1" 561 | source = "registry+https://github.com/rust-lang/crates.io-index" 562 | checksum = "579a42fc0b8e0c63b76519a339be31bed574929511fa53c1a3acae26eb258f29" 563 | 564 | [[package]] 565 | name = "version_check" 566 | version = "0.9.4" 567 | source = "registry+https://github.com/rust-lang/crates.io-index" 568 | checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" 569 | 570 | [[package]] 571 | name = "winapi" 572 | version = "0.3.9" 573 | source = "registry+https://github.com/rust-lang/crates.io-index" 574 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 575 | dependencies = [ 576 | "winapi-i686-pc-windows-gnu", 577 | "winapi-x86_64-pc-windows-gnu", 578 | ] 579 | 580 | [[package]] 581 | name = "winapi-i686-pc-windows-gnu" 582 | version = "0.4.0" 583 | source = "registry+https://github.com/rust-lang/crates.io-index" 584 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 585 | 586 | [[package]] 587 | name = "winapi-x86_64-pc-windows-gnu" 588 | version = "0.4.0" 
589 | source = "registry+https://github.com/rust-lang/crates.io-index" 590 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 591 | 592 | [[package]] 593 | name = "winnow" 594 | version = "0.4.1" 595 | source = "registry+https://github.com/rust-lang/crates.io-index" 596 | checksum = "ae8970b36c66498d8ff1d66685dc86b91b29db0c7739899012f63a63814b4b28" 597 | dependencies = [ 598 | "memchr", 599 | ] 600 | 601 | [[package]] 602 | name = "xml-rs" 603 | version = "0.8.4" 604 | source = "registry+https://github.com/rust-lang/crates.io-index" 605 | checksum = "d2d7d3948613f75c98fd9328cfdcc45acc4d360655289d0a7d4ec931392200a3" 606 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "seroost" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | poppler-rs = "0.21.0" 10 | serde = { version = "1.0.152", features = ["derive"] } 11 | serde_json = "1.0.91" 12 | tiny_http = "0.12.0" 13 | xml-rs = "0.8.4" 14 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2023 Alexey Kutepov 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining 4 | a copy of this software and associated documentation files (the 5 | "Software"), to deal in the Software without restriction, including 6 | without limitation the rights to use, copy, modify, merge, publish, 7 | distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so, subject to 9 | the following conditions: 10 | 11 | The above copyright notice and this permission notice shall be 12 | included in all copies or substantial 
portions of the Software. 13 | 14 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 15 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 16 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 17 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 18 | LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 19 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 20 | WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Local Search Engine in Rust 2 | 3 | **THIS SOFTWARE IS UNFINISHED!!! Don't have any high expectations.** 4 | 5 | ## Quick Start 6 | 7 | ```console 8 | $ cargo run serve ./folder/ 9 | $ iexplore.exe http://localhost:6969/ 10 | ``` 11 | -------------------------------------------------------------------------------- /src/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | Seroost 4 | 5 | 6 |

Provide Your Query:

7 | 8 |
9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /src/index.js: -------------------------------------------------------------------------------- 1 | // TODO: live update results as you type 2 | async function search(prompt) { 3 | const results = document.getElementById("results") 4 | results.innerHTML = ""; 5 | const response = await fetch("/api/search", { 6 | method: 'POST', 7 | headers: {'Content-Type': 'text/plain'}, 8 | body: prompt, 9 | }); 10 | const json = await response.json(); 11 | results.innerHTML = ""; 12 | for ([path, rank] of json) { 13 | let item = document.createElement("span"); 14 | item.appendChild(document.createTextNode(path)); 15 | item.appendChild(document.createElement("br")); 16 | results.appendChild(item); 17 | } 18 | } 19 | 20 | let query = document.getElementById("query"); 21 | let currentSearch = Promise.resolve() 22 | 23 | query.addEventListener("keypress", (e) => { 24 | if (e.key == "Enter") { 25 | currentSearch.then(() => search(query.value)); 26 | } 27 | }) 28 | -------------------------------------------------------------------------------- /src/lexer.rs: -------------------------------------------------------------------------------- 1 | pub struct Lexer<'a> { 2 | content: &'a [char], 3 | } 4 | 5 | impl<'a> Lexer<'a> { 6 | pub fn new(content: &'a [char]) -> Self { 7 | Self { content } 8 | } 9 | 10 | fn trim_left(&mut self) { 11 | while !self.content.is_empty() && self.content[0].is_whitespace() { 12 | self.content = &self.content[1..]; 13 | } 14 | } 15 | 16 | fn chop(&mut self, n: usize) -> &'a [char] { 17 | let token = &self.content[0..n]; 18 | self.content = &self.content[n..]; 19 | token 20 | } 21 | 22 | fn chop_while

(&mut self, mut predicate: P) -> &'a [char] where P: FnMut(&char) -> bool { 23 | let mut n = 0; 24 | while n < self.content.len() && predicate(&self.content[n]) { 25 | n += 1; 26 | } 27 | self.chop(n) 28 | } 29 | 30 | pub fn next_token(&mut self) -> Option { 31 | self.trim_left(); 32 | if self.content.is_empty() { 33 | return None 34 | } 35 | 36 | if self.content[0].is_numeric() { 37 | return Some(self.chop_while(|x| x.is_numeric()).iter().collect()); 38 | } 39 | 40 | if self.content[0].is_alphabetic() { 41 | let term = self.chop_while(|x| x.is_alphanumeric()).iter().map(|x| x.to_ascii_lowercase()).collect::(); 42 | let mut env = crate::snowball::SnowballEnv::create(&term); 43 | crate::snowball::algorithms::english_stemmer::stem(&mut env); 44 | let stemmed_term = env.get_current().to_string(); 45 | return Some(stemmed_term); 46 | } 47 | 48 | return Some(self.chop(1).iter().collect()); 49 | } 50 | } 51 | 52 | impl<'a> Iterator for Lexer<'a> { 53 | type Item = String; 54 | 55 | fn next(&mut self) -> Option { 56 | self.next_token() 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use std::fs::{self, File}; 2 | use std::path::{Path}; 3 | use xml::reader::{XmlEvent, EventReader}; 4 | use xml::common::{Position, TextPosition}; 5 | use std::env; 6 | use std::result::Result; 7 | use std::process::ExitCode; 8 | use std::str; 9 | use std::io::{BufReader, BufWriter}; 10 | use std::sync::{Arc, Mutex}; 11 | use std::thread; 12 | 13 | mod model; 14 | use model::*; 15 | mod server; 16 | mod lexer; 17 | pub mod snowball; 18 | 19 | fn parse_entire_txt_file(file_path: &Path) -> Result { 20 | fs::read_to_string(file_path).map_err(|err| { 21 | eprintln!("ERROR: coult not open file {file_path}: {err}", file_path = file_path.display()); 22 | }) 23 | } 24 | 25 | fn parse_entire_pdf_file(file_path: &Path) -> Result { 26 | use 
poppler::Document; 27 | use std::io::Read; 28 | 29 | let mut content = Vec::new(); 30 | File::open(file_path) 31 | .and_then(|mut file| file.read_to_end(&mut content)) 32 | .map_err(|err| { 33 | eprintln!("ERROR: could not read file {file_path}: {err}", file_path = file_path.display()); 34 | })?; 35 | 36 | let pdf = Document::from_data(&content, None).map_err(|err| { 37 | eprintln!("ERROR: could not read file {file_path}: {err}", 38 | file_path = file_path.display()); 39 | })?; 40 | 41 | let mut result = String::new(); 42 | 43 | let n = pdf.n_pages(); 44 | for i in 0..n { 45 | let page = pdf.page(i).expect(&format!("{i} is within the bounds of the range of the page")); 46 | if let Some(content) = page.text() { 47 | result.push_str(content.as_str()); 48 | result.push(' '); 49 | } 50 | } 51 | 52 | Ok(result) 53 | } 54 | 55 | fn parse_entire_xml_file(file_path: &Path) -> Result { 56 | let file = File::open(file_path).map_err(|err| { 57 | eprintln!("ERROR: could not open file {file_path}: {err}", file_path = file_path.display()); 58 | })?; 59 | let er = EventReader::new(BufReader::new(file)); 60 | let mut content = String::new(); 61 | for event in er.into_iter() { 62 | let event = event.map_err(|err| { 63 | let TextPosition {row, column} = err.position(); 64 | let msg = err.msg(); 65 | eprintln!("{file_path}:{row}:{column}: ERROR: {msg}", file_path = file_path.display()); 66 | })?; 67 | 68 | if let XmlEvent::Characters(text) = event { 69 | content.push_str(&text); 70 | content.push(' '); 71 | } 72 | } 73 | Ok(content) 74 | } 75 | 76 | fn parse_entire_file_by_extension(file_path: &Path) -> Result { 77 | let extension = file_path.extension().ok_or_else(|| { 78 | eprintln!("ERROR: can't detect file type of {file_path} without extension", 79 | file_path = file_path.display()); 80 | })?.to_string_lossy(); 81 | match extension.as_ref() { 82 | "xhtml" | "xml" => parse_entire_xml_file(file_path), 83 | // TODO: specialized parser for markdown files 84 | "txt" | "md" => 
parse_entire_txt_file(file_path), 85 | "pdf" => parse_entire_pdf_file(file_path), 86 | _ => { 87 | eprintln!("ERROR: can't detect file type of {file_path}: unsupported extension {extension}", 88 | file_path = file_path.display(), 89 | extension = extension); 90 | Err(()) 91 | } 92 | } 93 | } 94 | 95 | fn save_model_as_json(model: &Model, index_path: &Path) -> Result<(), ()> { 96 | println!("Saving {index_path}...", index_path = index_path.display()); 97 | 98 | let index_file = File::create(index_path).map_err(|err| { 99 | eprintln!("ERROR: could not create index file {index_path}: {err}", 100 | index_path = index_path.display()); 101 | })?; 102 | 103 | serde_json::to_writer(BufWriter::new(index_file), &model).map_err(|err| { 104 | eprintln!("ERROR: could not serialize index into file {index_path}: {err}", 105 | index_path = index_path.display()); 106 | })?; 107 | 108 | Ok(()) 109 | } 110 | 111 | fn add_folder_to_model(dir_path: &Path, model: Arc>, processed: &mut usize) -> Result<(), ()> { 112 | let dir = fs::read_dir(dir_path).map_err(|err| { 113 | eprintln!("ERROR: could not open directory {dir_path} for indexing: {err}", 114 | dir_path = dir_path.display()); 115 | })?; 116 | 117 | 'next_file: for file in dir { 118 | let file = file.map_err(|err| { 119 | eprintln!("ERROR: could not read next file in directory {dir_path} during indexing: {err}", 120 | dir_path = dir_path.display()); 121 | })?; 122 | 123 | let file_path = file.path(); 124 | 125 | let dot_file = file_path 126 | .file_name() 127 | .and_then(|s| s.to_str()) 128 | .map(|s| s.starts_with(".")) 129 | .unwrap_or(false); 130 | 131 | if dot_file { 132 | continue 'next_file; 133 | } 134 | 135 | let file_type = file.file_type().map_err(|err| { 136 | eprintln!("ERROR: could not determine type of file {file_path}: {err}", 137 | file_path = file_path.display()); 138 | })?; 139 | let last_modified = file.metadata().map_err(|err| { 140 | eprintln!("ERROR: could not get the metadata of file {file_path}: {err}", 141 
| file_path = file_path.display()); 142 | })?.modified().map_err(|err| { 143 | eprintln!("ERROR: could not get the last modification date of file {file_path}: {err}", 144 | file_path = file_path.display()) 145 | })?; 146 | 147 | if file_type.is_dir() { 148 | add_folder_to_model(&file_path, Arc::clone(&model), processed)?; 149 | continue 'next_file; 150 | } 151 | 152 | // TODO: how does this work with symlinks? 153 | 154 | let mut model = model.lock().unwrap(); 155 | if model.requires_reindexing(&file_path, last_modified) { 156 | println!("Indexing {:?}...", &file_path); 157 | 158 | let content = match parse_entire_file_by_extension(&file_path) { 159 | Ok(content) => content.chars().collect::>(), 160 | // TODO: still add the skipped files to the model to prevent their reindexing in the future 161 | Err(()) => continue 'next_file, 162 | }; 163 | 164 | model.add_document(file_path, last_modified, &content); 165 | *processed += 1; 166 | } 167 | } 168 | 169 | Ok(()) 170 | } 171 | 172 | fn usage(program: &str) { 173 | eprintln!("Usage: {program} [SUBCOMMAND] [OPTIONS]"); 174 | eprintln!("Subcommands:"); 175 | eprintln!(" serve [address] start local HTTP server with Web Interface"); 176 | } 177 | 178 | fn entry() -> Result<(), ()> { 179 | let mut args = env::args(); 180 | let program = args.next().expect("path to program is provided"); 181 | 182 | let subcommand = args.next().ok_or_else(|| { 183 | usage(&program); 184 | eprintln!("ERROR: no subcommand is provided"); 185 | })?; 186 | 187 | match subcommand.as_str() { 188 | "serve" => { 189 | let dir_path = args.next().ok_or_else(|| { 190 | usage(&program); 191 | eprintln!("ERROR: no directory is provided for {subcommand} subcommand"); 192 | })?; 193 | 194 | let mut index_path = Path::new(&dir_path).to_path_buf(); 195 | index_path.push(".seroost.json"); 196 | 197 | let address = args.next().unwrap_or("127.0.0.1:6969".to_string()); 198 | 199 | let exists = index_path.try_exists().map_err(|err| { 200 | eprintln!("ERROR: could 
not check the existence of file {index_path}: {err}", 201 | index_path = index_path.display()); 202 | })?; 203 | 204 | let model: Arc>; 205 | if exists { 206 | let index_file = File::open(&index_path).map_err(|err| { 207 | eprintln!("ERROR: could not open index file {index_path}: {err}", 208 | index_path = index_path.display()); 209 | })?; 210 | 211 | model = Arc::new(Mutex::new(serde_json::from_reader(index_file).map_err(|err| { 212 | eprintln!("ERROR: could not parse index file {index_path}: {err}", 213 | index_path = index_path.display()); 214 | })?)); 215 | } else { 216 | model = Arc::new(Mutex::new(Default::default())); 217 | } 218 | 219 | { 220 | let model = Arc::clone(&model); 221 | thread::spawn(move || { 222 | let mut processed = 0; 223 | // TODO: what should we do in case indexing thread crashes 224 | add_folder_to_model(Path::new(&dir_path), Arc::clone(&model), &mut processed).unwrap(); 225 | if processed > 0 { 226 | let model = model.lock().unwrap(); 227 | save_model_as_json(&model, &index_path).unwrap(); 228 | } 229 | println!("Finished indexing"); 230 | }); 231 | } 232 | 233 | server::start(&address, Arc::clone(&model)) 234 | } 235 | 236 | _ => { 237 | usage(&program); 238 | eprintln!("ERROR: unknown subcommand {subcommand}"); 239 | Err(()) 240 | } 241 | } 242 | } 243 | 244 | fn main() -> ExitCode { 245 | match entry() { 246 | Ok(()) => ExitCode::SUCCESS, 247 | Err(()) => ExitCode::FAILURE, 248 | } 249 | } 250 | 251 | // TODO: search result must consist of clickable links 252 | // TODO: synonym terms 253 | -------------------------------------------------------------------------------- /src/model.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::path::{PathBuf, Path}; 3 | use serde::{Deserialize, Serialize}; 4 | use super::lexer::Lexer; 5 | use std::time::SystemTime; 6 | 7 | type DocFreq = HashMap; 8 | type TermFreq = HashMap; 9 | #[derive(Deserialize, Serialize)] 10 | 
pub struct Doc { 11 | tf: TermFreq, 12 | count: usize, 13 | // TODO: make sure that the serde serialization of SystemTime also work on other platforms 14 | last_modified: SystemTime, 15 | } 16 | type Docs = HashMap; 17 | 18 | #[derive(Default, Deserialize, Serialize)] 19 | pub struct Model { 20 | pub docs: Docs, 21 | pub df: DocFreq, 22 | } 23 | 24 | impl Model { 25 | fn remove_document(&mut self, file_path: &Path) { 26 | if let Some(doc) = self.docs.remove(file_path) { 27 | for t in doc.tf.keys() { 28 | if let Some(f) = self.df.get_mut(t) { 29 | *f -= 1; 30 | } 31 | } 32 | } 33 | } 34 | 35 | pub fn requires_reindexing(&mut self, file_path: &Path, last_modified: SystemTime) -> bool { 36 | if let Some(doc) = self.docs.get(file_path) { 37 | return doc.last_modified < last_modified; 38 | } 39 | return true; 40 | } 41 | 42 | pub fn search_query(&self, query: &[char]) -> Vec<(PathBuf, f32)> { 43 | let mut result = Vec::new(); 44 | let tokens = Lexer::new(&query).collect::>(); 45 | for (path, doc) in &self.docs { 46 | let mut rank = 0f32; 47 | for token in &tokens { 48 | rank += compute_tf(token, doc) * compute_idf(&token, self.docs.len(), &self.df); 49 | } 50 | // TODO: investigate the sources of NaN 51 | if !rank.is_nan() { 52 | result.push((path.clone(), rank)); 53 | } 54 | } 55 | result.sort_by(|(_, rank1), (_, rank2)| rank1.partial_cmp(rank2).expect(&format!("{rank1} and {rank2} are not comparable"))); 56 | result.reverse(); 57 | result 58 | } 59 | 60 | pub fn add_document(&mut self, file_path: PathBuf, last_modified: SystemTime, content: &[char]) { 61 | self.remove_document(&file_path); 62 | 63 | let mut tf = TermFreq::new(); 64 | 65 | let mut count = 0; 66 | for t in Lexer::new(content) { 67 | if let Some(f) = tf.get_mut(&t) { 68 | *f += 1; 69 | } else { 70 | tf.insert(t, 1); 71 | } 72 | count += 1; 73 | } 74 | 75 | for t in tf.keys() { 76 | if let Some(f) = self.df.get_mut(t) { 77 | *f += 1; 78 | } else { 79 | self.df.insert(t.to_string(), 1); 80 | } 81 | } 82 | 
83 | self.docs.insert(file_path, Doc {count, tf, last_modified}); 84 | } 85 | } 86 | 87 | fn compute_tf(t: &str, doc: &Doc) -> f32 { 88 | let n = doc.count as f32; 89 | let m = doc.tf.get(t).cloned().unwrap_or(0) as f32; 90 | m / n 91 | } 92 | 93 | fn compute_idf(t: &str, n: usize, df: &DocFreq) -> f32 { 94 | let n = n as f32; 95 | let m = df.get(t).cloned().unwrap_or(1) as f32; 96 | (n / m).log10() 97 | } 98 | -------------------------------------------------------------------------------- /src/server.rs: -------------------------------------------------------------------------------- 1 | use std::str; 2 | use std::io; 3 | use std::sync::{Arc, Mutex}; 4 | 5 | use super::model::*; 6 | 7 | use tiny_http::{Server, Request, Response, Header, Method, StatusCode}; 8 | 9 | fn serve_404(request: Request) -> io::Result<()> { 10 | request.respond(Response::from_string("404").with_status_code(StatusCode(404))) 11 | } 12 | 13 | fn serve_500(request: Request) -> io::Result<()> { 14 | request.respond(Response::from_string("500").with_status_code(StatusCode(500))) 15 | } 16 | 17 | fn serve_400(request: Request, message: &str) -> io::Result<()> { 18 | request.respond(Response::from_string(format!("400: {message}")).with_status_code(StatusCode(400))) 19 | } 20 | 21 | fn serve_bytes(request: Request, bytes: &[u8], content_type: &str) -> io::Result<()> { 22 | let content_type_header = Header::from_bytes("Content-Type", content_type) 23 | .expect("That we didn't put any garbage in the headers"); 24 | request.respond(Response::from_data(bytes).with_header(content_type_header)) 25 | } 26 | 27 | // TODO: the errors of serve_api_search should probably return JSON 28 | // 'Cause that's what expected from them. 
29 | fn serve_api_search(model: Arc>, mut request: Request) -> io::Result<()> { 30 | let mut buf = Vec::new(); 31 | if let Err(err) = request.as_reader().read_to_end(&mut buf) { 32 | eprintln!("ERROR: could not read the body of the request: {err}"); 33 | return serve_500(request); 34 | } 35 | 36 | let body = match str::from_utf8(&buf) { 37 | Ok(body) => body.chars().collect::>(), 38 | Err(err) => { 39 | eprintln!("ERROR: could not interpret body as UTF-8 string: {err}"); 40 | return serve_400(request, "Body must be a valid UTF-8 string"); 41 | } 42 | }; 43 | 44 | let model = model.lock().unwrap(); 45 | let result = model.search_query(&body); 46 | 47 | let json = match serde_json::to_string(&result.iter().take(20).collect::>()) { 48 | Ok(json) => json, 49 | Err(err) => { 50 | eprintln!("ERROR: could not convert search results to JSON: {err}"); 51 | return serve_500(request) 52 | } 53 | }; 54 | 55 | let content_type_header = Header::from_bytes("Content-Type", "application/json") 56 | .expect("That we didn't put any garbage in the headers"); 57 | request.respond(Response::from_string(&json).with_header(content_type_header)) 58 | } 59 | 60 | fn serve_api_stats(model: Arc>, request: Request) -> io::Result<()> { 61 | use serde::Serialize; 62 | 63 | #[derive(Default, Serialize)] 64 | struct Stats { 65 | docs_count: usize, 66 | terms_count: usize, 67 | } 68 | 69 | let mut stats: Stats = Default::default(); 70 | { 71 | let model = model.lock().unwrap(); 72 | stats.docs_count = model.docs.len(); 73 | stats.terms_count = model.df.len(); 74 | } 75 | 76 | let json = match serde_json::to_string(&stats) { 77 | Ok(json) => json, 78 | Err(err) => { 79 | eprintln!("ERROR: could not convert stats results to JSON: {err}"); 80 | return serve_500(request) 81 | } 82 | }; 83 | 84 | let content_type_header = Header::from_bytes("Content-Type", "application/json") 85 | .expect("That we didn't put any garbage in the headers"); 86 | 
request.respond(Response::from_string(&json).with_header(content_type_header)) 87 | } 88 | 89 | fn serve_request(model: Arc>, request: Request) -> io::Result<()> { 90 | println!("INFO: received request! method: {:?}, url: {:?}", request.method(), request.url()); 91 | 92 | match (request.method(), request.url()) { 93 | (Method::Post, "/api/search") => { 94 | serve_api_search(model, request) 95 | } 96 | (Method::Get, "/api/stats") => { 97 | serve_api_stats(model, request) 98 | } 99 | (Method::Get, "/index.js") => { 100 | serve_bytes(request, include_bytes!("index.js"), "text/javascript; charset=utf-8") 101 | } 102 | (Method::Get, "/") | (Method::Get, "/index.html") => { 103 | serve_bytes(request, include_bytes!("index.html"), "text/html; charset=utf-8") 104 | } 105 | _ => { 106 | serve_404(request) 107 | } 108 | } 109 | } 110 | 111 | pub fn start(address: &str, model: Arc>) -> Result<(), ()> { 112 | let server = Server::http(&address).map_err(|err| { 113 | eprintln!("ERROR: could not start HTTP server at {address}: {err}"); 114 | })?; 115 | 116 | println!("INFO: listening at http://{address}/"); 117 | 118 | for request in server.incoming_requests() { 119 | serve_request(Arc::clone(&model), request).map_err(|err| { 120 | eprintln!("ERROR: could not serve the response: {err}"); 121 | }).ok(); // <- don't stop on errors, keep serving 122 | } 123 | 124 | eprintln!("ERROR: the server socket has shutdown"); 125 | Err(()) 126 | } 127 | -------------------------------------------------------------------------------- /src/snowball/algorithms/english_stemmer.rs: -------------------------------------------------------------------------------- 1 | //! 
Generated by Snowball 2.2.0 - https://snowballstem.org/ 2 | 3 | #![allow(non_snake_case)] 4 | #![allow(non_upper_case_globals)] 5 | #![allow(unused_mut)] 6 | #![allow(unused_parens)] 7 | #![allow(unused_variables)] 8 | use crate::snowball::SnowballEnv; 9 | use crate::snowball::Among; 10 | 11 | static A_0: &'static [Among; 3] = &[ 12 | Among("arsen", -1, -1, None), 13 | Among("commun", -1, -1, None), 14 | Among("gener", -1, -1, None), 15 | ]; 16 | 17 | static A_1: &'static [Among; 3] = &[ 18 | Among("'", -1, 1, None), 19 | Among("'s'", 0, 1, None), 20 | Among("'s", -1, 1, None), 21 | ]; 22 | 23 | static A_2: &'static [Among; 6] = &[ 24 | Among("ied", -1, 2, None), 25 | Among("s", -1, 3, None), 26 | Among("ies", 1, 2, None), 27 | Among("sses", 1, 1, None), 28 | Among("ss", 1, -1, None), 29 | Among("us", 1, -1, None), 30 | ]; 31 | 32 | static A_3: &'static [Among; 13] = &[ 33 | Among("", -1, 3, None), 34 | Among("bb", 0, 2, None), 35 | Among("dd", 0, 2, None), 36 | Among("ff", 0, 2, None), 37 | Among("gg", 0, 2, None), 38 | Among("bl", 0, 1, None), 39 | Among("mm", 0, 2, None), 40 | Among("nn", 0, 2, None), 41 | Among("pp", 0, 2, None), 42 | Among("rr", 0, 2, None), 43 | Among("at", 0, 1, None), 44 | Among("tt", 0, 2, None), 45 | Among("iz", 0, 1, None), 46 | ]; 47 | 48 | static A_4: &'static [Among; 6] = &[ 49 | Among("ed", -1, 2, None), 50 | Among("eed", 0, 1, None), 51 | Among("ing", -1, 2, None), 52 | Among("edly", -1, 2, None), 53 | Among("eedly", 3, 1, None), 54 | Among("ingly", -1, 2, None), 55 | ]; 56 | 57 | static A_5: &'static [Among; 24] = &[ 58 | Among("anci", -1, 3, None), 59 | Among("enci", -1, 2, None), 60 | Among("ogi", -1, 13, None), 61 | Among("li", -1, 15, None), 62 | Among("bli", 3, 12, None), 63 | Among("abli", 4, 4, None), 64 | Among("alli", 3, 8, None), 65 | Among("fulli", 3, 9, None), 66 | Among("lessli", 3, 14, None), 67 | Among("ousli", 3, 10, None), 68 | Among("entli", 3, 5, None), 69 | Among("aliti", -1, 8, None), 70 | Among("biliti", -1, 
12, None), 71 | Among("iviti", -1, 11, None), 72 | Among("tional", -1, 1, None), 73 | Among("ational", 14, 7, None), 74 | Among("alism", -1, 8, None), 75 | Among("ation", -1, 7, None), 76 | Among("ization", 17, 6, None), 77 | Among("izer", -1, 6, None), 78 | Among("ator", -1, 7, None), 79 | Among("iveness", -1, 11, None), 80 | Among("fulness", -1, 9, None), 81 | Among("ousness", -1, 10, None), 82 | ]; 83 | 84 | static A_6: &'static [Among; 9] = &[ 85 | Among("icate", -1, 4, None), 86 | Among("ative", -1, 6, None), 87 | Among("alize", -1, 3, None), 88 | Among("iciti", -1, 4, None), 89 | Among("ical", -1, 4, None), 90 | Among("tional", -1, 1, None), 91 | Among("ational", 5, 2, None), 92 | Among("ful", -1, 5, None), 93 | Among("ness", -1, 5, None), 94 | ]; 95 | 96 | static A_7: &'static [Among; 18] = &[ 97 | Among("ic", -1, 1, None), 98 | Among("ance", -1, 1, None), 99 | Among("ence", -1, 1, None), 100 | Among("able", -1, 1, None), 101 | Among("ible", -1, 1, None), 102 | Among("ate", -1, 1, None), 103 | Among("ive", -1, 1, None), 104 | Among("ize", -1, 1, None), 105 | Among("iti", -1, 1, None), 106 | Among("al", -1, 1, None), 107 | Among("ism", -1, 1, None), 108 | Among("ion", -1, 2, None), 109 | Among("er", -1, 1, None), 110 | Among("ous", -1, 1, None), 111 | Among("ant", -1, 1, None), 112 | Among("ent", -1, 1, None), 113 | Among("ment", 15, 1, None), 114 | Among("ement", 16, 1, None), 115 | ]; 116 | 117 | static A_8: &'static [Among; 2] = &[ 118 | Among("e", -1, 1, None), 119 | Among("l", -1, 2, None), 120 | ]; 121 | 122 | static A_9: &'static [Among; 8] = &[ 123 | Among("succeed", -1, -1, None), 124 | Among("proceed", -1, -1, None), 125 | Among("exceed", -1, -1, None), 126 | Among("canning", -1, -1, None), 127 | Among("inning", -1, -1, None), 128 | Among("earring", -1, -1, None), 129 | Among("herring", -1, -1, None), 130 | Among("outing", -1, -1, None), 131 | ]; 132 | 133 | static A_10: &'static [Among; 18] = &[ 134 | Among("andes", -1, -1, None), 135 | 
Among("atlas", -1, -1, None), 136 | Among("bias", -1, -1, None), 137 | Among("cosmos", -1, -1, None), 138 | Among("dying", -1, 3, None), 139 | Among("early", -1, 9, None), 140 | Among("gently", -1, 7, None), 141 | Among("howe", -1, -1, None), 142 | Among("idly", -1, 6, None), 143 | Among("lying", -1, 4, None), 144 | Among("news", -1, -1, None), 145 | Among("only", -1, 10, None), 146 | Among("singly", -1, 11, None), 147 | Among("skies", -1, 2, None), 148 | Among("skis", -1, 1, None), 149 | Among("sky", -1, -1, None), 150 | Among("tying", -1, 5, None), 151 | Among("ugly", -1, 8, None), 152 | ]; 153 | 154 | static G_v: &'static [u8; 4] = &[17, 65, 16, 1]; 155 | 156 | static G_v_WXY: &'static [u8; 5] = &[1, 17, 65, 208, 1]; 157 | 158 | static G_valid_LI: &'static [u8; 3] = &[55, 141, 2]; 159 | 160 | #[derive(Clone)] 161 | struct Context { 162 | b_Y_found: bool, 163 | i_p2: i32, 164 | i_p1: i32, 165 | } 166 | 167 | fn r_prelude(env: &mut SnowballEnv, context: &mut Context) -> bool { 168 | context.b_Y_found = false; 169 | let v_1 = env.cursor; 170 | 'lab0: loop { 171 | env.bra = env.cursor; 172 | if !env.eq_s(&"'") { 173 | break 'lab0; 174 | } 175 | env.ket = env.cursor; 176 | if !env.slice_del() { 177 | return false; 178 | } 179 | break 'lab0; 180 | } 181 | env.cursor = v_1; 182 | let v_2 = env.cursor; 183 | 'lab1: loop { 184 | env.bra = env.cursor; 185 | if !env.eq_s(&"y") { 186 | break 'lab1; 187 | } 188 | env.ket = env.cursor; 189 | if !env.slice_from("Y") { 190 | return false; 191 | } 192 | context.b_Y_found = true; 193 | break 'lab1; 194 | } 195 | env.cursor = v_2; 196 | let v_3 = env.cursor; 197 | 'lab2: loop { 198 | 'replab3: loop{ 199 | let v_4 = env.cursor; 200 | 'lab4: for _ in 0..1 { 201 | 'golab5: loop { 202 | let v_5 = env.cursor; 203 | 'lab6: loop { 204 | if !env.in_grouping(G_v, 97, 121) { 205 | break 'lab6; 206 | } 207 | env.bra = env.cursor; 208 | if !env.eq_s(&"y") { 209 | break 'lab6; 210 | } 211 | env.ket = env.cursor; 212 | env.cursor = v_5; 213 | 
break 'golab5; 214 | } 215 | env.cursor = v_5; 216 | if env.cursor >= env.limit { 217 | break 'lab4; 218 | } 219 | env.next_char(); 220 | } 221 | if !env.slice_from("Y") { 222 | return false; 223 | } 224 | context.b_Y_found = true; 225 | continue 'replab3; 226 | } 227 | env.cursor = v_4; 228 | break 'replab3; 229 | } 230 | break 'lab2; 231 | } 232 | env.cursor = v_3; 233 | return true; 234 | } 235 | 236 | fn r_mark_regions(env: &mut SnowballEnv, context: &mut Context) -> bool { 237 | context.i_p1 = env.limit; 238 | context.i_p2 = env.limit; 239 | let v_1 = env.cursor; 240 | 'lab0: loop { 241 | 'lab1: loop { 242 | let v_2 = env.cursor; 243 | 'lab2: loop { 244 | if env.find_among(A_0, context) == 0 { 245 | break 'lab2; 246 | } 247 | break 'lab1; 248 | } 249 | env.cursor = v_2; 250 | 'golab3: loop { 251 | 'lab4: loop { 252 | if !env.in_grouping(G_v, 97, 121) { 253 | break 'lab4; 254 | } 255 | break 'golab3; 256 | } 257 | if env.cursor >= env.limit { 258 | break 'lab0; 259 | } 260 | env.next_char(); 261 | } 262 | 'golab5: loop { 263 | 'lab6: loop { 264 | if !env.out_grouping(G_v, 97, 121) { 265 | break 'lab6; 266 | } 267 | break 'golab5; 268 | } 269 | if env.cursor >= env.limit { 270 | break 'lab0; 271 | } 272 | env.next_char(); 273 | } 274 | break 'lab1; 275 | } 276 | context.i_p1 = env.cursor; 277 | 'golab7: loop { 278 | 'lab8: loop { 279 | if !env.in_grouping(G_v, 97, 121) { 280 | break 'lab8; 281 | } 282 | break 'golab7; 283 | } 284 | if env.cursor >= env.limit { 285 | break 'lab0; 286 | } 287 | env.next_char(); 288 | } 289 | 'golab9: loop { 290 | 'lab10: loop { 291 | if !env.out_grouping(G_v, 97, 121) { 292 | break 'lab10; 293 | } 294 | break 'golab9; 295 | } 296 | if env.cursor >= env.limit { 297 | break 'lab0; 298 | } 299 | env.next_char(); 300 | } 301 | context.i_p2 = env.cursor; 302 | break 'lab0; 303 | } 304 | env.cursor = v_1; 305 | return true; 306 | } 307 | 308 | fn r_shortv(env: &mut SnowballEnv, context: &mut Context) -> bool { 309 | 'lab0: loop { 310 | 
let v_1 = env.limit - env.cursor; 311 | 'lab1: loop { 312 | if !env.out_grouping_b(G_v_WXY, 89, 121) { 313 | break 'lab1; 314 | } 315 | if !env.in_grouping_b(G_v, 97, 121) { 316 | break 'lab1; 317 | } 318 | if !env.out_grouping_b(G_v, 97, 121) { 319 | break 'lab1; 320 | } 321 | break 'lab0; 322 | } 323 | env.cursor = env.limit - v_1; 324 | if !env.out_grouping_b(G_v, 97, 121) { 325 | return false; 326 | } 327 | if !env.in_grouping_b(G_v, 97, 121) { 328 | return false; 329 | } 330 | if env.cursor > env.limit_backward { 331 | return false; 332 | } 333 | break 'lab0; 334 | } 335 | return true; 336 | } 337 | 338 | fn r_R1(env: &mut SnowballEnv, context: &mut Context) -> bool { 339 | if !(context.i_p1 <= env.cursor){ 340 | return false; 341 | } 342 | return true; 343 | } 344 | 345 | fn r_R2(env: &mut SnowballEnv, context: &mut Context) -> bool { 346 | if !(context.i_p2 <= env.cursor){ 347 | return false; 348 | } 349 | return true; 350 | } 351 | 352 | fn r_Step_1a(env: &mut SnowballEnv, context: &mut Context) -> bool { 353 | let mut among_var; 354 | let v_1 = env.limit - env.cursor; 355 | 'lab0: loop { 356 | env.ket = env.cursor; 357 | if env.find_among_b(A_1, context) == 0 { 358 | env.cursor = env.limit - v_1; 359 | break 'lab0; 360 | } 361 | env.bra = env.cursor; 362 | if !env.slice_del() { 363 | return false; 364 | } 365 | break 'lab0; 366 | } 367 | env.ket = env.cursor; 368 | among_var = env.find_among_b(A_2, context); 369 | if among_var == 0 { 370 | return false; 371 | } 372 | env.bra = env.cursor; 373 | if among_var == 1 { 374 | if !env.slice_from("ss") { 375 | return false; 376 | } 377 | } else if among_var == 2 { 378 | 'lab1: loop { 379 | let v_2 = env.limit - env.cursor; 380 | 'lab2: loop { 381 | if !env.hop_back(2) { 382 | break 'lab2; 383 | } 384 | if !env.slice_from("i") { 385 | return false; 386 | } 387 | break 'lab1; 388 | } 389 | env.cursor = env.limit - v_2; 390 | if !env.slice_from("ie") { 391 | return false; 392 | } 393 | break 'lab1; 394 | } 395 | } 
else if among_var == 3 { 396 | if env.cursor <= env.limit_backward { 397 | return false; 398 | } 399 | env.previous_char(); 400 | 'golab3: loop { 401 | 'lab4: loop { 402 | if !env.in_grouping_b(G_v, 97, 121) { 403 | break 'lab4; 404 | } 405 | break 'golab3; 406 | } 407 | if env.cursor <= env.limit_backward { 408 | return false; 409 | } 410 | env.previous_char(); 411 | } 412 | if !env.slice_del() { 413 | return false; 414 | } 415 | } 416 | return true; 417 | } 418 | 419 | fn r_Step_1b(env: &mut SnowballEnv, context: &mut Context) -> bool { 420 | let mut among_var; 421 | env.ket = env.cursor; 422 | among_var = env.find_among_b(A_4, context); 423 | if among_var == 0 { 424 | return false; 425 | } 426 | env.bra = env.cursor; 427 | if among_var == 1 { 428 | if !r_R1(env, context) { 429 | return false; 430 | } 431 | if !env.slice_from("ee") { 432 | return false; 433 | } 434 | } else if among_var == 2 { 435 | let v_1 = env.limit - env.cursor; 436 | 'golab0: loop { 437 | 'lab1: loop { 438 | if !env.in_grouping_b(G_v, 97, 121) { 439 | break 'lab1; 440 | } 441 | break 'golab0; 442 | } 443 | if env.cursor <= env.limit_backward { 444 | return false; 445 | } 446 | env.previous_char(); 447 | } 448 | env.cursor = env.limit - v_1; 449 | if !env.slice_del() { 450 | return false; 451 | } 452 | let v_3 = env.limit - env.cursor; 453 | among_var = env.find_among_b(A_3, context); 454 | if among_var == 0 { 455 | return false; 456 | } 457 | env.cursor = env.limit - v_3; 458 | if among_var == 1 { 459 | let c = env.cursor; 460 | let (bra, ket) = (env.cursor, env.cursor); 461 | env.insert(bra, ket, "e"); 462 | env.cursor = c; 463 | } else if among_var == 2 { 464 | env.ket = env.cursor; 465 | if env.cursor <= env.limit_backward { 466 | return false; 467 | } 468 | env.previous_char(); 469 | env.bra = env.cursor; 470 | if !env.slice_del() { 471 | return false; 472 | } 473 | } else if among_var == 3 { 474 | if env.cursor != context.i_p1 { 475 | return false; 476 | } 477 | let v_4 = env.limit - 
env.cursor; 478 | if !r_shortv(env, context) { 479 | return false; 480 | } 481 | env.cursor = env.limit - v_4; 482 | let c = env.cursor; 483 | let (bra, ket) = (env.cursor, env.cursor); 484 | env.insert(bra, ket, "e"); 485 | env.cursor = c; 486 | } 487 | } 488 | return true; 489 | } 490 | 491 | fn r_Step_1c(env: &mut SnowballEnv, context: &mut Context) -> bool { 492 | env.ket = env.cursor; 493 | 'lab0: loop { 494 | let v_1 = env.limit - env.cursor; 495 | 'lab1: loop { 496 | if !env.eq_s_b(&"y") { 497 | break 'lab1; 498 | } 499 | break 'lab0; 500 | } 501 | env.cursor = env.limit - v_1; 502 | if !env.eq_s_b(&"Y") { 503 | return false; 504 | } 505 | break 'lab0; 506 | } 507 | env.bra = env.cursor; 508 | if !env.out_grouping_b(G_v, 97, 121) { 509 | return false; 510 | } 511 | 'lab2: loop { 512 | if env.cursor > env.limit_backward { 513 | break 'lab2; 514 | } 515 | return false; 516 | } 517 | if !env.slice_from("i") { 518 | return false; 519 | } 520 | return true; 521 | } 522 | 523 | fn r_Step_2(env: &mut SnowballEnv, context: &mut Context) -> bool { 524 | let mut among_var; 525 | env.ket = env.cursor; 526 | among_var = env.find_among_b(A_5, context); 527 | if among_var == 0 { 528 | return false; 529 | } 530 | env.bra = env.cursor; 531 | if !r_R1(env, context) { 532 | return false; 533 | } 534 | if among_var == 1 { 535 | if !env.slice_from("tion") { 536 | return false; 537 | } 538 | } else if among_var == 2 { 539 | if !env.slice_from("ence") { 540 | return false; 541 | } 542 | } else if among_var == 3 { 543 | if !env.slice_from("ance") { 544 | return false; 545 | } 546 | } else if among_var == 4 { 547 | if !env.slice_from("able") { 548 | return false; 549 | } 550 | } else if among_var == 5 { 551 | if !env.slice_from("ent") { 552 | return false; 553 | } 554 | } else if among_var == 6 { 555 | if !env.slice_from("ize") { 556 | return false; 557 | } 558 | } else if among_var == 7 { 559 | if !env.slice_from("ate") { 560 | return false; 561 | } 562 | } else if among_var == 8 
{ 563 | if !env.slice_from("al") { 564 | return false; 565 | } 566 | } else if among_var == 9 { 567 | if !env.slice_from("ful") { 568 | return false; 569 | } 570 | } else if among_var == 10 { 571 | if !env.slice_from("ous") { 572 | return false; 573 | } 574 | } else if among_var == 11 { 575 | if !env.slice_from("ive") { 576 | return false; 577 | } 578 | } else if among_var == 12 { 579 | if !env.slice_from("ble") { 580 | return false; 581 | } 582 | } else if among_var == 13 { 583 | if !env.eq_s_b(&"l") { 584 | return false; 585 | } 586 | if !env.slice_from("og") { 587 | return false; 588 | } 589 | } else if among_var == 14 { 590 | if !env.slice_from("less") { 591 | return false; 592 | } 593 | } else if among_var == 15 { 594 | if !env.in_grouping_b(G_valid_LI, 99, 116) { 595 | return false; 596 | } 597 | if !env.slice_del() { 598 | return false; 599 | } 600 | } 601 | return true; 602 | } 603 | 604 | fn r_Step_3(env: &mut SnowballEnv, context: &mut Context) -> bool { 605 | let mut among_var; 606 | env.ket = env.cursor; 607 | among_var = env.find_among_b(A_6, context); 608 | if among_var == 0 { 609 | return false; 610 | } 611 | env.bra = env.cursor; 612 | if !r_R1(env, context) { 613 | return false; 614 | } 615 | if among_var == 1 { 616 | if !env.slice_from("tion") { 617 | return false; 618 | } 619 | } else if among_var == 2 { 620 | if !env.slice_from("ate") { 621 | return false; 622 | } 623 | } else if among_var == 3 { 624 | if !env.slice_from("al") { 625 | return false; 626 | } 627 | } else if among_var == 4 { 628 | if !env.slice_from("ic") { 629 | return false; 630 | } 631 | } else if among_var == 5 { 632 | if !env.slice_del() { 633 | return false; 634 | } 635 | } else if among_var == 6 { 636 | if !r_R2(env, context) { 637 | return false; 638 | } 639 | if !env.slice_del() { 640 | return false; 641 | } 642 | } 643 | return true; 644 | } 645 | 646 | fn r_Step_4(env: &mut SnowballEnv, context: &mut Context) -> bool { 647 | let mut among_var; 648 | env.ket = env.cursor; 
649 | among_var = env.find_among_b(A_7, context); 650 | if among_var == 0 { 651 | return false; 652 | } 653 | env.bra = env.cursor; 654 | if !r_R2(env, context) { 655 | return false; 656 | } 657 | if among_var == 1 { 658 | if !env.slice_del() { 659 | return false; 660 | } 661 | } else if among_var == 2 { 662 | 'lab0: loop { 663 | let v_1 = env.limit - env.cursor; 664 | 'lab1: loop { 665 | if !env.eq_s_b(&"s") { 666 | break 'lab1; 667 | } 668 | break 'lab0; 669 | } 670 | env.cursor = env.limit - v_1; 671 | if !env.eq_s_b(&"t") { 672 | return false; 673 | } 674 | break 'lab0; 675 | } 676 | if !env.slice_del() { 677 | return false; 678 | } 679 | } 680 | return true; 681 | } 682 | 683 | fn r_Step_5(env: &mut SnowballEnv, context: &mut Context) -> bool { 684 | let mut among_var; 685 | env.ket = env.cursor; 686 | among_var = env.find_among_b(A_8, context); 687 | if among_var == 0 { 688 | return false; 689 | } 690 | env.bra = env.cursor; 691 | if among_var == 1 { 692 | 'lab0: loop { 693 | let v_1 = env.limit - env.cursor; 694 | 'lab1: loop { 695 | if !r_R2(env, context) { 696 | break 'lab1; 697 | } 698 | break 'lab0; 699 | } 700 | env.cursor = env.limit - v_1; 701 | if !r_R1(env, context) { 702 | return false; 703 | } 704 | let v_2 = env.limit - env.cursor; 705 | 'lab2: loop { 706 | if !r_shortv(env, context) { 707 | break 'lab2; 708 | } 709 | return false; 710 | } 711 | env.cursor = env.limit - v_2; 712 | break 'lab0; 713 | } 714 | if !env.slice_del() { 715 | return false; 716 | } 717 | } else if among_var == 2 { 718 | if !r_R2(env, context) { 719 | return false; 720 | } 721 | if !env.eq_s_b(&"l") { 722 | return false; 723 | } 724 | if !env.slice_del() { 725 | return false; 726 | } 727 | } 728 | return true; 729 | } 730 | 731 | fn r_exception2(env: &mut SnowballEnv, context: &mut Context) -> bool { 732 | env.ket = env.cursor; 733 | if env.find_among_b(A_9, context) == 0 { 734 | return false; 735 | } 736 | env.bra = env.cursor; 737 | if env.cursor > env.limit_backward { 
738 | return false; 739 | } 740 | return true; 741 | } 742 | 743 | fn r_exception1(env: &mut SnowballEnv, context: &mut Context) -> bool { 744 | let mut among_var; 745 | env.bra = env.cursor; 746 | among_var = env.find_among(A_10, context); 747 | if among_var == 0 { 748 | return false; 749 | } 750 | env.ket = env.cursor; 751 | if env.cursor < env.limit { 752 | return false; 753 | } 754 | if among_var == 1 { 755 | if !env.slice_from("ski") { 756 | return false; 757 | } 758 | } else if among_var == 2 { 759 | if !env.slice_from("sky") { 760 | return false; 761 | } 762 | } else if among_var == 3 { 763 | if !env.slice_from("die") { 764 | return false; 765 | } 766 | } else if among_var == 4 { 767 | if !env.slice_from("lie") { 768 | return false; 769 | } 770 | } else if among_var == 5 { 771 | if !env.slice_from("tie") { 772 | return false; 773 | } 774 | } else if among_var == 6 { 775 | if !env.slice_from("idl") { 776 | return false; 777 | } 778 | } else if among_var == 7 { 779 | if !env.slice_from("gentl") { 780 | return false; 781 | } 782 | } else if among_var == 8 { 783 | if !env.slice_from("ugli") { 784 | return false; 785 | } 786 | } else if among_var == 9 { 787 | if !env.slice_from("earli") { 788 | return false; 789 | } 790 | } else if among_var == 10 { 791 | if !env.slice_from("onli") { 792 | return false; 793 | } 794 | } else if among_var == 11 { 795 | if !env.slice_from("singl") { 796 | return false; 797 | } 798 | } 799 | return true; 800 | } 801 | 802 | fn r_postlude(env: &mut SnowballEnv, context: &mut Context) -> bool { 803 | if !context.b_Y_found { 804 | return false; 805 | } 806 | 'replab0: loop{ 807 | let v_1 = env.cursor; 808 | 'lab1: for _ in 0..1 { 809 | 'golab2: loop { 810 | let v_2 = env.cursor; 811 | 'lab3: loop { 812 | env.bra = env.cursor; 813 | if !env.eq_s(&"Y") { 814 | break 'lab3; 815 | } 816 | env.ket = env.cursor; 817 | env.cursor = v_2; 818 | break 'golab2; 819 | } 820 | env.cursor = v_2; 821 | if env.cursor >= env.limit { 822 | break 'lab1; 
823 | } 824 | env.next_char(); 825 | } 826 | if !env.slice_from("y") { 827 | return false; 828 | } 829 | continue 'replab0; 830 | } 831 | env.cursor = v_1; 832 | break 'replab0; 833 | } 834 | return true; 835 | } 836 | 837 | pub fn stem(env: &mut SnowballEnv) -> bool { 838 | let mut context = &mut Context { 839 | b_Y_found: false, 840 | i_p2: 0, 841 | i_p1: 0, 842 | }; 843 | 'lab0: loop { 844 | let v_1 = env.cursor; 845 | 'lab1: loop { 846 | if !r_exception1(env, context) { 847 | break 'lab1; 848 | } 849 | break 'lab0; 850 | } 851 | env.cursor = v_1; 852 | 'lab2: loop { 853 | let v_2 = env.cursor; 854 | 'lab3: loop { 855 | if !env.hop(3) { 856 | break 'lab3; 857 | } 858 | break 'lab2; 859 | } 860 | env.cursor = v_2; 861 | break 'lab0; 862 | } 863 | env.cursor = v_1; 864 | r_prelude(env, context); 865 | r_mark_regions(env, context); 866 | env.limit_backward = env.cursor; 867 | env.cursor = env.limit; 868 | let v_5 = env.limit - env.cursor; 869 | r_Step_1a(env, context); 870 | env.cursor = env.limit - v_5; 871 | 'lab4: loop { 872 | let v_6 = env.limit - env.cursor; 873 | 'lab5: loop { 874 | if !r_exception2(env, context) { 875 | break 'lab5; 876 | } 877 | break 'lab4; 878 | } 879 | env.cursor = env.limit - v_6; 880 | let v_7 = env.limit - env.cursor; 881 | r_Step_1b(env, context); 882 | env.cursor = env.limit - v_7; 883 | let v_8 = env.limit - env.cursor; 884 | r_Step_1c(env, context); 885 | env.cursor = env.limit - v_8; 886 | let v_9 = env.limit - env.cursor; 887 | r_Step_2(env, context); 888 | env.cursor = env.limit - v_9; 889 | let v_10 = env.limit - env.cursor; 890 | r_Step_3(env, context); 891 | env.cursor = env.limit - v_10; 892 | let v_11 = env.limit - env.cursor; 893 | r_Step_4(env, context); 894 | env.cursor = env.limit - v_11; 895 | let v_12 = env.limit - env.cursor; 896 | r_Step_5(env, context); 897 | env.cursor = env.limit - v_12; 898 | break 'lab4; 899 | } 900 | env.cursor = env.limit_backward; 901 | let v_13 = env.cursor; 902 | r_postlude(env, context); 
903 | env.cursor = v_13; 904 | break 'lab0; 905 | } 906 | return true; 907 | } 908 | -------------------------------------------------------------------------------- /src/snowball/algorithms/mod.rs: -------------------------------------------------------------------------------- 1 | // Have a look at build.rs 2 | //include!(concat!(env!("OUT_DIR"), "/lang_include.rs")); 3 | pub mod english_stemmer; 4 | -------------------------------------------------------------------------------- /src/snowball/among.rs: -------------------------------------------------------------------------------- 1 | use crate::snowball::SnowballEnv; 2 | 3 | pub struct Among(pub &'static str, 4 | pub i32, 5 | pub i32, 6 | pub Option<&'static (dyn Fn(&mut SnowballEnv, &mut T) -> bool + Sync)>); 7 | -------------------------------------------------------------------------------- /src/snowball/mod.rs: -------------------------------------------------------------------------------- 1 | // TODO: add Snowball license in here 2 | pub mod algorithms; 3 | mod among; 4 | mod snowball_env; 5 | 6 | // TODO: why do we need this `crate::`? 
7 | pub use crate::snowball::among::Among; 8 | pub use crate::snowball::snowball_env::SnowballEnv; 9 | -------------------------------------------------------------------------------- /src/snowball/snowball_env.rs: -------------------------------------------------------------------------------- 1 | use std::borrow::Cow; 2 | use crate::snowball::Among; 3 | 4 | #[derive(Debug, Clone)] 5 | pub struct SnowballEnv<'a> { 6 | pub current: Cow<'a, str>, 7 | pub cursor: i32, 8 | pub limit: i32, 9 | pub limit_backward: i32, 10 | pub bra: i32, 11 | pub ket: i32, 12 | } 13 | 14 | 15 | impl<'a> SnowballEnv<'a> { 16 | pub fn create(value: &'a str) -> Self { 17 | let len = value.len(); 18 | SnowballEnv { 19 | current: Cow::from(value), 20 | cursor: 0, 21 | limit: len as i32, 22 | limit_backward: 0, 23 | bra: 0, 24 | ket: len as i32, 25 | } 26 | } 27 | 28 | pub fn get_current(self) -> Cow<'a, str> { 29 | self.current 30 | } 31 | 32 | pub fn set_current(&mut self, current: &'a str) { 33 | self.current = Cow::from(current); 34 | } 35 | 36 | pub fn set_current_s(&mut self, current: String) { 37 | self.current = Cow::from(current); 38 | } 39 | 40 | fn replace_s(&mut self, bra: i32, ket: i32, s: &str) -> i32 { 41 | let adjustment = s.len() as i32 - (ket - bra); 42 | let mut result = String::with_capacity(self.current.len()); 43 | { 44 | let (lhs, _) = self.current.split_at(bra as usize); 45 | let (_, rhs) = self.current.split_at(ket as usize); 46 | result.push_str(lhs); 47 | result.push_str(s); 48 | result.push_str(rhs); 49 | } 50 | // ... not very nice... 51 | let new_lim = self.limit + adjustment; 52 | self.limit = new_lim; 53 | if self.cursor >= ket { 54 | let new_cur = self.cursor + adjustment; 55 | self.cursor = new_cur; 56 | } else if self.cursor > bra { 57 | self.cursor = bra 58 | } 59 | self.current = Cow::from(result); 60 | adjustment 61 | } 62 | 63 | /// Check if s is after cursor. 
64 | /// If so, move cursor to the end of s 65 | pub fn eq_s(&mut self, s: &str) -> bool { 66 | if self.cursor >= self.limit { 67 | return false; 68 | } 69 | if self.current[(self.cursor as usize)..].starts_with(s) { 70 | self.cursor += s.len() as i32; 71 | while !self.current.is_char_boundary(self.cursor as usize) { 72 | self.cursor += 1; 73 | } 74 | true 75 | } else { 76 | false 77 | } 78 | } 79 | 80 | /// Check if 's' is before cursor 81 | /// If so, move cursor to the beginning of s 82 | pub fn eq_s_b(&mut self, s: &str) -> bool { 83 | if (self.cursor - self.limit_backward) < s.len() as i32 { 84 | false 85 | // Check if cursor -s.len is a char boundary. if not well... return false obv 86 | } else if !self.current.is_char_boundary(self.cursor as usize - s.len()) || 87 | !self.current[self.cursor as usize - s.len()..].starts_with(s) { 88 | false 89 | } else { 90 | self.cursor -= s.len() as i32; 91 | true 92 | } 93 | } 94 | 95 | /// Replace string between `bra` and `ket` with s 96 | pub fn slice_from(&mut self, s: &str) -> bool { 97 | let (bra, ket) = (self.bra, self.ket); 98 | self.replace_s(bra, ket, s); 99 | true 100 | } 101 | 102 | /// Move cursor to next character 103 | pub fn next_char(&mut self) { 104 | self.cursor += 1; 105 | while !self.current.is_char_boundary(self.cursor as usize) { 106 | self.cursor += 1; 107 | } 108 | } 109 | 110 | /// Move cursor to previous character 111 | pub fn previous_char(&mut self) { 112 | self.cursor -= 1; 113 | while !self.current.is_char_boundary(self.cursor as usize) { 114 | self.cursor -= 1; 115 | } 116 | } 117 | 118 | pub fn hop(&mut self, mut delta: i32) -> bool { 119 | let mut res = self.cursor; 120 | while delta > 0 { 121 | delta -= 1; 122 | if res >= self.limit { 123 | return false; 124 | } 125 | res += 1; 126 | while res < self.limit && !self.current.is_char_boundary(res as usize) { 127 | res += 1; 128 | } 129 | } 130 | self.cursor = res; 131 | return true; 132 | } 133 | 134 | pub fn hop_checked(&mut self, delta: 
i32) -> bool { 135 | return delta >= 0 && self.hop(delta); 136 | } 137 | 138 | pub fn hop_back(&mut self, mut delta: i32) -> bool { 139 | let mut res = self.cursor; 140 | while delta > 0 { 141 | delta -= 1; 142 | if res <= self.limit_backward { 143 | return false; 144 | } 145 | res -= 1; 146 | while res > self.limit_backward && !self.current.is_char_boundary(res as usize) { 147 | res -= 1; 148 | } 149 | } 150 | self.cursor = res; 151 | return true; 152 | } 153 | 154 | pub fn hop_back_checked(&mut self, delta: i32) -> bool { 155 | return delta >= 0 && self.hop_back(delta); 156 | } 157 | 158 | // A grouping is represented by a minimum code point, a maximum code point, 159 | // and a bitfield of which code points in that range are in the grouping. 160 | // For example, in english.sbl, valid_LI is 'cdeghkmnrt'. 161 | // The minimum and maximum code points are 99 and 116, 162 | // so every time one of these grouping functions is called for g_valid_LI, 163 | // min must be 99 and max must be 116. There are 18 code points within that 164 | // range (inclusive) so the grouping is represented with 18 bits, plus 6 bits of padding: 165 | // 166 | // cdefghij klmnopqr st 167 | // 11101100 10110001 01000000 168 | // 169 | // The first bit is the least significant. 170 | // Those three bytes become &[0b00110111, 0b10001101, 0b00000010], 171 | // which is &[55, 141, 2], which is how g_valid_LI is defined in english.rs. 
172 | /// Check if the char the cursor points to is in the grouping 173 | pub fn in_grouping(&mut self, chars: &[u8], min: u32, max: u32) -> bool { 174 | if self.cursor >= self.limit { 175 | return false; 176 | } 177 | if let Some(chr) = self.current[self.cursor as usize..].chars().next() { 178 | let mut ch = chr as u32; //codepoint as integer 179 | if ch > max || ch < min { 180 | return false; 181 | } 182 | ch -= min; 183 | if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 { 184 | return false; 185 | } 186 | self.next_char(); 187 | return true; 188 | } 189 | return false; 190 | } 191 | 192 | pub fn in_grouping_b(&mut self, chars: &[u8], min: u32, max: u32) -> bool { 193 | if self.cursor <= self.limit_backward { 194 | return false; 195 | } 196 | self.previous_char(); 197 | if let Some(chr) = self.current[self.cursor as usize..].chars().next() { 198 | let mut ch = chr as u32; //codepoint as integer 199 | self.next_char(); 200 | if ch > max || ch < min { 201 | return false; 202 | } 203 | ch -= min; 204 | if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 { 205 | return false; 206 | } 207 | self.previous_char(); 208 | return true; 209 | } 210 | return false; 211 | } 212 | 213 | pub fn out_grouping(&mut self, chars: &[u8], min: u32, max: u32) -> bool { 214 | if self.cursor >= self.limit { 215 | return false; 216 | } 217 | if let Some(chr) = self.current[self.cursor as usize..].chars().next() { 218 | let mut ch = chr as u32; //codepoint as integer 219 | if ch > max || ch < min { 220 | self.next_char(); 221 | return true; 222 | } 223 | ch -= min; 224 | if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 { 225 | self.next_char(); 226 | return true; 227 | } 228 | } 229 | return false; 230 | } 231 | 232 | pub fn out_grouping_b(&mut self, chars: &[u8], min: u32, max: u32) -> bool { 233 | if self.cursor <= self.limit_backward { 234 | return false; 235 | } 236 | self.previous_char(); 237 | if let Some(chr) = self.current[self.cursor as 
usize..].chars().next() { 238 | let mut ch = chr as u32; //codepoint as integer 239 | self.next_char(); 240 | if ch > max || ch < min { 241 | self.previous_char(); 242 | return true; 243 | } 244 | ch -= min; 245 | if (chars[(ch >> 3) as usize] & (0x1 << (ch & 0x7))) == 0 { 246 | self.previous_char(); 247 | return true; 248 | } 249 | } 250 | return false; 251 | 252 | } 253 | 254 | 255 | /// Helper function that removes the string slice between `bra` and `ket` 256 | pub fn slice_del(&mut self) -> bool { 257 | self.slice_from("") 258 | } 259 | 260 | pub fn insert(&mut self, bra: i32, ket: i32, s: &str) { 261 | let adjustment = self.replace_s(bra, ket, s); 262 | if bra <= self.bra { 263 | self.bra = self.bra + adjustment; 264 | } 265 | if bra <= self.ket { 266 | self.ket = self.ket + adjustment; 267 | } 268 | } 269 | 270 | pub fn assign_to(&mut self) -> String { 271 | self.current[0..self.limit as usize].to_string() 272 | } 273 | 274 | pub fn slice_to(&mut self) -> String { 275 | self.current[self.bra as usize..self.ket as usize].to_string() 276 | } 277 | 278 | pub fn find_among(&mut self, amongs: &[Among], context: &mut T) -> i32 { 279 | use std::cmp::min; 280 | let mut i: i32 = 0; 281 | let mut j: i32 = amongs.len() as i32; 282 | 283 | let c = self.cursor; 284 | let l = self.limit; 285 | 286 | let mut common_i = 0i32; 287 | let mut common_j = 0i32; 288 | 289 | let mut first_key_inspected = false; 290 | loop { 291 | let k = i + ((j - i) >> 1); 292 | let mut diff: i32 = 0; 293 | let mut common = min(common_i, common_j); 294 | let w = &amongs[k as usize]; 295 | for lvar in common..w.0.len() as i32 { 296 | if c + common == l { 297 | diff = -1; 298 | break; 299 | } 300 | diff = self.current.as_bytes()[(c + common) as usize] as i32 - w.0.as_bytes()[lvar as usize] as i32; 301 | if diff != 0 { 302 | break; 303 | } 304 | common += 1; 305 | } 306 | if diff < 0 { 307 | j = k; 308 | common_j = common; 309 | } else { 310 | i = k; 311 | common_i = common; 312 | } 313 | if j - i <= 
1 { 314 | if i > 0 { 315 | break; 316 | } 317 | if j == i { 318 | break; 319 | } 320 | if first_key_inspected { 321 | break; 322 | } 323 | first_key_inspected = true; 324 | } 325 | } 326 | 327 | loop { 328 | let w = &amongs[i as usize]; 329 | if common_i >= w.0.len() as i32{ 330 | self.cursor = c + w.0.len() as i32; 331 | if let Some(ref method) = w.3 { 332 | let res = method(self, context); 333 | self.cursor = c + w.0.len() as i32; 334 | if res { 335 | return w.2; 336 | } 337 | } else { 338 | return w.2; 339 | } 340 | } 341 | i = w.1; 342 | if i < 0 { 343 | return 0; 344 | } 345 | } 346 | } 347 | 348 | pub fn find_among_b(&mut self, amongs: &[Among], context: &mut T) -> i32 { 349 | let mut i: i32 = 0; 350 | let mut j: i32 = amongs.len() as i32; 351 | 352 | let c = self.cursor; 353 | let lb = self.limit_backward; 354 | 355 | let mut common_i = 0i32; 356 | let mut common_j = 0i32; 357 | 358 | let mut first_key_inspected = false; 359 | 360 | loop { 361 | let k = i + ((j - i) >> 1); 362 | let mut diff: i32 = 0; 363 | let mut common = if common_i < common_j { 364 | common_i 365 | } else { 366 | common_j 367 | }; 368 | let w = &amongs[k as usize]; 369 | for lvar in (0..w.0.len() - common as usize).rev() { 370 | if c - common == lb { 371 | diff = -1; 372 | break; 373 | } 374 | diff = self.current.as_bytes()[(c - common - 1) as usize] as i32 - w.0.as_bytes()[lvar] as i32; 375 | if diff != 0 { 376 | break; 377 | } 378 | // Count up commons. 
But not one character but the byte width of that char 379 | common += 1; 380 | } 381 | if diff < 0 { 382 | j = k; 383 | common_j = common; 384 | } else { 385 | i = k; 386 | common_i = common; 387 | } 388 | if j - i <= 1 { 389 | if i > 0 { 390 | break; 391 | } 392 | if j == i { 393 | break; 394 | } 395 | if first_key_inspected { 396 | break; 397 | } 398 | first_key_inspected = true; 399 | } 400 | } 401 | loop { 402 | let w = &amongs[i as usize]; 403 | if common_i >= w.0.len() as i32 { 404 | self.cursor = c - w.0.len() as i32; 405 | if let Some(ref method) = w.3 { 406 | let res = method(self, context); 407 | self.cursor = c - w.0.len() as i32; 408 | if res { 409 | return w.2; 410 | } 411 | } else { 412 | return w.2; 413 | } 414 | } 415 | i = w.1; 416 | if i < 0 { 417 | return 0; 418 | } 419 | } 420 | } 421 | } 422 | --------------------------------------------------------------------------------