├── .github └── workflows │ ├── publish.yaml │ ├── python-publish.yml │ ├── test-coverage.yaml │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── LICENSE ├── README.md ├── _config.yml ├── pybo ├── __init__.py ├── cli.py ├── corpus │ ├── __init__.py │ ├── parse_corrected.py │ └── word_cleanup.py ├── hfr_cqlr_converter.py ├── monlam2wordlist.py ├── pipeline │ ├── __init__.py │ └── pipes.py ├── rdr │ ├── __init__.py │ ├── rdr.py │ └── rdr_2_replace_matcher.py ├── resources │ └── particles.tsv ├── segmentation_rule │ ├── __init__.py │ ├── make_rule.py │ └── pipeline.py ├── third_party │ └── rules.txt ├── untokenize.py └── utils │ ├── __init__.py │ ├── bo_sorted.py │ ├── profile_entries.py │ ├── profile_report.py │ └── regex_batch_apply.py ├── pybo_logo.png ├── requirements-dev.txt ├── requirements.txt ├── setup.cfg ├── setup.py ├── tests ├── 01_raw_text.txt ├── data │ ├── corpus1 │ │ ├── corpus1.txt │ │ ├── corpus1_bilou_rules.txt │ │ ├── corpus1_hd.txt │ │ ├── corpus1_pybo_data.txt │ │ ├── corpus1_rules.txt │ │ └── corpus1_tr_data.txt │ ├── drokun_test │ │ ├── drokun_test.txt │ │ ├── drokun_test_bilou_rules.txt │ │ ├── drokun_test_hd.txt │ │ ├── drokun_test_rules.txt │ │ └── drokun_test_tr_data.txt │ ├── marpa │ │ ├── marpa.txt │ │ ├── marpa_bilou_rules.txt │ │ ├── marpa_hd.txt │ │ ├── marpa_pybo_data.txt │ │ ├── marpa_rules.txt │ │ └── marpa_tr_data.txt │ └── monlam2020 │ │ ├── multi_pos_multi_sense.csv │ │ ├── multi_pos_multi_sense_expected.csv │ │ ├── one_pos_multi_sense.csv │ │ ├── one_pos_multi_sense_expected.csv │ │ ├── one_pos_one_sense.csv │ │ ├── one_pos_one_sense_expected.csv │ │ ├── verbs.csv │ │ └── verbs_expected.csv ├── hfr2cql │ ├── UDPOS-bo.txt │ ├── adjustments.txt │ ├── cql │ │ ├── _cql2hfr_cql.txt │ │ └── cql.txt │ ├── cql2hfr.txt │ ├── cql_result.txt │ └── hfr_result.txt ├── resources │ ├── rdr_rules.txt │ ├── shelving │ │ ├── test_1.txt │ │ ├── test_1_tok │ │ │ └── test_1_tok.txt │ │ └── test_2.txt │ ├── step1_3 │ │ └── input │ │ │ └── test.txt │ └── step2 │ │ ├── cql_rules.txt │ │ ├── manually_corrected.txt │ │ ├── rdr_input.txt │ │ ├── step2 │ │ └── test_rules.txt ├── test_cli.py ├── test_corpus.py ├── test_hfr_cqlr_converter.py ├── test_monlam2word_list.py ├── test_rdr2adjustment.py ├── test_segmentation_rule_extraction.py ├── test_tok.py ├── test_untokenize.py └── workflow_test.txt └── usage.py /.github/workflows/publish.yaml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | test: 10 | runs-on: ubuntu-latest 11 | strategy: 12 | max-parallel: 4 13 | matrix: 14 | python-version: [3.6] 15 | 16 | steps: 17 | - uses: actions/checkout@v1 18 | 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v1 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install -r requirements-dev.txt 28 | pip install -e . 
29 | - name: Run Test 30 | run: | 31 | pytest tests/ 32 | 33 | publish: 34 | 35 | needs: test 36 | runs-on: ubuntu-latest 37 | 38 | steps: 39 | - uses: actions/checkout@v2 40 | with: 41 | fetch-depth: 0 42 | 43 | - name: Python Semantic Release 44 | uses: relekang/python-semantic-release@master 45 | with: 46 | github_token: ${{ secrets.GITHUB_TOKEN }} 47 | pypi_token: ${{ secrets.PYPI_TOKEN }} 48 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist bdist_wheel 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | 2 | name: Test Coverage 3 | 4 | on: 5 | push: 6 | branches: 7 | - '*' 8 | pull_request: 9 | branches: 10 | - '*' 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | max-parallel: 4 18 | matrix: 19 | python-version: [3.6] 20 | 21 | steps: 22 | - uses: actions/checkout@v1 23 | with: 24 | fetch-depth: 1 25 | 26 | - name: Set up Python ${{ matrix.python-version }} 27 | uses: actions/setup-python@v1 28 | with: 29 | python-version: ${{ matrix.python-version }} 30 | 31 | - name: Install dependencies 32 | run: | 33 | python -m pip install --upgrade pip 34 | pip install -r requirements-dev.txt 35 | pip install -e . 36 | 37 | - name: Run Test Coverage 38 | run: | 39 | coverage run -m pytest tests/ 40 | coverage report 41 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: 6 | - '*' 7 | pull_request: 8 | branches: 9 | - '*' 10 | 11 | jobs: 12 | test: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | max-parallel: 4 16 | matrix: 17 | python-version: [3.6] 18 | 19 | steps: 20 | - uses: actions/checkout@v1 21 | 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v1 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install -r requirements-dev.txt 31 | pip install -e . 
32 | - name: Run Test 33 | run: | 34 | pytest -vv 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/macos,pydev,python,eclipse,pycharm+all,visualstudio,visualstudiocode 3 | 4 | # User data 5 | usecases/canon_concordancer/input 6 | usecases/canon_concordancer/output 7 | .cache/ 8 | botok.yaml 9 | **/shelving_tok/ 10 | 11 | *.DICT 12 | *.INIT 13 | *.RAW 14 | *.RDR 15 | *.sDict 16 | 17 | ### Eclipse ### 18 | .metadata 19 | bin/ 20 | tmp/ 21 | *.tmp 22 | *.bak 23 | *.swp 24 | *~.nib 25 | local.properties 26 | .settings/ 27 | .loadpath 28 | .recommenders 29 | 30 | # External tool builders 31 | .externalToolBuilders/ 32 | 33 | # Locally stored "Eclipse launch configurations" 34 | *.launch 35 | 36 | # PyDev specific (Python IDE for Eclipse) 37 | *.pydevproject 38 | 39 | # CDT-specific (C/C++ Development Tooling) 40 | .cproject 41 | 42 | # Java annotation processor (APT) 43 | .factorypath 44 | 45 | # PDT-specific (PHP Development Tools) 46 | .buildpath 47 | 48 | # sbteclipse plugin 49 | .target 50 | 51 | # Tern plugin 52 | .tern-project 53 | 54 | # TeXlipse plugin 55 | .texlipse 56 | 57 | # STS (Spring Tool Suite) 58 | .springBeans 59 | 60 | # Code Recommenders 61 | .recommenders/ 62 | 63 | # Scala IDE specific (Scala & Java development for Eclipse) 64 | .cache-main 65 | .scala_dependencies 66 | .worksheet 67 | 68 | ### Eclipse Patch ### 69 | # Eclipse Core 70 | .project 71 | 72 | # JDT-specific (Eclipse Java Development Tools) 73 | .classpath 74 | 75 | ### macOS ### 76 | *.DS_Store 77 | .AppleDouble 78 | .LSOverride 79 | 80 | # Icon must end with two \r 81 | Icon 82 | 83 | # Thumbnails 84 | ._* 85 | 86 | # Files that might appear in the root of a volume 87 | .DocumentRevisions-V100 88 | .fseventsd 89 | .Spotlight-V100 90 | .TemporaryItems 91 | .Trashes 92 | .VolumeIcon.icns 93 | .com.apple.timemachine.donotpresent 94 | 95 | # Directories potentially created on remote AFP share 96 | .AppleDB 97 | .AppleDesktop 98 | Network Trash Folder 99 | Temporary Items 100 | .apdisk 101 | 102 | ### PyCharm+all ### 103 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 104 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 105 | 106 | # User-specific stuff: 107 | .idea/**/workspace.xml 108 | .idea/**/tasks.xml 109 | .idea/dictionaries 110 | 111 | # Sensitive or high-churn files: 112 | .idea/**/dataSources/ 113 | .idea/**/dataSources.ids 114 | .idea/**/dataSources.xml 115 | .idea/**/dataSources.local.xml 116 | .idea/**/sqlDataSources.xml 117 | .idea/**/dynamic.xml 118 | .idea/**/uiDesigner.xml 119 | 120 | # Gradle: 121 | .idea/**/gradle.xml 122 | .idea/**/libraries 123 | 124 | # CMake 125 | cmake-build-debug/ 126 | 127 | # Mongo Explorer plugin: 128 | .idea/**/mongoSettings.xml 129 | 130 | ## File-based project format: 131 | *.iws 132 | 133 | ## Plugin-specific files: 134 | 135 | # IntelliJ 136 | /out/ 137 | 138 | # mpeltonen/sbt-idea plugin 139 | .idea_modules/ 140 | 141 | # JIRA plugin 142 | atlassian-ide-plugin.xml 143 | 144 | # Cursive Clojure plugin 145 | .idea/replstate.xml 146 | 147 | # Ruby plugin and RubyMine 148 | /.rakeTasks 149 | 150 | # Crashlytics plugin (for Android Studio and IntelliJ) 151 | com_crashlytics_export_strings.xml 152 | crashlytics.properties 153 | crashlytics-build.properties 154 | fabric.properties 155 | 156 | ### 
PyCharm+all Patch ### 157 | # Ignores the whole idea folder 158 | # See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 159 | 160 | .idea/ 161 | 162 | ### pydev ### 163 | .pydevproject 164 | 165 | ### Python ### 166 | # Byte-compiled / optimized / DLL files 167 | __pycache__/ 168 | *.py[cod] 169 | *$py.class 170 | 171 | # C extensions 172 | *.so 173 | 174 | # Distribution / packaging 175 | .Python 176 | build/ 177 | develop-eggs/ 178 | dist/ 179 | downloads/ 180 | eggs/ 181 | .eggs/ 182 | lib/ 183 | lib64/ 184 | parts/ 185 | sdist/ 186 | var/ 187 | wheels/ 188 | *.egg-info/ 189 | .installed.cfg 190 | *.egg 191 | 192 | # PyInstaller 193 | # Usually these files are written by a python script from a template 194 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 195 | *.manifest 196 | *.spec 197 | 198 | # Installer logs 199 | pip-log.txt 200 | pip-delete-this-directory.txt 201 | 202 | # Unit test / coverage reports 203 | htmlcov/ 204 | .tox/ 205 | .coverage 206 | .coverage.* 207 | .cache 208 | nosetests.xml 209 | coverage.xml 210 | *.cover 211 | .hypothesis/ 212 | .pytest_cache/ 213 | 214 | # Translations 215 | *.mo 216 | *.pot 217 | 218 | # Django stuff: 219 | *.log 220 | local_settings.py 221 | 222 | # Flask stuff: 223 | instance/ 224 | .webassets-cache 225 | 226 | # Scrapy stuff: 227 | .scrapy 228 | 229 | # Sphinx documentation 230 | docs/_build/ 231 | 232 | # PyBuilder 233 | target/ 234 | 235 | # Jupyter Notebook 236 | .ipynb_checkpoints 237 | 238 | # pyenv 239 | .python-version 240 | 241 | # celery beat schedule file 242 | celerybeat-schedule.* 243 | 244 | # SageMath parsed files 245 | *.sage.py 246 | 247 | # Environments 248 | .env 249 | .venv 250 | env/ 251 | venv/ 252 | ENV/ 253 | env.bak/ 254 | venv.bak/ 255 | 256 | # Spyder project settings 257 | .spyderproject 258 | .spyproject 259 | 260 | # Rope project settings 261 | .ropeproject 262 | 263 | # mkdocs documentation 264 | /site 265 | 266 | # mypy 267 | .mypy_cache/ 268 | 269 | ### VisualStudioCode ### 270 | .vscode 271 | !.vscode/settings.json 272 | !.vscode/tasks.json 273 | !.vscode/launch.json 274 | !.vscode/extensions.json 275 | .history 276 | 277 | ### VisualStudio ### 278 | ## Ignore Visual Studio temporary files, build results, and 279 | ## files generated by popular Visual Studio add-ons. 
280 | ## 281 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 282 | 283 | # User-specific files 284 | *.suo 285 | *.user 286 | *.userosscache 287 | *.sln.docstates 288 | 289 | # User-specific files (MonoDevelop/Xamarin Studio) 290 | *.userprefs 291 | 292 | # Build results 293 | [Dd]ebug/ 294 | [Dd]ebugPublic/ 295 | [Rr]elease/ 296 | [Rr]eleases/ 297 | x64/ 298 | x86/ 299 | bld/ 300 | [Bb]in/ 301 | [Oo]bj/ 302 | [Ll]og/ 303 | 304 | # Visual Studio 2015 cache/options directory 305 | .vs/ 306 | # Uncomment if you have tasks that create the project's static files in wwwroot 307 | #wwwroot/ 308 | 309 | # MSTest test Results 310 | [Tt]est[Rr]esult*/ 311 | [Bb]uild[Ll]og.* 312 | 313 | # NUNIT 314 | *.VisualState.xml 315 | TestResult.xml 316 | 317 | # Build Results of an ATL Project 318 | [Dd]ebugPS/ 319 | [Rr]eleasePS/ 320 | dlldata.c 321 | 322 | # .NET Core 323 | project.lock.json 324 | project.fragment.lock.json 325 | artifacts/ 326 | **/Properties/launchSettings.json 327 | 328 | *_i.c 329 | *_p.c 330 | *_i.h 331 | *.ilk 332 | *.meta 333 | *.obj 334 | *.pch 335 | *.pdb 336 | *.pgc 337 | *.pgd 338 | *.rsp 339 | *.sbr 340 | *.tlb 341 | *.tli 342 | *.tlh 343 | *.tmp_proj 344 | *.vspscc 345 | *.vssscc 346 | .builds 347 | *.pidb 348 | *.svclog 349 | *.scc 350 | 351 | # Chutzpah Test files 352 | _Chutzpah* 353 | 354 | # Visual C++ cache files 355 | ipch/ 356 | *.aps 357 | *.ncb 358 | *.opendb 359 | *.opensdf 360 | *.sdf 361 | *.cachefile 362 | *.VC.db 363 | *.VC.VC.opendb 364 | 365 | # Visual Studio profiler 366 | *.psess 367 | *.vsp 368 | *.vspx 369 | *.sap 370 | 371 | # TFS 2012 Local Workspace 372 | $tf/ 373 | 374 | # Guidance Automation Toolkit 375 | *.gpState 376 | 377 | # ReSharper is a .NET coding add-in 378 | _ReSharper*/ 379 | *.[Rr]e[Ss]harper 380 | *.DotSettings.user 381 | 382 | # JustCode is a .NET coding add-in 383 | .JustCode 384 | 385 | # TeamCity is a build add-in 386 | _TeamCity* 387 | 388 | # DotCover is a Code Coverage Tool 389 | *.dotCover 390 | 391 | # Visual Studio code coverage results 392 | *.coverage 393 | *.coveragexml 394 | 395 | # NCrunch 396 | _NCrunch_* 397 | .*crunch*.local.xml 398 | nCrunchTemp_* 399 | 400 | # MightyMoose 401 | *.mm.* 402 | AutoTest.Net/ 403 | 404 | # Web workbench (sass) 405 | .sass-cache/ 406 | 407 | # Installshield output folder 408 | [Ee]xpress/ 409 | 410 | # DocProject is a documentation generator add-in 411 | DocProject/buildhelp/ 412 | DocProject/Help/*.HxT 413 | DocProject/Help/*.HxC 414 | DocProject/Help/*.hhc 415 | DocProject/Help/*.hhk 416 | DocProject/Help/*.hhp 417 | DocProject/Help/Html2 418 | DocProject/Help/html 419 | 420 | # Click-Once directory 421 | publish/ 422 | 423 | # Publish Web Output 424 | *.[Pp]ublish.xml 425 | *.azurePubxml 426 | # By default, sensitive information, such as encrypted password 427 | # should be stored in the .pubxml.user file. 428 | *.pubxml 429 | *.pubxml.user 430 | *.publishproj 431 | 432 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 433 | # checkin your Azure Web App publish settings, but sensitive information contained 434 | # in these scripts will be unencrypted 435 | PublishScripts/ 436 | 437 | # NuGet Packages 438 | *.nupkg 439 | # The packages folder can be ignored because of Package Restore 440 | **/packages/* 441 | # except build/, which is used as an MSBuild target. 
442 | !**/packages/build/ 443 | # Uncomment if necessary however generally it will be regenerated when needed 444 | #!**/packages/repositories.config 445 | # NuGet v3's project.json files produces more ignorable files 446 | *.nuget.props 447 | *.nuget.targets 448 | 449 | # Microsoft Azure Build Output 450 | csx/ 451 | *.build.csdef 452 | 453 | # Microsoft Azure Emulator 454 | ecf/ 455 | rcf/ 456 | 457 | # Windows Store app package directories and files 458 | AppPackages/ 459 | BundleArtifacts/ 460 | Package.StoreAssociation.xml 461 | _pkginfo.txt 462 | 463 | # Visual Studio cache files 464 | # files ending in .cache can be ignored 465 | *.[Cc]ache 466 | # but keep track of directories ending in .cache 467 | !*.[Cc]ache/ 468 | 469 | # Others 470 | ClientBin/ 471 | ~$* 472 | *~ 473 | *.dbmdl 474 | *.dbproj.schemaview 475 | *.jfm 476 | *.pfx 477 | *.publishsettings 478 | orleans.codegen.cs 479 | 480 | # Since there are multiple workflows, uncomment next line to ignore bower_components 481 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 482 | #bower_components/ 483 | 484 | # RIA/Silverlight projects 485 | Generated_Code/ 486 | 487 | # Backup & report files from converting an old project file 488 | # to a newer Visual Studio version. Backup files are not needed, 489 | # because we have git ;-) 490 | _UpgradeReport_Files/ 491 | Backup*/ 492 | UpgradeLog*.XML 493 | UpgradeLog*.htm 494 | 495 | # SQL Server files 496 | *.mdf 497 | *.ldf 498 | *.ndf 499 | 500 | # Business Intelligence projects 501 | *.rdl.data 502 | *.bim.layout 503 | *.bim_*.settings 504 | 505 | # Microsoft Fakes 506 | FakesAssemblies/ 507 | 508 | # GhostDoc plugin setting file 509 | *.GhostDoc.xml 510 | 511 | # Node.js Tools for Visual Studio 512 | .ntvs_analysis.dat 513 | node_modules/ 514 | 515 | # Typescript v1 declaration files 516 | typings/ 517 | 518 | # Visual Studio 6 build log 519 | *.plg 520 | 521 | # Visual Studio 6 workspace options file 522 | *.opt 523 | 524 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 525 | *.vbw 526 | 527 | # Visual Studio LightSwitch build output 528 | **/*.HTMLClient/GeneratedArtifacts 529 | **/*.DesktopClient/GeneratedArtifacts 530 | **/*.DesktopClient/ModelManifest.xml 531 | **/*.Server/GeneratedArtifacts 532 | **/*.Server/ModelManifest.xml 533 | _Pvt_Extensions 534 | 535 | # Paket dependency manager 536 | .paket/paket.exe 537 | paket-files/ 538 | 539 | # FAKE - F# Make 540 | .fake/ 541 | 542 | # JetBrains Rider 543 | *.sln.iml 544 | 545 | # CodeRush 546 | .cr/ 547 | 548 | # Python Tools for Visual Studio (PTVS) 549 | *.pyc 550 | 551 | # Cake - Uncomment if you are using it 552 | # textunits/** 553 | # !textunits/packages.config 554 | 555 | # Telerik's JustMock configuration file 556 | *.jmconfig 557 | 558 | # BizTalk build output 559 | *.btp.cs 560 | *.btm.cs 561 | *.odx.cs 562 | *.xsd.cs 563 | 564 | ### VisualStudio Patch ### 565 | # By default, sensitive information, such as encrypted password 566 | # should be stored in the .pubxml.user file. 
567 | 568 | default.profile 569 | 570 | # pybo 571 | *.pickled 572 | **/pybo.yaml 573 | 574 | # End of https://www.gitignore.io/api/macos,pydev,python,eclipse,pycharm+all,visualstudio,visualstudiocode 575 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 19.3b0 4 | hooks: 5 | - id: black 6 | - repo: https://github.com/PyCQA/flake8 7 | rev: 3.8.3 8 | hooks: 9 | - id: flake8 10 | - repo: https://github.com/timothycrosley/isort 11 | rev: 5.2.2 12 | hooks: 13 | - id: isort -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](http://keepachangelog.com/) and this project adheres to [Semantic Versioning](http://semver.org/). 6 | 7 | ## [0.7.3](https://github.com/Esukhia/pybo/releases/tag/v0.7.3) - 8 | ### Added 9 | * change pybo to bo in cli 10 | 11 | ## [0.7.2](https://github.com/Esukhia/pybo/releases/tag/v0.7.2) - 20200808 12 | ### Added 13 | * fix dependencies in setup.py 14 | 15 | ## [0.7.1](https://github.com/Esukhia/pybo/releases/tag/v0.7.1) - 20200808 16 | ### Added 17 | * added tibetan_sort as dep, cleanup, kakha cli messages 18 | 19 | ## [0.7.0](https://github.com/Esukhia/pybo/releases/tag/v0.7.0) - 20200807 20 | ### Added 21 | * added kakha using tibetan_sort 22 | 23 | ## [0.6.23](https://github.com/Esukhia/pybo/releases/tag/v0.6.23) - 20200714 24 | ### Added 25 | * syl-based content shelving and reinsertion #3 26 | 27 | ## [0.6.22](https://github.com/Esukhia/pybo/releases/tag/v0.6.22) - 20200710 28 | ### Added 29 | * #5 Add optional "--tags" to tok command to select and order token tags 30 | 31 | ## [0.6.21](https://github.com/Esukhia/pybo/releases/tag/v0.6.21) - 20191215 32 | ### Added 33 | * add profile-update to CLI 34 | ### Changed 35 | * use Token.text_cleaned whenever possible, fallback to Token.text otherwise 36 | * output of `pybo rdr` and `pybo profile-report` 37 | 38 | ## [0.6.20](https://github.com/Esukhia/pybo/releases/tag/v0.6.20) - 20191213 39 | ### Added 40 | * Support for `object.suffixL` in CQL rule creation. 41 | 42 | ## [0.6.19](https://github.com/Esukhia/pybo/releases/tag/v0.6.19) - 20191210 43 | ### Fixed 44 | * import bug fixed 45 | 46 | ## [0.6.18](https://github.com/Esukhia/pybo/releases/tag/v0.6.18) - 20191210 47 | ### Added 48 | * botok profile report: `pybo profile-report ` 49 | Finds out all duplicates over all the folders and files. 
50 | 51 | ## [0.6.17](https://github.com/Esukhia/pybo/releases/tag/v0.6.17) - 20191122 52 | ### Fixed 53 | * bad setup 54 | 55 | ## [0.6.16](https://github.com/Esukhia/pybo/releases/tag/v0.6.16) - 20191122 56 | ### Fixed 57 | * bad imports 58 | 59 | ## [0.6.15](https://github.com/Esukhia/pybo/releases/tag/v0.6.15) - 20191122 60 | ### Fixed 61 | * reference to bo_sorted() not removed 62 | 63 | ## [0.6.14](https://github.com/Esukhia/pybo/releases/tag/v0.6.14) - 20191122 64 | ### Fixed 65 | * piycu for Windows from third-party website 66 | * temporarily remove bo_sorted() + CLI command 67 | * fixed rdr_2_replace_matcher bug on first line of rules 68 | ### Added 69 | * cwd CLI command 70 | 71 | ## [0.6.13](https://github.com/Esukhia/pybo/releases/tag/v0.6.13) - 20191109 72 | ### Fixed 73 | * removed pyicu dependency 74 | 75 | ## [0.6.12](https://github.com/Esukhia/pybo/releases/tag/v0.6.12) - 20191109 76 | ### Added 77 | * added rdr_2_replace_matcher in utils 78 | 79 | ## [0.6.11](https://github.com/Esukhia/pybo/releases/tag/v0.6.11) - 20191030 80 | ### Added 81 | * added bo_sort() and the corresponding kakha CLI option 82 | 83 | ## [0.6.10](https://github.com/Esukhia/pybo/releases/tag/v0.6.10) - 20190901 84 | ### Added 85 | * added pyewts to pybo 86 | 87 | ## [0.6.9](https://github.com/Esukhia/pybo/releases/tag/v0.6.9) - 20190901 88 | ### Added 89 | * the tokenizer's codebase is extracted from pybo and now lives in botok. All the related history is brought out to that project. 90 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # PYBO - Tibetan NLP in Python 4 | [![PyPI version](https://badge.fury.io/py/pybo.svg)](https://badge.fury.io/py/pybo) 5 | ![Test](https://github.com/Esukhia/pybo/workflows/Test/badge.svg) 6 | ![Test Coverage](https://github.com/Esukhia/pybo/workflows/Test%20Coverage/badge.svg) 7 | ![Publish](https://github.com/Esukhia/pybo/workflows/Publish/badge.svg) 8 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://black.readthedocs.io/en/stable/) 9 | 10 | 11 | 12 | ## Overview 13 | 14 | bo tokenizes Tibetan text into words. 15 | 16 | ### Basic usage 17 | 18 | 19 | #### Getting started 20 | Requires to have Python3 installed. 21 | 22 | python3 -m pip install pybo 23 | 24 | #### Tokenizing a string 25 | 26 | ```bash 27 | drupchen@drupchen:~$ bo tok-string "༄༅། །རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྻ་ཨ་བ་ཏ་ར། བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པ། ། 28 | སངས་རྒྱས་དང་བྱང་ཆུབ་སེམས་དཔའ་ཐམས་ཅད་ལ་ཕྱག་འཚལ་ལོ། །བདེ་གཤེགས་ཆོས་ཀྱི་སྐུ་མངའ་སྲས་བཅས་དང༌། །ཕྱག་འོས་ཀུན་ལའང་གུས་པར་ཕྱག་འཚལ་ཏེ། །བདེ་གཤེགས་ 29 | སྲས་ཀྱི་སྡོམ་ལ་འཇུག་པ་ནི། །ལུང་བཞིན་མདོར་བསྡུས་ནས་ནི་བརྗོད་པར་བྱ། །" 30 | Loading Trie... (2s.) 31 | ༄༅།_། རྒྱ་གར་ སྐད་ དུ །_ བོ་ དྷི་ སཏྭ་ ཙརྻ་ ཨ་བ་ ཏ་ ར །_ བོད་སྐད་ དུ །_ བྱང་ཆུབ་ སེམས་དཔ འི་ སྤྱོད་པ་ ལ་ འཇུག་པ །_། སངས་རྒྱས་ དང་ བྱང་ཆུབ་ 32 | སེམས་དཔའ་ ཐམས་ཅད་ ལ་ ཕྱག་ འཚལ་ ལོ །_། བདེ་གཤེགས་ ཆོས་ ཀྱི་ སྐུ་ མངའ་ སྲས་ བཅས་ དང༌ །_། ཕྱག་འོས་ ཀུན་ ལ འང་ གུས་པ ར་ ཕྱག་ འཚལ་ 33 | ཏེ །_། བདེ་གཤེགས་ སྲས་ ཀྱི་ སྡོམ་ ལ་ འཇུག་པ་ ནི །_། ལུང་ བཞིན་ མདོར་བསྡུས་ ནས་ ནི་ བརྗོད་པ ར་ བྱ །_། 34 | ``` 35 | 36 | #### Tokenizing a list of files 37 | 38 | The command to tokenize a list of files in a directory: 39 | ``` 40 | bo tok 41 | ``` 42 | 43 | For example to tokenize the file `text.txt` in a directory `./document/` with the following content: 44 | ``` 45 | བཀྲ་ཤི་ས་བདེ་ལེགས་ཕུན་སུམ་ཚོགས། །རྟག་ཏུ་བདེ་བ་ཐོབ་པར་ཤོག། ། 46 | ``` 47 | 48 | I use the command: 49 | ``` 50 | $ bo tok ./document/ 51 | ``` 52 | 53 | ...which create a file `text.txt` in a directory `./document_pybo` containing: 54 | ``` 55 | བཀྲ་ ཤི་ ས་ བདེ་ལེགས་ ཕུན་སུམ་ ཚོགས །_། རྟག་ ཏུ་ བདེ་བ་ ཐོབ་པ ར་ ཤོག །_། 56 | ``` 57 | 58 | ### Sorting Tibetan words 59 | ```bash 60 | $ bo kakha to-sort.txt 61 | ``` 62 | The expected input is one word or entry per line in a .txt file. The file will be overwritten. 63 | 64 | ### FNR - Find and Replace with a list of regexes 65 | 66 | ``` 67 | bo fnr -o -t 68 | ``` 69 | `-o` and `-t` are optional 70 | 71 | Text files should be UTF-8 plain text files. The regexes should be in the following format: 72 | 73 | ``` 74 | - 75 | ``` 76 | 77 | ## Acknowledgements 78 | 79 | - **pybo** is an open source library for Tibetan NLP. 80 | 81 | We are always open to cooperation in introducing new features, tool integrations and testing solutions. 82 | 83 | Many thanks to the companies and organizations who have supported pybo's development, especially: 84 | 85 | * [Khyentse Foundation](https://khyentsefoundation.org) for contributing USD22,000 to kickstart the project 86 | * The [Barom/Esukhia canon project](http://www.barom.org) for sponsoring training data curation 87 | * [BDRC](https://tbrc.org) for contributing 2 staff for 6 months for data curation 88 | 89 | - `third_party/rules.txt` is taken from [tibetan-collation](https://github.com/eroux/tibetan-collation/blob/master/implementations/Unicode/rules.txt). 
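## Using pybo from Python

The CLI commands shown above are thin wrappers around botok and the pipeline functions in `pybo/pipeline/pipes.py`. The snippet below is a minimal, illustrative sketch (not part of the original README) of calling the tokenizer directly from Python; it mirrors the calls made by the `tok-string` and `tok` commands in `pybo/cli.py`, and the file names used here are placeholders.

```python
# Illustrative sketch only: mirrors the `tok-string` and `tok` commands in pybo/cli.py.
from pathlib import Path

from botok import Config, Text, WordTokenizer
from pybo.pipeline.pipes import pybo_form, pybo_mod, pybo_prep

# String-level tokenization, as done by `bo tok-string`:
t = Text("བཀྲ་ཤི་ས་བདེ་ལེགས་ཕུན་སུམ་ཚོགས།")
print(t.tokenize_words_raw_lines)

# File-level tokenization with the pybo pipeline, as done by `bo tok`:
config = Config()  # default botok configuration
wt = WordTokenizer(config=config, build_trie=False)

def pybo_tok(in_str):
    return wt.tokenize(in_str)

# "in.txt" and "in_tok.txt" are placeholder file names.
text = Text(Path("in.txt"), Path("in_tok.txt"))
text.custom_pipeline(pybo_prep, pybo_tok, pybo_mod, pybo_form)
```

`pybo_prep`, `pybo_tok`, `pybo_mod` and `pybo_form` are the same preparation, tokenization, modification and formatting steps the CLI chains around botok's tokenizer.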
90 | 91 | ## Contributing 92 | First clone this repo, create a virtual environment and activate it, then install the dependencies: 93 | ```bash 94 | $ pip install -e . 95 | $ pip install -r requirements-dev.txt 96 | ``` 97 | 98 | Next, set up [pre-commit](https://pre-commit.com/) by creating the pre-commit git hook: 99 | ```bash 100 | $ pre-commit install 101 | ``` 102 | Please follow the [Angular commit message format](https://github.com/angular/angular/blob/master/CONTRIBUTING.md#-commit-message-format) for commit messages. We have set up [python-semantic-release](https://github.com/relekang/python-semantic-release) to publish the [pybo](https://pypi.org/project/pybo/) package automatically based on commit messages. 103 | 104 | That's all. Enjoy contributing 🎉🎉🎉 105 | 106 | ## License 107 | 108 | The Python code is Copyright (C) 2019 Esukhia, provided under [Apache 2](LICENSE). 109 | 110 | Contributors: 111 | * [Drupchen](https://github.com/drupchen) 112 | * [Élie Roux](https://github.com/eroux) 113 | * [Ngawang Trinley](https://github.com/ngawangtrinley) 114 | * [Tenzin](https://github.com/10zinten) 115 | * Joyce Mackzenzie for reworking the logo 116 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-architect -------------------------------------------------------------------------------- /pybo/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import pyewts 4 | from botok import * 5 | 6 | from .corpus.parse_corrected import extract_new_entries, parse_corrected 7 | from .pipeline.pipes import pybo_form, pybo_mod, pybo_prep 8 | from .utils.profile_report import profile_report 9 | from .utils.regex_batch_apply import batch_apply_regex, get_regex_pairs 10 | 11 | __version__ = "0.8.0" 12 | -------------------------------------------------------------------------------- /pybo/cli.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | from shutil import rmtree 4 | 5 | import click 6 | from bordr import __version__ as bordr__version 7 | from botok import Config, Text, WordTokenizer 8 | from botok import __version__ as botok__version__ 9 | from botok import expose_data 10 | from pyewts import VERSION as pyewts__version__ 11 | from tibetan_sort import TibetanSort 12 | from tibetan_sort import __version__ as tibetan_sort__version__ 13 | 14 | from pybo import __version__ as pybo__version__ 15 | from pybo.corpus.parse_corrected import extract_new_entries 16 | from pybo.pipeline.pipes import pybo_form, pybo_mod, pybo_prep 17 | from pybo.rdr.rdr import rdr as r 18 | from pybo.rdr.rdr_2_replace_matcher import rdr_2_replace_matcher 19 | from pybo.utils.profile_report import profile_report as p_report 20 | from pybo.utils.regex_batch_apply import batch_apply_regex, get_regex_pairs 21 | from pybo.hfr_cqlr_converter import cqlr2hfr, hfr2cqlr 22 | from pybo.segmentation_rule.pipeline import extract_seg_rule 23 | 24 | HOME = Path.home() 25 | DIALECT_PACK_DIR = HOME / "Documents" / "pybo" / "dialect_packs" 26 | DEFAULT_DPACK = "general" 27 | CONFIG_DIR = HOME / ".pybo" 28 | CONFIG_FILE = CONFIG_DIR / "config.json" 29 | 30 | 31 | @click.group() 32 | @click.version_option(pybo__version__) 33 | def cli(): 34 | pass 35 | 36 | 37 | @cli.command() 38 | def info(): 39 | click.echo("pybo install path: " + 
str(Path(__file__).parent.resolve())) 40 | click.echo("pybo: " + pybo__version__) 41 | click.echo("botok: " + botok__version__) 42 | click.echo("pyewts: " + pyewts__version__) 43 | click.echo("bordr: " + bordr__version) 44 | click.echo("tibetan_sort: " + tibetan_sort__version__) 45 | 46 | 47 | def prepare_folder(main=None, custom=None, overwrite=False): 48 | profile = "POS" 49 | # 1. MAIN PROFILE 50 | if not main: 51 | # for better Windows support: 52 | # https://stackoverflow.com/questions/6227590/finding-the-users-my-documents-path/6227623#6227623 53 | main = Path.home() / "Documents/pybo/main" 54 | else: 55 | main = Path(main) 56 | main.mkdir(parents=True, exist_ok=True) 57 | 58 | if overwrite: 59 | rmtree(main, ignore_errors=True) 60 | main.mkdir() 61 | 62 | try: 63 | expose_data(main, profile=profile) 64 | except IOError: 65 | click.echo('using the existing data in "Documents/pybo/main/"') 66 | 67 | # 2. CUSTOM PROFILE 68 | if not custom: 69 | custom = Path.home() / "Documents/pybo/custom" 70 | else: 71 | custom = Path(custom) 72 | custom.mkdir(exist_ok=True) 73 | for dir in ["adjustment", "remove", "words", "words_skrt"]: 74 | Path(custom / dir).mkdir(exist_ok=True) 75 | 76 | return main, custom 77 | 78 | 79 | def save_config(dialect_pack_path): 80 | config = {"dialect_pack_path": str(dialect_pack_path)} 81 | if not CONFIG_DIR.is_dir(): 82 | CONFIG_DIR.mkdir(parents=True) 83 | json.dump(config, CONFIG_FILE.open("w")) 84 | 85 | 86 | def load_config(): 87 | if not CONFIG_FILE.is_file(): 88 | return 89 | else: 90 | config = json.load(CONFIG_FILE.open()) 91 | return config 92 | 93 | 94 | # Tokenize file 95 | @cli.command() 96 | @click.argument("input-dir", type=click.Path(exists=True)) 97 | @click.option( 98 | "-t", 99 | "--tags", 100 | help="""Select and order the tags. Available tags are: 101 | t-clean_text, p-pos, l-lemma, s-sense.\n 102 | Usage: `-t tpl` will give for every token `///` 103 | and will give just `` if tag option is not specified.""", 104 | ) 105 | @click.option( 106 | "-o", type=click.Path(exists=True), help="output dir, default is the input_dir" 107 | ) 108 | @click.option("-d", "--dialect-name", type=str, help="official dialect pack name.") 109 | @click.option( 110 | "-p", 111 | "--dialect-path", 112 | type=click.Path(exists=True), 113 | help="path to the dialect pack", 114 | ) 115 | @click.option("-w", "--overwrite", is_flag=True) 116 | @click.option("-r", "--rebuild-trie", is_flag=True) 117 | def tok(**kwargs): 118 | input_dir = Path(kwargs["input_dir"]) 119 | dialect_name = kwargs["dialect_name"] 120 | dialect_path = kwargs["dialect_path"] 121 | # overwrite = kwargs["overwrite"] 122 | rebuild = kwargs["rebuild_trie"] 123 | 124 | # load botok config 125 | if dialect_name: 126 | config = Config(dialect_name=dialect_name) 127 | save_config(config.dialect_pack_path) 128 | elif dialect_path: 129 | config = Config.from_path(dialect_path) 130 | # config.dialect_pack_path = Path(dialect_pack_path) 131 | save_config(config.dialect_pack_path) 132 | else: 133 | pybo_config = load_config() 134 | if not pybo_config: 135 | config = Config() 136 | save_config(config.dialect_pack_path) 137 | else: 138 | dialect_pack_path = pybo_config["dialect_pack_path"] 139 | config = Config.from_path(dialect_pack_path) 140 | 141 | print( 142 | f"[INFO] Using `{config.dialect_pack_path.name}` dialect pack for tokenization ..." 
143 | ) 144 | 145 | wt = WordTokenizer(config=config, build_trie=rebuild) 146 | 147 | def pybo_tok(in_str): 148 | return wt.tokenize(in_str) 149 | 150 | # Select and Order the tags 151 | if kwargs["tags"]: 152 | pybo_mod.__defaults__ = (list(kwargs["tags"]),) 153 | 154 | if input_dir.is_dir(): 155 | if kwargs["o"] is not None: 156 | output_dir = Path(kwargs["o"]) 157 | else: 158 | output_dir = input_dir.parent / (input_dir.name + "_tok") 159 | output_dir.mkdir(exist_ok=True) 160 | for f in input_dir.glob("*.txt"): 161 | out_file = output_dir / (f.stem + "_tok.txt") 162 | text = Text(f, out_file) 163 | text.custom_pipeline(pybo_prep, pybo_tok, pybo_mod, pybo_form) 164 | elif input_dir.is_file(): 165 | input_file = input_dir 166 | if kwargs["o"] is not None: 167 | output_dir = Path(kwargs["o"]) 168 | else: 169 | output_dir = input_file.parent / (input_file.stem + "_tok") 170 | output_dir.mkdir(exist_ok=True) 171 | out_file = output_dir / (input_file.stem + "_tok.txt") 172 | text = Text(input_file, out_file) 173 | text.custom_pipeline(pybo_prep, pybo_tok, pybo_mod, pybo_form) 174 | else: 175 | print("[INFO] Invalid input directory or file!!!") 176 | 177 | 178 | # Tokenize string 179 | @cli.command() 180 | @click.argument("string") 181 | def tok_string(**kwargs): 182 | t = Text(kwargs["string"]) 183 | click.echo(t.tokenize_words_raw_lines) 184 | 185 | 186 | # lists 187 | tag_types = ["pos", "lemma", "sense"] 188 | 189 | 190 | @cli.command() 191 | @click.argument("input-dir", type=click.Path(exists=True)) 192 | @click.option("-t", "--type") 193 | def lists(**kwargs): 194 | path = Path(kwargs["path"]) 195 | 196 | text_string = "" 197 | for f in path.glob("*.txt"): 198 | text_string += f.read_text(encoding="utf-8-sig") 199 | 200 | 201 | # create report for botok profiles 202 | @cli.command() 203 | @click.argument("profile", type=click.Path(exists=True)) 204 | def profile_report(**kwargs): 205 | p_report(kwargs["profile"]) 206 | 207 | 208 | # rdr_2_replace_matcher 209 | @cli.command() 210 | @click.argument("infile", type=click.Path(exists=True)) 211 | def rdr2repl(**kwargs): 212 | infile = Path(kwargs["infile"]) 213 | outfile = infile.parent / (infile.stem + ".yaml") 214 | dump = infile.read_text(encoding="utf-8-sig") 215 | processed = rdr_2_replace_matcher(dump) 216 | outfile.write_text(processed, encoding="utf-8-sig") 217 | 218 | 219 | # sort in the Tibetan order 220 | @cli.command() 221 | @click.argument("infile", type=click.Path(exists=True)) 222 | def kakha(**kwargs): 223 | sort = TibetanSort() 224 | infile = Path(kwargs["infile"]) 225 | words = infile.read_text(encoding="utf-8-sig").split() 226 | print(f"Sorting {infile.name}") 227 | words = sort.sort_list(words) 228 | print(f"{infile.name} is sorted") 229 | infile.write_text("\n".join(words), encoding="utf-8-sig") 230 | 231 | 232 | # generate rdr rules 233 | @cli.command() 234 | @click.argument("input", type=click.Path(exists=True)) 235 | @click.option("-dp", type=str, help="Dialect pack name, default is general") 236 | @click.option("-k", "--keep", type=str) 237 | @click.option('--type', type=str, help="Type can be either cql which is default type or hfr(Human friendly rule)") 238 | def extract_rules(**kwargs): 239 | file_or_dir = Path(kwargs["input"]) 240 | dialect_pack_name = kwargs["dp"] if kwargs["dp"] else DEFAULT_DPACK 241 | keep = "none" if kwargs["keep"] is None else kwargs["keep"] 242 | type = "cql" if kwargs["type"] is None else kwargs["type"] 243 | if type == "cql": 244 | out_dir = DIALECT_PACK_DIR / dialect_pack_name / 
"adjustments" / "rules" 245 | else: 246 | out_dir = DIALECT_PACK_DIR / dialect_pack_name / "hfr_rules" 247 | out_dir.mkdir(exist_ok=True) 248 | 249 | log = None 250 | click.echo("[INFO] Extracing adjustments rules ...") 251 | if file_or_dir.is_dir(): 252 | file = file_or_dir / file_or_dir.name 253 | with open(file, encoding="utf-8-sig", mode="w") as tmp: 254 | for f in file_or_dir.glob("*.txt"): 255 | tmp.write(f.read_text(encoding="utf-8-sig") + " ") 256 | log = r(file, outdir=out_dir, keep=keep, type=type) 257 | file.unlink() 258 | elif file_or_dir.is_file(): 259 | log = r(file_or_dir, out_dir, keep=keep, type=type) 260 | click.echo(f"[INFO] {file_or_dir} does not exist!") 261 | 262 | click.echo(log) 263 | click.echo("[INFO] Completed !") 264 | click.echo(f"[INFO] Added adjustments rules to {dialect_pack_name}") 265 | 266 | # generate rdr rules 267 | @cli.command() 268 | @click.argument("input", type=click.Path(exists=True)) 269 | @click.option("-dp", type=str, help="Dialect pack name, default is general") 270 | @click.option('--type', type=str, help="Type can be either cql which is default type or hfr(Human friendly rule)") 271 | @click.option("--e", type=int) 272 | def extract_seg_rules(**kwargs): 273 | rules = '' 274 | input_path = Path(kwargs["input"]) 275 | dialect_pack_name = kwargs["dp"] if kwargs["dp"] else DEFAULT_DPACK 276 | type = "cql" if kwargs["type"] is None else kwargs["type"] 277 | epochs = 3 if kwargs['e'] is None else kwargs['e'] 278 | if type == "cql": 279 | out_dir = DIALECT_PACK_DIR / dialect_pack_name / "adjustments" / "rules" 280 | else: 281 | out_dir = DIALECT_PACK_DIR / dialect_pack_name / "hfr_rules" 282 | out_dir.mkdir(exist_ok=True) 283 | 284 | click.echo("[INFO] Extracing adjustments rules ...") 285 | 286 | if input_path.is_dir(): 287 | print('[ERROR] Invalid file name!!') 288 | elif input_path.is_file(): 289 | rules += extract_seg_rule(input_path, dialect_pack_name, type, epochs) 290 | if rules: 291 | (out_dir / f'{input_path.stem}_rules.tsv').write_text(rules, encoding='utf-8') 292 | else: 293 | print('[INFO] No rules found') 294 | 295 | click.echo("[INFO] Completed !") 296 | click.echo(f"[INFO] Added adjustments rules to {dialect_pack_name}") 297 | 298 | #convert cql to hfr 299 | @cli.command() 300 | @click.argument("input", type=click.Path(exists=True)) 301 | @click.option("-dp", type=str, help="Dialect pack name, default is general") 302 | def convert_cql2hfr(**kwargs): 303 | cql_path = Path(kwargs['input']) 304 | dialect_pack_name = kwargs["dp"] if kwargs["dp"] else DEFAULT_DPACK 305 | hfr_dir = DIALECT_PACK_DIR / dialect_pack_name / "hfr_rules" 306 | hfr_dir.mkdir(exist_ok=True) 307 | hfr_file_path = hfr_dir / (cql_path.stem + ".tsv") 308 | cql_rules = cql_path.read_text(encoding='utf-8') 309 | hfr = cqlr2hfr(cql_rules) 310 | hfr_file_path.write_text(hfr, encoding='utf-8') 311 | 312 | #convert hfr to cql 313 | @cli.command() 314 | @click.argument("input", type=click.Path(exists=True)) 315 | @click.option("-dp", type=str, help="Dialect pack name, default is general") 316 | def convert_hfr2cql(**kwargs): 317 | hfr_path = Path(kwargs['input']) 318 | dialect_pack_name = kwargs["dp"] if kwargs["dp"] else DEFAULT_DPACK 319 | cql_dir = DIALECT_PACK_DIR / dialect_pack_name / "adjustments" / "rules" 320 | cql_dir.mkdir(exist_ok = True) 321 | cql_file_path = cql_dir / (hfr_path.stem + ".tsv") 322 | hfr = hfr_path.read_text(encoding='utf-8') 323 | cql = hfr2cqlr(hfr) 324 | cql_file_path.write_text(cql, encoding='utf-8') 325 | 326 | 327 | # extract new entries 
from manually corrected texts + existing profile 328 | @cli.command() 329 | @click.argument("corrected-path", type=click.Path(exists=True)) 330 | @click.argument("dialect_path", type=click.Path(exists=True)) 331 | @click.option("-o", "--out-dir", type=click.Path(exists=True)) 332 | def profile_update(**kwargs): 333 | corrected = Path(kwargs["corrected_path"]) 334 | dialect_path = Path(kwargs["dialect_path"]) 335 | out_dir = Path(kwargs["out_dir"]) if kwargs["out_dir"] else None 336 | 337 | dump = "" 338 | for f in corrected.glob("*.txt"): 339 | dump += f.read_text(encoding="utf-8-sig") + "\n" 340 | 341 | rules = extract_new_entries(dump, dialect_path) 342 | if not out_dir: 343 | out = corrected.parent / (corrected.name + "_words.tsv") 344 | else: 345 | out = out_dir / (corrected.name + "_words.tsv") 346 | 347 | if not out.parent.is_dir(): 348 | out.parent.mkdir(exist_ok=True) 349 | 350 | out.write_text(rules, encoding="utf-8-sig") 351 | 352 | 353 | # FNR - Find and Replace with a list of regexes 354 | @cli.command() 355 | @click.argument("in-dir", type=click.Path(exists=True)) 356 | @click.argument("regex-file", type=click.Path(exists=True)) 357 | @click.option("-o", "--out-dir", type=click.Path(exists=True)) 358 | @click.option("-t", "--tag") 359 | def fnr(**kwargs): 360 | # get the args 361 | indir = Path(kwargs["in_dir"]) 362 | regex_file = Path(kwargs["regex_file"]) 363 | out_dir = Path(kwargs["out_dir"]) if kwargs["out_dir"] else None 364 | 365 | if not indir.is_dir(): 366 | click.echo("IN_DIR should be a folder, not a file.\nexiting...") 367 | exit(1) 368 | 369 | # optional out file tag 370 | tag = kwargs["tag"] if kwargs["tag"] else regex_file.stem 371 | 372 | # generate rules 373 | rules = get_regex_pairs(regex_file.open(encoding="utf-8-sig").readlines()) 374 | 375 | # apply on each file, prefixing each one with the regex filename 376 | for f in indir.rglob("*.txt"): 377 | if not f.stem.startswith("_"): 378 | string = f.read_text(encoding="utf-8-sig") 379 | out = batch_apply_regex(string, rules) 380 | name = f"_{tag}_" + f.name 381 | if out_dir: 382 | Path(out_dir).mkdir(parents=True, exist_ok=True) 383 | outfile = out_dir / name 384 | else: 385 | outfile = f.parent / name 386 | outfile.write_text(out, encoding="utf-8-sig") 387 | 388 | 389 | if __name__ == "__main__": 390 | # cli() 391 | save_config("test_path") 392 | -------------------------------------------------------------------------------- /pybo/corpus/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/pybo/c65ee83a0659f721bccdf48db4901360e7d97048/pybo/corpus/__init__.py -------------------------------------------------------------------------------- /pybo/corpus/parse_corrected.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import re 3 | 4 | from tibetan_sort import TibetanSort 5 | 6 | from ..utils.profile_entries import profile_entries 7 | from .word_cleanup import word_cleanup 8 | 9 | sort = TibetanSort() 10 | 11 | 12 | def parse_corrected(in_str): 13 | # prepare string: replace returns and tabs and multiple spaces by a single space 14 | in_str = in_str.replace("\n", " ").replace("\t", " ") 15 | in_str = re.sub(r"\s+", " ", in_str) 16 | 17 | # parse 18 | sep_field = "/" 19 | parsed = [] 20 | for token in in_str.split(): 21 | fields = ["", "", "", "", ""] 22 | for num, f in enumerate(token.split(sep_field)): 23 | # cleanup the form and the lemma 24 | if (num == 0 or num == 2) 
and f: 25 | f = word_cleanup(f) 26 | fields[num] = f 27 | parsed.append(fields) 28 | return parsed 29 | 30 | 31 | def extract_new_entries(in_str, profile_path): 32 | entries = profile_entries(profile_path) 33 | 34 | # parse input 35 | parsed = parse_corrected(in_str) 36 | 37 | # generate content, without duplicates 38 | entry_data = [] 39 | for p in parsed: 40 | word = p[0] 41 | e_d = "\t".join(p) 42 | if (word not in entries or e_d not in entries[word]) and e_d not in entry_data: 43 | entry_data.append(e_d) 44 | 45 | # sort both lists 46 | # words = sort.sort_list(words) 47 | entry_data = sort.sort_list(entry_data) 48 | entry_data = ["# form pos lemma sense freq"] + entry_data 49 | 50 | return "\n".join(entry_data) 51 | -------------------------------------------------------------------------------- /pybo/corpus/word_cleanup.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from botok import NAMCHE, TSEK, TokChunks 3 | 4 | 5 | def word_cleanup(string): 6 | """If it is Tibetan text, returns the cleaned up syllables, otherwise the original string""" 7 | 8 | def join_syls(syls): 9 | return "".join([syl if syl.endswith(NAMCHE) else syl + TSEK for syl in syls]) 10 | 11 | syls = TokChunks(string).get_syls() 12 | if syls: 13 | return join_syls(syls) 14 | else: 15 | return string 16 | -------------------------------------------------------------------------------- /pybo/hfr_cqlr_converter.py: -------------------------------------------------------------------------------- 1 | import re 2 | from pathlib import Path 3 | 4 | cql2hfr_tag = { 5 | '"ADJ"': "རྒྱན", 6 | '"ADP"': "སྦྱོར", 7 | '"ADV"': "བསྣན", 8 | '"AUX"': "གྲོགས", 9 | '"CCONJ"': "སྦྲེལ", 10 | '"DET"': "ངེས", 11 | '"INTJ"': "འབོད", 12 | '"NOUN"': "མིང", 13 | '"NUM"': "གྲངས", 14 | '"PRON"': "ཚབ", 15 | '"PROPN"': "ཁྱད", 16 | '"PUNCT"': "ཚེག", 17 | '"SCONJ"': "ལྟོས", 18 | '"VERB"': "བྱ", 19 | '"PART"': "རོགས", 20 | "pos=": "གཤིས=", 21 | "lemma=": "མ=", 22 | "sense=": "དོན=", 23 | "&": "༈", 24 | "[": "༺", 25 | "]": "༻", 26 | } 27 | 28 | 29 | def cqlr2hfr(cqlr_string): 30 | """Convert corpus queery language(cql) rules to human friendly rules(hfr) which has UDPOS in Tibetan. 31 | 32 | Args: 33 | cql_string (str): corpus queery language rules 34 | 35 | Returns: 36 | str: human friendly rules(in Tibetan language) 37 | """ 38 | hfr_string = cqlr_string 39 | for cql_tag, hfr_tag in cql2hfr_tag.items(): 40 | hfr_string = hfr_string.replace(cql_tag, hfr_tag) 41 | return hfr_string 42 | 43 | 44 | def hfr2cqlr(hfr_string): 45 | """Convert human friendly rules(hfr) to corpus queery language rules format. 46 | 47 | Args: 48 | hfr_string (str): Human friendly rules(hfr) 49 | 50 | Returns: 51 | str: Corpus queery language(cql) rules format. 
52 | """ 53 | cql_string = hfr_string 54 | for cql_tag, hfr_tag in cql2hfr_tag.items(): 55 | cql_string = cql_string.replace(hfr_tag, cql_tag) 56 | return cql_string 57 | -------------------------------------------------------------------------------- /pybo/monlam2wordlist.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import re 3 | 4 | from botok import Text 5 | 6 | ID = -1 7 | 8 | POS_NAMES = (" མིང་ཚིག ", " བྱ་ཚིག ", " བྱེད་ཚིག ", " གྲོགས་ཚིག ") 9 | 10 | 11 | class cols: 12 | ID = "ID" 13 | FORM = "Form" 14 | LEMMA = "Lemma" 15 | MON_POS = "MonPOS" 16 | MON_FEATURE = "MonFeature" 17 | MON_TAG = "MonTag" 18 | POS = "POS" 19 | FEATURE = "Feature" 20 | MORPH = "Morph" 21 | SENSE_TAG = "SenseTag" 22 | DEFINITION = "Definition" 23 | EXAMPLE = "Example" 24 | 25 | 26 | def create_word( 27 | form, 28 | lemma, 29 | mon_pos=None, 30 | mon_feature=None, 31 | mon_tag=None, 32 | pos=None, 33 | feature=None, 34 | morph=None, 35 | sense_tag=None, 36 | definition=None, 37 | example=None, 38 | ): 39 | global ID 40 | ID += 1 41 | return { 42 | "ID": ID, 43 | "Form": form, 44 | "Lemma": lemma, 45 | "MonPOS": mon_pos, 46 | "MonFeature": mon_feature, 47 | "MonTag": mon_tag, 48 | "POS": pos, 49 | "Feature": feature, 50 | "Morph": feature, 51 | "SenseTag": sense_tag, 52 | "Definition": definition, 53 | "Example": example, 54 | } 55 | 56 | 57 | def csv_loader(path): 58 | with open(path, "r") as csv_file: 59 | reader = csv.reader(csv_file) 60 | for i, row in enumerate(reader): 61 | if i == 0: 62 | continue 63 | yield row 64 | 65 | 66 | def get_single_pos(chunk_containing_pos): 67 | """Return only first pos and it's content.""" 68 | pos_char_end_idx = chunk_containing_pos.find(" ") 69 | pos = chunk_containing_pos[:pos_char_end_idx] 70 | pos_content = chunk_containing_pos[(pos_char_end_idx + 1) :] 71 | return pos, pos_content 72 | 73 | 74 | def find_all_remaining_pos(chunk): 75 | """Return all pos position and it's length. 76 | 77 | Return: 78 | pos_start_idxs (list): [(pos_start_idx, len(pos_name)), ...] sorted on pos_start_idx. 79 | 80 | """ 81 | pos_start_idxs = [] 82 | pos_found = False 83 | for pos_name in POS_NAMES: 84 | pos_found = True 85 | pos_start_idx = chunk.find(pos_name) 86 | if pos_start_idx != -1: 87 | pos_start_idxs.append((pos_start_idx, len(pos_name))) 88 | if pos_found: 89 | pos_start_idxs.append((len(chunk), 0)) 90 | return sorted(pos_start_idxs, key=lambda x: x[0]) 91 | 92 | 93 | def get_pos_list(text): 94 | """Parse pos and it's content (mon_tags, definitions) in string. 95 | 96 | Returns: 97 | post_list (list): [(pos, pos_content), ...] 98 | """ 99 | 100 | pos_list = [] 101 | chunks_containing_pos = text.split(" 1. 
") 102 | estimated_n_pos = len(chunks_containing_pos) 103 | if estimated_n_pos == 1: # one_pos_one_sense 104 | chunk_containing_pos = chunks_containing_pos[0].strip() 105 | pos, pos_content = get_single_pos(chunk_containing_pos) 106 | pos_list.append((pos, pos_content)) 107 | elif estimated_n_pos == 2: # one_pos_multi_senses 108 | pos, pos_content = chunks_containing_pos 109 | pos_list.append((pos, pos_content)) 110 | else: # multi_pos_multi_senses 111 | pos = chunks_containing_pos[0] 112 | for i, chunk_containing_pos in enumerate(chunks_containing_pos[1:]): 113 | if i == estimated_n_pos - 2: # if last chunk, check for all pos 114 | new_chunk_start = 0 115 | next_pos_start_idxs = find_all_remaining_pos(chunk_containing_pos) 116 | for next_pos_start_idx, pos_name_len in next_pos_start_idxs: 117 | pos_content = chunk_containing_pos[ 118 | new_chunk_start:next_pos_start_idx 119 | ] 120 | pos_list.append((pos, pos_content)) 121 | pos = chunk_containing_pos[ 122 | next_pos_start_idx : next_pos_start_idx + pos_name_len 123 | ].strip() 124 | new_chunk_start = next_pos_start_idx + pos_name_len 125 | if not next_pos_start_idxs: 126 | pos_list.append((pos, chunk_containing_pos)) 127 | else: 128 | next_pos_start_idx = chunk_containing_pos.rfind(" ") 129 | pos_content = chunk_containing_pos[:next_pos_start_idx] 130 | pos_list.append((pos, pos_content)) 131 | pos = chunk_containing_pos[next_pos_start_idx + 1 :] 132 | 133 | return pos_list 134 | 135 | 136 | def get_definition_list(pos_list): 137 | """Parse definitions from pos_content. 138 | 139 | Returns: 140 | definition_list (list): [(pos, definition-content), ...] 141 | 142 | """ 143 | definition_list = [] 144 | for pos, pos_content in pos_list: 145 | for definition_content in re.split(r" \d\. ", pos_content): 146 | definition_list.append((pos, definition_content)) 147 | return definition_list 148 | 149 | 150 | def get_tag_list(definition_list): 151 | """Parse monlam tag from definition content. 152 | 153 | Returns: 154 | tag_list (list): [(pos, tag, definition), ...] 155 | """ 156 | 157 | def parse_tag(text): 158 | if text[0] != "༡": 159 | return "", text 160 | tag_end_idx = text.find(" ") 161 | tag = text[:tag_end_idx] 162 | definition = text[tag_end_idx + 1 :] 163 | return tag, definition 164 | 165 | tag_list = [] 166 | for pos, definition_content in definition_list: 167 | tag, definition = parse_tag(definition_content) 168 | tag_list.append((pos, tag, definition)) 169 | return tag_list 170 | 171 | 172 | def get_sense_tag_list(tag_list): 173 | """Parse sense from definition. 174 | 175 | Sense here the first word of the given definition. 176 | 177 | Returns: 178 | sense_tag_list (list): [(pos, tag, sense_tag, definition), ...] 
179 | 180 | """ 181 | 182 | def get_first_segment(text, delimiter=" "): 183 | seg_idx = text.find(delimiter) 184 | if seg_idx == -1: 185 | return text 186 | return text[:seg_idx] 187 | 188 | sense_tag_list = [] 189 | for *pos_and_tag, definition in tag_list: 190 | first_segment = get_first_segment(definition) 191 | tokenized_segment = Text(first_segment).tokenize_words_raw_text 192 | sense = get_first_segment(tokenized_segment) 193 | sense_tag_list.append((*pos_and_tag, sense, definition)) 194 | return sense_tag_list 195 | 196 | 197 | def get_example_list(sense_tag_list): 198 | """Parse example from the definition.""" 199 | 200 | def parse_example(text, example_tag="དཔེར་ན།"): 201 | example_start_idx = text.rfind(example_tag) 202 | if example_start_idx == -1: 203 | return text, "" 204 | definition = text[:example_start_idx].strip() 205 | example = text[example_start_idx + len(example_tag) :].strip() 206 | return definition, example 207 | 208 | example_list = [] 209 | for *pos_tag_sense, definition in sense_tag_list: 210 | definition, example = parse_example(definition) 211 | example_list.append((*pos_tag_sense, definition, example)) 212 | return example_list 213 | 214 | 215 | def parse_attrs(form, text_containing_attrs): 216 | pos_list = get_pos_list(text_containing_attrs) 217 | definition_list = get_definition_list(pos_list) 218 | tag_list = get_tag_list(definition_list) 219 | sense_tag_list = get_sense_tag_list(tag_list) 220 | example_list = get_example_list(sense_tag_list) 221 | return example_list 222 | 223 | 224 | def monlam2wordlist(rows): 225 | word_list_rows = [] 226 | for row in rows: 227 | *_, form, result = row 228 | attrs = parse_attrs(form, result) 229 | print(row, attrs) 230 | return word_list_rows 231 | 232 | 233 | def dump_tsv(rows, out_path): 234 | with open(out_path, "w") as csv_file: 235 | writer = csv.writer(csv_file, delimiter="\t") 236 | writer.writerows(rows) 237 | -------------------------------------------------------------------------------- /pybo/pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/pybo/c65ee83a0659f721bccdf48db4901360e7d97048/pybo/pipeline/__init__.py -------------------------------------------------------------------------------- /pybo/pipeline/pipes.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import botok 3 | 4 | 5 | def get_chunks(raw_string): 6 | chunker = botok.Chunks(raw_string) 7 | chunks = chunker.make_chunks() 8 | chunks = chunker.get_readable(chunks) 9 | return chunks 10 | 11 | 12 | def shelve_info(chunks): 13 | shelved = [] 14 | clean_chunks = [] 15 | 16 | syl_count = 0 17 | for i, chunk in enumerate(chunks): 18 | marker, text = chunk 19 | if marker == "TEXT" or marker == "PUNCT": 20 | syl_count += 1 21 | 22 | # 2.a. extract transparent chars 23 | # TODO: adapt to also include \t as transparent char 24 | if "\n" in text: 25 | # remove transparent char 26 | text = text.replace("\n", "") 27 | index = (syl_count, "\n") 28 | 29 | shelved.append(index) 30 | clean_chunks.append((marker, text)) 31 | 32 | # 2.b. extract any non-bo chunk 33 | elif marker != "TEXT" and marker != "PUNCT": 34 | index = (syl_count, text) 35 | shelved.append(index) 36 | 37 | else: 38 | clean_chunks.append(chunk) 39 | 40 | return clean_chunks, shelved 41 | 42 | 43 | def pybo_prep(in_str): 44 | # 1. get chunks 45 | chunks = get_chunks(in_str) 46 | 47 | # 2. 
shelve needed info 48 | chunks, shelved = shelve_info(chunks) 49 | pybo_form_sep = pybo_form.__defaults__[0] 50 | pybo_form.__defaults__ = (pybo_form_sep, shelved) 51 | 52 | # 3. tokenize 53 | str_for_botok = "".join([c[1] for c in chunks]) 54 | 55 | return str_for_botok 56 | 57 | 58 | def get_tag(token, tag_code): 59 | maps = {"r": "text", "t": "text_cleaned", "p": "pos", "l": "lemma", "s": "sense"} 60 | try: 61 | return token[maps[tag_code]] 62 | except Exception: 63 | return "" 64 | 65 | 66 | def pybo_mod(tokens, tag_codes=[]): 67 | """extract text/pos tuples from Token objects""" 68 | txt_tags = [] 69 | for token in tokens: 70 | tags = [] 71 | tags.append(token.text) 72 | # Select and order the tags 73 | for tag_code in tag_codes: 74 | tags.append(get_tag(token, tag_code)) 75 | txt_tags.append(tags) 76 | return txt_tags 77 | 78 | 79 | def ws2uc(tags): 80 | """Convert whitespace in raw-text to underscore.""" 81 | tags[0] = tags[0].replace(" ", "_") 82 | return tags 83 | 84 | 85 | def n_chunks(token): 86 | return len([chunk for chunk in token.split("་") if chunk]) 87 | 88 | 89 | def pybo_form(tokens, sep=" ", shelved=None): 90 | """Format in a single string to be written to file""" 91 | if not shelved: 92 | print(shelved) 93 | out = [] 94 | shelved_idx = 0 95 | syl_count = 0 96 | 97 | # reinsert shelved tokens 98 | for token in tokens: 99 | out.append("/".join(ws2uc(token))) 100 | syl_count += n_chunks(token[0]) 101 | sheveled_syl_count, shelved_cleaned_chunk = shelved[shelved_idx] 102 | if "PART" not in token and sheveled_syl_count <= syl_count: 103 | out.append(ws2uc([shelved_cleaned_chunk])[0]) 104 | shelved_idx += 1 105 | 106 | # add all the remaining sheveld tokens 107 | if shelved_idx < len(shelved): 108 | for _, shelved_cleaned_chunk in shelved_cleaned_chunk[shelved_idx:]: 109 | out.append(ws2uc([shelved_cleaned_chunk])) 110 | else: 111 | out = ["/".join(ws2uc(token)) for token in tokens] 112 | return sep.join(out) 113 | -------------------------------------------------------------------------------- /pybo/rdr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/pybo/c65ee83a0659f721bccdf48db4901360e7d97048/pybo/rdr/__init__.py -------------------------------------------------------------------------------- /pybo/rdr/rdr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from pathlib import Path 3 | from shutil import copyfile 4 | 5 | from bordr import rdr as r 6 | 7 | from .rdr_2_replace_matcher import rdr_2_replace_matcher 8 | 9 | from pybo.hfr_cqlr_converter import cqlr2hfr 10 | 11 | 12 | def rdr_postprocess(rules, infile, outdir=None, keep="model"): 13 | suffixes = [".DICT", ".INIT", ".RAW", ".RDR", ".sDict"] 14 | if not outdir: 15 | outdir = infile.parent.parent 16 | else: 17 | outdir = Path(outdir) 18 | 19 | # write adjustment rules file 20 | adj_file = outdir / (infile.stem + "_rules.tsv") 21 | adj_file.write_text(rules, encoding="utf-8-sig") 22 | 23 | # copy files to output directory 24 | for s in suffixes: 25 | if keep == "all": 26 | src = infile.parent / (infile.name + s) 27 | dst = outdir / (infile.name + s) 28 | if src != dst: 29 | copyfile(src, dst) 30 | Path(infile.parent / (infile.name + s)).unlink() 31 | elif keep == "model": 32 | if s in [".DICT", ".RDR"]: 33 | src = infile.parent / (infile.name + s) 34 | dst = outdir / (infile.name + s) 35 | if src != dst: 36 | copyfile(src, dst) 37 | Path(infile.parent / (infile.name + 
s)).unlink() 38 | else: 39 | Path(infile.parent / (infile.name + s)).unlink() 40 | elif keep == "none": 41 | Path(infile.parent / (infile.name + s)).unlink() 42 | else: 43 | raise SyntaxError("'keep' should either be 'all', 'model' or 'none'.") 44 | 45 | 46 | def rdr(infile, outdir=None, keep="model", type="cql"): 47 | """ 48 | 49 | :param infile: file to process. should be a POS tagged file 50 | :param outdir: optional. should be the output directory 51 | :param keep: all RDR files if "all", the .RDR and .DICT files if "model", none if None 52 | :return: RDR's log 53 | """ 54 | infile = Path(infile).resolve() 55 | 56 | # run the RDR training 57 | log = r(str(infile), mode="train", verbose=True) 58 | 59 | # translate to adjustment tsv 60 | rdr_rules = Path(infile.parent / (infile.name + ".RDR")).read_text( 61 | encoding="utf-8-sig" 62 | ) 63 | rules = rdr_2_replace_matcher(rdr_rules) 64 | if type is not "cql": 65 | rules = cqlr2hfr(rules) 66 | # remove RDR files and copy them if needed 67 | rdr_postprocess(rules, infile, outdir=outdir, keep=keep) 68 | 69 | return log if log else None 70 | -------------------------------------------------------------------------------- /pybo/rdr/rdr_2_replace_matcher.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | # variables 4 | tag = "object.tag" 5 | word = "object.word" 6 | prev_tag = "object.prevTag" 7 | next_tag = "object.nextTag" 8 | prev_word = "object.prevWord" 9 | next_word = "object.nextWord" 10 | conclusion = "object.conclusion" 11 | suffix = "object.suffixL" 12 | op = " == " 13 | ccl_op = " = " 14 | cond_sep = " and " 15 | rule_sep = " : " 16 | cql_rule_sep = " & " 17 | level_sep = "\t" 18 | 19 | positive = [tag, word, next_tag, next_word, conclusion, suffix] 20 | negative = [prev_tag, prev_word] 21 | eq_table = { 22 | tag: "pos", 23 | prev_tag: "pos", 24 | next_tag: "pos", 25 | conclusion: "pos", 26 | word: "text", 27 | prev_word: "text", 28 | next_word: "text", 29 | suffix: "text", 30 | } 31 | 32 | 33 | def rdr_2_replace_matcher(string): 34 | cql = format_rules(find_rules(find_levels(string))) 35 | repl = "{match_cql}\t{replace_index}\t=\t{replace_cql}" 36 | repls = [ 37 | repl.format(match_cql=a, replace_index=b, replace_cql=c) 38 | for a, b, c in cql 39 | if a != c 40 | ] 41 | return "\n".join(repls) 42 | 43 | 44 | def format_rules(rules): 45 | def generate_cql(test): 46 | if len(test) > 2: 47 | s, *_, e = sorted(test) 48 | elif len(test) == 2: 49 | s, e = sorted(test) 50 | else: 51 | s, e = 0, 0 52 | 53 | slots = [] 54 | slot_zero_idx = None 55 | for num, t in enumerate(range(s, e + 1)): 56 | if t == 0: 57 | slot_zero_idx = num + 1 58 | 59 | if t in test: 60 | conds = [f"{eq_table[tag]}={pos}" for tag, pos in test[t]] 61 | slots.append("[" + cql_rule_sep.join(conds) + "]") 62 | else: 63 | slots.append("[]") 64 | assert slot_zero_idx is not None 65 | return " ".join(slots), slot_zero_idx 66 | 67 | cql = [] 68 | for rule in rules: 69 | test_cql, idx = generate_cql(rule["test"]) 70 | ccl_cql, _ = generate_cql(rule["ccl"]) 71 | cql.append((test_cql, idx, ccl_cql)) 72 | return cql 73 | 74 | 75 | def find_levels(string): 76 | out = [] 77 | for line in string.split("\n"): 78 | if not line: 79 | continue 80 | count = 0 81 | while line[0] == level_sep: 82 | count += 1 83 | line = line[1:] 84 | out.append((count, line)) 85 | return out 86 | 87 | 88 | def find_rules(lines): 89 | rules = [] 90 | 91 | # state == {: , ...} 92 | # test == {: (, ), ...} 93 | state = {} 94 
| for level, line in lines: 95 | # if level 0, pass. there is no rule to implement 96 | if level == 0: 97 | continue 98 | 99 | tests, ccl = parse_line(line) 100 | ordered_tests = defaultdict(list) 101 | for t in tests: 102 | for pos, test in t.items(): 103 | ordered_tests[pos].append(test) 104 | 105 | # save current rule in state to use in indented rules 106 | state[level] = ordered_tests 107 | 108 | test = defaultdict(list) 109 | for l in range(1, level + 1): 110 | for pos, t in state[l].items(): 111 | for u in t: 112 | if u not in test[pos]: # avoid duplicates 113 | test[pos].append(u) 114 | rules.append({"test": test, "ccl": ccl}) 115 | return rules 116 | 117 | 118 | def parse_line(line): 119 | rule, ccl = line.split(rule_sep) 120 | tests = rule.split(cond_sep) 121 | ccl = parse_test(ccl) 122 | ccl[0] = [ccl[0]] 123 | tests = [parse_test(t) for t in tests] 124 | return tests, ccl 125 | 126 | 127 | def parse_test(test): 128 | def parser(test, op): 129 | pos = 0 130 | attr, tag = test.split(op) 131 | for p in positive: 132 | if p in attr and len(attr) > len(p): 133 | pos = int(attr[-1]) 134 | attr = attr[:-1] 135 | for n in negative: 136 | if n in attr and len(attr) > len(n): 137 | pos = -int(attr[-1]) 138 | attr = attr[:-1] 139 | 140 | if attr == suffix: 141 | tag = '".*' + tag[1:] 142 | return attr, pos, tag 143 | 144 | if op in test: 145 | attr, pos, tag = parser(test, op) 146 | elif ccl_op in test: 147 | attr, pos, tag = parser(test, ccl_op) 148 | else: 149 | raise SyntaxError 150 | return {pos: (attr, tag)} 151 | -------------------------------------------------------------------------------- /pybo/resources/particles.tsv: -------------------------------------------------------------------------------- 1 | # form pos lemma sense freq 2 | གི་ PART གི 3 | ཀྱི་ PART གི 4 | གྱི་ PART གི 5 | འི་ PART གི 6 | ཡི་ PART གི 7 | གིས་ PART གིས 8 | ཀྱིས་ PART གིས 9 | གྱིས་ PART གིས 10 | ཡིས་ PART གིས 11 | ས་ PART གིས 12 | སུ་ PART ལ 13 | ར་ PART ལ 14 | རུ་ PART ལ 15 | ཏུ་ PART ལ 16 | ན་ PART ལ 17 | ལ་ PART ལ 18 | དུ་ PART ལ 19 | སྟེ་ PART སྟེ 20 | ཏེ་ PART སྟེ 21 | དེ་ PART སྟེ 22 | ཀྱང་ PART ཀྱང 23 | ཡང་ PART ཀྱང 24 | འང་ PART ཀྱང 25 | གམ་ PART གམ 26 | ངམ་ PART གམ 27 | དམ་ PART གམ 28 | ནམ་ PART གམ 29 | བམ་ PART གམ 30 | མམ་ PART གམ 31 | འམ་ PART གམ 32 | རམ་ PART གམ 33 | ལམ་ PART གམ 34 | སམ་ PART གམ 35 | ཏམ་ PART གམ 36 | པ་ PART པ 37 | བ་ PART པ 38 | པོ་ PART པོ 39 | བོ་ PART པོ 40 | གོ་ PART གོ 41 | ངོ་ PART གོ 42 | དོ་ PART གོ 43 | ནོ་ PART གོ 44 | བོ་ PART གོ 45 | མོ་ PART གོ 46 | འོ་ PART གོ 47 | རོ་ PART གོ 48 | ལོ་ PART གོ 49 | སོ་ PART གོ 50 | ཏོ་ PART གོ 51 | ཅིང་ PART ཅིང 52 | ཤིང་ PART ཅིང 53 | ཞིང་ PART ཅིང 54 | ཅེས་ PART ཅེས 55 | ཞེས་ PART ཅེས 56 | ཅེའོ་ PART ཅེའོ 57 | ཤེའོ་ PART ཅེའོ 58 | ཞེའོ་ PART ཅེའོ 59 | ཅེ་ན་ PART ཅེ་ན 60 | ཤེ་ན་ PART ཅེ་ན 61 | ཞེ་ན་ PART ཅེ་ན 62 | ཅིག་ PART ཅིག 63 | ཤིག་ PART ཅིག 64 | ཞིག་ PART ཅིག 65 | ཀྱིན་ PART གིན 66 | གིན་ PART གིན 67 | གྱིན་ PART གིན 68 | ནས་ PART ནས 69 | -------------------------------------------------------------------------------- /pybo/segmentation_rule/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/pybo/c65ee83a0659f721bccdf48db4901360e7d97048/pybo/segmentation_rule/__init__.py -------------------------------------------------------------------------------- /pybo/segmentation_rule/make_rule.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def get_syls(token): 4 | syls = [] 5 | 
token_parts = re.split("(་)", token) 6 | syl = '' 7 | for walker, part in enumerate(token_parts): 8 | if part: 9 | if walker % 2 == 0: 10 | syl += part 11 | else: 12 | syl += part 13 | syls.append(syl) 14 | syl = "" 15 | return syls 16 | 17 | def parse_rule(rule): 18 | """Parse all the components of cql rule 19 | 20 | Args: 21 | rule (str): cql rule 22 | 23 | Returns: 24 | str: token info, index info, operator of the rule and conclusion tag 25 | """ 26 | part_of_rules = rule.split('\t') 27 | return part_of_rules[0], part_of_rules[1], part_of_rules[2], part_of_rules[3] 28 | 29 | def get_tokens(tokens_info): 30 | """Parse tokens from tokens info of a cql rule 31 | 32 | Args: 33 | tokens_info (str): tokens info in a cql rule 34 | 35 | Returns: 36 | list: tokens from token info 37 | """ 38 | tokens = re.findall(r'\[.*?\]', tokens_info) 39 | return tokens 40 | 41 | def parse_tok(token): 42 | try: 43 | bilou_tag = re.search(r'pos="(\S)"', token).group(1) 44 | except: 45 | bilou_tag = '' 46 | try: 47 | text = re.search(r'text="(\S+)" ?', token).group(1) 48 | except: 49 | text = '' 50 | text = re.sub('\.', '\\\S', text) 51 | return bilou_tag, text 52 | 53 | def add_extra_token_pat(ambiguous_seg_pat): 54 | """extra context tokens added to ambiguous seg pat 55 | 56 | Args: 57 | ambiguous_seg_pat (str): ambiguous segmentation pattern 58 | 59 | Returns: 60 | str: ambiguous seg pat with extra context token 61 | """ 62 | extra_token_pat = r' \S+?/\S ' 63 | ambiguous_seg_pat_with_extra_token_pat = f'{extra_token_pat}{ambiguous_seg_pat}{extra_token_pat}' 64 | ambiguous_seg_pat_with_extra_token_pat = ambiguous_seg_pat_with_extra_token_pat.replace(' ', ' ') 65 | return ambiguous_seg_pat_with_extra_token_pat 66 | 67 | def get_ambiguous_seg_pat(tokens_in_rule, index_info): 68 | """Return ambguous segmentation's pattern 69 | 70 | Args: 71 | tokens_in_rule (list): tokens in bilou rule 72 | 73 | Returns: 74 | str: ambiguos segmentation's pattern 75 | """ 76 | ambiguous_seg_pat = '' 77 | for token in tokens_in_rule: 78 | bilou_tag, text = parse_tok(token) 79 | if text: 80 | ambiguous_seg_pat += f' {text}' 81 | if bilou_tag: 82 | ambiguous_seg_pat += f'/{bilou_tag}' 83 | else: 84 | ambiguous_seg_pat += r'/\S' 85 | else: 86 | ambiguous_seg_pat += r" \S+?" 
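            # no text constraint in this rule token: lazily match any token text,
            # then narrow it by its BILOU tag below (or accept any tag with /\S)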
87 | if bilou_tag: 88 | ambiguous_seg_pat += f'/{bilou_tag}' 89 | else: 90 | ambiguous_seg_pat += r'/\S' 91 | if len(tokens_in_rule) < 4: 92 | ambiguous_seg_pat = add_extra_token_pat(ambiguous_seg_pat) 93 | return ambiguous_seg_pat 94 | 95 | def construct_token_info(ambiguous_seg_candidate): 96 | """Construct token info part of a cql rule 97 | 98 | Args: 99 | ambiguous_seg_candidate (list): ambiguous segmentation candidate's token list 100 | 101 | Returns: 102 | str: token info part of a cql rule 103 | """ 104 | token_info = '' 105 | for token in ambiguous_seg_candidate: 106 | token_parts = token.split('/') 107 | token_text = re.search(r'(\S+)<\S+',token_parts[0]).group(1) 108 | token_pos = re.search(r'<(\S+)>',token_parts[0]).group(1) 109 | if token_pos != 'NO_POS': 110 | token_info += f'[text="{token_text}" & pos="{token_pos}"] ' 111 | else: 112 | token_info += f'[text="{token_text}"] ' 113 | return token_info.strip() 114 | 115 | def get_ambiguous_seg_candidates(tokens_in_rule, index_info, bilou_tag_data): 116 | """Return all the possible ambiguous segmentation candidates containing tokens in rule 117 | 118 | Args: 119 | tokens_in_rule (list): tokens in bilou rule 120 | bilou_tag_data (str): bilou tagged data 121 | 122 | Returns: 123 | list: ambiguous segmentation candidates 124 | """ 125 | ambiguous_seg_candidates_tokens = [] 126 | ambiguous_seg_pat = get_ambiguous_seg_pat(tokens_in_rule, index_info) 127 | ambiguous_seg_candidates = re.findall(ambiguous_seg_pat, bilou_tag_data) 128 | ambiguous_seg_candidates = list(set(ambiguous_seg_candidates)) 129 | for ambiguous_seg_candidate in ambiguous_seg_candidates: 130 | ambiguous_seg_candidates_tokens.append([token for token in ambiguous_seg_candidate.split(' ') if token]) 131 | return ambiguous_seg_candidates_tokens 132 | 133 | def is_single_syl(token): 134 | """Check token is single syllable 135 | 136 | Args: 137 | token (str): token 138 | 139 | Returns: 140 | boolean: True if token is single syllable else False 141 | """ 142 | syls = [syl for syl in token.split('་') if syl] 143 | if len(syls) > 1: 144 | return False 145 | else: 146 | return True 147 | 148 | def parse_index_info(index_info): 149 | """Return index of the token from index info 150 | 151 | Args: 152 | index_info (str): index info of a cql rule 153 | 154 | Returns: 155 | int: index of token 156 | """ 157 | if '-' in index_info: 158 | index_info_parts = index_info.split('-') 159 | index = int(index_info_parts[0]) 160 | else: 161 | index = int(index_info) 162 | return index 163 | 164 | def splited_token_in_human_data(split_tok_text, human_data): 165 | spilt_suggestion = split_tok_text.strip() 166 | syls = get_syls(spilt_suggestion) 167 | for syl_walker, syl in enumerate(syls): 168 | split_possible = f' {syl} {"".join(syls[syl_walker+1:])} ' 169 | if split_possible in human_data: 170 | return split_possible, syl_walker+1 171 | return '', 0 172 | 173 | def get_splited_token(spilt_suggestion): 174 | """Split split suggestion and return it 175 | 176 | Args: 177 | spilt_suggestion (str): split suggestion 178 | 179 | Returns: 180 | str: opposite of split suggestion 181 | """ 182 | spilt_suggestion = spilt_suggestion.strip() 183 | syls = [syl.strip() for syl in spilt_suggestion.split('་') if syl and syl != ' '] 184 | suggestion = f'{syls[0]}་ {"་".join(syls[1:])}' 185 | if spilt_suggestion[-1] == '་': 186 | suggestion += '་' 187 | splited_token = f' {suggestion} ' 188 | return splited_token 189 | 190 | def is_false_positive_split(tokens_in_rule, index, splited_token, human_data): 191 | 
"""Check if the rule is a false positive split case or not 192 | 193 | Args: 194 | tokens_in_rule (list): tokens in rule 195 | index (int): index of token on which split is going to take 196 | splited_token (str): splited token 197 | human_data (str): human segmented data 198 | 199 | Returns: 200 | boolean: True if rule is false positive else false 201 | """ 202 | split_suggestion_with_context = '' 203 | splited_token = splited_token.strip() 204 | for token_walker, token in enumerate(tokens_in_rule, 1): 205 | token_text = re.search(r'text=\"(\S+)\"', token).group(1) 206 | if token_walker == 1: 207 | split_suggestion_with_context += f' {token_text} ' 208 | elif token_walker == index: 209 | split_suggestion_with_context += f'{splited_token} ' 210 | else: 211 | split_suggestion_with_context += f'{token_text} ' 212 | if split_suggestion_with_context in human_data: 213 | return False 214 | else: 215 | return True 216 | 217 | def is_invalid_split(tokens_info, index_info, human_data): 218 | """Return false if split suggestion is ambiguous segmentation else true 219 | 220 | Args: 221 | tokens_info (str): token info of a rule 222 | index_info (str): index info of a cql rule 223 | human_data (str): human segmented data 224 | 225 | Returns: 226 | boolean: True if invalid split rule else False 227 | """ 228 | index = parse_index_info(index_info) 229 | tokens = get_tokens(tokens_info) 230 | token_to_split = re.search(r'text=\"(\S+)\"', tokens[index-1]).group(1) 231 | if is_single_syl(token_to_split) or len(tokens) < index: 232 | return True, 0 233 | else: 234 | split_suggestion = f" {token_to_split} " 235 | splited_token, split_idx = splited_token_in_human_data(split_suggestion, human_data) 236 | if split_suggestion in human_data and splited_token and not is_false_positive_split(tokens, index, splited_token, human_data): 237 | return False, split_idx 238 | else: 239 | return True, 0 240 | 241 | def is_false_positive_merge(tokens_in_rule, index, human_data): 242 | """Check if rule is false positive merge or not 243 | 244 | Args: 245 | tokens_in_rule (list): tokens in rule 246 | index (int): index of token on which merge operation is going to perform 247 | human_data (str): human segmented data 248 | 249 | Returns: 250 | boolean: true if rule is false positive merge else false 251 | """ 252 | merge_suggestion_with_context = '' 253 | for token_walker, token in enumerate(tokens_in_rule, 1): 254 | token_text = re.search(r'text=\"(\S+)\"', token).group(1) 255 | if token_walker == 1: 256 | merge_suggestion_with_context += f' {token_text} ' 257 | elif token_walker == index: 258 | merge_suggestion_with_context += f'{token_text}' 259 | elif token_walker == index+1: 260 | merge_suggestion_with_context += f'{token_text} ' 261 | else: 262 | merge_suggestion_with_context += f'{token_text} ' 263 | if merge_suggestion_with_context in human_data: 264 | return False 265 | else: 266 | return True 267 | 268 | def is_invalid_merge(tokens_info, index_info, human_data): 269 | """Return false if merge suggestion is ambiguous segmentation else true 270 | 271 | Args: 272 | tokens_info (str): token info of a rule 273 | index_info (str): index info of a cql rule 274 | human_data (str): human segmented data 275 | 276 | Returns: 277 | boolean: True if invalid merge rule else False 278 | """ 279 | index = parse_index_info(index_info) 280 | tokens = get_tokens(tokens_info) 281 | if len(tokens) <= index or index == 0: 282 | return True 283 | else: 284 | part1 = re.search(r'text=\"(\S+)\"', tokens[index-1]).group(1) 285 | part2 = 
re.search(r'text=\"(\S+)\"', tokens[index]).group(1) 286 | merge_suggestion = f' {part1}{part2} ' 287 | splited_token_in_hd, split_idx = splited_token_in_human_data(merge_suggestion, human_data) 288 | if "།" not in merge_suggestion and (merge_suggestion in human_data and splited_token_in_hd) and not is_false_positive_merge(tokens, index, human_data): 289 | return False 290 | else: 291 | return True 292 | 293 | def filter_valid_rules(new_rules, human_data): 294 | """Return valid rules which can solve ambiguous segmentation errors 295 | 296 | Args: 297 | new_rules (list): cql rules 298 | human_data (str): human segmented data 299 | 300 | Returns: 301 | list: cql rules 302 | """ 303 | valid_rules = [] 304 | for new_rule in new_rules: 305 | tokens_info, index_info, operator, conclusion = parse_rule(new_rule) 306 | if ":" == operator: 307 | is_invalid_split_flag, split_idx = is_invalid_split(tokens_info, index_info, human_data) 308 | if not is_invalid_split_flag: 309 | new_rule = re.sub(r'-\d', f'-{split_idx}', new_rule) 310 | valid_rules.append(new_rule) 311 | elif "+" == operator: 312 | if not is_invalid_merge(tokens_info, index_info, human_data): 313 | valid_rules.append(new_rule) 314 | return valid_rules 315 | 316 | def get_new_rule(ambiguous_seg_candidates, index, conclusion, human_data): 317 | """Return list of usable cql rules by botok 318 | 319 | Args: 320 | ambiguous_seg_candidates (list): ambiguous segmentation candidates 321 | index (int): index of token on which operation needs to perform 322 | conclusion (str): conclusion tag of rule 323 | human_data (str): human segmented data 324 | 325 | Returns: 326 | list: usable cql rules of botok 327 | """ 328 | new_rules = [] 329 | for ambiguous_seg_candidate in ambiguous_seg_candidates: 330 | new_rule = f"{construct_token_info(ambiguous_seg_candidate)}\t" 331 | if 'B' in conclusion: 332 | new_rule += f'{index}\t+\t[]' 333 | elif 'I' in conclusion: 334 | new_rule += f'{index-1}\t+\t[]' 335 | elif 'S' in conclusion: 336 | new_rule += f'{index}-1\t:\t[] []' 337 | else: 338 | new_rule = '' 339 | if new_rule: 340 | new_rules.append(new_rule) 341 | unique_rules = list(set(new_rules)) 342 | filtered_rules = filter_valid_rules(unique_rules, human_data) 343 | return filtered_rules -------------------------------------------------------------------------------- /pybo/segmentation_rule/pipeline.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from pathlib import Path 4 | 5 | from bordr import rdr as r 6 | from botok.tokenizers.wordtokenizer import WordTokenizer 7 | 8 | from pybo.rdr.rdr_2_replace_matcher import rdr_2_replace_matcher 9 | from pybo.hfr_cqlr_converter import cqlr2hfr 10 | 11 | from pybo.segmentation_rule.make_rule import * 12 | from pybo.untokenize import assemble, pre_processing 13 | 14 | 15 | HOME = Path.home() 16 | DIALECT_PACK_DIR = HOME / "Documents" / "pybo" / "dialect_packs" 17 | DEFAULT_DPACK = "general" 18 | 19 | 20 | def get_botok_segmentation(sample_text): 21 | """Tokenize sample text using botok tokenizer 22 | 23 | Args: 24 | sample_text (str): Input string that needs to be tokenize 25 | 26 | Returns: 27 | str: sample text with space between each tokens 28 | """ 29 | wt = WordTokenizer() 30 | tokens = wt.tokenize(sample_text) 31 | segmented_sample_text = '' 32 | for token in tokens: 33 | token_text = token.text.replace(' ', '') 34 | if token.pos: 35 | token_pos = token.pos 36 | else: 37 | token_pos = token.chunk_type 38 | token_with_tag = f'{token_text}<{token_pos}> ' 
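        # if the raw token carried a newline, strip it out of the tagged form and
        # re-append it after the "<POS> " tag so the line break is preserved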
39 | if '\n' in token_with_tag: 40 | token_with_tag = token_with_tag.replace('\n', '') 41 | token_with_tag += '\n' 42 | segmented_sample_text += token_with_tag 43 | segmented_sample_text = segmented_sample_text.replace(' \n', '\n') 44 | return segmented_sample_text 45 | 46 | def post_process_botok_segmented_data(segmented_text): 47 | """Remove unwanted space from segmented text 48 | 49 | Args: 50 | segmented_text (str): Botok segmented text 51 | 52 | Returns: 53 | str: clean segmented text 54 | """ 55 | clean_segmented_text = segmented_text.replace('\n ', '\n') 56 | clean_segmented_text = clean_segmented_text.replace(' ', ' ') 57 | return clean_segmented_text 58 | 59 | 60 | def post_process_human_data(human_data): 61 | """Remove unwanted space and solves double shad(ཉིས་ཤད་) split cases 62 | 63 | Args: 64 | human_data (str): human segmented data 65 | 66 | Returns: 67 | str: clean human segmented data 68 | """ 69 | human_data = human_data.replace('། །', '།།') 70 | human_data = human_data.replace(' ', ' ') 71 | return human_data 72 | 73 | def get_toks(seg_str): 74 | """Extract list of tokens from segmented string 75 | 76 | Args: 77 | seg_str (str): segmented string which can be by human or botok 78 | 79 | Returns: 80 | list: list of tokens 81 | """ 82 | tokens = [token for token in seg_str.split(' ') if token] 83 | return tokens 84 | 85 | def parse_tok(botok_tok): 86 | """parse botok parts 87 | 88 | Args: 89 | botok_tok (str): botok tok 90 | 91 | Returns: 92 | str,str: text of token and pos of token 93 | """ 94 | pos = re.search(r'<.*?>', botok_tok)[0] 95 | text = botok_tok.replace(pos, '') 96 | return text, pos 97 | 98 | def get_bilou_tag_line(human_toks, botok_toks): 99 | """Add bilou tags to botok tokens and join them as a string with space between each tokens 100 | 101 | Args: 102 | human_toks (list): tokens from human segmented line 103 | botok_toks (list): tokens from botok segmented line 104 | 105 | Returns: 106 | str: botok tokens with bilou tag separated by space 107 | """ 108 | bilou_tag_line = '' 109 | while True: 110 | human_tok = human_toks[0] 111 | cur_tok = '' 112 | tok_walker= 0 113 | while tok_walker < len(botok_toks): 114 | botok_tok_text, botok_tok_pos = parse_tok(botok_toks[tok_walker]) 115 | if botok_tok_text == human_tok: 116 | bilou_tag_line += f'{botok_tok_text}{botok_tok_pos}/U ' 117 | botok_toks = botok_toks[tok_walker+1:] 118 | break 119 | elif botok_tok_text in human_tok: 120 | cur_tok += botok_tok_text 121 | if cur_tok == human_tok: 122 | bilou_tag_line += f'{botok_tok_text}{botok_tok_pos}/I ' 123 | botok_toks = botok_toks[tok_walker+1:] 124 | break 125 | elif re.search(f'^{botok_tok_text}', human_tok): 126 | bilou_tag_line += f'{botok_tok_text}{botok_tok_pos}/B ' 127 | else: 128 | bilou_tag_line += f'{botok_tok_text}{botok_tok_pos}/I ' 129 | elif re.search(human_tok, botok_tok_text): 130 | cur_tok = human_tok 131 | bilou_tag_line += f'{botok_tok_text}{botok_tok_pos}/S ' 132 | while re.search(cur_tok, botok_tok_text): 133 | human_toks = human_toks[1:] 134 | human_tok = human_toks[0] 135 | cur_tok += human_tok 136 | else: 137 | botok_toks = botok_toks[tok_walker:] 138 | if tok_walker != 0: 139 | break 140 | else: 141 | bilou_tag_line += f'{botok_tok_text}{botok_tok_pos}/S ' 142 | tok_walker += 1 143 | human_toks = human_toks[1:] 144 | if not human_toks: 145 | break 146 | return bilou_tag_line 147 | 148 | def get_detokenized_line(tokenized_line): 149 | tokens = pre_processing(tokenized_line) 150 | detokenized_line = assemble(tokens) 151 | return detokenized_line 
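# Minimal usage sketch of the helpers above (added for illustration; it is not
# part of the original pipeline and assumes botok plus a dialect pack are
# installed). It detokenizes one human-segmented line, re-segments it with botok
# and aligns the two segmentations with BILOU tags. The sample line comes from
# tests/data/corpus1; the POS tags shown in the comments are only indicative.
def _bilou_tagging_example(human_line="དཔེ་དེབ་ ནི་ དེབ་ ཀྱི་ དོན་གཅིག་ རེད་"):
    detokenized = get_detokenized_line(human_line)    # "དཔེ་དེབ་ནི་དེབ་ཀྱི་དོན་གཅིག་རེད་"
    botok_line = get_botok_segmentation(detokenized)  # e.g. "དཔེ་དེབ་<NOUN> ནི་<PART> ..."
    return get_bilou_tag_line(get_toks(human_line), get_toks(botok_line))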
152 | 153 | def get_bilou_tag_data(human_data): 154 | """Human data get detokenized. 155 | Detokenized text is tokenized by botok. 156 | Bilou tag is given to botok segmented data by comparing with human segmentation 157 | 158 | Args: 159 | human_data (str): segmented corpus data by human 160 | 161 | Returns: 162 | str: botok segmented data with bilou tag 163 | """ 164 | human_lines = human_data.splitlines() 165 | bilou_tag_data = '' 166 | for human_line in human_lines: 167 | detokenized_line = get_detokenized_line(human_line) 168 | botok_line = get_botok_segmentation(detokenized_line) 169 | human_toks = get_toks(human_line) 170 | botok_toks = get_toks(botok_line) 171 | bilou_tag_data += get_bilou_tag_line(human_toks, botok_toks) + '\n' 172 | return bilou_tag_data 173 | 174 | def get_split_suggestions(bilou_tag_data): 175 | """Return all the tokens with Split tag(S) 176 | 177 | Args: 178 | bilou_tag_data (str): Botok segmented data with bilou tag 179 | 180 | Returns: 181 | list: list of tokens with split tags 182 | """ 183 | split_suggestions = [split_token[:-2] for split_token in re.findall(r'\S+/S', bilou_tag_data)] 184 | return list(set(split_suggestions)) 185 | 186 | def get_merge_suggestions(bilou_tag_data): 187 | """Return all the tokens which are meant to be merge 188 | 189 | Args: 190 | bilou_tag_data (str): Botok segmented data with bilou tag 191 | 192 | Returns: 193 | list: list of tokens that are meant to be merge 194 | """ 195 | merge_suggestions = [merge_suggestion for merge_suggestion,_ in re.findall(r'(\S+/B (\S+/I )+)', bilou_tag_data)] 196 | return list(set(merge_suggestions)) 197 | 198 | def parse_merge_suggestion(merge_suggestion): 199 | """Return tokens in merge suggestion 200 | 201 | Args: 202 | merge_suggestion (str): merge suggestion extracted from bilou tagged text 203 | 204 | Returns: 205 | list: tokens in merge suggestion 206 | """ 207 | merge_suggestion_tokens = [re.search(r'(\S+)<\S+',token).group(1) for token in merge_suggestion.split(' ') if token] 208 | return merge_suggestion_tokens 209 | 210 | def get_counter_merge_suggestion(merge_suggestion_tokens): 211 | """Return opposite of merge suggestion 212 | 213 | Args: 214 | merge_suggestion_tokens (list): tokens in merge suggestion 215 | 216 | Returns: 217 | str: opposite of merge suggestion 218 | """ 219 | counter_merge_suggestion = ' '.join(merge_suggestion_tokens) 220 | if merge_suggestion_tokens[-1][-1] == '་': 221 | counter_merge_suggestion += " " 222 | return counter_merge_suggestion 223 | 224 | def get_remove_word_candidates(split_suggestions, human_data): 225 | """Return remove word candidate or non ambiguous spilt options from spilt suggestions using human data 226 | 227 | Args: 228 | split_suggestions (list): spilt suggestion extracted from bilou tagged text 229 | human_data (str): human segmented text 230 | 231 | Returns: 232 | list: remove word candidates 233 | """ 234 | remove_word_candidate = [] 235 | for split_suggestion_token in split_suggestions: 236 | split_suggestion_tok_text = re.search(r'(\S+)<\S+',split_suggestion_token).group(1) 237 | if not is_single_syl(split_suggestion_tok_text): 238 | split_suggestion = f' {split_suggestion_tok_text} ' 239 | splited_token, split_idx = splited_token_in_human_data(split_suggestion_tok_text, human_data) 240 | if split_suggestion not in human_data and splited_token: 241 | remove_word_candidate.append(split_suggestion_tok_text) 242 | return remove_word_candidate 243 | 244 | def get_new_word_candidate(merge_suggestion, human_data): 245 | """Return new word 
if merge suggestion is not ambiguous one else empty string return 246 | 247 | Args: 248 | merge_suggestion (str): merge sugeestion 249 | human_data (str): human segmented data 250 | 251 | Returns: 252 | str: new word candidate 253 | """ 254 | new_word = '' 255 | merge_suggestion_tokens = parse_merge_suggestion(merge_suggestion) 256 | new_word = ''.join(merge_suggestion_tokens) 257 | # counter_merge_suggestion = " " + get_counter_merge_suggestion(merge_suggestion_tokens) 258 | splited_token, split_idx = splited_token_in_human_data(new_word, human_data) 259 | if not splited_token: 260 | return new_word 261 | else: 262 | return '' 263 | 264 | def get_new_word_candidates(merge_suggestions, human_data): 265 | """Return all the new word candidate from merge suggestions using human data 266 | 267 | Args: 268 | merge_suggestions (list): merge suggestions extracted from bilou tagged text 269 | human_data (str): human segmented data 270 | 271 | Returns: 272 | list: new word candidate 273 | """ 274 | new_word_candidate = [] 275 | for merge_suggestion in merge_suggestions: 276 | new_word = get_new_word_candidate(merge_suggestion, human_data) 277 | if new_word: 278 | new_word_candidate.append(new_word) 279 | return new_word_candidate 280 | 281 | def filter_seg_errors(bilou_tag_data, human_data): 282 | """Filters out obivious segmentation error and extract new words and new remove words 283 | 284 | Args: 285 | bilou_tag_data (str): segmented botok data with bilou tag 286 | human_data (ste): segmented human data 287 | 288 | Returns: 289 | list: new word list and new remove word list 290 | """ 291 | new_word_candidate = [] 292 | new_remove_word_candidate = [] 293 | split_suggestions = get_split_suggestions(bilou_tag_data) 294 | merge_suggestions = get_merge_suggestions(bilou_tag_data) 295 | new_word_candidate = get_new_word_candidates(merge_suggestions, human_data) 296 | new_remove_word_candidate = get_remove_word_candidates(split_suggestions, human_data) 297 | return new_word_candidate, new_remove_word_candidate 298 | 299 | def rdr_postprocess(file_path): 300 | suffixes = [".DICT", ".INIT", ".RAW", ".sDict"] 301 | for s in suffixes: 302 | Path(file_path.parent / (file_path.name + s)).unlink() 303 | 304 | def remove_duplicate_word(word_list): 305 | return list(set(word_list)) 306 | 307 | def add_word_2_adjustment(words_2_add, corpus_file_name, dialect_pack_name, type='words'): 308 | """New word candidates or new remove word candidates are added with existing word list. 309 | Duplicates are then removed. 310 | Unique word list are then added to its file. 311 | 312 | Args: 313 | words_2_add (list): word list of new word candidates or new remove word candidates 314 | corpus_file_name (str): courpus file name 315 | dialect_pack_name (str): current working dialect pack name 316 | type (str, optional): type can be either words or remove. Defaults to 'words'. 
317 | 318 | Returns: 319 | list: latest word list of mentioned type 320 | """ 321 | old_word_list = [] 322 | word_list_path = (DIALECT_PACK_DIR / dialect_pack_name / "adjustments" / type / f'{corpus_file_name}.tsv') 323 | if word_list_path.is_file(): 324 | old_word_list = [old_word for old_word in word_list_path.read_text(encoding='utf-8-sig').splitlines() if old_word] 325 | new_word_list = old_word_list + words_2_add 326 | new_word_list = remove_duplicate_word(new_word_list) 327 | new_words = '\n'.join(new_word_list) 328 | word_list_path.write_text(new_words, encoding='utf-8-sig') 329 | print(f'[INFO]: New {type} added to adjustment {type} list..') 330 | return new_word_list 331 | 332 | def get_bilou_rules(bilou_tag_data_path): 333 | """Extract rdr rules by training RDR model using bilou tagged data. 334 | Convert rdr rules to cql rules and returning it. 335 | 336 | Args: 337 | bilou_tag_data_path (pathlib): path of bilou tagged data 338 | 339 | Returns: 340 | list: rdr rules converted into cql rules 341 | """ 342 | log = r(str(bilou_tag_data_path), mode="train", verbose=True) 343 | print('[INFO]: RDR TRAINING COMPLETED..') 344 | rdr_rules = Path(f"{bilou_tag_data_path}.RDR").read_text( 345 | encoding="utf-8-sig" 346 | ) 347 | bilou_rules = rdr_2_replace_matcher(rdr_rules).splitlines() 348 | bilou_rules = list(set(bilou_rules)) 349 | return bilou_rules 350 | 351 | def convert_bilou_rules(bilou_rules, bilou_tag_init, human_data): 352 | """Convert bilou rules to normal cql rules as rules with bilou tag are not usable by botok 353 | 354 | Args: 355 | bilou_rules (list): cql rules with bilou tag 356 | bilou_tag_init (str): bilou tagged initial text 357 | human_data (str): human segmented data 358 | 359 | Returns: 360 | list: usable cql rule by botok 361 | """ 362 | new_cql_rules = [] 363 | for bilou_rule in bilou_rules: 364 | tokens_info, index_info, operator, conclusion = parse_rule(bilou_rule) 365 | tokens_in_rule = get_tokens(tokens_info) 366 | ambiguous_seg_candidates = get_ambiguous_seg_candidates(tokens_in_rule, index_info, bilou_tag_init) 367 | new_cql_rules += get_new_rule(ambiguous_seg_candidates, int(index_info)+1, conclusion, human_data) # index incremented as extra context token involve 368 | new_cql_rules = list(set(new_cql_rules)) 369 | return new_cql_rules 370 | 371 | def extract_seg_rule(corpus_file_path, dialect_pack_name=DEFAULT_DPACK, type='cql', no_epochs = 3): 372 | """Extracts segmentation rules. 373 | 374 | Args: 375 | corpus_file_path (pathlib): input file's path 376 | dialect_pack_name (string, optional): name of dialect pack for which rules are. Defaults to DEFAULT_DPACK. 377 | type (str, optional): type of rules can be human friendly rule(hfr) or corpus query rule. Defaults to 'cql'. 378 | no_epochs (int, optional): Number of times word filters need to perform. Defaults to 3. 
379 | 380 | Returns: 381 | str: segmentation rules 382 | """ 383 | new_word_list = [] 384 | new_remove_word_list = [] 385 | corpus_file_name = corpus_file_path.stem[:-2] 386 | number_of_segmentation = 1 387 | human_data = corpus_file_path.read_text(encoding='utf-8-sig') 388 | human_data = post_process_human_data(human_data) 389 | while True: 390 | bilou_tag_data = get_bilou_tag_data(human_data) 391 | print(f'[INFO]: SEGMENTATION PHASE {number_of_segmentation} COMPLETED..') 392 | new_word_list, new_remove_word_list = filter_seg_errors(bilou_tag_data, human_data) 393 | print('[INFO]: FILTER SEGMENTATION ERROR COMPLETED..') 394 | if new_word_list: 395 | new_word_list = add_word_2_adjustment(new_word_list, corpus_file_name, dialect_pack_name, type='words') 396 | if new_remove_word_list: 397 | new_remove_word_list = add_word_2_adjustment(new_remove_word_list, corpus_file_name, dialect_pack_name, type='remove') 398 | bilou_tag_data = get_bilou_tag_data(human_data) 399 | word_list, remove_word_list = filter_seg_errors(bilou_tag_data, human_data) 400 | new_remove_word_list = [remove_word for remove_word in remove_word_list if remove_word not in new_remove_word_list] 401 | new_word_list = [word for word in word_list if word not in new_word_list] 402 | number_of_segmentation += 1 403 | if (not new_word_list and not new_remove_word_list) or number_of_segmentation > no_epochs: 404 | break 405 | bilou_tag_data_path = (corpus_file_path.parent / f'{corpus_file_name}_tr_data.txt') 406 | bilou_tag_data_path.write_text(bilou_tag_data, encoding='utf-8') 407 | bilou_rules = get_bilou_rules(bilou_tag_data_path) 408 | (corpus_file_path.parent / f'{corpus_file_name}_bilou_rules.txt').write_text("\n".join(bilou_rules), encoding='utf-8') 409 | new_cql_rules = [] 410 | bilou_tag_init = (corpus_file_path.parent / f'{bilou_tag_data_path.name}.INIT').read_text(encoding='utf-8-sig') 411 | new_cql_rules = convert_bilou_rules(bilou_rules, bilou_tag_init, human_data) 412 | new_cql_rules = "\n".join(new_cql_rules) 413 | rdr_postprocess(bilou_tag_data_path) 414 | if type != 'cql': 415 | new_cql_rules = cqlr2hfr(new_cql_rules) 416 | return new_cql_rules -------------------------------------------------------------------------------- /pybo/third_party/rules.txt: -------------------------------------------------------------------------------- 1 | # Rules for Sanskrit ordering 2 | # From Bod rgya tshig mdzod chen mo pages 9 - 11, 347, 1153, 1615, 1619, 1711, 1827, 2055, 2061, 2840, 2920, 3136 and 3137 3 | # Example: ཀར་ལུགས། < ཀརྐ་ཊ། 4 | &ཀར<ཀརྐ<ཀརྟ<ཀརྞ<ཀརྨ<ཀརྴ<ཀརྵ 5 | &ཀལ<ཀལྐ<ཀལྤ 6 | &ཀས<ཀསྨ 7 | &གཉ<གཉྫ 8 | &ཐར<ཐརྐ 9 | &པུས<པུསྟི 10 | &ཕལ<ཕལྒ 11 | &བིལ<བིལྦ 12 | &མཉ<མཉྫ 13 | &མར<མརྒ 14 | &ཝར<ཝརྟ 15 | &ཤས<ཤསྟ 16 | &སར<སརྒ 17 | &ཨར<ཨརྒ<ཨརྱ=ཨཪྱ 18 | &ཨས<ཨསྨ 19 | # Marks (seconadry different, with low equal primary weight after Lao) 20 | &[before 1]ཀ<།<<༎<<༏<<༐<<༑<<༔<<༴<་=༌ 21 | &ཀ<<ྈྐ<ཫ<དཀ<བཀ<རྐ<ལྐ<སྐ<བརྐ<བསྐ 22 | &ཁ<<ྈྑ<མཁ<འཁ 23 | &ག<དགག<དགང<དགད<དགན<དགབ<དགཝ<དགའ<དགར<དགལ<དགས<དགི<དགུ<དགེ<དགོ<དགྭ<དགྱ<དགྲ<བགག<བགང<བགད<བགབ<བགམ<<<བགཾ<བགཝ<བགའ 24 | <བགར<བགལ<བགི<བགུ<བགེ<བགོ<བགྭ<བགྱ<བགྲ<བགླ<མགག<མགང<མགད<མགབ<མགའ<མགར<མགལ<མགི<མགུ<མགེ<མགོ<མགྭ<མགྱ<མགྲ<འགག<འགང<འགད<འགན<འགབ<འགམ<<<འགཾ 25 | <འགའ<འགར<འགལ<འགས<འགི<འགུ<འགེ<འགོ<འགྭ<འགྱ<འགྲ<རྒ<ལྒ<སྒ<བརྒ<བསྒ 26 | &ང<<<ྂ<<<ྃ<དངག<དངང<དངད<དངན<དངབ<དངའ<དངར<དངལ<དངི<དངུ<དངེ<དངོ<མངག<མངང<མངད<མངན<མངབ<མངའ<མངར<མངལ<མངི<མངུ<མངེ<མངོ<རྔ<ལྔ<སྔ<བརྔ<བསྔ 27 | &ཅ<གཅ<བཅ<ལྕ<བལྕ 28 | &ཆ<མཆ<འཆ 29 | &ཇ<མཇ<འཇ<རྗ<ལྗ<བརྗ 30 | &ཉ<<ྋྙ<གཉ<མཉ<རྙ=ཪྙ<སྙ<བརྙ=བཪྙ<བསྙ 31 | &ཏ<ཊ<ཏྭ<ཏྲ<གཏ<བཏ<རྟ<ལྟ<སྟ<བརྟ<བལྟ<བསྟ 32 | &ཐ<ཋ<མཐ<འཐ 33 | 
&ད<ཌ<གདག<གདང<གདད<གདན<གདབ<གདམ<<<གདཾ<གདའ<གདར<གདལ<གདས<གདི<གདུ<གདེ<གདོ<གདྭ<བདག<བདང<བདད<བདབ<བདམ<<<བདཾ<བདའ 34 | <བདར<བདལ<བདས<བདི<བདུ<བདེ<བདོ<བདྭ<མདག<མདང<མདད<མདན<མདབ<མདའ<མདར<མདལ<མདས<མདི<མདུ<མདེ<མདོ<མདྭ<འདག<འདང<འདད<འདན<འདབ<འདམ<<<འདཾ 35 | <འདཝ<འདའ<འདར<འདལ<འདས<འདི<འདུ<འདེ<འདོ<འདྭ<འདྲ<རྡ<ལྡ<སྡ<བརྡ<བལྡ<བསྡ 36 | &ན<ཎ<གནག<གནང<གནད<གནན<གནབ<གནམ<<<གནཾ<གནཝ<གནའ<གནར<གནལ<གནས<གནི<གནུ<གནེ<གནོ<གནྭ<མནག<མནང<མནད<མནན<མནབ<མནམ<<<མནཾ<མནའ 37 | <མནར<མནལ<མནས<མནི<མནུ<མནེ<མནོ<མནྭ<རྣ<སྣ<བརྣ<བསྣ 38 | &པ<<ྉྤ<དཔག<དཔང<དཔད<དཔབ<དཔའ<དཔར<དཔལ<དཔས<དཔི<དཔུ<དཔེ<དཔོ<དཔྱ<དཔྲ<ལྤ<སྤ 39 | &ཕ<<ྉྥ<འཕ 40 | &བ<དབག<དབང<དབད<དབན<དབབ<དབའ<དབར<དབལ<དབས<དབི<དབུ<དབེ<དབོ<དབྱ<དབྲ<འབག<འབང<འབད<འབན<འབབ<འབམ 41 | <<<འབཾ<འབའ<འབར<འབལ<འབས<འབི<འབུ<འབེ<འབོ<འབྱ<འབྲ<རྦ<ལྦ<སྦ 42 | &མ<<<ཾ<དམག<དམང<དམད<དམན<དམབ<དམཝ<དམའ<དམར<དམལ<དམས<དམི<དམུ<དམེ<དམོ<དམྭ<དམྱ<རྨ<སྨ 43 | &ཙ<གཙ<བཙ<རྩ<སྩ<བརྩ<བསྩ 44 | &ཚ<མཚ<འཚ 45 | &ཛ<མཛ<འཛ<རྫ<བརྫ 46 | # &ཝ 47 | &ཞ<གཞ<བཞ 48 | &ཟ<གཟ<བཟ 49 | # &འ 50 | &ཡ<གཡ 51 | &ར<<<ཪ<ཬ<བརླ=བཪླ 52 | # &ལ 53 | &ཤ<ཥ<གཤ<བཤ 54 | &ས<གསག<གསང<གསད<གསན<གསབ<གསའ<གསར<གསལ<གསས<གསི<གསུ<གསེ<གསོ<གསྭ<བསག<བསང<བསད<བསབ<བསམ<<<བསཾ<བསའ<བསར 55 | <བསལ<བསས<བསི<བསུ<བསེ<བསོ<བསྭ<བསྲ<བསླ 56 | &ཧ<ལྷ 57 | &ཨ 58 | # Explicit vowels 59 | <ཱ<ི<ཱི<ྀ<ཱྀ<ུ<ཱུ<ེ<ཻ=ེེ<ོ<ཽ=ོོ 60 | # Post-radicals 61 | <ྐ<ྑ<ྒ<ྔ<ྕ<ྖ<ྗ<ྙ<ྟ<ྚ<ྠ<ྛ<ྡ<ྜ<ྣ<ྞ<ྤ<ྥ<ྦ<ྨ<ྩ<ྪ<ྫ<ྭ<<<ྺ<ྮ<ྯ<ྰ<ྱ<<<ྻ<ྲ<<<ྼ<ླ<ྴ 62 | <ྵ<ྶ<ྷ<ྸ 63 | # Combining marks and signs (secondary weight) 64 | &༹<<྄<<ཿ<<྅<<ྈ<<ྉ<<ྊ<<ྋ<<ྌ<<ྍ<<ྎ<<ྏ 65 | # Treatༀ, ཷand ,ཹ as decomposed 66 | &ཨོཾ=ༀ 67 | &ྲཱྀ=ཷ 68 | &ླཱྀ=ཹ -------------------------------------------------------------------------------- /pybo/untokenize.py: -------------------------------------------------------------------------------- 1 | 2 | def pre_processing(tokenized_text): 3 | tokens = [token for token in tokenized_text.split(' ') if token] 4 | return tokens 5 | 6 | def get_token_text(token): 7 | token_parts = [part for part in token.split('/') if part] 8 | return token_parts[0] 9 | 10 | def assemble(tokens): 11 | detokenized_text = '' 12 | for token in tokens: 13 | detokenized_text += get_token_text(token) 14 | return detokenized_text 15 | -------------------------------------------------------------------------------- /pybo/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/pybo/c65ee83a0659f721bccdf48db4901360e7d97048/pybo/utils/__init__.py -------------------------------------------------------------------------------- /pybo/utils/bo_sorted.py: -------------------------------------------------------------------------------- 1 | # # coding: utf-8 2 | # from icu import RuleBasedCollator 3 | # from pathlib import Path 4 | # 5 | # 6 | # rules = Path(__file__).parent / "../third_party/rules.txt" 7 | # collator = RuleBasedCollator( 8 | # "[normalization on]\n[reorder Tibt]\n" + rules.read_text(encoding="utf-8") 9 | # ) 10 | # 11 | # 12 | # def bo_sorted(word_list): 13 | # return sorted(word_list, key=collator.getSortKey) 14 | -------------------------------------------------------------------------------- /pybo/utils/profile_entries.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from collections import defaultdict 3 | from pathlib import Path 4 | 5 | 6 | def profile_entries(pathname): 7 | pathname = Path(pathname) 8 | entries = defaultdict(list) 9 | 10 | profile_files = [Path(__file__).parent.parent / "resources/particles.tsv"] 11 | for d in pathname.glob("*"): 12 | # filter unwanted directories and files 13 | dirs_ignored = ["adjustment", "entry_data"] 14 | 
if not d.is_dir() or d.name in dirs_ignored or d.name.startswith("."): 15 | continue 16 | 17 | profile_files.extend(list(d.glob("*.tsv"))) 18 | 19 | # add files 20 | for f in profile_files: 21 | lines = f.read_text(encoding="utf-8-sig").splitlines() 22 | for num, line in enumerate(lines): 23 | if line.startswith("#"): 24 | continue 25 | entry = line.split("\t", 1)[0] 26 | entries[entry].append(line) 27 | return entries 28 | -------------------------------------------------------------------------------- /pybo/utils/profile_report.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from collections import defaultdict 3 | from pathlib import Path 4 | 5 | 6 | def reorder_data(data): 7 | ordered = [] 8 | for entry, e in data.items(): 9 | count = 0 10 | for _, files in e.items(): 11 | count += len(files) 12 | ordered.append((count, {entry: e})) 13 | ordered = sorted(ordered, reverse=True, key=lambda x: x[0]) 14 | return ordered 15 | 16 | 17 | def profile_report(pathname): 18 | pathname = Path(pathname) 19 | data = {} 20 | 21 | for d in sorted(pathname.glob("*")): 22 | # filter unwanted directories and files 23 | dirs_ignored = ["adjustment", "entry_data"] 24 | if not d.is_dir() or d.name in dirs_ignored or d.name.startswith("."): 25 | continue 26 | 27 | for f in sorted(d.glob("*.tsv")): 28 | lines = f.read_text(encoding="utf-8-sig").splitlines() 29 | for num, line in enumerate(lines): 30 | if line.startswith("#"): 31 | continue 32 | entry = line.split("\t", 1)[0] 33 | path = f"{d.name}/{f.name}" 34 | 35 | if entry not in data: 36 | data[entry] = {} 37 | if line not in data[entry]: 38 | data[entry][line] = [] 39 | 40 | data[entry][line].append((path, num)) 41 | 42 | data = reorder_data(data) 43 | 44 | # filter and format all entries that have similar forms over files 45 | report = ["WORD\tENTRY\tFILE-NAME\tLINE-NUMBER"] 46 | count = defaultdict(int) 47 | for num, d in data: 48 | count[num] += 1 49 | for entry, e in d.items(): 50 | tmp = [] 51 | tmp.append(f"{entry}: {num}") 52 | for line, files in e.items(): 53 | tmp.append(f'\t"{line}"') 54 | tmp.extend([f"\t\t{f}\t{n}" for f, n in files]) 55 | report.extend(tmp) 56 | report = ( 57 | [f"total distinct entries: {len(data)}"] 58 | + [f"entries with {a} entries: {b}" for a, b in count.items()] 59 | + [""] 60 | + report 61 | ) 62 | report = "\n".join(report) 63 | 64 | # print to file 65 | out = pathname / (pathname.name + "_report.tsv") 66 | out.write_text(report, encoding="utf-8-sig") 67 | -------------------------------------------------------------------------------- /pybo/utils/regex_batch_apply.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import re 3 | 4 | 5 | def batch_apply_regex(string, pairs): 6 | for find, repl in pairs: 7 | string = re.sub(find, repl, string, flags=re.MULTILINE) 8 | return string 9 | 10 | 11 | def get_regex_pairs(lines, sep="\t-\t"): 12 | regex_pairs = [] 13 | clean_lines = _parse_lines(lines, sep) 14 | 15 | for line in clean_lines: 16 | find, replace = line.split(sep) 17 | regex_pairs.append((r"" + find, r"" + replace)) 18 | return regex_pairs 19 | 20 | 21 | def _parse_lines(lines, sep): 22 | cleaned = [] 23 | for num, line in enumerate(lines): 24 | # remove comment lines and empty lines 25 | if "#" in line: 26 | line = line[: line.find("#")] 27 | 28 | # strip line returns while keeping space chars and screen all empty lines 29 | line = line.strip("\n\r") 30 | if not line: 31 | continue 32 | 33 | # 
ensure there is 1 and only 1 occurrence of sep 34 | if line.count(sep) != 1: 35 | print(f"passing line {num + 1}: {line}.") 36 | continue 37 | 38 | cleaned.append(line) 39 | return cleaned 40 | -------------------------------------------------------------------------------- /pybo_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/pybo/c65ee83a0659f721bccdf48db4901360e7d97048/pybo_logo.png -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pre-commit 2 | coverage 3 | pytest 4 | covdefaults 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | click 2 | PyYAML 3 | botok >= 0.7.4 4 | pyewts 5 | bordr -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | ignore = E203, W503, E501, F401, F403 4 | 5 | [isort] 6 | line_length = 88 7 | known_first_party = pybo 8 | multi_line_output = 3 9 | include_trailing_comma = True 10 | 11 | [coverage:run] 12 | plugins = covdefaults 13 | omit = .env/* 14 | 15 | [coverage:report] 16 | fail_under = 100 17 | show_missing = True 18 | skip_covered = True 19 | 20 | 21 | [semantic_release] 22 | version_variable = pybo/__init__.py:__version__ 23 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf8 -*- 3 | 4 | from __future__ import print_function 5 | 6 | import re 7 | from pathlib import Path 8 | 9 | import setuptools 10 | from pkg_resources import parse_version 11 | 12 | assert parse_version(setuptools.__version__) >= parse_version("38.6.0") 13 | 14 | 15 | def get_version(prop, project): 16 | project = Path(__file__).parent / project / "__init__.py" 17 | result = re.search( 18 | r'{}\s*=\s*[\'"]([^\'"]*)[\'"]'.format(prop), project.read_text() 19 | ) 20 | return result.group(1) 21 | 22 | 23 | def read(fname): 24 | p = Path(__file__).parent / fname 25 | with p.open(encoding="utf-8") as f: 26 | return f.read() 27 | 28 | 29 | setuptools.setup( 30 | name="pybo", 31 | version=get_version("__version__", "pybo"), # edit version in pybo/__init__.py 32 | author="Esukhia development team", 33 | author_email="esukhiadev@gmail.com", 34 | description="Python utils for processing Tibetan", 35 | license="Apache2", 36 | keywords="nlp computational_linguistics search ngrams language_models linguistics toolkit tibetan", 37 | url="https://github.com/Esukhia/pybo", 38 | packages=setuptools.find_packages(), 39 | long_description=read("README.md"), 40 | long_description_content_type="text/markdown", 41 | project_urls={ 42 | "Source": "https://github.com/Esukhia/pybo", 43 | "Tracker": "https://github.com/Esukhia/pybo/issues", 44 | }, 45 | classifiers=[ 46 | "Development Status :: 3 - Alpha", 47 | "Topic :: Text Processing :: Linguistic", 48 | "Programming Language :: Python :: 3", 49 | "Operating System :: OS Independent", 50 | "Intended Audience :: Developers", 51 | "Intended Audience :: Science/Research", 52 | "License :: OSI Approved :: Apache Software License", 53 | "Natural Language :: Tibetan", 54 | ], 55 | python_requires=">=3.6", 56 | install_requires=["botok>=0.8.2", "pyyaml", "click", "pyewts", "bordr", "tibetan_sort", "pytest"], 57 | tests_require=["pytest"], 58 | entry_points={ 59 | "console_scripts": ["bo=pybo.cli:cli"] # command=package.module:function 60 | }, 61 | ) 62 | -------------------------------------------------------------------------------- /tests/01_raw_text.txt: -------------------------------------------------------------------------------- 1 | STEP 1: standard botok + custom words and rules 2 | - 1 raw text 3 | - 2 segmented text 4 | 5 | STEP 2: 6 | - 1 manually corrected segmentation and POS + extra information 7 | 8 | - 2 extract info from manually corrected 9 | - entry data: (script to create) 10 | - word lists + entry data 11 | - rules 12 | - rules: (RDR) 13 | - extract rules using RDR 14 | - filter them manually 15 | - convert them to botok matcher replacements 16 | 17 | https://github.com/buda-base/bonlp-datasets/blob/master/human2rdr.txt 18 | 19 | STEP 3: 20 | - resegment using only clean entry data 21 | - adjust wordlists and rules until it is as close to the manually corrected as possible 22 | - provide new entry data and 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | STEP1 + STEP 3 31 | pybo tok -p 32 | pybo tok -p2 33 | 34 | STEP2 35 | pybo extract profile 36 | output will be : 37 | - for entry data: words_bo + entry_data 38 | - for RDR: human readable rules to be proofed 39 | 40 | pybo convert 41 | will convert human readable selected rules to matcher replacement rules. 
42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | Documents/ 50 | pybo/ 51 | main_profile/ 52 | adjustment/ 53 | entry_data/ 54 | frequency/ 55 | words_bo/ 56 | words_non_inflected/ 57 | words_skrt/ 58 | custom_profile/ 59 | adjustment/ 60 | entry_data/ 61 | frequency/ 62 | words_bo/ 63 | words_non_inflected/ 64 | words_skrt/ -------------------------------------------------------------------------------- /tests/data/corpus1/corpus1.txt: -------------------------------------------------------------------------------- 1 | ལས་ཞེས་པ་ནི་ལས་བྱེད་པས་ལས་བྱེད་པ་ལ་ཟེར་རོ།། ལས་བྱེད་པས་ལས་མ་བྱེད་པ་ལ་ཟེར་བ་མ་ཡིན་ནོ།། སྐད་ཆ་དེ་ཡི་དོན་གཅིག་ནི་ནང་པ་གོ་བ་དང་དོན་གཞན་ནི་མི་ལ་གོ། དཔེ་དེབ་ནི་དེབ་ཀྱི་དོན་གཅིག་རེད་ གོ་བ་ནོར་པ་མ་ལེན་རོགས། 2 | ལས་ཞེས་པ་ནི་ལས་བྱེད་པས་ལས་བྱེད་པ་ལ་ཟེར་རོ།། ལས་བྱེད་པས་ལས་མ་བྱེད་པ་ལ་ཟེར་བ་མ་ཡིན་ནོ།། སྐད་ཆ་དེ་ཡི་དོན་གཅིག་ནི་ནང་པ་གོ་བ་དང་དོན་གཞན་ནི་མི་ལ་གོ། དཔེ་དེབ་ནི་དེབ་ཀྱི་དོན་གཅིག་རེད་ གོ་བ་ནོར་པ་མ་ལེན་རོགས། 3 | ལས་ཞེས་པ་ནི་ལས་བྱེད་པས་ལས་བྱེད་པ་ལ་ཟེར་རོ།། ལས་བྱེད་པས་ལས་མ་བྱེད་པ་ལ་ཟེར་བ་མ་ཡིན་ནོ།། སྐད་ཆ་དེ་ཡི་དོན་གཅིག་ནི་ནང་པ་གོ་བ་དང་དོན་གཞན་ནི་མི་ལ་གོ། དཔེ་དེབ་ནི་དེབ་ཀྱི་དོན་གཅིག་རེད་ གོ་བ་ནོར་པ་མ་ལེན་རོགས། 4 | ལས་ཞེས་པ་ནི་ལས་བྱེད་པས་ལས་བྱེད་པ་ལ་ཟེར་རོ།། ལས་བྱེད་པས་ལས་མ་བྱེད་པ་ལ་ཟེར་བ་མ་ཡིན་ནོ།། སྐད་ཆ་དེ་ཡི་དོན་གཅིག་ནི་ནང་པ་གོ་བ་དང་དོན་གཞན་ནི་མི་ལ་གོ། དཔེ་དེབ་ནི་དེབ་ཀྱི་དོན་གཅིག་རེད་ གོ་བ་ནོར་པ་མ་ལེན་རོགས། 5 | ལས་ཞེས་པ་ནི་ལས་བྱེད་པས་ལས་བྱེད་པ་ལ་ཟེར་རོ།། ལས་བྱེད་པས་ལས་མ་བྱེད་པ་ལ་ཟེར་བ་མ་ཡིན་ནོ།། སྐད་ཆ་དེ་ཡི་དོན་གཅིག་ནི་ནང་པ་གོ་བ་དང་དོན་གཞན་ནི་མི་ལ་གོ། དཔེ་དེབ་ནི་དེབ་ཀྱི་དོན་གཅིག་རེད་ གོ་བ་ནོར་པ་མ་ལེན་རོགས། གོ་བ་ནོར་པ་མ་ལེན་རོགས། -------------------------------------------------------------------------------- /tests/data/corpus1/corpus1_bilou_rules.txt: -------------------------------------------------------------------------------- 1 | [text="ཀྱི་"] [text="དོན་"] [pos="U" & text="གཅིག་"] 3 = [pos="I"] 2 | [pos="U" & text="ལས་"] [text="བྱེད་པ"] 1 = [pos="B"] 3 | [pos="U" & text="གོ་བ་"] [text="དང་"] [text="དོན་"] 1 = [pos="S"] 4 | [text="ཀྱི་"] [pos="U"] [text="གཅིག་"] 2 = [pos="B"] -------------------------------------------------------------------------------- /tests/data/corpus1/corpus1_hd.txt: -------------------------------------------------------------------------------- 1 | ལས་ ཞེས་པ་ ནི་ ལས་བྱེད་པ ས་ ལས་ བྱེད་པ་ ལ་ ཟེར་ རོ །། ལས་བྱེད་པ ས་ ལས་ མ་ བྱེད་པ་ ལ་ ཟེར་བ་ མ་ ཡིན་ ནོ །། སྐད་ཆ་ དེ་ ཡི་ དོན་ གཅིག་ ནི་ ནང་པ་ གོ་ བ་ དང་ དོན་ གཞན་ ནི་ མི་ ལ་ གོ ། དཔེ་དེབ་ ནི་ དེབ་ ཀྱི་ དོན་གཅིག་ རེད་ གོ་བ་ ནོར་པ་ མ་ ལེན་ རོགས ། 2 | ལས་ ཞེས་པ་ ནི་ ལས་བྱེད་པ ས་ ལས་ བྱེད་པ་ ལ་ ཟེར་ རོ །། ལས་བྱེད་པ ས་ ལས་ མ་ བྱེད་པ་ ལ་ ཟེར་བ་ མ་ ཡིན་ ནོ །། སྐད་ཆ་ དེ་ ཡི་ དོན་ གཅིག་ ནི་ ནང་པ་ གོ་ བ་ དང་ དོན་ གཞན་ ནི་ མི་ ལ་ གོ ། དཔེ་དེབ་ ནི་ དེབ་ ཀྱི་ དོན་གཅིག་ རེད་ གོ་བ་ ནོར་པ་ མ་ ལེན་ རོགས ། 3 | ལས་ ཞེས་པ་ ནི་ ལས་བྱེད་པ ས་ ལས་ བྱེད་པ་ ལ་ ཟེར་ རོ །། ལས་བྱེད་པ ས་ ལས་ མ་ བྱེད་པ་ ལ་ ཟེར་བ་ མ་ ཡིན་ ནོ །། སྐད་ཆ་ དེ་ ཡི་ དོན་ གཅིག་ ནི་ ནང་པ་ གོ་ བ་ དང་ དོན་ གཞན་ ནི་ མི་ ལ་ གོ ། དཔེ་དེབ་ ནི་ དེབ་ ཀྱི་ དོན་གཅིག་ རེད་ གོ་བ་ ནོར་པ་ མ་ ལེན་ རོགས ། 4 | ལས་ ཞེས་པ་ ནི་ ལས་བྱེད་པ ས་ ལས་ བྱེད་པ་ ལ་ ཟེར་ རོ །། ལས་བྱེད་པ ས་ ལས་ མ་ བྱེད་པ་ ལ་ ཟེར་བ་ མ་ ཡིན་ ནོ །། སྐད་ཆ་ དེ་ ཡི་ དོན་ གཅིག་ ནི་ ནང་པ་ གོ་ བ་ དང་ དོན་ གཞན་ ནི་ མི་ ལ་ གོ ། དཔེ་དེབ་ ནི་ དེབ་ ཀྱི་ དོན་གཅིག་ རེད་ གོ་བ་ ནོར་པ་ མ་ ལེན་ རོགས ། 5 | ལས་ ཞེས་པ་ ནི་ ལས་བྱེད་པ ས་ ལས་ བྱེད་པ་ ལ་ ཟེར་ རོ །། ལས་བྱེད་པ ས་ ལས་ མ་ བྱེད་པ་ ལ་ ཟེར་བ་ མ་ ཡིན་ ནོ །། སྐད་ཆ་ དེ་ ཡི་ དོན་ གཅིག་ ནི་ ནང་པ་ གོ་ བ་ དང་ དོན་ གཞན་ ནི་ མི་ ལ་ གོ ། དཔེ་དེབ་ ནི་ དེབ་ ཀྱི་ དོན་གཅིག་ རེད་ གོ་བ་ ནོར་པ་ མ་ ལེན་ རོགས ། གོ་བ་ ནོར་པ་ མ་ ལེན་ རོགས ། 
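A minimal usage sketch for the two utility modules shown further above (pybo/utils/profile_report.py and pybo/utils/regex_batch_apply.py). This is an editor's illustration, not a file from the repository; the profile directory and file names below are placeholders, not paths taken from the sources.

# Hypothetical driver for the utilities above -- not part of the pybo sources.
from pathlib import Path

from pybo.utils.profile_report import profile_report
from pybo.utils.regex_batch_apply import batch_apply_regex, get_regex_pairs

# Summarise entries that recur across the *.tsv files in the sub-folders of a
# profile directory; writes <dir>/<dir>_report.tsv inside that directory.
profile_report("main_profile")  # "main_profile" is a placeholder path

# Apply "find<TAB>-<TAB>replace" regex rules to a raw Tibetan text.
rules_lines = Path("rules.txt").read_text(encoding="utf-8-sig").splitlines()
pairs = get_regex_pairs(rules_lines)  # default separator is "\t-\t"
raw = Path("01_raw_text.txt").read_text(encoding="utf-8")
Path("output.txt").write_text(batch_apply_regex(raw, pairs), encoding="utf-8")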
-------------------------------------------------------------------------------- /tests/data/corpus1/corpus1_pybo_data.txt: -------------------------------------------------------------------------------- 1 | ལས་ ཞེས་པ་ ནི་ ལས་ བྱེད་པ ས་ ལས་ བྱེད་པ་ ལ་ ཟེར་ རོ །། ལས་ བྱེད་པ ས་ ལས་ མ་ བྱེད་པ་ ལ་ ཟེར་བ་ མ་ ཡིན་ ནོ །། སྐད་ཆ་ དེ་ ཡི་ དོན་ གཅིག་ ནི་ ནང་པ་ གོ་བ་ དང་ དོན་ གཞན་ ནི་ མི་ ལ་ གོ ། དཔེ་དེབ་ ནི་ དེབ་ ཀྱི་ དོན་ གཅིག་ རེད་ གོ་བ་ ནོར་པ་ མ་ ལེན་ རོགས ། 2 | ལས་ ཞེས་པ་ ནི་ ལས་ བྱེད་པ ས་ ལས་ བྱེད་པ་ ལ་ ཟེར་ རོ །། ལས་ བྱེད་པ ས་ ལས་ མ་ བྱེད་པ་ ལ་ ཟེར་བ་ མ་ ཡིན་ ནོ །། སྐད་ཆ་ དེ་ ཡི་ དོན་ གཅིག་ ནི་ ནང་པ་ གོ་བ་ དང་ དོན་ གཞན་ ནི་ མི་ ལ་ གོ ། དཔེ་དེབ་ ནི་ དེབ་ ཀྱི་ དོན་ གཅིག་ རེད་ གོ་བ་ ནོར་པ་ མ་ ལེན་ རོགས ། 3 | ལས་ ཞེས་པ་ ནི་ ལས་ བྱེད་པ ས་ ལས་ བྱེད་པ་ ལ་ ཟེར་ རོ །། ལས་ བྱེད་པ ས་ ལས་ མ་ བྱེད་པ་ ལ་ ཟེར་བ་ མ་ ཡིན་ ནོ །། སྐད་ཆ་ དེ་ ཡི་ དོན་ གཅིག་ ནི་ ནང་པ་ གོ་བ་ དང་ དོན་ གཞན་ ནི་ མི་ ལ་ གོ ། དཔེ་དེབ་ ནི་ དེབ་ ཀྱི་ དོན་ གཅིག་ རེད་ གོ་བ་ ནོར་པ་ མ་ ལེན་ རོགས ། 4 | ལས་ ཞེས་པ་ ནི་ ལས་ བྱེད་པ ས་ ལས་ བྱེད་པ་ ལ་ ཟེར་ རོ །། ལས་ བྱེད་པ ས་ ལས་ མ་ བྱེད་པ་ ལ་ ཟེར་བ་ མ་ ཡིན་ ནོ །། སྐད་ཆ་ དེ་ ཡི་ དོན་ གཅིག་ ནི་ ནང་པ་ གོ་བ་ དང་ དོན་ གཞན་ ནི་ མི་ ལ་ གོ ། དཔེ་དེབ་ ནི་ དེབ་ ཀྱི་ དོན་ གཅིག་ རེད་ གོ་བ་ ནོར་པ་ མ་ ལེན་ རོགས ། 5 | ལས་ ཞེས་པ་ ནི་ ལས་ བྱེད་པ ས་ ལས་ བྱེད་པ་ ལ་ ཟེར་ རོ །། ལས་ བྱེད་པ ས་ ལས་ མ་ བྱེད་པ་ ལ་ ཟེར་བ་ མ་ ཡིན་ ནོ །། སྐད་ཆ་ དེ་ ཡི་ དོན་ གཅིག་ ནི་ ནང་པ་ གོ་བ་ དང་ དོན་ གཞན་ ནི་ མི་ ལ་ གོ ། དཔེ་དེབ་ ནི་ དེབ་ ཀྱི་ དོན་ གཅིག་ རེད་ གོ་བ་ ནོར་པ་ མ་ ལེན་ རོགས ། གོ་བ་ ནོར་པ་ མ་ ལེན་ རོགས ། -------------------------------------------------------------------------------- /tests/data/corpus1/corpus1_rules.txt: -------------------------------------------------------------------------------- 1 | ["ནང་པ་"] ["གོ་བ་"] ["དང་"] ["དོན་"] 2-1 : [] [] 2 | ["ཀྱི་"] ["དོན་"] ["གཅིག་"] ["རེད་"] ["གོ་བ་"] 2 + [] 3 | ["དེབ་"] ["ཀྱི་"] ["དོན་"] ["གཅིག་"] ["རེད་"] 3 + [] -------------------------------------------------------------------------------- /tests/data/corpus1/corpus1_tr_data.txt: -------------------------------------------------------------------------------- 1 | ལས་/U ཞེས་པ་/U ནི་/U ལས་/B བྱེད་པ/I ས་/U ལས་/U བྱེད་པ་/U ལ་/U ཟེར་/U རོ/U །།/U ལས་/B བྱེད་པ/I ས་/U ལས་/U མ་/U བྱེད་པ་/U ལ་/U ཟེར་བ་/U མ་/U ཡིན་/U ནོ/U །།/U སྐད་ཆ་/U དེ་/U ཡི་/U དོན་/U གཅིག་/U ནི་/U ནང་པ་/U གོ་བ་/S དང་/U དོན་/U གཞན་/U ནི་/U མི་/U ལ་/U གོ/U །/U དཔེ་དེབ་/U ནི་/U དེབ་/U ཀྱི་/U དོན་/B གཅིག་/I རེད་/U གོ་བ་/U ནོར་པ་/U མ་/U ལེན་/U རོགས/U །/U 2 | ལས་/U ཞེས་པ་/U ནི་/U ལས་/B བྱེད་པ/I ས་/U ལས་/U བྱེད་པ་/U ལ་/U ཟེར་/U རོ/U །།/U ལས་/B བྱེད་པ/I ས་/U ལས་/U མ་/U བྱེད་པ་/U ལ་/U ཟེར་བ་/U མ་/U ཡིན་/U ནོ/U །།/U སྐད་ཆ་/U དེ་/U ཡི་/U དོན་/U གཅིག་/U ནི་/U ནང་པ་/U གོ་བ་/S དང་/U དོན་/U གཞན་/U ནི་/U མི་/U ལ་/U གོ/U །/U དཔེ་དེབ་/U ནི་/U དེབ་/U ཀྱི་/U དོན་/B གཅིག་/I རེད་/U གོ་བ་/U ནོར་པ་/U མ་/U ལེན་/U རོགས/U །/U 3 | ལས་/U ཞེས་པ་/U ནི་/U ལས་/B བྱེད་པ/I ས་/U ལས་/U བྱེད་པ་/U ལ་/U ཟེར་/U རོ/U །།/U ལས་/B བྱེད་པ/I ས་/U ལས་/U མ་/U བྱེད་པ་/U ལ་/U ཟེར་བ་/U མ་/U ཡིན་/U ནོ/U །།/U སྐད་ཆ་/U དེ་/U ཡི་/U དོན་/U གཅིག་/U ནི་/U ནང་པ་/U གོ་བ་/S དང་/U དོན་/U གཞན་/U ནི་/U མི་/U ལ་/U གོ/U །/U དཔེ་དེབ་/U ནི་/U དེབ་/U ཀྱི་/U དོན་/B གཅིག་/I རེད་/U གོ་བ་/U ནོར་པ་/U མ་/U ལེན་/U རོགས/U །/U 4 | ལས་/U ཞེས་པ་/U ནི་/U ལས་/B བྱེད་པ/I ས་/U ལས་/U བྱེད་པ་/U ལ་/U ཟེར་/U རོ/U །།/U ལས་/B བྱེད་པ/I ས་/U ལས་/U མ་/U བྱེད་པ་/U ལ་/U ཟེར་བ་/U མ་/U ཡིན་/U ནོ/U །།/U སྐད་ཆ་/U དེ་/U ཡི་/U དོན་/U གཅིག་/U ནི་/U ནང་པ་/U གོ་བ་/S དང་/U དོན་/U གཞན་/U ནི་/U མི་/U ལ་/U གོ/U །/U དཔེ་དེབ་/U ནི་/U དེབ་/U ཀྱི་/U དོན་/B གཅིག་/I རེད་/U གོ་བ་/U ནོར་པ་/U མ་/U ལེན་/U རོགས/U །/U 5 | ལས་/U ཞེས་པ་/U ནི་/U ལས་/B བྱེད་པ/I ས་/U ལས་/U བྱེད་པ་/U ལ་/U 
ཟེར་/U རོ/U །།/U ལས་/B བྱེད་པ/I ས་/U ལས་/U མ་/U བྱེད་པ་/U ལ་/U ཟེར་བ་/U མ་/U ཡིན་/U ནོ/U །།/U སྐད་ཆ་/U དེ་/U ཡི་/U དོན་/U གཅིག་/U ནི་/U ནང་པ་/U གོ་བ་/S དང་/U དོན་/U གཞན་/U ནི་/U མི་/U ལ་/U གོ/U །/U དཔེ་དེབ་/U ནི་/U དེབ་/U ཀྱི་/U དོན་/B གཅིག་/I རེད་/U གོ་བ་/U ནོར་པ་/U མ་/U ལེན་/U རོགས/U །/U གོ་བ་/U ནོར་པ་/U མ་/U ལེན་/U རོགས/U །/U 6 | -------------------------------------------------------------------------------- /tests/data/drokun_test/drokun_test.txt: -------------------------------------------------------------------------------- 1 | བདག་པས་གཞན་གཅེས་འགྲོ་ཀུན་བརྩེ་བས་སྐྱོང་། 2 | བདག་སོགས་འགྲོ་ཀུན་སྨིན་ཅིང་གྲོལ་བྱའི་ཕྱིར། 3 | དགེ་བས་འགྲོ་ཀུན་སངས་རྒྱས་ཐོབ་ཕྱིར་བསྔོ། 4 | བསོད་ནམས་དེས།།འགྲོ་ཀུན་བཤེས་གཉེན་བསྟེན་པར་ཤོག། 5 | བཅས་ཏེ་གནང་ཚུལ།འགྲོ་ཀུན་དང་བ་འདྲེན་ཕྱིར་བཀོད་པ་ 6 | མི་འདུག་པས།།འགྲོ་ཀུན་བརྩེ་བས་སྐྱོངས་ཤིག་ 7 | ཆུ་དེས་འགྲོ་ཀུན་ཚིམ་པ་རྨི། 8 | ཆུ་དེས་འགྲོ་ཀུན་ཚིམ་པ་དེ། 9 | ལགས་སོ།།།།གསུམ་པ།འགྲོ་ཀུན་དང་བ་འདྲེན་ཕྱིར་བཀོད་པ་ 10 | བདག་པས་གཞན་གཅེས་འགྲོ་ཀུན་བརྩེ་བས་སྐྱོང་། 11 | བདག་སོགས་འགྲོ་ཀུན་སྨིན་ཅིང་གྲོལ་བྱའི་ཕྱིར། 12 | དགེ་བས་འགྲོ་ཀུན་སངས་རྒྱས་ཐོབ་ཕྱིར་བསྔོ། 13 | བསོད་ནམས་དེས།།འགྲོ་ཀུན་བཤེས་གཉེན་བསྟེན་པར་ཤོག། 14 | བཅས་ཏེ་གནང་ཚུལ།འགྲོ་ཀུན་དང་བ་འདྲེན་ཕྱིར་བཀོད་པ་ 15 | མི་འདུག་པས།།འགྲོ་ཀུན་བརྩེ་བས་སྐྱོངས་ཤིག་ 16 | ཆུ་དེས་འགྲོ་ཀུན་ཚིམ་པ་རྨི། 17 | ཆུ་དེས་འགྲོ་ཀུན་ཚིམ་པ་དེ། 18 | ལགས་སོ།།།།གསུམ་པ།འགྲོ་ཀུན་དང་བ་འདྲེན་ཕྱིར་བཀོད་པ་ 19 | བཅས་ཏེ་གནང་ཚུལ།འགྲོ་ཀུན་དང་བ་འདྲེན་ཕྱིར་བཀོད་པ་ 20 | བདག་པས་གཞན་གཅེས་འགྲོ་ཀུན་བརྩེ་བས་སྐྱོང་། 21 | བདག་སོགས་འགྲོ་ཀུན་སྨིན་ཅིང་གྲོལ་བྱའི་ཕྱིར། 22 | དགེ་བས་འགྲོ་ཀུན་སངས་རྒྱས་ཐོབ་ཕྱིར་བསྔོ། 23 | བསོད་ནམས་དེས།།འགྲོ་ཀུན་བཤེས་གཉེན་བསྟེན་པར་ཤོག། 24 | བཅས་ཏེ་གནང་ཚུལ།འགྲོ་ཀུན་དང་བ་འདྲེན་ཕྱིར་བཀོད་པ་ 25 | མི་འདུག་པས།།འགྲོ་ཀུན་བརྩེ་བས་སྐྱོངས་ཤིག་ 26 | ཆུ་དེས་འགྲོ་ཀུན་ཚིམ་པ་རྨི། 27 | ཆུ་དེས་འགྲོ་ཀུན་ཚིམ་པ་དེ། 28 | ལགས་སོ།།།།གསུམ་པ།འགྲོ་ཀུན་དང་བ་འདྲེན་ཕྱིར་བཀོད་པ་ -------------------------------------------------------------------------------- /tests/data/drokun_test/drokun_test_bilou_rules.txt: -------------------------------------------------------------------------------- 1 | [text=""] [pos="U"] [text="།།།།"] 2 = [pos="S"] 2 | [pos="U"] [pos="U" & text="འགྲོ་"] [pos="U"] 2 = [pos="B"] 3 | [pos=""] [pos="U" & text="བདག་པ"] 2 = [pos="S"] 4 | [pos="U" & text="ཀུན་"] [pos="U"] [pos="U" & text="དེ"] 1 = [pos="U"] 5 | [text="བདག་པ"] [pos="U"] [text="གཞན་གཅེས་"] 2 = [pos="I"] 6 | [pos="U" & text="།།།།"] [text="གསུམ་པ"] [text="།"] 1 = [pos="S"] 7 | [pos="U" & text="ཀུན་"] [pos="U"] [pos="U"] 1 = [pos="I"] 8 | [text="དེ"] [] [pos="U" & text="།།"] 3 = [pos="S"] 9 | [text="གནང་ཚུལ"] [pos="U"] [pos="U" & text="འགྲོ་"] [pos="U"] 3 = [pos="U"] -------------------------------------------------------------------------------- /tests/data/drokun_test/drokun_test_hd.txt: -------------------------------------------------------------------------------- 1 | བདག་ པས་ གཞན་གཅེས་ འགྲོ་ཀུན་ བརྩེ་བ ས་ སྐྱོང་ ། 2 | བདག་ སོགས་ འགྲོ་ཀུན་ སྨིན་ ཅིང་ གྲོལ་བྱ འི་ ཕྱིར ། 3 | དགེ་བ ས་ འགྲོ་ཀུན་ སངས་རྒྱས་ ཐོབ་ ཕྱིར་ བསྔོ ། 4 | བསོད་ནམས་ དེ ས ། ། འགྲོ་ཀུན་ བཤེས་གཉེན་ བསྟེན་པ ར་ ཤོག ། 5 | བཅས་ ཏེ་ གནང་ཚུལ ། འགྲོ་ ཀུན་ དང་བ་ འདྲེན་ ཕྱིར་ བཀོད་པ་ 6 | མི་ འདུག་པ ས ། ། འགྲོ་ ཀུན་ བརྩེ་བ ས་ སྐྱོངས་ ཤིག་ 7 | ཆུ་ དེ ས་ འགྲོ་ཀུན་ ཚིམ་པ་ རྨི ། 8 | ཆུ་ དེ ས་ འགྲོ་ ཀུན་ ཚིམ་པ་ དེ ། 9 | ལགས་ སོ །། །། གསུམ་པ ། འགྲོ་ཀུན་ དང་བ་ འདྲེན་ ཕྱིར་ བཀོད་པ་ 10 | བདག་ པས་ གཞན་གཅེས་ འགྲོ་ཀུན་ བརྩེ་བ ས་ སྐྱོང་ ། 11 | བདག་ སོགས་ འགྲོ་ཀུན་ སྨིན་ ཅིང་ གྲོལ་བྱ འི་ ཕྱིར ། 12 | དགེ་བ ས་ འགྲོ་ཀུན་ སངས་རྒྱས་ ཐོབ་ ཕྱིར་ བསྔོ ། 13 | བསོད་ནམས་ དེ ས ། ། འགྲོ་ཀུན་ བཤེས་གཉེན་ བསྟེན་པ ར་ ཤོག ། 14 | བཅས་ ཏེ་ གནང་ཚུལ ། འགྲོ་ ཀུན་ དང་བ་ འདྲེན་ ཕྱིར་ བཀོད་པ་ 
15 | མི་ འདུག་པ ས ། ། འགྲོ་ ཀུན་ བརྩེ་བ ས་ སྐྱོངས་ ཤིག་ 16 | ཆུ་ དེ ས་ འགྲོ་ཀུན་ ཚིམ་པ་ རྨི ། 17 | ཆུ་ དེ ས་ འགྲོ་ ཀུན་ ཚིམ་པ་ དེ ། 18 | ལགས་ སོ །། །། གསུམ་པ ། འགྲོ་ཀུན་ དང་བ་ འདྲེན་ ཕྱིར་ བཀོད་པ་ 19 | བཅས་ ཏེ་ གནང་ཚུལ ། འགྲོ་ ཀུན་ དང་བ་ འདྲེན་ ཕྱིར་ བཀོད་པ་ 20 | བདག་ པས་ གཞན་གཅེས་ འགྲོ་ཀུན་ བརྩེ་བ ས་ སྐྱོང་ ། 21 | བདག་ སོགས་ འགྲོ་ཀུན་ སྨིན་ ཅིང་ གྲོལ་བྱ འི་ ཕྱིར ། 22 | དགེ་བ ས་ འགྲོ་ཀུན་ སངས་རྒྱས་ ཐོབ་ ཕྱིར་ བསྔོ ། 23 | བསོད་ནམས་ དེ ས ། ། འགྲོ་ཀུན་ བཤེས་གཉེན་ བསྟེན་པ ར་ ཤོག ། 24 | བཅས་ ཏེ་ གནང་ཚུལ ། འགྲོ་ ཀུན་ དང་བ་ འདྲེན་ ཕྱིར་ བཀོད་པ་ 25 | མི་ འདུག་པ ས ། ། འགྲོ་ ཀུན་ བརྩེ་བ ས་ སྐྱོངས་ ཤིག་ 26 | ཆུ་ དེ ས་ འགྲོ་ཀུན་ ཚིམ་པ་ རྨི ། 27 | ཆུ་ དེ ས་ འགྲོ་ ཀུན་ ཚིམ་པ་ དེ ། 28 | ལགས་ སོ །། །། གསུམ་པ ། འགྲོ་ཀུན་ དང་བ་ འདྲེན་ ཕྱིར་ བཀོད་པ་ 29 | -------------------------------------------------------------------------------- /tests/data/drokun_test/drokun_test_rules.txt: -------------------------------------------------------------------------------- 1 | [text="གཞན་གཅེས་"] [text="འགྲོ་" & pos="VERB"] [text="ཀུན་" & pos="DET"] 2 + [] 2 | [text="།" & pos="PUNCT"] [text="འགྲོ་" & pos="VERB"] [text="ཀུན་" & pos="DET"] 2 + [] 3 | [text="ས་" & pos="PART"] [text="འགྲོ་" & pos="VERB"] [text="ཀུན་" & pos="DET"] 2 + [] 4 | [text="སོགས་" & pos="DET"] [text="འགྲོ་" & pos="VERB"] [text="ཀུན་" & pos="DET"] 2 + [] -------------------------------------------------------------------------------- /tests/data/drokun_test/drokun_test_tr_data.txt: -------------------------------------------------------------------------------- 1 | བདག་པ/S ས་/I གཞན་གཅེས་/U འགྲོ་/B ཀུན་/I བརྩེ་བ/U ས་/U སྐྱོང་/U །/U 2 | བདག་/U སོགས་/U འགྲོ་/B ཀུན་/I སྨིན་/U ཅིང་/U གྲོལ་བྱ/U འི་/U ཕྱིར/U །/U 3 | དགེ་བ/U ས་/U འགྲོ་/B ཀུན་/I སངས་རྒྱས་/U ཐོབ་/U ཕྱིར་/U བསྔོ/U །/U 4 | བསོད་ནམས་/U དེ/U ས/U །།/S འགྲོ་/B ཀུན་/I བཤེས་གཉེན་/U བསྟེན་པ/U ར་/U ཤོག/U །/U 5 | བཅས་/U ཏེ་/U གནང་ཚུལ/U །/U འགྲོ་/U ཀུན་/U དང་བ་/U འདྲེན་/U ཕྱིར་/U བཀོད་པ་/U 6 | མི་/U འདུག་པ/U ས/U །།/U འགྲོ་/U ཀུན་/U བརྩེ་བ/U ས་/U སྐྱོངས་/U ཤིག་/U 7 | ཆུ་/U དེ/U ས་/U འགྲོ་/B ཀུན་/I ཚིམ་པ་/U རྨི/U །/U 8 | ཆུ་/U དེ/U ས་/U འགྲོ་/U ཀུན་/U ཚིམ་པ་/U དེ/U །/U 9 | ལགས་སོ/S །།།།/S གསུམ་པ/U །/U འགྲོ་/B ཀུན་/I དང་བ་/U འདྲེན་/U ཕྱིར་/U བཀོད་པ་/U 10 | བདག་པ/S ས་/I གཞན་གཅེས་/U འགྲོ་/B ཀུན་/I བརྩེ་བ/U ས་/U སྐྱོང་/U །/U 11 | བདག་/U སོགས་/U འགྲོ་/B ཀུན་/I སྨིན་/U ཅིང་/U གྲོལ་བྱ/U འི་/U ཕྱིར/U །/U 12 | དགེ་བ/U ས་/U འགྲོ་/B ཀུན་/I སངས་རྒྱས་/U ཐོབ་/U ཕྱིར་/U བསྔོ/U །/U 13 | བསོད་ནམས་/U དེ/U ས/U །།/S འགྲོ་/B ཀུན་/I བཤེས་གཉེན་/U བསྟེན་པ/U ར་/U ཤོག/U །/U 14 | བཅས་/U ཏེ་/U གནང་ཚུལ/U །/U འགྲོ་/U ཀུན་/U དང་བ་/U འདྲེན་/U ཕྱིར་/U བཀོད་པ་/U 15 | མི་/U འདུག་པ/U ས/U །།/U འགྲོ་/U ཀུན་/U བརྩེ་བ/U ས་/U སྐྱོངས་/U ཤིག་/U 16 | ཆུ་/U དེ/U ས་/U འགྲོ་/B ཀུན་/I ཚིམ་པ་/U རྨི/U །/U 17 | ཆུ་/U དེ/U ས་/U འགྲོ་/U ཀུན་/U ཚིམ་པ་/U དེ/U །/U 18 | ལགས་སོ/S །།།།/S གསུམ་པ/U །/U འགྲོ་/B ཀུན་/I དང་བ་/U འདྲེན་/U ཕྱིར་/U བཀོད་པ་/U 19 | བཅས་/U ཏེ་/U གནང་ཚུལ/U །/U འགྲོ་/U ཀུན་/U དང་བ་/U འདྲེན་/U ཕྱིར་/U བཀོད་པ་/U 20 | བདག་པ/S ས་/I གཞན་གཅེས་/U འགྲོ་/B ཀུན་/I བརྩེ་བ/U ས་/U སྐྱོང་/U །/U 21 | བདག་/U སོགས་/U འགྲོ་/B ཀུན་/I སྨིན་/U ཅིང་/U གྲོལ་བྱ/U འི་/U ཕྱིར/U །/U 22 | དགེ་བ/U ས་/U འགྲོ་/B ཀུན་/I སངས་རྒྱས་/U ཐོབ་/U ཕྱིར་/U བསྔོ/U །/U 23 | བསོད་ནམས་/U དེ/U ས/U །།/S འགྲོ་/B ཀུན་/I བཤེས་གཉེན་/U བསྟེན་པ/U ར་/U ཤོག/U །/U 24 | བཅས་/U ཏེ་/U གནང་ཚུལ/U །/U འགྲོ་/U ཀུན་/U དང་བ་/U འདྲེན་/U ཕྱིར་/U བཀོད་པ་/U 25 | མི་/U འདུག་པ/U ས/U །།/U འགྲོ་/U ཀུན་/U བརྩེ་བ/U ས་/U སྐྱོངས་/U ཤིག་/U 26 | ཆུ་/U དེ/U ས་/U འགྲོ་/B ཀུན་/I ཚིམ་པ་/U རྨི/U །/U 27 | ཆུ་/U དེ/U ས་/U འགྲོ་/U ཀུན་/U ཚིམ་པ་/U དེ/U །/U 28 | ལགས་སོ/S །།།།/S གསུམ་པ/U །/U འགྲོ་/B ཀུན་/I དང་བ་/U འདྲེན་/U ཕྱིར་/U 
བཀོད་པ་/U 29 | -------------------------------------------------------------------------------- /tests/data/marpa/marpa_bilou_rules.txt: -------------------------------------------------------------------------------- 1 | [text="ཅིག་"] [pos="U" & text="གསུངས"] [text="།"] 2 = [pos="S"] 2 | [text="ནང་"] [pos="S" & text="མཐུན་པ"] 2 = [pos="I"] 3 | [pos="U"] [pos="U" & text="དུ་"] [pos="S"] 2 = [pos="S"] 4 | [pos="U"] [] [text="ཤོམས་"] 1 = [pos="S"] 5 | [text="གྱི་"] [] [pos="U"] [pos="S"] [pos="S"] 3 = [pos="U"] 6 | [pos="S" & text="ཕྲིན་ལས་"] [pos="U"] [pos="U"] 1 = [pos="U"] 7 | [text="ལ་ཆ"] [pos="U"] 2 = [pos="I"] 8 | [pos="S"] [pos="S"] [pos="U"] [] [text="དབང་"] 3 = [pos="U"] 9 | [pos="U" & text="བྱས་པ་"] [] [text="།།"] 1 = [pos="S"] 10 | [text="ཆགས་པ་"] [] [pos="U"] 3 = [pos="S"] 11 | [text="གདུང་"] [pos="U"] 2 = [pos="S"] 12 | [pos="S" & text="བུ་"] [pos="U"] [pos="U" & text="བྱུང་བ་"] 1 = [pos="S"] 13 | [pos="U"] [pos="S"] [pos="U" & text="གདེངས་"] 3 = [pos="S"] 14 | [text="གདེངས་"] [pos="U" & text="དང་"] 2 = [pos="S"] 15 | [pos="U"] [text="འཕོས་"] 1 = [pos="S"] 16 | [text="ཀྱི་"] [pos="U" & text="ཞལ་"] [text="ནས"] 2 = [pos="S"] 17 | [text="ཞུས་པ་"] [text="ལགས"] [pos="U" & text="།།"] 3 = [pos="S"] 18 | [text="།"] [pos="U" & text="སྔར་"] [text="གྱི་"] 2 = [pos="S"] 19 | [text="ཕུག་རོན་"] [] [pos="U"] 3 = [pos="S"] 20 | [pos="U" & text="བུ་སློབ་"] [] [text="།།"] 1 = [pos="S"] 21 | [pos="U" & text="ཏེ"] [pos="U"] [pos="S"] 1 = [pos="S"] 22 | [pos="U"] [pos="S"] [pos="U" & text="མི་"] 3 = [pos="S"] 23 | [text="།"] [] [pos="U" & text="སྣ་ཚོགས་"] 3 = [pos="S"] 24 | [pos="S"] [pos="U" & text="ཅིག་"] [pos="U"] 2 = [pos="S"] 25 | [pos="S"] [pos="U"] [pos="U" & text="ཚེ"] 3 = [pos="S"] 26 | [text="འི་"] [] [pos="U" & text="འདྲ"] 3 = [pos="S"] 27 | [text="འཕོས་"] [] [pos="U"] 3 = [pos="S"] 28 | [pos="U" & text="སངས་རྒྱས་"] [text="ལ་"] [text="ཞུས་པ་"] 1 = [pos="S"] 29 | [pos="S"] [pos="U"] [pos="U" & text="སོང་བ་"] 3 = [pos="S"] 30 | [text="གསུངས་བ་"] [pos="U" & text="ལྟར"] [text="།"] 2 = [pos="S"] 31 | [text="ཡོད་པ་"] [pos="U" & text="དེ་"] 2 = [pos="S"] 32 | [text="སྒོས་"] [pos="U"] 2 = [pos="S"] 33 | [pos="S"] [pos="U" & text="གསུངས"] [pos="U"] 2 = [pos="S"] 34 | [pos="U" & text="ལས"] [text="།"] [text="སྔར་"] 1 = [pos="S"] 35 | [pos="U" & text="ཡིན"] [] [text="སྤྱན་མིག་"] 1 = [pos="S"] 36 | [text="པང་"] [pos="U"] 2 = [pos="S"] 37 | [pos="S"] [pos="U"] [pos="U" & text="མགུར་"] 3 = [pos="S"] 38 | [pos="S" & text="ཕྱག་འཚལ་"] [text="བསྟོད"] 1 = [pos="U"] 39 | [text="བར་"] [pos="U" & text="དུ་"] 2 = [pos="I"] 40 | [text="བླ་མ"] [pos="U" & text="འི་"] [text="ཞལ་"] 2 = [pos="S"] 41 | [pos="S"] [pos="U"] [pos="U" & text="བཞུགས་"] 3 = [pos="S"] 42 | [pos="U" & text="།"] [] [text="ཀ་"] 1 = [pos="S"] 43 | [pos="U"] [text="ལ་ལ"] 1 = [pos="S"] 44 | [pos="U"] [] [text="རྒན་རྒོན་"] 1 = [pos="S"] 45 | [pos="U" & text="ཡོད་"] [text="ན་"] 1 = [pos="S"] 46 | [pos="U" & text="རྫོགས་རིམ་"] [] [text="བསྡུས་"] 1 = [pos="S"] 47 | [text="འགྲོ་"] [pos="U" & text="ཀུན་"] 2 = [pos="I"] 48 | [pos="S"] [pos="U"] [pos="U" & text="ཀྱི་"] [] [text="ཡིན་"] 3 = [pos="U"] 49 | [pos="U" & text="ཚུལ་"] [pos="U" & text="དུ་"] [pos="S"] 2 = [pos="U"] 50 | [pos="U" & text="རྨི་ལམ་"] [text="བཟང་"] 1 = [pos="S"] 51 | [pos="U"] [] [text="ལ་ལ"] 1 = [pos="S"] 52 | [pos="S"] [pos="U"] [pos="S" & text="དུ་མ་"] 2 = [pos="U"] 53 | [pos="U" & text="ཀྱི་"] [pos="U"] [pos="S"] 1 = [pos="S"] 54 | [pos="U" & text="འགྲོ་"] [pos="U"] [pos="S"] 1 = [pos="S"] 55 | [pos="U"] [] [text="གོམས་པ་"] 1 = [pos="S"] 56 | [text="དང་"] [pos="U" & text="།"] [] 
[text="།"] 2 = [pos="U"] 57 | [text="སྤུར་ཁང་"] [pos="U"] 2 = [pos="S"] 58 | [pos="S"] [pos="U"] [pos="U" & text="སྣང་བ་"] 3 = [pos="S"] 59 | [pos="U"] [pos="S"] [pos="S"] 1 = [pos="S"] 60 | [text="སྒྲ་"] [pos="U"] [text="།"] 2 = [pos="S"] 61 | [pos="S"] [pos="U"] [pos="U"] [text="རྗེ་བཙུན་མི་"] 3 = [pos="U"] 62 | [pos="S"] [pos="U"] [pos="U" & text="བཅས་པ་"] 3 = [pos="S"] 63 | [pos="S"] [pos="U"] [pos="U" & text="དུར་ཁྲོད་"] 3 = [pos="S"] 64 | [text="དར་བ"] [] [pos="U"] 3 = [pos="S"] 65 | [text="བཀའ་བརྒྱུད་"] [pos="U"] 2 = [pos="S"] 66 | [text="དང་"] [text="།"] [pos="U"] [text="།"] 3 = [pos="U"] 67 | [pos="U"] [pos="U" & text="མི་"] [pos="S" & text="ང་རང་"] 2 = [pos="U"] 68 | [pos="U"] [pos="S" & text="ནགས་"] [pos="U"] 2 = [pos="U"] 69 | [pos="U"] [text=".*པ"] 1 = [pos="S"] 70 | [text="དགོངས་"] [text="ཏེ"] [pos="U" & text="།"] 3 = [pos="S"] 71 | [pos="S"] [pos="U" & text="གཅིག་"] [pos="U" & text="ལ་"] 2 = [pos="U"] 72 | [pos="U" & text="ལ"] [] [text="ད་ལྟ་"] 1 = [pos="S"] 73 | [text="གཟིགས་པ"] [text="ས"] [pos="U" & text="།"] 3 = [pos="S"] 74 | [text="ལ་ལ"] [pos="U"] 2 = [pos="S"] 75 | [text="ཕྱག་ལེན་"] [] [pos="U"] [pos="S"] [pos="S"] 3 = [pos="U"] 76 | [pos="U"] [text="བདག་མེད་མ་"] 1 = [pos="S"] 77 | [pos="U"] [] [text="རྔོག་སྟོན་"] 1 = [pos="S"] 78 | [pos="U"] [] [text="གང་བ་"] 1 = [pos="S"] 79 | [text="ལུག་རྫི་"] [] [pos="U"] 3 = [pos="S"] 80 | [pos="U" & text="ཅིང་"] [pos="U"] [pos="S"] 1 = [pos="S"] 81 | [text="མཆོད་པ་"] [pos="U" & text="ཕུལ་"] 2 = [pos="S"] 82 | [text="ཡོན་"] [] [pos="U" & text="།"] [pos="U"] 3 = [pos="U"] 83 | [pos="U"] [text="ཡབ་"] [text="ལ་"] 1 = [pos="U"] 84 | [pos="S" & text="ད་ལྟ་"] [pos="U"] 1 = [pos="U"] 85 | [pos="U" & text="ཡིན་"] [text="ནོ"] 1 = [pos="S"] 86 | [text="གནང་བ"] [] [pos="S"] 3 = [pos="U"] 87 | [text="བདག་ཅག་"] [pos="U"] [text="ཀྱི་"] 2 = [pos="S"] 88 | [pos="B"] [pos="U" & text="མེད་"] [text="ཕྱག་རྒྱ་ཆེན་པོ་"] [text="ལ"] 2 = [pos="S"] 89 | [pos="S"] [pos="S" & text="མཁར་"] [pos="U"] 3 = [pos="U"] 90 | [text="གཉིས་"] [pos="S" & text="ཀ་"] 2 = [pos="U"] 91 | [pos="U"] [text="ཕ་རོལ་"] 1 = [pos="S"] 92 | [text="ར་"] [pos="U" & text="གདའ"] [text="།།"] 2 = [pos="S"] 93 | [text="ཕ་ཇོ་"] [pos="U"] 2 = [pos="S"] 94 | [pos="U" & text="རྟོགས་པ་"] [text="དེ"] [text="ས"] 1 = [pos="S"] 95 | [pos="S"] [pos="U"] [pos="I"] 2 = [pos="I"] 96 | [text="བུ་སློབ་"] [pos="U" & text="ཀུན"] 2 = [pos="S"] 97 | [pos="U" & text="དགོངས་"] [text="ཏེ"] 1 = [pos="S"] 98 | [pos="U"] [] [text="ཀ་ཆེན་"] 1 = [pos="S"] 99 | [text="།"] [pos="U"] [text="།"] 2 = [pos="S"] 100 | [pos="U"] [text="བས"] 1 = [pos="B"] 101 | [pos="S" & text="ཕུག་རོན་"] [pos="U" & text="གྱི་"] 2 = [pos="U"] 102 | [pos="U"] [text="ལུག་རྫི་"] 1 = [pos="S"] 103 | [pos="U" & text="ཁྱེད་"] [text="རང་"] [text="རྣམས་"] 1 = [pos="S"] 104 | [pos="U" & text="གང་"] [text="གི་"] 1 = [pos="S"] 105 | [text="དང་"] [] [pos="U" & text="བླ་མ་"] [pos="S"] [pos="U"] 3 = [pos="U"] 106 | [pos="U"] [text="ཡབ་"] 1 = [pos="S"] 107 | [pos="U" & text="མི་"] [text="འདུག་པ"] [text="ས"] 1 = [pos="S"] 108 | [text="ཞན་"] [] [pos="U"] 3 = [pos="S"] 109 | [pos="U"] [] [text="ཕ་རོལ་"] 1 = [pos="S"] 110 | [pos="S" & text="སྲས་"] [text="ཀྱི་"] [text="ཐུགས་"] 1 = [pos="U"] 111 | [pos="U"] [text="ཐུགས་ཁྲལ་"] 1 = [pos="S"] 112 | [text="གྲགས་རྒྱུ"] [pos="U" & text="།།"] 2 = [pos="S"] 113 | [pos="U"] [] [text="གྲགས་རྒྱུ"] 1 = [pos="S"] 114 | [pos="U"] [pos="U"] [pos="U" & text="གི་"] [text="བུ"] 3 = [pos="U"] 115 | [pos="U"] [pos="S"] [pos="U" & text="ཙམ་"] 3 = [pos="S"] 116 | [pos="U"] [pos="U"] [pos="S" & text="དོན་དུ་"] 3 = [pos="U"] 117 | 
[text="ར་"] [] [pos="U" & text="ལས"] 3 = [pos="S"] 118 | [text="མི་"] [pos="U" & text="འདུག་པ"] [text="ས"] 2 = [pos="S"] 119 | [text="བསྐོར་བ་"] [] [pos="U"] 3 = [pos="S"] 120 | [pos="U"] [pos="S" & text="ནགས་"] [pos="U"] [text="།"] 2 = [pos="S"] 121 | [text="ཀླད་"] [] [pos="U"] 3 = [pos="S"] 122 | [pos="U"] [] [text=".*མེད་པ"] 1 = [pos="S"] 123 | [pos="U" & text="།"] [] [text="མི་"] 1 = [pos="S"] 124 | [pos="U" & text="བླ་མ"] [] [text="ཞལ་"] 1 = [pos="S"] 125 | [text="གཟིགས་པ"] [pos="U" & text="ས"] [text="།"] 2 = [pos="S"] 126 | [text="།།"] [pos="U" & text="སྙན་བརྒྱུད་"] 2 = [pos="S"] 127 | [pos="U" & text="ཡོད་"] [text="ཞུས་པ"] [text="ས"] 1 = [pos="S"] 128 | [pos="U"] [text="ཕུག་རོན་"] 1 = [pos="S"] 129 | [text="ཚོགས་"] [pos="U"] [text="བུ་སློབ་"] 2 = [pos="S"] 130 | [text="།"] [pos="U"] [text="རྟ་"] 2 = [pos="S"] 131 | [text="གྱི་"] [] [pos="U" & text="དང་"] [pos="U"] [pos="S"] 3 = [pos="U"] 132 | [text="བཀའ་བརྒྱུད་"] [] [pos="U"] 3 = [pos="S"] 133 | [pos="U" & text="བཞི"] [pos="U"] [pos="S"] 1 = [pos="S"] 134 | [text="རྒན་རྒོན་"] [pos="U"] 2 = [pos="S"] 135 | [pos="S"] [pos="U"] [pos="U" & text="འདུག་"] 3 = [pos="S"] 136 | [pos="U"] [text="བཀའ་བརྒྱུད་"] 1 = [pos="S"] 137 | [pos="U" & text="དྲུག་"] [pos="U"] [pos="S"] 1 = [pos="S"] 138 | [pos="U"] [pos="U" & text="ད་"] [pos="S"] 2 = [pos="S"] 139 | [pos="U" & text="ཀྱི་"] [text="ཞལ་"] [text="ནས"] 1 = [pos="S"] 140 | [text="ནམ་མཁ"] [pos="U" & text="འི་"] [text="མཐོངས་"] 2 = [pos="S"] 141 | [pos="U"] [pos="U" & text="མི་"] [pos="S"] 2 = [pos="S"] 142 | [text="ནོ"] [pos="U" & text="།།"] 2 = [pos="S"] 143 | [text="སེམས་ཅན་"] [text="གྱི་"] [pos="U" & text="དོན་"] 3 = [pos="S"] 144 | [pos="U" & text="ཐུགས་"] [text="དྲན་"] 1 = [pos="S"] 145 | [pos="S"] [pos="U"] [pos="U" & text="གུས་"] 3 = [pos="S"] 146 | [pos="S"] [pos="U"] [pos="I" & text=".*"] 2 = [pos="S"] 147 | [text="བདག་མེད་མ་"] [] [pos="U"] 3 = [pos="S"] 148 | [text="གདམས་ངག་"] [text="ལ"] [pos="U" & text="།།"] 3 = [pos="S"] 149 | [text="འཚལ་བ་"] [] [pos="U" & text="།།"] [pos="S"] [pos="S"] 3 = [pos="U"] 150 | [pos="S"] [pos="U" & text="གྱི་"] 2 = [pos="S"] 151 | [text="དེ"] [] [pos="U" & text="ཐར་པ"] 3 = [pos="S"] 152 | [pos="U" & text="།"] [] [text="སྣ་ཚོགས་"] 1 = [pos="S"] 153 | [pos="S" & text="རྨི"] [text="།།"] [text="ནཱ་རོ་པ་"] 1 = [pos="U"] 154 | [text="ཁྱོད་"] [pos="U" & text="ཀྱི་"] [pos="U"] [pos="S"] 2 = [pos="U"] 155 | [pos="U" & text="།"] [] [text="རྟ་"] 1 = [pos="S"] 156 | [pos="U"] [pos="U"] [pos="S" & text="གཤེགས"] 3 = [pos="U"] 157 | [pos="U" & text="མེ་ཏོག་"] [text="སྣ་ཚོགས་"] 1 = [pos="S"] 158 | [pos="S"] [pos="U"] [pos="U" & text="ཀྱི་"] 3 = [pos="S"] 159 | [pos="S" & text="སྲས་"] [text="དར་མ་མདོ་སྡེ་"] 1 = [pos="U"] 160 | [pos="U" & text="དང་"] [pos="U" & text="དགའ་སྤྲོ་"] [pos="S" & text="དཔག་ཏུ་མེད་པ་"] 1 = [pos="U"] 161 | [text="བུ་སློབ་"] [] [pos="U" & text="།།"] 3 = [pos="S"] 162 | [pos="S" & text="བུ་"] [pos="U" & text="དང་"] [pos="U"] 1 = [pos="S"] 163 | [pos="U"] [pos="U"] [pos="U" & text="འི་"] [pos="S"] [pos="S"] 3 = [pos="U"] 164 | [pos="S" & text="འཁོར་བ་"] [pos="U"] [pos="U"] 1 = [pos="U"] 165 | [text="བཙུགས་པ་"] [] [pos="U" & text="།།"] 3 = [pos="S"] 166 | [text="ལ་ད"] [] [pos="U"] 3 = [pos="I"] 167 | [text="ཡོད་"] [text="ཞུས་པ"] [pos="U" & text="ས"] 3 = [pos="S"] 168 | [pos="U"] [] [text="རྔོག་པ"] 1 = [pos="S"] 169 | [pos="U" & text="གཅིག་"] [pos="U"] [pos="S"] 1 = [pos="S"] 170 | [text="དེ་"] [pos="S" & text="ཀ་"] 2 = [pos="U"] 171 | [pos="U"] [] [text="ཨ་ཕོ་"] 1 = [pos="S"] 172 | [text="བསྐོར་བ་"] [pos="U"] 2 = [pos="S"] 173 | [pos="B"] [pos="U" & 
text="མེད་"] 2 = [pos="I"] 174 | [text="ཡོན་"] [] [pos="U"] 3 = [pos="S"] 175 | [pos="U"] [pos="S"] [pos="S" & text="མཁར་"] 1 = [pos="U"] 176 | [text="རྣམས་"] [] [pos="U" & text="རྨི་ལམ་"] 3 = [pos="S"] 177 | [pos="U" & text="།"] [] [text="།"] 1 = [pos="S"] 178 | [pos="U" & text="བླ་མ་"] [pos="S"] [pos="U"] 1 = [pos="S"] 179 | [pos="S"] [pos="S" & text="བུ་"] [pos="U"] [text="དང་"] 3 = [pos="U"] 180 | [pos="S"] [pos="U"] [pos="U" & text="ཕུལ་བ"] 3 = [pos="S"] 181 | [pos="U" & text="རྣམས"] [pos="U"] [pos="S"] 1 = [pos="S"] 182 | [pos="U"] [] [text="འཕོས་"] 1 = [pos="S"] 183 | [text="འོག་"] [pos="U"] [text="བུ་ཆེན་"] 2 = [pos="U"] 184 | [text="ཐུགས་ཁྲལ་"] [] [pos="U"] 3 = [pos="S"] 185 | [pos="S"] [pos="U"] [pos="S"] 2 = [pos="S"] 186 | [text="གནང་"] [pos="U" & text="བར་"] 2 = [pos="S"] 187 | [pos="U"] [text="གོམས་པ་"] 1 = [pos="S"] 188 | [text="མེས་སྟོན་"] [] [pos="U"] 3 = [pos="S"] 189 | [text="།"] [pos="U" & text="དབུ་"] 2 = [pos="S"] 190 | [pos="S" & text="ཕྱི་ཕྱག་"] [pos="U"] 1 = [pos="U"] 191 | [pos="U"] [pos="U" & text="ཡང་"] [pos="S"] 2 = [pos="S"] 192 | [text="ན་"] [pos="U" & text="ཡོད་པ་"] 2 = [pos="S"] 193 | [text="།།"] [] [pos="S" & text="མེ་"] 3 = [pos="U"] 194 | [pos="U"] [text="ཕུག་རོན་"] [text="གྱི་"] 1 = [pos="U"] 195 | [pos="U"] [] [text="རྗེ་བཙུན་མི་"] 1 = [pos="S"] 196 | [pos="S"] [pos="U" & text="གྱི་"] [] [text="ཞིག་"] 2 = [pos="U"] 197 | [pos="S" & text="ཕུག་རོན་"] [text="གྱི་"] 1 = [pos="U"] 198 | [pos="U" & text="རྟ་"] [text="ལ་"] 1 = [pos="S"] 199 | [text="རྟོགས་པ་"] [pos="U"] [text="ས"] 2 = [pos="S"] 200 | [pos="U" & text="བར་"] [text="དུ་"] 1 = [pos="B"] 201 | [text="ཕུག་རོན་"] [text="གྱི་"] [pos="U"] 3 = [pos="U"] 202 | [text="བུ་ཆེན་"] [] [pos="U"] 3 = [pos="S"] 203 | [text="ཤིང་"] [] [pos="U" & text="ལ་"] 3 = [pos="S"] 204 | [text="ཕ་ཇོ་"] [] [pos="U"] 3 = [pos="S"] 205 | [text="།"] [pos="U"] [text="སྲས་"] 2 = [pos="S"] 206 | [pos="S" & text="དཔག་ཏུ་མེད་པ་"] [text="ཞིག་"] 1 = [pos="U"] 207 | [text="རྗེ་བཙུན་མི་"] [pos="U"] 2 = [pos="S"] 208 | [pos="U"] [text="རྗེ་བཙུན་མི་"] 1 = [pos="S"] 209 | [text="ལ་ལ"] [] [pos="U"] 3 = [pos="S"] 210 | [pos="U" & text="ཐུགས་"] [text="ཉམས་"] 1 = [pos="B"] 211 | [pos="U"] [] [text="བུ་ཆེན་"] 1 = [pos="S"] 212 | [text="ཡོད་པ་"] [pos="U" & text="ཡིན་"] 2 = [pos="S"] 213 | [pos="U" & text="གཅིག་པ"] [text=".*པ" & pos="U"] 1 = [pos="U"] 214 | [pos="U"] [] [text="མེས་སྟོན་"] 1 = [pos="S"] 215 | [pos="S" & text="ཡབ་ཡུམ་"] [pos="U"] [pos="U"] 1 = [pos="U"] 216 | [pos="U"] [pos="U"] [pos="U" & text="ང་"] [pos="S"] [pos="S"] 3 = [pos="I"] 217 | [text="དགོས་པ་"] [text="ཡོད་"] [pos="U" & text="དོ"] 3 = [pos="S"] 218 | [text="བསྒོམས་"] [pos="U"] 2 = [pos="S"] 219 | [pos="S"] [pos="S" & text="རྒྱུ་"] [pos="U"] 3 = [pos="U"] 220 | [text="རྟ་"] [pos="U" & text="ལ་"] 2 = [pos="S"] 221 | [pos="U" & text="འོག་"] [pos="U"] [text="བུ་ཆེན་" & pos="S"] 1 = [pos="U"] 222 | [pos="U"] [text="བས"] [text="།།"] 1 = [pos="S"] 223 | [text="བདག་མེད་མ་"] [pos="U"] 2 = [pos="S"] 224 | [text="ལ་ལུང་"] [pos="U" & text="བསྟན་"] 2 = [pos="I"] 225 | [pos="S"] [pos="U"] [pos="U" & text="གསུངས་པ"] 3 = [pos="S"] 226 | [pos="U" & text="ལ་"] [text="རྨི་ལམ་"] 1 = [pos="S"] 227 | [text="རྨི་ལམ་"] [pos="U" & text="བཟང་"] 2 = [pos="S"] 228 | [pos="S" & text="ཆིབས་"] [text="ནས་"] 1 = [pos="U"] 229 | [text="བཟང་"] [] [pos="U" & text="འདི"] 3 = [pos="S"] 230 | [text="ཇི་ལྟར་"] [] [pos="U"] [text=".*"] 3 = [pos="U"] 231 | [pos="U"] [pos="S"] [pos="U" & text="དེ"] 3 = [pos="S"] 232 | [pos="S"] [pos="U" & text="ར་"] [pos="S" & text="ཆད་མེད་པ"] 2 = [pos="I"] 233 | [text="གཉིས་"] 
[pos="S"] [pos="U"] [pos="S"] 3 = [pos="U"] 234 | [pos="U" & text="ར་"] [] [text="ལས"] 1 = [pos="S"] 235 | [pos="U" & text="ན་"] [] [text="ཡིན"] 1 = [pos="S"] 236 | [text="གང་"] [pos="U" & text="གི་"] 2 = [pos="S"] 237 | [pos="S" & text="ང་རང་"] [pos="U" & text="གི་"] [pos="S"] 2 = [pos="U"] 238 | [text="ལ་"] [pos="U"] [text="དོན་"] 2 = [pos="S"] 239 | [text="རང་"] [pos="U" & text="རྣམས་"] 2 = [pos="S"] 240 | [pos="S"] [pos="S"] [pos="U"] 3 = [pos="S"] 241 | [text="ལ"] [] [pos="S" & text="ད་ལྟ་"] [pos="U"] 3 = [pos="S"] 242 | [pos="S" & text="སྒོས་"] [pos="S"] [pos="U"] 3 = [pos="U"] 243 | [text="ཁྱེད་"] [pos="U" & text="རང་"] [text="རྣམས་"] 2 = [pos="S"] 244 | [text="ལ་ད"] [pos="U"] 2 = [pos="I"] 245 | [text="གསུངས་པ་"] [pos="U" & text="ལྟར་"] 2 = [pos="S"] 246 | [text="ཕ་རོལ་"] [pos="U"] 2 = [pos="S"] 247 | [text="ན་"] [] [pos="U" & text="ཡིན"] 3 = [pos="S"] 248 | [pos="U" & text="ལ་"] [text="ཞུས་པ་"] 1 = [pos="S"] 249 | [pos="U"] [pos="U" & text="རོ་"] [pos="I"] 2 = [pos="B"] 250 | [text="ས"] [] [pos="S" & text="བུ་"] [pos="U"] [pos="U"] 3 = [pos="S"] 251 | [text="ཐུགས་"] [pos="U" & text="དྲན་"] 2 = [pos="S"] 252 | [text="ར་"] [pos="S"] [text="ན་"] 2 = [pos="U"] 253 | [pos="S" & text="བུ་"] [pos="U"] [pos="U"] 1 = [pos="U"] 254 | [text="ཡོད་"] [pos="U" & text="ཞུས་པ"] 2 = [pos="S"] 255 | [pos="U"] [pos="S" & text="བཞི་"] [pos="U"] 2 = [pos="U"] 256 | [text="དང་"] [text="།"] [pos="U" & text="ནཱ་རོ"] 3 = [pos="S"] 257 | [text="དགོས་པ་"] [pos="U" & text="ཡོད་"] 2 = [pos="S"] 258 | [pos="U"] [text=".*ཁ"] 1 = [pos="S"] 259 | [pos="U" & text="གནང་"] [text="བར་"] 1 = [pos="B"] 260 | [text="རྨི་ལམ་"] [text="བཟང་"] [pos="U" & text="།།"] 3 = [pos="S"] 261 | [pos="U" & text="བྱོན་"] [pos="S"] [pos="S"] 1 = [pos="U"] 262 | [text="པང་"] [] [pos="U"] 3 = [pos="S"] 263 | [pos="U" & text="སྐུ་དྲིན་"] [text="ཅན"] [text="།།"] 1 = [pos="B"] 264 | [pos="U" & text="དགོས་པ་"] [text="ཡོད་"] 1 = [pos="S"] 265 | [pos="S"] [pos="U"] [pos="U" & text="བྱུང་བ་"] 3 = [pos="S"] 266 | [pos="U" & text="ནང་"] [text="མཐུན་པ"] 1 = [pos="B"] 267 | [pos="U" & text="ར་"] [text="གདའ"] [text="།།"] 1 = [pos="S"] 268 | [text="དོན་དུ་"] [pos="U"] [text="ལགས་"] 2 = [pos="S"] 269 | [pos="U" & text="མཐའ་"] [pos="U"] [pos="S"] 1 = [pos="S"] 270 | [pos="U" & text="བྱུང་"] [text="སྟེ"] 1 = [pos="S"] 271 | [pos="S"] [pos="U"] [pos="S" & text="ཁྱུང་"] 2 = [pos="U"] 272 | [text="རྟོགས་པ་"] [] [pos="U" & text="ས"] 3 = [pos="S"] 273 | [text="དོན་དུ་"] [] [pos="U" & text="ལགས་"] 3 = [pos="S"] 274 | [pos="U"] [] [text="བདག་མེད་མ་"] 1 = [pos="S"] 275 | [pos="S"] [pos="U"] [pos="U" & text="།"] [] [text="།"] 3 = [pos="B"] 276 | [pos="U"] [pos="U" & text="ཡང་"] [pos="S" & text="རྗེ་མར་པ"] 2 = [pos="U"] 277 | [pos="U" & text="།།"] [] [text="རྨི་ལམ་"] 1 = [pos="S"] 278 | [pos="U" & text="བར་ཆད་"] [] [text="།།"] 1 = [pos="S"] 279 | [text="བུ་ཆེན་"] [] [pos="U"] [pos="S"] [pos="U"] 3 = [pos="U"] 280 | [pos="S"] [pos="U" & text="གྱི་"] [] [text="།།"] 2 = [pos="U"] 281 | [pos="U" & text="ནམ་མཁ"] [pos="U"] [pos="S"] 1 = [pos="S"] 282 | [text="ནཱ་རོ"] [pos="U" & text="འི་"] [text="ལུང་བསྟན་"] 2 = [pos="S"] 283 | [pos="U" & text="ཡོད་པ་"] [text="དེ་"] 1 = [pos="S"] 284 | [text="རྗེ་"] [pos="U" & text="མི་"] 2 = [pos="B"] 285 | [pos="U"] [pos="U" & text="ཡང་"] [pos="S"] [text="བྱུང་"] 2 = [pos="U"] 286 | [text="གོམས་པ་"] [pos="U"] 2 = [pos="S"] 287 | [text="ལ་"] [pos="U" & text="ནི་"] 2 = [pos="S"] 288 | [pos="S" & text="བུ་"] [pos="U" & text="ལ་"] [pos="U"] 1 = [pos="S"] 289 | [text="བུ་"] [pos="U" & text="དང་"] 2 = [pos="S"] 290 | [text="།"] 
[text="མེས་སྟོན་"] [pos="U" & text="གྱིས་"] 3 = [pos="U"] 291 | [pos="S"] [pos="U"] [pos="U" & text="གྱིས་"] 3 = [pos="S"] 292 | [pos="U"] [] [text="མི་ངན་"] 1 = [pos="S"] 293 | [pos="S"] [pos="U" & text="ཡིན"] [pos="U"] 2 = [pos="S"] 294 | [text="ཕྱག་ལེན་"] [pos="U" & text="དང་"] [pos="U"] [pos="S"] 2 = [pos="U"] 295 | [pos="U" & text="ར་"] [text="ཚོགས་"] [text="གྲྭ་པ་"] 1 = [pos="S"] 296 | [pos="U" & text="རྨི་ལམ་"] [pos="S"] 1 = [pos="S"] 297 | [text="འཕོས་"] [pos="U"] 2 = [pos="S"] 298 | [text="ལ་"] [text="ཞུས་པ་"] [pos="U" & text="ལགས"] 3 = [pos="S"] 299 | [text="ཉི་མ་"] [pos="U"] [text="ཕྱེད་"] 2 = [pos="S"] 300 | [pos="U"] [pos="U"] [pos="S" & text="ཡོང་བ་"] 3 = [pos="U"] 301 | [pos="U"] [] [text="གདུང་"] 1 = [pos="S"] 302 | [text="ཕ་རོལ་"] [] [pos="U"] 3 = [pos="S"] 303 | [pos="U" & text="ཁྱེད་"] [text="གཉིས་"] 1 = [pos="S"] 304 | [text="ལ་"] [pos="U" & text="རྨི་ལམ་"] 2 = [pos="S"] 305 | [pos="U" & text="ཀྱིས"] [text="།"] 1 = [pos="S"] 306 | [text="མི་"] [text="འདུག་པ"] [pos="U" & text="ས"] 3 = [pos="S"] 307 | [pos="U" & text="ཤིང་"] [] [text="ལ་"] 1 = [pos="S"] 308 | [text="ནས་"] [pos="S"] [pos="U" & text="ས་"] [pos="S"] 3 = [pos="I"] 309 | [text="ཀྱི་"] [text="ཞལ་"] [pos="U" & text="ནས"] 3 = [pos="S"] 310 | [pos="U"] [pos="U" & text="རྒྱུན་"] [pos="S"] 2 = [pos="B"] 311 | [pos="U"] [] [text="ལ་མོ"] 1 = [pos="S"] 312 | [pos="U" & text="།"] [text="དབུ་"] 1 = [pos="S"] 313 | [text="ས"] [pos="U"] [text="བུ་"] 2 = [pos="S"] 314 | [pos="U" & text="འགྲོ་"] [text="ཀུན་"] 1 = [pos="B"] 315 | [text="ཤིང་"] [pos="U"] [text="ལ་"] 2 = [pos="S"] 316 | [pos="U" & text="དང་"] [pos="U"] [pos="S"] 1 = [pos="S"] 317 | [text="ཤི་བ"] [pos="U"] 2 = [pos="S"] 318 | [text="བླ་མ"] [text="འི་"] [pos="U" & text="ཞལ་"] 3 = [pos="S"] 319 | [text="ཐུགས་"] [pos="U" & text="ཉམས་"] 2 = [pos="I"] 320 | [pos="S"] [pos="U"] [pos="U" & text="གསུངས"] 3 = [pos="S"] 321 | [text="གོམས་པ་"] [] [pos="U"] 3 = [pos="S"] 322 | [text="ཇི་ལྟར་"] [] [pos="U"] 3 = [pos="S"] 323 | [text="ཕུག་རོན་"] [pos="U"] 2 = [pos="S"] 324 | [pos="U"] [text="བུ་ཆེན་"] 1 = [pos="S"] 325 | [text="ར་"] [pos="U"] [text="ལས"] 2 = [pos="S"] 326 | [pos="U" & text="གླིང་"] [text="དུ་"] [text="གཤེགས་པ་"] 1 = [pos="S"] 327 | [pos="U"] [pos="U" & text="དུ་"] [pos="S" & text="རྗེ་མར་པ"] 2 = [pos="U"] 328 | [pos="S" & text="བཀྲ་ཤིས་ཤོག"] [pos="U" & text="།"] [pos="S"] 2 = [pos="U"] 329 | [pos="S"] [pos="U" & text="ཡོད་"] [pos="U"] 2 = [pos="S"] 330 | [pos="U"] [] [text="ཆགདབུ་"] 1 = [pos="S"] 331 | [text="དང་"] [pos="S" & text="ཕྲིན་ལས་"] [pos="U"] [pos="U"] 2 = [pos="S"] 332 | [text="བུ་ཆེན་"] [pos="U" & text="རྣམས་"] 2 = [pos="S"] 333 | [pos="U" & text="ཀུན་"] [text="ལ་"] 1 = [pos="S"] 334 | [pos="U" & text="སྒྲ་"] [text="དང་"] [text="།"] 1 = [pos="S"] 335 | [pos="U"] [text="བུ"] 1 = [pos="S"] 336 | [text="ཐར་པ"] [pos="U"] [text="གླིང་"] 2 = [pos="S"] 337 | [text="ཀ་"] [pos="U"] [text="ཏུ་"] 2 = [pos="S"] 338 | [pos="S" & text="བུ་"] [pos="U" & text="གཅིག་"] [pos="U" & text="ཡོད་པ་"] 1 = [pos="S"] 339 | [pos="U" & text="ཡོད་པ་"] [text="ཡིན་"] 1 = [pos="S"] 340 | [pos="S"] [pos="U"] [pos="U" & text="ཚུལ་"] 3 = [pos="S"] 341 | [pos="U"] [] [text="སྲིད་པ་"] 1 = [pos="S"] 342 | [text="སུ་"] [text="གཤེགས"] [pos="U" & text="།།"] 3 = [pos="S"] 343 | [text="ལ་"] [pos="U" & text="ཞུས་པ་"] [text="ལགས"] 2 = [pos="S"] 344 | [text="ཁྱབ་པ་"] [] [pos="U" & text="།།"] 3 = [pos="S"] 345 | [pos="U" & text="མཆོད་པ་"] [text="ཕུལ་"] 1 = [pos="S"] 346 | [pos="U"] [text="རྒན་རྒོན་"] 1 = [pos="S"] 347 | [pos="U"] [pos="S"] [pos="U" & text="འོད་གསལ་"] 3 = [pos="S"] 348 | 
[text="བར་"] [pos="U" & text="དུ་"] [] [text="གི་"] 2 = [pos="S"] 349 | [pos="U" & text="གསུངས་"] [pos="S"] 1 = [pos="S"] 350 | [pos="U" & text="ཡང་"] [text="།"] 1 = [pos="S"] 351 | [text="ཁྱེད་"] [pos="U" & text="གཉིས་"] 2 = [pos="S"] 352 | [text="མཐུ་ཆེན་"] [] [pos="U"] 3 = [pos="S"] 353 | [pos="U" & text="གསུངས་པ་"] [text="ལྟར་"] 1 = [pos="S"] 354 | [text="།།"] [pos="S" & text="དུས་གསུམ་"] 2 = [pos="U"] 355 | [pos="U"] [text="རྒྱུགས་པ"] 1 = [pos="S"] 356 | [text="ལྟ་བུ་"] [pos="U" & text="ཞིག་"] [text="ཡོད་པ་"] 2 = [pos="S"] 357 | [text="ཡོད་པ་"] [] [pos="U" & text="དང་"] 3 = [pos="S"] 358 | [pos="S"] [pos="U" & text="གྱི་"] [] [text="ཡོད"] 2 = [pos="U"] 359 | [pos="U"] [] [text="འདུས་བྱས་"] 1 = [pos="S"] 360 | [pos="S"] [pos="U" & text="གཅིག་"] [pos="U"] 2 = [pos="S"] 361 | [pos="U" & text="ནཱ་རོ"] [] [text="ལུང་བསྟན་"] 1 = [pos="S"] 362 | [pos="U" & text="གདམས་ངག་"] [text="ལ"] 1 = [pos="S"] 363 | [pos="U" & text="ཅིག་"] [text="གསུངས"] [text="།"] 1 = [pos="S"] 364 | [pos="U" & text="།"] [text="སྔར་"] [text="གྱི་"] 1 = [pos="S"] 365 | [pos="U" & text="གདམས་ངག་"] [text="ཐེབས་པ་"] 1 = [pos="S"] 366 | [pos="U"] [] [text="བརྒྱུད་"] 1 = [pos="S"] 367 | [text="བྱུང་བ་"] [] [pos="U" & text="།།"] 3 = [pos="S"] 368 | [pos="U" & text="།།"] [text="དེ་ནས་"] [text="རྗེ་བཙུན་མི་"] 1 = [pos="U"] 369 | [pos="U"] [pos="U"] [pos="I" & text="ཅན་"] 3 = [pos="U"] 370 | [text="ར་"] [pos="U" & text="ཚོགས་"] 2 = [pos="S"] 371 | [pos="U"] [] [text=".*བ་"] 1 = [pos="S"] 372 | [text="མེས་སྟོན་"] [pos="U"] 2 = [pos="S"] -------------------------------------------------------------------------------- /tests/data/marpa/marpa_rules.txt: -------------------------------------------------------------------------------- 1 | [text="།།" & pos="PUNCT"] [text="འགྲོ་" & pos="VERB"] [text="ཀུན་" & pos="DET"] [text="བཤེས་གཉེན་" & pos="NOUN"] 2 + [] 2 | [text="ཟླ་ཕྱེད་" & pos="NOUN"] [text="བར་" & pos="NOUN"] [text="དུ་"] [text="ཕྱོགས་" & pos="NOUN"] 2 + [] 3 | [text="།" & pos="PUNCT"] [text="འགྲོ་" & pos="VERB"] [text="ཀུན་" & pos="DET"] [text="དང་བ་" & pos="NOUN"] 2 + [] 4 | [text="གྲོང་འཇུག་" & pos="NOUN"] [text="ཐུགས་" & pos="NOUN"] [text="ཉམས་" & pos="NOUN"] [text="སུ་" & pos="ADP"] 2 + [] 5 | [text="།།" & pos="PUNCT"] [text="སྐྱེ་" & pos="NOUN"] [text="མེད་" & pos="VERB"] [text="རང་སྒྲ་"] 2 + [] 6 | [text="།།" & pos="PUNCT"] [text="སྐྱེ་" & pos="NOUN"] [text="མེད་" & pos="VERB"] [text="རི་བོང་" & pos="NOUN"] 2 + [] 7 | [text="འཁོར་འདས་" & pos="NOUN"] [text="དབྱེར་" & pos="TEXT"] [text="མེད་" & pos="VERB"] [text="ག་" & pos="PRON"] 2 + [] 8 | [text="གྱི་"] [text="བར་" & pos="NOUN"] [text="དུ་"] [text="བསྟན་པ་" & pos="VERB"] 2 + [] 9 | [text="གཞན་གཅེས་"] [text="འགྲོ་" & pos="VERB"] [text="ཀུན་" & pos="DET"] [text="བརྩེ་བ" & pos="VERB"] 2 + [] 10 | [text="བརྩེ་བདུངས་"] [text="ནང་" & pos="NOUN"] [text="མཐུན་པ" & pos="VERB"] [text="ས" & pos="PART"] 2 + [] 11 | [text="།།" & pos="PUNCT"] [text="སྐྱེ་" & pos="NOUN"] [text="མེད་" & pos="VERB"] [text="སྤྲོས་བྲལ་" & pos="ADJ"] 2 + [] 12 | [text="རྟ" & pos="NOUN"] [text="།།" & pos="PUNCT"] [text="རོ་" & pos="NOUN"] [text="སྙོམས་" & pos="VERB"] [text="ལྕགས་" & pos="NOUN"] 3 + [] 13 | [text="།།" & pos="PUNCT"] [text="རྗེ་"] [text="མི་" & pos="PART"] [text="འབྲལ་"] 3 + [] 14 | [text="བྱང་ཆུབ་" & pos="NOUN"] [text="བར་" & pos="NOUN"] [text="དུ་"] [text="གཙུག་" & pos="NOUN"] 2 + [] 15 | [text="བདུན་" & pos="NUM"] [text="བར་" & pos="NOUN"] [text="དུ་"] [text="བརྟན་པ་" & pos="VERB"] 2 + [] 16 | [text="ཡབ་" & pos="NOUN"] [text="དབྱེར་" & pos="TEXT"] [text="མེད་" & pos="VERB"] 
[text="དགྱེས་རྡོར་"] 2 + [] 17 | [text="།།" & pos="PUNCT"] [text="སྐྱེ་" & pos="NOUN"] [text="མེད་" & pos="VERB"] [text="མཚོན་པ" & pos="VERB"] 2 + [] 18 | [text="བླ་མ་" & pos="NOUN"] [text="སྐུ་དྲིན་" & pos="NOUN"] [text="ཅན" & pos="PART"] [text="།།" & pos="PUNCT"] [text="དཔལ་" & pos="OTHER"] 2 + [] 19 | [text="གི་"] [text="བར་" & pos="NOUN"] [text="དུ་"] [text="ཆོས་གྲྭ་" & pos="NOUN"] 2 + [] 20 | [text="ས་" & pos="PART"] [text="འགྲོ་" & pos="VERB"] [text="ཀུན་" & pos="DET"] [text="སངས་རྒྱས་" & pos="NOUN"] 2 + [] 21 | [text="།།" & pos="PUNCT"] [text="དབྱེར་" & pos="TEXT"] [text="མེད་" & pos="VERB"] [text="རང་བཞིན་" & pos="NOUN"] 2 + [] 22 | [text="།།" & pos="PUNCT"] [text="དབྱེར་" & pos="TEXT"] [text="མེད་" & pos="VERB"] [text="དཔལ་ལྡན་" & pos="ADJ"] 2 + [] 23 | [text="སེམས་" & pos="NOUN"] [text="སྐྱེ་" & pos="NOUN"] [text="མེད་" & pos="VERB"] [text="དུ་"] 2 + [] 24 | [text="ཇི་བཞིན་" & pos="OTHER"] [text="ཐུགས་" & pos="NOUN"] [text="ཉམས་" & pos="NOUN"] [text="སུ་" & pos="ADP"] 2 + [] 25 | [text="།།" & pos="PUNCT"] [text="བསྲེ་བ་" & pos="VERB"] [text="རོ་" & pos="NOUN"] [text="སྙོམས་" & pos="VERB"] [text="གཉིས་" & pos="NUM"] 3 + [] 26 | [text="འདྲེན་མཛད་"] [text="སྐུ་དྲིན་" & pos="NOUN"] [text="ཅན" & pos="PART"] [text="།།" & pos="PUNCT"] [text="གྲུབ་ཐོབ་" & pos="NOUN"] 2 + [] 27 | [text="གི་"] [text="ཐུགས་" & pos="NOUN"] [text="ཉམས་" & pos="NOUN"] [text="ལ་" & pos="ADP"] 2 + [] 28 | [text="ནི་" & pos="PART"] [text="ཐུགས་" & pos="NOUN"] [text="ཉམས་" & pos="NOUN"] [text="ལ་" & pos="ADP"] 2 + [] 29 | [text="།" & pos="PUNCT"] [text="དེ་དུས་" & pos="PRON"] [text="ཀྱང་"] [text="རྔོག་པ" & pos="PROPN"] [text="ས་" & pos="PART"] 2-1 : [] [] 30 | [text="ས་" & pos="PART"] [text="འགྲོ་" & pos="VERB"] [text="ཀུན་" & pos="DET"] [text="ཚིམ་པ་" & pos="VERB"] 2 + [] 31 | [text="གྱི་"] [text="བར་" & pos="NOUN"] [text="དུ་"] [text="འཕགས་པ་" & pos="VERB"] 2 + [] 32 | [text="འི་" & pos="PART"] [text="བར་" & pos="NOUN"] [text="དུ་"] [text="ཡང་"] 2 + [] 33 | [text="མོས་གུས་" & pos="NOUN"] [text="ནང་" & pos="NOUN"] [text="མཐུན་པ" & pos="VERB"] [text="ས་" & pos="PART"] 2 + [] 34 | [text="གདམས་དག་"] [text="ཐུགས་" & pos="NOUN"] [text="ཉམས་" & pos="NOUN"] [text="སུ་" & pos="ADP"] 2 + [] 35 | [text="འི་" & pos="PART"] [text="ཐུགས་" & pos="NOUN"] [text="ཉམས་" & pos="NOUN"] [text="དྲག་པོ་" & pos="ADJ"] 2 + [] 36 | [text="རང་བཞིན་" & pos="NOUN"] [text="སྐྱེ་" & pos="NOUN"] [text="མེད་" & pos="VERB"] [text="ཀི་" & pos="ADP"] 2 + [] 37 | [text="བདུན་" & pos="NUM"] [text="བར་" & pos="NOUN"] [text="དུ་"] [text="སྣང་བ་" & pos="VERB"] 2 + [] 38 | [text="ཀྱི་"] [text="བར་" & pos="NOUN"] [text="དུ་"] [text="གནས་པ" & pos="VERB"] 2 + [] 39 | [text="ནོ་"] [text="སྐུ་དྲིན་" & pos="NOUN"] [text="ཅན" & pos="PART"] [text="།།" & pos="PUNCT"] [text="ཕ་" & pos="NOUN"] 2 + [] 40 | [text="སོགས་" & pos="DET"] [text="འགྲོ་" & pos="VERB"] [text="ཀུན་" & pos="DET"] [text="སྨིན་" & pos="VERB"] 2 + [] 41 | [text="རྫོགས་" & pos="VERB"] [text="བར་" & pos="NOUN"] [text="དུ་"] [text="གཅིག་བརྒྱུད་"] 2 + [] 42 | [text="ལ་" & pos="ADP"] [text="འབྱུང་བ་" & pos="VERB"] [text="རོ་" & pos="NOUN"] [text="སྙོམས་" & pos="VERB"] [text="ཀྱི་"] 3 + [] 43 | [text="རྩེ་གདུངས་"] [text="ནང་" & pos="NOUN"] [text="མཐུན་པ" & pos="VERB"] [text="ས་" & pos="PART"] 2 + [] 44 | [text="།" & pos="PUNCT"] [text="བར་" & pos="NOUN"] [text="དུ་"] [text="ལག་" & pos="NOUN"] 2 + [] -------------------------------------------------------------------------------- /tests/data/monlam2020/multi_pos_multi_sense.csv: 
-------------------------------------------------------------------------------- 1 | edited,favorites,_id,word,result 2 | -,-,83307,ལ་,མིང་ཚིག 1. བོད་ཡིག་གི་གསལ་བྱེད་ལས་སྡེ་བ་བདུན་པའི་ཡི་གེ་གཉིས་པ་སྟེ། འདིའི་ང་རོ་འདོན་ཚུལ་ནི། སྐྱེ་གནས་སོ་དང༌། བྱེད་པ་ལྕེ་རྩེ། སྒྲ་འབྱིན་ཚུལ་ཆེས་ཤིན་ཏུ་ལྷོད་པ། མིང་གཞིའི་རྟགས་ཀྱི་དབྱེ་བའི་མོ་གཤམ་གྱི་ཡི་གེ་ཞིག 2. རྗེས་འཇུག་བཅུའི་དགུ་པ་སྟེ། མིང་གཞིའི་ཡི་གེ་ཀུན་གྱི་རྗེས་སུ་འཇུག དཔེར་ན། ཀལ། ཁལ། གལ། ངལ། ཅལ། ཆལ། མཇལ། ཉལ། ཏལ། ཐལ། དལ། ནལ། དཔལ། ཕལ། བལ། མལ། བཙལ། ཚལ། འཛོལ། ཝལ། ཞལ། ཟལ། འོལ། ཡལ། རལ། ཤལ། སལ། ཧལ། ཨལ་བཅས་པ་ལྟ་བུ། 3. ༡རྒྱུན་སྤྱོད། རི་འདབས་ནས་ཡར་རི་རྩེ་བར་གྱི་གྱེན་ངོས་སམ། ས་བབ་དམའ་ས་ནས་མཐོ་ས་ལ་ཡར་གྱེན་དུ་གསེག་ཡོད་པའི་ཟུར་ངོས་ཀྱི་མིང། དཔེར་ན། ལ་རེ་ཐུར་རེ་མ་བརྒྱབ་ན། བདེ་མོའི་ཐང་ལ་ཐོན་རྒྱུ་མེད་ལྟ་བུ། ༡ལྡོག་ཚིག ཐུར། 4. ༡ཆོས་ལུགས། གཅོད་བྱེད་ཀྱི་ཡི་གེ་སྟེ། སྐུ་ཐམས་ཅད་ཀྱི་སྣང་བ་སྟོན་པ་དབང་རྫོགས་པ་རང་བྱུང་ཆེན་པོའི་རྒྱུད་ལས། ལ་ནི་གཅོད་བྱེད་ཀྱི་ཡི་གེར་ཤེས་པར་བྱའོ། །ཞེས་རྒྱུད་དེའི་སྒྲའི་སྡོམ་བྱང་བསྟན་པའི་སྐབས་སུ་གསལ། བྱེད་ཚིག 1. ༡རྒྱུན་སྤྱོད། ལས་སུ་བྱ་བ་ལ་འཇུག་པ། ལ་དོན་གྱི་ལ་ཡིག་སྦྱར་བའི་གཞི་དེ་ལ། བྱ་བ་གཞན་ཞིག་བྱས་པའམ་བྱེད་བཞིན་པ་དང་། བྱེད་འགྱུར་གང་རུང་སྟོན་པ་ལ་འཇུག་པ་སྟེ། སྦྱོར་ཚུལ་ནི་རྗེས་འཇུག་ཐམས་ཅད་ཀྱི་རྗེས་སུ་ཁྱད་མེད་དུ་འཇུག་གོ། དཔེར་ན། ཤར་ཕྱོགས་ལ་འགྲོ། རང་ཁྱིམ་ལ་བསྡད་ལྟ་བུ། 2. ༡རྒྱུན་སྤྱོད། དགོས་ཆེད་ལ་འཇུག་པ། ལ་ཡིག་སྦྱར་བའི་ཚེ་ན། བྱེད་པ་པོ་ཅི་རིགས་ཤིག་གིས་རང་གི་དགོས་པའམ་བྱ་བའི་ཡུལ་གྱི་དགོས་པ་ཞིག་བསྒྲུབ་པའི་ཆེད་དུ་བྱ་བ་ཞིག་བྱེད་པར་སྟོན་པ་སྟེ། སྦྱོར་ཚུལ་ནི་རྗེས་འཇུག་ཐམས་ཅད་ཀྱི་རྗེས་སུ་ཁྱད་མེད་དུ་འཇུག་གོ། དཔེར་ན། ནད་པ་ལ་སྨན་བྱིན། མེ་ཏོག་ལ་ཆུ་འདྲེན་ལྟ་བུ། 3. ༡རྒྱུན་སྤྱོད། རྟེན་གནས་ལ་འཇུག་པ། ལ་ཡིག་སྦྱར་ཚེ་དངོས་པོ་གང་ཞིག་གང་དུ་ཡོད་པར་སྟོན་པར་བྱེད་པ་སྟེ། སྦྱོར་ཚུལ་ནི་རྗེས་འཇུག་ཐམས་ཅད་ཀྱི་རྗེས་སུ་ཁྱད་མེད་དུ་འཇུག་གོ། དཔེར་ན། ཐབ་ལ་མེ་ཡོད། ཁང་ཐོག་ལ་དུད་ཁུང་ཡོད་ལྟ་བུ། 4. ༡རྒྱུན་སྤྱོད། དེ་ཉིད་ལ་འཇུག་པ། ལ་དོན་གྱི་ལ་ཡིག་སྦྱར་ས་དེའི་ངོ་བོར་གྲུབ་པའམ་དེ་ཉིད་དུ་ངེས་པར་སྟོན་པ་དོན་གྱི་ཆ་ནས་ཡུལ་དང་བྱ་བ་སོ་སོར་འབྱེད་དུ་མེད་པ་ཞིག་སྟོན་པ་སྟེ། སྦྱོར་ཚུལ་ནི་རྗེས་འཇུག་ཐམས་ཅད་ཀྱི་རྗེས་སུ་ཁྱད་མེད་དུ་འཇུག་གོ། དཔེར་ན། ཇེ་གསལ་ལ་བཏང། བདེན་པ་ལ་གནས་ལྟ་བུ། 5. ༡རྒྱུན་སྤྱོད། ཚེ་སྐབས་ལ་འཇུག་པ། དུས་དང་སྐབས་སྟོན་པའི་མིང་མཐར་ལ་ཡིག་སྦྱར་ནས། བྱ་བ་གང་ཞིག་གམ་འགྱུར་ལྡོག་གང་ཞིག་དུས་སྐབས་ག་འདྲ་ཞིག་ལ་བྱེད་པའམ་འབྱུང་བ་སྟོན་པ་སྟེ། སྦྱོར་ཚུལ་ནི་རྗེས་འཇུག་ཐམས་ཅད་ཀྱི་རྗེས་སུ་ཁྱད་མེད་དུ་འཇུག་གོ། དཔེར་ན། ཚེས་བཅོ་ལྔ་ལ་འཁྲུངས། ནམ་གུང་ཙམ་ལ་འབྱོར་ལྟ་བུ། 6. ༡རྒྱུན་སྤྱོད། ཚིག་སྔ་ཕྱི་གཉིས་ཀྱི་མཚམས་སྦྱོར་དུ་འཇུག་པ་སྟེ། དཔེར་ན། རྩམ་པ་ལྡོད་ལ་མིད། གླུ་ལོངས་ལ་གར་རྩེན་ལྟ་བུ། 7. ༡རྒྱུན་སྤྱོད། དོན་རྐྱེན་གསལ་བྱེད་དུ་འཇུག་པ་ནི། དཔེར་ན། རྒྱལ་པོའི་ཆད་པ་ལ་བྲོས་ལྟ་བུ། 8. ༡རྒྱུན་སྤྱོད། བྱེད་རྒྱུ་གསལ་བྱེད་དུ་འཇུག་པ། དཔེར་ན། དངུལ་དཀར་སྟོང་ཕྲག་གཅིག་ལ་བྱམས་པའི་སྐུ་བཞེངས་ལྟ་བུ་སྟེ། དངུལ་དཀར་སྟོང་ཕྲག་གཅིག་གིས་བྱམས་པའི་སྐུ་བཞེངས་ཞེས་པ་དང་དོན་གཅིག་གོ། 9. 
༡རྒྱུན་སྤྱོད། བརྗོད་གཞི་ངོས་འཛིན་པའི་དོན་དུ་འཇུག་པ་ནི། དཔེར་ན། དགའ་བའི་ལུས་ལ་རང་དབང་མེད། སྐྱིད་པའི་སེམས་ལ་དྲན་འཛིན་མེད་ལྟ་བུ། གྲོགས་ཚིག བྱ་ཚིག་མ་འོངས་པའི་མཐའ་ལ་སྦྱར་ན་བྱ་བ་དེ་བྱེད་པ་ལ་ཧ་ཅང་ཉེ་བའི་དོན་སྟོན་ཏེ། དཔེར་ན། ང་གནས་གཞན་དུ་འགྲོ་ལ་ཡོད་ལྟ་བུ། 3 | -------------------------------------------------------------------------------- /tests/data/monlam2020/multi_pos_multi_sense_expected.csv: -------------------------------------------------------------------------------- 1 | ID,Form,Lemma,MonPOS,MonFeature,MonTag,POS,Feature,Morph,SenseTag,Definition,Example 2 | 1,ལ་,ལ་1,མིང་ཚིག,,,,,,བོད་ཡིག,བོད་ཡིག་གི་གསལ་བྱེད་ལས་སྡེ་བ་བདུན་པའི་ཡི་གེ་གཉིས་པ་སྟེ། འདིའི་ང་རོ་འདོན་ཚུལ་ནི། སྐྱེ་གནས་སོ་དང༌། བྱེད་པ་ལྕེ་རྩེ། སྒྲ་འབྱིན་ཚུལ་ཆེས་ཤིན་ཏུ་ལྷོད་པ། མིང་གཞིའི་རྟགས་ཀྱི་དབྱེ་བའི་མོ་གཤམ་གྱི་ཡི་གེ་ཞིག, 3 | 2,ལ་,ལ་1,མིང་ཚིག,,,,,,རྗེས་འཇུག,རྗེས་འཇུག་བཅུའི་དགུ་པ་སྟེ། མིང་གཞིའི་ཡི་གེ་ཀུན་གྱི་རྗེས་སུ་འཇུག ,ཀལ། ཁལ། གལ། ངལ། ཅལ། ཆལ། མཇལ། ཉལ། ཏལ། ཐལ། དལ། ནལ། དཔལ། ཕལ། བལ། མལ། བཙལ། ཚལ། འཛོལ། ཝལ། ཞལ། ཟལ། འོལ། ཡལ། རལ། ཤལ། སལ། ཧལ། ཨལ་བཅས་པ་ལྟ་བུ། 4 | 3,ལ་,ལ་1,མིང་ཚིག,,"༡རྒྱུན་སྤྱོད།, ༡ལྡོག་ཚིག:ཐུར།",,,,རི་འདབས,རི་འདབས་ནས་ཡར་རི་རྩེ་བར་གྱི་གྱེན་ངོས་སམ། ས་བབ་དམའ་ས་ནས་མཐོ་ས་ལ་ཡར་གྱེན་དུ་གསེག་ཡོད་པའི་ཟུར་ངོས་ཀྱི་མིང། ,ལ་རེ་ཐུར་རེ་མ་བརྒྱབ་ན། བདེ་མོའི་ཐང་ལ་ཐོན་རྒྱུ་མེད་ལྟ་བུ། 5 | 4,ལ་,ལ་1,མིང་ཚིག,,༡ཆོས་ལུགས།,,,,གཅོད་བྱེད,གཅོད་བྱེད་ཀྱི་ཡི་གེ་སྟེ། སྐུ་ཐམས་ཅད་ཀྱི་སྣང་བ་སྟོན་པ་དབང་རྫོགས་པ་རང་བྱུང་ཆེན་པོའི་རྒྱུད་ལས། ལ་ནི་གཅོད་བྱེད་ཀྱི་ཡི་གེར་ཤེས་པར་བྱའོ། །ཞེས་རྒྱུད་དེའི་སྒྲའི་སྡོམ་བྱང་བསྟན་པའི་སྐབས་སུ་གསལ།, 6 | 5,ལ་,ལ་2,བྱེད་ཚིག,,༡རྒྱུན་སྤྱོད།,,,,,, 7 | 6,ལ་,ལ་2,བྱེད་ཚིག,,༡རྒྱུན་སྤྱོད།,,,,,, 8 | 7,ལ་,ལ་2,བྱེད་ཚིག,,༡རྒྱུན་སྤྱོད།,,,,,, 9 | 8,ལ་,ལ་2,བྱེད་ཚིག,,༡རྒྱུན་སྤྱོད།,,,,,, 10 | 9,ལ་,ལ་2,བྱེད་ཚིག,,༡རྒྱུན་སྤྱོད།,,,,,, 11 | 10,ལ་,ལ་2,བྱེད་ཚིག,,༡རྒྱུན་སྤྱོད།,,,,,, 12 | 11,ལ་,ལ་2,བྱེད་ཚིག,,༡རྒྱུན་སྤྱོད།,,,,,, 13 | 12,ལ་,ལ་2,བྱེད་ཚིག,,༡རྒྱུན་སྤྱོད།,,,,,, 14 | 13,ལ་,ལ་2,བྱེད་ཚིག,,༡རྒྱུན་སྤྱོད།,,,,,, 15 | 14,ལ་,ལ་2,བྱེད་ཚིག,,༡རྒྱུན་སྤྱོད།,,,,,, 16 | 15,ལ་,ལ་3,གྲོགས་ཚིག,,༡རྒྱུན་སྤྱོད།,,,,,, 17 | -------------------------------------------------------------------------------- /tests/data/monlam2020/one_pos_multi_sense.csv: -------------------------------------------------------------------------------- 1 | edited,favorites,_id,word,result 2 | -,-,2,ཀ་ཀ་, མིང་ཚིག 1. བྱ་སྐྱ་ཀ 2. བྱ་སྐྱ་ཀའི་སྐད། 3. ལུག་གི་ཐེ་གེའམ་ཨ་ཅུག 4. 
༡ཡུལ་སྐད།  བྱིས་པ་ཆུང་ངུའི་གྱོན་པ་ལ་ཀ་ཀ་ཟེར། དཔེ་རིས་ལ་གཟིགས། 3 | -------------------------------------------------------------------------------- /tests/data/monlam2020/one_pos_multi_sense_expected.csv: -------------------------------------------------------------------------------- 1 | ID,Form,Lemma,MonPOS,MonFeature,MonTag,POS,Feature,Morph,SenseTag,Definition,Example 2 | 1,ཀ་ཀ་,ཀ་ཀ་1, མིང་ཚིག,  ,,,,,བྱ་,བྱ་སྐྱ་ཀ, 3 | ,,ཀ་ཀ་2,,,,,,,བྱ་,བྱ་སྐྱ་ཀའི་སྐད།, 4 | ,,ཀ་ཀ་3,,,,,,,ལུག་,ལུག་གི་ཐེ་གེའམ་ཨ་ཅུག , 5 | ,,ཀ་ཀ་4,,,༡ཡུལ་སྐད།,,,,བྱིས་པ་,བྱིས་པ་ཆུང་ངུའི་གྱོན་པ་ལ་ཀ་ཀ་ཟེར། དཔེ་རིས་ལ་གཟིགས།, 6 | -------------------------------------------------------------------------------- /tests/data/monlam2020/one_pos_one_sense.csv: -------------------------------------------------------------------------------- 1 | edited,favorites,_id,word,result 2 | -,-,3,ཀ་ཀ་ནཱི་ལ་,མིང་ཚིག ༡རྒྱ་གར། ཨིནྡྲ་ནཱི་ལ་དང་ནཱི་ལ་གཉིས་ལས་མདོག་ཅུང་ཟད་ནག་པའི་རྡོ་སྨན་ཁ་དོག་ཅུང་ཟད་ནག་པ། ཉི་མ་ཤར་བའི་ཕྱོགས་སུ་རང་གི་འོད་འབྱུང་ལ་ཆེ་ཆུང་མ་ངེས་པ་ཁ་དོག་སྔོ་ཞིང་ཤིན་ཏུ་དྭངས་པ་དང་དབྱིབས་སྐེད་རྔ་ཅན་ནམ། ལེབ་མོ་ངོས་མང་སོགས་སྣ་ཚོགས། ཆེ་ཆུང་མ་ངེས་ཤིང་། སྲ་ལ་མཁྲེགས་པ་ཞིག འདིས་ནད་མ་ལུས་སེལ་བར་བཤད། 3 | -------------------------------------------------------------------------------- /tests/data/monlam2020/one_pos_one_sense_expected.csv: -------------------------------------------------------------------------------- 1 | ID,Form,Lemma,MonPOS,MonFeature,MonTag,POS,Feature,Morph,SenseTag,Definition,Example 2 | 1,ཀ་ཀ་ནཱི་ལ་,ཀ་ཀ་ནཱི་ལ་1,མིང་ཚིག,,༡རྒྱ་གར།,,,,ཨིནྡྲ་ནཱི་ལ་,ཨིནྡྲ་ནཱི་ལ་དང་ནཱི་ལ་གཉིས་ལས་མདོག་ཅུང་ཟད་ནག་པའི་རྡོ་སྨན་ཁ་དོག་ཅུང་ཟད་ནག་པ། ཉི་མ་ཤར་བའི་ཕྱོགས་སུ་རང་གི་འོད་འབྱུང་ལ་ཆེ་ཆུང་མ་ངེས་པ་ཁ་དོག་སྔོ་ཞིང་ཤིན་ཏུ་དྭངས་པ་དང་དབྱིབས་སྐེད་རྔ་ཅན་ནམ། ལེབ་མོ་ངོས་མང་སོགས་སྣ་ཚོགས། ཆེ་ཆུང་མ་ངེས་ཤིང་། སྲ་ལ་མཁྲེགས་པ་ཞིག འདིས་ནད་མ་ལུས་སེལ་བར་བཤད།i, 3 | -------------------------------------------------------------------------------- /tests/data/monlam2020/verbs.csv: -------------------------------------------------------------------------------- 1 | edited,favorites,_id,word,result 2 | -,-,83305,རློམས་, བྱ་ཚིག ༡རྒྱུན་སྤྱོད། ༡བྱ་བྱེད་ཐ་དད་པ། རློམ་གྱི་སྐུལ་ཚིག ༢མ་འོངས་པ། བརླམ། ༢ད་ལྟ་བ། རློམ། ༢འདས་པ། བརླམས། ༢སྐུལ་ཚིག རློམས། 3 | -,-,83302,རློམ་, བྱ་ཚིག 1. ༡རྒྱུན་སྤྱོད། ༡བྱ་བྱེད་ཐ་མི་དད་པ། ༡དུས་གསུམ་ཐོར་བུ་པ། གཟུགས་མི་འགྱུར་བ། དོན་དང་མི་མཐུན་པའི་རང་ཉིད་བཟང་པོའམ་ལེགས་པ་ཡིན་པར་སེམས་པའི་དོན། དཔེར་ན། མཁས་པར་རློམ་ཡང་བླུན་རྟགས་མཐོང་ལྟ་བུ། 2. ༡བྱ་བྱེད་ཐ་དད་པ། ༡དུས་གསུམ་ཐོར་བུ་པ། སྐུལ་ཚིག་ཙམ་འགྱུར་བ། ཕྱི་རྐྱེན་གྱིས་བློ་སེམས་ངན་པར་བསྒྱུར་བའམ་དབང་པོ་རྨོངས་པར་བྱས་པའི་དོན། ༢མ་འོངས་པ། བརླམ། ༢ད་ལྟ་བ། རློམ། ༢འདས་པ། བརླམས། ༢སྐུལ་ཚིག རློམས། དཔེར་ན། ༡མ་འོངས་པ། གཟབ་ནན་མ་བྱས་ན་གདོན་གྱིས་བརླམ་ཉེན་ཆེ། ༡ད་ལྟ་བ། རྒྱུད་རློམ་བྱེད་ཀྱི་ངན་སྔགས། ༡འདས་པ། གདོན་གྱིས་བརླམས་ནས་གཏམ་འཆལ་བཤད། ༡སྐུལ་ཚིག དགྲ་བོའི་ཤེས་པ་རློམས་ཤིག་ལྟ་བུ། མིང་ཚིག ཡིད་ཆེས་པའི་དོན་ཏེ། དཔེར་ན། ངས་ནམ་རྒྱུན་གྲོགས་པོ་དག་གི་གཏམ་རློམ་གྱིན་ཡོད་ལྟ་བུ། 4 | -------------------------------------------------------------------------------- /tests/data/monlam2020/verbs_expected.csv: -------------------------------------------------------------------------------- 1 | edited,favorites,_id,word,result 2 | -,-,83305,རློམས་, བྱ་ཚིག ༡རྒྱུན་སྤྱོད། ༡བྱ་བྱེད་ཐ་དད་པ། རློམ་གྱི་སྐུལ་ཚིག ༢མ་འོངས་པ། བརླམ། ༢ད་ལྟ་བ། རློམ། ༢འདས་པ། བརླམས། ༢སྐུལ་ཚིག རློམས། 3 | -,-,83302,རློམ་, བྱ་ཚིག 1. ༡རྒྱུན་སྤྱོད། ༡བྱ་བྱེད་ཐ་མི་དད་པ། ༡དུས་གསུམ་ཐོར་བུ་པ། གཟུགས་མི་འགྱུར་བ། དོན་དང་མི་མཐུན་པའི་རང་ཉིད་བཟང་པོའམ་ལེགས་པ་ཡིན་པར་སེམས་པའི་དོན། དཔེར་ན། མཁས་པར་རློམ་ཡང་བླུན་རྟགས་མཐོང་ལྟ་བུ། 2. 
༡བྱ་བྱེད་ཐ་དད་པ། ༡དུས་གསུམ་ཐོར་བུ་པ། སྐུལ་ཚིག་ཙམ་འགྱུར་བ། ཕྱི་རྐྱེན་གྱིས་བློ་སེམས་ངན་པར་བསྒྱུར་བའམ་དབང་པོ་རྨོངས་པར་བྱས་པའི་དོན། ༢མ་འོངས་པ། བརླམ། ༢ད་ལྟ་བ། རློམ། ༢འདས་པ། བརླམས། ༢སྐུལ་ཚིག རློམས། དཔེར་ན། ༡མ་འོངས་པ། གཟབ་ནན་མ་བྱས་ན་གདོན་གྱིས་བརླམ་ཉེན་ཆེ། ༡ད་ལྟ་བ། རྒྱུད་རློམ་བྱེད་ཀྱི་ངན་སྔགས། ༡འདས་པ། གདོན་གྱིས་བརླམས་ནས་གཏམ་འཆལ་བཤད། ༡སྐུལ་ཚིག དགྲ་བོའི་ཤེས་པ་རློམས་ཤིག་ལྟ་བུ། མིང་ཚིག ཡིད་ཆེས་པའི་དོན་ཏེ། དཔེར་ན། ངས་ནམ་རྒྱུན་གྲོགས་པོ་དག་གི་གཏམ་རློམ་གྱིན་ཡོད་ལྟ་བུ། 4 | -------------------------------------------------------------------------------- /tests/hfr2cql/UDPOS-bo.txt: -------------------------------------------------------------------------------- 1 | UD-POS ཚིག་གཤིས། བྱང་བུ། 2 | ADJ རྒྱན་ཚིག རྒྱན 3 | ADP སྦྱོར་ཚིག སྦྱོར 4 | ADV བསྣན་ཚིག བསྣན 5 | AUX བྱ་གྲོགས། གྲོགས 6 | CCONJ ལྟོས་མེད་སྦྲེལ་ཚིག སྦྲེལ 7 | DET ངེས་གཟུང་། ངེས 8 | INTJ འབོད་ཚིག འབོད 9 | NOUN མིང་ཚིག མིང 10 | NUM གྲངས་ཚིག གྲངས 11 | PRON མིང་ཚབ། ཚབ 12 | PROPN ཁྱད་མིང་། ཁྱད 13 | PUNCT ཚེག་ཤད། ཚེག 14 | SCONJ ལྟོས་བཅས་སྦྲེལ་ཚིག ལྟོས 15 | VERB བྱ་ཚིག བྱ 16 | PART རོགས་ཚིག རོགས -------------------------------------------------------------------------------- /tests/hfr2cql/adjustments.txt: -------------------------------------------------------------------------------- 1 | # Syntax for the possible adjustment 2 | # =================================== 3 | # - CQL rules: "" can be used without specifying that there is "text_cleaned=" 4 | # - Index format: either "" or "-" 5 | # - Adjustment format: 6 | # - "+" for merge 7 | # - ":" for split (default: syllable mode) 8 | # - "::" for split in character mode 9 | # - "=" for replace 10 | # - Constraint: "-" is only allowed if adjustment is ":" or "::" 11 | 12 | ["ལ་ལ་"] ["ལ་ལ་"] 1 = [pos="PART"] 13 | ["ལ་ལ་"] ["ལ་ལ་"] 2 = [pos="PART"] 14 | ["ལ་ལ་"] ["ལ་ལ་"] 1-2 :: [pos="PART"] [pos="PART"] 15 | ["ལ་"] ["ལ་"] ["ལ་ལ་"] 3-2 :: [pos="PART"] [pos="PART"] 16 | ["ལ་"] ["ལ་"] ["ལ་"] ["ལ་"] 2 + [pos="PART"] -------------------------------------------------------------------------------- /tests/hfr2cql/cql/_cql2hfr_cql.txt: -------------------------------------------------------------------------------- 1 | #matchcql idx op replacecql 2 | 3 | ༺གཤིས=ངཟ༻ ༺གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཡཚ༻ 4 | ༺གཤིས=ངཟ༻ ༺"སྟེ་" ༈ གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཏཅ༻ 5 | ༺གཤིས=ངཟ༻ ༺"ཅིང་" ༈ གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཏཅ༻ 6 | ༺གཤིས=ངཟ༻ ༺"ཞིང་" ༈ གཤིས=ཏཅ༻ 2 = ༺གཤིས=མཚ༻ 7 | ༺གཤིས=ངཟ༻ ༺"ཤིང་" ༈ གཤིས=ཏཅ༻ 2 = ༺གཤིས=མཚ༻ 8 | ༺གཤིས=མཚ༻ ༺གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཡཚ༻ 9 | ༺གཤིས=མཚ༻ ༺"སྟེ་" ༈ གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཏཅ༻ 10 | ༺གཤིས=མཚ༻ ༺"ཏེ་" ༈ གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཏཅ༻ 11 | ༺གཤིས=མཚ༻ ༺"ཞིང་" ༈ གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཏཅ༻ 12 | ༺གཤིས=མཚ༻ ༺"ཤིང་" ༈ གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཏཅ༻ 13 | ༺གཤིས=མཚ༻ ༺"ཤིང་" ༈ གཤིས=ཏཅ༻ ༺༻ ༺གཤིས=ཡཚ༻ 2 = ༺གཤིས=མཚ༻ 14 | ༺གཤིས=ཚབ༻ ༺"ནས་" ༈ གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཡཚ༻ 15 | ༺"སུ་"༻ ༺གཤིས=ཚབ༻ ༺"ནས་" ༈ གཤིས=ཏཅ༻ 3 = ༺གཤིས=ཏཅ༻ 16 | ༺གཤིས=ངཚ༻ ༺གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཡཚ༻ 17 | ༺གཤིས=ངཚ༻ ༺"སྟེ་" ༈ གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཏཅ༻ 18 | ༺གཤིས=ཡཚ༻ ༺གཤིས=ཏཅ༻ 2 = ༺གཤིས=མཚ༻ 19 | ༺གཤིས=མཚ༻ ༺གཤིས=ཡཚ༻ ༺"ནས་" ༈ གཤིས=ཏཅ༻ 3 = ༺གཤིས=ཡཚ༻ 20 | ༺གཤིས=ཡཚ༻ ༺"ཏེ་" ༈ གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཏཅ༻ -------------------------------------------------------------------------------- /tests/hfr2cql/cql/cql.txt: -------------------------------------------------------------------------------- 1 | #matchcql idx op replacecql 2 | 3 | [pos="DET"] [pos="SCONJ"] 2 = [pos="ADP"] 4 | [pos="DET"] ["སྟེ་" & pos="SCONJ"] 2 = [pos="SCONJ"] 5 | [pos="DET"] ["ཅིང་" & pos="SCONJ"] 2 = [pos="SCONJ"] 6 | [pos="DET"] ["ཞིང་" & pos="SCONJ"] 2 = [pos="NOUN"] 7 | [pos="DET"] ["ཤིང་" & pos="SCONJ"] 2 = [pos="NOUN"] 8 | [pos="NOUN"] [pos="SCONJ"] 2 = [pos="ADP"] 
9 | [pos="NOUN"] ["སྟེ་" & pos="SCONJ"] 2 = [pos="SCONJ"] 10 | [pos="NOUN"] ["ཏེ་" & pos="SCONJ"] 2 = [pos="SCONJ"] 11 | [pos="NOUN"] ["ཞིང་" & pos="SCONJ"] 2 = [pos="SCONJ"] 12 | [pos="NOUN"] ["ཤིང་" & pos="SCONJ"] 2 = [pos="SCONJ"] 13 | [pos="NOUN"] ["ཤིང་" & pos="SCONJ"] [] [pos="ADP"] 2 = [pos="NOUN"] 14 | [pos="PRON"] ["ནས་" & pos="SCONJ"] 2 = [pos="ADP"] 15 | ["སུ་"] [pos="PRON"] ["ནས་" & pos="SCONJ"] 3 = [pos="SCONJ"] 16 | [pos="NUM"] [pos="SCONJ"] 2 = [pos="ADP"] 17 | [pos="NUM"] ["སྟེ་" & pos="SCONJ"] 2 = [pos="SCONJ"] 18 | [pos="ADP"] [pos="SCONJ"] 2 = [pos="NOUN"] 19 | [pos="NOUN"] [pos="ADP"] ["ནས་" & pos="SCONJ"] 3 = [pos="ADP"] 20 | [pos="ADP"] ["ཏེ་" & pos="SCONJ"] 2 = [pos="SCONJ"] -------------------------------------------------------------------------------- /tests/hfr2cql/cql2hfr.txt: -------------------------------------------------------------------------------- 1 | "ADJ" - རཚ 2 | "ADP" - ཡཚ 3 | "ADV" - ནཚ 4 | "AUX" - བག 5 | "CCONJ" - ཏམ 6 | "DET" - ངཟ 7 | "INTJ" - འཚ 8 | "NOUN" - མཚ 9 | "NUM" - ངཚ 10 | "PRON" - ཚབ 11 | "PROPN" - ཁམ 12 | "PUNCT" - ཚཤ 13 | "SCONJ" - ཏཅ 14 | "VERB" - བཚ 15 | "PART" - གཚ 16 | pos= - གཤིས= 17 | lemma= - མ= 18 | sense= - དོན= 19 | & - ༈ 20 | \[ - ༺ 21 | \] - ༻ 22 | -------------------------------------------------------------------------------- /tests/hfr2cql/cql_result.txt: -------------------------------------------------------------------------------- 1 | #matchcql idx op replacecql 2 | 3 | [pos="DET"] [pos="SCONJ"] 2 = [pos="ADP"] 4 | [pos="DET"] ["སྟེ་" & pos="SCONJ"] 2 = [pos="SCONJ"] 5 | [pos="DET"] ["ཅིང་" & pos="SCONJ"] 2 = [pos="SCONJ"] 6 | [pos="DET"] ["ཞིང་" & pos="SCONJ"] 2 = [pos="NOUN"] 7 | [pos="DET"] ["ཤིང་" & pos="SCONJ"] 2 = [pos="NOUN"] 8 | [pos="NOUN"] [pos="SCONJ"] 2 = [pos="ADP"] 9 | [pos="NOUN"] ["སྟེ་" & pos="SCONJ"] 2 = [pos="SCONJ"] 10 | [pos="NOUN"] ["ཏེ་" & pos="SCONJ"] 2 = [pos="SCONJ"] 11 | [pos="NOUN"] ["ཞིང་" & pos="SCONJ"] 2 = [pos="SCONJ"] 12 | [pos="NOUN"] ["ཤིང་" & pos="SCONJ"] 2 = [pos="SCONJ"] 13 | [pos="NOUN"] ["ཤིང་" & pos="SCONJ"] [] [pos="ADP"] 2 = [pos="NOUN"] 14 | [pos="PRON"] ["ནས་" & pos="SCONJ"] 2 = [pos="ADP"] 15 | ["སུ་"] [pos="PRON"] ["ནས་" & pos="SCONJ"] 3 = [pos="SCONJ"] 16 | [pos="NUM"] [pos="SCONJ"] 2 = [pos="ADP"] 17 | [pos="NUM"] ["སྟེ་" & pos="SCONJ"] 2 = [pos="SCONJ"] 18 | [pos="ADP"] [pos="SCONJ"] 2 = [pos="NOUN"] 19 | [pos="NOUN"] [pos="ADP"] ["ནས་" & pos="SCONJ"] 3 = [pos="ADP"] 20 | [pos="ADP"] ["ཏེ་" & pos="SCONJ"] 2 = [pos="SCONJ"] -------------------------------------------------------------------------------- /tests/hfr2cql/hfr_result.txt: -------------------------------------------------------------------------------- 1 | #matchcql idx op replacecql 2 | 3 | ༺གཤིས=ངེས༻ ༺གཤིས=ལྟོས༻ 2 = ༺གཤིས=སྦྱོར༻ 4 | ༺གཤིས=ངེས༻ ༺"སྟེ་" ༈ གཤིས=ལྟོས༻ 2 = ༺གཤིས=ལྟོས༻ 5 | ༺གཤིས=ངེས༻ ༺"ཅིང་" ༈ གཤིས=ལྟོས༻ 2 = ༺གཤིས=ལྟོས༻ 6 | ༺གཤིས=ངེས༻ ༺"ཞིང་" ༈ གཤིས=ལྟོས༻ 2 = ༺གཤིས=མིང༻ 7 | ༺གཤིས=ངེས༻ ༺"ཤིང་" ༈ གཤིས=ལྟོས༻ 2 = ༺གཤིས=མིང༻ 8 | ༺གཤིས=མིང༻ ༺གཤིས=ལྟོས༻ 2 = ༺གཤིས=སྦྱོར༻ 9 | ༺གཤིས=མིང༻ ༺"སྟེ་" ༈ གཤིས=ལྟོས༻ 2 = ༺གཤིས=ལྟོས༻ 10 | ༺གཤིས=མིང༻ ༺"ཏེ་" ༈ གཤིས=ལྟོས༻ 2 = ༺གཤིས=ལྟོས༻ 11 | ༺གཤིས=མིང༻ ༺"ཞིང་" ༈ གཤིས=ལྟོས༻ 2 = ༺གཤིས=ལྟོས༻ 12 | ༺གཤིས=མིང༻ ༺"ཤིང་" ༈ གཤིས=ལྟོས༻ 2 = ༺གཤིས=ལྟོས༻ 13 | ༺གཤིས=མིང༻ ༺"ཤིང་" ༈ གཤིས=ལྟོས༻ ༺༻ ༺གཤིས=སྦྱོར༻ 2 = ༺གཤིས=མིང༻ 14 | ༺གཤིས=ཚབ༻ ༺"ནས་" ༈ གཤིས=ལྟོས༻ 2 = ༺གཤིས=སྦྱོར༻ 15 | ༺"སུ་"༻ ༺གཤིས=ཚབ༻ ༺"ནས་" ༈ གཤིས=ལྟོས༻ 3 = ༺གཤིས=ལྟོས༻ 16 | ༺གཤིས=གྲངས༻ ༺གཤིས=ལྟོས༻ 2 = ༺གཤིས=སྦྱོར༻ 17 | ༺གཤིས=གྲངས༻ ༺"སྟེ་" ༈ གཤིས=ལྟོས༻ 2 = ༺གཤིས=ལྟོས༻ 18 | ༺གཤིས=སྦྱོར༻ ༺གཤིས=ལྟོས༻ 2 = ༺གཤིས=མིང༻ 19 | 
༺གཤིས=མིང༻ ༺གཤིས=སྦྱོར༻ ༺"ནས་" ༈ གཤིས=ལྟོས༻ 3 = ༺གཤིས=སྦྱོར༻ 20 | ༺གཤིས=སྦྱོར༻ ༺"ཏེ་" ༈ གཤིས=ལྟོས༻ 2 = ༺གཤིས=ལྟོས༻ -------------------------------------------------------------------------------- /tests/resources/rdr_rules.txt: -------------------------------------------------------------------------------- 1 | True : object.conclusion = "NN" 2 | object.tag == "LATIN" : object.conclusion = "LATIN" 3 | object.tag == "OTHER" : object.conclusion = "OTHER" 4 | object.tag == "PUNCT" : object.conclusion = "PUNCT" 5 | object.tag == "DET" : object.conclusion = "DET" 6 | object.word == "དག" and object.nextTag1 == "PART" : object.conclusion = "VERB" 7 | object.tag == "PROPN" : object.conclusion = "PROPN" 8 | object.tag == "NOUN" : object.conclusion = "NOUN" 9 | object.tag == "PART" : object.conclusion = "PART" 10 | object.word == "ས" and object.nextTag1 == "PART" : object.conclusion = "ADP" 11 | object.prevTag1 == "PUNCT" and object.word == "ས་" : object.conclusion = "ADP" 12 | object.prevTag1 == "PART" and object.word == "ས་" : object.conclusion = "ADP" 13 | object.prevWord1 == "མི" : object.conclusion = "PART" 14 | object.prevWord1 == "བྷ་" : object.conclusion = "ADP" 15 | object.word == "ས་" and object.nextWord1 == "ལ་" : object.conclusion = "ADP" 16 | object.prevWord1 == "ལ" and object.word == "ས་" : object.conclusion = "PART" 17 | object.nextWord1 == "སྟེངས་" : object.conclusion = "ADP" 18 | object.word == "ར" and object.nextWord1 == "འི་" : object.conclusion = "ADP" 19 | object.tag == "X" : object.conclusion = "X" 20 | object.tag == "OOV" : object.conclusion = "OOV" 21 | object.tag == "VERB" : object.conclusion = "VERB" 22 | object.suffixL1 == "མ" : object.conclusion = "NOUN" 23 | object.suffixL1 == "ན" : object.conclusion = "OOV" 24 | object.nextWord2 == "བོད་སྐད་" : object.conclusion = "NON_WORD" 25 | object.nextWord1 == "ཡིག་" : object.conclusion = "NON_WORD" 26 | object.suffixL2 == "ཕྱོགས་" : object.conclusion = "NOUN" 27 | object.nextTag1 == "NUM" and object.nextTag2 == "NUM" : object.conclusion = "NOUN" 28 | object.prevWord1 == "དཔེར་ན་" : object.conclusion = "NOUN" 29 | object.prevWord1 == "།_" and object.nextWord1 == "ལ་སོགས་པ་" : object.conclusion = "NOUN" 30 | object.suffixL1 == "སོ" : object.conclusion = "OOV" 31 | object.suffixL2 == "སྐྱེས་" : object.conclusion = "OOV" 32 | object.nextTag1 == "NON_WORD" : object.conclusion = "OOV" 33 | object.suffixL2 == "ཆད་" : object.conclusion = "ADV" 34 | object.tag == "ADP" : object.conclusion = "ADP" 35 | object.tag == "AUX" : object.conclusion = "AUX" 36 | object.tag == "NUM" : object.conclusion = "NUM" 37 | object.tag == "TEXT" : object.conclusion = "TEXT" 38 | object.tag == "PRON" : object.conclusion = "PRON" 39 | object.tag == "ADJ" : object.conclusion = "ADJ" 40 | object.tag == "SCONJ" : object.conclusion = "SCONJ" 41 | object.prevTag1 == "NOUN" : object.conclusion = "ADP" 42 | object.prevTag1 == "DET" : object.conclusion = "ADP" 43 | object.tag == "ADV" : object.conclusion = "ADV" 44 | object.tag == "NON_WORD" : object.conclusion = "NON_WORD" 45 | object.tag == "INTJ" : object.conclusion = "INTJ" -------------------------------------------------------------------------------- /tests/resources/shelving/test_1.txt: -------------------------------------------------------------------------------- 1 | ཝ་ཡེ། བཀྲ་ 2 | ཤིས་ཡིན་པས། 3 | -------------------------------------------------------------------------------- /tests/resources/shelving/test_1_tok/test_1_tok.txt: 
-------------------------------------------------------------------------------- 1 | ཝ་ཡེ/NO_POS །_/ བཀྲ་ཤིས་/NOUN ཡིན་པ/AUX ས/PART །/ -------------------------------------------------------------------------------- /tests/resources/shelving/test_2.txt: -------------------------------------------------------------------------------- 1 | བཀྲ་ཤིས་བདེ་ལེགས་ 2 | ཕུན་སུམ་ཚོགས། this is non-bo text རྟག་ཏུ་བདེ་ 3 | བ་ཐོབ་པ 4 | ར་ཤོག 5 | -------------------------------------------------------------------------------- /tests/resources/step1_3/input/test.txt: -------------------------------------------------------------------------------- 1 | བཀྲ་ཤིས་བདེ་ལེགས་ཕུན་སུམ་ཚོགས། རྟག་ཏུ་བདེ་བ་ཐོབ་པར་ཤོག 2 | བཀྲ་ཤིས་བདེ་ལེགས་ཕུན་སུམ་ཚོགས། རྟག་ཏུ་བདེ་བ་ཐོབ་པར་ཤོག -------------------------------------------------------------------------------- /tests/resources/step2/cql_rules.txt: -------------------------------------------------------------------------------- 1 | ["ཁྲུང་"] ["ཁྲུང་"] 1 + [pos="NOUN"] 2 | ["ཁྲུང་"] ["ཁྲུང་ས་"] 2-5 :: [pos=""] [pos="NOUN"] 3 | ["ཁྲུང་"] ["ཁྲུང་"] 1 + [pos="NOUN"] 4 | ["ཁྲུང་" & pos="NO_POS"] [pos="NOUN"] 1 + [pos="NOUN"] 5 | -------------------------------------------------------------------------------- /tests/resources/step2/manually_corrected.txt: -------------------------------------------------------------------------------- 1 | བཀྲ་ཤིས་བདེ་ལེགས་/NOUN 2 | ཕུན་སུམ་ཚོགས/ADJ །_//།//12 རྟག་/NOUN/རྟག་པ་ ཏུ་/PART/དུ་ བདེ་བ་/NOUN ཐོབ་པ/VERB/ཐོབ་ ར་/PART/ལ་ ཤོག/AUX 3 | བཀྲ་ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་/NOUN ཏུ་/PART/དུ་ བདེ་བ་/NOUN ཐོབ་པ/VERB ར་/PART ཤོག/AUXr -------------------------------------------------------------------------------- /tests/resources/step2/rdr_input.txt: -------------------------------------------------------------------------------- 1 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 2 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 3 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 4 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 5 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 6 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 7 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB -------------------------------------------------------------------------------- /tests/resources/step2/step2: -------------------------------------------------------------------------------- 1 | 
བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 2 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 3 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 4 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 5 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 6 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 7 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 8 | -------------------------------------------------------------------------------- /tests/resources/step2/test_rules.txt: -------------------------------------------------------------------------------- 1 | ༺"ཁྲུང་"༻ ༺"ཁྲུང་"༻ 1 + ༺གཤིས=མིང༻ 2 | ༺"ཁྲུང་"༻ ༺"ཁྲུང་ས་"༻ 2-5 :: ༺གཤིས=""༻ ༺གཤིས=མིང༻ 3 | ༺"ཁྲུང་"༻ ༺"ཁྲུང་"༻ 1 + ༺གཤིས=མིང༻ 4 | ༺"ཁྲུང་" ༈ གཤིས="NO_POS"༻ ༺གཤིས=མིང༻ 1 + ༺གཤིས=མིང༻ 5 | ༺"ནི་ལ"༻ 1-1 : ༺གཤིས=མིང༻ 6 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | from click.testing import CliRunner 2 | 3 | from pybo.cli import cli, profile_update 4 | 5 | 6 | def test_tok(): 7 | runner = CliRunner() 8 | runner.invoke(cli, ["tok", "tests/resources/shelving/", "--tags", "pl"]) 9 | 10 | 11 | def test_extract_rules(): 12 | runner = CliRunner() 13 | runner.invoke(cli, ["extract-rules", "tests/resources/step2/step2"]) 14 | 15 | def test_extract_seg_rules(): 16 | runner = CliRunner() 17 | runner.invoke(cli, ["extract-seg-rules", "tests/data/corpus1/corpus1.txt", "--type", "hfr", "--e", 1]) -------------------------------------------------------------------------------- /tests/test_corpus.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from textwrap import dedent 3 | import pytest 4 | 5 | from pybo import * 6 | 7 | @pytest.mark.skip(reason="old workflow") 8 | def test_parse_manually_corrected(): 9 | dump = Path(__file__).parent / "resources/step2/manually_corrected.txt" 10 | dump = dump.read_text(encoding="utf-8-sig") 11 | data = extract_new_entries(dump, Path(__file__).parent / "resources/main") 12 | assert data == dedent( 13 | """\ 14 | # form pos lemma sense freq 15 | །_ ། 12 16 | །_ PUNCT 17 | ཏུ་ PART དུ་ 18 | ཐོབ་པ་ VERB 19 | ཐོབ་པ་ VERB ཐོབ་ 20 | ཕུན་སུམ་ཚོགས་ ADJ 21 | བཀྲ་ཤིས་ NOUN 22 | བཀྲ་ཤིས་བདེ་ལེགས་ NOUN 23 | བདེ་བ་ NOUN 24 | བདེ་ལེགས་ NOUN 25 | ར་ PART 26 | ར་ PART ལ་ 27 | རྟག་ NOUN 28 | 
རྟག་ NOUN རྟག་པ་ 29 | ཤོག་ AUX 30 | ཤོག་ AUXr """ 31 | ) 32 | -------------------------------------------------------------------------------- /tests/test_hfr_cqlr_converter.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pybo.hfr_cqlr_converter import cqlr2hfr, hfr2cqlr 4 | 5 | @pytest.fixture(scope="module") 6 | def cqlr(): 7 | return ( 8 | '["ལ་ལ་"] ["ལ་ལ་"] 1 = [pos="PART"]' 9 | '["ལ་ལ་"] ["ལ་ལ་"] 2 = [pos="PART"]' 10 | '["ལ་ལ་"] ["ལ་ལ་"] 1-2 :: [pos="NOUN"] [pos="PART"]' 11 | '["ལ་"] ["ལ་"] ["ལ་ལ་"] 3-2 :: [pos="PART"] [pos="PART"]' 12 | '["ལ་"] ["ལ་"] ["ལ་"] ["ལ་"] 2 + [pos="DET"]' 13 | ) 14 | 15 | 16 | @pytest.fixture(scope="module") 17 | def hfr(): 18 | return ( 19 | '༺"ལ་ལ་"༻ ༺"ལ་ལ་"༻ 1 = ༺གཤིས=རོགས༻' 20 | '༺"ལ་ལ་"༻ ༺"ལ་ལ་"༻ 2 = ༺གཤིས=རོགས༻' 21 | '༺"ལ་ལ་"༻ ༺"ལ་ལ་"༻ 1-2 :: ༺གཤིས=མིང༻ ༺གཤིས=རོགས༻' 22 | '༺"ལ་"༻ ༺"ལ་"༻ ༺"ལ་ལ་"༻ 3-2 :: ༺གཤིས=རོགས༻ ༺གཤིས=རོགས༻' 23 | '༺"ལ་"༻ ༺"ལ་"༻ ༺"ལ་"༻ ༺"ལ་"༻ 2 + ༺གཤིས=ངེས༻' 24 | ) 25 | 26 | 27 | def test_cql2hfr(cqlr, hfr): 28 | hfr_result = cqlr2hfr(cqlr) 29 | print(hfr_result) 30 | assert hfr_result == hfr 31 | print("Test pass..") 32 | 33 | 34 | def test_hfr2cql(hfr, cqlr): 35 | cql_result = hfr2cqlr(hfr) 36 | assert cql_result == cqlr 37 | print("Test pass..") 38 | -------------------------------------------------------------------------------- /tests/test_monlam2word_list.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from pybo.monlam2wordlist import ( 6 | csv_loader, 7 | get_definition_list, 8 | get_example_list, 9 | get_pos_list, 10 | get_sense_tag_list, 11 | get_tag_list, 12 | monlam2wordlist, 13 | parse_attrs, 14 | ) 15 | 16 | testcases_ids = ("one_pos_one_sense", "one_pos_multi_senses", "multi_pos_multi_senses") 17 | 18 | # monlam-result-col, pos-list, definition-list, tag-list, sense-list, example-list 19 | parser_to_try = ( 20 | # one-pos-one-sense 21 | ( 22 | "མིང་ཚིག ༡ཀ། ཀཀ། ཁཁ། གག། དཔེར་ན། པཔ།", 23 | [("མིང་ཚིག", "༡ཀ། ཀཀ། ཁཁ། གག། དཔེར་ན། པཔ།")], 24 | [("མིང་ཚིག", "༡ཀ། ཀཀ། ཁཁ། གག། དཔེར་ན། པཔ།")], 25 | [("མིང་ཚིག", "༡ཀ།", "ཀཀ། ཁཁ། གག། དཔེར་ན། པཔ།")], 26 | [("མིང་ཚིག", "༡ཀ།", "ཀཀ", "ཀཀ། ཁཁ། གག། དཔེར་ན། པཔ།")], 27 | [("མིང་ཚིག", "༡ཀ།", "ཀཀ", "ཀཀ། ཁཁ། གག།", "པཔ།")], 28 | ), 29 | # one-pos-multi-senses 30 | ( 31 | "མིང་ཚིག 1. ༡ཀ། ཀཀ། དཔེར་ན། པཔ། 2. ༡ཀ། ཁཁ། 3. གག།", 32 | [("མིང་ཚིག", "༡ཀ། ཀཀ། དཔེར་ན། པཔ། 2. ༡ཀ། ཁཁ། 3. གག།")], 33 | [ 34 | ("མིང་ཚིག", "༡ཀ། ཀཀ། དཔེར་ན། པཔ།"), 35 | ("མིང་ཚིག", "༡ཀ། ཁཁ།"), 36 | ("མིང་ཚིག", "གག།"), 37 | ], 38 | [ 39 | ("མིང་ཚིག", "༡ཀ།", "ཀཀ། དཔེར་ན། པཔ།"), 40 | ("མིང་ཚིག", "༡ཀ།", "ཁཁ།"), 41 | ("མིང་ཚིག", "", "གག།"), 42 | ], 43 | [ 44 | ("མིང་ཚིག", "༡ཀ།", "ཀཀ", "ཀཀ། དཔེར་ན། པཔ།"), 45 | ("མིང་ཚིག", "༡ཀ།", "ཁཁ", "ཁཁ།"), 46 | ("མིང་ཚིག", "", "གག", "གག།"), 47 | ], 48 | [ 49 | ("མིང་ཚིག", "༡ཀ།", "ཀཀ", "ཀཀ།", "པཔ།"), 50 | ("མིང་ཚིག", "༡ཀ།", "ཁཁ", "ཁཁ།", ""), 51 | ("མིང་ཚིག", "", "གག", "གག།", ""), 52 | ], 53 | ), 54 | # multi-pos-multi-senses 55 | ( 56 | "མིང་ཚིག 1. ༡ཀ། ཀཀ། 2. ཁཁ། བྱེད་ཚིག 1. ཀཀ། 2. ༡ཀ། ཁཁ། དཔེར་ན། པཔ། གྲོགས་ཚིག ༡ཀ། ཀཀ། ཁཁ། བྱེད་ཚིག ཀཀ། ཁཁ།", 57 | [ 58 | ("མིང་ཚིག", "༡ཀ། ཀཀ། 2. ཁཁ།"), 59 | ("བྱེད་ཚིག", "ཀཀ། 2. 
༡ཀ། ཁཁ། དཔེར་ན། པཔ།"), 60 | ("གྲོགས་ཚིག", "༡ཀ། ཀཀ། ཁཁ།"), 61 | ("བྱེད་ཚིག", "ཀཀ། ཁཁ།"), 62 | ], 63 | [ 64 | ("མིང་ཚིག", "༡ཀ། ཀཀ།"), 65 | ("མིང་ཚིག", "ཁཁ།"), 66 | ("བྱེད་ཚིག", "ཀཀ།"), 67 | ("བྱེད་ཚིག", "༡ཀ། ཁཁ། དཔེར་ན། པཔ།"), 68 | ("གྲོགས་ཚིག", "༡ཀ། ཀཀ། ཁཁ།"), 69 | ("བྱེད་ཚིག", "ཀཀ། ཁཁ།"), 70 | ], 71 | [ 72 | ("མིང་ཚིག", "༡ཀ།", "ཀཀ།"), 73 | ("མིང་ཚིག", "", "ཁཁ།"), 74 | ("བྱེད་ཚིག", "", "ཀཀ།"), 75 | ("བྱེད་ཚིག", "༡ཀ།", "ཁཁ། དཔེར་ན། པཔ།"), 76 | ("གྲོགས་ཚིག", "༡ཀ།", "ཀཀ། ཁཁ།"), 77 | ("བྱེད་ཚིག", "", "ཀཀ། ཁཁ།"), 78 | ], 79 | [ 80 | ("མིང་ཚིག", "༡ཀ།", "ཀཀ", "ཀཀ།"), 81 | ("མིང་ཚིག", "", "ཁཁ", "ཁཁ།"), 82 | ("བྱེད་ཚིག", "", "ཀཀ", "ཀཀ།"), 83 | ("བྱེད་ཚིག", "༡ཀ།", "ཁཁ", "ཁཁ། དཔེར་ན། པཔ།"), 84 | ("གྲོགས་ཚིག", "༡ཀ།", "ཀཀ", "ཀཀ། ཁཁ།"), 85 | ("བྱེད་ཚིག", "", "ཀཀ", "ཀཀ། ཁཁ།"), 86 | ], 87 | [ 88 | ("མིང་ཚིག", "༡ཀ།", "ཀཀ", "ཀཀ།", ""), 89 | ("མིང་ཚིག", "", "ཁཁ", "ཁཁ།", ""), 90 | ("བྱེད་ཚིག", "", "ཀཀ", "ཀཀ།", ""), 91 | ("བྱེད་ཚིག", "༡ཀ།", "ཁཁ", "ཁཁ།", "པཔ།"), 92 | ("གྲོགས་ཚིག", "༡ཀ།", "ཀཀ", "ཀཀ། ཁཁ།", ""), 93 | ("བྱེད་ཚིག", "", "ཀཀ", "ཀཀ། ཁཁ།", ""), 94 | ], 95 | ), 96 | ) 97 | 98 | 99 | @pytest.fixture(params=parser_to_try, ids=testcases_ids) 100 | def parser_testcase(request): 101 | return request.param 102 | 103 | 104 | def test_get_pos_list(parser_testcase): 105 | monlam_result_col, pos_expected, *_ = parser_testcase 106 | assert get_pos_list(monlam_result_col) == pos_expected 107 | 108 | 109 | def test_get_definition_list(parser_testcase): 110 | _, pos_list, definition_expected, *_ = parser_testcase 111 | assert get_definition_list(pos_list) == definition_expected 112 | 113 | 114 | def test_get_tag_list(parser_testcase): 115 | _, _, definition_list, tag_expected, *_ = parser_testcase 116 | assert get_tag_list(definition_list) == tag_expected 117 | 118 | 119 | def test_get_sense_tag_list(parser_testcase): 120 | *_, tag_list, sense_expected, _ = parser_testcase 121 | assert get_sense_tag_list(tag_list) == sense_expected 122 | 123 | 124 | def test_get_example_list(parser_testcase): 125 | *_, sense_list, example_expected = parser_testcase 126 | assert get_example_list(sense_list) == example_expected 127 | 128 | 129 | data_path = Path("./tests/data/monlam2020/") 130 | testcases_to_try = ( 131 | ( 132 | csv_loader(data_path / "one_pos_one_sense.csv"), 133 | csv_loader(data_path / "one_pos_one_sense_expected.csv"), 134 | ), 135 | ( 136 | csv_loader(data_path / "one_pos_multi_sense.csv"), 137 | csv_loader(data_path / "one_pos_multi_sense_expected.csv"), 138 | ), 139 | ( 140 | csv_loader(data_path / "multi_pos_multi_sense.csv"), 141 | csv_loader(data_path / "multi_pos_multi_sense_expected.csv"), 142 | ), 143 | ) 144 | 145 | 146 | @pytest.fixture(params=testcases_to_try, ids=testcases_ids) 147 | def a_testcase(request): 148 | return request.param 149 | 150 | 151 | # def test_monlam2wordlist(a_testcase): 152 | # monlam_rows, expected_rows = a_testcase 153 | # wordlists = monlam2wordlist(monlam_rows) 154 | # print(wordlists) 155 | -------------------------------------------------------------------------------- /tests/test_rdr2adjustment.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from textwrap import dedent 3 | 4 | from pybo.rdr.rdr_2_replace_matcher import rdr_2_replace_matcher 5 | 6 | 7 | def test_suffix_bug(): 8 | dump = Path("tests/resources/rdr_rules.txt").read_text(encoding="utf-8") 9 | rules = rdr_2_replace_matcher(dump) 10 | expected = dedent( 11 | """\ 12 | [pos="DET" & text="དག"] [pos="PART"] 1 = [pos="VERB"] 13 | [pos="PART" & 
text="ས"] [pos="PART"] 1 = [pos="ADP"] 14 | [pos="PUNCT"] [pos="PART" & text="ས་"] 2 = [pos="ADP"] 15 | [pos="PART"] [pos="PART" & text="ས་"] 2 = [pos="ADP"] 16 | [pos="PART" & text="མི"] [pos="PART" & text="ས་"] 2 = [pos="PART"] 17 | [text="བྷ་"] [pos="PART"] 2 = [pos="ADP"] 18 | [pos="PART" & text="ས་"] [text="ལ་"] 1 = [pos="ADP"] 19 | [text="ལ"] [pos="PART" & text="ས་"] [text="ལ་"] 2 = [pos="PART"] 20 | [pos="PART"] [text="སྟེངས་"] 1 = [pos="ADP"] 21 | [pos="PART" & text="ར"] [text="འི་"] 1 = [pos="ADP"] 22 | [pos="VERB"] [text=".*མ"] 1 = [pos="NOUN"] 23 | [pos="VERB"] [text=".*ན"] 1 = [pos="OOV"] 24 | [pos="VERB"] [] [text="བོད་སྐད་"] 1 = [pos="NON_WORD"] 25 | [pos="VERB"] [text="ཡིག་"] 1 = [pos="NON_WORD"] 26 | [pos="VERB"] [] [text=".*ཕྱོགས་"] 1 = [pos="NOUN"] 27 | [pos="VERB"] [pos="NUM"] [pos="NUM"] 1 = [pos="NOUN"] 28 | [text="དཔེར་ན་"] [pos="VERB"] 2 = [pos="NOUN"] 29 | [text="།_"] [pos="VERB"] [text="ལ་སོགས་པ་"] 2 = [pos="NOUN"] 30 | [pos="VERB"] [text=".*སོ"] 1 = [pos="OOV"] 31 | [pos="VERB"] [] [text=".*སྐྱེས་"] 1 = [pos="OOV"] 32 | [pos="VERB"] [pos="NON_WORD"] 1 = [pos="OOV"] 33 | [pos="VERB"] [] [text=".*ཆད་"] 1 = [pos="ADV"] 34 | [pos="NOUN"] [pos="SCONJ"] 2 = [pos="ADP"] 35 | [pos="DET"] [pos="SCONJ"] 2 = [pos="ADP"]""" 36 | ) 37 | assert rules == expected 38 | -------------------------------------------------------------------------------- /tests/test_segmentation_rule_extraction.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import re 3 | 4 | from pybo.segmentation_rule.make_rule import * 5 | from pybo.segmentation_rule.pipeline import * 6 | 7 | @pytest.fixture(scope="module") 8 | def human_data(): 9 | return "སྒྲ་བསྒྱུར་ མར་པ་ ལོ་ཙྪ འི་ རྣམ་པར་ ཐར་པ་ མཐོང་བ་ དོན་ཡོད་ བཞུགས་ སོ ། ། ན་མོ་གུ་རུ་ དེ་ཝ་ཌཱ་ཀི་ནི ། སྔོན་སྦྱངས་ ཐུགས་བསྐྱེད་ སྨོན་ལམ་ དུས་ བབས་ ལྷག་བསམ་ གྲུ་གཟིངས་ ནང་ དུ་ ལུས་སྲོག་ མ་ ཆགས་ འགྲོ་དོན་ སྦྱོར་བ་ མཆོག་ གིས་ རབ་ ཞུགས་ ནས །" 10 | 11 | @pytest.fixture(scope="module") 12 | def source_data(): 13 | return "སྒྲ་བསྒྱུར་མར་པ་ལོ་ཙྪའི་རྣམ་པར་ཐར་པ་མཐོང་བ་དོན་ཡོད་བཞུགས་སོ།།ན་མོ་གུ་རུ་དེ་ཝ་ཌཱ་ཀི་ནི།སྔོན་སྦྱངས་ཐུགས་བསྐྱེད་སྨོན་ལམ་དུས་བབས་ལྷག་བསམ་གྲུ་གཟིངས་ནང་དུ་ལུས་སྲོག་མ་ཆགས་འགྲོ་དོན་སྦྱོར་བ་མཆོག་གིས་རབ་ཞུགས་ནས།" 14 | 15 | 16 | def test_postprocessing_human_data(human_data): 17 | expected_human_data = "སྒྲ་བསྒྱུར་ མར་པ་ ལོ་ཙྪ འི་ རྣམ་པར་ ཐར་པ་ མཐོང་བ་ དོན་ཡོད་ བཞུགས་ སོ །། ན་མོ་གུ་རུ་ དེ་ཝ་ཌཱ་ཀི་ནི ། སྔོན་སྦྱངས་ ཐུགས་བསྐྱེད་ སྨོན་ལམ་ དུས་ བབས་ ལྷག་བསམ་ གྲུ་གཟིངས་ ནང་ དུ་ ལུས་སྲོག་ མ་ ཆགས་ འགྲོ་དོན་ སྦྱོར་བ་ མཆོག་ གིས་ རབ་ ཞུགས་ ནས །" 18 | assert expected_human_data == post_process_human_data(human_data) 19 | 20 | 21 | def test_construct_bilou_tag_line(): 22 | human_toks = ["སྒྲ་བསྒྱུར་", "མར་པ་", "ལོ་ཙྪ", "འི་", "རྣམ་པར་", "ཐར་པ་", "མཐོང་བ་", "དོན་ཡོད་", "བཞུགས་", "སོ", "།།", "ན་མོ་གུ་རུ་", "དེ་ཝ་ཌཱ་ཀི་ནི", "།", "རྣམས་", "ལས་", "དམ་ཆོས་", "ནོར་བུ་", "དགོས་འདོད་", "ཆར་འབབས་", "བླངས་", "ནས་", "ནི", "།།", "གི", "ས་", "བསྐྱོད་", "ཕུ་ལ་ཧ་རི་"] 23 | botok_toks = ["སྒྲ་", "བསྒྱུར་", "མར་པ་", "ལོ་", "ཙྪའི་", "རྣམ་པ", "ར་", "ཐར་པ་", "མཐོང་བ་", "དོན་", "ཡོད་", "བཞུགས་", "སོ", "།།", "ན་མོ་", "གུ་རུ་", "དེ་ཝ་", "ཌཱ་ཀི་", "ནི", "།", "རྣམས་", "ལས་དམ་", "ཆོས་ནོར་", "བུ་", "དགོས་འདོད་", "ཆ", "ར་", "འབབས་", "བླངས་", "ནས་", "ནི", "།།", "གིས་", "བསྐྱོད་", "ཕུ་ལ་ཧ་རི་"] 24 | expected_bilou_line = 'སྒྲ་/B བསྒྱུར་/I མར་པ་/U ལོ་/B ཙྪའི་/S རྣམ་པ/B ར་/I ཐར་པ་/U མཐོང་བ་/U དོན་/B ཡོད་/I བཞུགས་/U སོ/U །།/U ན་མོ་/B གུ་རུ་/I དེ་ཝ་/B ཌཱ་ཀི་/I ནི/I །/U རྣམས་/U ལས་དམ་/S ཆོས་ནོར་/S བུ་/I དགོས་འདོད་/U ཆ/B ར་/I འབབས་/I བླངས་/U ནས་/U ནི/U 
།།/U གིས་/S བསྐྱོད་/U ཕུ་ལ་ཧ་རི་/U ' 25 | assert expected_bilou_line == get_bilou_tag_line(human_toks, botok_toks) 26 | 27 | def test_get_new_word_candidate(): 28 | merge_suggestions = ["སྒྲ་/B བསྒྱུར་/I", "དོན་\B ཡོད་\I"] 29 | human_data = "སྒྲ་བསྒྱུར་ མར་པ་ ལོ་ཙྪ འི་ རྣམ་པར་ ཐར་པ་ མཐོང་བ་ དོན་ཡོད་ བཞུགས་ སོ ། མཐོང་བ་ དོན་ ཡོད་ བཞུགས་ སོ །" 30 | expected_new_words = ["སྒྲ་བསྒྱུར་"] 31 | assert expected_new_words == get_new_word_candidates(merge_suggestions, human_data) 32 | 33 | def test_get_remove_word_candidate(): 34 | split_suggestions = ["སྒྲ་བསྒྱུར་", "དོན་ཡོད་", "མཐོང་བ་"] 35 | human_data = " སྒྲ་ བསྒྱུར་ མར་པ་ ལོ་ཙྪ འི་ རྣམ་པར་ ཐར་པ་ མཐོང་ བ་ དོན་ཡོད་ བཞུགས་ སོ ། མཐོང་ བ་ དོན་ ཡོད་ བཞུགས་ སོ །" 36 | expected_remove_words = ["སྒྲ་བསྒྱུར་", "མཐོང་བ་"] 37 | assert expected_remove_words == get_remove_word_candidates(split_suggestions, human_data) 38 | 39 | def test_false_positive_merge(): 40 | tokens_in_rule = ['[text="ང་"]', '[text="ཁོང་"]', '[text="ཅན་"]', '[text="དུ་"]', '[text="མི་"]'] 41 | index = 2 42 | human_data = "སྒོམ་ བྱེད་ ཀྱིན་ ཡོད་ འདུག་པ ས ། ང་ ཁོང་ ཅན་ དུ་ མི་ འགྲོ ཁྱེད་རང་ ང འི་ ཕྱི་ ལ་ འགྲོ་ ན་ གསེར་ མཉམ་ དུ་ བྱེད །" 43 | assert True == is_false_positive_merge(tokens_in_rule, index, human_data) 44 | 45 | def test_true_positive_merge(): 46 | tokens_in_rule = ['[text="ཁྱོད་"]', '[text="ཁོང་"]', '[text="ཅན་"]', '[text="བཏང་"]', '[text="དགོས་"]'] 47 | index = 2 48 | human_data = "མ་རྒྱུད་ ཀྱི་ བདག་པོ་ གཅིག་ བཞུགས་ ཤིང་ ཡོད་པ ས་ ཁྱོད་ ཁོང་ཅན་ བཏང་ དགོས་ གསུངས །" 49 | assert False == is_false_positive_merge(tokens_in_rule, index, human_data) 50 | 51 | def test_true_positive_split(): 52 | tokens_in_rule = ['[text="ང་"]', '[text="ཁོང་ཅན་"]', '[text="དུ་"]', '[text="མི་"]'] 53 | index = 2 54 | counter_split_suggestion = ' ཁོང་ ཅན་ ' 55 | human_data = "སྒོམ་ བྱེད་ ཀྱིན་ ཡོད་ འདུག་པ ས ། ང་ ཁོང་ ཅན་ དུ་ མི་ འགྲོ ཁྱེད་རང་ ང འི་ ཕྱི་ ལ་ འགྲོ་ ན་ གསེར་ མཉམ་ དུ་ བྱེད །" 56 | assert False == is_false_positive_split(tokens_in_rule, index, counter_split_suggestion, human_data) 57 | 58 | def test_false_positive_split(): 59 | tokens_in_rule = ['[text="ཁྱོད་"]', '[text="ཁོང་ཅན་"]', '[text="བཏང་"]', '[text="དགོས་"]'] 60 | index = 2 61 | counter_split_suggestion = ' ཁོང་ ཅན་ ' 62 | human_data = "མ་རྒྱུད་ ཀྱི་ བདག་པོ་ གཅིག་ བཞུགས་ ཤིང་ ཡོད་པ ས་ ཁྱོད་ ཁོང་ཅན་ བཏང་ དགོས་ གསུངས །" 63 | assert True == is_false_positive_split(tokens_in_rule, index, counter_split_suggestion, human_data) 64 | 65 | def test_invalid_split_rule(): 66 | tokens_info = '[text="སྒྲ་བསྒྱུར་"] [text="མར་པ་"]' 67 | index_info = '2-1' 68 | human_data = "སྒྲ་བསྒྱུར་ མར་པ་ ལོ་ཙྪ འི་ རྣམ་པར་ ཐར་པ་ མཐོང་བ་ དོན་ཡོད་ བཞུགས་ སོ །།" 69 | assert (True,0) == is_invalid_split(tokens_info, index_info, human_data) 70 | 71 | def test_valid_split_rule(): 72 | tokens_info = '[text="སྒྲ་"] [text="བསྒྱུར་"] [text="མཐོང་བ་"]' 73 | index_info = '3-1' 74 | human_data = "སྒྲ་བསྒྱུར་ མར་པ་ ལོ་ཙྪ འི་ རྣམ་པར་ ཐར་པ་ མཐོང་བ་ དོན་ཡོད་ བཞུགས་ སོ །། སྒྲ་ བསྒྱུར་ མཐོང་ བ་ དོན་ ཡོད་ བཞུགས་ སོ" 75 | assert (False,1) == is_invalid_split(tokens_info, index_info, human_data) 76 | 77 | def test_invalid_merge_rule(): 78 | tokens_info = '[text="སྒྲ་བསྒྱུར་"] [text="མར་"] [text="པ་"]' 79 | index_info = '2' 80 | human_data = "སྒྲ་བསྒྱུར་ མར་པ་ ལོ་ཙྪ འི་ རྣམ་པར་ ཐར་པ་ མཐོང་བ་ དོན་ཡོད་ བཞུགས་ སོ །།" 81 | assert True == is_invalid_merge(tokens_info, index_info, human_data) 82 | 83 | def test_valid_merge_rule(): 84 | tokens_info = '[text="ཐར་པ་"] [text="མཐོང་"] [text="བ་"]' 85 | index_info = '2' 86 | human_data = "སྒྲ་བསྒྱུར་ མར་པ་ ལོ་ཙྪ འི་ རྣམ་པར་ ཐར་པ་ མཐོང་བ་ 
དོན་ཡོད་ བཞུགས་ སོ །། སྒྲ་ བསྒྱུར་ མཐོང་ བ་ དོན་ ཡོད་ བཞུགས་ སོ" 87 | assert False == is_invalid_merge(tokens_info, index_info, human_data) 88 | 89 | 90 | if __name__ == "__main__": 91 | # input_path = Path('./tests/corpus1/corpus1.txt') 92 | # input_path = Path('./tests/marpa/marpa.txt') 93 | input_path = Path('./tests/data/drokun_test/drokun_test_hd.txt') 94 | rules = extract_seg_rule(input_path, type='cql') 95 | (input_path.parent / f'{input_path.stem}_rules.txt').write_text(rules, encoding='utf-8') -------------------------------------------------------------------------------- /tests/test_tok.py: -------------------------------------------------------------------------------- 1 | from click.testing import CliRunner 2 | 3 | from pybo.cli import tok 4 | 5 | 6 | def test_tok_dir(): 7 | runner = CliRunner() 8 | runner.invoke(tok, ["tests/resources/shelving/", "--tags", "pl"]) 9 | 10 | def test_tok_file(): 11 | runner = CliRunner() 12 | runner.invoke(tok, ["tests/resources/shelving/test_1.txt", "--tags", "p"]) 13 | -------------------------------------------------------------------------------- /tests/test_untokenize.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pybo.untokenize import * 3 | 4 | def test_untokenize_clean_text(): 5 | tokenized_text = "སྒྲ་བསྒྱུར་ མར་པ་ ལོ་ ཙྪའི་ རྣམ་པར་ ཐར་པ་ མཐོང་བ་ དོན་ཡོད་ བཞུགས་ སོ །། ན་མོ་གུ་རུ་ དེ་ཝ་ཌཱ་ཀི་ནི ། " 6 | tokens = pre_processing(tokenized_text) 7 | detokenized_text = assemble(tokens) 8 | expected_text = "སྒྲ་བསྒྱུར་མར་པ་ལོ་ཙྪའི་རྣམ་པར་ཐར་པ་མཐོང་བ་དོན་ཡོད་བཞུགས་སོ།།ན་མོ་གུ་རུ་དེ་ཝ་ཌཱ་ཀི་ནི།" 9 | assert expected_text == detokenized_text 10 | 11 | def test_untokenize_single_tagged_text(): 12 | tokenized_text = "སྒྲ་བསྒྱུར་/NO_POS མར་པ་/NO_POS ལོ་/NO_POS ཙྪའི་/NO_POS རྣམ་པར་/NO_POS ཐར་པ་/NO_POS མཐོང་བ་/NO_POS དོན་ཡོད་/NO_POS བཞུགས་/NO_POS སོ/NO_POS །།/NO_POS ན་མོ་གུ་རུ་/NO_POS དེ་ཝ་ཌཱ་ཀི་ནི/NO_POS །/NO_POS " 13 | tokens = pre_processing(tokenized_text) 14 | detokenized_text = assemble(tokens) 15 | expected_text = "སྒྲ་བསྒྱུར་མར་པ་ལོ་ཙྪའི་རྣམ་པར་ཐར་པ་མཐོང་བ་དོན་ཡོད་བཞུགས་སོ།།ན་མོ་གུ་རུ་དེ་ཝ་ཌཱ་ཀི་ནི།" 16 | assert expected_text == detokenized_text 17 | 18 | def test_untokenize_multi_tagged_text(): 19 | tokenized_text = "ལས་//// ཞེས་པ་//PART/ཞེས་པ་/ ནི་//PART/ནི་/ ལས་//// བྱེད་པ//VERB/བྱེད་པ་/" 20 | tokens = pre_processing(tokenized_text) 21 | detokenized_text = assemble(tokens) 22 | expected_text = "ལས་ཞེས་པ་ནི་ལས་བྱེད་པ" 23 | assert expected_text == detokenized_text -------------------------------------------------------------------------------- /tests/workflow_test.txt: -------------------------------------------------------------------------------- 1 | # first run of pybo on a folder of files to process 2 | -------------------------------------------------------------------------------- /usage.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from pybo import Text, pyewts 4 | from pybo.cli import prepare_folder 5 | 6 | prepare_folder() 7 | 8 | string = """ཤོག 9 | བཀྲ་ཤིས་""" 10 | t = Text(string) 11 | print(t.tokenize_words_raw_lines) 12 | 13 | converter = pyewts.pyewts() 14 | 15 | uni = "བཀྲ་ཤིས་བདེ་ལེགས།། །།" 16 | wylie = "bkra shis bde legs//_//" 17 | 18 | new_uni = converter.toUnicode(wylie) 19 | new_wylie = converter.toWylie(uni) 20 | 21 | assert uni[:-5] == new_uni[:-3] # double shads are a single char in pyewts 22 | assert wylie == new_wylie 23 | 
--------------------------------------------------------------------------------
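
Note on the tests/hfr2cql fixtures above: cql/cql.txt, cql2hfr.txt and cql/_cql2hfr_cql.txt are related by plain, ordered text substitutions (the short Tibetan tags of cql2hfr.txt), while hfr_result.txt uses the longer labels from the third column of UDPOS-bo.txt. The snippet below is a minimal illustrative sketch of that substitution idea only: it assumes the mapping can be applied as ordered regex replacements, and it is not the implementation of pybo/hfr_cqlr_converter.py, whose actual entry points (per tests/test_hfr_cqlr_converter.py) are cqlr2hfr() and hfr2cqlr().

# Illustrative sketch only -- NOT the code of pybo/hfr_cqlr_converter.py.
# It shows that applying the substitution table of tests/hfr2cql/cql2hfr.txt
# to a rule from tests/hfr2cql/cql/cql.txt reproduces the corresponding rule
# of tests/hfr2cql/cql/_cql2hfr_cql.txt.
import re

# A subset of tests/hfr2cql/cql2hfr.txt, kept as (regex, replacement) pairs.
CQL2HFR_TABLE = [
    (r'"NOUN"', "མཚ"),
    (r'"SCONJ"', "ཏཅ"),
    (r'"ADP"', "ཡཚ"),
    (r"pos=", "གཤིས="),
    (r"&", "༈"),
    (r"\[", "༺"),
    (r"\]", "༻"),
]


def cql_rule_to_hfr(rule: str) -> str:
    """Apply every substitution of the table, in file order."""
    for pattern, replacement in CQL2HFR_TABLE:
        rule = re.sub(pattern, replacement, rule)
    return rule


if __name__ == "__main__":
    cql_rule = '[pos="NOUN"] [pos="SCONJ"] 2 = [pos="ADP"]'  # line 8 of cql.txt
    print(cql_rule_to_hfr(cql_rule))
    # -> ༺གཤིས=མཚ༻ ༺གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཡཚ༻  (line 8 of _cql2hfr_cql.txt)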