├── .github └── workflows │ ├── publish.yaml │ ├── python-publish.yml │ ├── test-coverage.yaml │ └── test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CHANGELOG.md ├── LICENSE ├── README.md ├── _config.yml ├── pybo ├── __init__.py ├── cli.py ├── corpus │ ├── __init__.py │ ├── parse_corrected.py │ └── word_cleanup.py ├── hfr_cqlr_converter.py ├── monlam2wordlist.py ├── pipeline │ ├── __init__.py │ └── pipes.py ├── rdr │ ├── __init__.py │ ├── rdr.py │ └── rdr_2_replace_matcher.py ├── resources │ └── particles.tsv ├── segmentation_rule │ ├── __init__.py │ ├── make_rule.py │ └── pipeline.py ├── third_party │ └── rules.txt ├── untokenize.py └── utils │ ├── __init__.py │ ├── bo_sorted.py │ ├── profile_entries.py │ ├── profile_report.py │ └── regex_batch_apply.py ├── pybo_logo.png ├── requirements-dev.txt ├── requirements.txt ├── setup.cfg ├── setup.py ├── tests ├── 01_raw_text.txt ├── data │ ├── corpus1 │ │ ├── corpus1.txt │ │ ├── corpus1_bilou_rules.txt │ │ ├── corpus1_hd.txt │ │ ├── corpus1_pybo_data.txt │ │ ├── corpus1_rules.txt │ │ └── corpus1_tr_data.txt │ ├── drokun_test │ │ ├── drokun_test.txt │ │ ├── drokun_test_bilou_rules.txt │ │ ├── drokun_test_hd.txt │ │ ├── drokun_test_rules.txt │ │ └── drokun_test_tr_data.txt │ ├── marpa │ │ ├── marpa.txt │ │ ├── marpa_bilou_rules.txt │ │ ├── marpa_hd.txt │ │ ├── marpa_pybo_data.txt │ │ ├── marpa_rules.txt │ │ └── marpa_tr_data.txt │ └── monlam2020 │ │ ├── multi_pos_multi_sense.csv │ │ ├── multi_pos_multi_sense_expected.csv │ │ ├── one_pos_multi_sense.csv │ │ ├── one_pos_multi_sense_expected.csv │ │ ├── one_pos_one_sense.csv │ │ ├── one_pos_one_sense_expected.csv │ │ ├── verbs.csv │ │ └── verbs_expected.csv ├── hfr2cql │ ├── UDPOS-bo.txt │ ├── adjustments.txt │ ├── cql │ │ ├── _cql2hfr_cql.txt │ │ └── cql.txt │ ├── cql2hfr.txt │ ├── cql_result.txt │ └── hfr_result.txt ├── resources │ ├── rdr_rules.txt │ ├── shelving │ │ ├── test_1.txt │ │ ├── test_1_tok │ │ │ └── test_1_tok.txt │ │ └── test_2.txt │ ├── step1_3 │ │ └── input │ │ │ └── test.txt │ └── step2 │ │ ├── cql_rules.txt │ │ ├── manually_corrected.txt │ │ ├── rdr_input.txt │ │ ├── step2 │ │ └── test_rules.txt ├── test_cli.py ├── test_corpus.py ├── test_hfr_cqlr_converter.py ├── test_monlam2word_list.py ├── test_rdr2adjustment.py ├── test_segmentation_rule_extraction.py ├── test_tok.py ├── test_untokenize.py └── workflow_test.txt └── usage.py /.github/workflows/publish.yaml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | test: 10 | runs-on: ubuntu-latest 11 | strategy: 12 | max-parallel: 4 13 | matrix: 14 | python-version: [3.6] 15 | 16 | steps: 17 | - uses: actions/checkout@v1 18 | 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v1 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install -r requirements-dev.txt 28 | pip install -e . 
29 | - name: Run Test 30 | run: | 31 | pytest tests/ 32 | 33 | publish: 34 | 35 | needs: test 36 | runs-on: ubuntu-latest 37 | 38 | steps: 39 | - uses: actions/checkout@v2 40 | with: 41 | fetch-depth: 0 42 | 43 | - name: Python Semantic Release 44 | uses: relekang/python-semantic-release@master 45 | with: 46 | github_token: ${{ secrets.GITHUB_TOKEN }} 47 | pypi_token: ${{ secrets.PYPI_TOKEN }} 48 | -------------------------------------------------------------------------------- /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflows will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | name: Upload Python Package 5 | 6 | on: 7 | release: 8 | types: [created] 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install setuptools wheel twine 25 | - name: Build and publish 26 | env: 27 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 28 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 29 | run: | 30 | python setup.py sdist bdist_wheel 31 | twine upload dist/* 32 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | 2 | name: Test Coverage 3 | 4 | on: 5 | push: 6 | branches: 7 | - '*' 8 | pull_request: 9 | branches: 10 | - '*' 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | max-parallel: 4 18 | matrix: 19 | python-version: [3.6] 20 | 21 | steps: 22 | - uses: actions/checkout@v1 23 | with: 24 | fetch-depth: 1 25 | 26 | - name: Set up Python ${{ matrix.python-version }} 27 | uses: actions/setup-python@v1 28 | with: 29 | python-version: ${{ matrix.python-version }} 30 | 31 | - name: Install dependencies 32 | run: | 33 | python -m pip install --upgrade pip 34 | pip install -r requirements-dev.txt 35 | pip install -e . 36 | 37 | - name: Run Test Coverage 38 | run: | 39 | coverage run -m pytest tests/ 40 | coverage report 41 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: 6 | - '*' 7 | pull_request: 8 | branches: 9 | - '*' 10 | 11 | jobs: 12 | test: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | max-parallel: 4 16 | matrix: 17 | python-version: [3.6] 18 | 19 | steps: 20 | - uses: actions/checkout@v1 21 | 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v1 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install -r requirements-dev.txt 31 | pip install -e . 
32 | - name: Run Test 33 | run: | 34 | pytest -vv 35 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/macos,pydev,python,eclipse,pycharm+all,visualstudio,visualstudiocode 3 | 4 | # User data 5 | usecases/canon_concordancer/input 6 | usecases/canon_concordancer/output 7 | .cache/ 8 | botok.yaml 9 | **/shelving_tok/ 10 | 11 | *.DICT 12 | *.INIT 13 | *.RAW 14 | *.RDR 15 | *.sDict 16 | 17 | ### Eclipse ### 18 | .metadata 19 | bin/ 20 | tmp/ 21 | *.tmp 22 | *.bak 23 | *.swp 24 | *~.nib 25 | local.properties 26 | .settings/ 27 | .loadpath 28 | .recommenders 29 | 30 | # External tool builders 31 | .externalToolBuilders/ 32 | 33 | # Locally stored "Eclipse launch configurations" 34 | *.launch 35 | 36 | # PyDev specific (Python IDE for Eclipse) 37 | *.pydevproject 38 | 39 | # CDT-specific (C/C++ Development Tooling) 40 | .cproject 41 | 42 | # Java annotation processor (APT) 43 | .factorypath 44 | 45 | # PDT-specific (PHP Development Tools) 46 | .buildpath 47 | 48 | # sbteclipse plugin 49 | .target 50 | 51 | # Tern plugin 52 | .tern-project 53 | 54 | # TeXlipse plugin 55 | .texlipse 56 | 57 | # STS (Spring Tool Suite) 58 | .springBeans 59 | 60 | # Code Recommenders 61 | .recommenders/ 62 | 63 | # Scala IDE specific (Scala & Java development for Eclipse) 64 | .cache-main 65 | .scala_dependencies 66 | .worksheet 67 | 68 | ### Eclipse Patch ### 69 | # Eclipse Core 70 | .project 71 | 72 | # JDT-specific (Eclipse Java Development Tools) 73 | .classpath 74 | 75 | ### macOS ### 76 | *.DS_Store 77 | .AppleDouble 78 | .LSOverride 79 | 80 | # Icon must end with two \r 81 | Icon 82 | 83 | # Thumbnails 84 | ._* 85 | 86 | # Files that might appear in the root of a volume 87 | .DocumentRevisions-V100 88 | .fseventsd 89 | .Spotlight-V100 90 | .TemporaryItems 91 | .Trashes 92 | .VolumeIcon.icns 93 | .com.apple.timemachine.donotpresent 94 | 95 | # Directories potentially created on remote AFP share 96 | .AppleDB 97 | .AppleDesktop 98 | Network Trash Folder 99 | Temporary Items 100 | .apdisk 101 | 102 | ### PyCharm+all ### 103 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 104 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 105 | 106 | # User-specific stuff: 107 | .idea/**/workspace.xml 108 | .idea/**/tasks.xml 109 | .idea/dictionaries 110 | 111 | # Sensitive or high-churn files: 112 | .idea/**/dataSources/ 113 | .idea/**/dataSources.ids 114 | .idea/**/dataSources.xml 115 | .idea/**/dataSources.local.xml 116 | .idea/**/sqlDataSources.xml 117 | .idea/**/dynamic.xml 118 | .idea/**/uiDesigner.xml 119 | 120 | # Gradle: 121 | .idea/**/gradle.xml 122 | .idea/**/libraries 123 | 124 | # CMake 125 | cmake-build-debug/ 126 | 127 | # Mongo Explorer plugin: 128 | .idea/**/mongoSettings.xml 129 | 130 | ## File-based project format: 131 | *.iws 132 | 133 | ## Plugin-specific files: 134 | 135 | # IntelliJ 136 | /out/ 137 | 138 | # mpeltonen/sbt-idea plugin 139 | .idea_modules/ 140 | 141 | # JIRA plugin 142 | atlassian-ide-plugin.xml 143 | 144 | # Cursive Clojure plugin 145 | .idea/replstate.xml 146 | 147 | # Ruby plugin and RubyMine 148 | /.rakeTasks 149 | 150 | # Crashlytics plugin (for Android Studio and IntelliJ) 151 | com_crashlytics_export_strings.xml 152 | crashlytics.properties 153 | crashlytics-build.properties 154 | fabric.properties 155 | 156 | ### 
PyCharm+all Patch ### 157 | # Ignores the whole idea folder 158 | # See https://github.com/joeblau/gitignore.io/issues/186 and https://github.com/joeblau/gitignore.io/issues/360 159 | 160 | .idea/ 161 | 162 | ### pydev ### 163 | .pydevproject 164 | 165 | ### Python ### 166 | # Byte-compiled / optimized / DLL files 167 | __pycache__/ 168 | *.py[cod] 169 | *$py.class 170 | 171 | # C extensions 172 | *.so 173 | 174 | # Distribution / packaging 175 | .Python 176 | build/ 177 | develop-eggs/ 178 | dist/ 179 | downloads/ 180 | eggs/ 181 | .eggs/ 182 | lib/ 183 | lib64/ 184 | parts/ 185 | sdist/ 186 | var/ 187 | wheels/ 188 | *.egg-info/ 189 | .installed.cfg 190 | *.egg 191 | 192 | # PyInstaller 193 | # Usually these files are written by a python script from a template 194 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 195 | *.manifest 196 | *.spec 197 | 198 | # Installer logs 199 | pip-log.txt 200 | pip-delete-this-directory.txt 201 | 202 | # Unit test / coverage reports 203 | htmlcov/ 204 | .tox/ 205 | .coverage 206 | .coverage.* 207 | .cache 208 | nosetests.xml 209 | coverage.xml 210 | *.cover 211 | .hypothesis/ 212 | .pytest_cache/ 213 | 214 | # Translations 215 | *.mo 216 | *.pot 217 | 218 | # Django stuff: 219 | *.log 220 | local_settings.py 221 | 222 | # Flask stuff: 223 | instance/ 224 | .webassets-cache 225 | 226 | # Scrapy stuff: 227 | .scrapy 228 | 229 | # Sphinx documentation 230 | docs/_build/ 231 | 232 | # PyBuilder 233 | target/ 234 | 235 | # Jupyter Notebook 236 | .ipynb_checkpoints 237 | 238 | # pyenv 239 | .python-version 240 | 241 | # celery beat schedule file 242 | celerybeat-schedule.* 243 | 244 | # SageMath parsed files 245 | *.sage.py 246 | 247 | # Environments 248 | .env 249 | .venv 250 | env/ 251 | venv/ 252 | ENV/ 253 | env.bak/ 254 | venv.bak/ 255 | 256 | # Spyder project settings 257 | .spyderproject 258 | .spyproject 259 | 260 | # Rope project settings 261 | .ropeproject 262 | 263 | # mkdocs documentation 264 | /site 265 | 266 | # mypy 267 | .mypy_cache/ 268 | 269 | ### VisualStudioCode ### 270 | .vscode 271 | !.vscode/settings.json 272 | !.vscode/tasks.json 273 | !.vscode/launch.json 274 | !.vscode/extensions.json 275 | .history 276 | 277 | ### VisualStudio ### 278 | ## Ignore Visual Studio temporary files, build results, and 279 | ## files generated by popular Visual Studio add-ons. 
280 | ## 281 | ## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore 282 | 283 | # User-specific files 284 | *.suo 285 | *.user 286 | *.userosscache 287 | *.sln.docstates 288 | 289 | # User-specific files (MonoDevelop/Xamarin Studio) 290 | *.userprefs 291 | 292 | # Build results 293 | [Dd]ebug/ 294 | [Dd]ebugPublic/ 295 | [Rr]elease/ 296 | [Rr]eleases/ 297 | x64/ 298 | x86/ 299 | bld/ 300 | [Bb]in/ 301 | [Oo]bj/ 302 | [Ll]og/ 303 | 304 | # Visual Studio 2015 cache/options directory 305 | .vs/ 306 | # Uncomment if you have tasks that create the project's static files in wwwroot 307 | #wwwroot/ 308 | 309 | # MSTest test Results 310 | [Tt]est[Rr]esult*/ 311 | [Bb]uild[Ll]og.* 312 | 313 | # NUNIT 314 | *.VisualState.xml 315 | TestResult.xml 316 | 317 | # Build Results of an ATL Project 318 | [Dd]ebugPS/ 319 | [Rr]eleasePS/ 320 | dlldata.c 321 | 322 | # .NET Core 323 | project.lock.json 324 | project.fragment.lock.json 325 | artifacts/ 326 | **/Properties/launchSettings.json 327 | 328 | *_i.c 329 | *_p.c 330 | *_i.h 331 | *.ilk 332 | *.meta 333 | *.obj 334 | *.pch 335 | *.pdb 336 | *.pgc 337 | *.pgd 338 | *.rsp 339 | *.sbr 340 | *.tlb 341 | *.tli 342 | *.tlh 343 | *.tmp_proj 344 | *.vspscc 345 | *.vssscc 346 | .builds 347 | *.pidb 348 | *.svclog 349 | *.scc 350 | 351 | # Chutzpah Test files 352 | _Chutzpah* 353 | 354 | # Visual C++ cache files 355 | ipch/ 356 | *.aps 357 | *.ncb 358 | *.opendb 359 | *.opensdf 360 | *.sdf 361 | *.cachefile 362 | *.VC.db 363 | *.VC.VC.opendb 364 | 365 | # Visual Studio profiler 366 | *.psess 367 | *.vsp 368 | *.vspx 369 | *.sap 370 | 371 | # TFS 2012 Local Workspace 372 | $tf/ 373 | 374 | # Guidance Automation Toolkit 375 | *.gpState 376 | 377 | # ReSharper is a .NET coding add-in 378 | _ReSharper*/ 379 | *.[Rr]e[Ss]harper 380 | *.DotSettings.user 381 | 382 | # JustCode is a .NET coding add-in 383 | .JustCode 384 | 385 | # TeamCity is a build add-in 386 | _TeamCity* 387 | 388 | # DotCover is a Code Coverage Tool 389 | *.dotCover 390 | 391 | # Visual Studio code coverage results 392 | *.coverage 393 | *.coveragexml 394 | 395 | # NCrunch 396 | _NCrunch_* 397 | .*crunch*.local.xml 398 | nCrunchTemp_* 399 | 400 | # MightyMoose 401 | *.mm.* 402 | AutoTest.Net/ 403 | 404 | # Web workbench (sass) 405 | .sass-cache/ 406 | 407 | # Installshield output folder 408 | [Ee]xpress/ 409 | 410 | # DocProject is a documentation generator add-in 411 | DocProject/buildhelp/ 412 | DocProject/Help/*.HxT 413 | DocProject/Help/*.HxC 414 | DocProject/Help/*.hhc 415 | DocProject/Help/*.hhk 416 | DocProject/Help/*.hhp 417 | DocProject/Help/Html2 418 | DocProject/Help/html 419 | 420 | # Click-Once directory 421 | publish/ 422 | 423 | # Publish Web Output 424 | *.[Pp]ublish.xml 425 | *.azurePubxml 426 | # By default, sensitive information, such as encrypted password 427 | # should be stored in the .pubxml.user file. 428 | *.pubxml 429 | *.pubxml.user 430 | *.publishproj 431 | 432 | # Microsoft Azure Web App publish settings. Comment the next line if you want to 433 | # checkin your Azure Web App publish settings, but sensitive information contained 434 | # in these scripts will be unencrypted 435 | PublishScripts/ 436 | 437 | # NuGet Packages 438 | *.nupkg 439 | # The packages folder can be ignored because of Package Restore 440 | **/packages/* 441 | # except build/, which is used as an MSBuild target. 
442 | !**/packages/build/ 443 | # Uncomment if necessary however generally it will be regenerated when needed 444 | #!**/packages/repositories.config 445 | # NuGet v3's project.json files produces more ignorable files 446 | *.nuget.props 447 | *.nuget.targets 448 | 449 | # Microsoft Azure Build Output 450 | csx/ 451 | *.build.csdef 452 | 453 | # Microsoft Azure Emulator 454 | ecf/ 455 | rcf/ 456 | 457 | # Windows Store app package directories and files 458 | AppPackages/ 459 | BundleArtifacts/ 460 | Package.StoreAssociation.xml 461 | _pkginfo.txt 462 | 463 | # Visual Studio cache files 464 | # files ending in .cache can be ignored 465 | *.[Cc]ache 466 | # but keep track of directories ending in .cache 467 | !*.[Cc]ache/ 468 | 469 | # Others 470 | ClientBin/ 471 | ~$* 472 | *~ 473 | *.dbmdl 474 | *.dbproj.schemaview 475 | *.jfm 476 | *.pfx 477 | *.publishsettings 478 | orleans.codegen.cs 479 | 480 | # Since there are multiple workflows, uncomment next line to ignore bower_components 481 | # (https://github.com/github/gitignore/pull/1529#issuecomment-104372622) 482 | #bower_components/ 483 | 484 | # RIA/Silverlight projects 485 | Generated_Code/ 486 | 487 | # Backup & report files from converting an old project file 488 | # to a newer Visual Studio version. Backup files are not needed, 489 | # because we have git ;-) 490 | _UpgradeReport_Files/ 491 | Backup*/ 492 | UpgradeLog*.XML 493 | UpgradeLog*.htm 494 | 495 | # SQL Server files 496 | *.mdf 497 | *.ldf 498 | *.ndf 499 | 500 | # Business Intelligence projects 501 | *.rdl.data 502 | *.bim.layout 503 | *.bim_*.settings 504 | 505 | # Microsoft Fakes 506 | FakesAssemblies/ 507 | 508 | # GhostDoc plugin setting file 509 | *.GhostDoc.xml 510 | 511 | # Node.js Tools for Visual Studio 512 | .ntvs_analysis.dat 513 | node_modules/ 514 | 515 | # Typescript v1 declaration files 516 | typings/ 517 | 518 | # Visual Studio 6 build log 519 | *.plg 520 | 521 | # Visual Studio 6 workspace options file 522 | *.opt 523 | 524 | # Visual Studio 6 auto-generated workspace file (contains which files were open etc.) 525 | *.vbw 526 | 527 | # Visual Studio LightSwitch build output 528 | **/*.HTMLClient/GeneratedArtifacts 529 | **/*.DesktopClient/GeneratedArtifacts 530 | **/*.DesktopClient/ModelManifest.xml 531 | **/*.Server/GeneratedArtifacts 532 | **/*.Server/ModelManifest.xml 533 | _Pvt_Extensions 534 | 535 | # Paket dependency manager 536 | .paket/paket.exe 537 | paket-files/ 538 | 539 | # FAKE - F# Make 540 | .fake/ 541 | 542 | # JetBrains Rider 543 | *.sln.iml 544 | 545 | # CodeRush 546 | .cr/ 547 | 548 | # Python Tools for Visual Studio (PTVS) 549 | *.pyc 550 | 551 | # Cake - Uncomment if you are using it 552 | # textunits/** 553 | # !textunits/packages.config 554 | 555 | # Telerik's JustMock configuration file 556 | *.jmconfig 557 | 558 | # BizTalk build output 559 | *.btp.cs 560 | *.btm.cs 561 | *.odx.cs 562 | *.xsd.cs 563 | 564 | ### VisualStudio Patch ### 565 | # By default, sensitive information, such as encrypted password 566 | # should be stored in the .pubxml.user file. 
567 | 568 | default.profile 569 | 570 | # pybo 571 | *.pickled 572 | **/pybo.yaml 573 | 574 | # End of https://www.gitignore.io/api/macos,pydev,python,eclipse,pycharm+all,visualstudio,visualstudiocode 575 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 19.3b0 4 | hooks: 5 | - id: black 6 | - repo: https://github.com/PyCQA/flake8 7 | rev: 3.8.3 8 | hooks: 9 | - id: flake8 10 | - repo: https://github.com/timothycrosley/isort 11 | rev: 5.2.2 12 | hooks: 13 | - id: isort -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | 3 | All notable changes to this project will be documented in this file. 4 | 5 | The format is based on [Keep a Changelog](http://keepachangelog.com/) and this project adheres to [Semantic Versioning](http://semver.org/). 6 | 7 | ## [0.7.3](https://github.com/Esukhia/pybo/releases/tag/v0.7.3) - 8 | ### Added 9 | * change pybo to bo in cli 10 | 11 | ## [0.7.2](https://github.com/Esukhia/pybo/releases/tag/v0.7.2) - 20200808 12 | ### Added 13 | * fix dependencies in setup.py 14 | 15 | ## [0.7.1](https://github.com/Esukhia/pybo/releases/tag/v0.7.1) - 20200808 16 | ### Added 17 | * added tibetan_sort as dep, cleanup, kakha cli messages 18 | 19 | ## [0.7.0](https://github.com/Esukhia/pybo/releases/tag/v0.7.0) - 20200807 20 | ### Added 21 | * added kakha using tibetan_sort 22 | 23 | ## [0.6.23](https://github.com/Esukhia/pybo/releases/tag/v0.6.23) - 20200714 24 | ### Added 25 | * syl-based content shelving and reinsertion #3 26 | 27 | ## [0.6.22](https://github.com/Esukhia/pybo/releases/tag/v0.6.22) - 20200710 28 | ### Added 29 | * #5 Add optional "--tags" to tok command to select and order token tags 30 | 31 | ## [0.6.21](https://github.com/Esukhia/pybo/releases/tag/v0.6.21) - 20191215 32 | ### Added 33 | * add profile-update to CLI 34 | ### Changed 35 | * use Token.text_cleaned whenever possible, fallback to Token.text otherwise 36 | * output of `pybo rdr` and `pybo profile-report` 37 | 38 | ## [0.6.20](https://github.com/Esukhia/pybo/releases/tag/v0.6.20) - 20191213 39 | ### Added 40 | * Support for `object.suffixL` in CQL rule creation. 41 | 42 | ## [0.6.19](https://github.com/Esukhia/pybo/releases/tag/v0.6.19) - 20191210 43 | ### Fixed 44 | * import bug fixed 45 | 46 | ## [0.6.18](https://github.com/Esukhia/pybo/releases/tag/v0.6.18) - 20191210 47 | ### Added 48 | * botok profile report: `pybo profile-report ` 49 | Finds out all duplicates over all the folders and files. 
50 | 51 | ## [0.6.17](https://github.com/Esukhia/pybo/releases/tag/v0.6.17) - 20191122 52 | ### Fixed 53 | * bad setup 54 | 55 | ## [0.6.16](https://github.com/Esukhia/pybo/releases/tag/v0.6.16) - 20191122 56 | ### Fixed 57 | * bad imports 58 | 59 | ## [0.6.15](https://github.com/Esukhia/pybo/releases/tag/v0.6.15) - 20191122 60 | ### Fixed 61 | * reference to bo_sorted() not removed 62 | 63 | ## [0.6.14](https://github.com/Esukhia/pybo/releases/tag/v0.6.14) - 20191122 64 | ### Fixed 65 | * piycu for Windows from third-party website 66 | * temporarily remove bo_sorted() + CLI command 67 | * fixed rdr_2_replace_matcher bug on first line of rules 68 | ### Added 69 | * cwd CLI command 70 | 71 | ## [0.6.13](https://github.com/Esukhia/pybo/releases/tag/v0.6.13) - 20191109 72 | ### Fixed 73 | * removed pyicu dependency 74 | 75 | ## [0.6.12](https://github.com/Esukhia/pybo/releases/tag/v0.6.12) - 20191109 76 | ### Added 77 | * added rdr_2_replace_matcher in utils 78 | 79 | ## [0.6.11](https://github.com/Esukhia/pybo/releases/tag/v0.6.11) - 20191030 80 | ### Added 81 | * added bo_sort() and the corresponding kakha CLI option 82 | 83 | ## [0.6.10](https://github.com/Esukhia/pybo/releases/tag/v0.6.10) - 20190901 84 | ### Added 85 | * added pyewts to pybo 86 | 87 | ## [0.6.9](https://github.com/Esukhia/pybo/releases/tag/v0.6.9) - 20190901 88 | ### Added 89 | * the tokenizer's codebase is extracted from pybo and now lives in botok. All the related history is brought out to that project. 90 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # PYBO - Tibetan NLP in Python 4 | [![PyPI version](https://badge.fury.io/py/pybo.svg)](https://badge.fury.io/py/pybo) 5 | ![Test](https://github.com/Esukhia/pybo/workflows/Test/badge.svg) 6 | ![Test Coverage](https://github.com/Esukhia/pybo/workflows/Test%20Coverage/badge.svg) 7 | ![Publish](https://github.com/Esukhia/pybo/workflows/Publish/badge.svg) 8 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://black.readthedocs.io/en/stable/) 9 | 10 | 11 | 12 | ## Overview 13 | 14 | bo tokenizes Tibetan text into words. 15 | 16 | ### Basic usage 17 | 18 | 19 | #### Getting started 20 | Requires to have Python3 installed. 21 | 22 | python3 -m pip install pybo 23 | 24 | #### Tokenizing a string 25 | 26 | ```bash 27 | drupchen@drupchen:~$ bo tok-string "༄༅། །རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྻ་ཨ་བ་ཏ་ར། བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པ། ། 28 | སངས་རྒྱས་དང་བྱང་ཆུབ་སེམས་དཔའ་ཐམས་ཅད་ལ་ཕྱག་འཚལ་ལོ། །བདེ་གཤེགས་ཆོས་ཀྱི་སྐུ་མངའ་སྲས་བཅས་དང༌། །ཕྱག་འོས་ཀུན་ལའང་གུས་པར་ཕྱག་འཚལ་ཏེ། །བདེ་གཤེགས་ 29 | སྲས་ཀྱི་སྡོམ་ལ་འཇུག་པ་ནི། །ལུང་བཞིན་མདོར་བསྡུས་ནས་ནི་བརྗོད་པར་བྱ། །" 30 | Loading Trie... (2s.) 31 | ༄༅།_། རྒྱ་གར་ སྐད་ དུ །_ བོ་ དྷི་ སཏྭ་ ཙརྻ་ ཨ་བ་ ཏ་ ར །_ བོད་སྐད་ དུ །_ བྱང་ཆུབ་ སེམས་དཔ འི་ སྤྱོད་པ་ ལ་ འཇུག་པ །_། སངས་རྒྱས་ དང་ བྱང་ཆུབ་ 32 | སེམས་དཔའ་ ཐམས་ཅད་ ལ་ ཕྱག་ འཚལ་ ལོ །_། བདེ་གཤེགས་ ཆོས་ ཀྱི་ སྐུ་ མངའ་ སྲས་ བཅས་ དང༌ །_། ཕྱག་འོས་ ཀུན་ ལ འང་ གུས་པ ར་ ཕྱག་ འཚལ་ 33 | ཏེ །_། བདེ་གཤེགས་ སྲས་ ཀྱི་ སྡོམ་ ལ་ འཇུག་པ་ ནི །_། ལུང་ བཞིན་ མདོར་བསྡུས་ ནས་ ནི་ བརྗོད་པ ར་ བྱ །_། 34 | ``` 35 | 36 | #### Tokenizing a list of files 37 | 38 | The command to tokenize a list of files in a directory: 39 | ``` 40 | bo tok 41 | ``` 42 | 43 | For example to tokenize the file `text.txt` in a directory `./document/` with the following content: 44 | ``` 45 | བཀྲ་ཤི་ས་བདེ་ལེགས་ཕུན་སུམ་ཚོགས། །རྟག་ཏུ་བདེ་བ་ཐོབ་པར་ཤོག། ། 46 | ``` 47 | 48 | I use the command: 49 | ``` 50 | $ bo tok ./document/ 51 | ``` 52 | 53 | ...which create a file `text.txt` in a directory `./document_pybo` containing: 54 | ``` 55 | བཀྲ་ ཤི་ ས་ བདེ་ལེགས་ ཕུན་སུམ་ ཚོགས །_། རྟག་ ཏུ་ བདེ་བ་ ཐོབ་པ ར་ ཤོག །_། 56 | ``` 57 | 58 | ### Sorting Tibetan words 59 | ```bash 60 | $ bo kakha to-sort.txt 61 | ``` 62 | The expected input is one word or entry per line in a .txt file. The file will be overwritten. 63 | 64 | ### FNR - Find and Replace with a list of regexes 65 | 66 | ``` 67 | bo fnr -o -t 68 | ``` 69 | `-o` and `-t` are optional 70 | 71 | Text files should be UTF-8 plain text files. The regexes should be in the following format: 72 | 73 | ``` 74 | - 75 | ``` 76 | 77 | ## Acknowledgements 78 | 79 | - **pybo** is an open source library for Tibetan NLP. 80 | 81 | We are always open to cooperation in introducing new features, tool integrations and testing solutions. 82 | 83 | Many thanks to the companies and organizations who have supported pybo's development, especially: 84 | 85 | * [Khyentse Foundation](https://khyentsefoundation.org) for contributing USD22,000 to kickstart the project 86 | * The [Barom/Esukhia canon project](http://www.barom.org) for sponsoring training data curation 87 | * [BDRC](https://tbrc.org) for contributing 2 staff for 6 months for data curation 88 | 89 | - `third_party/rules.txt` is taken from [tibetan-collation](https://github.com/eroux/tibetan-collation/blob/master/implementations/Unicode/rules.txt). 
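## Using pybo from Python

The CLI commands shown above are thin wrappers around botok and the pipeline functions in `pybo/pipeline/pipes.py`. The snippet below is a minimal, illustrative sketch (not part of the original README) of calling the tokenizer directly from Python; it mirrors the calls made by the `tok-string` and `tok` commands in `pybo/cli.py`, and the file names used here are placeholders.

```python
# Illustrative sketch only: mirrors the `tok-string` and `tok` commands in pybo/cli.py.
from pathlib import Path

from botok import Config, Text, WordTokenizer
from pybo.pipeline.pipes import pybo_form, pybo_mod, pybo_prep

# String-level tokenization, as done by `bo tok-string`:
t = Text("བཀྲ་ཤི་ས་བདེ་ལེགས་ཕུན་སུམ་ཚོགས།")
print(t.tokenize_words_raw_lines)

# File-level tokenization with the pybo pipeline, as done by `bo tok`:
config = Config()  # default botok configuration
wt = WordTokenizer(config=config, build_trie=False)

def pybo_tok(in_str):
    return wt.tokenize(in_str)

# "in.txt" and "in_tok.txt" are placeholder file names.
text = Text(Path("in.txt"), Path("in_tok.txt"))
text.custom_pipeline(pybo_prep, pybo_tok, pybo_mod, pybo_form)
```

`pybo_prep`, `pybo_tok`, `pybo_mod` and `pybo_form` are the same preparation, tokenization, modification and formatting steps the CLI chains around botok's tokenizer.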
90 | 91 | ## Contributing 92 | First clone this repo, create a virtual environment and activate it, then install the dependencies: 93 | ```bash 94 | $ pip install -e . 95 | $ pip install -r requirements-dev.txt 96 | ``` 97 | 98 | Next, set up [pre-commit](https://pre-commit.com/) by creating the pre-commit git hook: 99 | ```bash 100 | $ pre-commit install 101 | ``` 102 | Please follow the [Angular commit message format](https://github.com/angular/angular/blob/master/CONTRIBUTING.md#-commit-message-format) for commit messages. We have set up [python-semantic-release](https://github.com/relekang/python-semantic-release) to publish the [pybo](https://pypi.org/project/pybo/) package automatically based on commit messages. 103 | 104 | That's all. Enjoy contributing 🎉🎉🎉 105 | 106 | ## License 107 | 108 | The Python code is Copyright (C) 2019 Esukhia, provided under [Apache 2](LICENSE). 109 | 110 | Contributors: 111 | * [Drupchen](https://github.com/drupchen) 112 | * [Élie Roux](https://github.com/eroux) 113 | * [Ngawang Trinley](https://github.com/ngawangtrinley) 114 | * [Tenzin](https://github.com/10zinten) 115 | * Joyce Mackzenzie for reworking the logo 116 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-architect -------------------------------------------------------------------------------- /pybo/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | import pyewts 4 | from botok import * 5 | 6 | from .corpus.parse_corrected import extract_new_entries, parse_corrected 7 | from .pipeline.pipes import pybo_form, pybo_mod, pybo_prep 8 | from .utils.profile_report import profile_report 9 | from .utils.regex_batch_apply import batch_apply_regex, get_regex_pairs 10 | 11 | __version__ = "0.8.0" 12 | -------------------------------------------------------------------------------- /pybo/cli.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | from shutil import rmtree 4 | 5 | import click 6 | from bordr import __version__ as bordr__version 7 | from botok import Config, Text, WordTokenizer 8 | from botok import __version__ as botok__version__ 9 | from botok import expose_data 10 | from pyewts import VERSION as pyewts__version__ 11 | from tibetan_sort import TibetanSort 12 | from tibetan_sort import __version__ as tibetan_sort__version__ 13 | 14 | from pybo import __version__ as pybo__version__ 15 | from pybo.corpus.parse_corrected import extract_new_entries 16 | from pybo.pipeline.pipes import pybo_form, pybo_mod, pybo_prep 17 | from pybo.rdr.rdr import rdr as r 18 | from pybo.rdr.rdr_2_replace_matcher import rdr_2_replace_matcher 19 | from pybo.utils.profile_report import profile_report as p_report 20 | from pybo.utils.regex_batch_apply import batch_apply_regex, get_regex_pairs 21 | from pybo.hfr_cqlr_converter import cqlr2hfr, hfr2cqlr 22 | from pybo.segmentation_rule.pipeline import extract_seg_rule 23 | 24 | HOME = Path.home() 25 | DIALECT_PACK_DIR = HOME / "Documents" / "pybo" / "dialect_packs" 26 | DEFAULT_DPACK = "general" 27 | CONFIG_DIR = HOME / ".pybo" 28 | CONFIG_FILE = CONFIG_DIR / "config.json" 29 | 30 | 31 | @click.group() 32 | @click.version_option(pybo__version__) 33 | def cli(): 34 | pass 35 | 36 | 37 | @cli.command() 38 | def info(): 39 | click.echo("pybo install path: " + 
str(Path(__file__).parent.resolve())) 40 | click.echo("pybo: " + pybo__version__) 41 | click.echo("botok: " + botok__version__) 42 | click.echo("pyewts: " + pyewts__version__) 43 | click.echo("bordr: " + bordr__version) 44 | click.echo("tibetan_sort: " + tibetan_sort__version__) 45 | 46 | 47 | def prepare_folder(main=None, custom=None, overwrite=False): 48 | profile = "POS" 49 | # 1. MAIN PROFILE 50 | if not main: 51 | # for better Windows support: 52 | # https://stackoverflow.com/questions/6227590/finding-the-users-my-documents-path/6227623#6227623 53 | main = Path.home() / "Documents/pybo/main" 54 | else: 55 | main = Path(main) 56 | main.mkdir(parents=True, exist_ok=True) 57 | 58 | if overwrite: 59 | rmtree(main, ignore_errors=True) 60 | main.mkdir() 61 | 62 | try: 63 | expose_data(main, profile=profile) 64 | except IOError: 65 | click.echo('using the existing data in "Documents/pybo/main/"') 66 | 67 | # 2. CUSTOM PROFILE 68 | if not custom: 69 | custom = Path.home() / "Documents/pybo/custom" 70 | else: 71 | custom = Path(custom) 72 | custom.mkdir(exist_ok=True) 73 | for dir in ["adjustment", "remove", "words", "words_skrt"]: 74 | Path(custom / dir).mkdir(exist_ok=True) 75 | 76 | return main, custom 77 | 78 | 79 | def save_config(dialect_pack_path): 80 | config = {"dialect_pack_path": str(dialect_pack_path)} 81 | if not CONFIG_DIR.is_dir(): 82 | CONFIG_DIR.mkdir(parents=True) 83 | json.dump(config, CONFIG_FILE.open("w")) 84 | 85 | 86 | def load_config(): 87 | if not CONFIG_FILE.is_file(): 88 | return 89 | else: 90 | config = json.load(CONFIG_FILE.open()) 91 | return config 92 | 93 | 94 | # Tokenize file 95 | @cli.command() 96 | @click.argument("input-dir", type=click.Path(exists=True)) 97 | @click.option( 98 | "-t", 99 | "--tags", 100 | help="""Select and order the tags. Available tags are: 101 | t-clean_text, p-pos, l-lemma, s-sense.\n 102 | Usage: `-t tpl` will give for every token `///` 103 | and will give just `` if tag option is not specified.""", 104 | ) 105 | @click.option( 106 | "-o", type=click.Path(exists=True), help="output dir, default is the input_dir" 107 | ) 108 | @click.option("-d", "--dialect-name", type=str, help="official dialect pack name.") 109 | @click.option( 110 | "-p", 111 | "--dialect-path", 112 | type=click.Path(exists=True), 113 | help="path to the dialect pack", 114 | ) 115 | @click.option("-w", "--overwrite", is_flag=True) 116 | @click.option("-r", "--rebuild-trie", is_flag=True) 117 | def tok(**kwargs): 118 | input_dir = Path(kwargs["input_dir"]) 119 | dialect_name = kwargs["dialect_name"] 120 | dialect_path = kwargs["dialect_path"] 121 | # overwrite = kwargs["overwrite"] 122 | rebuild = kwargs["rebuild_trie"] 123 | 124 | # load botok config 125 | if dialect_name: 126 | config = Config(dialect_name=dialect_name) 127 | save_config(config.dialect_pack_path) 128 | elif dialect_path: 129 | config = Config.from_path(dialect_path) 130 | # config.dialect_pack_path = Path(dialect_pack_path) 131 | save_config(config.dialect_pack_path) 132 | else: 133 | pybo_config = load_config() 134 | if not pybo_config: 135 | config = Config() 136 | save_config(config.dialect_pack_path) 137 | else: 138 | dialect_pack_path = pybo_config["dialect_pack_path"] 139 | config = Config.from_path(dialect_pack_path) 140 | 141 | print( 142 | f"[INFO] Using `{config.dialect_pack_path.name}` dialect pack for tokenization ..." 
143 | ) 144 | 145 | wt = WordTokenizer(config=config, build_trie=rebuild) 146 | 147 | def pybo_tok(in_str): 148 | return wt.tokenize(in_str) 149 | 150 | # Select and Order the tags 151 | if kwargs["tags"]: 152 | pybo_mod.__defaults__ = (list(kwargs["tags"]),) 153 | 154 | if input_dir.is_dir(): 155 | if kwargs["o"] is not None: 156 | output_dir = Path(kwargs["o"]) 157 | else: 158 | output_dir = input_dir.parent / (input_dir.name + "_tok") 159 | output_dir.mkdir(exist_ok=True) 160 | for f in input_dir.glob("*.txt"): 161 | out_file = output_dir / (f.stem + "_tok.txt") 162 | text = Text(f, out_file) 163 | text.custom_pipeline(pybo_prep, pybo_tok, pybo_mod, pybo_form) 164 | elif input_dir.is_file(): 165 | input_file = input_dir 166 | if kwargs["o"] is not None: 167 | output_dir = Path(kwargs["o"]) 168 | else: 169 | output_dir = input_file.parent / (input_file.stem + "_tok") 170 | output_dir.mkdir(exist_ok=True) 171 | out_file = output_dir / (input_file.stem + "_tok.txt") 172 | text = Text(input_file, out_file) 173 | text.custom_pipeline(pybo_prep, pybo_tok, pybo_mod, pybo_form) 174 | else: 175 | print("[INFO] Invalid input directory or file!!!") 176 | 177 | 178 | # Tokenize string 179 | @cli.command() 180 | @click.argument("string") 181 | def tok_string(**kwargs): 182 | t = Text(kwargs["string"]) 183 | click.echo(t.tokenize_words_raw_lines) 184 | 185 | 186 | # lists 187 | tag_types = ["pos", "lemma", "sense"] 188 | 189 | 190 | @cli.command() 191 | @click.argument("input-dir", type=click.Path(exists=True)) 192 | @click.option("-t", "--type") 193 | def lists(**kwargs): 194 | path = Path(kwargs["path"]) 195 | 196 | text_string = "" 197 | for f in path.glob("*.txt"): 198 | text_string += f.read_text(encoding="utf-8-sig") 199 | 200 | 201 | # create report for botok profiles 202 | @cli.command() 203 | @click.argument("profile", type=click.Path(exists=True)) 204 | def profile_report(**kwargs): 205 | p_report(kwargs["profile"]) 206 | 207 | 208 | # rdr_2_replace_matcher 209 | @cli.command() 210 | @click.argument("infile", type=click.Path(exists=True)) 211 | def rdr2repl(**kwargs): 212 | infile = Path(kwargs["infile"]) 213 | outfile = infile.parent / (infile.stem + ".yaml") 214 | dump = infile.read_text(encoding="utf-8-sig") 215 | processed = rdr_2_replace_matcher(dump) 216 | outfile.write_text(processed, encoding="utf-8-sig") 217 | 218 | 219 | # sort in the Tibetan order 220 | @cli.command() 221 | @click.argument("infile", type=click.Path(exists=True)) 222 | def kakha(**kwargs): 223 | sort = TibetanSort() 224 | infile = Path(kwargs["infile"]) 225 | words = infile.read_text(encoding="utf-8-sig").split() 226 | print(f"Sorting {infile.name}") 227 | words = sort.sort_list(words) 228 | print(f"{infile.name} is sorted") 229 | infile.write_text("\n".join(words), encoding="utf-8-sig") 230 | 231 | 232 | # generate rdr rules 233 | @cli.command() 234 | @click.argument("input", type=click.Path(exists=True)) 235 | @click.option("-dp", type=str, help="Dialect pack name, default is general") 236 | @click.option("-k", "--keep", type=str) 237 | @click.option('--type', type=str, help="Type can be either cql which is default type or hfr(Human friendly rule)") 238 | def extract_rules(**kwargs): 239 | file_or_dir = Path(kwargs["input"]) 240 | dialect_pack_name = kwargs["dp"] if kwargs["dp"] else DEFAULT_DPACK 241 | keep = "none" if kwargs["keep"] is None else kwargs["keep"] 242 | type = "cql" if kwargs["type"] is None else kwargs["type"] 243 | if type == "cql": 244 | out_dir = DIALECT_PACK_DIR / dialect_pack_name / 
"adjustments" / "rules" 245 | else: 246 | out_dir = DIALECT_PACK_DIR / dialect_pack_name / "hfr_rules" 247 | out_dir.mkdir(exist_ok=True) 248 | 249 | log = None 250 | click.echo("[INFO] Extracing adjustments rules ...") 251 | if file_or_dir.is_dir(): 252 | file = file_or_dir / file_or_dir.name 253 | with open(file, encoding="utf-8-sig", mode="w") as tmp: 254 | for f in file_or_dir.glob("*.txt"): 255 | tmp.write(f.read_text(encoding="utf-8-sig") + " ") 256 | log = r(file, outdir=out_dir, keep=keep, type=type) 257 | file.unlink() 258 | elif file_or_dir.is_file(): 259 | log = r(file_or_dir, out_dir, keep=keep, type=type) 260 | click.echo(f"[INFO] {file_or_dir} does not exist!") 261 | 262 | click.echo(log) 263 | click.echo("[INFO] Completed !") 264 | click.echo(f"[INFO] Added adjustments rules to {dialect_pack_name}") 265 | 266 | # generate rdr rules 267 | @cli.command() 268 | @click.argument("input", type=click.Path(exists=True)) 269 | @click.option("-dp", type=str, help="Dialect pack name, default is general") 270 | @click.option('--type', type=str, help="Type can be either cql which is default type or hfr(Human friendly rule)") 271 | @click.option("--e", type=int) 272 | def extract_seg_rules(**kwargs): 273 | rules = '' 274 | input_path = Path(kwargs["input"]) 275 | dialect_pack_name = kwargs["dp"] if kwargs["dp"] else DEFAULT_DPACK 276 | type = "cql" if kwargs["type"] is None else kwargs["type"] 277 | epochs = 3 if kwargs['e'] is None else kwargs['e'] 278 | if type == "cql": 279 | out_dir = DIALECT_PACK_DIR / dialect_pack_name / "adjustments" / "rules" 280 | else: 281 | out_dir = DIALECT_PACK_DIR / dialect_pack_name / "hfr_rules" 282 | out_dir.mkdir(exist_ok=True) 283 | 284 | click.echo("[INFO] Extracing adjustments rules ...") 285 | 286 | if input_path.is_dir(): 287 | print('[ERROR] Invalid file name!!') 288 | elif input_path.is_file(): 289 | rules += extract_seg_rule(input_path, dialect_pack_name, type, epochs) 290 | if rules: 291 | (out_dir / f'{input_path.stem}_rules.tsv').write_text(rules, encoding='utf-8') 292 | else: 293 | print('[INFO] No rules found') 294 | 295 | click.echo("[INFO] Completed !") 296 | click.echo(f"[INFO] Added adjustments rules to {dialect_pack_name}") 297 | 298 | #convert cql to hfr 299 | @cli.command() 300 | @click.argument("input", type=click.Path(exists=True)) 301 | @click.option("-dp", type=str, help="Dialect pack name, default is general") 302 | def convert_cql2hfr(**kwargs): 303 | cql_path = Path(kwargs['input']) 304 | dialect_pack_name = kwargs["dp"] if kwargs["dp"] else DEFAULT_DPACK 305 | hfr_dir = DIALECT_PACK_DIR / dialect_pack_name / "hfr_rules" 306 | hfr_dir.mkdir(exist_ok=True) 307 | hfr_file_path = hfr_dir / (cql_path.stem + ".tsv") 308 | cql_rules = cql_path.read_text(encoding='utf-8') 309 | hfr = cqlr2hfr(cql_rules) 310 | hfr_file_path.write_text(hfr, encoding='utf-8') 311 | 312 | #convert hfr to cql 313 | @cli.command() 314 | @click.argument("input", type=click.Path(exists=True)) 315 | @click.option("-dp", type=str, help="Dialect pack name, default is general") 316 | def convert_hfr2cql(**kwargs): 317 | hfr_path = Path(kwargs['input']) 318 | dialect_pack_name = kwargs["dp"] if kwargs["dp"] else DEFAULT_DPACK 319 | cql_dir = DIALECT_PACK_DIR / dialect_pack_name / "adjustments" / "rules" 320 | cql_dir.mkdir(exist_ok = True) 321 | cql_file_path = cql_dir / (hfr_path.stem + ".tsv") 322 | hfr = hfr_path.read_text(encoding='utf-8') 323 | cql = hfr2cqlr(hfr) 324 | cql_file_path.write_text(cql, encoding='utf-8') 325 | 326 | 327 | # extract new entries 
from manually corrected texts + existing profile 328 | @cli.command() 329 | @click.argument("corrected-path", type=click.Path(exists=True)) 330 | @click.argument("dialect_path", type=click.Path(exists=True)) 331 | @click.option("-o", "--out-dir", type=click.Path(exists=True)) 332 | def profile_update(**kwargs): 333 | corrected = Path(kwargs["corrected_path"]) 334 | dialect_path = Path(kwargs["dialect_path"]) 335 | out_dir = Path(kwargs["out_dir"]) if kwargs["out_dir"] else None 336 | 337 | dump = "" 338 | for f in corrected.glob("*.txt"): 339 | dump += f.read_text(encoding="utf-8-sig") + "\n" 340 | 341 | rules = extract_new_entries(dump, dialect_path) 342 | if not out_dir: 343 | out = corrected.parent / (corrected.name + "_words.tsv") 344 | else: 345 | out = out_dir / (corrected.name + "_words.tsv") 346 | 347 | if not out.parent.is_dir(): 348 | out.parent.mkdir(exist_ok=True) 349 | 350 | out.write_text(rules, encoding="utf-8-sig") 351 | 352 | 353 | # FNR - Find and Replace with a list of regexes 354 | @cli.command() 355 | @click.argument("in-dir", type=click.Path(exists=True)) 356 | @click.argument("regex-file", type=click.Path(exists=True)) 357 | @click.option("-o", "--out-dir", type=click.Path(exists=True)) 358 | @click.option("-t", "--tag") 359 | def fnr(**kwargs): 360 | # get the args 361 | indir = Path(kwargs["in_dir"]) 362 | regex_file = Path(kwargs["regex_file"]) 363 | out_dir = Path(kwargs["out_dir"]) if kwargs["out_dir"] else None 364 | 365 | if not indir.is_dir(): 366 | click.echo("IN_DIR should be a folder, not a file.\nexiting...") 367 | exit(1) 368 | 369 | # optional out file tag 370 | tag = kwargs["tag"] if kwargs["tag"] else regex_file.stem 371 | 372 | # generate rules 373 | rules = get_regex_pairs(regex_file.open(encoding="utf-8-sig").readlines()) 374 | 375 | # apply on each file, prefixing each one with the regex filename 376 | for f in indir.rglob("*.txt"): 377 | if not f.stem.startswith("_"): 378 | string = f.read_text(encoding="utf-8-sig") 379 | out = batch_apply_regex(string, rules) 380 | name = f"_{tag}_" + f.name 381 | if out_dir: 382 | Path(out_dir).mkdir(parents=True, exist_ok=True) 383 | outfile = out_dir / name 384 | else: 385 | outfile = f.parent / name 386 | outfile.write_text(out, encoding="utf-8-sig") 387 | 388 | 389 | if __name__ == "__main__": 390 | # cli() 391 | save_config("test_path") 392 | -------------------------------------------------------------------------------- /pybo/corpus/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/pybo/c65ee83a0659f721bccdf48db4901360e7d97048/pybo/corpus/__init__.py -------------------------------------------------------------------------------- /pybo/corpus/parse_corrected.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import re 3 | 4 | from tibetan_sort import TibetanSort 5 | 6 | from ..utils.profile_entries import profile_entries 7 | from .word_cleanup import word_cleanup 8 | 9 | sort = TibetanSort() 10 | 11 | 12 | def parse_corrected(in_str): 13 | # prepare string: replace returns and tabs and multiple spaces by a single space 14 | in_str = in_str.replace("\n", " ").replace("\t", " ") 15 | in_str = re.sub(r"\s+", " ", in_str) 16 | 17 | # parse 18 | sep_field = "/" 19 | parsed = [] 20 | for token in in_str.split(): 21 | fields = ["", "", "", "", ""] 22 | for num, f in enumerate(token.split(sep_field)): 23 | # cleanup the form and the lemma 24 | if (num == 0 or num == 2) 
and f: 25 | f = word_cleanup(f) 26 | fields[num] = f 27 | parsed.append(fields) 28 | return parsed 29 | 30 | 31 | def extract_new_entries(in_str, profile_path): 32 | entries = profile_entries(profile_path) 33 | 34 | # parse input 35 | parsed = parse_corrected(in_str) 36 | 37 | # generate content, without duplicates 38 | entry_data = [] 39 | for p in parsed: 40 | word = p[0] 41 | e_d = "\t".join(p) 42 | if (word not in entries or e_d not in entries[word]) and e_d not in entry_data: 43 | entry_data.append(e_d) 44 | 45 | # sort both lists 46 | # words = sort.sort_list(words) 47 | entry_data = sort.sort_list(entry_data) 48 | entry_data = ["# form pos lemma sense freq"] + entry_data 49 | 50 | return "\n".join(entry_data) 51 | -------------------------------------------------------------------------------- /pybo/corpus/word_cleanup.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from botok import NAMCHE, TSEK, TokChunks 3 | 4 | 5 | def word_cleanup(string): 6 | """If it is Tibetan text, returns the cleaned up syllables, otherwise the original string""" 7 | 8 | def join_syls(syls): 9 | return "".join([syl if syl.endswith(NAMCHE) else syl + TSEK for syl in syls]) 10 | 11 | syls = TokChunks(string).get_syls() 12 | if syls: 13 | return join_syls(syls) 14 | else: 15 | return string 16 | -------------------------------------------------------------------------------- /pybo/hfr_cqlr_converter.py: -------------------------------------------------------------------------------- 1 | import re 2 | from pathlib import Path 3 | 4 | cql2hfr_tag = { 5 | '"ADJ"': "རྒྱན", 6 | '"ADP"': "སྦྱོར", 7 | '"ADV"': "བསྣན", 8 | '"AUX"': "གྲོགས", 9 | '"CCONJ"': "སྦྲེལ", 10 | '"DET"': "ངེས", 11 | '"INTJ"': "འབོད", 12 | '"NOUN"': "མིང", 13 | '"NUM"': "གྲངས", 14 | '"PRON"': "ཚབ", 15 | '"PROPN"': "ཁྱད", 16 | '"PUNCT"': "ཚེག", 17 | '"SCONJ"': "ལྟོས", 18 | '"VERB"': "བྱ", 19 | '"PART"': "རོགས", 20 | "pos=": "གཤིས=", 21 | "lemma=": "མ=", 22 | "sense=": "དོན=", 23 | "&": "༈", 24 | "[": "༺", 25 | "]": "༻", 26 | } 27 | 28 | 29 | def cqlr2hfr(cqlr_string): 30 | """Convert corpus queery language(cql) rules to human friendly rules(hfr) which has UDPOS in Tibetan. 31 | 32 | Args: 33 | cql_string (str): corpus queery language rules 34 | 35 | Returns: 36 | str: human friendly rules(in Tibetan language) 37 | """ 38 | hfr_string = cqlr_string 39 | for cql_tag, hfr_tag in cql2hfr_tag.items(): 40 | hfr_string = hfr_string.replace(cql_tag, hfr_tag) 41 | return hfr_string 42 | 43 | 44 | def hfr2cqlr(hfr_string): 45 | """Convert human friendly rules(hfr) to corpus queery language rules format. 46 | 47 | Args: 48 | hfr_string (str): Human friendly rules(hfr) 49 | 50 | Returns: 51 | str: Corpus queery language(cql) rules format. 
52 | """ 53 | cql_string = hfr_string 54 | for cql_tag, hfr_tag in cql2hfr_tag.items(): 55 | cql_string = cql_string.replace(hfr_tag, cql_tag) 56 | return cql_string 57 | -------------------------------------------------------------------------------- /pybo/monlam2wordlist.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import re 3 | 4 | from botok import Text 5 | 6 | ID = -1 7 | 8 | POS_NAMES = (" མིང་ཚིག ", " བྱ་ཚིག ", " བྱེད་ཚིག ", " གྲོགས་ཚིག ") 9 | 10 | 11 | class cols: 12 | ID = "ID" 13 | FORM = "Form" 14 | LEMMA = "Lemma" 15 | MON_POS = "MonPOS" 16 | MON_FEATURE = "MonFeature" 17 | MON_TAG = "MonTag" 18 | POS = "POS" 19 | FEATURE = "Feature" 20 | MORPH = "Morph" 21 | SENSE_TAG = "SenseTag" 22 | DEFINITION = "Definition" 23 | EXAMPLE = "Example" 24 | 25 | 26 | def create_word( 27 | form, 28 | lemma, 29 | mon_pos=None, 30 | mon_feature=None, 31 | mon_tag=None, 32 | pos=None, 33 | feature=None, 34 | morph=None, 35 | sense_tag=None, 36 | definition=None, 37 | example=None, 38 | ): 39 | global ID 40 | ID += 1 41 | return { 42 | "ID": ID, 43 | "Form": form, 44 | "Lemma": lemma, 45 | "MonPOS": mon_pos, 46 | "MonFeature": mon_feature, 47 | "MonTag": mon_tag, 48 | "POS": pos, 49 | "Feature": feature, 50 | "Morph": feature, 51 | "SenseTag": sense_tag, 52 | "Definition": definition, 53 | "Example": example, 54 | } 55 | 56 | 57 | def csv_loader(path): 58 | with open(path, "r") as csv_file: 59 | reader = csv.reader(csv_file) 60 | for i, row in enumerate(reader): 61 | if i == 0: 62 | continue 63 | yield row 64 | 65 | 66 | def get_single_pos(chunk_containing_pos): 67 | """Return only first pos and it's content.""" 68 | pos_char_end_idx = chunk_containing_pos.find(" ") 69 | pos = chunk_containing_pos[:pos_char_end_idx] 70 | pos_content = chunk_containing_pos[(pos_char_end_idx + 1) :] 71 | return pos, pos_content 72 | 73 | 74 | def find_all_remaining_pos(chunk): 75 | """Return all pos position and it's length. 76 | 77 | Return: 78 | pos_start_idxs (list): [(pos_start_idx, len(pos_name)), ...] sorted on pos_start_idx. 79 | 80 | """ 81 | pos_start_idxs = [] 82 | pos_found = False 83 | for pos_name in POS_NAMES: 84 | pos_found = True 85 | pos_start_idx = chunk.find(pos_name) 86 | if pos_start_idx != -1: 87 | pos_start_idxs.append((pos_start_idx, len(pos_name))) 88 | if pos_found: 89 | pos_start_idxs.append((len(chunk), 0)) 90 | return sorted(pos_start_idxs, key=lambda x: x[0]) 91 | 92 | 93 | def get_pos_list(text): 94 | """Parse pos and it's content (mon_tags, definitions) in string. 95 | 96 | Returns: 97 | post_list (list): [(pos, pos_content), ...] 98 | """ 99 | 100 | pos_list = [] 101 | chunks_containing_pos = text.split(" 1. 
") 102 | estimated_n_pos = len(chunks_containing_pos) 103 | if estimated_n_pos == 1: # one_pos_one_sense 104 | chunk_containing_pos = chunks_containing_pos[0].strip() 105 | pos, pos_content = get_single_pos(chunk_containing_pos) 106 | pos_list.append((pos, pos_content)) 107 | elif estimated_n_pos == 2: # one_pos_multi_senses 108 | pos, pos_content = chunks_containing_pos 109 | pos_list.append((pos, pos_content)) 110 | else: # multi_pos_multi_senses 111 | pos = chunks_containing_pos[0] 112 | for i, chunk_containing_pos in enumerate(chunks_containing_pos[1:]): 113 | if i == estimated_n_pos - 2: # if last chunk, check for all pos 114 | new_chunk_start = 0 115 | next_pos_start_idxs = find_all_remaining_pos(chunk_containing_pos) 116 | for next_pos_start_idx, pos_name_len in next_pos_start_idxs: 117 | pos_content = chunk_containing_pos[ 118 | new_chunk_start:next_pos_start_idx 119 | ] 120 | pos_list.append((pos, pos_content)) 121 | pos = chunk_containing_pos[ 122 | next_pos_start_idx : next_pos_start_idx + pos_name_len 123 | ].strip() 124 | new_chunk_start = next_pos_start_idx + pos_name_len 125 | if not next_pos_start_idxs: 126 | pos_list.append((pos, chunk_containing_pos)) 127 | else: 128 | next_pos_start_idx = chunk_containing_pos.rfind(" ") 129 | pos_content = chunk_containing_pos[:next_pos_start_idx] 130 | pos_list.append((pos, pos_content)) 131 | pos = chunk_containing_pos[next_pos_start_idx + 1 :] 132 | 133 | return pos_list 134 | 135 | 136 | def get_definition_list(pos_list): 137 | """Parse definitions from pos_content. 138 | 139 | Returns: 140 | definition_list (list): [(pos, definition-content), ...] 141 | 142 | """ 143 | definition_list = [] 144 | for pos, pos_content in pos_list: 145 | for definition_content in re.split(r" \d\. ", pos_content): 146 | definition_list.append((pos, definition_content)) 147 | return definition_list 148 | 149 | 150 | def get_tag_list(definition_list): 151 | """Parse monlam tag from definition content. 152 | 153 | Returns: 154 | tag_list (list): [(pos, tag, definition), ...] 155 | """ 156 | 157 | def parse_tag(text): 158 | if text[0] != "༡": 159 | return "", text 160 | tag_end_idx = text.find(" ") 161 | tag = text[:tag_end_idx] 162 | definition = text[tag_end_idx + 1 :] 163 | return tag, definition 164 | 165 | tag_list = [] 166 | for pos, definition_content in definition_list: 167 | tag, definition = parse_tag(definition_content) 168 | tag_list.append((pos, tag, definition)) 169 | return tag_list 170 | 171 | 172 | def get_sense_tag_list(tag_list): 173 | """Parse sense from definition. 174 | 175 | Sense here the first word of the given definition. 176 | 177 | Returns: 178 | sense_tag_list (list): [(pos, tag, sense_tag, definition), ...] 
179 | 180 | """ 181 | 182 | def get_first_segment(text, delimiter=" "): 183 | seg_idx = text.find(delimiter) 184 | if seg_idx == -1: 185 | return text 186 | return text[:seg_idx] 187 | 188 | sense_tag_list = [] 189 | for *pos_and_tag, definition in tag_list: 190 | first_segment = get_first_segment(definition) 191 | tokenized_segment = Text(first_segment).tokenize_words_raw_text 192 | sense = get_first_segment(tokenized_segment) 193 | sense_tag_list.append((*pos_and_tag, sense, definition)) 194 | return sense_tag_list 195 | 196 | 197 | def get_example_list(sense_tag_list): 198 | """Parse example from the definition.""" 199 | 200 | def parse_example(text, example_tag="དཔེར་ན།"): 201 | example_start_idx = text.rfind(example_tag) 202 | if example_start_idx == -1: 203 | return text, "" 204 | definition = text[:example_start_idx].strip() 205 | example = text[example_start_idx + len(example_tag) :].strip() 206 | return definition, example 207 | 208 | example_list = [] 209 | for *pos_tag_sense, definition in sense_tag_list: 210 | definition, example = parse_example(definition) 211 | example_list.append((*pos_tag_sense, definition, example)) 212 | return example_list 213 | 214 | 215 | def parse_attrs(form, text_containing_attrs): 216 | pos_list = get_pos_list(text_containing_attrs) 217 | definition_list = get_definition_list(pos_list) 218 | tag_list = get_tag_list(definition_list) 219 | sense_tag_list = get_sense_tag_list(tag_list) 220 | example_list = get_example_list(sense_tag_list) 221 | return example_list 222 | 223 | 224 | def monlam2wordlist(rows): 225 | word_list_rows = [] 226 | for row in rows: 227 | *_, form, result = row 228 | attrs = parse_attrs(form, result) 229 | print(row, attrs) 230 | return word_list_rows 231 | 232 | 233 | def dump_tsv(rows, out_path): 234 | with open(out_path, "w") as csv_file: 235 | writer = csv.writer(csv_file, delimiter="\t") 236 | writer.writerows(rows) 237 | -------------------------------------------------------------------------------- /pybo/pipeline/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/pybo/c65ee83a0659f721bccdf48db4901360e7d97048/pybo/pipeline/__init__.py -------------------------------------------------------------------------------- /pybo/pipeline/pipes.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import botok 3 | 4 | 5 | def get_chunks(raw_string): 6 | chunker = botok.Chunks(raw_string) 7 | chunks = chunker.make_chunks() 8 | chunks = chunker.get_readable(chunks) 9 | return chunks 10 | 11 | 12 | def shelve_info(chunks): 13 | shelved = [] 14 | clean_chunks = [] 15 | 16 | syl_count = 0 17 | for i, chunk in enumerate(chunks): 18 | marker, text = chunk 19 | if marker == "TEXT" or marker == "PUNCT": 20 | syl_count += 1 21 | 22 | # 2.a. extract transparent chars 23 | # TODO: adapt to also include \t as transparent char 24 | if "\n" in text: 25 | # remove transparent char 26 | text = text.replace("\n", "") 27 | index = (syl_count, "\n") 28 | 29 | shelved.append(index) 30 | clean_chunks.append((marker, text)) 31 | 32 | # 2.b. extract any non-bo chunk 33 | elif marker != "TEXT" and marker != "PUNCT": 34 | index = (syl_count, text) 35 | shelved.append(index) 36 | 37 | else: 38 | clean_chunks.append(chunk) 39 | 40 | return clean_chunks, shelved 41 | 42 | 43 | def pybo_prep(in_str): 44 | # 1. get chunks 45 | chunks = get_chunks(in_str) 46 | 47 | # 2. 
shelve needed info 48 | chunks, shelved = shelve_info(chunks) 49 | pybo_form_sep = pybo_form.__defaults__[0] 50 | pybo_form.__defaults__ = (pybo_form_sep, shelved) 51 | 52 | # 3. tokenize 53 | str_for_botok = "".join([c[1] for c in chunks]) 54 | 55 | return str_for_botok 56 | 57 | 58 | def get_tag(token, tag_code): 59 | maps = {"r": "text", "t": "text_cleaned", "p": "pos", "l": "lemma", "s": "sense"} 60 | try: 61 | return token[maps[tag_code]] 62 | except Exception: 63 | return "" 64 | 65 | 66 | def pybo_mod(tokens, tag_codes=[]): 67 | """extract text/pos tuples from Token objects""" 68 | txt_tags = [] 69 | for token in tokens: 70 | tags = [] 71 | tags.append(token.text) 72 | # Select and order the tags 73 | for tag_code in tag_codes: 74 | tags.append(get_tag(token, tag_code)) 75 | txt_tags.append(tags) 76 | return txt_tags 77 | 78 | 79 | def ws2uc(tags): 80 | """Convert whitespace in raw-text to underscore.""" 81 | tags[0] = tags[0].replace(" ", "_") 82 | return tags 83 | 84 | 85 | def n_chunks(token): 86 | return len([chunk for chunk in token.split("་") if chunk]) 87 | 88 | 89 | def pybo_form(tokens, sep=" ", shelved=None): 90 | """Format in a single string to be written to file""" 91 | if not shelved: 92 | print(shelved) 93 | out = [] 94 | shelved_idx = 0 95 | syl_count = 0 96 | 97 | # reinsert shelved tokens 98 | for token in tokens: 99 | out.append("/".join(ws2uc(token))) 100 | syl_count += n_chunks(token[0]) 101 | sheveled_syl_count, shelved_cleaned_chunk = shelved[shelved_idx] 102 | if "PART" not in token and sheveled_syl_count <= syl_count: 103 | out.append(ws2uc([shelved_cleaned_chunk])[0]) 104 | shelved_idx += 1 105 | 106 | # add all the remaining sheveld tokens 107 | if shelved_idx < len(shelved): 108 | for _, shelved_cleaned_chunk in shelved_cleaned_chunk[shelved_idx:]: 109 | out.append(ws2uc([shelved_cleaned_chunk])) 110 | else: 111 | out = ["/".join(ws2uc(token)) for token in tokens] 112 | return sep.join(out) 113 | -------------------------------------------------------------------------------- /pybo/rdr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/pybo/c65ee83a0659f721bccdf48db4901360e7d97048/pybo/rdr/__init__.py -------------------------------------------------------------------------------- /pybo/rdr/rdr.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from pathlib import Path 3 | from shutil import copyfile 4 | 5 | from bordr import rdr as r 6 | 7 | from .rdr_2_replace_matcher import rdr_2_replace_matcher 8 | 9 | from pybo.hfr_cqlr_converter import cqlr2hfr 10 | 11 | 12 | def rdr_postprocess(rules, infile, outdir=None, keep="model"): 13 | suffixes = [".DICT", ".INIT", ".RAW", ".RDR", ".sDict"] 14 | if not outdir: 15 | outdir = infile.parent.parent 16 | else: 17 | outdir = Path(outdir) 18 | 19 | # write adjustment rules file 20 | adj_file = outdir / (infile.stem + "_rules.tsv") 21 | adj_file.write_text(rules, encoding="utf-8-sig") 22 | 23 | # copy files to output directory 24 | for s in suffixes: 25 | if keep == "all": 26 | src = infile.parent / (infile.name + s) 27 | dst = outdir / (infile.name + s) 28 | if src != dst: 29 | copyfile(src, dst) 30 | Path(infile.parent / (infile.name + s)).unlink() 31 | elif keep == "model": 32 | if s in [".DICT", ".RDR"]: 33 | src = infile.parent / (infile.name + s) 34 | dst = outdir / (infile.name + s) 35 | if src != dst: 36 | copyfile(src, dst) 37 | Path(infile.parent / (infile.name + 
s)).unlink() 38 | else: 39 | Path(infile.parent / (infile.name + s)).unlink() 40 | elif keep == "none": 41 | Path(infile.parent / (infile.name + s)).unlink() 42 | else: 43 | raise SyntaxError("'keep' should either be 'all', 'model' or 'none'.") 44 | 45 | 46 | def rdr(infile, outdir=None, keep="model", type="cql"): 47 | """ 48 | 49 | :param infile: file to process. should be a POS tagged file 50 | :param outdir: optional. should be the output directory 51 | :param keep: all RDR files if "all", the .RDR and .DICT files if "model", none if None 52 | :return: RDR's log 53 | """ 54 | infile = Path(infile).resolve() 55 | 56 | # run the RDR training 57 | log = r(str(infile), mode="train", verbose=True) 58 | 59 | # translate to adjustment tsv 60 | rdr_rules = Path(infile.parent / (infile.name + ".RDR")).read_text( 61 | encoding="utf-8-sig" 62 | ) 63 | rules = rdr_2_replace_matcher(rdr_rules) 64 | if type is not "cql": 65 | rules = cqlr2hfr(rules) 66 | # remove RDR files and copy them if needed 67 | rdr_postprocess(rules, infile, outdir=outdir, keep=keep) 68 | 69 | return log if log else None 70 | -------------------------------------------------------------------------------- /pybo/rdr/rdr_2_replace_matcher.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | # variables 4 | tag = "object.tag" 5 | word = "object.word" 6 | prev_tag = "object.prevTag" 7 | next_tag = "object.nextTag" 8 | prev_word = "object.prevWord" 9 | next_word = "object.nextWord" 10 | conclusion = "object.conclusion" 11 | suffix = "object.suffixL" 12 | op = " == " 13 | ccl_op = " = " 14 | cond_sep = " and " 15 | rule_sep = " : " 16 | cql_rule_sep = " & " 17 | level_sep = "\t" 18 | 19 | positive = [tag, word, next_tag, next_word, conclusion, suffix] 20 | negative = [prev_tag, prev_word] 21 | eq_table = { 22 | tag: "pos", 23 | prev_tag: "pos", 24 | next_tag: "pos", 25 | conclusion: "pos", 26 | word: "text", 27 | prev_word: "text", 28 | next_word: "text", 29 | suffix: "text", 30 | } 31 | 32 | 33 | def rdr_2_replace_matcher(string): 34 | cql = format_rules(find_rules(find_levels(string))) 35 | repl = "{match_cql}\t{replace_index}\t=\t{replace_cql}" 36 | repls = [ 37 | repl.format(match_cql=a, replace_index=b, replace_cql=c) 38 | for a, b, c in cql 39 | if a != c 40 | ] 41 | return "\n".join(repls) 42 | 43 | 44 | def format_rules(rules): 45 | def generate_cql(test): 46 | if len(test) > 2: 47 | s, *_, e = sorted(test) 48 | elif len(test) == 2: 49 | s, e = sorted(test) 50 | else: 51 | s, e = 0, 0 52 | 53 | slots = [] 54 | slot_zero_idx = None 55 | for num, t in enumerate(range(s, e + 1)): 56 | if t == 0: 57 | slot_zero_idx = num + 1 58 | 59 | if t in test: 60 | conds = [f"{eq_table[tag]}={pos}" for tag, pos in test[t]] 61 | slots.append("[" + cql_rule_sep.join(conds) + "]") 62 | else: 63 | slots.append("[]") 64 | assert slot_zero_idx is not None 65 | return " ".join(slots), slot_zero_idx 66 | 67 | cql = [] 68 | for rule in rules: 69 | test_cql, idx = generate_cql(rule["test"]) 70 | ccl_cql, _ = generate_cql(rule["ccl"]) 71 | cql.append((test_cql, idx, ccl_cql)) 72 | return cql 73 | 74 | 75 | def find_levels(string): 76 | out = [] 77 | for line in string.split("\n"): 78 | if not line: 79 | continue 80 | count = 0 81 | while line[0] == level_sep: 82 | count += 1 83 | line = line[1:] 84 | out.append((count, line)) 85 | return out 86 | 87 | 88 | def find_rules(lines): 89 | rules = [] 90 | 91 | # state == {: , ...} 92 | # test == {: (, ), ...} 93 | state = {} 94 
| for level, line in lines: 95 | # if level 0, pass. there is no rule to implement 96 | if level == 0: 97 | continue 98 | 99 | tests, ccl = parse_line(line) 100 | ordered_tests = defaultdict(list) 101 | for t in tests: 102 | for pos, test in t.items(): 103 | ordered_tests[pos].append(test) 104 | 105 | # save current rule in state to use in indented rules 106 | state[level] = ordered_tests 107 | 108 | test = defaultdict(list) 109 | for l in range(1, level + 1): 110 | for pos, t in state[l].items(): 111 | for u in t: 112 | if u not in test[pos]: # avoid duplicates 113 | test[pos].append(u) 114 | rules.append({"test": test, "ccl": ccl}) 115 | return rules 116 | 117 | 118 | def parse_line(line): 119 | rule, ccl = line.split(rule_sep) 120 | tests = rule.split(cond_sep) 121 | ccl = parse_test(ccl) 122 | ccl[0] = [ccl[0]] 123 | tests = [parse_test(t) for t in tests] 124 | return tests, ccl 125 | 126 | 127 | def parse_test(test): 128 | def parser(test, op): 129 | pos = 0 130 | attr, tag = test.split(op) 131 | for p in positive: 132 | if p in attr and len(attr) > len(p): 133 | pos = int(attr[-1]) 134 | attr = attr[:-1] 135 | for n in negative: 136 | if n in attr and len(attr) > len(n): 137 | pos = -int(attr[-1]) 138 | attr = attr[:-1] 139 | 140 | if attr == suffix: 141 | tag = '".*' + tag[1:] 142 | return attr, pos, tag 143 | 144 | if op in test: 145 | attr, pos, tag = parser(test, op) 146 | elif ccl_op in test: 147 | attr, pos, tag = parser(test, ccl_op) 148 | else: 149 | raise SyntaxError 150 | return {pos: (attr, tag)} 151 | -------------------------------------------------------------------------------- /pybo/resources/particles.tsv: -------------------------------------------------------------------------------- 1 | # form pos lemma sense freq 2 | གི་ PART གི 3 | ཀྱི་ PART གི 4 | གྱི་ PART གི 5 | འི་ PART གི 6 | ཡི་ PART གི 7 | གིས་ PART གིས 8 | ཀྱིས་ PART གིས 9 | གྱིས་ PART གིས 10 | ཡིས་ PART གིས 11 | ས་ PART གིས 12 | སུ་ PART ལ 13 | ར་ PART ལ 14 | རུ་ PART ལ 15 | ཏུ་ PART ལ 16 | ན་ PART ལ 17 | ལ་ PART ལ 18 | དུ་ PART ལ 19 | སྟེ་ PART སྟེ 20 | ཏེ་ PART སྟེ 21 | དེ་ PART སྟེ 22 | ཀྱང་ PART ཀྱང 23 | ཡང་ PART ཀྱང 24 | འང་ PART ཀྱང 25 | གམ་ PART གམ 26 | ངམ་ PART གམ 27 | དམ་ PART གམ 28 | ནམ་ PART གམ 29 | བམ་ PART གམ 30 | མམ་ PART གམ 31 | འམ་ PART གམ 32 | རམ་ PART གམ 33 | ལམ་ PART གམ 34 | སམ་ PART གམ 35 | ཏམ་ PART གམ 36 | པ་ PART པ 37 | བ་ PART པ 38 | པོ་ PART པོ 39 | བོ་ PART པོ 40 | གོ་ PART གོ 41 | ངོ་ PART གོ 42 | དོ་ PART གོ 43 | ནོ་ PART གོ 44 | བོ་ PART གོ 45 | མོ་ PART གོ 46 | འོ་ PART གོ 47 | རོ་ PART གོ 48 | ལོ་ PART གོ 49 | སོ་ PART གོ 50 | ཏོ་ PART གོ 51 | ཅིང་ PART ཅིང 52 | ཤིང་ PART ཅིང 53 | ཞིང་ PART ཅིང 54 | ཅེས་ PART ཅེས 55 | ཞེས་ PART ཅེས 56 | ཅེའོ་ PART ཅེའོ 57 | ཤེའོ་ PART ཅེའོ 58 | ཞེའོ་ PART ཅེའོ 59 | ཅེ་ན་ PART ཅེ་ན 60 | ཤེ་ན་ PART ཅེ་ན 61 | ཞེ་ན་ PART ཅེ་ན 62 | ཅིག་ PART ཅིག 63 | ཤིག་ PART ཅིག 64 | ཞིག་ PART ཅིག 65 | ཀྱིན་ PART གིན 66 | གིན་ PART གིན 67 | གྱིན་ PART གིན 68 | ནས་ PART ནས 69 | -------------------------------------------------------------------------------- /pybo/segmentation_rule/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/pybo/c65ee83a0659f721bccdf48db4901360e7d97048/pybo/segmentation_rule/__init__.py -------------------------------------------------------------------------------- /pybo/segmentation_rule/make_rule.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def get_syls(token): 4 | syls = [] 5 | 
token_parts = re.split("(་)", token) 6 | syl = '' 7 | for walker, part in enumerate(token_parts): 8 | if part: 9 | if walker % 2 == 0: 10 | syl += part 11 | else: 12 | syl += part 13 | syls.append(syl) 14 | syl = "" 15 | return syls 16 | 17 | def parse_rule(rule): 18 | """Parse all the components of cql rule 19 | 20 | Args: 21 | rule (str): cql rule 22 | 23 | Returns: 24 | str: token info, index info, operator of the rule and conclusion tag 25 | """ 26 | part_of_rules = rule.split('\t') 27 | return part_of_rules[0], part_of_rules[1], part_of_rules[2], part_of_rules[3] 28 | 29 | def get_tokens(tokens_info): 30 | """Parse tokens from tokens info of a cql rule 31 | 32 | Args: 33 | tokens_info (str): tokens info in a cql rule 34 | 35 | Returns: 36 | list: tokens from token info 37 | """ 38 | tokens = re.findall(r'\[.*?\]', tokens_info) 39 | return tokens 40 | 41 | def parse_tok(token): 42 | try: 43 | bilou_tag = re.search(r'pos="(\S)"', token).group(1) 44 | except: 45 | bilou_tag = '' 46 | try: 47 | text = re.search(r'text="(\S+)" ?', token).group(1) 48 | except: 49 | text = '' 50 | text = re.sub('\.', '\\\S', text) 51 | return bilou_tag, text 52 | 53 | def add_extra_token_pat(ambiguous_seg_pat): 54 | """extra context tokens added to ambiguous seg pat 55 | 56 | Args: 57 | ambiguous_seg_pat (str): ambiguous segmentation pattern 58 | 59 | Returns: 60 | str: ambiguous seg pat with extra context token 61 | """ 62 | extra_token_pat = r' \S+?/\S ' 63 | ambiguous_seg_pat_with_extra_token_pat = f'{extra_token_pat}{ambiguous_seg_pat}{extra_token_pat}' 64 | ambiguous_seg_pat_with_extra_token_pat = ambiguous_seg_pat_with_extra_token_pat.replace(' ', ' ') 65 | return ambiguous_seg_pat_with_extra_token_pat 66 | 67 | def get_ambiguous_seg_pat(tokens_in_rule, index_info): 68 | """Return ambguous segmentation's pattern 69 | 70 | Args: 71 | tokens_in_rule (list): tokens in bilou rule 72 | 73 | Returns: 74 | str: ambiguos segmentation's pattern 75 | """ 76 | ambiguous_seg_pat = '' 77 | for token in tokens_in_rule: 78 | bilou_tag, text = parse_tok(token) 79 | if text: 80 | ambiguous_seg_pat += f' {text}' 81 | if bilou_tag: 82 | ambiguous_seg_pat += f'/{bilou_tag}' 83 | else: 84 | ambiguous_seg_pat += r'/\S' 85 | else: 86 | ambiguous_seg_pat += r" \S+?" 
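            # no text constraint in this rule token: lazily match any token text,
            # then narrow it by its BILOU tag below (or accept any tag with /\S)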
87 | if bilou_tag: 88 | ambiguous_seg_pat += f'/{bilou_tag}' 89 | else: 90 | ambiguous_seg_pat += r'/\S' 91 | if len(tokens_in_rule) < 4: 92 | ambiguous_seg_pat = add_extra_token_pat(ambiguous_seg_pat) 93 | return ambiguous_seg_pat 94 | 95 | def construct_token_info(ambiguous_seg_candidate): 96 | """Construct token info part of a cql rule 97 | 98 | Args: 99 | ambiguous_seg_candidate (list): ambiguous segmentation candidate's token list 100 | 101 | Returns: 102 | str: token info part of a cql rule 103 | """ 104 | token_info = '' 105 | for token in ambiguous_seg_candidate: 106 | token_parts = token.split('/') 107 | token_text = re.search(r'(\S+)<\S+',token_parts[0]).group(1) 108 | token_pos = re.search(r'<(\S+)>',token_parts[0]).group(1) 109 | if token_pos != 'NO_POS': 110 | token_info += f'[text="{token_text}" & pos="{token_pos}"] ' 111 | else: 112 | token_info += f'[text="{token_text}"] ' 113 | return token_info.strip() 114 | 115 | def get_ambiguous_seg_candidates(tokens_in_rule, index_info, bilou_tag_data): 116 | """Return all the possible ambiguous segmentation candidates containing tokens in rule 117 | 118 | Args: 119 | tokens_in_rule (list): tokens in bilou rule 120 | bilou_tag_data (str): bilou tagged data 121 | 122 | Returns: 123 | list: ambiguous segmentation candidates 124 | """ 125 | ambiguous_seg_candidates_tokens = [] 126 | ambiguous_seg_pat = get_ambiguous_seg_pat(tokens_in_rule, index_info) 127 | ambiguous_seg_candidates = re.findall(ambiguous_seg_pat, bilou_tag_data) 128 | ambiguous_seg_candidates = list(set(ambiguous_seg_candidates)) 129 | for ambiguous_seg_candidate in ambiguous_seg_candidates: 130 | ambiguous_seg_candidates_tokens.append([token for token in ambiguous_seg_candidate.split(' ') if token]) 131 | return ambiguous_seg_candidates_tokens 132 | 133 | def is_single_syl(token): 134 | """Check token is single syllable 135 | 136 | Args: 137 | token (str): token 138 | 139 | Returns: 140 | boolean: True if token is single syllable else False 141 | """ 142 | syls = [syl for syl in token.split('་') if syl] 143 | if len(syls) > 1: 144 | return False 145 | else: 146 | return True 147 | 148 | def parse_index_info(index_info): 149 | """Return index of the token from index info 150 | 151 | Args: 152 | index_info (str): index info of a cql rule 153 | 154 | Returns: 155 | int: index of token 156 | """ 157 | if '-' in index_info: 158 | index_info_parts = index_info.split('-') 159 | index = int(index_info_parts[0]) 160 | else: 161 | index = int(index_info) 162 | return index 163 | 164 | def splited_token_in_human_data(split_tok_text, human_data): 165 | spilt_suggestion = split_tok_text.strip() 166 | syls = get_syls(spilt_suggestion) 167 | for syl_walker, syl in enumerate(syls): 168 | split_possible = f' {syl} {"".join(syls[syl_walker+1:])} ' 169 | if split_possible in human_data: 170 | return split_possible, syl_walker+1 171 | return '', 0 172 | 173 | def get_splited_token(spilt_suggestion): 174 | """Split split suggestion and return it 175 | 176 | Args: 177 | spilt_suggestion (str): split suggestion 178 | 179 | Returns: 180 | str: opposite of split suggestion 181 | """ 182 | spilt_suggestion = spilt_suggestion.strip() 183 | syls = [syl.strip() for syl in spilt_suggestion.split('་') if syl and syl != ' '] 184 | suggestion = f'{syls[0]}་ {"་".join(syls[1:])}' 185 | if spilt_suggestion[-1] == '་': 186 | suggestion += '་' 187 | splited_token = f' {suggestion} ' 188 | return splited_token 189 | 190 | def is_false_positive_split(tokens_in_rule, index, splited_token, human_data): 191 | 
"""Check if the rule is a false positive split case or not 192 | 193 | Args: 194 | tokens_in_rule (list): tokens in rule 195 | index (int): index of token on which split is going to take 196 | splited_token (str): splited token 197 | human_data (str): human segmented data 198 | 199 | Returns: 200 | boolean: True if rule is false positive else false 201 | """ 202 | split_suggestion_with_context = '' 203 | splited_token = splited_token.strip() 204 | for token_walker, token in enumerate(tokens_in_rule, 1): 205 | token_text = re.search(r'text=\"(\S+)\"', token).group(1) 206 | if token_walker == 1: 207 | split_suggestion_with_context += f' {token_text} ' 208 | elif token_walker == index: 209 | split_suggestion_with_context += f'{splited_token} ' 210 | else: 211 | split_suggestion_with_context += f'{token_text} ' 212 | if split_suggestion_with_context in human_data: 213 | return False 214 | else: 215 | return True 216 | 217 | def is_invalid_split(tokens_info, index_info, human_data): 218 | """Return false if split suggestion is ambiguous segmentation else true 219 | 220 | Args: 221 | tokens_info (str): token info of a rule 222 | index_info (str): index info of a cql rule 223 | human_data (str): human segmented data 224 | 225 | Returns: 226 | boolean: True if invalid split rule else False 227 | """ 228 | index = parse_index_info(index_info) 229 | tokens = get_tokens(tokens_info) 230 | token_to_split = re.search(r'text=\"(\S+)\"', tokens[index-1]).group(1) 231 | if is_single_syl(token_to_split) or len(tokens) < index: 232 | return True, 0 233 | else: 234 | split_suggestion = f" {token_to_split} " 235 | splited_token, split_idx = splited_token_in_human_data(split_suggestion, human_data) 236 | if split_suggestion in human_data and splited_token and not is_false_positive_split(tokens, index, splited_token, human_data): 237 | return False, split_idx 238 | else: 239 | return True, 0 240 | 241 | def is_false_positive_merge(tokens_in_rule, index, human_data): 242 | """Check if rule is false positive merge or not 243 | 244 | Args: 245 | tokens_in_rule (list): tokens in rule 246 | index (int): index of token on which merge operation is going to perform 247 | human_data (str): human segmented data 248 | 249 | Returns: 250 | boolean: true if rule is false positive merge else false 251 | """ 252 | merge_suggestion_with_context = '' 253 | for token_walker, token in enumerate(tokens_in_rule, 1): 254 | token_text = re.search(r'text=\"(\S+)\"', token).group(1) 255 | if token_walker == 1: 256 | merge_suggestion_with_context += f' {token_text} ' 257 | elif token_walker == index: 258 | merge_suggestion_with_context += f'{token_text}' 259 | elif token_walker == index+1: 260 | merge_suggestion_with_context += f'{token_text} ' 261 | else: 262 | merge_suggestion_with_context += f'{token_text} ' 263 | if merge_suggestion_with_context in human_data: 264 | return False 265 | else: 266 | return True 267 | 268 | def is_invalid_merge(tokens_info, index_info, human_data): 269 | """Return false if merge suggestion is ambiguous segmentation else true 270 | 271 | Args: 272 | tokens_info (str): token info of a rule 273 | index_info (str): index info of a cql rule 274 | human_data (str): human segmented data 275 | 276 | Returns: 277 | boolean: True if invalid merge rule else False 278 | """ 279 | index = parse_index_info(index_info) 280 | tokens = get_tokens(tokens_info) 281 | if len(tokens) <= index or index == 0: 282 | return True 283 | else: 284 | part1 = re.search(r'text=\"(\S+)\"', tokens[index-1]).group(1) 285 | part2 = 
re.search(r'text=\"(\S+)\"', tokens[index]).group(1) 286 | merge_suggestion = f' {part1}{part2} ' 287 | splited_token_in_hd, split_idx = splited_token_in_human_data(merge_suggestion, human_data) 288 | if "།" not in merge_suggestion and (merge_suggestion in human_data and splited_token_in_hd) and not is_false_positive_merge(tokens, index, human_data): 289 | return False 290 | else: 291 | return True 292 | 293 | def filter_valid_rules(new_rules, human_data): 294 | """Return valid rules which can solve ambiguous segmentation errors 295 | 296 | Args: 297 | new_rules (list): cql rules 298 | human_data (str): human segmented data 299 | 300 | Returns: 301 | list: cql rules 302 | """ 303 | valid_rules = [] 304 | for new_rule in new_rules: 305 | tokens_info, index_info, operator, conclusion = parse_rule(new_rule) 306 | if ":" == operator: 307 | is_invalid_split_flag, split_idx = is_invalid_split(tokens_info, index_info, human_data) 308 | if not is_invalid_split_flag: 309 | new_rule = re.sub(r'-\d', f'-{split_idx}', new_rule) 310 | valid_rules.append(new_rule) 311 | elif "+" == operator: 312 | if not is_invalid_merge(tokens_info, index_info, human_data): 313 | valid_rules.append(new_rule) 314 | return valid_rules 315 | 316 | def get_new_rule(ambiguous_seg_candidates, index, conclusion, human_data): 317 | """Return list of usable cql rules by botok 318 | 319 | Args: 320 | ambiguous_seg_candidates (list): ambiguous segmentation candidates 321 | index (int): index of token on which operation needs to perform 322 | conclusion (str): conclusion tag of rule 323 | human_data (str): human segmented data 324 | 325 | Returns: 326 | list: usable cql rules of botok 327 | """ 328 | new_rules = [] 329 | for ambiguous_seg_candidate in ambiguous_seg_candidates: 330 | new_rule = f"{construct_token_info(ambiguous_seg_candidate)}\t" 331 | if 'B' in conclusion: 332 | new_rule += f'{index}\t+\t[]' 333 | elif 'I' in conclusion: 334 | new_rule += f'{index-1}\t+\t[]' 335 | elif 'S' in conclusion: 336 | new_rule += f'{index}-1\t:\t[] []' 337 | else: 338 | new_rule = '' 339 | if new_rule: 340 | new_rules.append(new_rule) 341 | unique_rules = list(set(new_rules)) 342 | filtered_rules = filter_valid_rules(unique_rules, human_data) 343 | return filtered_rules -------------------------------------------------------------------------------- /pybo/segmentation_rule/pipeline.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from pathlib import Path 4 | 5 | from bordr import rdr as r 6 | from botok.tokenizers.wordtokenizer import WordTokenizer 7 | 8 | from pybo.rdr.rdr_2_replace_matcher import rdr_2_replace_matcher 9 | from pybo.hfr_cqlr_converter import cqlr2hfr 10 | 11 | from pybo.segmentation_rule.make_rule import * 12 | from pybo.untokenize import assemble, pre_processing 13 | 14 | 15 | HOME = Path.home() 16 | DIALECT_PACK_DIR = HOME / "Documents" / "pybo" / "dialect_packs" 17 | DEFAULT_DPACK = "general" 18 | 19 | 20 | def get_botok_segmentation(sample_text): 21 | """Tokenize sample text using botok tokenizer 22 | 23 | Args: 24 | sample_text (str): Input string that needs to be tokenize 25 | 26 | Returns: 27 | str: sample text with space between each tokens 28 | """ 29 | wt = WordTokenizer() 30 | tokens = wt.tokenize(sample_text) 31 | segmented_sample_text = '' 32 | for token in tokens: 33 | token_text = token.text.replace(' ', '') 34 | if token.pos: 35 | token_pos = token.pos 36 | else: 37 | token_pos = token.chunk_type 38 | token_with_tag = f'{token_text}<{token_pos}> ' 
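        # if the raw token carried a newline, strip it out of the tagged form and
        # re-append it after the "<POS> " tag so the line break is preserved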
39 | if '\n' in token_with_tag: 40 | token_with_tag = token_with_tag.replace('\n', '') 41 | token_with_tag += '\n' 42 | segmented_sample_text += token_with_tag 43 | segmented_sample_text = segmented_sample_text.replace(' \n', '\n') 44 | return segmented_sample_text 45 | 46 | def post_process_botok_segmented_data(segmented_text): 47 | """Remove unwanted space from segmented text 48 | 49 | Args: 50 | segmented_text (str): Botok segmented text 51 | 52 | Returns: 53 | str: clean segmented text 54 | """ 55 | clean_segmented_text = segmented_text.replace('\n ', '\n') 56 | clean_segmented_text = clean_segmented_text.replace(' ', ' ') 57 | return clean_segmented_text 58 | 59 | 60 | def post_process_human_data(human_data): 61 | """Remove unwanted space and solves double shad(ཉིས་ཤད་) split cases 62 | 63 | Args: 64 | human_data (str): human segmented data 65 | 66 | Returns: 67 | str: clean human segmented data 68 | """ 69 | human_data = human_data.replace('། །', '།།') 70 | human_data = human_data.replace(' ', ' ') 71 | return human_data 72 | 73 | def get_toks(seg_str): 74 | """Extract list of tokens from segmented string 75 | 76 | Args: 77 | seg_str (str): segmented string which can be by human or botok 78 | 79 | Returns: 80 | list: list of tokens 81 | """ 82 | tokens = [token for token in seg_str.split(' ') if token] 83 | return tokens 84 | 85 | def parse_tok(botok_tok): 86 | """parse botok parts 87 | 88 | Args: 89 | botok_tok (str): botok tok 90 | 91 | Returns: 92 | str,str: text of token and pos of token 93 | """ 94 | pos = re.search(r'<.*?>', botok_tok)[0] 95 | text = botok_tok.replace(pos, '') 96 | return text, pos 97 | 98 | def get_bilou_tag_line(human_toks, botok_toks): 99 | """Add bilou tags to botok tokens and join them as a string with space between each tokens 100 | 101 | Args: 102 | human_toks (list): tokens from human segmented line 103 | botok_toks (list): tokens from botok segmented line 104 | 105 | Returns: 106 | str: botok tokens with bilou tag separated by space 107 | """ 108 | bilou_tag_line = '' 109 | while True: 110 | human_tok = human_toks[0] 111 | cur_tok = '' 112 | tok_walker= 0 113 | while tok_walker < len(botok_toks): 114 | botok_tok_text, botok_tok_pos = parse_tok(botok_toks[tok_walker]) 115 | if botok_tok_text == human_tok: 116 | bilou_tag_line += f'{botok_tok_text}{botok_tok_pos}/U ' 117 | botok_toks = botok_toks[tok_walker+1:] 118 | break 119 | elif botok_tok_text in human_tok: 120 | cur_tok += botok_tok_text 121 | if cur_tok == human_tok: 122 | bilou_tag_line += f'{botok_tok_text}{botok_tok_pos}/I ' 123 | botok_toks = botok_toks[tok_walker+1:] 124 | break 125 | elif re.search(f'^{botok_tok_text}', human_tok): 126 | bilou_tag_line += f'{botok_tok_text}{botok_tok_pos}/B ' 127 | else: 128 | bilou_tag_line += f'{botok_tok_text}{botok_tok_pos}/I ' 129 | elif re.search(human_tok, botok_tok_text): 130 | cur_tok = human_tok 131 | bilou_tag_line += f'{botok_tok_text}{botok_tok_pos}/S ' 132 | while re.search(cur_tok, botok_tok_text): 133 | human_toks = human_toks[1:] 134 | human_tok = human_toks[0] 135 | cur_tok += human_tok 136 | else: 137 | botok_toks = botok_toks[tok_walker:] 138 | if tok_walker != 0: 139 | break 140 | else: 141 | bilou_tag_line += f'{botok_tok_text}{botok_tok_pos}/S ' 142 | tok_walker += 1 143 | human_toks = human_toks[1:] 144 | if not human_toks: 145 | break 146 | return bilou_tag_line 147 | 148 | def get_detokenized_line(tokenized_line): 149 | tokens = pre_processing(tokenized_line) 150 | detokenized_line = assemble(tokens) 151 | return detokenized_line 
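# Minimal usage sketch of the helpers above (added for illustration; it is not
# part of the original pipeline and assumes botok plus a dialect pack are
# installed). It detokenizes one human-segmented line, re-segments it with botok
# and aligns the two segmentations with BILOU tags. The sample line comes from
# tests/data/corpus1; the POS tags shown in the comments are only indicative.
def _bilou_tagging_example(human_line="དཔེ་དེབ་ ནི་ དེབ་ ཀྱི་ དོན་གཅིག་ རེད་"):
    detokenized = get_detokenized_line(human_line)    # "དཔེ་དེབ་ནི་དེབ་ཀྱི་དོན་གཅིག་རེད་"
    botok_line = get_botok_segmentation(detokenized)  # e.g. "དཔེ་དེབ་<NOUN> ནི་<PART> ..."
    return get_bilou_tag_line(get_toks(human_line), get_toks(botok_line))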
152 | 153 | def get_bilou_tag_data(human_data): 154 | """Human data get detokenized. 155 | Detokenized text is tokenized by botok. 156 | Bilou tag is given to botok segmented data by comparing with human segmentation 157 | 158 | Args: 159 | human_data (str): segmented corpus data by human 160 | 161 | Returns: 162 | str: botok segmented data with bilou tag 163 | """ 164 | human_lines = human_data.splitlines() 165 | bilou_tag_data = '' 166 | for human_line in human_lines: 167 | detokenized_line = get_detokenized_line(human_line) 168 | botok_line = get_botok_segmentation(detokenized_line) 169 | human_toks = get_toks(human_line) 170 | botok_toks = get_toks(botok_line) 171 | bilou_tag_data += get_bilou_tag_line(human_toks, botok_toks) + '\n' 172 | return bilou_tag_data 173 | 174 | def get_split_suggestions(bilou_tag_data): 175 | """Return all the tokens with Split tag(S) 176 | 177 | Args: 178 | bilou_tag_data (str): Botok segmented data with bilou tag 179 | 180 | Returns: 181 | list: list of tokens with split tags 182 | """ 183 | split_suggestions = [split_token[:-2] for split_token in re.findall(r'\S+/S', bilou_tag_data)] 184 | return list(set(split_suggestions)) 185 | 186 | def get_merge_suggestions(bilou_tag_data): 187 | """Return all the tokens which are meant to be merge 188 | 189 | Args: 190 | bilou_tag_data (str): Botok segmented data with bilou tag 191 | 192 | Returns: 193 | list: list of tokens that are meant to be merge 194 | """ 195 | merge_suggestions = [merge_suggestion for merge_suggestion,_ in re.findall(r'(\S+/B (\S+/I )+)', bilou_tag_data)] 196 | return list(set(merge_suggestions)) 197 | 198 | def parse_merge_suggestion(merge_suggestion): 199 | """Return tokens in merge suggestion 200 | 201 | Args: 202 | merge_suggestion (str): merge suggestion extracted from bilou tagged text 203 | 204 | Returns: 205 | list: tokens in merge suggestion 206 | """ 207 | merge_suggestion_tokens = [re.search(r'(\S+)<\S+',token).group(1) for token in merge_suggestion.split(' ') if token] 208 | return merge_suggestion_tokens 209 | 210 | def get_counter_merge_suggestion(merge_suggestion_tokens): 211 | """Return opposite of merge suggestion 212 | 213 | Args: 214 | merge_suggestion_tokens (list): tokens in merge suggestion 215 | 216 | Returns: 217 | str: opposite of merge suggestion 218 | """ 219 | counter_merge_suggestion = ' '.join(merge_suggestion_tokens) 220 | if merge_suggestion_tokens[-1][-1] == '་': 221 | counter_merge_suggestion += " " 222 | return counter_merge_suggestion 223 | 224 | def get_remove_word_candidates(split_suggestions, human_data): 225 | """Return remove word candidate or non ambiguous spilt options from spilt suggestions using human data 226 | 227 | Args: 228 | split_suggestions (list): spilt suggestion extracted from bilou tagged text 229 | human_data (str): human segmented text 230 | 231 | Returns: 232 | list: remove word candidates 233 | """ 234 | remove_word_candidate = [] 235 | for split_suggestion_token in split_suggestions: 236 | split_suggestion_tok_text = re.search(r'(\S+)<\S+',split_suggestion_token).group(1) 237 | if not is_single_syl(split_suggestion_tok_text): 238 | split_suggestion = f' {split_suggestion_tok_text} ' 239 | splited_token, split_idx = splited_token_in_human_data(split_suggestion_tok_text, human_data) 240 | if split_suggestion not in human_data and splited_token: 241 | remove_word_candidate.append(split_suggestion_tok_text) 242 | return remove_word_candidate 243 | 244 | def get_new_word_candidate(merge_suggestion, human_data): 245 | """Return new word 
if merge suggestion is not ambiguous one else empty string return 246 | 247 | Args: 248 | merge_suggestion (str): merge sugeestion 249 | human_data (str): human segmented data 250 | 251 | Returns: 252 | str: new word candidate 253 | """ 254 | new_word = '' 255 | merge_suggestion_tokens = parse_merge_suggestion(merge_suggestion) 256 | new_word = ''.join(merge_suggestion_tokens) 257 | # counter_merge_suggestion = " " + get_counter_merge_suggestion(merge_suggestion_tokens) 258 | splited_token, split_idx = splited_token_in_human_data(new_word, human_data) 259 | if not splited_token: 260 | return new_word 261 | else: 262 | return '' 263 | 264 | def get_new_word_candidates(merge_suggestions, human_data): 265 | """Return all the new word candidate from merge suggestions using human data 266 | 267 | Args: 268 | merge_suggestions (list): merge suggestions extracted from bilou tagged text 269 | human_data (str): human segmented data 270 | 271 | Returns: 272 | list: new word candidate 273 | """ 274 | new_word_candidate = [] 275 | for merge_suggestion in merge_suggestions: 276 | new_word = get_new_word_candidate(merge_suggestion, human_data) 277 | if new_word: 278 | new_word_candidate.append(new_word) 279 | return new_word_candidate 280 | 281 | def filter_seg_errors(bilou_tag_data, human_data): 282 | """Filters out obivious segmentation error and extract new words and new remove words 283 | 284 | Args: 285 | bilou_tag_data (str): segmented botok data with bilou tag 286 | human_data (ste): segmented human data 287 | 288 | Returns: 289 | list: new word list and new remove word list 290 | """ 291 | new_word_candidate = [] 292 | new_remove_word_candidate = [] 293 | split_suggestions = get_split_suggestions(bilou_tag_data) 294 | merge_suggestions = get_merge_suggestions(bilou_tag_data) 295 | new_word_candidate = get_new_word_candidates(merge_suggestions, human_data) 296 | new_remove_word_candidate = get_remove_word_candidates(split_suggestions, human_data) 297 | return new_word_candidate, new_remove_word_candidate 298 | 299 | def rdr_postprocess(file_path): 300 | suffixes = [".DICT", ".INIT", ".RAW", ".sDict"] 301 | for s in suffixes: 302 | Path(file_path.parent / (file_path.name + s)).unlink() 303 | 304 | def remove_duplicate_word(word_list): 305 | return list(set(word_list)) 306 | 307 | def add_word_2_adjustment(words_2_add, corpus_file_name, dialect_pack_name, type='words'): 308 | """New word candidates or new remove word candidates are added with existing word list. 309 | Duplicates are then removed. 310 | Unique word list are then added to its file. 311 | 312 | Args: 313 | words_2_add (list): word list of new word candidates or new remove word candidates 314 | corpus_file_name (str): courpus file name 315 | dialect_pack_name (str): current working dialect pack name 316 | type (str, optional): type can be either words or remove. Defaults to 'words'. 
317 | 318 | Returns: 319 | list: latest word list of mentioned type 320 | """ 321 | old_word_list = [] 322 | word_list_path = (DIALECT_PACK_DIR / dialect_pack_name / "adjustments" / type / f'{corpus_file_name}.tsv') 323 | if word_list_path.is_file(): 324 | old_word_list = [old_word for old_word in word_list_path.read_text(encoding='utf-8-sig').splitlines() if old_word] 325 | new_word_list = old_word_list + words_2_add 326 | new_word_list = remove_duplicate_word(new_word_list) 327 | new_words = '\n'.join(new_word_list) 328 | word_list_path.write_text(new_words, encoding='utf-8-sig') 329 | print(f'[INFO]: New {type} added to adjustment {type} list..') 330 | return new_word_list 331 | 332 | def get_bilou_rules(bilou_tag_data_path): 333 | """Extract rdr rules by training RDR model using bilou tagged data. 334 | Convert rdr rules to cql rules and returning it. 335 | 336 | Args: 337 | bilou_tag_data_path (pathlib): path of bilou tagged data 338 | 339 | Returns: 340 | list: rdr rules converted into cql rules 341 | """ 342 | log = r(str(bilou_tag_data_path), mode="train", verbose=True) 343 | print('[INFO]: RDR TRAINING COMPLETED..') 344 | rdr_rules = Path(f"{bilou_tag_data_path}.RDR").read_text( 345 | encoding="utf-8-sig" 346 | ) 347 | bilou_rules = rdr_2_replace_matcher(rdr_rules).splitlines() 348 | bilou_rules = list(set(bilou_rules)) 349 | return bilou_rules 350 | 351 | def convert_bilou_rules(bilou_rules, bilou_tag_init, human_data): 352 | """Convert bilou rules to normal cql rules as rules with bilou tag are not usable by botok 353 | 354 | Args: 355 | bilou_rules (list): cql rules with bilou tag 356 | bilou_tag_init (str): bilou tagged initial text 357 | human_data (str): human segmented data 358 | 359 | Returns: 360 | list: usable cql rule by botok 361 | """ 362 | new_cql_rules = [] 363 | for bilou_rule in bilou_rules: 364 | tokens_info, index_info, operator, conclusion = parse_rule(bilou_rule) 365 | tokens_in_rule = get_tokens(tokens_info) 366 | ambiguous_seg_candidates = get_ambiguous_seg_candidates(tokens_in_rule, index_info, bilou_tag_init) 367 | new_cql_rules += get_new_rule(ambiguous_seg_candidates, int(index_info)+1, conclusion, human_data) # index incremented as extra context token involve 368 | new_cql_rules = list(set(new_cql_rules)) 369 | return new_cql_rules 370 | 371 | def extract_seg_rule(corpus_file_path, dialect_pack_name=DEFAULT_DPACK, type='cql', no_epochs = 3): 372 | """Extracts segmentation rules. 373 | 374 | Args: 375 | corpus_file_path (pathlib): input file's path 376 | dialect_pack_name (string, optional): name of dialect pack for which rules are. Defaults to DEFAULT_DPACK. 377 | type (str, optional): type of rules can be human friendly rule(hfr) or corpus query rule. Defaults to 'cql'. 378 | no_epochs (int, optional): Number of times word filters need to perform. Defaults to 3. 
379 | 380 | Returns: 381 | str: segmentation rules 382 | """ 383 | new_word_list = [] 384 | new_remove_word_list = [] 385 | corpus_file_name = corpus_file_path.stem[:-2] 386 | number_of_segmentation = 1 387 | human_data = corpus_file_path.read_text(encoding='utf-8-sig') 388 | human_data = post_process_human_data(human_data) 389 | while True: 390 | bilou_tag_data = get_bilou_tag_data(human_data) 391 | print(f'[INFO]: SEGMENTATION PHASE {number_of_segmentation} COMPLETED..') 392 | new_word_list, new_remove_word_list = filter_seg_errors(bilou_tag_data, human_data) 393 | print('[INFO]: FILTER SEGMENTATION ERROR COMPLETED..') 394 | if new_word_list: 395 | new_word_list = add_word_2_adjustment(new_word_list, corpus_file_name, dialect_pack_name, type='words') 396 | if new_remove_word_list: 397 | new_remove_word_list = add_word_2_adjustment(new_remove_word_list, corpus_file_name, dialect_pack_name, type='remove') 398 | bilou_tag_data = get_bilou_tag_data(human_data) 399 | word_list, remove_word_list = filter_seg_errors(bilou_tag_data, human_data) 400 | new_remove_word_list = [remove_word for remove_word in remove_word_list if remove_word not in new_remove_word_list] 401 | new_word_list = [word for word in word_list if word not in new_word_list] 402 | number_of_segmentation += 1 403 | if (not new_word_list and not new_remove_word_list) or number_of_segmentation > no_epochs: 404 | break 405 | bilou_tag_data_path = (corpus_file_path.parent / f'{corpus_file_name}_tr_data.txt') 406 | bilou_tag_data_path.write_text(bilou_tag_data, encoding='utf-8') 407 | bilou_rules = get_bilou_rules(bilou_tag_data_path) 408 | (corpus_file_path.parent / f'{corpus_file_name}_bilou_rules.txt').write_text("\n".join(bilou_rules), encoding='utf-8') 409 | new_cql_rules = [] 410 | bilou_tag_init = (corpus_file_path.parent / f'{bilou_tag_data_path.name}.INIT').read_text(encoding='utf-8-sig') 411 | new_cql_rules = convert_bilou_rules(bilou_rules, bilou_tag_init, human_data) 412 | new_cql_rules = "\n".join(new_cql_rules) 413 | rdr_postprocess(bilou_tag_data_path) 414 | if type != 'cql': 415 | new_cql_rules = cqlr2hfr(new_cql_rules) 416 | return new_cql_rules -------------------------------------------------------------------------------- /pybo/third_party/rules.txt: -------------------------------------------------------------------------------- 1 | # Rules for Sanskrit ordering 2 | # From Bod rgya tshig mdzod chen mo pages 9 - 11, 347, 1153, 1615, 1619, 1711, 1827, 2055, 2061, 2840, 2920, 3136 and 3137 3 | # Example: ཀར་ལུགས། < ཀརྐ་ཊ། 4 | &ཀར<ཀརྐ<ཀརྟ<ཀརྞ<ཀརྨ<ཀརྴ<ཀརྵ 5 | &ཀལ<ཀལྐ<ཀལྤ 6 | &ཀས<ཀསྨ 7 | &གཉ<གཉྫ 8 | &ཐར<ཐརྐ 9 | &པུས<པུསྟི 10 | &ཕལ<ཕལྒ 11 | &བིལ<བིལྦ 12 | &མཉ<མཉྫ 13 | &མར<མརྒ 14 | &ཝར<ཝརྟ 15 | &ཤས<ཤསྟ 16 | &སར<སརྒ 17 | &ཨར<ཨརྒ<ཨརྱ=ཨཪྱ 18 | &ཨས<ཨསྨ 19 | # Marks (seconadry different, with low equal primary weight after Lao) 20 | &[before 1]ཀ<།<<༎<<༏<<༐<<༑<<༔<<༴<་=༌ 21 | &ཀ<<ྈྐ<ཫ<དཀ<བཀ<རྐ<ལྐ<སྐ<བརྐ<བསྐ 22 | &ཁ<<ྈྑ<མཁ<འཁ 23 | &ག<དགག<དགང<དགད<དགན<དགབ<དགཝ<དགའ<དགར<དགལ<དགས<དགི<དགུ<དགེ<དགོ<དགྭ<དགྱ<དགྲ<བགག<བགང<བགད<བགབ<བགམ<<<བགཾ<བགཝ<བགའ 24 | <བགར<བགལ<བགི<བགུ<བགེ<བགོ<བགྭ<བགྱ<བགྲ<བགླ<མགག<མགང<མགད<མགབ<མགའ<མགར<མགལ<མགི<མགུ<མགེ<མགོ<མགྭ<མགྱ<མགྲ<འགག<འགང<འགད<འགན<འགབ<འགམ<<<འགཾ 25 | <འགའ<འགར<འགལ<འགས<འགི<འགུ<འགེ<འགོ<འགྭ<འགྱ<འགྲ<རྒ<ལྒ<སྒ<བརྒ<བསྒ 26 | &ང<<<ྂ<<<ྃ<དངག<དངང<དངད<དངན<དངབ<དངའ<དངར<དངལ<དངི<དངུ<དངེ<དངོ<མངག<མངང<མངད<མངན<མངབ<མངའ<མངར<མངལ<མངི<མངུ<མངེ<མངོ<རྔ<ལྔ<སྔ<བརྔ<བསྔ 27 | &ཅ<གཅ<བཅ<ལྕ<བལྕ 28 | &ཆ<མཆ<འཆ 29 | &ཇ<མཇ<འཇ<རྗ<ལྗ<བརྗ 30 | &ཉ<<ྋྙ<གཉ<མཉ<རྙ=ཪྙ<སྙ<བརྙ=བཪྙ<བསྙ 31 | &ཏ<ཊ<ཏྭ<ཏྲ<གཏ<བཏ<རྟ<ལྟ<སྟ<བརྟ<བལྟ<བསྟ 32 | &ཐ<ཋ<མཐ<འཐ 33 | 
&ད<ཌ<གདག<གདང<གདད<གདན<གདབ<གདམ<<<གདཾ<གདའ<གདར<གདལ<གདས<གདི<གདུ<གདེ<གདོ<གདྭ<བདག<བདང<བདད<བདབ<བདམ<<<བདཾ<བདའ 34 | <བདར<བདལ<བདས<བདི<བདུ<བདེ<བདོ<བདྭ<མདག<མདང<མདད<མདན<མདབ<མདའ<མདར<མདལ<མདས<མདི<མདུ<མདེ<མདོ<མདྭ<འདག<འདང<འདད<འདན<འདབ<འདམ<<<འདཾ 35 | <འདཝ<འདའ<འདར<འདལ<འདས<འདི<འདུ<འདེ<འདོ<འདྭ<འདྲ<རྡ<ལྡ<སྡ<བརྡ<བལྡ<བསྡ 36 | &ན<ཎ<གནག<གནང<གནད<གནན<གནབ<གནམ<<<གནཾ<གནཝ<གནའ<གནར<གནལ<གནས<གནི<གནུ<གནེ<གནོ<གནྭ<མནག<མནང<མནད<མནན<མནབ<མནམ<<<མནཾ<མནའ 37 | <མནར<མནལ<མནས<མནི<མནུ<མནེ<མནོ<མནྭ<རྣ<སྣ<བརྣ<བསྣ 38 | &པ<<ྉྤ<དཔག<དཔང<དཔད<དཔབ<དཔའ<དཔར<དཔལ<དཔས<དཔི<དཔུ<དཔེ<དཔོ<དཔྱ<དཔྲ<ལྤ<སྤ 39 | &ཕ<<ྉྥ<འཕ 40 | &བ<དབག<དབང<དབད<དབན<དབབ<དབའ<དབར<དབལ<དབས<དབི<དབུ<དབེ<དབོ<དབྱ<དབྲ<འབག<འབང<འབད<འབན<འབབ<འབམ 41 | <<<འབཾ<འབའ<འབར<འབལ<འབས<འབི<འབུ<འབེ<འབོ<འབྱ<འབྲ<རྦ<ལྦ<སྦ 42 | &མ<<<ཾ<དམག<དམང<དམད<དམན<དམབ<དམཝ<དམའ<དམར<དམལ<དམས<དམི<དམུ<དམེ<དམོ<དམྭ<དམྱ<རྨ<སྨ 43 | &ཙ<གཙ<བཙ<རྩ<སྩ<བརྩ<བསྩ 44 | &ཚ<མཚ<འཚ 45 | &ཛ<མཛ<འཛ<རྫ<བརྫ 46 | # &ཝ 47 | &ཞ<གཞ<བཞ 48 | &ཟ<གཟ<བཟ 49 | # &འ 50 | &ཡ<གཡ 51 | &ར<<<ཪ<ཬ<བརླ=བཪླ 52 | # &ལ 53 | &ཤ<ཥ<གཤ<བཤ 54 | &ས<གསག<གསང<གསད<གསན<གསབ<གསའ<གསར<གསལ<གསས<གསི<གསུ<གསེ<གསོ<གསྭ<བསག<བསང<བསད<བསབ<བསམ<<<བསཾ<བསའ<བསར 55 | <བསལ<བསས<བསི<བསུ<བསེ<བསོ<བསྭ<བསྲ<བསླ 56 | &ཧ<ལྷ 57 | &ཨ 58 | # Explicit vowels 59 | <ཱ<ི<ཱི<ྀ<ཱྀ<ུ<ཱུ<ེ<ཻ=ེེ<ོ<ཽ=ོོ 60 | # Post-radicals 61 | <ྐ<ྑ<ྒ<ྔ<ྕ<ྖ<ྗ<ྙ<ྟ<ྚ<ྠ<ྛ<ྡ<ྜ<ྣ<ྞ<ྤ<ྥ<ྦ<ྨ<ྩ<ྪ<ྫ<ྭ<<<ྺ<ྮ<ྯ<ྰ<ྱ<<<ྻ<ྲ<<<ྼ<ླ<ྴ 62 | <ྵ<ྶ<ྷ<ྸ 63 | # Combining marks and signs (secondary weight) 64 | &༹<<྄<<ཿ<<྅<<ྈ<<ྉ<<ྊ<<ྋ<<ྌ<<ྍ<<ྎ<<ྏ 65 | # Treatༀ, ཷand ,ཹ as decomposed 66 | &ཨོཾ=ༀ 67 | &ྲཱྀ=ཷ 68 | &ླཱྀ=ཹ -------------------------------------------------------------------------------- /pybo/untokenize.py: -------------------------------------------------------------------------------- 1 | 2 | def pre_processing(tokenized_text): 3 | tokens = [token for token in tokenized_text.split(' ') if token] 4 | return tokens 5 | 6 | def get_token_text(token): 7 | token_parts = [part for part in token.split('/') if part] 8 | return token_parts[0] 9 | 10 | def assemble(tokens): 11 | detokenized_text = '' 12 | for token in tokens: 13 | detokenized_text += get_token_text(token) 14 | return detokenized_text 15 | -------------------------------------------------------------------------------- /pybo/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/pybo/c65ee83a0659f721bccdf48db4901360e7d97048/pybo/utils/__init__.py -------------------------------------------------------------------------------- /pybo/utils/bo_sorted.py: -------------------------------------------------------------------------------- 1 | # # coding: utf-8 2 | # from icu import RuleBasedCollator 3 | # from pathlib import Path 4 | # 5 | # 6 | # rules = Path(__file__).parent / "../third_party/rules.txt" 7 | # collator = RuleBasedCollator( 8 | # "[normalization on]\n[reorder Tibt]\n" + rules.read_text(encoding="utf-8") 9 | # ) 10 | # 11 | # 12 | # def bo_sorted(word_list): 13 | # return sorted(word_list, key=collator.getSortKey) 14 | -------------------------------------------------------------------------------- /pybo/utils/profile_entries.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from collections import defaultdict 3 | from pathlib import Path 4 | 5 | 6 | def profile_entries(pathname): 7 | pathname = Path(pathname) 8 | entries = defaultdict(list) 9 | 10 | profile_files = [Path(__file__).parent.parent / "resources/particles.tsv"] 11 | for d in pathname.glob("*"): 12 | # filter unwanted directories and files 13 | dirs_ignored = ["adjustment", "entry_data"] 14 | 
if not d.is_dir() or d.name in dirs_ignored or d.name.startswith("."): 15 | continue 16 | 17 | profile_files.extend(list(d.glob("*.tsv"))) 18 | 19 | # add files 20 | for f in profile_files: 21 | lines = f.read_text(encoding="utf-8-sig").splitlines() 22 | for num, line in enumerate(lines): 23 | if line.startswith("#"): 24 | continue 25 | entry = line.split("\t", 1)[0] 26 | entries[entry].append(line) 27 | return entries 28 | -------------------------------------------------------------------------------- /pybo/utils/profile_report.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from collections import defaultdict 3 | from pathlib import Path 4 | 5 | 6 | def reorder_data(data): 7 | ordered = [] 8 | for entry, e in data.items(): 9 | count = 0 10 | for _, files in e.items(): 11 | count += len(files) 12 | ordered.append((count, {entry: e})) 13 | ordered = sorted(ordered, reverse=True, key=lambda x: x[0]) 14 | return ordered 15 | 16 | 17 | def profile_report(pathname): 18 | pathname = Path(pathname) 19 | data = {} 20 | 21 | for d in sorted(pathname.glob("*")): 22 | # filter unwanted directories and files 23 | dirs_ignored = ["adjustment", "entry_data"] 24 | if not d.is_dir() or d.name in dirs_ignored or d.name.startswith("."): 25 | continue 26 | 27 | for f in sorted(d.glob("*.tsv")): 28 | lines = f.read_text(encoding="utf-8-sig").splitlines() 29 | for num, line in enumerate(lines): 30 | if line.startswith("#"): 31 | continue 32 | entry = line.split("\t", 1)[0] 33 | path = f"{d.name}/{f.name}" 34 | 35 | if entry not in data: 36 | data[entry] = {} 37 | if line not in data[entry]: 38 | data[entry][line] = [] 39 | 40 | data[entry][line].append((path, num)) 41 | 42 | data = reorder_data(data) 43 | 44 | # filter and format all entries that have similar forms over files 45 | report = ["WORD\tENTRY\tFILE-NAME\tLINE-NUMBER"] 46 | count = defaultdict(int) 47 | for num, d in data: 48 | count[num] += 1 49 | for entry, e in d.items(): 50 | tmp = [] 51 | tmp.append(f"{entry}: {num}") 52 | for line, files in e.items(): 53 | tmp.append(f'\t"{line}"') 54 | tmp.extend([f"\t\t{f}\t{n}" for f, n in files]) 55 | report.extend(tmp) 56 | report = ( 57 | [f"total distinct entries: {len(data)}"] 58 | + [f"entries with {a} entries: {b}" for a, b in count.items()] 59 | + [""] 60 | + report 61 | ) 62 | report = "\n".join(report) 63 | 64 | # print to file 65 | out = pathname / (pathname.name + "_report.tsv") 66 | out.write_text(report, encoding="utf-8-sig") 67 | -------------------------------------------------------------------------------- /pybo/utils/regex_batch_apply.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import re 3 | 4 | 5 | def batch_apply_regex(string, pairs): 6 | for find, repl in pairs: 7 | string = re.sub(find, repl, string, flags=re.MULTILINE) 8 | return string 9 | 10 | 11 | def get_regex_pairs(lines, sep="\t-\t"): 12 | regex_pairs = [] 13 | clean_lines = _parse_lines(lines, sep) 14 | 15 | for line in clean_lines: 16 | find, replace = line.split(sep) 17 | regex_pairs.append((r"" + find, r"" + replace)) 18 | return regex_pairs 19 | 20 | 21 | def _parse_lines(lines, sep): 22 | cleaned = [] 23 | for num, line in enumerate(lines): 24 | # remove comment lines and empty lines 25 | if "#" in line: 26 | line = line[: line.find("#")] 27 | 28 | # strip line returns while keeping space chars and screen all empty lines 29 | line = line.strip("\n\r") 30 | if not line: 31 | continue 32 | 33 | # 
ensure there is 1 and only 1 occurrence of sep 34 | if line.count(sep) != 1: 35 | print(f"passing line {num + 1}: {line}.") 36 | continue 37 | 38 | cleaned.append(line) 39 | return cleaned 40 | -------------------------------------------------------------------------------- /pybo_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/pybo/c65ee83a0659f721bccdf48db4901360e7d97048/pybo_logo.png -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pre-commit 2 | coverage 3 | pytest 4 | covdefaults 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | click 2 | PyYAML 3 | botok >= 0.7.4 4 | pyewts 5 | bordr -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 88 3 | ignore = E203, W503, E501, F401, F403 4 | 5 | [isort] 6 | line_length = 88 7 | known_first_party = pybo 8 | multi_line_output = 3 9 | include_trailing_comma = True 10 | 11 | [coverage:run] 12 | plugins = covdefaults 13 | omit = .env/* 14 | 15 | [coverage:report] 16 | fail_under = 100 17 | show_missing = True 18 | skip_covered = True 19 | 20 | 21 | [semantic_release] 22 | version_variable = pybo/__init__.py:__version__ 23 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/env python 2 | # -*- coding: utf8 -*- 3 | 4 | from __future__ import print_function 5 | 6 | import re 7 | from pathlib import Path 8 | 9 | import setuptools 10 | from pkg_resources import parse_version 11 | 12 | assert parse_version(setuptools.__version__) >= parse_version("38.6.0") 13 | 14 | 15 | def get_version(prop, project): 16 | project = Path(__file__).parent / project / "__init__.py" 17 | result = re.search( 18 | r'{}\s*=\s*[\'"]([^\'"]*)[\'"]'.format(prop), project.read_text() 19 | ) 20 | return result.group(1) 21 | 22 | 23 | def read(fname): 24 | p = Path(__file__).parent / fname 25 | with p.open(encoding="utf-8") as f: 26 | return f.read() 27 | 28 | 29 | setuptools.setup( 30 | name="pybo", 31 | version=get_version("__version__", "pybo"), # edit version in pybo/__init__.py 32 | author="Esukhia development team", 33 | author_email="esukhiadev@gmail.com", 34 | description="Python utils for processing Tibetan", 35 | license="Apache2", 36 | keywords="nlp computational_linguistics search ngrams language_models linguistics toolkit tibetan", 37 | url="https://github.com/Esukhia/pybo", 38 | packages=setuptools.find_packages(), 39 | long_description=read("README.md"), 40 | long_description_content_type="text/markdown", 41 | project_urls={ 42 | "Source": "https://github.com/Esukhia/pybo", 43 | "Tracker": "https://github.com/Esukhia/pybo/issues", 44 | }, 45 | classifiers=[ 46 | "Development Status :: 3 - Alpha", 47 | "Topic :: Text Processing :: Linguistic", 48 | "Programming Language :: Python :: 3", 49 | "Operating System :: OS Independent", 50 | "Intended Audience :: Developers", 51 | "Intended Audience :: Science/Research", 52 | "License :: OSI Approved :: Apache Software License", 53 | "Natural Language :: Tibetan", 54 | ], 55 | python_requires=">=3.6", 56 | install_requires=["botok>=0.8.2", "pyyaml", "click", "pyewts", "bordr", "tibetan_sort", "pytest"], 57 | tests_require=["pytest"], 58 | entry_points={ 59 | "console_scripts": ["bo=pybo.cli:cli"] # command=package.module:function 60 | }, 61 | ) 62 | -------------------------------------------------------------------------------- /tests/01_raw_text.txt: -------------------------------------------------------------------------------- 1 | STEP 1: standard botok + custom words and rules 2 | - 1 raw text 3 | - 2 segmented text 4 | 5 | STEP 2: 6 | - 1 manually corrected segmentation and POS + extra information 7 | 8 | - 2 extract info from manually corrected 9 | - entry data: (script to create) 10 | - word lists + entry data 11 | - rules 12 | - rules: (RDR) 13 | - extract rules using RDR 14 | - filter them manually 15 | - convert them to botok matcher replacements 16 | 17 | https://github.com/buda-base/bonlp-datasets/blob/master/human2rdr.txt 18 | 19 | STEP 3: 20 | - resegment using only clean entry data 21 | - adjust wordlists and rules until it is as close to the manually corrected as possible 22 | - provide new entry data and 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | STEP1 + STEP 3 31 | pybo tok -p 32 | pybo tok -p2 33 | 34 | STEP2 35 | pybo extract profile 36 | output will be : 37 | - for entry data: words_bo + entry_data 38 | - for RDR: human readable rules to be proofed 39 | 40 | pybo convert 41 | will convert human readable selected rules to matcher replacement rules. 
42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | Documents/ 50 | pybo/ 51 | main_profile/ 52 | adjustment/ 53 | entry_data/ 54 | frequency/ 55 | words_bo/ 56 | words_non_inflected/ 57 | words_skrt/ 58 | custom_profile/ 59 | adjustment/ 60 | entry_data/ 61 | frequency/ 62 | words_bo/ 63 | words_non_inflected/ 64 | words_skrt/ -------------------------------------------------------------------------------- /tests/data/corpus1/corpus1.txt: -------------------------------------------------------------------------------- 1 | ལས་ཞེས་པ་ནི་ལས་བྱེད་པས་ལས་བྱེད་པ་ལ་ཟེར་རོ།། ལས་བྱེད་པས་ལས་མ་བྱེད་པ་ལ་ཟེར་བ་མ་ཡིན་ནོ།། སྐད་ཆ་དེ་ཡི་དོན་གཅིག་ནི་ནང་པ་གོ་བ་དང་དོན་གཞན་ནི་མི་ལ་གོ། དཔེ་དེབ་ནི་དེབ་ཀྱི་དོན་གཅིག་རེད་ གོ་བ་ནོར་པ་མ་ལེན་རོགས། 2 | ལས་ཞེས་པ་ནི་ལས་བྱེད་པས་ལས་བྱེད་པ་ལ་ཟེར་རོ།། ལས་བྱེད་པས་ལས་མ་བྱེད་པ་ལ་ཟེར་བ་མ་ཡིན་ནོ།། སྐད་ཆ་དེ་ཡི་དོན་གཅིག་ནི་ནང་པ་གོ་བ་དང་དོན་གཞན་ནི་མི་ལ་གོ། དཔེ་དེབ་ནི་དེབ་ཀྱི་དོན་གཅིག་རེད་ གོ་བ་ནོར་པ་མ་ལེན་རོགས། 3 | ལས་ཞེས་པ་ནི་ལས་བྱེད་པས་ལས་བྱེད་པ་ལ་ཟེར་རོ།། ལས་བྱེད་པས་ལས་མ་བྱེད་པ་ལ་ཟེར་བ་མ་ཡིན་ནོ།། སྐད་ཆ་དེ་ཡི་དོན་གཅིག་ནི་ནང་པ་གོ་བ་དང་དོན་གཞན་ནི་མི་ལ་གོ། དཔེ་དེབ་ནི་དེབ་ཀྱི་དོན་གཅིག་རེད་ གོ་བ་ནོར་པ་མ་ལེན་རོགས། 4 | ལས་ཞེས་པ་ནི་ལས་བྱེད་པས་ལས་བྱེད་པ་ལ་ཟེར་རོ།། ལས་བྱེད་པས་ལས་མ་བྱེད་པ་ལ་ཟེར་བ་མ་ཡིན་ནོ།། སྐད་ཆ་དེ་ཡི་དོན་གཅིག་ནི་ནང་པ་གོ་བ་དང་དོན་གཞན་ནི་མི་ལ་གོ། དཔེ་དེབ་ནི་དེབ་ཀྱི་དོན་གཅིག་རེད་ གོ་བ་ནོར་པ་མ་ལེན་རོགས། 5 | ལས་ཞེས་པ་ནི་ལས་བྱེད་པས་ལས་བྱེད་པ་ལ་ཟེར་རོ།། ལས་བྱེད་པས་ལས་མ་བྱེད་པ་ལ་ཟེར་བ་མ་ཡིན་ནོ།། སྐད་ཆ་དེ་ཡི་དོན་གཅིག་ནི་ནང་པ་གོ་བ་དང་དོན་གཞན་ནི་མི་ལ་གོ། དཔེ་དེབ་ནི་དེབ་ཀྱི་དོན་གཅིག་རེད་ གོ་བ་ནོར་པ་མ་ལེན་རོགས། གོ་བ་ནོར་པ་མ་ལེན་རོགས། -------------------------------------------------------------------------------- /tests/data/corpus1/corpus1_bilou_rules.txt: -------------------------------------------------------------------------------- 1 | [text="ཀྱི་"] [text="དོན་"] [pos="U" & text="གཅིག་"] 3 = [pos="I"] 2 | [pos="U" & text="ལས་"] [text="བྱེད་པ"] 1 = [pos="B"] 3 | [pos="U" & text="གོ་བ་"] [text="དང་"] [text="དོན་"] 1 = [pos="S"] 4 | [text="ཀྱི་"] [pos="U"] [text="གཅིག་"] 2 = [pos="B"] -------------------------------------------------------------------------------- /tests/data/corpus1/corpus1_hd.txt: -------------------------------------------------------------------------------- 1 | ལས་ ཞེས་པ་ ནི་ ལས་བྱེད་པ ས་ ལས་ བྱེད་པ་ ལ་ ཟེར་ རོ །། ལས་བྱེད་པ ས་ ལས་ མ་ བྱེད་པ་ ལ་ ཟེར་བ་ མ་ ཡིན་ ནོ །། སྐད་ཆ་ དེ་ ཡི་ དོན་ གཅིག་ ནི་ ནང་པ་ གོ་ བ་ དང་ དོན་ གཞན་ ནི་ མི་ ལ་ གོ ། དཔེ་དེབ་ ནི་ དེབ་ ཀྱི་ དོན་གཅིག་ རེད་ གོ་བ་ ནོར་པ་ མ་ ལེན་ རོགས ། 2 | ལས་ ཞེས་པ་ ནི་ ལས་བྱེད་པ ས་ ལས་ བྱེད་པ་ ལ་ ཟེར་ རོ །། ལས་བྱེད་པ ས་ ལས་ མ་ བྱེད་པ་ ལ་ ཟེར་བ་ མ་ ཡིན་ ནོ །། སྐད་ཆ་ དེ་ ཡི་ དོན་ གཅིག་ ནི་ ནང་པ་ གོ་ བ་ དང་ དོན་ གཞན་ ནི་ མི་ ལ་ གོ ། དཔེ་དེབ་ ནི་ དེབ་ ཀྱི་ དོན་གཅིག་ རེད་ གོ་བ་ ནོར་པ་ མ་ ལེན་ རོགས ། 3 | ལས་ ཞེས་པ་ ནི་ ལས་བྱེད་པ ས་ ལས་ བྱེད་པ་ ལ་ ཟེར་ རོ །། ལས་བྱེད་པ ས་ ལས་ མ་ བྱེད་པ་ ལ་ ཟེར་བ་ མ་ ཡིན་ ནོ །། སྐད་ཆ་ དེ་ ཡི་ དོན་ གཅིག་ ནི་ ནང་པ་ གོ་ བ་ དང་ དོན་ གཞན་ ནི་ མི་ ལ་ གོ ། དཔེ་དེབ་ ནི་ དེབ་ ཀྱི་ དོན་གཅིག་ རེད་ གོ་བ་ ནོར་པ་ མ་ ལེན་ རོགས ། 4 | ལས་ ཞེས་པ་ ནི་ ལས་བྱེད་པ ས་ ལས་ བྱེད་པ་ ལ་ ཟེར་ རོ །། ལས་བྱེད་པ ས་ ལས་ མ་ བྱེད་པ་ ལ་ ཟེར་བ་ མ་ ཡིན་ ནོ །། སྐད་ཆ་ དེ་ ཡི་ དོན་ གཅིག་ ནི་ ནང་པ་ གོ་ བ་ དང་ དོན་ གཞན་ ནི་ མི་ ལ་ གོ ། དཔེ་དེབ་ ནི་ དེབ་ ཀྱི་ དོན་གཅིག་ རེད་ གོ་བ་ ནོར་པ་ མ་ ལེན་ རོགས ། 5 | ལས་ ཞེས་པ་ ནི་ ལས་བྱེད་པ ས་ ལས་ བྱེད་པ་ ལ་ ཟེར་ རོ །། ལས་བྱེད་པ ས་ ལས་ མ་ བྱེད་པ་ ལ་ ཟེར་བ་ མ་ ཡིན་ ནོ །། སྐད་ཆ་ དེ་ ཡི་ དོན་ གཅིག་ ནི་ ནང་པ་ གོ་ བ་ དང་ དོན་ གཞན་ ནི་ མི་ ལ་ གོ ། དཔེ་དེབ་ ནི་ དེབ་ ཀྱི་ དོན་གཅིག་ རེད་ གོ་བ་ ནོར་པ་ མ་ ལེན་ རོགས ། གོ་བ་ ནོར་པ་ མ་ ལེན་ རོགས ། 
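A minimal usage sketch for the two utility modules shown further above (pybo/utils/profile_report.py and pybo/utils/regex_batch_apply.py). This is an editor's illustration, not a file from the repository; the profile directory and file names below are placeholders, not paths taken from the sources.

# Hypothetical driver for the utilities above -- not part of the pybo sources.
from pathlib import Path

from pybo.utils.profile_report import profile_report
from pybo.utils.regex_batch_apply import batch_apply_regex, get_regex_pairs

# Summarise entries that recur across the *.tsv files in the sub-folders of a
# profile directory; writes <dir>/<dir>_report.tsv inside that directory.
profile_report("main_profile")  # "main_profile" is a placeholder path

# Apply "find<TAB>-<TAB>replace" regex rules to a raw Tibetan text.
rules_lines = Path("rules.txt").read_text(encoding="utf-8-sig").splitlines()
pairs = get_regex_pairs(rules_lines)  # default separator is "\t-\t"
raw = Path("01_raw_text.txt").read_text(encoding="utf-8")
Path("output.txt").write_text(batch_apply_regex(raw, pairs), encoding="utf-8")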
-------------------------------------------------------------------------------- /tests/data/corpus1/corpus1_pybo_data.txt: -------------------------------------------------------------------------------- 1 | ལས་ ཞེས་པ་ ནི་ ལས་ བྱེད་པ ས་ ལས་ བྱེད་པ་ ལ་ ཟེར་ རོ །། ལས་ བྱེད་པ ས་ ལས་ མ་ བྱེད་པ་ ལ་ ཟེར་བ་ མ་ ཡིན་ ནོ །། སྐད་ཆ་ དེ་ ཡི་ དོན་ གཅིག་ ནི་ ནང་པ་ གོ་བ་ དང་ དོན་ གཞན་ ནི་ མི་ ལ་ གོ ། དཔེ་དེབ་ ནི་ དེབ་ ཀྱི་ དོན་ གཅིག་ རེད་ གོ་བ་ ནོར་པ་ མ་ ལེན་ རོགས ། 2 | ལས་ ཞེས་པ་ ནི་ ལས་ བྱེད་པ ས་ ལས་ བྱེད་པ་ ལ་ ཟེར་ རོ །། ལས་ བྱེད་པ ས་ ལས་ མ་ བྱེད་པ་ ལ་ ཟེར་བ་ མ་ ཡིན་ ནོ །། སྐད་ཆ་ དེ་ ཡི་ དོན་ གཅིག་ ནི་ ནང་པ་ གོ་བ་ དང་ དོན་ གཞན་ ནི་ མི་ ལ་ གོ ། དཔེ་དེབ་ ནི་ དེབ་ ཀྱི་ དོན་ གཅིག་ རེད་ གོ་བ་ ནོར་པ་ མ་ ལེན་ རོགས ། 3 | ལས་ ཞེས་པ་ ནི་ ལས་ བྱེད་པ ས་ ལས་ བྱེད་པ་ ལ་ ཟེར་ རོ །། ལས་ བྱེད་པ ས་ ལས་ མ་ བྱེད་པ་ ལ་ ཟེར་བ་ མ་ ཡིན་ ནོ །། སྐད་ཆ་ དེ་ ཡི་ དོན་ གཅིག་ ནི་ ནང་པ་ གོ་བ་ དང་ དོན་ གཞན་ ནི་ མི་ ལ་ གོ ། དཔེ་དེབ་ ནི་ དེབ་ ཀྱི་ དོན་ གཅིག་ རེད་ གོ་བ་ ནོར་པ་ མ་ ལེན་ རོགས ། 4 | ལས་ ཞེས་པ་ ནི་ ལས་ བྱེད་པ ས་ ལས་ བྱེད་པ་ ལ་ ཟེར་ རོ །། ལས་ བྱེད་པ ས་ ལས་ མ་ བྱེད་པ་ ལ་ ཟེར་བ་ མ་ ཡིན་ ནོ །། སྐད་ཆ་ དེ་ ཡི་ དོན་ གཅིག་ ནི་ ནང་པ་ གོ་བ་ དང་ དོན་ གཞན་ ནི་ མི་ ལ་ གོ ། དཔེ་དེབ་ ནི་ དེབ་ ཀྱི་ དོན་ གཅིག་ རེད་ གོ་བ་ ནོར་པ་ མ་ ལེན་ རོགས ། 5 | ལས་ ཞེས་པ་ ནི་ ལས་ བྱེད་པ ས་ ལས་ བྱེད་པ་ ལ་ ཟེར་ རོ །། ལས་ བྱེད་པ ས་ ལས་ མ་ བྱེད་པ་ ལ་ ཟེར་བ་ མ་ ཡིན་ ནོ །། སྐད་ཆ་ དེ་ ཡི་ དོན་ གཅིག་ ནི་ ནང་པ་ གོ་བ་ དང་ དོན་ གཞན་ ནི་ མི་ ལ་ གོ ། དཔེ་དེབ་ ནི་ དེབ་ ཀྱི་ དོན་ གཅིག་ རེད་ གོ་བ་ ནོར་པ་ མ་ ལེན་ རོགས ། གོ་བ་ ནོར་པ་ མ་ ལེན་ རོགས ། -------------------------------------------------------------------------------- /tests/data/corpus1/corpus1_rules.txt: -------------------------------------------------------------------------------- 1 | ["ནང་པ་"] ["གོ་བ་"] ["དང་"] ["དོན་"] 2-1 : [] [] 2 | ["ཀྱི་"] ["དོན་"] ["གཅིག་"] ["རེད་"] ["གོ་བ་"] 2 + [] 3 | ["དེབ་"] ["ཀྱི་"] ["དོན་"] ["གཅིག་"] ["རེད་"] 3 + [] -------------------------------------------------------------------------------- /tests/data/corpus1/corpus1_tr_data.txt: -------------------------------------------------------------------------------- 1 | ལས་/U ཞེས་པ་/U ནི་/U ལས་/B བྱེད་པ/I ས་/U ལས་/U བྱེད་པ་/U ལ་/U ཟེར་/U རོ/U །།/U ལས་/B བྱེད་པ/I ས་/U ལས་/U མ་/U བྱེད་པ་/U ལ་/U ཟེར་བ་/U མ་/U ཡིན་/U ནོ/U །།/U སྐད་ཆ་/U དེ་/U ཡི་/U དོན་/U གཅིག་/U ནི་/U ནང་པ་/U གོ་བ་/S དང་/U དོན་/U གཞན་/U ནི་/U མི་/U ལ་/U གོ/U །/U དཔེ་དེབ་/U ནི་/U དེབ་/U ཀྱི་/U དོན་/B གཅིག་/I རེད་/U གོ་བ་/U ནོར་པ་/U མ་/U ལེན་/U རོགས/U །/U 2 | ལས་/U ཞེས་པ་/U ནི་/U ལས་/B བྱེད་པ/I ས་/U ལས་/U བྱེད་པ་/U ལ་/U ཟེར་/U རོ/U །།/U ལས་/B བྱེད་པ/I ས་/U ལས་/U མ་/U བྱེད་པ་/U ལ་/U ཟེར་བ་/U མ་/U ཡིན་/U ནོ/U །།/U སྐད་ཆ་/U དེ་/U ཡི་/U དོན་/U གཅིག་/U ནི་/U ནང་པ་/U གོ་བ་/S དང་/U དོན་/U གཞན་/U ནི་/U མི་/U ལ་/U གོ/U །/U དཔེ་དེབ་/U ནི་/U དེབ་/U ཀྱི་/U དོན་/B གཅིག་/I རེད་/U གོ་བ་/U ནོར་པ་/U མ་/U ལེན་/U རོགས/U །/U 3 | ལས་/U ཞེས་པ་/U ནི་/U ལས་/B བྱེད་པ/I ས་/U ལས་/U བྱེད་པ་/U ལ་/U ཟེར་/U རོ/U །།/U ལས་/B བྱེད་པ/I ས་/U ལས་/U མ་/U བྱེད་པ་/U ལ་/U ཟེར་བ་/U མ་/U ཡིན་/U ནོ/U །།/U སྐད་ཆ་/U དེ་/U ཡི་/U དོན་/U གཅིག་/U ནི་/U ནང་པ་/U གོ་བ་/S དང་/U དོན་/U གཞན་/U ནི་/U མི་/U ལ་/U གོ/U །/U དཔེ་དེབ་/U ནི་/U དེབ་/U ཀྱི་/U དོན་/B གཅིག་/I རེད་/U གོ་བ་/U ནོར་པ་/U མ་/U ལེན་/U རོགས/U །/U 4 | ལས་/U ཞེས་པ་/U ནི་/U ལས་/B བྱེད་པ/I ས་/U ལས་/U བྱེད་པ་/U ལ་/U ཟེར་/U རོ/U །།/U ལས་/B བྱེད་པ/I ས་/U ལས་/U མ་/U བྱེད་པ་/U ལ་/U ཟེར་བ་/U མ་/U ཡིན་/U ནོ/U །།/U སྐད་ཆ་/U དེ་/U ཡི་/U དོན་/U གཅིག་/U ནི་/U ནང་པ་/U གོ་བ་/S དང་/U དོན་/U གཞན་/U ནི་/U མི་/U ལ་/U གོ/U །/U དཔེ་དེབ་/U ནི་/U དེབ་/U ཀྱི་/U དོན་/B གཅིག་/I རེད་/U གོ་བ་/U ནོར་པ་/U མ་/U ལེན་/U རོགས/U །/U 5 | ལས་/U ཞེས་པ་/U ནི་/U ལས་/B བྱེད་པ/I ས་/U ལས་/U བྱེད་པ་/U ལ་/U 
ཟེར་/U རོ/U །།/U ལས་/B བྱེད་པ/I ས་/U ལས་/U མ་/U བྱེད་པ་/U ལ་/U ཟེར་བ་/U མ་/U ཡིན་/U ནོ/U །།/U སྐད་ཆ་/U དེ་/U ཡི་/U དོན་/U གཅིག་/U ནི་/U ནང་པ་/U གོ་བ་/S དང་/U དོན་/U གཞན་/U ནི་/U མི་/U ལ་/U གོ/U །/U དཔེ་དེབ་/U ནི་/U དེབ་/U ཀྱི་/U དོན་/B གཅིག་/I རེད་/U གོ་བ་/U ནོར་པ་/U མ་/U ལེན་/U རོགས/U །/U གོ་བ་/U ནོར་པ་/U མ་/U ལེན་/U རོགས/U །/U 6 | -------------------------------------------------------------------------------- /tests/data/drokun_test/drokun_test.txt: -------------------------------------------------------------------------------- 1 | བདག་པས་གཞན་གཅེས་འགྲོ་ཀུན་བརྩེ་བས་སྐྱོང་། 2 | བདག་སོགས་འགྲོ་ཀུན་སྨིན་ཅིང་གྲོལ་བྱའི་ཕྱིར། 3 | དགེ་བས་འགྲོ་ཀུན་སངས་རྒྱས་ཐོབ་ཕྱིར་བསྔོ། 4 | བསོད་ནམས་དེས།།འགྲོ་ཀུན་བཤེས་གཉེན་བསྟེན་པར་ཤོག། 5 | བཅས་ཏེ་གནང་ཚུལ།འགྲོ་ཀུན་དང་བ་འདྲེན་ཕྱིར་བཀོད་པ་ 6 | མི་འདུག་པས།།འགྲོ་ཀུན་བརྩེ་བས་སྐྱོངས་ཤིག་ 7 | ཆུ་དེས་འགྲོ་ཀུན་ཚིམ་པ་རྨི། 8 | ཆུ་དེས་འགྲོ་ཀུན་ཚིམ་པ་དེ། 9 | ལགས་སོ།།།།གསུམ་པ།འགྲོ་ཀུན་དང་བ་འདྲེན་ཕྱིར་བཀོད་པ་ 10 | བདག་པས་གཞན་གཅེས་འགྲོ་ཀུན་བརྩེ་བས་སྐྱོང་། 11 | བདག་སོགས་འགྲོ་ཀུན་སྨིན་ཅིང་གྲོལ་བྱའི་ཕྱིར། 12 | དགེ་བས་འགྲོ་ཀུན་སངས་རྒྱས་ཐོབ་ཕྱིར་བསྔོ། 13 | བསོད་ནམས་དེས།།འགྲོ་ཀུན་བཤེས་གཉེན་བསྟེན་པར་ཤོག། 14 | བཅས་ཏེ་གནང་ཚུལ།འགྲོ་ཀུན་དང་བ་འདྲེན་ཕྱིར་བཀོད་པ་ 15 | མི་འདུག་པས།།འགྲོ་ཀུན་བརྩེ་བས་སྐྱོངས་ཤིག་ 16 | ཆུ་དེས་འགྲོ་ཀུན་ཚིམ་པ་རྨི། 17 | ཆུ་དེས་འགྲོ་ཀུན་ཚིམ་པ་དེ། 18 | ལགས་སོ།།།།གསུམ་པ།འགྲོ་ཀུན་དང་བ་འདྲེན་ཕྱིར་བཀོད་པ་ 19 | བཅས་ཏེ་གནང་ཚུལ།འགྲོ་ཀུན་དང་བ་འདྲེན་ཕྱིར་བཀོད་པ་ 20 | བདག་པས་གཞན་གཅེས་འགྲོ་ཀུན་བརྩེ་བས་སྐྱོང་། 21 | བདག་སོགས་འགྲོ་ཀུན་སྨིན་ཅིང་གྲོལ་བྱའི་ཕྱིར། 22 | དགེ་བས་འགྲོ་ཀུན་སངས་རྒྱས་ཐོབ་ཕྱིར་བསྔོ། 23 | བསོད་ནམས་དེས།།འགྲོ་ཀུན་བཤེས་གཉེན་བསྟེན་པར་ཤོག། 24 | བཅས་ཏེ་གནང་ཚུལ།འགྲོ་ཀུན་དང་བ་འདྲེན་ཕྱིར་བཀོད་པ་ 25 | མི་འདུག་པས།།འགྲོ་ཀུན་བརྩེ་བས་སྐྱོངས་ཤིག་ 26 | ཆུ་དེས་འགྲོ་ཀུན་ཚིམ་པ་རྨི། 27 | ཆུ་དེས་འགྲོ་ཀུན་ཚིམ་པ་དེ། 28 | ལགས་སོ།།།།གསུམ་པ།འགྲོ་ཀུན་དང་བ་འདྲེན་ཕྱིར་བཀོད་པ་ -------------------------------------------------------------------------------- /tests/data/drokun_test/drokun_test_bilou_rules.txt: -------------------------------------------------------------------------------- 1 | [text=""] [pos="U"] [text="།།།།"] 2 = [pos="S"] 2 | [pos="U"] [pos="U" & text="འགྲོ་"] [pos="U"] 2 = [pos="B"] 3 | [pos=""] [pos="U" & text="བདག་པ"] 2 = [pos="S"] 4 | [pos="U" & text="ཀུན་"] [pos="U"] [pos="U" & text="དེ"] 1 = [pos="U"] 5 | [text="བདག་པ"] [pos="U"] [text="གཞན་གཅེས་"] 2 = [pos="I"] 6 | [pos="U" & text="།།།།"] [text="གསུམ་པ"] [text="།"] 1 = [pos="S"] 7 | [pos="U" & text="ཀུན་"] [pos="U"] [pos="U"] 1 = [pos="I"] 8 | [text="དེ"] [] [pos="U" & text="།།"] 3 = [pos="S"] 9 | [text="གནང་ཚུལ"] [pos="U"] [pos="U" & text="འགྲོ་"] [pos="U"] 3 = [pos="U"] -------------------------------------------------------------------------------- /tests/data/drokun_test/drokun_test_hd.txt: -------------------------------------------------------------------------------- 1 | བདག་ པས་ གཞན་གཅེས་ འགྲོ་ཀུན་ བརྩེ་བ ས་ སྐྱོང་ ། 2 | བདག་ སོགས་ འགྲོ་ཀུན་ སྨིན་ ཅིང་ གྲོལ་བྱ འི་ ཕྱིར ། 3 | དགེ་བ ས་ འགྲོ་ཀུན་ སངས་རྒྱས་ ཐོབ་ ཕྱིར་ བསྔོ ། 4 | བསོད་ནམས་ དེ ས ། ། འགྲོ་ཀུན་ བཤེས་གཉེན་ བསྟེན་པ ར་ ཤོག ། 5 | བཅས་ ཏེ་ གནང་ཚུལ ། འགྲོ་ ཀུན་ དང་བ་ འདྲེན་ ཕྱིར་ བཀོད་པ་ 6 | མི་ འདུག་པ ས ། ། འགྲོ་ ཀུན་ བརྩེ་བ ས་ སྐྱོངས་ ཤིག་ 7 | ཆུ་ དེ ས་ འགྲོ་ཀུན་ ཚིམ་པ་ རྨི ། 8 | ཆུ་ དེ ས་ འགྲོ་ ཀུན་ ཚིམ་པ་ དེ ། 9 | ལགས་ སོ །། །། གསུམ་པ ། འགྲོ་ཀུན་ དང་བ་ འདྲེན་ ཕྱིར་ བཀོད་པ་ 10 | བདག་ པས་ གཞན་གཅེས་ འགྲོ་ཀུན་ བརྩེ་བ ས་ སྐྱོང་ ། 11 | བདག་ སོགས་ འགྲོ་ཀུན་ སྨིན་ ཅིང་ གྲོལ་བྱ འི་ ཕྱིར ། 12 | དགེ་བ ས་ འགྲོ་ཀུན་ སངས་རྒྱས་ ཐོབ་ ཕྱིར་ བསྔོ ། 13 | བསོད་ནམས་ དེ ས ། ། འགྲོ་ཀུན་ བཤེས་གཉེན་ བསྟེན་པ ར་ ཤོག ། 14 | བཅས་ ཏེ་ གནང་ཚུལ ། འགྲོ་ ཀུན་ དང་བ་ འདྲེན་ ཕྱིར་ བཀོད་པ་ 
15 | མི་ འདུག་པ ས ། ། འགྲོ་ ཀུན་ བརྩེ་བ ས་ སྐྱོངས་ ཤིག་ 16 | ཆུ་ དེ ས་ འགྲོ་ཀུན་ ཚིམ་པ་ རྨི ། 17 | ཆུ་ དེ ས་ འགྲོ་ ཀུན་ ཚིམ་པ་ དེ ། 18 | ལགས་ སོ །། །། གསུམ་པ ། འགྲོ་ཀུན་ དང་བ་ འདྲེན་ ཕྱིར་ བཀོད་པ་ 19 | བཅས་ ཏེ་ གནང་ཚུལ ། འགྲོ་ ཀུན་ དང་བ་ འདྲེན་ ཕྱིར་ བཀོད་པ་ 20 | བདག་ པས་ གཞན་གཅེས་ འགྲོ་ཀུན་ བརྩེ་བ ས་ སྐྱོང་ ། 21 | བདག་ སོགས་ འགྲོ་ཀུན་ སྨིན་ ཅིང་ གྲོལ་བྱ འི་ ཕྱིར ། 22 | དགེ་བ ས་ འགྲོ་ཀུན་ སངས་རྒྱས་ ཐོབ་ ཕྱིར་ བསྔོ ། 23 | བསོད་ནམས་ དེ ས ། ། འགྲོ་ཀུན་ བཤེས་གཉེན་ བསྟེན་པ ར་ ཤོག ། 24 | བཅས་ ཏེ་ གནང་ཚུལ ། འགྲོ་ ཀུན་ དང་བ་ འདྲེན་ ཕྱིར་ བཀོད་པ་ 25 | མི་ འདུག་པ ས ། ། འགྲོ་ ཀུན་ བརྩེ་བ ས་ སྐྱོངས་ ཤིག་ 26 | ཆུ་ དེ ས་ འགྲོ་ཀུན་ ཚིམ་པ་ རྨི ། 27 | ཆུ་ དེ ས་ འགྲོ་ ཀུན་ ཚིམ་པ་ དེ ། 28 | ལགས་ སོ །། །། གསུམ་པ ། འགྲོ་ཀུན་ དང་བ་ འདྲེན་ ཕྱིར་ བཀོད་པ་ 29 | -------------------------------------------------------------------------------- /tests/data/drokun_test/drokun_test_rules.txt: -------------------------------------------------------------------------------- 1 | [text="གཞན་གཅེས་"] [text="འགྲོ་" & pos="VERB"] [text="ཀུན་" & pos="DET"] 2 + [] 2 | [text="།" & pos="PUNCT"] [text="འགྲོ་" & pos="VERB"] [text="ཀུན་" & pos="DET"] 2 + [] 3 | [text="ས་" & pos="PART"] [text="འགྲོ་" & pos="VERB"] [text="ཀུན་" & pos="DET"] 2 + [] 4 | [text="སོགས་" & pos="DET"] [text="འགྲོ་" & pos="VERB"] [text="ཀུན་" & pos="DET"] 2 + [] -------------------------------------------------------------------------------- /tests/data/drokun_test/drokun_test_tr_data.txt: -------------------------------------------------------------------------------- 1 | བདག་པ/S ས་/I གཞན་གཅེས་/U འགྲོ་/B ཀུན་/I བརྩེ་བ/U ས་/U སྐྱོང་/U །/U 2 | བདག་/U སོགས་/U འགྲོ་/B ཀུན་/I སྨིན་/U ཅིང་/U གྲོལ་བྱ/U འི་/U ཕྱིར/U །/U 3 | དགེ་བ/U ས་/U འགྲོ་/B ཀུན་/I སངས་རྒྱས་/U ཐོབ་/U ཕྱིར་/U བསྔོ/U །/U 4 | བསོད་ནམས་/U དེ/U ས/U །།/S འགྲོ་/B ཀུན་/I བཤེས་གཉེན་/U བསྟེན་པ/U ར་/U ཤོག/U །/U 5 | བཅས་/U ཏེ་/U གནང་ཚུལ/U །/U འགྲོ་/U ཀུན་/U དང་བ་/U འདྲེན་/U ཕྱིར་/U བཀོད་པ་/U 6 | མི་/U འདུག་པ/U ས/U །།/U འགྲོ་/U ཀུན་/U བརྩེ་བ/U ས་/U སྐྱོངས་/U ཤིག་/U 7 | ཆུ་/U དེ/U ས་/U འགྲོ་/B ཀུན་/I ཚིམ་པ་/U རྨི/U །/U 8 | ཆུ་/U དེ/U ས་/U འགྲོ་/U ཀུན་/U ཚིམ་པ་/U དེ/U །/U 9 | ལགས་སོ/S །།།།/S གསུམ་པ/U །/U འགྲོ་/B ཀུན་/I དང་བ་/U འདྲེན་/U ཕྱིར་/U བཀོད་པ་/U 10 | བདག་པ/S ས་/I གཞན་གཅེས་/U འགྲོ་/B ཀུན་/I བརྩེ་བ/U ས་/U སྐྱོང་/U །/U 11 | བདག་/U སོགས་/U འགྲོ་/B ཀུན་/I སྨིན་/U ཅིང་/U གྲོལ་བྱ/U འི་/U ཕྱིར/U །/U 12 | དགེ་བ/U ས་/U འགྲོ་/B ཀུན་/I སངས་རྒྱས་/U ཐོབ་/U ཕྱིར་/U བསྔོ/U །/U 13 | བསོད་ནམས་/U དེ/U ས/U །།/S འགྲོ་/B ཀུན་/I བཤེས་གཉེན་/U བསྟེན་པ/U ར་/U ཤོག/U །/U 14 | བཅས་/U ཏེ་/U གནང་ཚུལ/U །/U འགྲོ་/U ཀུན་/U དང་བ་/U འདྲེན་/U ཕྱིར་/U བཀོད་པ་/U 15 | མི་/U འདུག་པ/U ས/U །།/U འགྲོ་/U ཀུན་/U བརྩེ་བ/U ས་/U སྐྱོངས་/U ཤིག་/U 16 | ཆུ་/U དེ/U ས་/U འགྲོ་/B ཀུན་/I ཚིམ་པ་/U རྨི/U །/U 17 | ཆུ་/U དེ/U ས་/U འགྲོ་/U ཀུན་/U ཚིམ་པ་/U དེ/U །/U 18 | ལགས་སོ/S །།།།/S གསུམ་པ/U །/U འགྲོ་/B ཀུན་/I དང་བ་/U འདྲེན་/U ཕྱིར་/U བཀོད་པ་/U 19 | བཅས་/U ཏེ་/U གནང་ཚུལ/U །/U འགྲོ་/U ཀུན་/U དང་བ་/U འདྲེན་/U ཕྱིར་/U བཀོད་པ་/U 20 | བདག་པ/S ས་/I གཞན་གཅེས་/U འགྲོ་/B ཀུན་/I བརྩེ་བ/U ས་/U སྐྱོང་/U །/U 21 | བདག་/U སོགས་/U འགྲོ་/B ཀུན་/I སྨིན་/U ཅིང་/U གྲོལ་བྱ/U འི་/U ཕྱིར/U །/U 22 | དགེ་བ/U ས་/U འགྲོ་/B ཀུན་/I སངས་རྒྱས་/U ཐོབ་/U ཕྱིར་/U བསྔོ/U །/U 23 | བསོད་ནམས་/U དེ/U ས/U །།/S འགྲོ་/B ཀུན་/I བཤེས་གཉེན་/U བསྟེན་པ/U ར་/U ཤོག/U །/U 24 | བཅས་/U ཏེ་/U གནང་ཚུལ/U །/U འགྲོ་/U ཀུན་/U དང་བ་/U འདྲེན་/U ཕྱིར་/U བཀོད་པ་/U 25 | མི་/U འདུག་པ/U ས/U །།/U འགྲོ་/U ཀུན་/U བརྩེ་བ/U ས་/U སྐྱོངས་/U ཤིག་/U 26 | ཆུ་/U དེ/U ས་/U འགྲོ་/B ཀུན་/I ཚིམ་པ་/U རྨི/U །/U 27 | ཆུ་/U དེ/U ས་/U འགྲོ་/U ཀུན་/U ཚིམ་པ་/U དེ/U །/U 28 | ལགས་སོ/S །།།།/S གསུམ་པ/U །/U འགྲོ་/B ཀུན་/I དང་བ་/U འདྲེན་/U ཕྱིར་/U 
བཀོད་པ་/U 29 | -------------------------------------------------------------------------------- /tests/data/marpa/marpa_bilou_rules.txt: -------------------------------------------------------------------------------- 1 | [text="ཅིག་"] [pos="U" & text="གསུངས"] [text="།"] 2 = [pos="S"] 2 | [text="ནང་"] [pos="S" & text="མཐུན་པ"] 2 = [pos="I"] 3 | [pos="U"] [pos="U" & text="དུ་"] [pos="S"] 2 = [pos="S"] 4 | [pos="U"] [] [text="ཤོམས་"] 1 = [pos="S"] 5 | [text="གྱི་"] [] [pos="U"] [pos="S"] [pos="S"] 3 = [pos="U"] 6 | [pos="S" & text="ཕྲིན་ལས་"] [pos="U"] [pos="U"] 1 = [pos="U"] 7 | [text="ལ་ཆ"] [pos="U"] 2 = [pos="I"] 8 | [pos="S"] [pos="S"] [pos="U"] [] [text="དབང་"] 3 = [pos="U"] 9 | [pos="U" & text="བྱས་པ་"] [] [text="།།"] 1 = [pos="S"] 10 | [text="ཆགས་པ་"] [] [pos="U"] 3 = [pos="S"] 11 | [text="གདུང་"] [pos="U"] 2 = [pos="S"] 12 | [pos="S" & text="བུ་"] [pos="U"] [pos="U" & text="བྱུང་བ་"] 1 = [pos="S"] 13 | [pos="U"] [pos="S"] [pos="U" & text="གདེངས་"] 3 = [pos="S"] 14 | [text="གདེངས་"] [pos="U" & text="དང་"] 2 = [pos="S"] 15 | [pos="U"] [text="འཕོས་"] 1 = [pos="S"] 16 | [text="ཀྱི་"] [pos="U" & text="ཞལ་"] [text="ནས"] 2 = [pos="S"] 17 | [text="ཞུས་པ་"] [text="ལགས"] [pos="U" & text="།།"] 3 = [pos="S"] 18 | [text="།"] [pos="U" & text="སྔར་"] [text="གྱི་"] 2 = [pos="S"] 19 | [text="ཕུག་རོན་"] [] [pos="U"] 3 = [pos="S"] 20 | [pos="U" & text="བུ་སློབ་"] [] [text="།།"] 1 = [pos="S"] 21 | [pos="U" & text="ཏེ"] [pos="U"] [pos="S"] 1 = [pos="S"] 22 | [pos="U"] [pos="S"] [pos="U" & text="མི་"] 3 = [pos="S"] 23 | [text="།"] [] [pos="U" & text="སྣ་ཚོགས་"] 3 = [pos="S"] 24 | [pos="S"] [pos="U" & text="ཅིག་"] [pos="U"] 2 = [pos="S"] 25 | [pos="S"] [pos="U"] [pos="U" & text="ཚེ"] 3 = [pos="S"] 26 | [text="འི་"] [] [pos="U" & text="འདྲ"] 3 = [pos="S"] 27 | [text="འཕོས་"] [] [pos="U"] 3 = [pos="S"] 28 | [pos="U" & text="སངས་རྒྱས་"] [text="ལ་"] [text="ཞུས་པ་"] 1 = [pos="S"] 29 | [pos="S"] [pos="U"] [pos="U" & text="སོང་བ་"] 3 = [pos="S"] 30 | [text="གསུངས་བ་"] [pos="U" & text="ལྟར"] [text="།"] 2 = [pos="S"] 31 | [text="ཡོད་པ་"] [pos="U" & text="དེ་"] 2 = [pos="S"] 32 | [text="སྒོས་"] [pos="U"] 2 = [pos="S"] 33 | [pos="S"] [pos="U" & text="གསུངས"] [pos="U"] 2 = [pos="S"] 34 | [pos="U" & text="ལས"] [text="།"] [text="སྔར་"] 1 = [pos="S"] 35 | [pos="U" & text="ཡིན"] [] [text="སྤྱན་མིག་"] 1 = [pos="S"] 36 | [text="པང་"] [pos="U"] 2 = [pos="S"] 37 | [pos="S"] [pos="U"] [pos="U" & text="མགུར་"] 3 = [pos="S"] 38 | [pos="S" & text="ཕྱག་འཚལ་"] [text="བསྟོད"] 1 = [pos="U"] 39 | [text="བར་"] [pos="U" & text="དུ་"] 2 = [pos="I"] 40 | [text="བླ་མ"] [pos="U" & text="འི་"] [text="ཞལ་"] 2 = [pos="S"] 41 | [pos="S"] [pos="U"] [pos="U" & text="བཞུགས་"] 3 = [pos="S"] 42 | [pos="U" & text="།"] [] [text="ཀ་"] 1 = [pos="S"] 43 | [pos="U"] [text="ལ་ལ"] 1 = [pos="S"] 44 | [pos="U"] [] [text="རྒན་རྒོན་"] 1 = [pos="S"] 45 | [pos="U" & text="ཡོད་"] [text="ན་"] 1 = [pos="S"] 46 | [pos="U" & text="རྫོགས་རིམ་"] [] [text="བསྡུས་"] 1 = [pos="S"] 47 | [text="འགྲོ་"] [pos="U" & text="ཀུན་"] 2 = [pos="I"] 48 | [pos="S"] [pos="U"] [pos="U" & text="ཀྱི་"] [] [text="ཡིན་"] 3 = [pos="U"] 49 | [pos="U" & text="ཚུལ་"] [pos="U" & text="དུ་"] [pos="S"] 2 = [pos="U"] 50 | [pos="U" & text="རྨི་ལམ་"] [text="བཟང་"] 1 = [pos="S"] 51 | [pos="U"] [] [text="ལ་ལ"] 1 = [pos="S"] 52 | [pos="S"] [pos="U"] [pos="S" & text="དུ་མ་"] 2 = [pos="U"] 53 | [pos="U" & text="ཀྱི་"] [pos="U"] [pos="S"] 1 = [pos="S"] 54 | [pos="U" & text="འགྲོ་"] [pos="U"] [pos="S"] 1 = [pos="S"] 55 | [pos="U"] [] [text="གོམས་པ་"] 1 = [pos="S"] 56 | [text="དང་"] [pos="U" & text="།"] [] 
[text="།"] 2 = [pos="U"] 57 | [text="སྤུར་ཁང་"] [pos="U"] 2 = [pos="S"] 58 | [pos="S"] [pos="U"] [pos="U" & text="སྣང་བ་"] 3 = [pos="S"] 59 | [pos="U"] [pos="S"] [pos="S"] 1 = [pos="S"] 60 | [text="སྒྲ་"] [pos="U"] [text="།"] 2 = [pos="S"] 61 | [pos="S"] [pos="U"] [pos="U"] [text="རྗེ་བཙུན་མི་"] 3 = [pos="U"] 62 | [pos="S"] [pos="U"] [pos="U" & text="བཅས་པ་"] 3 = [pos="S"] 63 | [pos="S"] [pos="U"] [pos="U" & text="དུར་ཁྲོད་"] 3 = [pos="S"] 64 | [text="དར་བ"] [] [pos="U"] 3 = [pos="S"] 65 | [text="བཀའ་བརྒྱུད་"] [pos="U"] 2 = [pos="S"] 66 | [text="དང་"] [text="།"] [pos="U"] [text="།"] 3 = [pos="U"] 67 | [pos="U"] [pos="U" & text="མི་"] [pos="S" & text="ང་རང་"] 2 = [pos="U"] 68 | [pos="U"] [pos="S" & text="ནགས་"] [pos="U"] 2 = [pos="U"] 69 | [pos="U"] [text=".*པ"] 1 = [pos="S"] 70 | [text="དགོངས་"] [text="ཏེ"] [pos="U" & text="།"] 3 = [pos="S"] 71 | [pos="S"] [pos="U" & text="གཅིག་"] [pos="U" & text="ལ་"] 2 = [pos="U"] 72 | [pos="U" & text="ལ"] [] [text="ད་ལྟ་"] 1 = [pos="S"] 73 | [text="གཟིགས་པ"] [text="ས"] [pos="U" & text="།"] 3 = [pos="S"] 74 | [text="ལ་ལ"] [pos="U"] 2 = [pos="S"] 75 | [text="ཕྱག་ལེན་"] [] [pos="U"] [pos="S"] [pos="S"] 3 = [pos="U"] 76 | [pos="U"] [text="བདག་མེད་མ་"] 1 = [pos="S"] 77 | [pos="U"] [] [text="རྔོག་སྟོན་"] 1 = [pos="S"] 78 | [pos="U"] [] [text="གང་བ་"] 1 = [pos="S"] 79 | [text="ལུག་རྫི་"] [] [pos="U"] 3 = [pos="S"] 80 | [pos="U" & text="ཅིང་"] [pos="U"] [pos="S"] 1 = [pos="S"] 81 | [text="མཆོད་པ་"] [pos="U" & text="ཕུལ་"] 2 = [pos="S"] 82 | [text="ཡོན་"] [] [pos="U" & text="།"] [pos="U"] 3 = [pos="U"] 83 | [pos="U"] [text="ཡབ་"] [text="ལ་"] 1 = [pos="U"] 84 | [pos="S" & text="ད་ལྟ་"] [pos="U"] 1 = [pos="U"] 85 | [pos="U" & text="ཡིན་"] [text="ནོ"] 1 = [pos="S"] 86 | [text="གནང་བ"] [] [pos="S"] 3 = [pos="U"] 87 | [text="བདག་ཅག་"] [pos="U"] [text="ཀྱི་"] 2 = [pos="S"] 88 | [pos="B"] [pos="U" & text="མེད་"] [text="ཕྱག་རྒྱ་ཆེན་པོ་"] [text="ལ"] 2 = [pos="S"] 89 | [pos="S"] [pos="S" & text="མཁར་"] [pos="U"] 3 = [pos="U"] 90 | [text="གཉིས་"] [pos="S" & text="ཀ་"] 2 = [pos="U"] 91 | [pos="U"] [text="ཕ་རོལ་"] 1 = [pos="S"] 92 | [text="ར་"] [pos="U" & text="གདའ"] [text="།།"] 2 = [pos="S"] 93 | [text="ཕ་ཇོ་"] [pos="U"] 2 = [pos="S"] 94 | [pos="U" & text="རྟོགས་པ་"] [text="དེ"] [text="ས"] 1 = [pos="S"] 95 | [pos="S"] [pos="U"] [pos="I"] 2 = [pos="I"] 96 | [text="བུ་སློབ་"] [pos="U" & text="ཀུན"] 2 = [pos="S"] 97 | [pos="U" & text="དགོངས་"] [text="ཏེ"] 1 = [pos="S"] 98 | [pos="U"] [] [text="ཀ་ཆེན་"] 1 = [pos="S"] 99 | [text="།"] [pos="U"] [text="།"] 2 = [pos="S"] 100 | [pos="U"] [text="བས"] 1 = [pos="B"] 101 | [pos="S" & text="ཕུག་རོན་"] [pos="U" & text="གྱི་"] 2 = [pos="U"] 102 | [pos="U"] [text="ལུག་རྫི་"] 1 = [pos="S"] 103 | [pos="U" & text="ཁྱེད་"] [text="རང་"] [text="རྣམས་"] 1 = [pos="S"] 104 | [pos="U" & text="གང་"] [text="གི་"] 1 = [pos="S"] 105 | [text="དང་"] [] [pos="U" & text="བླ་མ་"] [pos="S"] [pos="U"] 3 = [pos="U"] 106 | [pos="U"] [text="ཡབ་"] 1 = [pos="S"] 107 | [pos="U" & text="མི་"] [text="འདུག་པ"] [text="ས"] 1 = [pos="S"] 108 | [text="ཞན་"] [] [pos="U"] 3 = [pos="S"] 109 | [pos="U"] [] [text="ཕ་རོལ་"] 1 = [pos="S"] 110 | [pos="S" & text="སྲས་"] [text="ཀྱི་"] [text="ཐུགས་"] 1 = [pos="U"] 111 | [pos="U"] [text="ཐུགས་ཁྲལ་"] 1 = [pos="S"] 112 | [text="གྲགས་རྒྱུ"] [pos="U" & text="།།"] 2 = [pos="S"] 113 | [pos="U"] [] [text="གྲགས་རྒྱུ"] 1 = [pos="S"] 114 | [pos="U"] [pos="U"] [pos="U" & text="གི་"] [text="བུ"] 3 = [pos="U"] 115 | [pos="U"] [pos="S"] [pos="U" & text="ཙམ་"] 3 = [pos="S"] 116 | [pos="U"] [pos="U"] [pos="S" & text="དོན་དུ་"] 3 = [pos="U"] 117 | 
[text="ར་"] [] [pos="U" & text="ལས"] 3 = [pos="S"] 118 | [text="མི་"] [pos="U" & text="འདུག་པ"] [text="ས"] 2 = [pos="S"] 119 | [text="བསྐོར་བ་"] [] [pos="U"] 3 = [pos="S"] 120 | [pos="U"] [pos="S" & text="ནགས་"] [pos="U"] [text="།"] 2 = [pos="S"] 121 | [text="ཀླད་"] [] [pos="U"] 3 = [pos="S"] 122 | [pos="U"] [] [text=".*མེད་པ"] 1 = [pos="S"] 123 | [pos="U" & text="།"] [] [text="མི་"] 1 = [pos="S"] 124 | [pos="U" & text="བླ་མ"] [] [text="ཞལ་"] 1 = [pos="S"] 125 | [text="གཟིགས་པ"] [pos="U" & text="ས"] [text="།"] 2 = [pos="S"] 126 | [text="།།"] [pos="U" & text="སྙན་བརྒྱུད་"] 2 = [pos="S"] 127 | [pos="U" & text="ཡོད་"] [text="ཞུས་པ"] [text="ས"] 1 = [pos="S"] 128 | [pos="U"] [text="ཕུག་རོན་"] 1 = [pos="S"] 129 | [text="ཚོགས་"] [pos="U"] [text="བུ་སློབ་"] 2 = [pos="S"] 130 | [text="།"] [pos="U"] [text="རྟ་"] 2 = [pos="S"] 131 | [text="གྱི་"] [] [pos="U" & text="དང་"] [pos="U"] [pos="S"] 3 = [pos="U"] 132 | [text="བཀའ་བརྒྱུད་"] [] [pos="U"] 3 = [pos="S"] 133 | [pos="U" & text="བཞི"] [pos="U"] [pos="S"] 1 = [pos="S"] 134 | [text="རྒན་རྒོན་"] [pos="U"] 2 = [pos="S"] 135 | [pos="S"] [pos="U"] [pos="U" & text="འདུག་"] 3 = [pos="S"] 136 | [pos="U"] [text="བཀའ་བརྒྱུད་"] 1 = [pos="S"] 137 | [pos="U" & text="དྲུག་"] [pos="U"] [pos="S"] 1 = [pos="S"] 138 | [pos="U"] [pos="U" & text="ད་"] [pos="S"] 2 = [pos="S"] 139 | [pos="U" & text="ཀྱི་"] [text="ཞལ་"] [text="ནས"] 1 = [pos="S"] 140 | [text="ནམ་མཁ"] [pos="U" & text="འི་"] [text="མཐོངས་"] 2 = [pos="S"] 141 | [pos="U"] [pos="U" & text="མི་"] [pos="S"] 2 = [pos="S"] 142 | [text="ནོ"] [pos="U" & text="།།"] 2 = [pos="S"] 143 | [text="སེམས་ཅན་"] [text="གྱི་"] [pos="U" & text="དོན་"] 3 = [pos="S"] 144 | [pos="U" & text="ཐུགས་"] [text="དྲན་"] 1 = [pos="S"] 145 | [pos="S"] [pos="U"] [pos="U" & text="གུས་"] 3 = [pos="S"] 146 | [pos="S"] [pos="U"] [pos="I" & text=".*"] 2 = [pos="S"] 147 | [text="བདག་མེད་མ་"] [] [pos="U"] 3 = [pos="S"] 148 | [text="གདམས་ངག་"] [text="ལ"] [pos="U" & text="།།"] 3 = [pos="S"] 149 | [text="འཚལ་བ་"] [] [pos="U" & text="།།"] [pos="S"] [pos="S"] 3 = [pos="U"] 150 | [pos="S"] [pos="U" & text="གྱི་"] 2 = [pos="S"] 151 | [text="དེ"] [] [pos="U" & text="ཐར་པ"] 3 = [pos="S"] 152 | [pos="U" & text="།"] [] [text="སྣ་ཚོགས་"] 1 = [pos="S"] 153 | [pos="S" & text="རྨི"] [text="།།"] [text="ནཱ་རོ་པ་"] 1 = [pos="U"] 154 | [text="ཁྱོད་"] [pos="U" & text="ཀྱི་"] [pos="U"] [pos="S"] 2 = [pos="U"] 155 | [pos="U" & text="།"] [] [text="རྟ་"] 1 = [pos="S"] 156 | [pos="U"] [pos="U"] [pos="S" & text="གཤེགས"] 3 = [pos="U"] 157 | [pos="U" & text="མེ་ཏོག་"] [text="སྣ་ཚོགས་"] 1 = [pos="S"] 158 | [pos="S"] [pos="U"] [pos="U" & text="ཀྱི་"] 3 = [pos="S"] 159 | [pos="S" & text="སྲས་"] [text="དར་མ་མདོ་སྡེ་"] 1 = [pos="U"] 160 | [pos="U" & text="དང་"] [pos="U" & text="དགའ་སྤྲོ་"] [pos="S" & text="དཔག་ཏུ་མེད་པ་"] 1 = [pos="U"] 161 | [text="བུ་སློབ་"] [] [pos="U" & text="།།"] 3 = [pos="S"] 162 | [pos="S" & text="བུ་"] [pos="U" & text="དང་"] [pos="U"] 1 = [pos="S"] 163 | [pos="U"] [pos="U"] [pos="U" & text="འི་"] [pos="S"] [pos="S"] 3 = [pos="U"] 164 | [pos="S" & text="འཁོར་བ་"] [pos="U"] [pos="U"] 1 = [pos="U"] 165 | [text="བཙུགས་པ་"] [] [pos="U" & text="།།"] 3 = [pos="S"] 166 | [text="ལ་ད"] [] [pos="U"] 3 = [pos="I"] 167 | [text="ཡོད་"] [text="ཞུས་པ"] [pos="U" & text="ས"] 3 = [pos="S"] 168 | [pos="U"] [] [text="རྔོག་པ"] 1 = [pos="S"] 169 | [pos="U" & text="གཅིག་"] [pos="U"] [pos="S"] 1 = [pos="S"] 170 | [text="དེ་"] [pos="S" & text="ཀ་"] 2 = [pos="U"] 171 | [pos="U"] [] [text="ཨ་ཕོ་"] 1 = [pos="S"] 172 | [text="བསྐོར་བ་"] [pos="U"] 2 = [pos="S"] 173 | [pos="B"] [pos="U" & 
text="མེད་"] 2 = [pos="I"] 174 | [text="ཡོན་"] [] [pos="U"] 3 = [pos="S"] 175 | [pos="U"] [pos="S"] [pos="S" & text="མཁར་"] 1 = [pos="U"] 176 | [text="རྣམས་"] [] [pos="U" & text="རྨི་ལམ་"] 3 = [pos="S"] 177 | [pos="U" & text="།"] [] [text="།"] 1 = [pos="S"] 178 | [pos="U" & text="བླ་མ་"] [pos="S"] [pos="U"] 1 = [pos="S"] 179 | [pos="S"] [pos="S" & text="བུ་"] [pos="U"] [text="དང་"] 3 = [pos="U"] 180 | [pos="S"] [pos="U"] [pos="U" & text="ཕུལ་བ"] 3 = [pos="S"] 181 | [pos="U" & text="རྣམས"] [pos="U"] [pos="S"] 1 = [pos="S"] 182 | [pos="U"] [] [text="འཕོས་"] 1 = [pos="S"] 183 | [text="འོག་"] [pos="U"] [text="བུ་ཆེན་"] 2 = [pos="U"] 184 | [text="ཐུགས་ཁྲལ་"] [] [pos="U"] 3 = [pos="S"] 185 | [pos="S"] [pos="U"] [pos="S"] 2 = [pos="S"] 186 | [text="གནང་"] [pos="U" & text="བར་"] 2 = [pos="S"] 187 | [pos="U"] [text="གོམས་པ་"] 1 = [pos="S"] 188 | [text="མེས་སྟོན་"] [] [pos="U"] 3 = [pos="S"] 189 | [text="།"] [pos="U" & text="དབུ་"] 2 = [pos="S"] 190 | [pos="S" & text="ཕྱི་ཕྱག་"] [pos="U"] 1 = [pos="U"] 191 | [pos="U"] [pos="U" & text="ཡང་"] [pos="S"] 2 = [pos="S"] 192 | [text="ན་"] [pos="U" & text="ཡོད་པ་"] 2 = [pos="S"] 193 | [text="།།"] [] [pos="S" & text="མེ་"] 3 = [pos="U"] 194 | [pos="U"] [text="ཕུག་རོན་"] [text="གྱི་"] 1 = [pos="U"] 195 | [pos="U"] [] [text="རྗེ་བཙུན་མི་"] 1 = [pos="S"] 196 | [pos="S"] [pos="U" & text="གྱི་"] [] [text="ཞིག་"] 2 = [pos="U"] 197 | [pos="S" & text="ཕུག་རོན་"] [text="གྱི་"] 1 = [pos="U"] 198 | [pos="U" & text="རྟ་"] [text="ལ་"] 1 = [pos="S"] 199 | [text="རྟོགས་པ་"] [pos="U"] [text="ས"] 2 = [pos="S"] 200 | [pos="U" & text="བར་"] [text="དུ་"] 1 = [pos="B"] 201 | [text="ཕུག་རོན་"] [text="གྱི་"] [pos="U"] 3 = [pos="U"] 202 | [text="བུ་ཆེན་"] [] [pos="U"] 3 = [pos="S"] 203 | [text="ཤིང་"] [] [pos="U" & text="ལ་"] 3 = [pos="S"] 204 | [text="ཕ་ཇོ་"] [] [pos="U"] 3 = [pos="S"] 205 | [text="།"] [pos="U"] [text="སྲས་"] 2 = [pos="S"] 206 | [pos="S" & text="དཔག་ཏུ་མེད་པ་"] [text="ཞིག་"] 1 = [pos="U"] 207 | [text="རྗེ་བཙུན་མི་"] [pos="U"] 2 = [pos="S"] 208 | [pos="U"] [text="རྗེ་བཙུན་མི་"] 1 = [pos="S"] 209 | [text="ལ་ལ"] [] [pos="U"] 3 = [pos="S"] 210 | [pos="U" & text="ཐུགས་"] [text="ཉམས་"] 1 = [pos="B"] 211 | [pos="U"] [] [text="བུ་ཆེན་"] 1 = [pos="S"] 212 | [text="ཡོད་པ་"] [pos="U" & text="ཡིན་"] 2 = [pos="S"] 213 | [pos="U" & text="གཅིག་པ"] [text=".*པ" & pos="U"] 1 = [pos="U"] 214 | [pos="U"] [] [text="མེས་སྟོན་"] 1 = [pos="S"] 215 | [pos="S" & text="ཡབ་ཡུམ་"] [pos="U"] [pos="U"] 1 = [pos="U"] 216 | [pos="U"] [pos="U"] [pos="U" & text="ང་"] [pos="S"] [pos="S"] 3 = [pos="I"] 217 | [text="དགོས་པ་"] [text="ཡོད་"] [pos="U" & text="དོ"] 3 = [pos="S"] 218 | [text="བསྒོམས་"] [pos="U"] 2 = [pos="S"] 219 | [pos="S"] [pos="S" & text="རྒྱུ་"] [pos="U"] 3 = [pos="U"] 220 | [text="རྟ་"] [pos="U" & text="ལ་"] 2 = [pos="S"] 221 | [pos="U" & text="འོག་"] [pos="U"] [text="བུ་ཆེན་" & pos="S"] 1 = [pos="U"] 222 | [pos="U"] [text="བས"] [text="།།"] 1 = [pos="S"] 223 | [text="བདག་མེད་མ་"] [pos="U"] 2 = [pos="S"] 224 | [text="ལ་ལུང་"] [pos="U" & text="བསྟན་"] 2 = [pos="I"] 225 | [pos="S"] [pos="U"] [pos="U" & text="གསུངས་པ"] 3 = [pos="S"] 226 | [pos="U" & text="ལ་"] [text="རྨི་ལམ་"] 1 = [pos="S"] 227 | [text="རྨི་ལམ་"] [pos="U" & text="བཟང་"] 2 = [pos="S"] 228 | [pos="S" & text="ཆིབས་"] [text="ནས་"] 1 = [pos="U"] 229 | [text="བཟང་"] [] [pos="U" & text="འདི"] 3 = [pos="S"] 230 | [text="ཇི་ལྟར་"] [] [pos="U"] [text=".*"] 3 = [pos="U"] 231 | [pos="U"] [pos="S"] [pos="U" & text="དེ"] 3 = [pos="S"] 232 | [pos="S"] [pos="U" & text="ར་"] [pos="S" & text="ཆད་མེད་པ"] 2 = [pos="I"] 233 | [text="གཉིས་"] 
[pos="S"] [pos="U"] [pos="S"] 3 = [pos="U"] 234 | [pos="U" & text="ར་"] [] [text="ལས"] 1 = [pos="S"] 235 | [pos="U" & text="ན་"] [] [text="ཡིན"] 1 = [pos="S"] 236 | [text="གང་"] [pos="U" & text="གི་"] 2 = [pos="S"] 237 | [pos="S" & text="ང་རང་"] [pos="U" & text="གི་"] [pos="S"] 2 = [pos="U"] 238 | [text="ལ་"] [pos="U"] [text="དོན་"] 2 = [pos="S"] 239 | [text="རང་"] [pos="U" & text="རྣམས་"] 2 = [pos="S"] 240 | [pos="S"] [pos="S"] [pos="U"] 3 = [pos="S"] 241 | [text="ལ"] [] [pos="S" & text="ད་ལྟ་"] [pos="U"] 3 = [pos="S"] 242 | [pos="S" & text="སྒོས་"] [pos="S"] [pos="U"] 3 = [pos="U"] 243 | [text="ཁྱེད་"] [pos="U" & text="རང་"] [text="རྣམས་"] 2 = [pos="S"] 244 | [text="ལ་ད"] [pos="U"] 2 = [pos="I"] 245 | [text="གསུངས་པ་"] [pos="U" & text="ལྟར་"] 2 = [pos="S"] 246 | [text="ཕ་རོལ་"] [pos="U"] 2 = [pos="S"] 247 | [text="ན་"] [] [pos="U" & text="ཡིན"] 3 = [pos="S"] 248 | [pos="U" & text="ལ་"] [text="ཞུས་པ་"] 1 = [pos="S"] 249 | [pos="U"] [pos="U" & text="རོ་"] [pos="I"] 2 = [pos="B"] 250 | [text="ས"] [] [pos="S" & text="བུ་"] [pos="U"] [pos="U"] 3 = [pos="S"] 251 | [text="ཐུགས་"] [pos="U" & text="དྲན་"] 2 = [pos="S"] 252 | [text="ར་"] [pos="S"] [text="ན་"] 2 = [pos="U"] 253 | [pos="S" & text="བུ་"] [pos="U"] [pos="U"] 1 = [pos="U"] 254 | [text="ཡོད་"] [pos="U" & text="ཞུས་པ"] 2 = [pos="S"] 255 | [pos="U"] [pos="S" & text="བཞི་"] [pos="U"] 2 = [pos="U"] 256 | [text="དང་"] [text="།"] [pos="U" & text="ནཱ་རོ"] 3 = [pos="S"] 257 | [text="དགོས་པ་"] [pos="U" & text="ཡོད་"] 2 = [pos="S"] 258 | [pos="U"] [text=".*ཁ"] 1 = [pos="S"] 259 | [pos="U" & text="གནང་"] [text="བར་"] 1 = [pos="B"] 260 | [text="རྨི་ལམ་"] [text="བཟང་"] [pos="U" & text="།།"] 3 = [pos="S"] 261 | [pos="U" & text="བྱོན་"] [pos="S"] [pos="S"] 1 = [pos="U"] 262 | [text="པང་"] [] [pos="U"] 3 = [pos="S"] 263 | [pos="U" & text="སྐུ་དྲིན་"] [text="ཅན"] [text="།།"] 1 = [pos="B"] 264 | [pos="U" & text="དགོས་པ་"] [text="ཡོད་"] 1 = [pos="S"] 265 | [pos="S"] [pos="U"] [pos="U" & text="བྱུང་བ་"] 3 = [pos="S"] 266 | [pos="U" & text="ནང་"] [text="མཐུན་པ"] 1 = [pos="B"] 267 | [pos="U" & text="ར་"] [text="གདའ"] [text="།།"] 1 = [pos="S"] 268 | [text="དོན་དུ་"] [pos="U"] [text="ལགས་"] 2 = [pos="S"] 269 | [pos="U" & text="མཐའ་"] [pos="U"] [pos="S"] 1 = [pos="S"] 270 | [pos="U" & text="བྱུང་"] [text="སྟེ"] 1 = [pos="S"] 271 | [pos="S"] [pos="U"] [pos="S" & text="ཁྱུང་"] 2 = [pos="U"] 272 | [text="རྟོགས་པ་"] [] [pos="U" & text="ས"] 3 = [pos="S"] 273 | [text="དོན་དུ་"] [] [pos="U" & text="ལགས་"] 3 = [pos="S"] 274 | [pos="U"] [] [text="བདག་མེད་མ་"] 1 = [pos="S"] 275 | [pos="S"] [pos="U"] [pos="U" & text="།"] [] [text="།"] 3 = [pos="B"] 276 | [pos="U"] [pos="U" & text="ཡང་"] [pos="S" & text="རྗེ་མར་པ"] 2 = [pos="U"] 277 | [pos="U" & text="།།"] [] [text="རྨི་ལམ་"] 1 = [pos="S"] 278 | [pos="U" & text="བར་ཆད་"] [] [text="།།"] 1 = [pos="S"] 279 | [text="བུ་ཆེན་"] [] [pos="U"] [pos="S"] [pos="U"] 3 = [pos="U"] 280 | [pos="S"] [pos="U" & text="གྱི་"] [] [text="།།"] 2 = [pos="U"] 281 | [pos="U" & text="ནམ་མཁ"] [pos="U"] [pos="S"] 1 = [pos="S"] 282 | [text="ནཱ་རོ"] [pos="U" & text="འི་"] [text="ལུང་བསྟན་"] 2 = [pos="S"] 283 | [pos="U" & text="ཡོད་པ་"] [text="དེ་"] 1 = [pos="S"] 284 | [text="རྗེ་"] [pos="U" & text="མི་"] 2 = [pos="B"] 285 | [pos="U"] [pos="U" & text="ཡང་"] [pos="S"] [text="བྱུང་"] 2 = [pos="U"] 286 | [text="གོམས་པ་"] [pos="U"] 2 = [pos="S"] 287 | [text="ལ་"] [pos="U" & text="ནི་"] 2 = [pos="S"] 288 | [pos="S" & text="བུ་"] [pos="U" & text="ལ་"] [pos="U"] 1 = [pos="S"] 289 | [text="བུ་"] [pos="U" & text="དང་"] 2 = [pos="S"] 290 | [text="།"] 
[text="མེས་སྟོན་"] [pos="U" & text="གྱིས་"] 3 = [pos="U"] 291 | [pos="S"] [pos="U"] [pos="U" & text="གྱིས་"] 3 = [pos="S"] 292 | [pos="U"] [] [text="མི་ངན་"] 1 = [pos="S"] 293 | [pos="S"] [pos="U" & text="ཡིན"] [pos="U"] 2 = [pos="S"] 294 | [text="ཕྱག་ལེན་"] [pos="U" & text="དང་"] [pos="U"] [pos="S"] 2 = [pos="U"] 295 | [pos="U" & text="ར་"] [text="ཚོགས་"] [text="གྲྭ་པ་"] 1 = [pos="S"] 296 | [pos="U" & text="རྨི་ལམ་"] [pos="S"] 1 = [pos="S"] 297 | [text="འཕོས་"] [pos="U"] 2 = [pos="S"] 298 | [text="ལ་"] [text="ཞུས་པ་"] [pos="U" & text="ལགས"] 3 = [pos="S"] 299 | [text="ཉི་མ་"] [pos="U"] [text="ཕྱེད་"] 2 = [pos="S"] 300 | [pos="U"] [pos="U"] [pos="S" & text="ཡོང་བ་"] 3 = [pos="U"] 301 | [pos="U"] [] [text="གདུང་"] 1 = [pos="S"] 302 | [text="ཕ་རོལ་"] [] [pos="U"] 3 = [pos="S"] 303 | [pos="U" & text="ཁྱེད་"] [text="གཉིས་"] 1 = [pos="S"] 304 | [text="ལ་"] [pos="U" & text="རྨི་ལམ་"] 2 = [pos="S"] 305 | [pos="U" & text="ཀྱིས"] [text="།"] 1 = [pos="S"] 306 | [text="མི་"] [text="འདུག་པ"] [pos="U" & text="ས"] 3 = [pos="S"] 307 | [pos="U" & text="ཤིང་"] [] [text="ལ་"] 1 = [pos="S"] 308 | [text="ནས་"] [pos="S"] [pos="U" & text="ས་"] [pos="S"] 3 = [pos="I"] 309 | [text="ཀྱི་"] [text="ཞལ་"] [pos="U" & text="ནས"] 3 = [pos="S"] 310 | [pos="U"] [pos="U" & text="རྒྱུན་"] [pos="S"] 2 = [pos="B"] 311 | [pos="U"] [] [text="ལ་མོ"] 1 = [pos="S"] 312 | [pos="U" & text="།"] [text="དབུ་"] 1 = [pos="S"] 313 | [text="ས"] [pos="U"] [text="བུ་"] 2 = [pos="S"] 314 | [pos="U" & text="འགྲོ་"] [text="ཀུན་"] 1 = [pos="B"] 315 | [text="ཤིང་"] [pos="U"] [text="ལ་"] 2 = [pos="S"] 316 | [pos="U" & text="དང་"] [pos="U"] [pos="S"] 1 = [pos="S"] 317 | [text="ཤི་བ"] [pos="U"] 2 = [pos="S"] 318 | [text="བླ་མ"] [text="འི་"] [pos="U" & text="ཞལ་"] 3 = [pos="S"] 319 | [text="ཐུགས་"] [pos="U" & text="ཉམས་"] 2 = [pos="I"] 320 | [pos="S"] [pos="U"] [pos="U" & text="གསུངས"] 3 = [pos="S"] 321 | [text="གོམས་པ་"] [] [pos="U"] 3 = [pos="S"] 322 | [text="ཇི་ལྟར་"] [] [pos="U"] 3 = [pos="S"] 323 | [text="ཕུག་རོན་"] [pos="U"] 2 = [pos="S"] 324 | [pos="U"] [text="བུ་ཆེན་"] 1 = [pos="S"] 325 | [text="ར་"] [pos="U"] [text="ལས"] 2 = [pos="S"] 326 | [pos="U" & text="གླིང་"] [text="དུ་"] [text="གཤེགས་པ་"] 1 = [pos="S"] 327 | [pos="U"] [pos="U" & text="དུ་"] [pos="S" & text="རྗེ་མར་པ"] 2 = [pos="U"] 328 | [pos="S" & text="བཀྲ་ཤིས་ཤོག"] [pos="U" & text="།"] [pos="S"] 2 = [pos="U"] 329 | [pos="S"] [pos="U" & text="ཡོད་"] [pos="U"] 2 = [pos="S"] 330 | [pos="U"] [] [text="ཆགདབུ་"] 1 = [pos="S"] 331 | [text="དང་"] [pos="S" & text="ཕྲིན་ལས་"] [pos="U"] [pos="U"] 2 = [pos="S"] 332 | [text="བུ་ཆེན་"] [pos="U" & text="རྣམས་"] 2 = [pos="S"] 333 | [pos="U" & text="ཀུན་"] [text="ལ་"] 1 = [pos="S"] 334 | [pos="U" & text="སྒྲ་"] [text="དང་"] [text="།"] 1 = [pos="S"] 335 | [pos="U"] [text="བུ"] 1 = [pos="S"] 336 | [text="ཐར་པ"] [pos="U"] [text="གླིང་"] 2 = [pos="S"] 337 | [text="ཀ་"] [pos="U"] [text="ཏུ་"] 2 = [pos="S"] 338 | [pos="S" & text="བུ་"] [pos="U" & text="གཅིག་"] [pos="U" & text="ཡོད་པ་"] 1 = [pos="S"] 339 | [pos="U" & text="ཡོད་པ་"] [text="ཡིན་"] 1 = [pos="S"] 340 | [pos="S"] [pos="U"] [pos="U" & text="ཚུལ་"] 3 = [pos="S"] 341 | [pos="U"] [] [text="སྲིད་པ་"] 1 = [pos="S"] 342 | [text="སུ་"] [text="གཤེགས"] [pos="U" & text="།།"] 3 = [pos="S"] 343 | [text="ལ་"] [pos="U" & text="ཞུས་པ་"] [text="ལགས"] 2 = [pos="S"] 344 | [text="ཁྱབ་པ་"] [] [pos="U" & text="།།"] 3 = [pos="S"] 345 | [pos="U" & text="མཆོད་པ་"] [text="ཕུལ་"] 1 = [pos="S"] 346 | [pos="U"] [text="རྒན་རྒོན་"] 1 = [pos="S"] 347 | [pos="U"] [pos="S"] [pos="U" & text="འོད་གསལ་"] 3 = [pos="S"] 348 | 
[text="བར་"] [pos="U" & text="དུ་"] [] [text="གི་"] 2 = [pos="S"] 349 | [pos="U" & text="གསུངས་"] [pos="S"] 1 = [pos="S"] 350 | [pos="U" & text="ཡང་"] [text="།"] 1 = [pos="S"] 351 | [text="ཁྱེད་"] [pos="U" & text="གཉིས་"] 2 = [pos="S"] 352 | [text="མཐུ་ཆེན་"] [] [pos="U"] 3 = [pos="S"] 353 | [pos="U" & text="གསུངས་པ་"] [text="ལྟར་"] 1 = [pos="S"] 354 | [text="།།"] [pos="S" & text="དུས་གསུམ་"] 2 = [pos="U"] 355 | [pos="U"] [text="རྒྱུགས་པ"] 1 = [pos="S"] 356 | [text="ལྟ་བུ་"] [pos="U" & text="ཞིག་"] [text="ཡོད་པ་"] 2 = [pos="S"] 357 | [text="ཡོད་པ་"] [] [pos="U" & text="དང་"] 3 = [pos="S"] 358 | [pos="S"] [pos="U" & text="གྱི་"] [] [text="ཡོད"] 2 = [pos="U"] 359 | [pos="U"] [] [text="འདུས་བྱས་"] 1 = [pos="S"] 360 | [pos="S"] [pos="U" & text="གཅིག་"] [pos="U"] 2 = [pos="S"] 361 | [pos="U" & text="ནཱ་རོ"] [] [text="ལུང་བསྟན་"] 1 = [pos="S"] 362 | [pos="U" & text="གདམས་ངག་"] [text="ལ"] 1 = [pos="S"] 363 | [pos="U" & text="ཅིག་"] [text="གསུངས"] [text="།"] 1 = [pos="S"] 364 | [pos="U" & text="།"] [text="སྔར་"] [text="གྱི་"] 1 = [pos="S"] 365 | [pos="U" & text="གདམས་ངག་"] [text="ཐེབས་པ་"] 1 = [pos="S"] 366 | [pos="U"] [] [text="བརྒྱུད་"] 1 = [pos="S"] 367 | [text="བྱུང་བ་"] [] [pos="U" & text="།།"] 3 = [pos="S"] 368 | [pos="U" & text="།།"] [text="དེ་ནས་"] [text="རྗེ་བཙུན་མི་"] 1 = [pos="U"] 369 | [pos="U"] [pos="U"] [pos="I" & text="ཅན་"] 3 = [pos="U"] 370 | [text="ར་"] [pos="U" & text="ཚོགས་"] 2 = [pos="S"] 371 | [pos="U"] [] [text=".*བ་"] 1 = [pos="S"] 372 | [text="མེས་སྟོན་"] [pos="U"] 2 = [pos="S"] -------------------------------------------------------------------------------- /tests/data/marpa/marpa_rules.txt: -------------------------------------------------------------------------------- 1 | [text="།།" & pos="PUNCT"] [text="འགྲོ་" & pos="VERB"] [text="ཀུན་" & pos="DET"] [text="བཤེས་གཉེན་" & pos="NOUN"] 2 + [] 2 | [text="ཟླ་ཕྱེད་" & pos="NOUN"] [text="བར་" & pos="NOUN"] [text="དུ་"] [text="ཕྱོགས་" & pos="NOUN"] 2 + [] 3 | [text="།" & pos="PUNCT"] [text="འགྲོ་" & pos="VERB"] [text="ཀུན་" & pos="DET"] [text="དང་བ་" & pos="NOUN"] 2 + [] 4 | [text="གྲོང་འཇུག་" & pos="NOUN"] [text="ཐུགས་" & pos="NOUN"] [text="ཉམས་" & pos="NOUN"] [text="སུ་" & pos="ADP"] 2 + [] 5 | [text="།།" & pos="PUNCT"] [text="སྐྱེ་" & pos="NOUN"] [text="མེད་" & pos="VERB"] [text="རང་སྒྲ་"] 2 + [] 6 | [text="།།" & pos="PUNCT"] [text="སྐྱེ་" & pos="NOUN"] [text="མེད་" & pos="VERB"] [text="རི་བོང་" & pos="NOUN"] 2 + [] 7 | [text="འཁོར་འདས་" & pos="NOUN"] [text="དབྱེར་" & pos="TEXT"] [text="མེད་" & pos="VERB"] [text="ག་" & pos="PRON"] 2 + [] 8 | [text="གྱི་"] [text="བར་" & pos="NOUN"] [text="དུ་"] [text="བསྟན་པ་" & pos="VERB"] 2 + [] 9 | [text="གཞན་གཅེས་"] [text="འགྲོ་" & pos="VERB"] [text="ཀུན་" & pos="DET"] [text="བརྩེ་བ" & pos="VERB"] 2 + [] 10 | [text="བརྩེ་བདུངས་"] [text="ནང་" & pos="NOUN"] [text="མཐུན་པ" & pos="VERB"] [text="ས" & pos="PART"] 2 + [] 11 | [text="།།" & pos="PUNCT"] [text="སྐྱེ་" & pos="NOUN"] [text="མེད་" & pos="VERB"] [text="སྤྲོས་བྲལ་" & pos="ADJ"] 2 + [] 12 | [text="རྟ" & pos="NOUN"] [text="།།" & pos="PUNCT"] [text="རོ་" & pos="NOUN"] [text="སྙོམས་" & pos="VERB"] [text="ལྕགས་" & pos="NOUN"] 3 + [] 13 | [text="།།" & pos="PUNCT"] [text="རྗེ་"] [text="མི་" & pos="PART"] [text="འབྲལ་"] 3 + [] 14 | [text="བྱང་ཆུབ་" & pos="NOUN"] [text="བར་" & pos="NOUN"] [text="དུ་"] [text="གཙུག་" & pos="NOUN"] 2 + [] 15 | [text="བདུན་" & pos="NUM"] [text="བར་" & pos="NOUN"] [text="དུ་"] [text="བརྟན་པ་" & pos="VERB"] 2 + [] 16 | [text="ཡབ་" & pos="NOUN"] [text="དབྱེར་" & pos="TEXT"] [text="མེད་" & pos="VERB"] 
[text="དགྱེས་རྡོར་"] 2 + [] 17 | [text="།།" & pos="PUNCT"] [text="སྐྱེ་" & pos="NOUN"] [text="མེད་" & pos="VERB"] [text="མཚོན་པ" & pos="VERB"] 2 + [] 18 | [text="བླ་མ་" & pos="NOUN"] [text="སྐུ་དྲིན་" & pos="NOUN"] [text="ཅན" & pos="PART"] [text="།།" & pos="PUNCT"] [text="དཔལ་" & pos="OTHER"] 2 + [] 19 | [text="གི་"] [text="བར་" & pos="NOUN"] [text="དུ་"] [text="ཆོས་གྲྭ་" & pos="NOUN"] 2 + [] 20 | [text="ས་" & pos="PART"] [text="འགྲོ་" & pos="VERB"] [text="ཀུན་" & pos="DET"] [text="སངས་རྒྱས་" & pos="NOUN"] 2 + [] 21 | [text="།།" & pos="PUNCT"] [text="དབྱེར་" & pos="TEXT"] [text="མེད་" & pos="VERB"] [text="རང་བཞིན་" & pos="NOUN"] 2 + [] 22 | [text="།།" & pos="PUNCT"] [text="དབྱེར་" & pos="TEXT"] [text="མེད་" & pos="VERB"] [text="དཔལ་ལྡན་" & pos="ADJ"] 2 + [] 23 | [text="སེམས་" & pos="NOUN"] [text="སྐྱེ་" & pos="NOUN"] [text="མེད་" & pos="VERB"] [text="དུ་"] 2 + [] 24 | [text="ཇི་བཞིན་" & pos="OTHER"] [text="ཐུགས་" & pos="NOUN"] [text="ཉམས་" & pos="NOUN"] [text="སུ་" & pos="ADP"] 2 + [] 25 | [text="།།" & pos="PUNCT"] [text="བསྲེ་བ་" & pos="VERB"] [text="རོ་" & pos="NOUN"] [text="སྙོམས་" & pos="VERB"] [text="གཉིས་" & pos="NUM"] 3 + [] 26 | [text="འདྲེན་མཛད་"] [text="སྐུ་དྲིན་" & pos="NOUN"] [text="ཅན" & pos="PART"] [text="།།" & pos="PUNCT"] [text="གྲུབ་ཐོབ་" & pos="NOUN"] 2 + [] 27 | [text="གི་"] [text="ཐུགས་" & pos="NOUN"] [text="ཉམས་" & pos="NOUN"] [text="ལ་" & pos="ADP"] 2 + [] 28 | [text="ནི་" & pos="PART"] [text="ཐུགས་" & pos="NOUN"] [text="ཉམས་" & pos="NOUN"] [text="ལ་" & pos="ADP"] 2 + [] 29 | [text="།" & pos="PUNCT"] [text="དེ་དུས་" & pos="PRON"] [text="ཀྱང་"] [text="རྔོག་པ" & pos="PROPN"] [text="ས་" & pos="PART"] 2-1 : [] [] 30 | [text="ས་" & pos="PART"] [text="འགྲོ་" & pos="VERB"] [text="ཀུན་" & pos="DET"] [text="ཚིམ་པ་" & pos="VERB"] 2 + [] 31 | [text="གྱི་"] [text="བར་" & pos="NOUN"] [text="དུ་"] [text="འཕགས་པ་" & pos="VERB"] 2 + [] 32 | [text="འི་" & pos="PART"] [text="བར་" & pos="NOUN"] [text="དུ་"] [text="ཡང་"] 2 + [] 33 | [text="མོས་གུས་" & pos="NOUN"] [text="ནང་" & pos="NOUN"] [text="མཐུན་པ" & pos="VERB"] [text="ས་" & pos="PART"] 2 + [] 34 | [text="གདམས་དག་"] [text="ཐུགས་" & pos="NOUN"] [text="ཉམས་" & pos="NOUN"] [text="སུ་" & pos="ADP"] 2 + [] 35 | [text="འི་" & pos="PART"] [text="ཐུགས་" & pos="NOUN"] [text="ཉམས་" & pos="NOUN"] [text="དྲག་པོ་" & pos="ADJ"] 2 + [] 36 | [text="རང་བཞིན་" & pos="NOUN"] [text="སྐྱེ་" & pos="NOUN"] [text="མེད་" & pos="VERB"] [text="ཀི་" & pos="ADP"] 2 + [] 37 | [text="བདུན་" & pos="NUM"] [text="བར་" & pos="NOUN"] [text="དུ་"] [text="སྣང་བ་" & pos="VERB"] 2 + [] 38 | [text="ཀྱི་"] [text="བར་" & pos="NOUN"] [text="དུ་"] [text="གནས་པ" & pos="VERB"] 2 + [] 39 | [text="ནོ་"] [text="སྐུ་དྲིན་" & pos="NOUN"] [text="ཅན" & pos="PART"] [text="།།" & pos="PUNCT"] [text="ཕ་" & pos="NOUN"] 2 + [] 40 | [text="སོགས་" & pos="DET"] [text="འགྲོ་" & pos="VERB"] [text="ཀུན་" & pos="DET"] [text="སྨིན་" & pos="VERB"] 2 + [] 41 | [text="རྫོགས་" & pos="VERB"] [text="བར་" & pos="NOUN"] [text="དུ་"] [text="གཅིག་བརྒྱུད་"] 2 + [] 42 | [text="ལ་" & pos="ADP"] [text="འབྱུང་བ་" & pos="VERB"] [text="རོ་" & pos="NOUN"] [text="སྙོམས་" & pos="VERB"] [text="ཀྱི་"] 3 + [] 43 | [text="རྩེ་གདུངས་"] [text="ནང་" & pos="NOUN"] [text="མཐུན་པ" & pos="VERB"] [text="ས་" & pos="PART"] 2 + [] 44 | [text="།" & pos="PUNCT"] [text="བར་" & pos="NOUN"] [text="དུ་"] [text="ལག་" & pos="NOUN"] 2 + [] -------------------------------------------------------------------------------- /tests/data/monlam2020/multi_pos_multi_sense.csv: 
-------------------------------------------------------------------------------- 1 | edited,favorites,_id,word,result 2 | -,-,83307,ལ་,མིང་ཚིག 1. བོད་ཡིག་གི་གསལ་བྱེད་ལས་སྡེ་བ་བདུན་པའི་ཡི་གེ་གཉིས་པ་སྟེ། འདིའི་ང་རོ་འདོན་ཚུལ་ནི། སྐྱེ་གནས་སོ་དང༌། བྱེད་པ་ལྕེ་རྩེ། སྒྲ་འབྱིན་ཚུལ་ཆེས་ཤིན་ཏུ་ལྷོད་པ། མིང་གཞིའི་རྟགས་ཀྱི་དབྱེ་བའི་མོ་གཤམ་གྱི་ཡི་གེ་ཞིག 2. རྗེས་འཇུག་བཅུའི་དགུ་པ་སྟེ། མིང་གཞིའི་ཡི་གེ་ཀུན་གྱི་རྗེས་སུ་འཇུག དཔེར་ན། ཀལ། ཁལ། གལ། ངལ། ཅལ། ཆལ། མཇལ། ཉལ། ཏལ། ཐལ། དལ། ནལ། དཔལ། ཕལ། བལ། མལ། བཙལ། ཚལ། འཛོལ། ཝལ། ཞལ། ཟལ། འོལ། ཡལ། རལ། ཤལ། སལ། ཧལ། ཨལ་བཅས་པ་ལྟ་བུ། 3. ༡རྒྱུན་སྤྱོད། རི་འདབས་ནས་ཡར་རི་རྩེ་བར་གྱི་གྱེན་ངོས་སམ། ས་བབ་དམའ་ས་ནས་མཐོ་ས་ལ་ཡར་གྱེན་དུ་གསེག་ཡོད་པའི་ཟུར་ངོས་ཀྱི་མིང། དཔེར་ན། ལ་རེ་ཐུར་རེ་མ་བརྒྱབ་ན། བདེ་མོའི་ཐང་ལ་ཐོན་རྒྱུ་མེད་ལྟ་བུ། ༡ལྡོག་ཚིག ཐུར། 4. ༡ཆོས་ལུགས། གཅོད་བྱེད་ཀྱི་ཡི་གེ་སྟེ། སྐུ་ཐམས་ཅད་ཀྱི་སྣང་བ་སྟོན་པ་དབང་རྫོགས་པ་རང་བྱུང་ཆེན་པོའི་རྒྱུད་ལས། ལ་ནི་གཅོད་བྱེད་ཀྱི་ཡི་གེར་ཤེས་པར་བྱའོ། །ཞེས་རྒྱུད་དེའི་སྒྲའི་སྡོམ་བྱང་བསྟན་པའི་སྐབས་སུ་གསལ། བྱེད་ཚིག 1. ༡རྒྱུན་སྤྱོད། ལས་སུ་བྱ་བ་ལ་འཇུག་པ། ལ་དོན་གྱི་ལ་ཡིག་སྦྱར་བའི་གཞི་དེ་ལ། བྱ་བ་གཞན་ཞིག་བྱས་པའམ་བྱེད་བཞིན་པ་དང་། བྱེད་འགྱུར་གང་རུང་སྟོན་པ་ལ་འཇུག་པ་སྟེ། སྦྱོར་ཚུལ་ནི་རྗེས་འཇུག་ཐམས་ཅད་ཀྱི་རྗེས་སུ་ཁྱད་མེད་དུ་འཇུག་གོ། དཔེར་ན། ཤར་ཕྱོགས་ལ་འགྲོ། རང་ཁྱིམ་ལ་བསྡད་ལྟ་བུ། 2. ༡རྒྱུན་སྤྱོད། དགོས་ཆེད་ལ་འཇུག་པ། ལ་ཡིག་སྦྱར་བའི་ཚེ་ན། བྱེད་པ་པོ་ཅི་རིགས་ཤིག་གིས་རང་གི་དགོས་པའམ་བྱ་བའི་ཡུལ་གྱི་དགོས་པ་ཞིག་བསྒྲུབ་པའི་ཆེད་དུ་བྱ་བ་ཞིག་བྱེད་པར་སྟོན་པ་སྟེ། སྦྱོར་ཚུལ་ནི་རྗེས་འཇུག་ཐམས་ཅད་ཀྱི་རྗེས་སུ་ཁྱད་མེད་དུ་འཇུག་གོ། དཔེར་ན། ནད་པ་ལ་སྨན་བྱིན། མེ་ཏོག་ལ་ཆུ་འདྲེན་ལྟ་བུ། 3. ༡རྒྱུན་སྤྱོད། རྟེན་གནས་ལ་འཇུག་པ། ལ་ཡིག་སྦྱར་ཚེ་དངོས་པོ་གང་ཞིག་གང་དུ་ཡོད་པར་སྟོན་པར་བྱེད་པ་སྟེ། སྦྱོར་ཚུལ་ནི་རྗེས་འཇུག་ཐམས་ཅད་ཀྱི་རྗེས་སུ་ཁྱད་མེད་དུ་འཇུག་གོ། དཔེར་ན། ཐབ་ལ་མེ་ཡོད། ཁང་ཐོག་ལ་དུད་ཁུང་ཡོད་ལྟ་བུ། 4. ༡རྒྱུན་སྤྱོད། དེ་ཉིད་ལ་འཇུག་པ། ལ་དོན་གྱི་ལ་ཡིག་སྦྱར་ས་དེའི་ངོ་བོར་གྲུབ་པའམ་དེ་ཉིད་དུ་ངེས་པར་སྟོན་པ་དོན་གྱི་ཆ་ནས་ཡུལ་དང་བྱ་བ་སོ་སོར་འབྱེད་དུ་མེད་པ་ཞིག་སྟོན་པ་སྟེ། སྦྱོར་ཚུལ་ནི་རྗེས་འཇུག་ཐམས་ཅད་ཀྱི་རྗེས་སུ་ཁྱད་མེད་དུ་འཇུག་གོ། དཔེར་ན། ཇེ་གསལ་ལ་བཏང། བདེན་པ་ལ་གནས་ལྟ་བུ། 5. ༡རྒྱུན་སྤྱོད། ཚེ་སྐབས་ལ་འཇུག་པ། དུས་དང་སྐབས་སྟོན་པའི་མིང་མཐར་ལ་ཡིག་སྦྱར་ནས། བྱ་བ་གང་ཞིག་གམ་འགྱུར་ལྡོག་གང་ཞིག་དུས་སྐབས་ག་འདྲ་ཞིག་ལ་བྱེད་པའམ་འབྱུང་བ་སྟོན་པ་སྟེ། སྦྱོར་ཚུལ་ནི་རྗེས་འཇུག་ཐམས་ཅད་ཀྱི་རྗེས་སུ་ཁྱད་མེད་དུ་འཇུག་གོ། དཔེར་ན། ཚེས་བཅོ་ལྔ་ལ་འཁྲུངས། ནམ་གུང་ཙམ་ལ་འབྱོར་ལྟ་བུ། 6. ༡རྒྱུན་སྤྱོད། ཚིག་སྔ་ཕྱི་གཉིས་ཀྱི་མཚམས་སྦྱོར་དུ་འཇུག་པ་སྟེ། དཔེར་ན། རྩམ་པ་ལྡོད་ལ་མིད། གླུ་ལོངས་ལ་གར་རྩེན་ལྟ་བུ། 7. ༡རྒྱུན་སྤྱོད། དོན་རྐྱེན་གསལ་བྱེད་དུ་འཇུག་པ་ནི། དཔེར་ན། རྒྱལ་པོའི་ཆད་པ་ལ་བྲོས་ལྟ་བུ། 8. ༡རྒྱུན་སྤྱོད། བྱེད་རྒྱུ་གསལ་བྱེད་དུ་འཇུག་པ། དཔེར་ན། དངུལ་དཀར་སྟོང་ཕྲག་གཅིག་ལ་བྱམས་པའི་སྐུ་བཞེངས་ལྟ་བུ་སྟེ། དངུལ་དཀར་སྟོང་ཕྲག་གཅིག་གིས་བྱམས་པའི་སྐུ་བཞེངས་ཞེས་པ་དང་དོན་གཅིག་གོ། 9. 
༡རྒྱུན་སྤྱོད། བརྗོད་གཞི་ངོས་འཛིན་པའི་དོན་དུ་འཇུག་པ་ནི། དཔེར་ན། དགའ་བའི་ལུས་ལ་རང་དབང་མེད། སྐྱིད་པའི་སེམས་ལ་དྲན་འཛིན་མེད་ལྟ་བུ། གྲོགས་ཚིག བྱ་ཚིག་མ་འོངས་པའི་མཐའ་ལ་སྦྱར་ན་བྱ་བ་དེ་བྱེད་པ་ལ་ཧ་ཅང་ཉེ་བའི་དོན་སྟོན་ཏེ། དཔེར་ན། ང་གནས་གཞན་དུ་འགྲོ་ལ་ཡོད་ལྟ་བུ། 3 | -------------------------------------------------------------------------------- /tests/data/monlam2020/multi_pos_multi_sense_expected.csv: -------------------------------------------------------------------------------- 1 | ID,Form,Lemma,MonPOS,MonFeature,MonTag,POS,Feature,Morph,SenseTag,Definition,Example 2 | 1,ལ་,ལ་1,མིང་ཚིག,,,,,,བོད་ཡིག,བོད་ཡིག་གི་གསལ་བྱེད་ལས་སྡེ་བ་བདུན་པའི་ཡི་གེ་གཉིས་པ་སྟེ། འདིའི་ང་རོ་འདོན་ཚུལ་ནི། སྐྱེ་གནས་སོ་དང༌། བྱེད་པ་ལྕེ་རྩེ། སྒྲ་འབྱིན་ཚུལ་ཆེས་ཤིན་ཏུ་ལྷོད་པ། མིང་གཞིའི་རྟགས་ཀྱི་དབྱེ་བའི་མོ་གཤམ་གྱི་ཡི་གེ་ཞིག, 3 | 2,ལ་,ལ་1,མིང་ཚིག,,,,,,རྗེས་འཇུག,རྗེས་འཇུག་བཅུའི་དགུ་པ་སྟེ། མིང་གཞིའི་ཡི་གེ་ཀུན་གྱི་རྗེས་སུ་འཇུག ,ཀལ། ཁལ། གལ། ངལ། ཅལ། ཆལ། མཇལ། ཉལ། ཏལ། ཐལ། དལ། ནལ། དཔལ། ཕལ། བལ། མལ། བཙལ། ཚལ། འཛོལ། ཝལ། ཞལ། ཟལ། འོལ། ཡལ། རལ། ཤལ། སལ། ཧལ། ཨལ་བཅས་པ་ལྟ་བུ། 4 | 3,ལ་,ལ་1,མིང་ཚིག,,"༡རྒྱུན་སྤྱོད།, ༡ལྡོག་ཚིག:ཐུར།",,,,རི་འདབས,རི་འདབས་ནས་ཡར་རི་རྩེ་བར་གྱི་གྱེན་ངོས་སམ། ས་བབ་དམའ་ས་ནས་མཐོ་ས་ལ་ཡར་གྱེན་དུ་གསེག་ཡོད་པའི་ཟུར་ངོས་ཀྱི་མིང། ,ལ་རེ་ཐུར་རེ་མ་བརྒྱབ་ན། བདེ་མོའི་ཐང་ལ་ཐོན་རྒྱུ་མེད་ལྟ་བུ། 5 | 4,ལ་,ལ་1,མིང་ཚིག,,༡ཆོས་ལུགས།,,,,གཅོད་བྱེད,གཅོད་བྱེད་ཀྱི་ཡི་གེ་སྟེ། སྐུ་ཐམས་ཅད་ཀྱི་སྣང་བ་སྟོན་པ་དབང་རྫོགས་པ་རང་བྱུང་ཆེན་པོའི་རྒྱུད་ལས། ལ་ནི་གཅོད་བྱེད་ཀྱི་ཡི་གེར་ཤེས་པར་བྱའོ། །ཞེས་རྒྱུད་དེའི་སྒྲའི་སྡོམ་བྱང་བསྟན་པའི་སྐབས་སུ་གསལ།, 6 | 5,ལ་,ལ་2,བྱེད་ཚིག,,༡རྒྱུན་སྤྱོད།,,,,,, 7 | 6,ལ་,ལ་2,བྱེད་ཚིག,,༡རྒྱུན་སྤྱོད།,,,,,, 8 | 7,ལ་,ལ་2,བྱེད་ཚིག,,༡རྒྱུན་སྤྱོད།,,,,,, 9 | 8,ལ་,ལ་2,བྱེད་ཚིག,,༡རྒྱུན་སྤྱོད།,,,,,, 10 | 9,ལ་,ལ་2,བྱེད་ཚིག,,༡རྒྱུན་སྤྱོད།,,,,,, 11 | 10,ལ་,ལ་2,བྱེད་ཚིག,,༡རྒྱུན་སྤྱོད།,,,,,, 12 | 11,ལ་,ལ་2,བྱེད་ཚིག,,༡རྒྱུན་སྤྱོད།,,,,,, 13 | 12,ལ་,ལ་2,བྱེད་ཚིག,,༡རྒྱུན་སྤྱོད།,,,,,, 14 | 13,ལ་,ལ་2,བྱེད་ཚིག,,༡རྒྱུན་སྤྱོད།,,,,,, 15 | 14,ལ་,ལ་2,བྱེད་ཚིག,,༡རྒྱུན་སྤྱོད།,,,,,, 16 | 15,ལ་,ལ་3,གྲོགས་ཚིག,,༡རྒྱུན་སྤྱོད།,,,,,, 17 | -------------------------------------------------------------------------------- /tests/data/monlam2020/one_pos_multi_sense.csv: -------------------------------------------------------------------------------- 1 | edited,favorites,_id,word,result 2 | -,-,2,ཀ་ཀ་, མིང་ཚིག 1. བྱ་སྐྱ་ཀ 2. བྱ་སྐྱ་ཀའི་སྐད། 3. ལུག་གི་ཐེ་གེའམ་ཨ་ཅུག 4. 
༡ཡུལ་སྐད།  བྱིས་པ་ཆུང་ངུའི་གྱོན་པ་ལ་ཀ་ཀ་ཟེར། དཔེ་རིས་ལ་གཟིགས། 3 | -------------------------------------------------------------------------------- /tests/data/monlam2020/one_pos_multi_sense_expected.csv: -------------------------------------------------------------------------------- 1 | ID,Form,Lemma,MonPOS,MonFeature,MonTag,POS,Feature,Morph,SenseTag,Definition,Example 2 | 1,ཀ་ཀ་,ཀ་ཀ་1, མིང་ཚིག,  ,,,,,བྱ་,བྱ་སྐྱ་ཀ, 3 | ,,ཀ་ཀ་2,,,,,,,བྱ་,བྱ་སྐྱ་ཀའི་སྐད།, 4 | ,,ཀ་ཀ་3,,,,,,,ལུག་,ལུག་གི་ཐེ་གེའམ་ཨ་ཅུག , 5 | ,,ཀ་ཀ་4,,,༡ཡུལ་སྐད།,,,,བྱིས་པ་,བྱིས་པ་ཆུང་ངུའི་གྱོན་པ་ལ་ཀ་ཀ་ཟེར། དཔེ་རིས་ལ་གཟིགས།, 6 | -------------------------------------------------------------------------------- /tests/data/monlam2020/one_pos_one_sense.csv: -------------------------------------------------------------------------------- 1 | edited,favorites,_id,word,result 2 | -,-,3,ཀ་ཀ་ནཱི་ལ་,མིང་ཚིག ༡རྒྱ་གར། ཨིནྡྲ་ནཱི་ལ་དང་ནཱི་ལ་གཉིས་ལས་མདོག་ཅུང་ཟད་ནག་པའི་རྡོ་སྨན་ཁ་དོག་ཅུང་ཟད་ནག་པ། ཉི་མ་ཤར་བའི་ཕྱོགས་སུ་རང་གི་འོད་འབྱུང་ལ་ཆེ་ཆུང་མ་ངེས་པ་ཁ་དོག་སྔོ་ཞིང་ཤིན་ཏུ་དྭངས་པ་དང་དབྱིབས་སྐེད་རྔ་ཅན་ནམ། ལེབ་མོ་ངོས་མང་སོགས་སྣ་ཚོགས། ཆེ་ཆུང་མ་ངེས་ཤིང་། སྲ་ལ་མཁྲེགས་པ་ཞིག འདིས་ནད་མ་ལུས་སེལ་བར་བཤད། 3 | -------------------------------------------------------------------------------- /tests/data/monlam2020/one_pos_one_sense_expected.csv: -------------------------------------------------------------------------------- 1 | ID,Form,Lemma,MonPOS,MonFeature,MonTag,POS,Feature,Morph,SenseTag,Definition,Example 2 | 1,ཀ་ཀ་ནཱི་ལ་,ཀ་ཀ་ནཱི་ལ་1,མིང་ཚིག,,༡རྒྱ་གར།,,,,ཨིནྡྲ་ནཱི་ལ་,ཨིནྡྲ་ནཱི་ལ་དང་ནཱི་ལ་གཉིས་ལས་མདོག་ཅུང་ཟད་ནག་པའི་རྡོ་སྨན་ཁ་དོག་ཅུང་ཟད་ནག་པ། ཉི་མ་ཤར་བའི་ཕྱོགས་སུ་རང་གི་འོད་འབྱུང་ལ་ཆེ་ཆུང་མ་ངེས་པ་ཁ་དོག་སྔོ་ཞིང་ཤིན་ཏུ་དྭངས་པ་དང་དབྱིབས་སྐེད་རྔ་ཅན་ནམ། ལེབ་མོ་ངོས་མང་སོགས་སྣ་ཚོགས། ཆེ་ཆུང་མ་ངེས་ཤིང་། སྲ་ལ་མཁྲེགས་པ་ཞིག འདིས་ནད་མ་ལུས་སེལ་བར་བཤད།i, 3 | -------------------------------------------------------------------------------- /tests/data/monlam2020/verbs.csv: -------------------------------------------------------------------------------- 1 | edited,favorites,_id,word,result 2 | -,-,83305,རློམས་, བྱ་ཚིག ༡རྒྱུན་སྤྱོད། ༡བྱ་བྱེད་ཐ་དད་པ། རློམ་གྱི་སྐུལ་ཚིག ༢མ་འོངས་པ། བརླམ། ༢ད་ལྟ་བ། རློམ། ༢འདས་པ། བརླམས། ༢སྐུལ་ཚིག རློམས། 3 | -,-,83302,རློམ་, བྱ་ཚིག 1. ༡རྒྱུན་སྤྱོད། ༡བྱ་བྱེད་ཐ་མི་དད་པ། ༡དུས་གསུམ་ཐོར་བུ་པ། གཟུགས་མི་འགྱུར་བ། དོན་དང་མི་མཐུན་པའི་རང་ཉིད་བཟང་པོའམ་ལེགས་པ་ཡིན་པར་སེམས་པའི་དོན། དཔེར་ན། མཁས་པར་རློམ་ཡང་བླུན་རྟགས་མཐོང་ལྟ་བུ། 2. ༡བྱ་བྱེད་ཐ་དད་པ། ༡དུས་གསུམ་ཐོར་བུ་པ། སྐུལ་ཚིག་ཙམ་འགྱུར་བ། ཕྱི་རྐྱེན་གྱིས་བློ་སེམས་ངན་པར་བསྒྱུར་བའམ་དབང་པོ་རྨོངས་པར་བྱས་པའི་དོན། ༢མ་འོངས་པ། བརླམ། ༢ད་ལྟ་བ། རློམ། ༢འདས་པ། བརླམས། ༢སྐུལ་ཚིག རློམས། དཔེར་ན། ༡མ་འོངས་པ། གཟབ་ནན་མ་བྱས་ན་གདོན་གྱིས་བརླམ་ཉེན་ཆེ། ༡ད་ལྟ་བ། རྒྱུད་རློམ་བྱེད་ཀྱི་ངན་སྔགས། ༡འདས་པ། གདོན་གྱིས་བརླམས་ནས་གཏམ་འཆལ་བཤད། ༡སྐུལ་ཚིག དགྲ་བོའི་ཤེས་པ་རློམས་ཤིག་ལྟ་བུ། མིང་ཚིག ཡིད་ཆེས་པའི་དོན་ཏེ། དཔེར་ན། ངས་ནམ་རྒྱུན་གྲོགས་པོ་དག་གི་གཏམ་རློམ་གྱིན་ཡོད་ལྟ་བུ། 4 | -------------------------------------------------------------------------------- /tests/data/monlam2020/verbs_expected.csv: -------------------------------------------------------------------------------- 1 | edited,favorites,_id,word,result 2 | -,-,83305,རློམས་, བྱ་ཚིག ༡རྒྱུན་སྤྱོད། ༡བྱ་བྱེད་ཐ་དད་པ། རློམ་གྱི་སྐུལ་ཚིག ༢མ་འོངས་པ། བརླམ། ༢ད་ལྟ་བ། རློམ། ༢འདས་པ། བརླམས། ༢སྐུལ་ཚིག རློམས། 3 | -,-,83302,རློམ་, བྱ་ཚིག 1. ༡རྒྱུན་སྤྱོད། ༡བྱ་བྱེད་ཐ་མི་དད་པ། ༡དུས་གསུམ་ཐོར་བུ་པ། གཟུགས་མི་འགྱུར་བ། དོན་དང་མི་མཐུན་པའི་རང་ཉིད་བཟང་པོའམ་ལེགས་པ་ཡིན་པར་སེམས་པའི་དོན། དཔེར་ན། མཁས་པར་རློམ་ཡང་བླུན་རྟགས་མཐོང་ལྟ་བུ། 2. 
༡བྱ་བྱེད་ཐ་དད་པ། ༡དུས་གསུམ་ཐོར་བུ་པ། སྐུལ་ཚིག་ཙམ་འགྱུར་བ། ཕྱི་རྐྱེན་གྱིས་བློ་སེམས་ངན་པར་བསྒྱུར་བའམ་དབང་པོ་རྨོངས་པར་བྱས་པའི་དོན། ༢མ་འོངས་པ། བརླམ། ༢ད་ལྟ་བ། རློམ། ༢འདས་པ། བརླམས། ༢སྐུལ་ཚིག རློམས། དཔེར་ན། ༡མ་འོངས་པ། གཟབ་ནན་མ་བྱས་ན་གདོན་གྱིས་བརླམ་ཉེན་ཆེ། ༡ད་ལྟ་བ། རྒྱུད་རློམ་བྱེད་ཀྱི་ངན་སྔགས། ༡འདས་པ། གདོན་གྱིས་བརླམས་ནས་གཏམ་འཆལ་བཤད། ༡སྐུལ་ཚིག དགྲ་བོའི་ཤེས་པ་རློམས་ཤིག་ལྟ་བུ། མིང་ཚིག ཡིད་ཆེས་པའི་དོན་ཏེ། དཔེར་ན། ངས་ནམ་རྒྱུན་གྲོགས་པོ་དག་གི་གཏམ་རློམ་གྱིན་ཡོད་ལྟ་བུ། 4 | -------------------------------------------------------------------------------- /tests/hfr2cql/UDPOS-bo.txt: -------------------------------------------------------------------------------- 1 | UD-POS ཚིག་གཤིས། བྱང་བུ། 2 | ADJ རྒྱན་ཚིག རྒྱན 3 | ADP སྦྱོར་ཚིག སྦྱོར 4 | ADV བསྣན་ཚིག བསྣན 5 | AUX བྱ་གྲོགས། གྲོགས 6 | CCONJ ལྟོས་མེད་སྦྲེལ་ཚིག སྦྲེལ 7 | DET ངེས་གཟུང་། ངེས 8 | INTJ འབོད་ཚིག འབོད 9 | NOUN མིང་ཚིག མིང 10 | NUM གྲངས་ཚིག གྲངས 11 | PRON མིང་ཚབ། ཚབ 12 | PROPN ཁྱད་མིང་། ཁྱད 13 | PUNCT ཚེག་ཤད། ཚེག 14 | SCONJ ལྟོས་བཅས་སྦྲེལ་ཚིག ལྟོས 15 | VERB བྱ་ཚིག བྱ 16 | PART རོགས་ཚིག རོགས -------------------------------------------------------------------------------- /tests/hfr2cql/adjustments.txt: -------------------------------------------------------------------------------- 1 | # Syntax for the possible adjustment 2 | # =================================== 3 | # - CQL rules: "" can be used without specifying that there is "text_cleaned=" 4 | # - Index format: either "" or "-" 5 | # - Adjustment format: 6 | # - "+" for merge 7 | # - ":" for split (default: syllable mode) 8 | # - "::" for split in character mode 9 | # - "=" for replace 10 | # - Constraint: "-" is only allowed if adjustment is ":" or "::" 11 | 12 | ["ལ་ལ་"] ["ལ་ལ་"] 1 = [pos="PART"] 13 | ["ལ་ལ་"] ["ལ་ལ་"] 2 = [pos="PART"] 14 | ["ལ་ལ་"] ["ལ་ལ་"] 1-2 :: [pos="PART"] [pos="PART"] 15 | ["ལ་"] ["ལ་"] ["ལ་ལ་"] 3-2 :: [pos="PART"] [pos="PART"] 16 | ["ལ་"] ["ལ་"] ["ལ་"] ["ལ་"] 2 + [pos="PART"] -------------------------------------------------------------------------------- /tests/hfr2cql/cql/_cql2hfr_cql.txt: -------------------------------------------------------------------------------- 1 | #matchcql idx op replacecql 2 | 3 | ༺གཤིས=ངཟ༻ ༺གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཡཚ༻ 4 | ༺གཤིས=ངཟ༻ ༺"སྟེ་" ༈ གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཏཅ༻ 5 | ༺གཤིས=ངཟ༻ ༺"ཅིང་" ༈ གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཏཅ༻ 6 | ༺གཤིས=ངཟ༻ ༺"ཞིང་" ༈ གཤིས=ཏཅ༻ 2 = ༺གཤིས=མཚ༻ 7 | ༺གཤིས=ངཟ༻ ༺"ཤིང་" ༈ གཤིས=ཏཅ༻ 2 = ༺གཤིས=མཚ༻ 8 | ༺གཤིས=མཚ༻ ༺གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཡཚ༻ 9 | ༺གཤིས=མཚ༻ ༺"སྟེ་" ༈ གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཏཅ༻ 10 | ༺གཤིས=མཚ༻ ༺"ཏེ་" ༈ གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཏཅ༻ 11 | ༺གཤིས=མཚ༻ ༺"ཞིང་" ༈ གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཏཅ༻ 12 | ༺གཤིས=མཚ༻ ༺"ཤིང་" ༈ གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཏཅ༻ 13 | ༺གཤིས=མཚ༻ ༺"ཤིང་" ༈ གཤིས=ཏཅ༻ ༺༻ ༺གཤིས=ཡཚ༻ 2 = ༺གཤིས=མཚ༻ 14 | ༺གཤིས=ཚབ༻ ༺"ནས་" ༈ གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཡཚ༻ 15 | ༺"སུ་"༻ ༺གཤིས=ཚབ༻ ༺"ནས་" ༈ གཤིས=ཏཅ༻ 3 = ༺གཤིས=ཏཅ༻ 16 | ༺གཤིས=ངཚ༻ ༺གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཡཚ༻ 17 | ༺གཤིས=ངཚ༻ ༺"སྟེ་" ༈ གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཏཅ༻ 18 | ༺གཤིས=ཡཚ༻ ༺གཤིས=ཏཅ༻ 2 = ༺གཤིས=མཚ༻ 19 | ༺གཤིས=མཚ༻ ༺གཤིས=ཡཚ༻ ༺"ནས་" ༈ གཤིས=ཏཅ༻ 3 = ༺གཤིས=ཡཚ༻ 20 | ༺གཤིས=ཡཚ༻ ༺"ཏེ་" ༈ གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཏཅ༻ -------------------------------------------------------------------------------- /tests/hfr2cql/cql/cql.txt: -------------------------------------------------------------------------------- 1 | #matchcql idx op replacecql 2 | 3 | [pos="DET"] [pos="SCONJ"] 2 = [pos="ADP"] 4 | [pos="DET"] ["སྟེ་" & pos="SCONJ"] 2 = [pos="SCONJ"] 5 | [pos="DET"] ["ཅིང་" & pos="SCONJ"] 2 = [pos="SCONJ"] 6 | [pos="DET"] ["ཞིང་" & pos="SCONJ"] 2 = [pos="NOUN"] 7 | [pos="DET"] ["ཤིང་" & pos="SCONJ"] 2 = [pos="NOUN"] 8 | [pos="NOUN"] [pos="SCONJ"] 2 = [pos="ADP"] 
9 | [pos="NOUN"] ["སྟེ་" & pos="SCONJ"] 2 = [pos="SCONJ"] 10 | [pos="NOUN"] ["ཏེ་" & pos="SCONJ"] 2 = [pos="SCONJ"] 11 | [pos="NOUN"] ["ཞིང་" & pos="SCONJ"] 2 = [pos="SCONJ"] 12 | [pos="NOUN"] ["ཤིང་" & pos="SCONJ"] 2 = [pos="SCONJ"] 13 | [pos="NOUN"] ["ཤིང་" & pos="SCONJ"] [] [pos="ADP"] 2 = [pos="NOUN"] 14 | [pos="PRON"] ["ནས་" & pos="SCONJ"] 2 = [pos="ADP"] 15 | ["སུ་"] [pos="PRON"] ["ནས་" & pos="SCONJ"] 3 = [pos="SCONJ"] 16 | [pos="NUM"] [pos="SCONJ"] 2 = [pos="ADP"] 17 | [pos="NUM"] ["སྟེ་" & pos="SCONJ"] 2 = [pos="SCONJ"] 18 | [pos="ADP"] [pos="SCONJ"] 2 = [pos="NOUN"] 19 | [pos="NOUN"] [pos="ADP"] ["ནས་" & pos="SCONJ"] 3 = [pos="ADP"] 20 | [pos="ADP"] ["ཏེ་" & pos="SCONJ"] 2 = [pos="SCONJ"] -------------------------------------------------------------------------------- /tests/hfr2cql/cql2hfr.txt: -------------------------------------------------------------------------------- 1 | "ADJ" - རཚ 2 | "ADP" - ཡཚ 3 | "ADV" - ནཚ 4 | "AUX" - བག 5 | "CCONJ" - ཏམ 6 | "DET" - ངཟ 7 | "INTJ" - འཚ 8 | "NOUN" - མཚ 9 | "NUM" - ངཚ 10 | "PRON" - ཚབ 11 | "PROPN" - ཁམ 12 | "PUNCT" - ཚཤ 13 | "SCONJ" - ཏཅ 14 | "VERB" - བཚ 15 | "PART" - གཚ 16 | pos= - གཤིས= 17 | lemma= - མ= 18 | sense= - དོན= 19 | & - ༈ 20 | \[ - ༺ 21 | \] - ༻ 22 | -------------------------------------------------------------------------------- /tests/hfr2cql/cql_result.txt: -------------------------------------------------------------------------------- 1 | #matchcql idx op replacecql 2 | 3 | [pos="DET"] [pos="SCONJ"] 2 = [pos="ADP"] 4 | [pos="DET"] ["སྟེ་" & pos="SCONJ"] 2 = [pos="SCONJ"] 5 | [pos="DET"] ["ཅིང་" & pos="SCONJ"] 2 = [pos="SCONJ"] 6 | [pos="DET"] ["ཞིང་" & pos="SCONJ"] 2 = [pos="NOUN"] 7 | [pos="DET"] ["ཤིང་" & pos="SCONJ"] 2 = [pos="NOUN"] 8 | [pos="NOUN"] [pos="SCONJ"] 2 = [pos="ADP"] 9 | [pos="NOUN"] ["སྟེ་" & pos="SCONJ"] 2 = [pos="SCONJ"] 10 | [pos="NOUN"] ["ཏེ་" & pos="SCONJ"] 2 = [pos="SCONJ"] 11 | [pos="NOUN"] ["ཞིང་" & pos="SCONJ"] 2 = [pos="SCONJ"] 12 | [pos="NOUN"] ["ཤིང་" & pos="SCONJ"] 2 = [pos="SCONJ"] 13 | [pos="NOUN"] ["ཤིང་" & pos="SCONJ"] [] [pos="ADP"] 2 = [pos="NOUN"] 14 | [pos="PRON"] ["ནས་" & pos="SCONJ"] 2 = [pos="ADP"] 15 | ["སུ་"] [pos="PRON"] ["ནས་" & pos="SCONJ"] 3 = [pos="SCONJ"] 16 | [pos="NUM"] [pos="SCONJ"] 2 = [pos="ADP"] 17 | [pos="NUM"] ["སྟེ་" & pos="SCONJ"] 2 = [pos="SCONJ"] 18 | [pos="ADP"] [pos="SCONJ"] 2 = [pos="NOUN"] 19 | [pos="NOUN"] [pos="ADP"] ["ནས་" & pos="SCONJ"] 3 = [pos="ADP"] 20 | [pos="ADP"] ["ཏེ་" & pos="SCONJ"] 2 = [pos="SCONJ"] -------------------------------------------------------------------------------- /tests/hfr2cql/hfr_result.txt: -------------------------------------------------------------------------------- 1 | #matchcql idx op replacecql 2 | 3 | ༺གཤིས=ངེས༻ ༺གཤིས=ལྟོས༻ 2 = ༺གཤིས=སྦྱོར༻ 4 | ༺གཤིས=ངེས༻ ༺"སྟེ་" ༈ གཤིས=ལྟོས༻ 2 = ༺གཤིས=ལྟོས༻ 5 | ༺གཤིས=ངེས༻ ༺"ཅིང་" ༈ གཤིས=ལྟོས༻ 2 = ༺གཤིས=ལྟོས༻ 6 | ༺གཤིས=ངེས༻ ༺"ཞིང་" ༈ གཤིས=ལྟོས༻ 2 = ༺གཤིས=མིང༻ 7 | ༺གཤིས=ངེས༻ ༺"ཤིང་" ༈ གཤིས=ལྟོས༻ 2 = ༺གཤིས=མིང༻ 8 | ༺གཤིས=མིང༻ ༺གཤིས=ལྟོས༻ 2 = ༺གཤིས=སྦྱོར༻ 9 | ༺གཤིས=མིང༻ ༺"སྟེ་" ༈ གཤིས=ལྟོས༻ 2 = ༺གཤིས=ལྟོས༻ 10 | ༺གཤིས=མིང༻ ༺"ཏེ་" ༈ གཤིས=ལྟོས༻ 2 = ༺གཤིས=ལྟོས༻ 11 | ༺གཤིས=མིང༻ ༺"ཞིང་" ༈ གཤིས=ལྟོས༻ 2 = ༺གཤིས=ལྟོས༻ 12 | ༺གཤིས=མིང༻ ༺"ཤིང་" ༈ གཤིས=ལྟོས༻ 2 = ༺གཤིས=ལྟོས༻ 13 | ༺གཤིས=མིང༻ ༺"ཤིང་" ༈ གཤིས=ལྟོས༻ ༺༻ ༺གཤིས=སྦྱོར༻ 2 = ༺གཤིས=མིང༻ 14 | ༺གཤིས=ཚབ༻ ༺"ནས་" ༈ གཤིས=ལྟོས༻ 2 = ༺གཤིས=སྦྱོར༻ 15 | ༺"སུ་"༻ ༺གཤིས=ཚབ༻ ༺"ནས་" ༈ གཤིས=ལྟོས༻ 3 = ༺གཤིས=ལྟོས༻ 16 | ༺གཤིས=གྲངས༻ ༺གཤིས=ལྟོས༻ 2 = ༺གཤིས=སྦྱོར༻ 17 | ༺གཤིས=གྲངས༻ ༺"སྟེ་" ༈ གཤིས=ལྟོས༻ 2 = ༺གཤིས=ལྟོས༻ 18 | ༺གཤིས=སྦྱོར༻ ༺གཤིས=ལྟོས༻ 2 = ༺གཤིས=མིང༻ 19 | 
༺གཤིས=མིང༻ ༺གཤིས=སྦྱོར༻ ༺"ནས་" ༈ གཤིས=ལྟོས༻ 3 = ༺གཤིས=སྦྱོར༻ 20 | ༺གཤིས=སྦྱོར༻ ༺"ཏེ་" ༈ གཤིས=ལྟོས༻ 2 = ༺གཤིས=ལྟོས༻ -------------------------------------------------------------------------------- /tests/resources/rdr_rules.txt: -------------------------------------------------------------------------------- 1 | True : object.conclusion = "NN" 2 | object.tag == "LATIN" : object.conclusion = "LATIN" 3 | object.tag == "OTHER" : object.conclusion = "OTHER" 4 | object.tag == "PUNCT" : object.conclusion = "PUNCT" 5 | object.tag == "DET" : object.conclusion = "DET" 6 | object.word == "དག" and object.nextTag1 == "PART" : object.conclusion = "VERB" 7 | object.tag == "PROPN" : object.conclusion = "PROPN" 8 | object.tag == "NOUN" : object.conclusion = "NOUN" 9 | object.tag == "PART" : object.conclusion = "PART" 10 | object.word == "ས" and object.nextTag1 == "PART" : object.conclusion = "ADP" 11 | object.prevTag1 == "PUNCT" and object.word == "ས་" : object.conclusion = "ADP" 12 | object.prevTag1 == "PART" and object.word == "ས་" : object.conclusion = "ADP" 13 | object.prevWord1 == "མི" : object.conclusion = "PART" 14 | object.prevWord1 == "བྷ་" : object.conclusion = "ADP" 15 | object.word == "ས་" and object.nextWord1 == "ལ་" : object.conclusion = "ADP" 16 | object.prevWord1 == "ལ" and object.word == "ས་" : object.conclusion = "PART" 17 | object.nextWord1 == "སྟེངས་" : object.conclusion = "ADP" 18 | object.word == "ར" and object.nextWord1 == "འི་" : object.conclusion = "ADP" 19 | object.tag == "X" : object.conclusion = "X" 20 | object.tag == "OOV" : object.conclusion = "OOV" 21 | object.tag == "VERB" : object.conclusion = "VERB" 22 | object.suffixL1 == "མ" : object.conclusion = "NOUN" 23 | object.suffixL1 == "ན" : object.conclusion = "OOV" 24 | object.nextWord2 == "བོད་སྐད་" : object.conclusion = "NON_WORD" 25 | object.nextWord1 == "ཡིག་" : object.conclusion = "NON_WORD" 26 | object.suffixL2 == "ཕྱོགས་" : object.conclusion = "NOUN" 27 | object.nextTag1 == "NUM" and object.nextTag2 == "NUM" : object.conclusion = "NOUN" 28 | object.prevWord1 == "དཔེར་ན་" : object.conclusion = "NOUN" 29 | object.prevWord1 == "།_" and object.nextWord1 == "ལ་སོགས་པ་" : object.conclusion = "NOUN" 30 | object.suffixL1 == "སོ" : object.conclusion = "OOV" 31 | object.suffixL2 == "སྐྱེས་" : object.conclusion = "OOV" 32 | object.nextTag1 == "NON_WORD" : object.conclusion = "OOV" 33 | object.suffixL2 == "ཆད་" : object.conclusion = "ADV" 34 | object.tag == "ADP" : object.conclusion = "ADP" 35 | object.tag == "AUX" : object.conclusion = "AUX" 36 | object.tag == "NUM" : object.conclusion = "NUM" 37 | object.tag == "TEXT" : object.conclusion = "TEXT" 38 | object.tag == "PRON" : object.conclusion = "PRON" 39 | object.tag == "ADJ" : object.conclusion = "ADJ" 40 | object.tag == "SCONJ" : object.conclusion = "SCONJ" 41 | object.prevTag1 == "NOUN" : object.conclusion = "ADP" 42 | object.prevTag1 == "DET" : object.conclusion = "ADP" 43 | object.tag == "ADV" : object.conclusion = "ADV" 44 | object.tag == "NON_WORD" : object.conclusion = "NON_WORD" 45 | object.tag == "INTJ" : object.conclusion = "INTJ" -------------------------------------------------------------------------------- /tests/resources/shelving/test_1.txt: -------------------------------------------------------------------------------- 1 | ཝ་ཡེ། བཀྲ་ 2 | ཤིས་ཡིན་པས། 3 | -------------------------------------------------------------------------------- /tests/resources/shelving/test_1_tok/test_1_tok.txt: 
-------------------------------------------------------------------------------- 1 | ཝ་ཡེ/NO_POS །_/ བཀྲ་ཤིས་/NOUN ཡིན་པ/AUX ས/PART །/ -------------------------------------------------------------------------------- /tests/resources/shelving/test_2.txt: -------------------------------------------------------------------------------- 1 | བཀྲ་ཤིས་བདེ་ལེགས་ 2 | ཕུན་སུམ་ཚོགས། this is non-bo text རྟག་ཏུ་བདེ་ 3 | བ་ཐོབ་པ 4 | ར་ཤོག 5 | -------------------------------------------------------------------------------- /tests/resources/step1_3/input/test.txt: -------------------------------------------------------------------------------- 1 | བཀྲ་ཤིས་བདེ་ལེགས་ཕུན་སུམ་ཚོགས། རྟག་ཏུ་བདེ་བ་ཐོབ་པར་ཤོག 2 | བཀྲ་ཤིས་བདེ་ལེགས་ཕུན་སུམ་ཚོགས། རྟག་ཏུ་བདེ་བ་ཐོབ་པར་ཤོག -------------------------------------------------------------------------------- /tests/resources/step2/cql_rules.txt: -------------------------------------------------------------------------------- 1 | ["ཁྲུང་"] ["ཁྲུང་"] 1 + [pos="NOUN"] 2 | ["ཁྲུང་"] ["ཁྲུང་ས་"] 2-5 :: [pos=""] [pos="NOUN"] 3 | ["ཁྲུང་"] ["ཁྲུང་"] 1 + [pos="NOUN"] 4 | ["ཁྲུང་" & pos="NO_POS"] [pos="NOUN"] 1 + [pos="NOUN"] 5 | -------------------------------------------------------------------------------- /tests/resources/step2/manually_corrected.txt: -------------------------------------------------------------------------------- 1 | བཀྲ་ཤིས་བདེ་ལེགས་/NOUN 2 | ཕུན་སུམ་ཚོགས/ADJ །_//།//12 རྟག་/NOUN/རྟག་པ་ ཏུ་/PART/དུ་ བདེ་བ་/NOUN ཐོབ་པ/VERB/ཐོབ་ ར་/PART/ལ་ ཤོག/AUX 3 | བཀྲ་ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་/NOUN ཏུ་/PART/དུ་ བདེ་བ་/NOUN ཐོབ་པ/VERB ར་/PART ཤོག/AUXr -------------------------------------------------------------------------------- /tests/resources/step2/rdr_input.txt: -------------------------------------------------------------------------------- 1 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 2 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 3 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 4 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 5 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 6 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 7 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB -------------------------------------------------------------------------------- /tests/resources/step2/step2: -------------------------------------------------------------------------------- 1 | 
བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 2 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 3 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 4 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 5 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 6 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 7 | བཀྲ་ཤིས་བདེ་ལེགས་/OOV ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/AUX བཀྲ་/OTHER ཤིས་/NOUN བདེ་ལེགས་/NOUN ཕུན་སུམ་ཚོགས/ADJ །_/PUNCT རྟག་ཏུ་/OOV བདེ་བ་/VERB ཐོབ་པ/VERB ར་/PART ཤོག/VERB 8 | -------------------------------------------------------------------------------- /tests/resources/step2/test_rules.txt: -------------------------------------------------------------------------------- 1 | ༺"ཁྲུང་"༻ ༺"ཁྲུང་"༻ 1 + ༺གཤིས=མིང༻ 2 | ༺"ཁྲུང་"༻ ༺"ཁྲུང་ས་"༻ 2-5 :: ༺གཤིས=""༻ ༺གཤིས=མིང༻ 3 | ༺"ཁྲུང་"༻ ༺"ཁྲུང་"༻ 1 + ༺གཤིས=མིང༻ 4 | ༺"ཁྲུང་" ༈ གཤིས="NO_POS"༻ ༺གཤིས=མིང༻ 1 + ༺གཤིས=མིང༻ 5 | ༺"ནི་ལ"༻ 1-1 : ༺གཤིས=མིང༻ 6 | -------------------------------------------------------------------------------- /tests/test_cli.py: -------------------------------------------------------------------------------- 1 | from click.testing import CliRunner 2 | 3 | from pybo.cli import cli, profile_update 4 | 5 | 6 | def test_tok(): 7 | runner = CliRunner() 8 | runner.invoke(cli, ["tok", "tests/resources/shelving/", "--tags", "pl"]) 9 | 10 | 11 | def test_extract_rules(): 12 | runner = CliRunner() 13 | runner.invoke(cli, ["extract-rules", "tests/resources/step2/step2"]) 14 | 15 | def test_extract_seg_rules(): 16 | runner = CliRunner() 17 | runner.invoke(cli, ["extract-seg-rules", "tests/data/corpus1/corpus1.txt", "--type", "hfr", "--e", 1]) -------------------------------------------------------------------------------- /tests/test_corpus.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from textwrap import dedent 3 | import pytest 4 | 5 | from pybo import * 6 | 7 | @pytest.mark.skip(reason="old workflow") 8 | def test_parse_manually_corrected(): 9 | dump = Path(__file__).parent / "resources/step2/manually_corrected.txt" 10 | dump = dump.read_text(encoding="utf-8-sig") 11 | data = extract_new_entries(dump, Path(__file__).parent / "resources/main") 12 | assert data == dedent( 13 | """\ 14 | # form pos lemma sense freq 15 | །_ ། 12 16 | །_ PUNCT 17 | ཏུ་ PART དུ་ 18 | ཐོབ་པ་ VERB 19 | ཐོབ་པ་ VERB ཐོབ་ 20 | ཕུན་སུམ་ཚོགས་ ADJ 21 | བཀྲ་ཤིས་ NOUN 22 | བཀྲ་ཤིས་བདེ་ལེགས་ NOUN 23 | བདེ་བ་ NOUN 24 | བདེ་ལེགས་ NOUN 25 | ར་ PART 26 | ར་ PART ལ་ 27 | རྟག་ NOUN 28 | 
རྟག་ NOUN རྟག་པ་ 29 | ཤོག་ AUX 30 | ཤོག་ AUXr """ 31 | ) 32 | -------------------------------------------------------------------------------- /tests/test_hfr_cqlr_converter.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pybo.hfr_cqlr_converter import cqlr2hfr, hfr2cqlr 4 | 5 | @pytest.fixture(scope="module") 6 | def cqlr(): 7 | return ( 8 | '["ལ་ལ་"] ["ལ་ལ་"] 1 = [pos="PART"]' 9 | '["ལ་ལ་"] ["ལ་ལ་"] 2 = [pos="PART"]' 10 | '["ལ་ལ་"] ["ལ་ལ་"] 1-2 :: [pos="NOUN"] [pos="PART"]' 11 | '["ལ་"] ["ལ་"] ["ལ་ལ་"] 3-2 :: [pos="PART"] [pos="PART"]' 12 | '["ལ་"] ["ལ་"] ["ལ་"] ["ལ་"] 2 + [pos="DET"]' 13 | ) 14 | 15 | 16 | @pytest.fixture(scope="module") 17 | def hfr(): 18 | return ( 19 | '༺"ལ་ལ་"༻ ༺"ལ་ལ་"༻ 1 = ༺གཤིས=རོགས༻' 20 | '༺"ལ་ལ་"༻ ༺"ལ་ལ་"༻ 2 = ༺གཤིས=རོགས༻' 21 | '༺"ལ་ལ་"༻ ༺"ལ་ལ་"༻ 1-2 :: ༺གཤིས=མིང༻ ༺གཤིས=རོགས༻' 22 | '༺"ལ་"༻ ༺"ལ་"༻ ༺"ལ་ལ་"༻ 3-2 :: ༺གཤིས=རོགས༻ ༺གཤིས=རོགས༻' 23 | '༺"ལ་"༻ ༺"ལ་"༻ ༺"ལ་"༻ ༺"ལ་"༻ 2 + ༺གཤིས=ངེས༻' 24 | ) 25 | 26 | 27 | def test_cql2hfr(cqlr, hfr): 28 | hfr_result = cqlr2hfr(cqlr) 29 | print(hfr_result) 30 | assert hfr_result == hfr 31 | print("Test pass..") 32 | 33 | 34 | def test_hfr2cql(hfr, cqlr): 35 | cql_result = hfr2cqlr(hfr) 36 | assert cql_result == cqlr 37 | print("Test pass..") 38 | -------------------------------------------------------------------------------- /tests/test_monlam2word_list.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | import pytest 4 | 5 | from pybo.monlam2wordlist import ( 6 | csv_loader, 7 | get_definition_list, 8 | get_example_list, 9 | get_pos_list, 10 | get_sense_tag_list, 11 | get_tag_list, 12 | monlam2wordlist, 13 | parse_attrs, 14 | ) 15 | 16 | testcases_ids = ("one_pos_one_sense", "one_pos_multi_senses", "multi_pos_multi_senses") 17 | 18 | # monlam-result-col, pos-list, definition-list, tag-list, sense-list, example-list 19 | parser_to_try = ( 20 | # one-pos-one-sense 21 | ( 22 | "མིང་ཚིག ༡ཀ། ཀཀ། ཁཁ། གག། དཔེར་ན། པཔ།", 23 | [("མིང་ཚིག", "༡ཀ། ཀཀ། ཁཁ། གག། དཔེར་ན། པཔ།")], 24 | [("མིང་ཚིག", "༡ཀ། ཀཀ། ཁཁ། གག། དཔེར་ན། པཔ།")], 25 | [("མིང་ཚིག", "༡ཀ།", "ཀཀ། ཁཁ། གག། དཔེར་ན། པཔ།")], 26 | [("མིང་ཚིག", "༡ཀ།", "ཀཀ", "ཀཀ། ཁཁ། གག། དཔེར་ན། པཔ།")], 27 | [("མིང་ཚིག", "༡ཀ།", "ཀཀ", "ཀཀ། ཁཁ། གག།", "པཔ།")], 28 | ), 29 | # one-pos-multi-senses 30 | ( 31 | "མིང་ཚིག 1. ༡ཀ། ཀཀ། དཔེར་ན། པཔ། 2. ༡ཀ། ཁཁ། 3. གག།", 32 | [("མིང་ཚིག", "༡ཀ། ཀཀ། དཔེར་ན། པཔ། 2. ༡ཀ། ཁཁ། 3. གག།")], 33 | [ 34 | ("མིང་ཚིག", "༡ཀ། ཀཀ། དཔེར་ན། པཔ།"), 35 | ("མིང་ཚིག", "༡ཀ། ཁཁ།"), 36 | ("མིང་ཚིག", "གག།"), 37 | ], 38 | [ 39 | ("མིང་ཚིག", "༡ཀ།", "ཀཀ། དཔེར་ན། པཔ།"), 40 | ("མིང་ཚིག", "༡ཀ།", "ཁཁ།"), 41 | ("མིང་ཚིག", "", "གག།"), 42 | ], 43 | [ 44 | ("མིང་ཚིག", "༡ཀ།", "ཀཀ", "ཀཀ། དཔེར་ན། པཔ།"), 45 | ("མིང་ཚིག", "༡ཀ།", "ཁཁ", "ཁཁ།"), 46 | ("མིང་ཚིག", "", "གག", "གག།"), 47 | ], 48 | [ 49 | ("མིང་ཚིག", "༡ཀ།", "ཀཀ", "ཀཀ།", "པཔ།"), 50 | ("མིང་ཚིག", "༡ཀ།", "ཁཁ", "ཁཁ།", ""), 51 | ("མིང་ཚིག", "", "གག", "གག།", ""), 52 | ], 53 | ), 54 | # multi-pos-multi-senses 55 | ( 56 | "མིང་ཚིག 1. ༡ཀ། ཀཀ། 2. ཁཁ། བྱེད་ཚིག 1. ཀཀ། 2. ༡ཀ། ཁཁ། དཔེར་ན། པཔ། གྲོགས་ཚིག ༡ཀ། ཀཀ། ཁཁ། བྱེད་ཚིག ཀཀ། ཁཁ།", 57 | [ 58 | ("མིང་ཚིག", "༡ཀ། ཀཀ། 2. ཁཁ།"), 59 | ("བྱེད་ཚིག", "ཀཀ། 2. 
༡ཀ། ཁཁ། དཔེར་ན། པཔ།"), 60 | ("གྲོགས་ཚིག", "༡ཀ། ཀཀ། ཁཁ།"), 61 | ("བྱེད་ཚིག", "ཀཀ། ཁཁ།"), 62 | ], 63 | [ 64 | ("མིང་ཚིག", "༡ཀ། ཀཀ།"), 65 | ("མིང་ཚིག", "ཁཁ།"), 66 | ("བྱེད་ཚིག", "ཀཀ།"), 67 | ("བྱེད་ཚིག", "༡ཀ། ཁཁ། དཔེར་ན། པཔ།"), 68 | ("གྲོགས་ཚིག", "༡ཀ། ཀཀ། ཁཁ།"), 69 | ("བྱེད་ཚིག", "ཀཀ། ཁཁ།"), 70 | ], 71 | [ 72 | ("མིང་ཚིག", "༡ཀ།", "ཀཀ།"), 73 | ("མིང་ཚིག", "", "ཁཁ།"), 74 | ("བྱེད་ཚིག", "", "ཀཀ།"), 75 | ("བྱེད་ཚིག", "༡ཀ།", "ཁཁ། དཔེར་ན། པཔ།"), 76 | ("གྲོགས་ཚིག", "༡ཀ།", "ཀཀ། ཁཁ།"), 77 | ("བྱེད་ཚིག", "", "ཀཀ། ཁཁ།"), 78 | ], 79 | [ 80 | ("མིང་ཚིག", "༡ཀ།", "ཀཀ", "ཀཀ།"), 81 | ("མིང་ཚིག", "", "ཁཁ", "ཁཁ།"), 82 | ("བྱེད་ཚིག", "", "ཀཀ", "ཀཀ།"), 83 | ("བྱེད་ཚིག", "༡ཀ།", "ཁཁ", "ཁཁ། དཔེར་ན། པཔ།"), 84 | ("གྲོགས་ཚིག", "༡ཀ།", "ཀཀ", "ཀཀ། ཁཁ།"), 85 | ("བྱེད་ཚིག", "", "ཀཀ", "ཀཀ། ཁཁ།"), 86 | ], 87 | [ 88 | ("མིང་ཚིག", "༡ཀ།", "ཀཀ", "ཀཀ།", ""), 89 | ("མིང་ཚིག", "", "ཁཁ", "ཁཁ།", ""), 90 | ("བྱེད་ཚིག", "", "ཀཀ", "ཀཀ།", ""), 91 | ("བྱེད་ཚིག", "༡ཀ།", "ཁཁ", "ཁཁ།", "པཔ།"), 92 | ("གྲོགས་ཚིག", "༡ཀ།", "ཀཀ", "ཀཀ། ཁཁ།", ""), 93 | ("བྱེད་ཚིག", "", "ཀཀ", "ཀཀ། ཁཁ།", ""), 94 | ], 95 | ), 96 | ) 97 | 98 | 99 | @pytest.fixture(params=parser_to_try, ids=testcases_ids) 100 | def parser_testcase(request): 101 | return request.param 102 | 103 | 104 | def test_get_pos_list(parser_testcase): 105 | monlam_result_col, pos_expected, *_ = parser_testcase 106 | assert get_pos_list(monlam_result_col) == pos_expected 107 | 108 | 109 | def test_get_definition_list(parser_testcase): 110 | _, pos_list, definition_expected, *_ = parser_testcase 111 | assert get_definition_list(pos_list) == definition_expected 112 | 113 | 114 | def test_get_tag_list(parser_testcase): 115 | _, _, definition_list, tag_expected, *_ = parser_testcase 116 | assert get_tag_list(definition_list) == tag_expected 117 | 118 | 119 | def test_get_sense_tag_list(parser_testcase): 120 | *_, tag_list, sense_expected, _ = parser_testcase 121 | assert get_sense_tag_list(tag_list) == sense_expected 122 | 123 | 124 | def test_get_example_list(parser_testcase): 125 | *_, sense_list, example_expected = parser_testcase 126 | assert get_example_list(sense_list) == example_expected 127 | 128 | 129 | data_path = Path("./tests/data/monlam2020/") 130 | testcases_to_try = ( 131 | ( 132 | csv_loader(data_path / "one_pos_one_sense.csv"), 133 | csv_loader(data_path / "one_pos_one_sense_expected.csv"), 134 | ), 135 | ( 136 | csv_loader(data_path / "one_pos_multi_sense.csv"), 137 | csv_loader(data_path / "one_pos_multi_sense_expected.csv"), 138 | ), 139 | ( 140 | csv_loader(data_path / "multi_pos_multi_sense.csv"), 141 | csv_loader(data_path / "multi_pos_multi_sense_expected.csv"), 142 | ), 143 | ) 144 | 145 | 146 | @pytest.fixture(params=testcases_to_try, ids=testcases_ids) 147 | def a_testcase(request): 148 | return request.param 149 | 150 | 151 | # def test_monlam2wordlist(a_testcase): 152 | # monlam_rows, expected_rows = a_testcase 153 | # wordlists = monlam2wordlist(monlam_rows) 154 | # print(wordlists) 155 | -------------------------------------------------------------------------------- /tests/test_rdr2adjustment.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from textwrap import dedent 3 | 4 | from pybo.rdr.rdr_2_replace_matcher import rdr_2_replace_matcher 5 | 6 | 7 | def test_suffix_bug(): 8 | dump = Path("tests/resources/rdr_rules.txt").read_text(encoding="utf-8") 9 | rules = rdr_2_replace_matcher(dump) 10 | expected = dedent( 11 | """\ 12 | [pos="DET" & text="དག"] [pos="PART"] 1 = [pos="VERB"] 13 | [pos="PART" & 
text="ས"] [pos="PART"] 1 = [pos="ADP"] 14 | [pos="PUNCT"] [pos="PART" & text="ས་"] 2 = [pos="ADP"] 15 | [pos="PART"] [pos="PART" & text="ས་"] 2 = [pos="ADP"] 16 | [pos="PART" & text="མི"] [pos="PART" & text="ས་"] 2 = [pos="PART"] 17 | [text="བྷ་"] [pos="PART"] 2 = [pos="ADP"] 18 | [pos="PART" & text="ས་"] [text="ལ་"] 1 = [pos="ADP"] 19 | [text="ལ"] [pos="PART" & text="ས་"] [text="ལ་"] 2 = [pos="PART"] 20 | [pos="PART"] [text="སྟེངས་"] 1 = [pos="ADP"] 21 | [pos="PART" & text="ར"] [text="འི་"] 1 = [pos="ADP"] 22 | [pos="VERB"] [text=".*མ"] 1 = [pos="NOUN"] 23 | [pos="VERB"] [text=".*ན"] 1 = [pos="OOV"] 24 | [pos="VERB"] [] [text="བོད་སྐད་"] 1 = [pos="NON_WORD"] 25 | [pos="VERB"] [text="ཡིག་"] 1 = [pos="NON_WORD"] 26 | [pos="VERB"] [] [text=".*ཕྱོགས་"] 1 = [pos="NOUN"] 27 | [pos="VERB"] [pos="NUM"] [pos="NUM"] 1 = [pos="NOUN"] 28 | [text="དཔེར་ན་"] [pos="VERB"] 2 = [pos="NOUN"] 29 | [text="།_"] [pos="VERB"] [text="ལ་སོགས་པ་"] 2 = [pos="NOUN"] 30 | [pos="VERB"] [text=".*སོ"] 1 = [pos="OOV"] 31 | [pos="VERB"] [] [text=".*སྐྱེས་"] 1 = [pos="OOV"] 32 | [pos="VERB"] [pos="NON_WORD"] 1 = [pos="OOV"] 33 | [pos="VERB"] [] [text=".*ཆད་"] 1 = [pos="ADV"] 34 | [pos="NOUN"] [pos="SCONJ"] 2 = [pos="ADP"] 35 | [pos="DET"] [pos="SCONJ"] 2 = [pos="ADP"]""" 36 | ) 37 | assert rules == expected 38 | -------------------------------------------------------------------------------- /tests/test_segmentation_rule_extraction.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import re 3 | 4 | from pybo.segmentation_rule.make_rule import * 5 | from pybo.segmentation_rule.pipeline import * 6 | 7 | @pytest.fixture(scope="module") 8 | def human_data(): 9 | return "སྒྲ་བསྒྱུར་ མར་པ་ ལོ་ཙྪ འི་ རྣམ་པར་ ཐར་པ་ མཐོང་བ་ དོན་ཡོད་ བཞུགས་ སོ ། ། ན་མོ་གུ་རུ་ དེ་ཝ་ཌཱ་ཀི་ནི ། སྔོན་སྦྱངས་ ཐུགས་བསྐྱེད་ སྨོན་ལམ་ དུས་ བབས་ ལྷག་བསམ་ གྲུ་གཟིངས་ ནང་ དུ་ ལུས་སྲོག་ མ་ ཆགས་ འགྲོ་དོན་ སྦྱོར་བ་ མཆོག་ གིས་ རབ་ ཞུགས་ ནས །" 10 | 11 | @pytest.fixture(scope="module") 12 | def source_data(): 13 | return "སྒྲ་བསྒྱུར་མར་པ་ལོ་ཙྪའི་རྣམ་པར་ཐར་པ་མཐོང་བ་དོན་ཡོད་བཞུགས་སོ།།ན་མོ་གུ་རུ་དེ་ཝ་ཌཱ་ཀི་ནི།སྔོན་སྦྱངས་ཐུགས་བསྐྱེད་སྨོན་ལམ་དུས་བབས་ལྷག་བསམ་གྲུ་གཟིངས་ནང་དུ་ལུས་སྲོག་མ་ཆགས་འགྲོ་དོན་སྦྱོར་བ་མཆོག་གིས་རབ་ཞུགས་ནས།" 14 | 15 | 16 | def test_postprocessing_human_data(human_data): 17 | expected_human_data = "སྒྲ་བསྒྱུར་ མར་པ་ ལོ་ཙྪ འི་ རྣམ་པར་ ཐར་པ་ མཐོང་བ་ དོན་ཡོད་ བཞུགས་ སོ །། ན་མོ་གུ་རུ་ དེ་ཝ་ཌཱ་ཀི་ནི ། སྔོན་སྦྱངས་ ཐུགས་བསྐྱེད་ སྨོན་ལམ་ དུས་ བབས་ ལྷག་བསམ་ གྲུ་གཟིངས་ ནང་ དུ་ ལུས་སྲོག་ མ་ ཆགས་ འགྲོ་དོན་ སྦྱོར་བ་ མཆོག་ གིས་ རབ་ ཞུགས་ ནས །" 18 | assert expected_human_data == post_process_human_data(human_data) 19 | 20 | 21 | def test_construct_bilou_tag_line(): 22 | human_toks = ["སྒྲ་བསྒྱུར་", "མར་པ་", "ལོ་ཙྪ", "འི་", "རྣམ་པར་", "ཐར་པ་", "མཐོང་བ་", "དོན་ཡོད་", "བཞུགས་", "སོ", "།།", "ན་མོ་གུ་རུ་", "དེ་ཝ་ཌཱ་ཀི་ནི", "།", "རྣམས་", "ལས་", "དམ་ཆོས་", "ནོར་བུ་", "དགོས་འདོད་", "ཆར་འབབས་", "བླངས་", "ནས་", "ནི", "།།", "གི", "ས་", "བསྐྱོད་", "ཕུ་ལ་ཧ་རི་"] 23 | botok_toks = ["སྒྲ་", "བསྒྱུར་", "མར་པ་", "ལོ་", "ཙྪའི་", "རྣམ་པ", "ར་", "ཐར་པ་", "མཐོང་བ་", "དོན་", "ཡོད་", "བཞུགས་", "སོ", "།།", "ན་མོ་", "གུ་རུ་", "དེ་ཝ་", "ཌཱ་ཀི་", "ནི", "།", "རྣམས་", "ལས་དམ་", "ཆོས་ནོར་", "བུ་", "དགོས་འདོད་", "ཆ", "ར་", "འབབས་", "བླངས་", "ནས་", "ནི", "།།", "གིས་", "བསྐྱོད་", "ཕུ་ལ་ཧ་རི་"] 24 | expected_bilou_line = 'སྒྲ་/B བསྒྱུར་/I མར་པ་/U ལོ་/B ཙྪའི་/S རྣམ་པ/B ར་/I ཐར་པ་/U མཐོང་བ་/U དོན་/B ཡོད་/I བཞུགས་/U སོ/U །།/U ན་མོ་/B གུ་རུ་/I དེ་ཝ་/B ཌཱ་ཀི་/I ནི/I །/U རྣམས་/U ལས་དམ་/S ཆོས་ནོར་/S བུ་/I དགོས་འདོད་/U ཆ/B ར་/I འབབས་/I བླངས་/U ནས་/U ནི/U 
།།/U གིས་/S བསྐྱོད་/U ཕུ་ལ་ཧ་རི་/U ' 25 | assert expected_bilou_line == get_bilou_tag_line(human_toks, botok_toks) 26 | 27 | def test_get_new_word_candidate(): 28 | merge_suggestions = ["སྒྲ་/B བསྒྱུར་/I", "དོན་\B ཡོད་\I"] 29 | human_data = "སྒྲ་བསྒྱུར་ མར་པ་ ལོ་ཙྪ འི་ རྣམ་པར་ ཐར་པ་ མཐོང་བ་ དོན་ཡོད་ བཞུགས་ སོ ། མཐོང་བ་ དོན་ ཡོད་ བཞུགས་ སོ །" 30 | expected_new_words = ["སྒྲ་བསྒྱུར་"] 31 | assert expected_new_words == get_new_word_candidates(merge_suggestions, human_data) 32 | 33 | def test_get_remove_word_candidate(): 34 | split_suggestions = ["སྒྲ་བསྒྱུར་", "དོན་ཡོད་", "མཐོང་བ་"] 35 | human_data = " སྒྲ་ བསྒྱུར་ མར་པ་ ལོ་ཙྪ འི་ རྣམ་པར་ ཐར་པ་ མཐོང་ བ་ དོན་ཡོད་ བཞུགས་ སོ ། མཐོང་ བ་ དོན་ ཡོད་ བཞུགས་ སོ །" 36 | expected_remove_words = ["སྒྲ་བསྒྱུར་", "མཐོང་བ་"] 37 | assert expected_remove_words == get_remove_word_candidates(split_suggestions, human_data) 38 | 39 | def test_false_positive_merge(): 40 | tokens_in_rule = ['[text="ང་"]', '[text="ཁོང་"]', '[text="ཅན་"]', '[text="དུ་"]', '[text="མི་"]'] 41 | index = 2 42 | human_data = "སྒོམ་ བྱེད་ ཀྱིན་ ཡོད་ འདུག་པ ས ། ང་ ཁོང་ ཅན་ དུ་ མི་ འགྲོ ཁྱེད་རང་ ང འི་ ཕྱི་ ལ་ འགྲོ་ ན་ གསེར་ མཉམ་ དུ་ བྱེད །" 43 | assert True == is_false_positive_merge(tokens_in_rule, index, human_data) 44 | 45 | def test_true_positive_merge(): 46 | tokens_in_rule = ['[text="ཁྱོད་"]', '[text="ཁོང་"]', '[text="ཅན་"]', '[text="བཏང་"]', '[text="དགོས་"]'] 47 | index = 2 48 | human_data = "མ་རྒྱུད་ ཀྱི་ བདག་པོ་ གཅིག་ བཞུགས་ ཤིང་ ཡོད་པ ས་ ཁྱོད་ ཁོང་ཅན་ བཏང་ དགོས་ གསུངས །" 49 | assert False == is_false_positive_merge(tokens_in_rule, index, human_data) 50 | 51 | def test_true_positive_split(): 52 | tokens_in_rule = ['[text="ང་"]', '[text="ཁོང་ཅན་"]', '[text="དུ་"]', '[text="མི་"]'] 53 | index = 2 54 | counter_split_suggestion = ' ཁོང་ ཅན་ ' 55 | human_data = "སྒོམ་ བྱེད་ ཀྱིན་ ཡོད་ འདུག་པ ས ། ང་ ཁོང་ ཅན་ དུ་ མི་ འགྲོ ཁྱེད་རང་ ང འི་ ཕྱི་ ལ་ འགྲོ་ ན་ གསེར་ མཉམ་ དུ་ བྱེད །" 56 | assert False == is_false_positive_split(tokens_in_rule, index, counter_split_suggestion, human_data) 57 | 58 | def test_false_positive_split(): 59 | tokens_in_rule = ['[text="ཁྱོད་"]', '[text="ཁོང་ཅན་"]', '[text="བཏང་"]', '[text="དགོས་"]'] 60 | index = 2 61 | counter_split_suggestion = ' ཁོང་ ཅན་ ' 62 | human_data = "མ་རྒྱུད་ ཀྱི་ བདག་པོ་ གཅིག་ བཞུགས་ ཤིང་ ཡོད་པ ས་ ཁྱོད་ ཁོང་ཅན་ བཏང་ དགོས་ གསུངས །" 63 | assert True == is_false_positive_split(tokens_in_rule, index, counter_split_suggestion, human_data) 64 | 65 | def test_invalid_split_rule(): 66 | tokens_info = '[text="སྒྲ་བསྒྱུར་"] [text="མར་པ་"]' 67 | index_info = '2-1' 68 | human_data = "སྒྲ་བསྒྱུར་ མར་པ་ ལོ་ཙྪ འི་ རྣམ་པར་ ཐར་པ་ མཐོང་བ་ དོན་ཡོད་ བཞུགས་ སོ །།" 69 | assert (True,0) == is_invalid_split(tokens_info, index_info, human_data) 70 | 71 | def test_valid_split_rule(): 72 | tokens_info = '[text="སྒྲ་"] [text="བསྒྱུར་"] [text="མཐོང་བ་"]' 73 | index_info = '3-1' 74 | human_data = "སྒྲ་བསྒྱུར་ མར་པ་ ལོ་ཙྪ འི་ རྣམ་པར་ ཐར་པ་ མཐོང་བ་ དོན་ཡོད་ བཞུགས་ སོ །། སྒྲ་ བསྒྱུར་ མཐོང་ བ་ དོན་ ཡོད་ བཞུགས་ སོ" 75 | assert (False,1) == is_invalid_split(tokens_info, index_info, human_data) 76 | 77 | def test_invalid_merge_rule(): 78 | tokens_info = '[text="སྒྲ་བསྒྱུར་"] [text="མར་"] [text="པ་"]' 79 | index_info = '2' 80 | human_data = "སྒྲ་བསྒྱུར་ མར་པ་ ལོ་ཙྪ འི་ རྣམ་པར་ ཐར་པ་ མཐོང་བ་ དོན་ཡོད་ བཞུགས་ སོ །།" 81 | assert True == is_invalid_merge(tokens_info, index_info, human_data) 82 | 83 | def test_valid_merge_rule(): 84 | tokens_info = '[text="ཐར་པ་"] [text="མཐོང་"] [text="བ་"]' 85 | index_info = '2' 86 | human_data = "སྒྲ་བསྒྱུར་ མར་པ་ ལོ་ཙྪ འི་ རྣམ་པར་ ཐར་པ་ མཐོང་བ་ 
དོན་ཡོད་ བཞུགས་ སོ །། སྒྲ་ བསྒྱུར་ མཐོང་ བ་ དོན་ ཡོད་ བཞུགས་ སོ" 87 | assert False == is_invalid_merge(tokens_info, index_info, human_data) 88 | 89 | 90 | if __name__ == "__main__": 91 | # input_path = Path('./tests/corpus1/corpus1.txt') 92 | # input_path = Path('./tests/marpa/marpa.txt') 93 | input_path = Path('./tests/data/drokun_test/drokun_test_hd.txt') 94 | rules = extract_seg_rule(input_path, type='cql') 95 | (input_path.parent / f'{input_path.stem}_rules.txt').write_text(rules, encoding='utf-8') -------------------------------------------------------------------------------- /tests/test_tok.py: -------------------------------------------------------------------------------- 1 | from click.testing import CliRunner 2 | 3 | from pybo.cli import tok 4 | 5 | 6 | def test_tok_dir(): 7 | runner = CliRunner() 8 | runner.invoke(tok, ["tests/resources/shelving/", "--tags", "pl"]) 9 | 10 | def test_tok_file(): 11 | runner = CliRunner() 12 | runner.invoke(tok, ["tests/resources/shelving/test_1.txt", "--tags", "p"]) 13 | -------------------------------------------------------------------------------- /tests/test_untokenize.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from pybo.untokenize import * 3 | 4 | def test_untokenize_clean_text(): 5 | tokenized_text = "སྒྲ་བསྒྱུར་ མར་པ་ ལོ་ ཙྪའི་ རྣམ་པར་ ཐར་པ་ མཐོང་བ་ དོན་ཡོད་ བཞུགས་ སོ །། ན་མོ་གུ་རུ་ དེ་ཝ་ཌཱ་ཀི་ནི ། " 6 | tokens = pre_processing(tokenized_text) 7 | detokenized_text = assemble(tokens) 8 | expected_text = "སྒྲ་བསྒྱུར་མར་པ་ལོ་ཙྪའི་རྣམ་པར་ཐར་པ་མཐོང་བ་དོན་ཡོད་བཞུགས་སོ།།ན་མོ་གུ་རུ་དེ་ཝ་ཌཱ་ཀི་ནི།" 9 | assert expected_text == detokenized_text 10 | 11 | def test_untokenize_single_tagged_text(): 12 | tokenized_text = "སྒྲ་བསྒྱུར་/NO_POS མར་པ་/NO_POS ལོ་/NO_POS ཙྪའི་/NO_POS རྣམ་པར་/NO_POS ཐར་པ་/NO_POS མཐོང་བ་/NO_POS དོན་ཡོད་/NO_POS བཞུགས་/NO_POS སོ/NO_POS །།/NO_POS ན་མོ་གུ་རུ་/NO_POS དེ་ཝ་ཌཱ་ཀི་ནི/NO_POS །/NO_POS " 13 | tokens = pre_processing(tokenized_text) 14 | detokenized_text = assemble(tokens) 15 | expected_text = "སྒྲ་བསྒྱུར་མར་པ་ལོ་ཙྪའི་རྣམ་པར་ཐར་པ་མཐོང་བ་དོན་ཡོད་བཞུགས་སོ།།ན་མོ་གུ་རུ་དེ་ཝ་ཌཱ་ཀི་ནི།" 16 | assert expected_text == detokenized_text 17 | 18 | def test_untokenize_multi_tagged_text(): 19 | tokenized_text = "ལས་//// ཞེས་པ་//PART/ཞེས་པ་/ ནི་//PART/ནི་/ ལས་//// བྱེད་པ//VERB/བྱེད་པ་/" 20 | tokens = pre_processing(tokenized_text) 21 | detokenized_text = assemble(tokens) 22 | expected_text = "ལས་ཞེས་པ་ནི་ལས་བྱེད་པ" 23 | assert expected_text == detokenized_text -------------------------------------------------------------------------------- /tests/workflow_test.txt: -------------------------------------------------------------------------------- 1 | # first run of pybo on a folder of files to process 2 | -------------------------------------------------------------------------------- /usage.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from pybo import Text, pyewts 4 | from pybo.cli import prepare_folder 5 | 6 | prepare_folder() 7 | 8 | string = """ཤོག 9 | བཀྲ་ཤིས་""" 10 | t = Text(string) 11 | print(t.tokenize_words_raw_lines) 12 | 13 | converter = pyewts.pyewts() 14 | 15 | uni = "བཀྲ་ཤིས་བདེ་ལེགས།། །།" 16 | wylie = "bkra shis bde legs//_//" 17 | 18 | new_uni = converter.toUnicode(wylie) 19 | new_wylie = converter.toWylie(uni) 20 | 21 | assert uni[:-5] == new_uni[:-3] # double shads are a single char in pyewts 22 | assert wylie == new_wylie 23 | 
--------------------------------------------------------------------------------
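
Note on the tests/hfr2cql fixtures above: cql/cql.txt, cql2hfr.txt and cql/_cql2hfr_cql.txt are related by plain, ordered text substitutions (the short Tibetan tags of cql2hfr.txt), while hfr_result.txt uses the longer labels from the third column of UDPOS-bo.txt. The snippet below is a minimal illustrative sketch of that substitution idea only: it assumes the mapping can be applied as ordered regex replacements, and it is not the implementation of pybo/hfr_cqlr_converter.py, whose actual entry points (per tests/test_hfr_cqlr_converter.py) are cqlr2hfr() and hfr2cqlr().

# Illustrative sketch only -- NOT the code of pybo/hfr_cqlr_converter.py.
# It shows that applying the substitution table of tests/hfr2cql/cql2hfr.txt
# to a rule from tests/hfr2cql/cql/cql.txt reproduces the corresponding rule
# of tests/hfr2cql/cql/_cql2hfr_cql.txt.
import re

# A subset of tests/hfr2cql/cql2hfr.txt, kept as (regex, replacement) pairs.
CQL2HFR_TABLE = [
    (r'"NOUN"', "མཚ"),
    (r'"SCONJ"', "ཏཅ"),
    (r'"ADP"', "ཡཚ"),
    (r"pos=", "གཤིས="),
    (r"&", "༈"),
    (r"\[", "༺"),
    (r"\]", "༻"),
]


def cql_rule_to_hfr(rule: str) -> str:
    """Apply every substitution of the table, in file order."""
    for pattern, replacement in CQL2HFR_TABLE:
        rule = re.sub(pattern, replacement, rule)
    return rule


if __name__ == "__main__":
    cql_rule = '[pos="NOUN"] [pos="SCONJ"] 2 = [pos="ADP"]'  # line 8 of cql.txt
    print(cql_rule_to_hfr(cql_rule))
    # -> ༺གཤིས=མཚ༻ ༺གཤིས=ཏཅ༻ 2 = ༺གཤིས=ཡཚ༻  (line 8 of _cql2hfr_cql.txt)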