├── .coveragerc ├── .github └── workflows │ ├── publish.yaml │ ├── python-package.yml │ └── test.yml ├── .gitignore ├── .readthedocs.yml ├── .travis.yml ├── CHANGELOG.md ├── LICENSE ├── README.md ├── _config.yml ├── botok ├── __init__.py ├── chunks │ ├── __init__.py │ ├── chunkframework.py │ ├── chunkframeworkbase.py │ └── chunks.py ├── config.py ├── modifytokens │ ├── __init__.py │ ├── adjusttokens.py │ ├── cqlmatcher.py │ ├── mergedagdra.py │ ├── mergingmatcher.py │ ├── replacingmatcher.py │ ├── splitaffixed.py │ ├── splittingmatcher.py │ ├── tokenmerge.py │ └── tokensplit.py ├── resources │ ├── README.md │ ├── SylComponents.json │ ├── bo_punct_position.csv │ ├── bo_uni_table.csv │ └── particles.tsv ├── text │ ├── __init__.py │ ├── format.py │ ├── modify.py │ ├── pipelinebase.py │ ├── preprocess.py │ ├── text.py │ └── tokenize.py ├── textunits │ ├── __init__.py │ ├── bostring.py │ ├── bosyl.py │ ├── charcategories.py │ └── sylcomponents.py ├── third_party │ ├── __init__.py │ ├── cqlparser.py │ ├── has_skrt_syl.py │ └── pynpl │ │ ├── __init__.py │ │ ├── cql.py │ │ └── fsa.py ├── tokenizers │ ├── __init__.py │ ├── chunktokenizer.py │ ├── paragraphtokenizer.py │ ├── sentencetokenizer.py │ ├── stacktokenizer.py │ ├── token.py │ ├── tokenize.py │ └── wordtokenizer.py ├── tries │ ├── __init__.py │ ├── basictrie.py │ └── trie.py ├── utils │ ├── __init__.py │ ├── expose_data.py │ ├── helpers.py │ ├── lenient_normalization.py │ └── unicode_normalization.py └── vars.py ├── docs ├── Makefile ├── README.md ├── old-docs │ ├── Behind BoTokenizer.ipynb │ ├── Preprocessing.ipynb │ ├── README.md │ ├── Using BoTokenizer.ipynb │ └── cql_readme.md ├── requirements-docs.txt └── source │ ├── acknowledgement.rst │ ├── architecture.rst │ ├── conf.py │ ├── custom-dialect-pack.rst │ ├── getting-started.rst │ ├── imgs │ └── botok_architecture.svg │ ├── index.rst │ └── main_classes │ └── configuration.rst ├── python-3.13.2-amd64.exe ├── requirements.txt ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── chunks │ ├── test_chunkframework.py │ ├── test_chunks.py │ └── test_chunktokenizer.py ├── conftest.py ├── data │ ├── empty_dialect_pack │ │ ├── adjustments │ │ │ └── .keep │ │ └── dictionary │ │ │ └── .keep │ └── trie_dialect_pack │ │ ├── adjustments │ │ ├── remove │ │ │ └── test.tsv │ │ ├── rules │ │ │ └── adjust_rules.tsv │ │ ├── words │ │ │ ├── test.tsv │ │ │ └── test_comma_sep.tsv │ │ └── words_skrt │ │ │ └── test.tsv │ │ └── dictionary │ │ └── words │ │ └── empty.tsv ├── modifytokens │ └── test_matchers.py ├── resources │ ├── rdr_rules.txt │ ├── test.txt │ ├── test_file_to_tokenize.txt │ └── test_file_to_tokenize_pybo.txt ├── test_bugs.py ├── test_config.py ├── text │ ├── test_text.py │ └── test_text_tokenize.py ├── textunits │ ├── test_bostring.py │ ├── test_bosyl.py │ └── test_sylcomponents.py ├── tokenizers │ ├── test_bugs_missing_tokens.py │ ├── test_sent_par_tokenizer.py │ ├── test_splitaffixed.py │ ├── test_stack_tokenizer.py │ ├── test_token.py │ ├── test_tokenize.py │ └── test_wordtokenizer.py └── tries │ ├── test_basictrie.py │ └── test_trie.py └── usage.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = botok 3 | omit = */tests/*,*/test_*,setup.py,botok/utils/expose_data.py,botok/utils/lenient_normalization.py,botok/utils/unicode_normalization.py 4 | 5 | [report] 6 | exclude_lines = 7 | pragma: no cover 8 | def __repr__ 9 | raise NotImplementedError 10 | if __name__ == .__main__.: 11 | pass 12 | raise ImportError 13 | except 
ImportError 14 | fail_under = 80 15 | show_missing = True 16 | -------------------------------------------------------------------------------- /.github/workflows/publish.yaml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | permissions: 9 | contents: write 10 | 11 | jobs: 12 | test: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | max-parallel: 4 16 | matrix: 17 | python-version: ["3.8", "3.9", "3.10", "3.11"] 18 | 19 | steps: 20 | - uses: actions/checkout@v1 21 | 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v4 24 | 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install -r requirements.txt 29 | pip install -e . 30 | - name: Run Test 31 | run: | 32 | pytest tests/ 33 | 34 | publish: 35 | needs: test 36 | runs-on: ubuntu-latest 37 | 38 | steps: 39 | - uses: actions/checkout@v2 40 | with: 41 | fetch-depth: 0 42 | 43 | - name: Python Semantic Release 44 | uses: relekang/python-semantic-release@v7.34.6 45 | with: 46 | github_token: ${{ secrets.GITHUB_TOKEN }} 47 | pypi_token: ${{ secrets.PYPI_TOKEN }} 48 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | python-version: ['3.10', '3.12'] 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | cache: 'pip' 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | python -m pip install flake8 pytest pytest-cov codecov 28 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 29 | pip install -e . 30 | - name: Lint with flake8 31 | run: | 32 | # stop the build if there are Python syntax errors or undefined names 33 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 34 | # exit-zero treats all errors as warnings 35 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 36 | - name: Test with pytest 37 | run: | 38 | pytest -xvs 39 | - name: Generate coverage report 40 | run: | 41 | pytest --cov=botok --cov-report=xml --cov-fail-under=80 42 | - name: Upload coverage to Codecov 43 | uses: codecov/codecov-action@v3 44 | with: 45 | file: ./coverage.xml 46 | fail_ci_if_error: false 47 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | test: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | max-parallel: 4 16 | matrix: 17 | python-version: ["3.8", "3.9", "3.10", "3.11"] 18 | 19 | steps: 20 | - uses: actions/checkout@v1 21 | 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v4 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install -r requirements.txt 31 | pip install -e . 32 | 33 | - name: Run Test 34 | run: | 35 | pytest -vv 36 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.9" 7 | 8 | python: 9 | install: 10 | - requirements: docs/requirements-docs.txt 11 | - method: pip 12 | path: . 13 | 14 | sphinx: 15 | configuration: docs/source/conf.py 16 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: trusty 2 | sudo: required 3 | 4 | before_install: 5 | language: python 6 | python: 7 | - '3.6' 8 | install: 9 | - pip3 install -r requirements.txt 10 | - pip3 show attrs 11 | - pip3 show pytest 12 | - pip3 install -U setuptools 13 | - python3 setup.py install 14 | - pip3 install coveralls 15 | before_script: 16 | - sleep 1 # this is just a placeholder 17 | script: 18 | - coverage run --source=botok -m pytest tests/ 19 | after_success: coveralls 20 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-architect -------------------------------------------------------------------------------- /botok/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from pathlib import Path 3 | 4 | from .chunks.chunkframework import ChunkFramework 5 | from .chunks.chunkframeworkbase import ChunkFrameworkBase 6 | from .chunks.chunks import Chunks, TokChunks 7 | from .config import Config 8 | from .modifytokens.adjusttokens import AdjustTokens 9 | from .modifytokens.cqlmatcher import CQLMatcher 10 | from .modifytokens.mergedagdra import MergeDagdra 11 | from .modifytokens.mergingmatcher import MergingMatcher 12 | from .modifytokens.replacingmatcher import ReplacingMatcher 13 | from .modifytokens.splitaffixed import split_affixed 14 | from .modifytokens.splittingmatcher import SplittingMatcher 15 | from .modifytokens.tokenmerge import TokenMerge 16 | from .modifytokens.tokensplit import TokenSplit 17 | from .text.pipelinebase import PipelineBase 18 | 
from .text.text import Text 19 | from .textunits.bostring import BoString 20 | from .textunits.bosyl import BoSyl 21 | from .textunits.sylcomponents import SylComponents 22 | from .third_party.cqlparser import Query, parse_cql_query, replace_token_attributes 23 | from .tokenizers.chunktokenizer import ChunkTokenizer 24 | from .tokenizers.paragraphtokenizer import paragraph_tokenizer 25 | from .tokenizers.sentencetokenizer import sentence_tokenizer 26 | from .tokenizers.stacktokenizer import tokenize_in_stacks 27 | from .tokenizers.token import Token 28 | from .tokenizers.tokenize import Tokenize 29 | from .tokenizers.wordtokenizer import WordTokenizer 30 | from .tries.basictrie import BasicTrie 31 | from .tries.trie import Trie 32 | from .utils.expose_data import expose_data 33 | from .utils.unicode_normalization import normalize_unicode 34 | 35 | # from .utils.get_data import get_data 36 | from .vars import * 37 | from .vars import __version__ 38 | -------------------------------------------------------------------------------- /botok/chunks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/botok/chunks/__init__.py -------------------------------------------------------------------------------- /botok/chunks/chunks.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from .chunkframework import ChunkFramework 3 | from ..vars import ChunkMarkers as c 4 | from ..vars import CharMarkers as a 5 | 6 | 7 | class Chunks(ChunkFramework): 8 | """ 9 | Produces chunks of the following types: bo, non-bo, punct and syl chunks 10 | 11 | Implements the following chunking pipeline: 12 | chunk "input_str" into BO / OTHER 13 | | chunk BO into PUNCT / BO 14 | | chunk BO into SYM / BO 15 | | chunk BO into NUM / BO 16 | | chunk BO into TEXT (syllables) 17 | | chunk OTHER into CJK / OTHER 18 | | chunk OTHER into LATIN / OTHER 19 | 20 | .. note:: Following Tibetan usage, it does not consider space as a punctuation mark. 21 | Spaces get attached to the chunk preceding them. 
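:Example: a minimal usage sketch; the exact content of ``chunks`` depends on
        ``ChunkFramework.get_chunked()`` and the ``ChunkMarkers`` constants in ``vars.py``,
        so the comments below are indicative rather than verified output.

            >>> from botok import Chunks, TokChunks
            >>> chunks = Chunks("བཀྲ་ཤིས་ tr བདེ་ལེགས།").make_chunks(indices=False)  # marker/substring chunks
            >>> syls = TokChunks("བཀྲ་ཤིས་").get_syls()  # cleaned syllables: ['བཀྲ', 'ཤིས'] (spaces and tseks stripped)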
22 | """ 23 | 24 | def __init__(self, string, ignore_chars=None): 25 | ChunkFramework.__init__(self, string, ignore_chars=ignore_chars) 26 | 27 | def make_chunks(self, indices=True, gen=False, space_as_punct=False): 28 | chunks = self.chunk_bo_chars() 29 | if space_as_punct: 30 | chunks = self.pipe_chunk( 31 | chunks, self.chunk_spaces, to_chunk_marker=c.BO.value, yes=c.PUNCT.value 32 | ) 33 | chunks = self.pipe_chunk( 34 | chunks, self.chunk_punct, to_chunk_marker=c.BO.value, yes=c.PUNCT.value 35 | ) 36 | chunks = self.pipe_chunk(chunks, self.chunk_symbol, c.BO.value, c.SYM.value) 37 | chunks = self.pipe_chunk(chunks, self.chunk_number, c.BO.value, c.NUM.value) 38 | if not space_as_punct: 39 | chunks = self.merge_skippable_punct( 40 | chunks 41 | ) # ensure we have correctly built syls 42 | chunks = self.pipe_chunk(chunks, self.syllabify, c.BO.value, c.TEXT.value) 43 | chunks = self.pipe_chunk(chunks, self.adjust_syls, c.TEXT.value, c.TEXT.value) 44 | chunks = self.pipe_chunk(chunks, self.chunk_cjk, c.OTHER.value, c.CJK.value) 45 | chunks = self.pipe_chunk(chunks, self.chunk_latin, c.OTHER.value, c.LATIN.value) 46 | if not space_as_punct: 47 | chunks = self.merge_skippable_punct(chunks) 48 | if not indices: 49 | return self.get_chunked(chunks, gen=gen) 50 | return chunks 51 | 52 | 53 | class TokChunks(Chunks): 54 | """ 55 | This class uses the chunks produced by ``Chunks`` to identify Tibetan syllables and clean them. 56 | Thus produces pre-processed Tibetan text that can be further processed. 57 | 58 | Every chunk produced by ``Chunks`` is wrapped into a tuple containing: 59 | - either None or a list containing the cleaned syllable 60 | (the indices to every non-space and non-tsek char in every syllable chunk) 61 | - the chunk itself 62 | 63 | """ 64 | 65 | def __init__(self, string, ignore_chars=None, space_as_punct=False): 66 | super().__init__(string, ignore_chars=ignore_chars) 67 | self.chunks = None 68 | self.space_as_punct = space_as_punct 69 | 70 | def serve_syls_to_trie(self): 71 | chunks = [] 72 | for chunk in self.make_chunks(space_as_punct=self.space_as_punct): 73 | if chunk[0] == c.TEXT: 74 | syl = self.__get_text_chars(chunk[1], chunk[1] + chunk[2]) 75 | chunks.append((syl, chunk)) 76 | else: 77 | chunks.append((None, chunk)) 78 | self.chunks = chunks 79 | 80 | def get_syls(self): 81 | syls = [] 82 | for chunk in self.make_chunks(space_as_punct=self.space_as_punct): 83 | if chunk[0] == c.TEXT: 84 | char_idxs = self.__get_text_chars(chunk[1], chunk[1] + chunk[2]) 85 | syls.append("".join([self.bs.string[i] for i in char_idxs])) 86 | return syls 87 | 88 | def __get_text_chars(self, start_idx, end_idx): 89 | """ 90 | Removes all the spaces and tseks from a given syllable by only keeping the characters that 91 | pass ``__is_syl_text()``. 92 | 93 | :param start_idx: starting index of the syllable-chunk to clean 94 | :param end_idx: its ending index 95 | :type start_idx: int 96 | :type end_idx: int 97 | :return: a list of indices corresponding to the chars of the cleaned syllable 98 | """ 99 | return [i for i in range(start_idx, end_idx) if self.__is_syl_text(i)] 100 | 101 | def __is_syl_text(self, char_idx): 102 | """ 103 | Tests whether the character at the given index is part of the cleaned syllable or not. 
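Concretely, every character is kept except tsek marks and transparent
        (ignored) characters; Sanskrit long vowels are explicitly kept as part
        of the syllable.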
104 | """ 105 | return ( 106 | self.bs.base_structure[char_idx] != a.TSEK 107 | and self.bs.base_structure[char_idx] != a.TRANSPARENT 108 | and self.bs.base_structure[char_idx] != a.SKRT_LONG_VOW 109 | ) or self.bs.base_structure[char_idx] == a.SKRT_LONG_VOW 110 | -------------------------------------------------------------------------------- /botok/config.py: -------------------------------------------------------------------------------- 1 | import io 2 | import zipfile 3 | from collections import defaultdict 4 | from pathlib import Path 5 | 6 | import requests 7 | 8 | # Defaults 9 | DEFAULT_BASE_PATH = Path.home() / "Documents" / "pybo" / "dialect_packs" 10 | DEFAULT_DIALECT_PACK = "general" 11 | 12 | 13 | def get_dialect_pack_url(dialect_name, version=None): 14 | # Try 50 times 15 | attempts = 0 16 | 17 | while not version and attempts < 50: 18 | try: 19 | response = requests.get( 20 | "https://api.github.com/repos/Esukhia/botok-data/releases/latest", 21 | timeout=50 22 | ) 23 | version = response.json()["tag_name"] 24 | except (requests.RequestException, KeyError): 25 | pass 26 | 27 | attempts += 1 28 | 29 | return f"https://github.com/Esukhia/botok-data/releases/download/{version}/{dialect_name}.zip" 30 | 31 | 32 | def get_dialect_pack(dialect_name, out_dir, version=None): 33 | out_dir = Path(out_dir) 34 | out_dir.mkdir(exist_ok=True, parents=True) 35 | dialect_pack_path = out_dir / dialect_name 36 | if dialect_pack_path.is_dir(): 37 | return dialect_pack_path 38 | 39 | print(f"[INFO] Downloading {dialect_name} dialect pack ...") 40 | # Download the dialect pack 41 | url = get_dialect_pack_url(dialect_name, version) 42 | r = requests.get(url, stream=True, timeout=50) 43 | 44 | # attempt 50 times to download the zip 45 | check = zipfile.is_zipfile(io.BytesIO(r.content)) 46 | attempts = 0 47 | while not check and attempts < 50: 48 | r = requests.get(url, stream=True, timeout=50) 49 | check = zipfile.is_zipfile(io.BytesIO(r.content)) 50 | attempts += 1 51 | 52 | if not check: 53 | raise IOError("the .zip file couldn't be downloaded.") 54 | 55 | # extract the zip in the current folder 56 | with zipfile.ZipFile(io.BytesIO(r.content)) as z: 57 | z.extractall(path=str(out_dir)) 58 | 59 | print("[INFO] Download completed!") 60 | 61 | return dialect_pack_path 62 | 63 | 64 | class Config: 65 | """botok config for Tibetan dialect pack. 66 | 67 | Each dialect pack has two components: 68 | 1. Dictionary: 69 | - contains all the data required to construct the Trie. 70 | - It should in the directory called `dictionary` inside the dialect pack directory. 71 | 2. Adjustment: 72 | - Contains all the data required to adjust the text segmentation rules. 
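A minimal usage sketch: the first call downloads the default "general" pack from
    the Esukhia/botok-data GitHub releases into ``~/Documents/pybo/dialect_packs``.
    The "words" key below assumes the conventional sub-directory name under
    ``dictionary/`` (as in the test dialect packs) and may differ in a given pack::

        config = Config()                          # default "general" dialect pack
        assert config.profile == "general"
        word_files = config.dictionary["words"]    # list of Path objects to .tsv files
        config.add_dialect_pack(Path("my_pack"))   # merge a custom pack with the same layout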
73 | """ 74 | 75 | def __init__(self, dialect_name=None, base_path=None): 76 | """Create config for given `dialect_name` and stored in `base_path`""" 77 | if not dialect_name: 78 | dialect_name = DEFAULT_DIALECT_PACK 79 | if not base_path: 80 | base_path = DEFAULT_BASE_PATH 81 | dialect_pack_path = get_dialect_pack(dialect_name, base_path) 82 | self.reset(dialect_pack_path) 83 | 84 | def reset(self, dialect_pack_path=None): 85 | """Reset the config to default bo_general_pack.""" 86 | if dialect_pack_path: 87 | self.dialect_pack_path = dialect_pack_path 88 | else: 89 | self.dialect_pack_path = get_dialect_pack( 90 | DEFAULT_DIALECT_PACK, DEFAULT_BASE_PATH 91 | ) 92 | self.dictionary = self._get_pack_component("dictionary") 93 | self.adjustments = self._get_pack_component("adjustments") 94 | 95 | def _get_pack_component(self, pack_component_name, pack_component=None): 96 | """Return all the data_paths of the `pack_component. 97 | 98 | data_paths stored in python `dict` as per the directory 99 | structure of the pack component. 100 | """ 101 | if not pack_component: 102 | pack_component = defaultdict(list) 103 | for path in (self.dialect_pack_path / pack_component_name).iterdir(): 104 | if not path.is_dir(): 105 | continue 106 | data_type = path.name 107 | pack_component[data_type].extend(list(path.rglob("*.tsv"))) 108 | return pack_component 109 | 110 | @classmethod 111 | def from_path(cls, dialect_pack_path): 112 | """Creates config from ``dialect_pack_path``. 113 | 114 | Returns: 115 | :class: `Config`: An instance of a Configuration object 116 | 117 | Examples:: 118 | 119 | config = Config.from_path(path_to_dialect_pack) 120 | assert config.dictionary == True 121 | assert config.adjustments == True 122 | 123 | """ 124 | path = Path(dialect_pack_path) 125 | dialect_name = path.name 126 | base_path = path.parent 127 | return cls(dialect_name, base_path) 128 | 129 | @property 130 | def profile(self): 131 | """Returns profile name of the dialect_pack.""" 132 | return self.dialect_pack_path.name 133 | 134 | def add_dialect_pack(self, path): 135 | """"Merge given dialect_pack at `path` to current dialect_pack.""" 136 | self.dialect_pack_path = path 137 | self._get_pack_component("dictionary", self.dictionary) 138 | self._get_pack_component("adjustments", self.adjustments) 139 | -------------------------------------------------------------------------------- /botok/modifytokens/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/botok/modifytokens/__init__.py -------------------------------------------------------------------------------- /botok/modifytokens/adjusttokens.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import csv 3 | import re 4 | 5 | from .splittingmatcher import SplittingMatcher 6 | from .mergingmatcher import MergingMatcher 7 | from .replacingmatcher import ReplacingMatcher 8 | from ..utils.helpers import decomment_file 9 | 10 | 11 | class AdjustTokens: 12 | """ 13 | Syntax for the .tsv adjustment rules 14 | =================================== 15 | - each rule should be as follows: "\t\t\t" 16 | - comments with # and empty lines are allowed 17 | - CQL rules: "" can be used without specifying that there is "text_cleaned=" 18 | - Index format: either "" or "-" 19 | - Adjustment format: 20 | - "+" for merge 21 | - ":" for split (default: syllable mode) 22 | - "::" for split in character 
mode 23 | - "=" for replace 24 | - Constraint: "-" is only allowed if adjustment is ":" or "::" 25 | """ 26 | 27 | def __init__(self, main=None, custom=None): 28 | self.paths = [] 29 | if custom: 30 | self.paths.extend(custom) 31 | elif main: 32 | self.paths.extend(main) 33 | self.rules = [] 34 | self.parse_rules() 35 | 36 | def no_token_matched(self, matchcql): 37 | matched_tokens = [token for token in re.split(r'(\[.+?\])', matchcql) if token != " " and token != ""] 38 | return len(matched_tokens) 39 | 40 | def adjust(self, token_list): 41 | for rule in self.rules: 42 | if rule["operation"] == "split": 43 | if rule["matchidx"] <= self.no_token_matched(rule['matchcql']): 44 | sm = SplittingMatcher( 45 | rule["matchcql"], 46 | rule["matchidx"], 47 | rule["splitidx"], 48 | token_list, 49 | rule["replacecql"], 50 | ) 51 | token_list = sm.split_on_matches(mode=rule["splitmode"]) 52 | else: 53 | print(f'[ERROR]: No token to spilt with token number {rule["matchidx"]} found in rule {" ".join(rule)}') 54 | elif rule["operation"] == "merge": 55 | if rule["matchidx"] < self.no_token_matched(rule['matchcql']): 56 | mm = MergingMatcher( 57 | rule["matchcql"], rule["matchidx"], token_list, rule["replacecql"] 58 | ) 59 | token_list = mm.merge_on_matches() 60 | else: 61 | print(f'[ERROR]: No token to merge with token number {rule["matchidx"]} found in rule {" ".join(rule)}') 62 | elif rule["operation"] == "repl": 63 | rm = ReplacingMatcher( 64 | rule["matchcql"], rule["matchidx"], token_list, rule["replacecql"] 65 | ) 66 | rm.replace_on_matches() 67 | return token_list 68 | 69 | def parse_rules(self): 70 | """ 71 | Files are sorted before being applied. Thus, filenames 72 | :return: 73 | """ 74 | for rule_file in sorted(self.paths): 75 | for rule in csv.reader( 76 | decomment_file(rule_file.open(encoding="utf-8-sig")), delimiter="\t" 77 | ): 78 | self.rules.append(self.parse_rule(rule)) 79 | 80 | @staticmethod 81 | def parse_rule(rule): 82 | idx_sep = "-" 83 | 84 | # sanity checks 85 | if len(rule) != 4: 86 | raise SyntaxError("There can't be more than three columns per rule.") 87 | if not rule[1]: 88 | raise SyntaxError("There needs to be an index for every rule.") 89 | if idx_sep in rule[1] and rule[2] not in [":", "::"]: 90 | raise SyntaxError( 91 | "The double index in only intended for split adjustments." 92 | ) 93 | if rule[2] not in ["+", "=", ":", "::"]: 94 | raise SyntaxError( 95 | 'The supported operations are either of ["+", "=", ":", "::"].' 
96 | ) 97 | 98 | # parse 99 | rule_dict = { 100 | "matchcql": None, 101 | "matchidx": None, 102 | "operation": None, 103 | "splitidx": None, 104 | "splitmode": None, 105 | "replacecql": None, 106 | } 107 | rule_dict["matchcql"] = rule[0] 108 | if idx_sep in rule[1]: 109 | match_idx, split_idx = rule[1].split("-") 110 | rule_dict["matchidx"] = int(match_idx) 111 | rule_dict["splitidx"] = int(split_idx) 112 | else: 113 | rule_dict["matchidx"] = int(rule[1]) 114 | if rule[2] == "=": 115 | rule_dict["operation"] = "repl" 116 | elif rule[2] == "+": 117 | rule_dict["operation"] = "merge" 118 | elif rule[2] == ":": 119 | rule_dict["operation"] = "split" 120 | rule_dict["splitmode"] = "syl" 121 | elif rule[2] == "::": 122 | rule_dict["operation"] = "split" 123 | rule_dict["splitmode"] = "char" 124 | rule_dict["replacecql"] = rule[3] 125 | 126 | return rule_dict 127 | -------------------------------------------------------------------------------- /botok/modifytokens/cqlmatcher.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from ..third_party.cqlparser import Query 3 | 4 | 5 | class CQLMatcher: 6 | def __init__(self, query): 7 | """ 8 | Creates a matcher object to be later executed against a list of tokens with BoMatcher.match() 9 | 10 | :param query: CQL compliant query string 11 | :type query: string 12 | 13 | """ 14 | self.query = Query(query) 15 | 16 | def match(self, tokens_list): 17 | """ 18 | Runs cql.Query on a slice of the list of tokens for every index in the list. 19 | 20 | :param tokens_list: output of BoTokenizer 21 | :type tokens_list: list of Token objects 22 | :return: a list of matching slices of tokens_list 23 | :rtype: list of tuples with each two values: beginning and end indices 24 | """ 25 | slice_len = len(self.query.tokenexprs) - 1 26 | matches = [] 27 | for i in range(len(tokens_list)): 28 | if i + slice_len <= len(tokens_list) and self.query( 29 | tokens_list[i : i + slice_len + 1] 30 | ): 31 | matches.append((i, i + slice_len)) 32 | return matches 33 | -------------------------------------------------------------------------------- /botok/modifytokens/mergedagdra.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from .tokenmerge import TokenMerge 3 | from ..vars import TSEK, DAGDRA 4 | 5 | 6 | class MergeDagdra: 7 | """ 8 | A class to merge pa/po/ba/bo tokens in a token list produced by BoTokenizer 9 | 10 | """ 11 | 12 | def __init__(self): 13 | pass 14 | 15 | def merge(self, tokens): 16 | """ 17 | Merges the tokens containing either pa/po/ba/bo 18 | 19 | :param tokens: list of Token objects 20 | """ 21 | if len(tokens) <= 1: 22 | pass 23 | elif len(tokens) == 2: 24 | token0, token1 = tokens 25 | if token1.text_cleaned in DAGDRA: 26 | # split token containing the affixed particle 27 | merged = self.merge_with_previous_token(token0, token1) 28 | del tokens[1] 29 | tokens[0] = merged 30 | else: 31 | t = 0 32 | while t <= len(tokens) - 1: 33 | if t + 1 > len(tokens) - 1: 34 | break 35 | token0, token1 = tokens[t], tokens[t + 1] 36 | clean_word = ( 37 | token1.text_cleaned + TSEK 38 | if not token1.text_cleaned.endswith(TSEK) 39 | else token1.text_cleaned 40 | ) 41 | if ( 42 | token0.chunk_type == "TEXT" 43 | and token1.chunk_type == "TEXT" 44 | and clean_word in DAGDRA 45 | ): 46 | # split token containing the affixed particle 47 | merged = self.merge_with_previous_token(token0, token1) 48 | 49 | # replace the original token with the two new ones 50 | 
tokens[t : t + 2] = [merged] 51 | t += 1 52 | 53 | def merge_with_previous_token(self, token0, token1): 54 | merged = TokenMerge(token0, token1).merge() 55 | merged.has_merged_dagdra = True 56 | merged.lemma = merged.text_cleaned 57 | return merged 58 | -------------------------------------------------------------------------------- /botok/modifytokens/mergingmatcher.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from ..third_party.cqlparser import Query 3 | from .tokenmerge import TokenMerge 4 | 5 | 6 | class MergingMatcher: 7 | def __init__(self, query, replace_idx, token_list, token_changes=None): 8 | self.matcher = Query(query) 9 | self.span = len(self.matcher.tokenexprs) - 1 10 | self.token_list = token_list 11 | 12 | self.replace_idx = replace_idx - 1 13 | self.token_changes = token_changes 14 | 15 | def merge_on_matches(self): 16 | merged_list = [] 17 | i = 0 18 | while i < len(self.token_list): 19 | if self.__matches(i): 20 | # find the index of the token to split 21 | idx = i + self.replace_idx 22 | 23 | # add new tokens that precede the one to split 24 | for r in range(i, idx): 25 | merged_list.append(self.token_list[r]) 26 | i += 1 27 | 28 | # split the token and add them to the new list 29 | merged_list.append( 30 | self.__merge(self.token_list[idx], self.token_list[idx + 1]) 31 | ) 32 | i += 1 33 | else: 34 | merged_list.append(self.token_list[i]) 35 | 36 | i += 1 37 | 38 | return merged_list 39 | 40 | def __matches(self, i): 41 | return i + self.span <= len(self.token_list) and self.matcher( 42 | self.token_list[i : i + self.span + 1] 43 | ) 44 | 45 | def __merge(self, token1, token2): 46 | ts = TokenMerge(token1, token2, self.token_changes) 47 | return ts.merge() 48 | -------------------------------------------------------------------------------- /botok/modifytokens/replacingmatcher.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from ..third_party.cqlparser import Query, replace_token_attributes 3 | 4 | 5 | class ReplacingMatcher: 6 | def __init__(self, query, replace_idx, token_list, token_changes=None): 7 | self.matcher = Query(query) 8 | self.span = len(self.matcher.tokenexprs) - 1 9 | self.replace_idx = replace_idx - 1 10 | self.token_list = token_list 11 | self.token_changes = token_changes 12 | 13 | def replace_on_matches(self): 14 | i = 0 15 | while i < len(self.token_list): 16 | if self.__matches(i): 17 | # find the index of the token to split 18 | idx = i + self.replace_idx 19 | replace_token_attributes(self.token_list[idx], self.token_changes) 20 | i += 1 21 | 22 | def __matches(self, i): 23 | return i + self.span <= len(self.token_list) and self.matcher( 24 | self.token_list[i : i + self.span + 1] 25 | ) 26 | -------------------------------------------------------------------------------- /botok/modifytokens/splitaffixed.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from .tokensplit import TokenSplit 3 | 4 | 5 | def split_affixed(tokens): 6 | """ 7 | Splits in place the tokens containing affixed particles 8 | tokens have to be Token objects produced by BoTokenizer.Tokenizer 9 | 10 | :param tokens: list of Token objects 11 | """ 12 | t = 0 13 | while t <= len(tokens) - 1: 14 | # check that splitting is possible (affixation attribute exists) 15 | # and that there is no meaning that has "affixed: False". 
16 | # ie, check that the inflected form can't be the affixed form of a word and the unaffixed form of another word 17 | if tokens[t].affixation and not [ 18 | True for m in tokens[t].senses if "affixed" in m and not m["affixed"] 19 | ]: 20 | # split token containing the affixed particle 21 | split_idx = tokens[t].syls_idx[-1][-tokens[t].affixation["len"]] 22 | changes = ( 23 | '[affix_host="True"] ' 24 | '[pos="PART" & affix="True" & skrt="False" & freq="None" & senses="None"]' 25 | ) 26 | ts = TokenSplit(tokens[t], split_idx, token_changes=changes) 27 | token1, token2 = ts.split() 28 | if token2.senses is None: 29 | token2.senses = [] 30 | 31 | # replace the original token with the two new ones 32 | tokens[t : t + 1] = [token1, token2] 33 | 34 | t += 1 # increment once more to account for the newly split token 35 | t += 1 36 | -------------------------------------------------------------------------------- /botok/modifytokens/splittingmatcher.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from ..third_party.cqlparser import Query 3 | from .tokensplit import TokenSplit 4 | 5 | 6 | class SplittingMatcher: 7 | def __init__(self, query, replace_idx, split_idx, token_list, token_changes=None): 8 | self.matcher = Query(query) 9 | self.span = len(self.matcher.tokenexprs) - 1 10 | self.token_list = token_list 11 | 12 | self.replace_idx = replace_idx - 1 13 | self.split_idx = split_idx 14 | self.token_changes = token_changes 15 | 16 | def split_on_matches(self, mode="char"): 17 | """ 18 | :param mode: can either be "char" or "syl" 19 | """ 20 | split_list = [] 21 | 22 | i = 0 23 | while i < len(self.token_list): 24 | if self.__matches(i, self.token_list): 25 | # find the index of the token to split 26 | idx = i + self.replace_idx 27 | 28 | # add new tokens that precede the one to split 29 | for r in range(i, idx): 30 | split_list.append(self.token_list[r]) 31 | i += 1 32 | 33 | # split the token and add them to the new list 34 | split_list.extend(self.__split(self.token_list[idx], mode=mode)) 35 | 36 | else: 37 | split_list.append(self.token_list[i]) 38 | i += 1 39 | 40 | return split_list 41 | 42 | def __matches(self, i, token_list): 43 | return i + self.span <= len(token_list) and self.matcher( 44 | self.token_list[i : i + self.span + 1] 45 | ) 46 | 47 | def __split(self, token, mode): 48 | ts = TokenSplit(token, self.split_idx, self.token_changes) 49 | return ts.split(mode=mode) 50 | -------------------------------------------------------------------------------- /botok/modifytokens/tokenmerge.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import copy 3 | 4 | from ..third_party.cqlparser import replace_token_attributes 5 | 6 | 7 | class TokenMerge: 8 | """ 9 | 10 | """ 11 | 12 | def __init__(self, token1, token2, token_changes=None): 13 | self.token1 = token1 14 | self.token2 = token2 15 | self.merged = copy.deepcopy(token1) 16 | if not self.merged.syls_idx: 17 | self.merged.syls_idx = [] 18 | if not self.merged.syls: 19 | self.merged.syls = [] 20 | self.token_changes = token_changes 21 | 22 | def merge(self): 23 | self.merge_attrs() 24 | self.replace_attrs() 25 | return self.merged 26 | 27 | def replace_attrs(self): 28 | """ 29 | Replaces the content of attributes that were found in the cql query. 30 | If no query is provided, the values of the first token are kept. 
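For example, ``token_changes='[pos="NOUN"]'`` — following the CQL attribute
        syntax used elsewhere in botok (e.g. in ``splitaffixed.py``) — would overwrite
        the merged token's ``pos``; the value shown is purely illustrative.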
31 | """ 32 | if self.token_changes: 33 | replace_token_attributes(self.merged, self.token_changes) 34 | 35 | def merge_attrs(self): 36 | self.__merge_texts() 37 | self.__merge_indices() 38 | self.__merge_syls_idx() 39 | self.__merge_syls_start_end() 40 | self.__del_lemma() 41 | 42 | def __merge_texts(self): 43 | self.merged.text += self.token2.text 44 | 45 | def __merge_indices(self): 46 | self.merged.len += self.token2.len 47 | 48 | def __merge_syls_start_end(self): 49 | # token1 is a host syllable and token2 its affixed syllable 50 | if ( 51 | not self.merged.syls_start_end 52 | or not self.token1.syls_start_end 53 | or not self.token2.syls_start_end 54 | ): 55 | return 56 | 57 | if ( 58 | self.token1.affix_host 59 | and not self.token1.affix 60 | and not self.token2.affix_host 61 | and self.token2.affix 62 | ): 63 | self.merged.syls_start_end[-1]["end"] = self.token2.syls_start_end[0]["end"] 64 | self.merged.syls_start_end.extend(self.token2.syls_start_end[1:]) 65 | else: 66 | self.merged.syls_start_end.extend(self.token2.syls_start_end) 67 | 68 | def __merge_syls_idx(self): 69 | """ 70 | Updates indices and add the syls to the merged object 71 | Re-joins the host-syllable and affixed particle syllables into a single one; 72 | then, affix is True and affixed also, so cleaned_content gets its tsek. 73 | """ 74 | first_syl = True 75 | if self.token2.syls_idx: 76 | for syl in self.token2.syls_idx: 77 | if syl: 78 | new_syl = [i + self.token1.len for i in syl] 79 | 80 | # token1 is a host syllable and token2 its affixed syllable 81 | if ( 82 | first_syl 83 | and (self.token1.affix_host and not self.token1.affix) 84 | and (not self.token2.affix_host and self.token2.affix) 85 | ): 86 | self.merged.syls_idx[-1] += new_syl 87 | self.merged.affix = True 88 | first_syl = False 89 | else: 90 | self.merged.syls_idx.append(new_syl) 91 | 92 | def __del_lemma(self): 93 | """ 94 | Simply deletes any lemma in merged since the lemma of the merged token can't be guessed. 95 | """ 96 | if self.token1["lemma"]: 97 | self.merged.lemma = None 98 | -------------------------------------------------------------------------------- /botok/modifytokens/tokensplit.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import copy 3 | 4 | from ..third_party.cqlparser import replace_token_attributes 5 | 6 | 7 | class TokenSplit: 8 | """ 9 | Takes a token object and divide it into two using an index of the content. 10 | 11 | The affected attributes are: 12 | - token.content : the string is split at the index 13 | - token.char_groups : the dict items are redistributed 14 | - token.start : second token only. now equals "start + index" 15 | - token.len : length of new content 16 | - token.syls : syls are redistributed and split if necessary 17 | 18 | """ 19 | 20 | def __init__(self, token, split_idx, token_changes=None): 21 | self.token = token 22 | self.first = None 23 | self.second = None 24 | self.token_changes = token_changes 25 | self.idx = split_idx 26 | 27 | def split(self, mode="char"): 28 | """ 29 | :param mode: can either be "syl" or "char" to split on a syllable index or a character index. 30 | """ 31 | if mode != "char" and mode != "syl": 32 | raise SyntaxError("splitting mode should either be 'syl' or 'char'. ") 33 | 34 | # in syllable-mode, if there is only one syllable, return the word without splitting it. 
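# note: in "syl" mode, split_on_idx() converts the 1-based syllable index into a
        # character index via syls_start_end before performing the split.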
35 | if mode == "syl" and len(self.token.syls) == 1: 36 | return [self.token] 37 | 38 | self.split_on_idx(mode=mode) 39 | self.replace_attrs() 40 | 41 | return [self.first, self.second] 42 | 43 | def replace_attrs(self): 44 | if self.token_changes: 45 | tokens = [self.first, self.second] 46 | replace_token_attributes(tokens, self.token_changes) 47 | self.first, self.second = tokens 48 | 49 | def split_on_idx(self, mode): 50 | self.first = copy.deepcopy(self.token) 51 | self.second = copy.deepcopy(self.token) 52 | 53 | if mode == "syl": 54 | self.idx = self.token.syls_start_end[self.idx - 1]["end"] 55 | 56 | self.__split_contents() 57 | self.__split_indices() 58 | self.__split_syls_idx() 59 | self.__split_syls_start_end(mode) 60 | self.__split_char_types() 61 | self.__split_affixation() 62 | 63 | def __split_contents(self): 64 | text = self.first.text 65 | self.first.text = text[0 : self.idx] 66 | self.second.text = text[self.idx :] 67 | 68 | def __split_char_types(self): 69 | char_types = self.first.char_types 70 | self.first.char_types = char_types[: self.idx] 71 | self.second.char_types = char_types[self.idx :] 72 | 73 | def __split_indices(self): 74 | self.first.len = len(self.first.text) 75 | self.second.len = len(self.second.text) 76 | self.second.start = self.second.start + self.idx 77 | 78 | def __split_syls_start_end(self, mode): 79 | if not self.token.syls_start_end: 80 | return 81 | 82 | to_split_idx = 0 83 | for num, s in enumerate(self.token.syls_start_end): 84 | if s["start"] <= self.idx <= s["end"]: 85 | to_split_idx = num 86 | break # ensure to exit on first match 87 | 88 | start = self.token.syls_start_end[:to_split_idx] 89 | end = self.token.syls_start_end[to_split_idx + 1 :] 90 | to_split = self.token.syls_start_end[to_split_idx] 91 | 92 | if mode == "char": 93 | start.append({"start": to_split["start"], "end": self.idx}) 94 | end.append({"start": self.idx, "end": to_split["end"]}) 95 | 96 | if mode == "syl": 97 | start.append(to_split) 98 | 99 | self.first.syls_start_end = start 100 | self.second.syls_start_end = end 101 | 102 | def __split_syls_idx(self): 103 | syls = self.first.syls_idx 104 | # empty syls 105 | self.first.syls_idx = [] 106 | self.second.syls_idx = [] 107 | 108 | if syls: 109 | for syl in syls: 110 | if syl[-1] < self.idx: 111 | self.first.syls_idx.append(syl) 112 | 113 | else: 114 | # separate the syl in two 115 | part1, part2 = [], [] 116 | for i in syl: 117 | if i < self.idx: 118 | part1.append(i) 119 | else: 120 | part2.append(i - self.idx) 121 | 122 | # add them if non-empty 123 | if part1: 124 | self.first.syls_idx.append(part1) 125 | if part2: 126 | self.second.syls_idx.append(part2) 127 | 128 | def __split_affixation(self): 129 | if self.token.affixation: 130 | self.first.affixation.pop("len", '') 131 | self.first.affixation.pop("type", '') 132 | self.second.affixation.pop("aa", '') 133 | -------------------------------------------------------------------------------- /botok/resources/README.md: -------------------------------------------------------------------------------- 1 | # Resource files 2 | 3 | This document lists the files used by pybo, their format, origin and usage. Unless indicated otherwise they are in the public domain. 4 | 5 | ### SylComponents.json 6 | 7 | This file has been compiled by hand based on the list presented in *TODO: ref. to HL article* 8 | 9 | It is used to check if a syllable is correct according to Classical Tibetan norms, and find the root letter. 
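For illustration, a minimal sketch of how this file is consumed through the `SylComponents` class that botok exposes. The constructor and the method names `get_parts` and `get_info` are assumptions about the current API in `botok/textunits/sylcomponents.py` and may differ:

```python
from botok import SylComponents

sc = SylComponents()           # assumed to load SylComponents.json when instantiated
parts = sc.get_parts("བཀྲས")    # assumed: decompose a syllable into its structural parts
info = sc.get_info("བཀྲས")      # assumed: classify the syllable (well-formed or not, etc.)
```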
10 | 11 | ### frequency/mgd.txt 12 | 13 | XXX 14 | 15 | ### frequency/tc.txt 16 | 17 | XXX 18 | 19 | ### lemmas/particles.yaml 20 | 21 | XXX 22 | 23 | ### trie/ancient.txt , trie/exceptions.txt 24 | 25 | These files come from [tibetan-spellchecker](https://github.com/eroux/tibetan-spellchecker) and indicate exceptions to Classical Tibetan norms. 26 | 27 | ### trie/particles.txt 28 | 29 | This is a list of all particles, compiled by hand, with the `PART` POS tag. 30 | 31 | ### trie/Tibetan.DICT 32 | 33 | This file has been extracted from: 34 | 35 | Meelen, Marieke, Hill, Nathan, & Handy, Christopher. (2017). The Annotated Corpus of Classical Tibetan (ACTib), Part II - POS-tagged version, based on the BDRC digitised text collection, tagged with the Memory-Based Tagger from TiMBL [Data set]. Zenodo. [http://doi.org/10.5281/zenodo.822537](https://doi.org/10.5281/zenodo.822537) 36 | 37 | It is available under the [CC-BY 4.0 license](https://creativecommons.org/licenses/by/4.0/). 38 | 39 | ### trie/tsikchen.txt 40 | 41 | This file has been extracted from a digitized version of: 42 | 43 | [Yisun, Zhang. 1985. བོད་རྒྱ་ཚིག་མཛོད་ཆེན་མོ།. Beijing: མི་རིགས་དཔེ་སྐྲུན་ཁང་།](http://tbrc.org/link?RID=W29329) 44 | 45 | Although the book is under copyright, we consider that the bare list of words we provide is not. 46 | 47 | ### trie/mgd.txt 48 | 49 | XXX 50 | 51 | 52 | ### trie/recordings_4.txt , trie/oral_corpus_0.txt , trie/oral_corpus_1.txt , trie/oral_corpus_2.txt , trie/oral_corpus_3.txt 53 | 54 | XXX 55 | -------------------------------------------------------------------------------- /botok/resources/bo_punct_position.csv: -------------------------------------------------------------------------------- 1 | Char,unicode repr,punct_position 2 | 0F01,—༁—,opening_punct 3 | 0F02,—༂—,opening_punct 4 | 0F03,—༃—,opening_punct 5 | 0F04,—༄—,opening_punct 6 | 0F05,—༅—,opening_punct 7 | 0F06,—༆—,opening_punct 8 | 0F07,—༇—,opening_punct 9 | 0F08,—༈—,opening_punct 10 | 0F09,—༉—,opening_punct 11 | 0F0A,—༊—,opening_punct 12 | 0F0D,—།—,closing_punct 13 | 0F0E,—༎—,closing_punct 14 | 0F0F,—༏—,closing_punct 15 | 0F10,—༐—,closing_punct 16 | 0F11,—༑—,opening_punct 17 | 0F12,—༒—,opening_punct 18 | 0F14,—༔—,closing_punct 19 | 0F34,—༴—,closing_punct 20 | 0F3A,—༺—,opening_punct 21 | 0F3B,—༻—,closing_punct 22 | 0F3C,—༼—,opening_punct 23 | 0F3D,—༽—,closing_punct 24 | 0F3E,—༾—,closing_punct 25 | 0F3F,—༿—,opening_punct 26 | 0FD0,—࿐—,opening_punct 27 | 0FD1,—࿑—,opening_punct 28 | 0FD3,—࿓—,opening_punct 29 | 0FD4,—࿔—,opening_punct 30 | 0FD9,—࿙—,opening_punct 31 | 0FDA,—࿚—,closing_punct 32 | -------------------------------------------------------------------------------- /botok/resources/particles.tsv: -------------------------------------------------------------------------------- 1 | # form pos lemma sense freq 2 | གི PART གི 3 | ཀྱི PART གི 4 | གྱི PART གི 5 | འི PART གི 6 | ཡི PART གི 7 | གིས PART གིས 8 | ཀྱིས PART གིས 9 | གྱིས PART གིས 10 | ཡིས PART གིས 11 | ས PART གིས 12 | སུ PART ལ 13 | ར PART ལ 14 | རུ PART ལ 15 | ཏུ PART ལ 16 | ན PART ལ 17 | ལ PART ལ 18 | དུ PART ལ 19 | སྟེ PART སྟེ 20 | ཏེ PART སྟེ 21 | དེ PART སྟེ 22 | ཀྱང PART ཀྱང 23 | ཡང PART ཀྱང 24 | འང PART ཀྱང 25 | གམ PART གམ 26 | ངམ PART གམ 27 | དམ PART གམ 28 | ནམ PART གམ 29 | བམ PART གམ 30 | མམ PART གམ 31 | འམ PART གམ 32 | རམ PART གམ 33 | ལམ PART གམ 34 | སམ PART གམ 35 | ཏམ PART གམ 36 | པ PART པ 37 | བ PART པ 38 | པོ PART པོ 39 | བོ PART པོ 40 | གོ PART གོ 41 | ངོ PART གོ 42 | དོ PART གོ 43 | ནོ PART གོ 44 | བོ PART གོ 45 | མོ PART གོ 46 | འོ PART གོ 47 | རོ PART 
གོ 48 | ལོ PART གོ 49 | སོ PART གོ 50 | ཏོ PART གོ 51 | ཅིང PART ཅིང 52 | ཤིང PART ཅིང 53 | ཞིང PART ཅིང 54 | ཅེས PART ཅེས 55 | ཞེས PART ཅེས 56 | ཅེའོ PART ཅེའོ 57 | ཤེའོ PART ཅེའོ 58 | ཞེའོ PART ཅེའོ 59 | ཅེ་ན PART ཅེ་ན 60 | ཤེ་ན PART ཅེ་ན 61 | ཞེ་ན PART ཅེ་ན 62 | ཅིག PART ཅིག 63 | ཤིག PART ཅིག 64 | ཞིག PART ཅིག 65 | ཀྱིན PART གིན 66 | གིན PART གིན 67 | གྱིན PART གིན 68 | ནས PART ནས 69 | -------------------------------------------------------------------------------- /botok/text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/botok/text/__init__.py -------------------------------------------------------------------------------- /botok/text/format.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from typing import DefaultDict, List, Tuple 3 | 4 | from .tokenize import BoToken 5 | 6 | 7 | def plaintext(tokens: List[str], sep=" ") -> str: 8 | tokens = [t.replace(" ", "_") for t in tokens] 9 | return sep.join(tokens) 10 | 11 | 12 | def plaintext_sent_par(units: List[Tuple[int, List[BoToken]]], sep="\n") -> str: 13 | out = [] 14 | for u in units: 15 | unit = "".join([word.text for word in u['tokens']]) 16 | out.append(unit) 17 | return sep.join(out) 18 | 19 | 20 | def basic_conc(concs: DefaultDict[str, List[str]], sep="\t", esc_context=True) -> str: 21 | out = [] 22 | for occ, LR in concs.items(): 23 | for left, right in LR: 24 | 25 | if esc_context: 26 | left, right = f'"{left}"', f'"{right}"' 27 | 28 | line = f"{left}{sep}{occ}{sep}{right}" 29 | out.append(line) 30 | 31 | return "\n".join(out) 32 | 33 | 34 | def stats_types(total_mistakes: DefaultDict[str, int], sep="\t") -> str: 35 | total = [(mis, freq) for mis, freq in total_mistakes.items()] 36 | total = sorted(total, reverse=True, key=lambda x: x[1]) 37 | total = [f"{mis}{sep}{freq}" for mis, freq in total] 38 | return "\n".join(total) 39 | -------------------------------------------------------------------------------- /botok/text/modify.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from collections import defaultdict 3 | from typing import DefaultDict, List, NewType, Tuple 4 | 5 | from ..third_party.has_skrt_syl import has_skrt_syl 6 | from .tokenize import BoToken 7 | 8 | 9 | def is_mistake(token): 10 | exceptions = ["\n"] 11 | if token.chunk_type == "TEXT" or ( 12 | token.chunk_type == "LATIN" 13 | or token.chunk_type == "CJK" 14 | or token.chunk_type == "OTHER" 15 | ): 16 | if ( 17 | (not token.skrt and not has_skrt_syl(token.text_cleaned)) 18 | and ( 19 | token.senses 20 | and len( 21 | [ 22 | True 23 | for m in token.senses 24 | if "pos" in m 25 | and (m["pos"] == "NO_POS" or m["pos"] == "NON_WORD") 26 | ] 27 | ) 28 | > 0 29 | or ( 30 | token.chunk_type == "LATIN" 31 | or token.chunk_type == "CJK" 32 | or token.chunk_type == "OTHER" 33 | ) 34 | ) 35 | and token.text not in exceptions 36 | ): 37 | return True 38 | return False 39 | 40 | 41 | def words_error_concs( 42 | tokens: List[BoToken], left=5, right=5 43 | ) -> DefaultDict[str, List[str]]: 44 | mistakes = defaultdict(list) 45 | for num, t in enumerate(tokens): 46 | if is_mistake(t): 47 | if num - left < 0: 48 | l = tokens[:num] 49 | else: 50 | l = tokens[num - left : num] 51 | if num + right > len(tokens) - 1: 52 | r = tokens[num + 1 :] 53 | else: 54 | r = tokens[num + 1 : num + 1 + right] 55 | 56 | l_context = [t.text 
for t in l] 57 | r_context = [t.text for t in r] 58 | mis = t.text.replace("\n", "\\n") 59 | mistakes[mis].append(["".join(l_context), "".join(r_context)]) 60 | return mistakes 61 | 62 | 63 | def words_error_types(tokens: List[BoToken]) -> DefaultDict[str, int]: 64 | mistakes = defaultdict(int) 65 | for num, t in enumerate(tokens): 66 | if is_mistake(t): 67 | mis = t.text.replace("\n", "\\n") 68 | mistakes[mis] += 1 69 | return mistakes 70 | 71 | 72 | def words_raw_types(tokens: List[BoToken]) -> DefaultDict[str, int]: 73 | types = defaultdict(int) 74 | for t in tokens: 75 | occ = t.text.replace("\n", "\\n") 76 | types[occ] += 1 77 | return types 78 | 79 | 80 | def words_raw_text(tokens: List[BoToken]) -> List[str]: 81 | return [t.text for t in tokens] 82 | 83 | 84 | def chunks_raw_text(tokens: List[Tuple[str, str]]) -> List[str]: 85 | return [chunk for _, chunk in tokens] 86 | -------------------------------------------------------------------------------- /botok/text/pipelinebase.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from pathlib import Path 3 | 4 | 5 | class PipelineBase: 6 | def __init__(self, profile, pipes=None): 7 | self.pipes = pipes 8 | 9 | self.prep = None 10 | self.tok = None 11 | self.mod = None 12 | self.form = None 13 | 14 | self.left = 5 15 | self.right = 5 16 | self.tok_params = None 17 | self.filename = ( 18 | None # for an advanced mode, to show what conc comes from which file 19 | ) 20 | 21 | self.args_list = { 22 | "prep", 23 | "tok", 24 | "mod", 25 | "form", # components 26 | "tok_params", # pybo 27 | "left", 28 | "right", # concs 29 | "filename", 30 | } # others 31 | 32 | self.parse_profile(profile) 33 | 34 | def pipe_str(self, text: str) -> str: 35 | # a. preprocessing 36 | if self.prep: 37 | text = self.pipes["prep"][self.prep](text) 38 | 39 | # b. tokenizing 40 | if ( 41 | isinstance(self.tok, str) 42 | and ( 43 | "word" in self.tok or "sentence" in self.tok or "paragraph" in self.tok 44 | ) 45 | and self.tok_params 46 | ): 47 | elts = self.pipes["tok"][self.tok](text, config=self.tok_params["config"]) 48 | else: 49 | elts = self.pipes["tok"][self.tok](text) 50 | 51 | # c. modifying 52 | mod = self.pipes["mod"][self.mod] 53 | if isinstance(self.mod, str) and self.mod.endswith("concs"): 54 | elts = mod(elts, left=self.left, right=self.right) 55 | else: 56 | elts = mod(elts) 57 | 58 | # d. 
formatting 59 | elts = self.pipes["form"][self.form](elts) 60 | 61 | return elts 62 | 63 | def pipe_file(self, filename: str, out_file: str): 64 | in_file = Path(filename) 65 | out_file = Path(out_file) 66 | assert in_file.is_file() 67 | 68 | with in_file.open(encoding="utf-8-sig") as f: 69 | dump = f.read() 70 | 71 | output = self.pipe_str(dump) 72 | 73 | with out_file.open("w", encoding="utf-8-sig") as g: 74 | g.write(output) 75 | 76 | def parse_profile(self, pipeline): 77 | self.is_valid_params(pipeline) 78 | for arg, v in pipeline.items(): 79 | if arg == "prep": 80 | self.prep = v 81 | elif arg == "tok": 82 | self.tok = v 83 | elif arg == "mod": 84 | self.mod = v 85 | elif arg == "form": 86 | self.form = v 87 | elif arg == "tok_params": 88 | self.tok_params = v 89 | elif arg == "left": 90 | self.left = v 91 | elif arg == "right": 92 | self.right = v 93 | elif arg == "filename": 94 | self.filename = v 95 | self.is_valid_pipeline() 96 | 97 | def is_valid_params(self, pipeline): 98 | for arg, val in pipeline.items(): 99 | # ensure all arguments are valid attributes 100 | if arg not in self.args_list: 101 | raise SyntaxError( 102 | f'{arg} is not a valid argument\nvalid options are {" ".join(self.map)}' 103 | ) 104 | 105 | # ensure arguments have valid values 106 | if arg in self.pipes and val not in self.pipes[arg]: 107 | raise SyntaxError( 108 | f'{val} is not a valid value for {arg}\nvalid options are {" ".join(self.pipes[arg])}' 109 | ) 110 | 111 | def is_valid_pipeline(self): 112 | # missing pipes 113 | if not self.tok or not self.mod or not self.form: 114 | raise BrokenPipeError( 115 | "A valid pipeline must have a tokenizer, a processor and a formatter." 116 | ) 117 | -------------------------------------------------------------------------------- /botok/text/preprocess.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import re 3 | 4 | 5 | def basic_cleanup(text: str) -> str: 6 | text = text.strip() 7 | text = re.sub(r"\n+", " ", text) 8 | text = re.sub(r"\s+", " ", text) 9 | return text 10 | 11 | 12 | def basic_keeps_lines(text: str) -> str: 13 | text = text.strip() 14 | # text = re.sub(r'\s+', ' ', text) 15 | return text 16 | -------------------------------------------------------------------------------- /botok/text/text.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from pathlib import Path 3 | from types import FunctionType 4 | 5 | from ..vars import Ids 6 | from .format import * 7 | from .modify import * 8 | from .pipelinebase import PipelineBase 9 | from .preprocess import * 10 | from .tokenize import * 11 | 12 | builtin_pipes = { 13 | # a. Preprocessing 14 | "prep": { 15 | "dummy": lambda x: x, 16 | "basic_cleanup": basic_cleanup, 17 | "basic_keeps_lines": basic_keeps_lines, 18 | }, 19 | # b. Tokenizers 20 | "tok": { 21 | "space_tok": space_tok, 22 | "word_tok": word_tok, 23 | "chunk_tok": chunk_tok, 24 | "sentence_tok": sentence_tok, 25 | "paragraph_tok": paragraph_tok, 26 | }, 27 | # c. Modifiers 28 | "mod": { 29 | "dummy": lambda x: x, 30 | "words_raw_text": words_raw_text, 31 | "words_raw_types": words_raw_types, 32 | "words_error_types": words_error_types, 33 | "words_error_concs": words_error_concs, 34 | "chunks_raw_text": chunks_raw_text, 35 | }, 36 | # d. 
Formatters 37 | "form": { 38 | "dummy": lambda x: x, 39 | "plaintext": plaintext, 40 | "plaintext_sent_par": plaintext_sent_par, 41 | "basic_concs": basic_conc, 42 | "stats_types": stats_types, 43 | }, 44 | } 45 | 46 | 47 | class Text: 48 | """ 49 | Takes as input: 50 | - a string to process 51 | - the Path object of a file to process 52 | 53 | including a custom pipeline is as simple as: 54 | - subclassing Text class 55 | - creating a new @property method like the built in ones while providing your own arguments to self.__process() 56 | """ 57 | 58 | def __init__(self, input, out_file=None, tok_params=None): 59 | """ 60 | if input == str: return a string 61 | if input == Path: 62 | 1. out_file != None: write to given Path object 63 | 2. out_file == None: write to cwd and append "_pybo" to file name 64 | 65 | custom_tok: settings for building the custom tokenizer: see docstring of Config class in config.py 66 | """ 67 | self.input = input 68 | self.tok_params = tok_params 69 | 70 | if isinstance(input, str): 71 | if out_file: 72 | assert isinstance(out_file, Path) 73 | self.out_file = out_file 74 | else: 75 | self.out_file = None 76 | elif isinstance(input, Path): 77 | if not out_file: 78 | self.out_file = input.parent / f"{input.stem}_pybo{input.suffix}" 79 | else: 80 | self.out_file = out_file 81 | else: 82 | raise TypeError("input should either be a string, or a Path object") 83 | 84 | @property 85 | def tokenize_on_spaces(self): 86 | return self.__process("basic_cleanup", "space_tok", "dummy", "plaintext") 87 | 88 | @property 89 | def tokenize_words_raw_text(self): 90 | return self.__process( 91 | "basic_cleanup", "word_tok", "words_raw_text", "plaintext", 92 | ) 93 | 94 | @property 95 | def tokenize_words_raw_lines(self): 96 | return self.__process( 97 | "basic_keeps_lines", "word_tok", "words_raw_text", "plaintext", 98 | ) 99 | 100 | @property 101 | def tokenize_chunks_plaintext(self): 102 | return self.__process( 103 | "basic_keeps_lines", "chunk_tok", "chunks_raw_text", "plaintext" 104 | ) 105 | 106 | @property 107 | def tokenize_sentences_plaintext(self): 108 | return self.__process( 109 | "basic_cleanup", "sentence_tok", "dummy", "plaintext_sent_par", 110 | ) 111 | 112 | @property 113 | def tokenize_paragraph_plaintext(self): 114 | return self.__process( 115 | "basic_cleanup", "paragraph_tok", "dummy", "plaintext_sent_par", 116 | ) 117 | 118 | @property 119 | def list_word_types(self): 120 | return self.__process( 121 | "basic_keeps_lines", "word_tok", "words_raw_types", "stats_types", 122 | ) 123 | 124 | def custom_pipeline( 125 | self, preprocessor, tokenizer, modifier, formatter, tok_params=None 126 | ): 127 | """ 128 | every pipe should be either the name of an existing pipe as found in builtin_pipes or a function 129 | """ 130 | return self.__process(preprocessor, tokenizer, modifier, formatter, tok_params) 131 | 132 | def __process(self, preprocessor, tokenizer, modifier, formatter, tok_params=None): 133 | if tok_params: 134 | for k, v in tok_params.items(): 135 | if k not in self.tok_params or self.tok_params[k] is None: 136 | self.tok_params[k] = v 137 | 138 | profile, pipes = self.__create_pipeline( 139 | preprocessor, tokenizer, modifier, formatter, self.tok_params 140 | ) 141 | pipeline = PipelineBase(profile, pipes=pipes) 142 | 143 | if self.out_file: 144 | return pipeline.pipe_file(self.input, self.out_file) 145 | else: 146 | return pipeline.pipe_str(self.input) 147 | 148 | @staticmethod 149 | def __create_pipeline( 150 | preprocessor, tokenizer, modifier, formatter, 
tok_params=None 151 | ): 152 | profile = {} 153 | pipes = {"prep": {}, "tok": {}, "mod": {}, "form": {}} 154 | for a, b, c in [ 155 | ("prep", Ids.prep, preprocessor), 156 | ("tok", Ids.tok, tokenizer), 157 | ("mod", Ids.mod, modifier), 158 | ("form", Ids.form, formatter), 159 | ]: 160 | if isinstance(c, FunctionType): 161 | pipes[a].update({b: c}) 162 | profile[a] = b 163 | elif isinstance(c, str): 164 | profile[a] = c 165 | assert c in builtin_pipes[a] 166 | pipes[a][c] = builtin_pipes[a][c] 167 | else: 168 | raise SyntaxError("Should be either a function or a string") 169 | 170 | if tok_params: 171 | profile["tok_params"] = tok_params 172 | return profile, pipes 173 | -------------------------------------------------------------------------------- /botok/text/tokenize.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from functools import lru_cache 3 | from typing import List, NewType, Tuple 4 | 5 | from ..tokenizers.chunktokenizer import ChunkTokenizer 6 | from ..tokenizers.paragraphtokenizer import paragraph_tokenizer 7 | from ..tokenizers.sentencetokenizer import sentence_tokenizer 8 | from ..tokenizers.token import Token 9 | from ..tokenizers.wordtokenizer import WordTokenizer 10 | 11 | BoToken = NewType("BoToken", Token) 12 | 13 | 14 | def space_tok(text: str) -> List[str]: 15 | """Tokenizes string on spaces 16 | 17 | """ 18 | return text.split(" ") 19 | 20 | 21 | def word_tok(text: str, config=None) -> List[BoToken]: 22 | tok = get_wordtokenizer(config=config) 23 | return tok.tokenize(text) 24 | 25 | 26 | def sentence_tok(text: str, config=None) -> List[Tuple[int, List[BoToken]]]: 27 | tok = get_wordtokenizer(config=config) 28 | tokens = tok.tokenize(text) 29 | return sentence_tokenizer(tokens) 30 | 31 | 32 | def paragraph_tok(text: str, config=None) -> List[Tuple[int, List[BoToken]]]: 33 | tok = get_wordtokenizer(config=config) 34 | tokens = tok.tokenize(text) 35 | return paragraph_tokenizer(tokens) 36 | 37 | 38 | @lru_cache( 39 | maxsize=None 40 | ) # <--- make sure that the trie is only built once then kept in memory 41 | def get_wordtokenizer(config=None): 42 | return WordTokenizer(config=config) 43 | 44 | 45 | def chunk_tok(text: str) -> List[str]: 46 | return ChunkTokenizer(text).tokenize() 47 | -------------------------------------------------------------------------------- /botok/textunits/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/botok/textunits/__init__.py -------------------------------------------------------------------------------- /botok/textunits/bostring.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from warnings import warn 3 | 4 | from .charcategories import get_char_category 5 | from ..vars import CharMarkers as a 6 | from ..vars import char_values 7 | 8 | 9 | class BoString: 10 | """ 11 | This class is the foundational building block of pre-processing. 12 | 13 | It implements the natural groups of characters a user makes when looking at 14 | a text in his native language. 15 | 16 | Implementation: 17 | --------------- 18 | 19 | - all the characters in the Unicode Tables for Tibetan are organized in lists 20 | hard-coded as string variables in ``__attribute_basic_types()``. 
21 | - upon instanciation, ``__init__().base_structure`` is populated with the indices of every 22 | char in the input string(key) and the group constant to which it belongs(values) 23 | - human-readable description of the group constant can be accessed in ``__init__().char_markers`` 24 | 25 | :Example: 26 | 27 | >>> from botok.textunits.bostring import BoString 28 | >>> from botok.vars import CharMarkers 29 | 30 | >>> bo_str = ' བཀྲ་ཤིས་ tr བདེ་ལེགས།' 31 | >>> bs = BoString(bo_str) 32 | 33 | >>> bs.base_structure # key: character index, value: character group 34 | {0: 15, 1: 1, 2: 1, 3: 2, 4: 4, 5: 1, 6: 3, 7: 1, 8: 4, 9: 15, 10: 15, 11: 14, 35 | 12: 14, 13: 15, 14: 1, 15: 1, 16: 3, 17: 4, 18: 1, 19: 3, 20: 1, 21: 1, 22: 8} 36 | 37 | >>> bs.get_categories() 38 | {0: 'space', 1: 'cons', 2: 'cons', 3: 'sub-cons', 4: 'tsek', 5: 'cons', 6: 'vow', 39 | 7: 'cons', 8: 'tsek', 9: 'space', 10: 'space', 11: 'other', 12: 'other', 40 | 13: 'space', 14: 'cons', 15: 'cons', 16: 'vow', 17: 'tsek', 18: 'cons', 19: 'vow', 41 | 20: 'cons', 21: 'cons', 22: 'punct'} 42 | 43 | .. note:: You may want to refine the groups that are implemented to have a finer analysis. 44 | Be sure to create the corresponding constants in ``__init__()`` and the corresponding 45 | entries in ``__init__().char_markers``. 46 | """ 47 | 48 | def __init__(self, string, ignore_chars=None): 49 | if ignore_chars is None: 50 | ignore_chars = [] 51 | self.ignore_chars = ignore_chars 52 | self.string = string 53 | self.len = len(string) 54 | self.base_structure = {} 55 | self.__attribute_basic_types() 56 | 57 | def __attribute_basic_types(self): 58 | """ 59 | Populates ``__init__().base_structure``. 60 | """ 61 | for i in range(len(self.string)): 62 | char = self.string[i] 63 | cat = get_char_category(char) 64 | self.__nfc_check(cat, i) 65 | if char in self.ignore_chars: 66 | self.base_structure[ 67 | i 68 | ] = ( 69 | a.TRANSPARENT.value 70 | ) # spaces chars are allowed anywhere, thus ignored 71 | else: 72 | self.base_structure[i] = cat 73 | 74 | def __nfc_check(self, cat, idx): 75 | if cat == a.NFC: 76 | slice_start = 10 77 | slice_end = 10 78 | while idx - slice_start < 0: 79 | slice_start -= 1 80 | while idx + slice_end >= self.len: 81 | slice_end -= 1 82 | warn( 83 | f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", ' 84 | f'found in "{self.string[slice_start: slice_end]}".' 85 | ) 86 | 87 | def export_groups(self, start_idx, slice_len, for_substring=True): 88 | """ 89 | Export the base groups for a slice of the input string 90 | 91 | :param start_idx: starting index of the slice 92 | :param slice_len: length of the slice we want to export 93 | :param for_substring: if True, indices start at 0, Else the indices of the original string are kept. 
94 | :type start_idx: int 95 | :type slice_len: int 96 | :return: the slice of ``__init__().base_structure`` described in the parameters 97 | :rtype: dict 98 | 99 | :Example: 100 | 101 | >>> bo_str = ' བཀྲ་ཤིས་ tr བདེ་ལེགས།' 102 | >>> bs = BoString(bo_str) 103 | 104 | >>> bs.export_groups(2, 5) 105 | {0: 1, 1: 2, 2: 4, 3: 1, 4: 3} 106 | 107 | >>> bs.export_groups(2, 5, for_substring=False) 108 | {2: 1, 3: 2, 4: 4, 5: 1, 6: 3} 109 | 110 | """ 111 | if for_substring: 112 | return { 113 | n: self.base_structure[i] 114 | for n, i in enumerate(range(start_idx, start_idx + slice_len)) 115 | } 116 | else: 117 | return { 118 | i: self.base_structure[i] 119 | for i in range(start_idx, start_idx + slice_len) 120 | } 121 | 122 | def get_categories(self, struct=None): 123 | if struct is None or not isinstance(struct, dict): 124 | return {k: char_values[v] for k, v in self.base_structure.items()} 125 | else: 126 | return {k: char_values[v] for k, v in struct.items()} 127 | -------------------------------------------------------------------------------- /botok/textunits/bosyl.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from .sylcomponents import SylComponents 3 | 4 | 5 | class BoSyl(SylComponents): 6 | def __init__(self): 7 | SylComponents.__init__(self) 8 | self.affixes = { 9 | "ར": {"len": 1, "type": "la"}, 10 | "ས": {"len": 1, "type": "gis"}, 11 | "འི": {"len": 2, "type": "gi"}, 12 | "འམ": {"len": 2, "type": "am"}, 13 | "འང": {"len": 2, "type": "ang"}, 14 | "འོ": {"len": 2, "type": "o"}, 15 | "འིའོ": {"len": 4, "type": "gi+o"}, 16 | "འིའམ": {"len": 4, "type": "gi+am"}, 17 | "འིའང": {"len": 4, "type": "gi+ang"}, 18 | "འོའམ": {"len": 4, "type": "o+am"}, 19 | "འོའང": {"len": 4, "type": "o+ang"}, 20 | } 21 | 22 | def is_affixable(self, syl): 23 | """expects a clean syllable without ending tsek""" 24 | affixable = False 25 | if self.is_thame(syl): 26 | affixable = True 27 | for ending in ["ར", "ས", "འི", "འོ", "མ", "ང"]: 28 | if len(syl) > len(ending) and syl.endswith(ending): 29 | affixable = False 30 | return affixable 31 | 32 | def get_all_affixed(self, syl): 33 | """ 34 | :param syl: syl to be affixed 35 | :return: if affixable: [(, {'len': int, 'type': str, 'aa': bool}), (..., ...)] 36 | otherwise : 37 | """ 38 | if self.is_affixable(syl): 39 | aa = False 40 | if syl.endswith("འ") and len(syl) > 1: 41 | syl = syl[:-1] 42 | aa = True 43 | 44 | affixed = [] 45 | for a in self.affixes.keys(): 46 | metadata = {} 47 | metadata.update(self.affixes[a]) 48 | metadata.update({"aa": aa}) 49 | affixed.append((syl + a, metadata)) 50 | return affixed 51 | 52 | else: 53 | return None 54 | -------------------------------------------------------------------------------- /botok/textunits/charcategories.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import csv 3 | from collections import defaultdict 4 | 5 | from ..vars import CharMarkers as c 6 | 7 | # Get the categories of Tibetan characters from the csv file 8 | categories = defaultdict(list) 9 | table_path = Path(__file__).parent.parent / "resources/bo_uni_table.csv" 10 | for row in list(csv.reader(table_path.open(encoding="utf-8-sig")))[1:]: 11 | char = row[1].replace("—", "") 12 | cat = c[row[2]].value 13 | categories[cat].append(char) 14 | 15 | # all unicode chars liable to be used as spaces or that allowed in a valid Tibetan string 16 | # yet that will be ignored when read by a human. 
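# an ordinary space (" "), for example, is in this list, so get_char_category() further down in
# this file returns CharMarkers.TRANSPARENT.value for it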
17 | transparent = [ 18 | " ", # \U32 SPACE 19 | "᠎", # \U6158 MONGOLIAN VOWEL SEPARATOR 20 | " ", # \U8192 EN QUAD 21 | " ", # \U8193 EM QUAD 22 | " ", # \U8194 EN SPACE 23 | " ", # \U8195 EM SPACE 24 | " ", # \U8196 THREE-PER-EM SPACE 25 | " ", # \U8197 FOUR-PER-EM SPACE 26 | " ", # \U8198 SIX-PER-EM SPACE 27 | " ", # \U8199 FIGURE SPACE 28 | " ", # \U8200 PUNCTUATION SPACE 29 | " ", # \U8201 THIN SPACE 30 | " ", # \U8202 HAIR SPACE 31 | "​", # \U8203 ZERO WIDTH SPACE 32 | " ", # \U8239 NARROW NO-BREAK SPACE 33 | " ", # \U8287 MEDIUM MATHEMATICAL SPACE 34 | " ", # \U12288 IDEOGRAPHIC SPACE 35 | "", # \U65279 ZERO WIDTH NO-BREAK SPACE 36 | "\t", # Tabulation 37 | "\n", # carriage return can happen in the middle of a word 38 | ] 39 | 40 | 41 | def get_char_category(char): 42 | # source for codepoints: https://jrgraphix.net/research/unicode.php 43 | if char in transparent: 44 | return c.TRANSPARENT.value 45 | 46 | # Tibetan range 47 | if "\u0f00" <= char <= "\u0fff": 48 | for cat, chars in categories.items(): 49 | if char in chars: 50 | return cat 51 | raise ValueError( 52 | f'The char "{char}" is expected to be in the tibetan table, but is not.' 53 | ) 54 | 55 | # CJK range 56 | elif ( 57 | "\u2e80" <= char <= "\ufaff" 58 | or "\ufe30" <= char <= "\ufe4f" 59 | or eval('"\u20000"') <= char <= eval('"\u2fa1f"') 60 | ): 61 | return c.CJK.value 62 | 63 | # LATIN range 64 | # 1. 0020 - 036f: Latin Basic + Latin-1 Supplement + Latin Extended-A + Latin Extended-B 65 | # IPA Extensions + Spacing Modifier Letters + Combining Diacritical Marks 66 | # 2. 1e00 - 20cf: Latin Extended Additional + Superscripts and Subscripts + Currency Symbols 67 | elif "\u0020" <= char <= "\u036f" or "\u1e00" <= char <= "\u20cf": 68 | return c.LATIN.value 69 | 70 | else: 71 | return c.OTHER.value 72 | -------------------------------------------------------------------------------- /botok/third_party/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/botok/third_party/__init__.py -------------------------------------------------------------------------------- /botok/third_party/cqlparser.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from .pynpl.cql import Query 3 | 4 | 5 | __all__ = ["Query", "parse_cql_query", "replace_token_attributes"] 6 | 7 | 8 | def parse_cql_query(query, numerals=True, booleans=True): 9 | """ 10 | CQL parser for replacing the content of Token.attributes. 11 | From the CQL syntax, only the attribute names and the values 12 | are taken into account. 
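For example (an illustrative query), '[pos="NOUN" & freq="1000"]' would yield [{"pos": "NOUN", "freq": 1000}].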
13 | 14 | :param query: CQL query string 15 | :param numerals: if True, gives the Python int instead of the string 16 | :param booleans: if True, gives the Python bool instead of the string 17 | :return: a list of dicts, one per token slot, where 18 | keys == Token.attributes and 19 | values == content of the expected Token.attributes 20 | """ 21 | 22 | def str2int(string): 23 | try: 24 | return int(string) 25 | except ValueError: 26 | return string 27 | 28 | def str2bool(string): 29 | b = {"True": True, "False": False, "None": None} 30 | if string in b: 31 | return b[string] 32 | else: 33 | return string 34 | 35 | def cql2pattern(tokenexpr, numerals, booleans): 36 | """ 37 | Expects the following syntax: 38 | '[attribute1="value1" & attribute2="value2" (& ...)]' 39 | """ 40 | changes = {} 41 | for attrexprs in tokenexpr: 42 | key = attrexprs.attribute 43 | value = attrexprs.valueexpr[0] 44 | if numerals: 45 | value = str2int(value) 46 | if booleans: 47 | value = str2bool(value) 48 | changes[key] = value 49 | return changes 50 | 51 | if query: 52 | parsed = Query(query) 53 | pattern = [] 54 | for tokenexpr in parsed.tokenexprs: 55 | pattern.append(cql2pattern(tokenexpr, numerals, booleans)) 56 | return pattern 57 | else: 58 | return None 59 | 60 | 61 | def replace_token_attributes(tokens, token_changes): 62 | """ 63 | Applies in place the replacements found in the CQL query (token_changes) 64 | the number of tokens in the list and the number of token slots in the query 65 | must be even. 66 | 67 | :param tokens: list of tokens 68 | :param token_changes: CQL query 69 | """ 70 | changes = parse_cql_query(token_changes) 71 | if type(tokens) == list: 72 | assert len(tokens) == len(changes) 73 | for i in range(len(tokens)): 74 | for attr, value in changes[i].items(): 75 | setattr(tokens[i], attr, value) 76 | else: 77 | assert len(changes) == 1 78 | for attr, value in changes[0].items(): 79 | setattr(tokens, attr, value) 80 | -------------------------------------------------------------------------------- /botok/third_party/has_skrt_syl.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import re 3 | 4 | from ..vars import TSEK 5 | 6 | 7 | def is_skrt(syl): 8 | """Source for regexes : Paul Hackett Visual Basic script 9 | 10 | regex1: Now do Sanskrit: Skt.vowels, [g|d|b|dz]+_h, hr, shr, Skt 11 | regex2: more Sanskrit: invalid superscript-subscript pairs 12 | regex3: tsa-phru mark used in Chinese transliteration 13 | :param syl: syllable to assert 14 | :return: True if matches either of the regexes, False otherwise 15 | """ 16 | regex1 = ( 17 | r"([ཀ-ཬཱ-྅ྐ-ྼ]{0,}[ཱཱཱིུ-ཹཻཽ-ྃ][ཀ-ཬཱ-྅ྐ-ྼ]{0,}|[ཀ-ཬཱ-྅ྐ-ྼ]{0,}" 18 | r"[གཌདབཛྒྜྡྦྫ][ྷ][ཀ-ཬཱ-྅ྐ-ྼ]{0,}|[ཀ-ཬཱ-྅ྐ-ྼ]{0,}[ཤཧ][ྲ][ཀ-ཬཱ-྅ྐ-ྼ]{0,}|[ཀ-ཬཱ-྅ྐ-ྼ]{0,}" 19 | r"[གྷཊ-ཎདྷབྷཛྷཥཀྵ-ཬཱཱཱིུ-ཹཻཽ-ྃྒྷྚ-ྞྡྷྦྷྫྷྵྐྵ-ྼ][ཀ-ཬཱ-྅ྐ-ྼ]{0,})" 20 | ) 21 | regex2 = r"([ཀ-ཬཱ-྅ྐ-ྼ]{0,}[ཀཁགང-ཉཏ-དན-བམ-ཛཝ-ཡཤཧཨ][ྐ-ྫྷྮ-ྰྴ-ྼ][ཀ-ཬཱ-྅ྐ-ྼ]{0,})" 22 | regex3 = r"([ཀ-ཬཱ-྅ྐ-ྼ]{0,}[༹][ཀ-ཬཱ-྅ྐ-ྼ]{0,})" 23 | return re.search(regex1, syl) or re.search(regex2, syl) or re.search(regex3, syl) 24 | 25 | 26 | def has_skrt_syl(word): 27 | """Uses is_skrt() to check for sanskrit syllables 28 | 29 | """ 30 | skrt = False 31 | syls = word.strip(TSEK).split(TSEK) 32 | for s in syls: 33 | if is_skrt(s): 34 | skrt = True 35 | 36 | return skrt 37 | -------------------------------------------------------------------------------- /botok/third_party/pynpl/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/botok/third_party/pynpl/__init__.py -------------------------------------------------------------------------------- /botok/third_party/pynpl/fsa.py: -------------------------------------------------------------------------------- 1 | # 2 | # Original copyright notice: 3 | # 4 | # --------------------------------------------------------------- 5 | # PyNLPl - Finite State Automata 6 | # by Maarten van Gompel 7 | # Centre for Language Studies 8 | # Radboud University Nijmegen 9 | # http://proycon.github.com/folia 10 | # http://www.github.com/proycon/pynlpl 11 | # proycon AT anaproy DOT nl 12 | # 13 | # Partially based/inspired on code by Xiayun Sun (https://github.com/xysun/regex) 14 | # 15 | # Licensed under GPLv3 16 | # 17 | # ---------------------------------------------------------------- 18 | # 19 | # This file is modified and reditstributed here under APL2 with 20 | # with written permission from the original author 21 | 22 | from __future__ import print_function, unicode_literals, division, absolute_import 23 | import sys 24 | 25 | 26 | class State(object): 27 | def __init__(self, **kwargs): 28 | if "epsilon" in kwargs: 29 | self.epsilon = kwargs["epsilon"] # epsilon-closure (lis of states) 30 | else: 31 | self.epsilon = [] # epsilon-closure 32 | if "transitions" in kwargs: 33 | self.transitions = kwargs["transitions"] 34 | else: 35 | self.transitions = [] # (matchitem, matchfunction(value), state) 36 | if "final" in kwargs: 37 | self.final = bool(kwargs["final"]) # ending state 38 | else: 39 | self.final = False 40 | self.transitioned = ( 41 | None 42 | ) # will be a tuple (state, matchitem) indicating how this state was reached 43 | 44 | 45 | class NFA(object): 46 | """Non-deterministic finite state automaton. 
Can be used to model DFAs as well if your state transitions are not ambiguous and epsilon is empty.""" 47 | 48 | def __init__(self, initialstate): 49 | self.initialstate = initialstate 50 | 51 | def run(self, sequence, mustmatchall=False, debug=False): 52 | def add(state, states): 53 | """add state and recursively add epsilon transitions""" 54 | assert isinstance(state, State) 55 | if state in states: 56 | return 57 | states.add(state) 58 | for eps in state.epsilon: # recurse into epsilon transitions 59 | add(eps, states) 60 | 61 | current_states = set() 62 | add(self.initialstate, current_states) 63 | if debug: 64 | print( 65 | "Starting run, current states: ", repr(current_states), file=sys.stderr 66 | ) 67 | 68 | for offset, value in enumerate(sequence): 69 | if not current_states: 70 | break 71 | if debug: 72 | print("Value: ", repr(value), file=sys.stderr) 73 | next_states = set() 74 | for state in current_states: 75 | for matchitem, matchfunction, trans_state in state.transitions: 76 | if matchfunction(value): 77 | trans_state.transitioned = (state, matchitem) 78 | add(trans_state, next_states) 79 | 80 | current_states = next_states 81 | if debug: 82 | print("Current states: ", repr(current_states), file=sys.stderr) 83 | if not mustmatchall: 84 | for s in current_states: 85 | if s.final: 86 | if debug: 87 | print("Final state reached", file=sys.stderr) 88 | yield offset + 1 89 | 90 | if mustmatchall: 91 | for s in current_states: 92 | if s.final: 93 | if debug: 94 | print("Final state reached", file=sys.stderr) 95 | yield offset + 1 96 | 97 | def match(self, sequence): 98 | try: 99 | return next(self.run(sequence, True)) == len(sequence) 100 | except StopIteration: 101 | return False 102 | 103 | def find(self, sequence, debug=False): 104 | l = len(sequence) 105 | for i in range(0, l): 106 | for length in self.run(sequence[i:], False, debug): 107 | yield sequence[i : i + length] 108 | -------------------------------------------------------------------------------- /botok/tokenizers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/botok/tokenizers/__init__.py -------------------------------------------------------------------------------- /botok/tokenizers/chunktokenizer.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from ..chunks.chunks import TokChunks 3 | 4 | 5 | class ChunkTokenizer(TokChunks): 6 | def __init__(self, string): 7 | super().__init__(string) 8 | 9 | def tokenize(self): 10 | tokens = self.make_chunks() 11 | return self.get_readable(tokens) 12 | -------------------------------------------------------------------------------- /botok/tokenizers/paragraphtokenizer.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from .sentencetokenizer import get_sentence_indices 3 | 4 | 5 | def paragraph_tokenizer(tokens): 6 | # a paragraph is defined as a group of sentences that does not have more words than a given threshold 7 | threshold = 70 8 | paragraph_max = 150 9 | par_indices = get_sentence_indices(tokens) 10 | 11 | # join small sentences to form paragraphs 12 | i = 0 13 | while i < len(par_indices): 14 | start, end, l = ( 15 | par_indices[i]["start"], 16 | par_indices[i]["end"], 17 | par_indices[i]["len"], 18 | ) 19 | if i > 0 and l < threshold: 20 | previous_len = par_indices[i - 1]["len"] 21 | if l + previous_len < 
paragraph_max: 22 | par_indices[i - 1]["end"] = par_indices[i]["end"] 23 | par_indices[i - 1]["len"] += par_indices[i]["len"] 24 | del par_indices[i] 25 | i -= 1 26 | i += 1 27 | 28 | # get tokens for each paragraph 29 | pars = [] 30 | for par in par_indices: 31 | start, end, l = par["start"], par["end"], par["len"] 32 | pars.append((l, tokens[start : end + 1])) 33 | 34 | return pars 35 | -------------------------------------------------------------------------------- /botok/tokenizers/stacktokenizer.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | #STACK_PARTS = re.compile(r"(?:[^\u0f18\u0f19\u0f35\u0f37\u0f71-\u0f7e\u0f80-\u0f84\u0f86\u0f87\u0f8d-\u0fbc][\u0f18\u0f19\u0f35\u0f37\u0f71-\u0f7e\u0f80-\u0f84\u0f86\u0f87\u0f8d-\u0fbc]*|^[\u0f18\u0f19\u0f35\u0f37\u0f71-\u0f7e\u0f80-\u0f84\u0f86\u0f87\u0f8d-\u0fbc]+)") 4 | STACK_PARTS = re.compile(r"[^\u0f18\u0f19\u0f35\u0f37\u0f71-\u0f7e\u0f80-\u0f84\u0f86\u0f87\u0f8d-\u0fbc][\u0f18\u0f19\u0f35\u0f37\u0f71-\u0f7e\u0f80-\u0f84\u0f86\u0f87\u0f8d-\u0fbc]*") 5 | 6 | def tokenize_in_stacks(str): 7 | return STACK_PARTS.findall(str) 8 | 9 | def test_stack_tokenizer(): 10 | assert(tokenize_in_stacks("ཀཿཐོག་འབྱམ་པའཱི་རོ།") == ["ཀ", "\u0f7f", "ཐོ", "ག", "་", "འ", "བྱ", "མ", "་", "པ", "འཱི", "་", "རོ", "།"]) 11 | assert(tokenize_in_stacks("\u0f7fཀཿ") == ["\u0f7f", "ཀ", "\u0f7f"]) 12 | 13 | if __name__ == "__main__": 14 | test_stack_tokenizer() -------------------------------------------------------------------------------- /botok/tokenizers/token.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from ..vars import TSEK, AA 3 | 4 | 5 | class Token: 6 | def __init__(self): 7 | self.text = "" 8 | self.char_types = [] 9 | self.has_merged_dagdra = None 10 | self.lemma = "" 11 | self.sense = "" 12 | self.chunk_type = None 13 | self.start = 0 14 | self.len = None 15 | self.syls_idx = None 16 | self.syls_start_end = None 17 | self.pos = "" 18 | self.affixation = {} 19 | self.senses = None 20 | self.affix = False 21 | self.affix_host = False 22 | self.form_freq = None 23 | self.freq = None 24 | self.skrt = False 25 | self._ = {} # dict for any user specific data 26 | 27 | def __getitem__(self, attr): 28 | # allows to access attributes with the Token['attr'] syntax, besides the Token.attr default 29 | try: 30 | return self.__getattribute__(attr) 31 | except AttributeError: 32 | raise AttributeError("does not have attribute: " + attr) 33 | 34 | def __setitem__(self, key, value): 35 | # enforces not to add any extra attribute. Token._ should be used for any custom data 36 | if hasattr(self, key): 37 | if key != "_": 38 | self.__dict__[key] = value 39 | else: 40 | if not isinstance(value, dict): 41 | raise TypeError("only dicts are accepted for Token._") 42 | self.__dict__[key].update(value) 43 | else: 44 | raise AttributeError("Token objects don't have " + key + " as attribute") 45 | 46 | @property 47 | def syls(self): 48 | return ( 49 | [[self.text[s] for s in syl] for syl in self.syls_idx] 50 | if self.syls_idx 51 | else "" 52 | ) 53 | 54 | @property 55 | def text_cleaned(self): 56 | """ 57 | Will append a TSEK to every syllable except syllables that host 58 | an affix. 
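For a regular token (one that does not host an affix), syls such as [["བ", "ཀ", "ྲ"], ["ཤ", "ི", "ས"]] would thus give "བཀྲ་ཤིས་" (illustrative values).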
59 | 60 | """ 61 | if self.syls: 62 | cleaned = TSEK.join(["".join(syl) for syl in self.syls]) 63 | if self.affix_host and not self.affix: 64 | return cleaned 65 | else: 66 | return cleaned + TSEK 67 | else: 68 | return "" 69 | 70 | @property 71 | def text_unaffixed(self): 72 | unaffixed = TSEK.join(["".join(syl) for syl in self.syls]) if self.syls else "" 73 | if ( 74 | self.affixation 75 | and not self.affix 76 | and "len" in self.affixation 77 | and len([True for m in self.senses if "affixed" in m and m["affixed"]]) > 0 78 | ): 79 | unaffixed = unaffixed[: -self.affixation["len"]] 80 | 81 | if unaffixed and "aa" in self.affixation and self.affixation["aa"]: 82 | unaffixed += AA 83 | 84 | if self.affixation and self.affix_host and not self.affix: 85 | return unaffixed 86 | elif unaffixed: 87 | return unaffixed + TSEK 88 | else: 89 | return "" 90 | 91 | def __repr__(self): 92 | out = 'text: "{}"\n'.format(self.text) 93 | if self.text_cleaned: 94 | out += 'text_cleaned: "{}"\n'.format(self.text_cleaned) 95 | if self.text_unaffixed: 96 | out += 'text_unaffixed: "{}"\n'.format(self.text_unaffixed) 97 | if self.syls and self.syls != []: 98 | out += ( 99 | 'syls: ["' + '", "'.join(["".join(syl) for syl in self.syls]) + '"]\n' 100 | ) 101 | if self.pos: 102 | out += "pos: {}\n".format(self.pos) 103 | if self.lemma: 104 | out += "lemma: {}\n".format(self.lemma) 105 | if self.sense: 106 | out += "sense: {}\n".format(self.sense) 107 | if self.senses: 108 | out += ( 109 | "senses: | " 110 | + " | ".join( 111 | [ 112 | ", ".join([f"{k}: {v}" for k, v in m.items()]) 113 | for m in self.senses 114 | ] 115 | ) 116 | + " |\n" 117 | ) 118 | if self.char_types: 119 | out += "char_types: |" + "|".join(self.char_types) + "|\n" 120 | if self.chunk_type: 121 | out += "chunk_type: {}\n".format(self.chunk_type) 122 | if self.form_freq: 123 | out += "form_freq: {}\n".format(self.form_freq) 124 | if self.freq: 125 | out += "freq: {}\n".format(self.freq) 126 | if self.skrt: 127 | out += "skrt: {}\n".format(self.skrt) 128 | if self.affix: 129 | out += "affix: {}\n".format(self.affix) 130 | if self.affix_host: 131 | out += "affix_host: {}\n".format(self.affix_host) 132 | if self.has_merged_dagdra: 133 | out += "has_merged_dagdra: {}\n".format(self.has_merged_dagdra) 134 | if self.syls_idx: 135 | out += "syls_idx: {}\n".format(self.syls_idx) 136 | if self.syls_start_end: 137 | out += "syls_start_end: {}\n".format(self.syls_start_end) 138 | out += "start: {}\n".format(self.start) 139 | out += "len: {}\n".format(self.len) 140 | if self._: 141 | out += "\n" 142 | for k, v in self._.items(): 143 | out += "_{}: {}\n".format(k, v) 144 | out += "\n" 145 | return out 146 | -------------------------------------------------------------------------------- /botok/tokenizers/wordtokenizer.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | import csv 3 | from pathlib import Path 4 | 5 | from ..chunks.chunks import TokChunks 6 | from ..config import Config 7 | from ..modifytokens.adjusttokens import AdjustTokens 8 | from ..modifytokens.mergedagdra import MergeDagdra 9 | from ..modifytokens.splitaffixed import split_affixed 10 | from ..textunits.bosyl import BoSyl 11 | from ..tries.trie import Trie 12 | from ..vars import AA, TSEK 13 | from .tokenize import Tokenize 14 | 15 | 16 | def get_part_lemmas(path): 17 | part_lemmas = {} 18 | if not path.is_file(): 19 | return part_lemmas 20 | with path.open("r", encoding="utf-8-sig") as f: 21 | reader = csv.reader(f, delimiter="\t") 
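# the [1:] below skips the header row; each remaining row is expected to hold five tab-separated
# cells (presumably form, POS, lemma, sense, freq), of which only the form and the lemma are used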
22 | for row in list(reader)[1:]: 23 | form, _, lemma, _, _ = row 24 | part_lemmas[form] = lemma 25 | return part_lemmas 26 | 27 | 28 | class WordTokenizer: 29 | """ 30 | Convenience class to tokenize a given string. 31 | 32 | """ 33 | 34 | def __init__( 35 | self, config=None, ignore_chars=None, build_trie=False, 36 | ): 37 | """ 38 | :param tok_profile: profile for building the trie. (see config.yaml) 39 | """ 40 | if not config: 41 | # if config is not given then use default config 42 | config = Config() 43 | 44 | self.config = config 45 | self.ignore_chars = ignore_chars 46 | self.tok = Tokenize( 47 | Trie( 48 | BoSyl, 49 | config.profile, 50 | main_data=config.dictionary, 51 | custom_data=config.adjustments, 52 | pickle_path=config.dialect_pack_path.parent, 53 | build=build_trie, 54 | ) 55 | ) 56 | 57 | self.adj = AdjustTokens( 58 | main=config.dictionary["rules"], custom=config.adjustments["rules"] 59 | ) 60 | 61 | self.part_lemmas = get_part_lemmas( 62 | config.dialect_pack_path 63 | / "dictionary" 64 | / "words_non_inflected" 65 | / "particles.tsv" 66 | ) 67 | 68 | def tokenize(self, string, split_affixes=True, spaces_as_punct=False, debug=False): 69 | """ 70 | :param string: to be tokenized 71 | :param split_affixes: separates the affixed particles into seperate tokens if True 72 | :param debug: print debug info while parsing 73 | :return: list of pybo.tokenizers.Token objects 74 | """ 75 | preprocessed = TokChunks( 76 | string, ignore_chars=self.ignore_chars, space_as_punct=spaces_as_punct 77 | ) 78 | preprocessed.serve_syls_to_trie() 79 | tokens = self.tok.tokenize(preprocessed, debug=debug) 80 | 81 | if split_affixes: 82 | split_affixed(tokens) 83 | 84 | self._get_default_lemma(tokens) 85 | self._choose_default_entry(tokens) 86 | 87 | # merge pa/po/ba/bo tokens with previous ones 88 | MergeDagdra().merge(tokens) 89 | 90 | # do adjustments 91 | tokens = self.adj.adjust(tokens) 92 | 93 | return tokens 94 | 95 | def _get_default_lemma(self, token_list): 96 | for t in token_list: 97 | # pass any token that is not a word 98 | if not t.text_unaffixed: 99 | continue 100 | 101 | # otherwise, check whether the aa needs to be added and if a tsek should be added 102 | if t.affix and not t.affix_host: 103 | part = "".join(["".join(syl) for syl in t.syls]) 104 | lemma = self.part_lemmas[part] if part in self.part_lemmas else part 105 | lemma += TSEK 106 | elif not t.affix and t.affix_host: 107 | lemma = ( 108 | t.text_unaffixed + AA + TSEK 109 | if t.affixation["aa"] 110 | else t.text_unaffixed + TSEK 111 | ) 112 | else: 113 | lemma = ( 114 | t.text_unaffixed 115 | if t.text_unaffixed.endswith(TSEK) 116 | else t.text_unaffixed + TSEK 117 | ) 118 | 119 | for m in t.senses: 120 | if "lemma" not in m and ("pos" in m and m["pos"] != "NON_WORD"): 121 | m["lemma"] = lemma 122 | if not t.senses: 123 | t.senses.append({"lemma": lemma}) 124 | 125 | @staticmethod 126 | def _choose_default_entry(token_list): 127 | def choose_n_apply(senses, t): 128 | s = sorted(senses, key=len, reverse=True) 129 | for a in ["pos", "lemma", "freq", "sense"]: 130 | if a in s[0]: 131 | t[a] = s[0][a] 132 | 133 | for t in token_list: 134 | if t.senses: 135 | # Categorize all meanings in three groups 136 | affixed, non_affixed, no = [], [], [] 137 | for m in t.senses: 138 | if "affixed" in m: 139 | if m["affixed"]: 140 | affixed.append(m) 141 | else: 142 | non_affixed.append(m) 143 | else: 144 | no.append(m) 145 | 146 | # Decide what meaning to use as default 147 | # get a meaning from either group in the following order: 
non_affixed, no, affixed 148 | # take the one with the highest amount of attrs 149 | if non_affixed: 150 | choose_n_apply(non_affixed, t) 151 | elif no: 152 | choose_n_apply(no, t) 153 | elif affixed: 154 | choose_n_apply(affixed, t) 155 | else: 156 | raise ValueError("This should never happen.") 157 | -------------------------------------------------------------------------------- /botok/tries/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/botok/tries/__init__.py -------------------------------------------------------------------------------- /botok/tries/basictrie.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # inspired from https://gist.github.com/nickstanisha/733c134a0171a00f66d4 4 | # and https://github.com/eroux/tibetan-phonetics-py 5 | 6 | 7 | class Node: 8 | def __init__(self, label=None, leaf=False, data=None): 9 | if data is None: 10 | data = {'_': {}} # the dict in '_' is for user-data 11 | self.label = label 12 | self.leaf = leaf 13 | self.data = data 14 | self.children = dict() 15 | 16 | def add_child(self, key, leaf=False): 17 | if not isinstance(key, Node): 18 | self.children[key] = Node(key, leaf) 19 | else: 20 | self.children[key.leaf] = key 21 | 22 | def can_walk(self): 23 | return self.children != dict() 24 | 25 | def is_match(self): 26 | return self.leaf 27 | 28 | def __getitem__(self, key): 29 | return self.children[key] 30 | 31 | 32 | class BasicTrie: 33 | def __init__(self): 34 | self.head = Node() 35 | 36 | def __getitem__(self, key): 37 | return self.head.children[key] 38 | 39 | def add(self, word, data=None): 40 | # adding the word 41 | current_node = self.head 42 | word_finished = True 43 | 44 | i = 0 45 | for i in range(len(word)): 46 | if word[i] in current_node.children: 47 | current_node = current_node.children[word[i]] 48 | else: 49 | word_finished = False 50 | break 51 | 52 | if not word_finished: 53 | while i < len(word): 54 | current_node.add_child(word[i]) 55 | current_node = current_node.children[word[i]] 56 | i += 1 57 | 58 | current_node.leaf = True 59 | 60 | # adding data to the node 61 | if data: 62 | assert isinstance(data, dict) 63 | current_node.data.update(data) 64 | 65 | def walk(self, char, current_node=None): 66 | # logic of walking the trie adapted to be done outside the trie class (for Tokenize) 67 | if not current_node: 68 | current_node = self.head 69 | 70 | if char in current_node.children: 71 | next_node = current_node[char] 72 | else: 73 | next_node = None 74 | 75 | return next_node 76 | 77 | def has_word(self, word): 78 | if not word: 79 | raise ValueError('"word" must be non-null string') 80 | 81 | # parse the word 82 | current_node = self.head 83 | exists = True 84 | for syl in word: 85 | if syl in current_node.children: 86 | current_node = current_node.children[syl] 87 | else: 88 | exists = False 89 | break 90 | else: 91 | # reached a word like 't', not a full word in our dictionary 92 | if exists and not current_node.leaf: 93 | exists = False 94 | 95 | if exists: 96 | return {"exists": exists, "data": current_node.data} 97 | else: 98 | return {"exists": exists, "data": current_node.data} 99 | 100 | def add_data(self, word, data): 101 | """Adds data to words. 
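An int is stored as the word's form frequency ("form_freq"); a dict is treated as one sense and appended to the node's "senses" list if it is not already present.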
102 | 103 | :param word: word to add 104 | :param data: dict of content to add 105 | :return: True if any content added, False otherwise 106 | """ 107 | if not word: 108 | raise ValueError('"word" must be non-null string') 109 | 110 | # parse word 111 | current_node = self.head 112 | for syl in word: 113 | if syl in current_node.children: 114 | current_node = current_node.children[syl] 115 | else: 116 | return False 117 | 118 | # not a complete word 119 | if not current_node.leaf: 120 | return False 121 | 122 | # adding data 123 | if isinstance(data, int): 124 | current_node.data["form_freq"] = data 125 | added = True 126 | else: 127 | if "senses" not in current_node.data: 128 | current_node.data["senses"] = [] 129 | added = self.add_meaning(current_node.data["senses"], data) 130 | return added 131 | 132 | def add_meaning(self, meanings, meaning): 133 | if meanings: 134 | for m in meanings: 135 | if self.is_diff_meaning(meaning, m): 136 | meanings.append(meaning) 137 | return True 138 | return False 139 | else: 140 | meanings.append(meaning) 141 | return True 142 | 143 | @staticmethod 144 | def is_diff_meaning(m1, m2): 145 | is_diff = False 146 | for k, v in m1.items(): 147 | if k not in m2 or k in m2 and m2[k] != v: 148 | is_diff = True 149 | return is_diff 150 | 151 | def deactivate(self, word, rev=False): 152 | """Makes word not findable (words are found only when the leaf value is True) 153 | 154 | :param word: word to deactivate 155 | :param rev: reverse the deactivation 156 | :return True if the word exists, False otherwise 157 | """ 158 | current_node = self.head 159 | for syl in word: 160 | if syl in current_node.children: 161 | current_node = current_node.children[syl] 162 | else: 163 | return False 164 | if isinstance(current_node.data, dict): 165 | if not rev: 166 | current_node.leaf = False 167 | else: 168 | current_node.leaf = True 169 | return True 170 | else: 171 | return False 172 | -------------------------------------------------------------------------------- /botok/tries/trie.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import csv 3 | import pickle 4 | import time 5 | from pathlib import Path 6 | import logging 7 | 8 | from ..chunks.chunks import TokChunks 9 | from ..vars import HASH, NAMCHE, NO_POS, TSEK, __version__ 10 | from .basictrie import BasicTrie, Node 11 | 12 | 13 | class Trie(BasicTrie): 14 | def __init__( 15 | self, bosyl, profile, main_data, custom_data, build=False, pickle_path=None 16 | ): 17 | BasicTrie.__init__(self) 18 | self.bosyl = bosyl() 19 | self.main_data = main_data 20 | self.custom_data = custom_data 21 | self.pickled_file = Path(profile + "_trie.pickled") 22 | if pickle_path: 23 | self.pickled_file = Path(pickle_path) / self.pickled_file 24 | self.tmp_inflected = ( 25 | dict() 26 | ) # tmp to inflect only once, even if a word appears in many files. 
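# load the pickled trie or (re)build it from the main dialect-pack data, then overlay the custom
# entries in memory (see load_or_build_trie() just below)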
27 | self.load_or_build_trie(build) 28 | 29 | def rebuild_trie(self): 30 | self.head = Node() 31 | self.load_or_build_trie(build=True) 32 | 33 | def load_or_build_trie(self, build=False): 34 | if build or not self.pickled_file.exists(): 35 | self._build_trie() 36 | else: 37 | self._load_trie() 38 | 39 | # add and deactivate the custom entries in memory (will not be written) 40 | self._populate_trie(self.custom_data) 41 | self.tmp_inflected = dict() 42 | 43 | def _load_trie(self): 44 | with self.pickled_file.open("rb") as f: 45 | self.head = pickle.load(f) 46 | version = self.head.data["_"]["version"] 47 | if version != __version__: 48 | print( 49 | f"\nThe trie was build for botok {version}. Current version: {__version__}" 50 | ) 51 | self._build_trie() 52 | 53 | def _build_trie(self): 54 | """ 55 | """ 56 | logging.debug("Building Trie:") 57 | start = time.time() 58 | self.head.data["_"]["version"] = __version__ # add version in trie 59 | self._populate_trie(self.main_data) 60 | 61 | with self.pickled_file.open("wb") as f: 62 | pickle.dump(self.head, f, pickle.HIGHEST_PROTOCOL) 63 | end = time.time() 64 | logging.debug("({:.0f} s.)".format(end - start)) 65 | 66 | def _populate_trie(self, files): 67 | # first populate the trie with words 68 | lexica = (d for d in files if d.startswith("lexica")) 69 | for l in lexica: 70 | for f in files[l]: 71 | self._add_one_file(f, l) 72 | 73 | # then add data to the added words 74 | rest = ( 75 | d for d in files if not d.startswith("lexica") and not d.startswith("rules") 76 | ) 77 | for r in rest: 78 | for f in files[r]: 79 | self._add_one_file(f, r) 80 | 81 | def _add_one_file(self, in_file, category): 82 | """ 83 | files can have comments starting with # 84 | spaces and empty lines are trimmed 85 | a single space(breaks if more than one), a comma or a tab can be used as separators 86 | """ 87 | logging.debug("\t" + str(in_file)) 88 | with in_file.open("r", encoding="utf-8-sig") as f: 89 | lines = self.__clean_lines(f) 90 | for l in lines: 91 | word = l.split("\t", 1)[0] 92 | if category == "words": 93 | self.inflect_n_modify_trie(word) 94 | self.inflect_n_add_data(l) 95 | 96 | elif category == "words_non_inflected": 97 | self.add_non_inflectible(word) 98 | self.inflect_n_add_data(l) 99 | 100 | elif category == "words_skrt": 101 | self.inflect_n_modify_trie(word, skrt=True) 102 | self.inflect_n_add_data(l) 103 | 104 | elif category == "remove": 105 | self.inflect_n_modify_trie(l, deactivate=True) 106 | 107 | else: 108 | raise SyntaxError( 109 | "'category' is: '" 110 | + category 111 | + "'. 
Valid answers: words_bo, words_skrt," 112 | "words_non_inflected, entry_data, remove" 113 | ) 114 | 115 | def add_non_inflectible(self, word): 116 | syls = TokChunks(word).get_syls() 117 | if not syls: 118 | return None 119 | 120 | # infl = self.__join_syls(syls) 121 | self.add(syls) 122 | 123 | def inflect_n_modify_trie(self, word, deactivate=False, skrt=False): 124 | """ 125 | Add or deactivate to the trie all the affixed versions of the word 126 | :param word: a word without ending tsek 127 | :param deactivate: switch to add or deactivate a word 128 | """ 129 | inflected = self._get_inflected(word) 130 | if not inflected: 131 | return 132 | 133 | for infl, data in inflected: 134 | if deactivate: 135 | self.deactivate(infl) 136 | else: 137 | if skrt: 138 | if data is None: 139 | data = {"skrt": True} 140 | else: 141 | data.update({"skrt": True}) 142 | self.add(infl, data=data) 143 | else: 144 | self.add(infl, data=data) 145 | 146 | def inflect_n_add_data(self, line): 147 | form, pos, lemma, sense, freq = self.__parse_line(line) 148 | freq = int(freq) if freq else None 149 | lemma = self.__join_syls(TokChunks(lemma).get_syls()) if lemma else None 150 | 151 | inflected = self._get_inflected(form) 152 | if not inflected: 153 | return # The entry is not Tibetan, so return doing nothing 154 | 155 | for infl, _ in inflected: 156 | affixed = True if _ else False 157 | data = { 158 | k: v 159 | for k, v in [ 160 | ("lemma", lemma), 161 | ("pos", pos), 162 | ("freq", freq), 163 | ("sense", sense), 164 | ("affixed", affixed), 165 | ] 166 | if v is not None 167 | } 168 | self.add_data(infl, data) 169 | 170 | def _get_inflected(self, word): 171 | """ 172 | gets the clean syls using TokChunks(), then inflects the last syl using BoSyl.get_all_affixed() 173 | 174 | :return: list of (, ) 175 | """ 176 | if word in self.tmp_inflected: 177 | return self.tmp_inflected[word] 178 | 179 | syls = TokChunks(word).get_syls() 180 | if not syls: 181 | return None 182 | 183 | inflected = [(syls, None)] 184 | affixed = self.bosyl.get_all_affixed(syls[-1]) 185 | if affixed: 186 | for infl, data in affixed: 187 | infl_word = syls[:-1] + [infl] 188 | inflected.append((infl_word, {"affixation": data})) 189 | 190 | self.tmp_inflected[word] = inflected 191 | return inflected 192 | 193 | @staticmethod 194 | def __join_syls(syls): 195 | return "".join([syl if syl.endswith(NAMCHE) else syl + TSEK for syl in syls]) 196 | 197 | @staticmethod 198 | def __clean_lines(f): 199 | # cuts off comments, then strips empty lines 200 | lines = ( 201 | line[: line.index(HASH)] if HASH in line else line for line in f.readlines() 202 | ) 203 | return (l for l in lines if l) 204 | 205 | @staticmethod 206 | def __parse_line(line): 207 | """ 208 | enables support of '\t' and ',' as separator. 
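Returns a list of five cells in the order form, POS, lemma, sense, freq (missing cells default to None).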
209 | """ 210 | fields = [None, None, None, None, None] 211 | if "\t" in line: 212 | sep = "\t" 213 | elif "," in line: 214 | sep = "," 215 | else: 216 | fields[0] = line 217 | fields[2] = NO_POS 218 | return fields 219 | 220 | for num, cell in enumerate(list(csv.reader([line], delimiter=sep))[0]): 221 | fields[num] = cell if cell else None 222 | return fields 223 | -------------------------------------------------------------------------------- /botok/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/botok/utils/__init__.py -------------------------------------------------------------------------------- /botok/utils/expose_data.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from pathlib import Path 3 | 4 | from ..config import Config 5 | 6 | 7 | def expose_data(out_path, profile=None): 8 | """ Copies all the trie and adjustment data to out_path 9 | :param out_path: must be an existing empty folder 10 | """ 11 | if profile not in ["POS", "empty"]: 12 | raise SyntaxError('profile should be either one of ["POS", "empty"]') 13 | 14 | out_path = Path(out_path) 15 | if not out_path.is_dir() or list(out_path.glob("*")): 16 | raise IOError("out_path should be an empty folder") 17 | 18 | resources = Path(__file__).parent / "../resources" 19 | resources = resources.resolve() 20 | res_dirs = [r for r in resources.glob("*") if r.is_dir()] 21 | 22 | if profile: 23 | # export profile data 24 | for f in Config().config["tokenizers"]["profiles"][profile]: 25 | Path(out_path / Path(f).parent).mkdir( 26 | parents=True, exist_ok=True 27 | ) # create dir 28 | shutil.copy(resources / f, out_path / f) 29 | 30 | shutil.copytree(resources / "adjustment", out_path / "adjustment") 31 | 32 | else: 33 | # export all data 34 | for r in res_dirs: 35 | shutil.copytree(r, out_path / r.name) 36 | -------------------------------------------------------------------------------- /botok/utils/helpers.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | 4 | def decomment_file(file): 5 | for row in file: 6 | raw = row.split("#")[0].strip() 7 | if raw: 8 | yield raw 9 | -------------------------------------------------------------------------------- /botok/utils/unicode_normalization.py: -------------------------------------------------------------------------------- 1 | import re 2 | from enum import Enum 3 | 4 | class Cats(Enum): 5 | Other = 0 6 | Base = 1 7 | Subscript = 2 8 | BottomVowel = 3 9 | BottomMark = 4 10 | TopVowel = 5 11 | TopMark = 6 12 | RightMark = 7 13 | 14 | 15 | CATEGORIES = ( 16 | [Cats.Other] # 0F00 17 | + [Cats.Base] # 0F01, often followed by 0f083 18 | + [Cats.Other] * 22 # 0F02-0F17 19 | + [Cats.BottomVowel] * 2 # 0F18-0F19 20 | + [Cats.Other] * 6 # 0F1A-0F1F 21 | + [Cats.Base] 22 | * 20 # 0F20-0F33, numbers can be followed by 0f18, 0f19 or exceptionally by vowels 23 | + [Cats.Other] # 0F34 24 | + [Cats.BottomMark] # 0F35 25 | + [Cats.Other] # 0F36 26 | + [Cats.BottomMark] # OF37 27 | + [Cats.Other] # 0F38 28 | + [Cats.Subscript] # 0F39, kind of cheating but works 29 | + [Cats.Other] * 4 # 0F3A-0F3D 30 | + [Cats.RightMark] # 0F3E 31 | + [Cats.Other] # 0F3F, not quite sure 32 | + [Cats.Base] * 45 # 0F40-0F6C 33 | + [Cats.Other] * 4 # 0F6D-0F70 34 | + [Cats.BottomVowel] # 0F71 35 | + [Cats.TopVowel] # 0F72 36 | + [Cats.TopVowel] # 0F73 37 | + 
[Cats.BottomVowel] * 2 # 0F74-0F75 38 | + [Cats.TopVowel] * 8 # 0F76-0F7D 39 | + [Cats.TopMark] # 0F7E 40 | + [Cats.RightMark] # 0F7F 41 | + [Cats.TopVowel] * 2 # 0F80-0F81 42 | + [Cats.TopMark] * 2 # 0F82-0F83 43 | + [Cats.BottomMark] # 0F84 44 | + [Cats.Other] # 0F85 45 | + [Cats.TopMark] * 2 # 0F86-0F87 46 | + [Cats.Base] * 2 # 0F88-0F89 47 | + [Cats.Base] # 0F8A always followed by 0f82 (required by the Unicode spec) 48 | + [Cats.Other] # 0F8B 49 | + [Cats.Base] # 0F8C 50 | + [Cats.Subscript] * 48 # 0F8D-0FBC 51 | ) 52 | 53 | 54 | def charcat(c): 55 | """Returns the category for a single char string""" 56 | o = ord(c) 57 | if 0x0F00 <= o <= 0x0FBC: 58 | return CATEGORIES[o - 0x0F00] 59 | return Cats.Other 60 | 61 | 62 | # debug: 63 | # for i, c in enumerate(CATEGORIES): 64 | # print("%x : %d" % (0x0F00 + i , c.value)) 65 | 66 | 67 | def unicode_reorder(txt): 68 | # case of a syllable starting with a diacritic (ex: a vowel or subscript) 69 | # we push it after the first main letter 70 | # txt = re.sub(r"^([\u0f71-\u0f84\u0f8d-\u0fbc]+)([\u0f40-\u0f6c])", r"\2", txt) 71 | # return txt, True 72 | # inpired from code for Khmer Unicode provided by SIL 73 | # https://docs.microsoft.com/en-us/typography/script-development/tibetan#reor 74 | # https://docs.microsoft.com/en-us/typography/script-development/use#glyph-reordering 75 | charcats = [charcat(c) for c in txt] 76 | # find subranges of base+non other and sort components in the subrange 77 | i = 0 78 | res = [] 79 | valid = True 80 | while i < len(charcats): 81 | c = charcats[i] 82 | if c != Cats.Base: 83 | if c.value > Cats.Base.value: 84 | valid = False 85 | res.append(txt[i]) 86 | i += 1 87 | continue 88 | # scan for end of component 89 | j = i + 1 90 | while j < len(charcats) and charcats[j].value > Cats.Base.value: 91 | j += 1 92 | # sort syllable based on character categories 93 | # sort the char indices by category then position in string 94 | newindices = sorted(range(i, j), key=lambda e: (charcats[e].value, e)) 95 | replaces = "".join(txt[n] for n in newindices) 96 | res.append(replaces) 97 | i = j 98 | return "".join(res), valid 99 | 100 | 101 | def normalize_unicode(s, form="nfd"): 102 | # first, unify Unicode form: 103 | # http://www.unicode.org/faq/normalization.html 104 | # https://unicode.org/reports/tr15/ 105 | # https://unicode.org/charts/normalization/chart_Tibetan.html 106 | # although for some reason this chart considers 0f0c -> 0f0b in NFD 107 | # 108 | # deprecated or discouraged characters 109 | s = s.replace("\u0f73", "\u0f71\u0f72") # use is discouraged 110 | s = s.replace("\u0f75", "\u0f71\u0f74") # use is discouraged 111 | s = s.replace("\u0f77", "\u0fb2\u0f71\u0f80") # deprecated 112 | s = s.replace("\u0f79", "\u0fb3\u0f71\u0f80") # deprecated 113 | s = s.replace("\u0f81", "\u0f71\u0f80") # use is discouraged 114 | if form == "nfd": 115 | s = s.replace("\u0f43", "\u0f42\u0fb7") 116 | s = s.replace("\u0f4d", "\u0f4c\u0fb7") 117 | s = s.replace("\u0f52", "\u0f51\u0fb7") 118 | s = s.replace("\u0f57", "\u0f56\u0fb7") 119 | s = s.replace("\u0f5c", "\u0f5b\u0fb7") 120 | s = s.replace("\u0f69", "\u0f40\u0fb5") 121 | s = s.replace("\u0f76", "\u0fb2\u0f80") 122 | s = s.replace("\u0f78", "\u0fb3\u0f80") 123 | s = s.replace("\u0f93", "\u0f92\u0fb7") 124 | s = s.replace("\u0f9d", "\u0f9c\u0fb7") 125 | s = s.replace("\u0fa2", "\u0fa1\u0fb7") 126 | s = s.replace("\u0fa7", "\u0fa6\u0fb7") 127 | s = s.replace("\u0fac", "\u0fab\u0fb7") 128 | s = s.replace("\u0fb9", "\u0f90\u0fb5") 129 | else: 130 | s = s.replace("\u0f42\u0fb7", 
"\u0f43") 131 | s = s.replace("\u0f4c\u0fb7", "\u0f4d") 132 | s = s.replace("\u0f51\u0fb7", "\u0f52") 133 | s = s.replace("\u0f56\u0fb7", "\u0f57") 134 | s = s.replace("\u0f5b\u0fb7", "\u0f5c") 135 | s = s.replace("\u0f40\u0fb5", "\u0f69") 136 | s = s.replace("\u0fb2\u0f80", "\u0f76") 137 | s = s.replace("\u0fb3\u0f80", "\u0f78") 138 | s = s.replace("\u0f92\u0fb7", "\u0f93") 139 | s = s.replace("\u0f9c\u0fb7", "\u0f9d") 140 | s = s.replace("\u0fa1\u0fb7", "\u0fa2") 141 | s = s.replace("\u0fa6\u0fb7", "\u0fa7") 142 | s = s.replace("\u0fab\u0fb7", "\u0fac") 143 | s = s.replace("\u0f90\u0fb5", "\u0fb9") 144 | # 0f00 has not been marked as a composed character in Unicode 145 | # This is something that is now seen as a mistake, but it cannot be 146 | # changed because of Unicode change policies. 147 | s = s.replace("\u0f00", "\u0f68\u0f7c\u0f7e") 148 | s, valid = unicode_reorder(s) 149 | # ra doesn't transform into a small rago before anything else than (most) subjoined, 150 | # so 0f65 should be replaced with 0f62 in that case 151 | s = re.sub("\u0f65([^\u0f90-\u0f97\u0f9a-\u0fac\u0fae\u0faf\u0fb4-\u0fbc])", r"ར\1", s) 152 | s = normalize_invalid_start_string(s) 153 | return s 154 | 155 | def debug_to_unicode(s): 156 | res = "" 157 | for c in s: 158 | res += "\\u%x " % ord(c) 159 | return res 160 | 161 | 162 | def assert_conv(orig, expected, expectedValid=True): 163 | resultStr = normalize_unicode(orig) 164 | assert resultStr == expected, "{} -> {} but {} expected".format( 165 | debug_to_unicode(orig), debug_to_unicode(resultStr), debug_to_unicode(expected) 166 | ) 167 | #assert resultValid == expectedValid, "{} valid? -> {} but {} expected".format( 168 | # debug_to_unicode(orig), resultValid, expectedValid 169 | #) 170 | 171 | 172 | def is_vowel(char): 173 | if re.search(r"[\u0f71-\u0f84]", char): 174 | return True 175 | return False 176 | 177 | 178 | def is_suffix(char): 179 | if re.search(r"[\u0f90-\u0fbc]", char): 180 | return True 181 | return False 182 | 183 | 184 | def normalize_invalid_start_string(s): 185 | if len(s) < 2: 186 | return s 187 | # we put the vowel in second place if the string starts with a vowel 188 | if is_vowel(s[0]) and not is_vowel(s[1]) and not is_suffix(s[1]): 189 | return s[1] + s[0] + (s[2:] if len(s) > 2 else "") 190 | if is_suffix(s[0]): 191 | return s[1:] 192 | return s 193 | 194 | 195 | def test_normalize_unicode(): 196 | assert_conv("\u0F7B\u0F56", "\u0F56\u0F7B", False) 197 | assert_conv("\u0f40\u0f77", "\u0f40\u0fb2\u0f71\u0f80", False) 198 | assert_conv("\u0f40\u0f7e\u0f7c\u0f74\u0f71", "\u0f40\u0f74\u0f71\u0f7c\u0f7e") 199 | assert_conv("\u0f58\u0f74\u0fb0\u0f83", "\u0f58\u0fb0\u0f74\u0f83") 200 | assert_conv("\u0F51\u0FB7\u0F74\u0FB0", "\u0F51\u0FB7\u0fb0\u0F74") 201 | assert_conv("\u0F66\u0F7C\u0FB1", "\u0F66\u0FB1\u0F7C") 202 | assert_conv("\u0F0B\u0F7E", "\u0F0B\u0F7E", False) 203 | assert_conv("\u0f65\u0f99\u0f7a\u0f7a", "\u0f62\u0f99\u0f7a\u0f7a") 204 | assert_conv("\u0f01\u0f83", "\u0f01\u0f83") # should be valid 205 | 206 | 207 | if __name__ == "__main__": 208 | test_normalize_unicode() -------------------------------------------------------------------------------- /botok/vars.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from enum import Enum, IntEnum 3 | 4 | __version__ = "0.9.0" 5 | 6 | NO_POS = "NOPOS" 7 | TSEK = "་" 8 | NAMCHE = "ཿ" 9 | SHAD = "།" 10 | AA = "འ" 11 | HASH = "#" 12 | VOWELS = ["ི"] 13 | NO_SHAD_CONS = ["ཀ", "ག", "ཤ"] 14 | DAGDRA = ["པ་", "པོ་", "བ་", "བོ་"] 15 | 
16 | CharMarkers = IntEnum( 17 | "CharMarkers", 18 | [ 19 | # regular Tibetan 20 | "CONS", 21 | "SUB_CONS", 22 | "VOW", 23 | "TSEK", 24 | # punctuation 25 | "NORMAL_PUNCT", 26 | "SPECIAL_PUNCT", 27 | # others 28 | "NUMERAL", 29 | "SYMBOL", 30 | "IN_SYL_MARK", 31 | "NON_BO_NON_SKRT", 32 | # lexica_skrt 33 | "SKRT_CONS", 34 | "SKRT_SUB_CONS", 35 | "SKRT_VOW", 36 | "SKRT_LONG_VOW", 37 | # other languages 38 | "CJK", 39 | "LATIN", 40 | # misc 41 | "OTHER", 42 | "TRANSPARENT", 43 | "NFC", 44 | ], 45 | start=1, 46 | ) 47 | char_values = {c.value: c.name for c in CharMarkers} 48 | 49 | ChunkMarkers = IntEnum( 50 | "ChunkMarkers", 51 | [ 52 | # languages 53 | "BO", 54 | "LATIN", 55 | "CJK", 56 | "OTHER", 57 | # tibetan textual content 58 | "TEXT", 59 | # tibetan non-textual content 60 | "PUNCT", 61 | "NON_PUNCT", 62 | "SPACE", 63 | "NON_SPACE", 64 | "SYM", 65 | "NON_SYM", 66 | "NUM", 67 | "NON_NUM", 68 | ], 69 | start=100, 70 | ) 71 | chunk_values = {c.value: c.name for c in ChunkMarkers} 72 | 73 | WordMarkers = IntEnum("WordMarkers", ["WORD", "NO_POS", "NON_WORD"], start=1000) 74 | word_values = {w.value: w.name for w in WordMarkers} 75 | 76 | Ids = Enum("Ids", ["profile", "prep", "tok", "mod", "form"]) 77 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Generating the documentation 2 | 3 | To generate the documentation, you first have to build it. Several packages are necessary to build the doc, 4 | you can install them with the following command, at the root of the code repository: 5 | 6 | ```bash 7 | pip install -e ".[docs]" 8 | ``` 9 | 10 | --- 11 | **NOTE** 12 | 13 | You only need to generate the documentation to inspect it locally (if you're planning changes and want to 14 | check how they look like before committing for instance). You don't have to commit the built documentation. 15 | 16 | --- 17 | 18 | ## Packages installed 19 | 20 | Here's an overview of all the packages installed. If you ran the previous command installing all packages from 21 | `requirements.txt`, you do not need to run the following commands. 22 | 23 | Building it requires the package `sphinx` that you can 24 | install using: 25 | 26 | ```bash 27 | pip install -U sphinx 28 | ``` 29 | 30 | You would also need the custom installed [theme](https://github.com/readthedocs/sphinx_rtd_theme) by 31 | [Read The Docs](https://readthedocs.org/). 
You can install it using the following command: 32 | 33 | ```bash 34 | pip install sphinx_rtd_theme 35 | ``` 36 | 37 | The third necessary package is the `recommonmark` package to accept Markdown as well as Restructured text: 38 | 39 | ```bash 40 | pip install recommonmark 41 | ``` 42 | 43 | ## Building the documentation 44 | 45 | Once you have setup `sphinx`, you can build the documentation by running the following command in the `/docs` folder: 46 | 47 | ```bash 48 | make html 49 | ``` 50 | 51 | A folder called ``_build/html`` should have been created. You can now open the file ``_build/html/index.html`` in your 52 | browser. 53 | 54 | --- 55 | **NOTE** 56 | 57 | If you are adding/removing elements from the toc-tree or from any structural item, it is recommended to clean the build 58 | directory before rebuilding. Run the following command to clean and build: 59 | 60 | ```bash 61 | make clean && make html 62 | ``` 63 | 64 | --- 65 | 66 | It should build the static app that will be available under `/docs/_build/html` 67 | 68 | ## Adding a new element to the tree (toc-tree) 69 | 70 | Accepted files are reStructuredText (.rst) and Markdown (.md). Create a file with its extension and put it 71 | in the source directory. You can then link it to the toc-tree by putting the filename without the extension. 72 | 73 | ## Preview the documentation in a pull request 74 | 75 | Once you have made your pull request, you can check what the documentation will look like after it's merged by 76 | following these steps: 77 | 78 | - Look at the checks at the bottom of the conversation page of your PR (you may need to click on "show all checks" to 79 | expand them). 80 | - Click on "details" next to the `ci/circleci: build_doc` check. 81 | - In the new window, click on the "Artifacts" tab. 82 | - Locate the file "docs/_build/html/index.html" (or any specific page you want to check) and click on it to get a 83 | preview. 84 | 85 | ## Writing Documentation - Specification 86 | 87 | The `huggingface/transformers` documentation follows the 88 | [Google documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) style. It is 89 | mostly written in ReStructuredText 90 | ([Sphinx simple documentation](https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html), 91 | [Sourceforge complete documentation](https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html)) 92 | 93 | ### Adding a new section 94 | 95 | A section is a page held in the `Notes` toc-tree on the documentation. Adding a new section is done in two steps: 96 | 97 | - Add a new file under `./source`. This file can either be ReStructuredText (.rst) or Markdown (.md). 98 | - Link that file in `./source/index.rst` on the correct toc-tree. 99 | 100 | ### Adding a new model 101 | 102 | When adding a new model: 103 | 104 | - Create a file `xxx.rst` under `./source/model_doc`. 105 | - Link that file in `./source/index.rst` on the `model_doc` toc-tree. 106 | - Write a short overview of the model: 107 | - Overview with paper & authors 108 | - Paper abstract 109 | - Tips and tricks and how to use it best 110 | - Add the classes that should be linked in the model. This generally includes the configuration, the tokenizer, and 111 | every model of that class (the base model, alongside models with additional heads), both in PyTorch and TensorFlow. 
112 | The order is generally: 113 | - Configuration, 114 | - Tokenizer 115 | - PyTorch base model 116 | - PyTorch head models 117 | - TensorFlow base model 118 | - TensorFlow head models 119 | 120 | These classes should be added using the RST syntax, usually as follows: 121 | ``` 122 | XXXConfig 123 | ~~~~~~~~~~~~~~~~~~~~~ 124 | 125 | .. autoclass:: transformers.XXXConfig 126 | :members: 127 | ``` 128 | 129 | This will include every public method of the configuration. If for some reason you wish for a method not to be 130 | displayed in the documentation, you can do so by specifying which methods should be in the docs: 131 | 132 | ``` 133 | XXXTokenizer 134 | ~~~~~~~~~~~~~~~~~~~~~ 135 | 136 | .. autoclass:: transformers.XXXTokenizer 137 | :members: build_inputs_with_special_tokens, get_special_tokens_mask, 138 | create_token_type_ids_from_sequences, save_vocabulary 139 | 140 | ``` 141 | 142 | ### Writing source documentation 143 | 144 | Values that should be put in `code` should either be surrounded by double backticks: \`\`like so\`\` or be written as 145 | an object using the :obj: syntax: :obj:\`like so\`. 146 | 147 | When mentioning a class, it is recommended to use the :class: syntax as the mentioned class will be automatically 148 | linked by Sphinx: :class:\`transformers.XXXClass\` 149 | 150 | When mentioning a function, it is recommended to use the :func: syntax as the mentioned method will be automatically 151 | linked by Sphinx: :func:\`transformers.XXXClass.method\` 152 | 153 | Links should be done as follows (note the double underscore at the end): \`text for the link <./local-link-or-global-link#loc>\`__ 154 | 155 | #### Defining arguments in a method 156 | 157 | Arguments should be defined with the `Args:` prefix, followed by a line return and an indentation. 158 | The argument should be followed by its type, with its shape if it is a tensor, and a line return. 159 | Another indentation is necessary before writing the description of the argument. 160 | 161 | Here's an example showcasing everything so far: 162 | 163 | ``` 164 | Args: 165 | input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): 166 | Indices of input sequence tokens in the vocabulary. 167 | 168 | Indices can be obtained using :class:`transformers.AlbertTokenizer`. 169 | See :func:`transformers.PreTrainedTokenizer.encode` and 170 | :func:`transformers.PreTrainedTokenizer.__call__` for details. 171 | 172 | `What are input IDs? <../glossary.html#input-ids>`__ 173 | ``` 174 | 175 | #### Writing a multi-line code block 176 | 177 | Multi-line code blocks can be useful for displaying examples. They are done like so: 178 | 179 | ``` 180 | Example:: 181 | 182 | # first line of code 183 | # second line 184 | # etc 185 | ``` 186 | 187 | The `Example` string at the beginning can be replaced by anything as long as there are two colons following it. 188 | 189 | #### Writing a return block 190 | 191 | The return block should be introduced with the `Returns:` prefix, followed by a line return and an indentation. 192 | The first line should be the type of the return, followed by a line return. No need to indent further for the elements 193 | building the return.
194 | 195 | Here's an example for tuple return, comprising several objects: 196 | 197 | ``` 198 | Returns: 199 | :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: 200 | loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: 201 | Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. 202 | prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) 203 | Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 204 | ``` 205 | 206 | Here's an example for a single value return: 207 | 208 | ``` 209 | Returns: 210 | A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 211 | ``` 212 | -------------------------------------------------------------------------------- /docs/old-docs/README.md: -------------------------------------------------------------------------------- 1 | ## Overview 2 | 3 | pybo is a word tokenizer for the Tibetan language written in Python. pybo takes in chunks of text and returns lists of words. It provides an easy-to-use, high-performance tokenization pipeline that can serve as a stand-alone solution or be adapted as a compliment. 4 | 5 | 6 | ## Getting started 7 | 8 | pip install pybo 9 | 10 | Or to install from the latest master branch: 11 | 12 | pip install git+https://github.com/Esukhia/pybo.git 13 | 14 | ## How to use pybo 15 | 16 | #### To initiate the tokenizer together with part-of-speech capability: 17 | 18 | # Initialize the tokenizer 19 | tok = bo.BoTokenizer('POS') 20 | 21 | # Feed it some Tibetan text 22 | input_str = '༄༅། །རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྻ་ཨ་བ་ཏ་ར། བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པ། །སངས་རྒྱས་དང་བྱང་ཆུབ་སེམས་དཔའ་ཐམས་ཅད་ལ་ཕྱག་འཚལ་ལོ། །བདེ་གཤེགས་ཆོས་ཀྱི་སྐུ་མངའ་སྲས་བཅས་དང༌། །ཕྱག་འོས་ཀུན་ལའང་གུས་པར་ཕྱག་འཚལ་ཏེ། །བདེ་གཤེགས་སྲས་ཀྱི་སྡོམ་ལ་འཇུག་པ་ནི། །ལུང་བཞིན་མདོར་བསྡུས་ནས་ནི་བརྗོད་པར་བྱ། །' 23 | 24 | # Run the tokenizer 25 | tokens = tok.tokenize(input_str) 26 | 27 | #### Now in 'tokens' you have an iterable where each token consist of several meta-data: 28 | 29 | # Access the first token in the iterable 30 | tokens[0] 31 | 32 | This yields: 33 | 34 | content: "༄༅། " 35 | char_types: |punct|punct|punct|space| 36 | type: punct 37 | start: 0 38 | len: 4 39 | syls: None 40 | tag: punct 41 | pos: punct 42 | skr: "False" 43 | freq: 0 44 | 45 | notes: 46 | - `start` is the starting index of the current token in the input string. 47 | - `syls` is a list of cleaned syllables, each syllable being represented as a list of indices. 48 | Each index leads to a constituting character within the input string. 49 | 50 | #### How to access all the words in a list 51 | 52 | # iterate through the tokens object to get all the words in a list 53 | [t.content for t in tokens] 54 | 55 | #### How to get all the nouns in a text 56 | 57 | # extract nouns from the tokens 58 | [t.content for t in tokens if t.tag == 'NOUNᛃᛃᛃ'] 59 | 60 | These examples highlight the basic principle of accessing attributes within each token object. 61 | 62 | ## Acknowledgements 63 | 64 | **pybo** is an open source library for Tibetan NLP. 65 | 66 | We are always open to cooperation in introducing new features, tool integrations and testing solutions. 
67 | 68 | Many thanks to the companies and organizations who have supported pybo's development, especially: 69 | 70 | * [Khyentse Foundation](https://khyentsefoundation.org) for contributing USD22,000 to kickstart the project 71 | * The [Barom/Esukhia canon project](http://www.barom.org) for sponsoring training data curation 72 | * [BDRC](https://tbrc.org) for contributing 2 staff for 6 months for data curation 73 | 74 | ## Maintenance 75 | 76 | Build the source dist: 77 | 78 | ``` 79 | rm -rf dist/ 80 | python3 setup.py clean sdist 81 | ``` 82 | 83 | and upload on twine (version >= `1.11.0`) with: 84 | 85 | ``` 86 | twine upload dist/* 87 | ``` 88 | 89 | ## License 90 | 91 | The Python code is Copyright (C) 2019 Esukhia, provided under [Apache 2](LICENSE). 92 | 93 | author: [Drupchen](https://github.com/drupchen) 94 | 95 | contributors: 96 | * [Élie Roux](https://github.com/eroux) 97 | * [Thubten Rinzin](https://github.com/thubtenrigzin) 98 | * [Ngawang Trinley](https://github.com/ngawangtrinley) 99 | * [Mikko Kotila](https://github.com/mikkokotila) 100 | * [Tenzin](https://github.com/10zinten) 101 | -------------------------------------------------------------------------------- /docs/old-docs/cql_readme.md: -------------------------------------------------------------------------------- 1 | # CQL basics 2 | 3 | To use CQL, go to the corpus search and select the CQL option. CQL will not work anywhere else in the interface. Expert users will use CQL for the writing of Word Sketch grammars and term grammars. 4 | 5 | ## Syntax 6 | 7 | With CQL, complex criteria can be set to find one or many tokens. Criteria for each token must be between a pair of square brackets [ ]. The format is: 8 | 9 | [attribute="value"] 10 | 11 | To find the lemma teapot, use 12 | 13 | [lemma="teapot"] 14 | 15 | Each token must be inside its own pair of square brackets. To search for the phrase refill the teapot, use 16 | 17 | [lemma="refill"][lemma="the"][lemma="teapot"] 18 | 19 | ## Spaces 20 | 21 | Spaces have no function in CQL. Feel free to use spaces to make the code more readable. This code is equivalent to the previous one. 22 | 23 | 24 | [ lemma = "refill" ] [ lemma = "the" ] [ lemma= "teapot" ] 25 | 26 | ## Careful with values! 27 | 28 | There should not be any spaces inside quotes. This finds nothing because a lemma cannot start with spaces. 29 | 30 | [lemma=" the"] 31 | 32 | More examples 33 | 34 | | TASK | CQL CODE | RESULT | 35 | | -- | -- | -- | 36 | | find examples of “went” | [word="went"] | concordance of the word went 37 | | find examples of all forms of go | [lemma="go"] | concordance of go, goes, going, gone, went 38 | | find examples of all words tagged with the tag NP | [tag="NP"] | concordance of various words tagged as NP 39 | 40 | 41 | 42 | * Matching on token annotations (properties or attributes), using regular expressions and =, !=, !. Example: [word="bank"] (or just "bank") 43 | * Combining criteria using &, | and !. Parentheses can also be used for grouping. Example: [lemma="bank" & pos="V"] 44 | * Matchall pattern [] matches any token. Example: "a" [] "day" 45 | * Regular expression operators +, *, ?, {n}, {n,m} at the token level. Example: [pos="ADJ"]+ 46 | * Sequences of token constraints. Example: [pos="ADJ"] "cow" 47 | * Operators |, & and parentheses can be used to build complex sequence queries. Example: "happy" "dog" | "sad" "cat" 48 | * Querying with tag positions using e.g. (start of sentence), (end of sentence), (whole sentence) or ... (equivalent to containing ...). Example: "The".
XML attribute values may be used as well, e.g. (“named entities that are persons”). 49 | * Using within and containing operators to find hits inside another set of hits. Example: "you" "are" within 50 | * Using an anchor to capture a token position. Example: "big" A:[]. Captured matches can be used in global constraints (see next item) or processed separately later (using the Java interface; capture information is not yet returned by BlackLab Server). Note that BlackLab can actually capture entire groups of tokens as well, similarly to regular expression engines. 51 | * Global constraints on captured tokens, such as requiring them to contain the same word. Example: "big" A:[] "or" "small" B:[] :: A.word = B.word 52 | -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | sphinx==3.1.2 2 | sphinx_rtd_theme==0.5.0 3 | recommonmark==0.6.0 4 | sphinx_markdown_tables==0.0.15 5 | sphinx_copybutton==0.3.0 -------------------------------------------------------------------------------- /docs/source/acknowledgement.rst: -------------------------------------------------------------------------------- 1 | Acknowledgement 2 | ---------------------------------------------- 3 | botok is an open source library for Tibetan NLP. 4 | 5 | We are always open to cooperation in introducing new features, tool integrations and testing solutions. 6 | 7 | Many thanks to the companies and organizations who have supported botok's development, especially: 8 | 9 | - `Khyentse Foundation <https://khyentsefoundation.org>`_ for contributing USD22,000 to kickstart the project 10 | - The `Barom/OpenPecha canon project <http://www.barom.org>`_ for sponsoring training data curation 11 | - `BDRC <https://tbrc.org>`_ for contributing 2 staff for 6 months for data curation 12 | -------------------------------------------------------------------------------- /docs/source/architecture.rst: -------------------------------------------------------------------------------- 1 | Architecture 2 | ============ 3 | 4 | This document explains the architecture of botok. 5 | 6 | WordTokenizer architecture 7 | -------------------------- 8 | 9 | Following is the architecture diagram of the `WordTokenizer `_ class. 10 | 11 | .. image:: imgs/botok_architecture.svg 12 | :align: center 13 | 14 | 15 | Tokenization workflow 16 | --------------------- 17 | 18 | Here is the botok tokenization workflow with an example. 19 | 20 | ..
code:: 21 | 22 | >>> input_string = "ཀུན་་་དགའི་དོན་གྲུབ།" 23 | >>> from botok import BoSyl, Config, TokChunks, Tokenize, Trie 24 | >>> config = Config() 25 | >>> trie = Trie(BoSyl, profile=config.profile, main_data=config.dictionary, custom_data=config.adjustments) 26 | >>> tok = Tokenize(trie) 27 | >>> preproc = TokChunks(input_string) 28 | >>> preproc.serve_syls_to_trie() 29 | >>> tokens = tok.tokenize(preproc) 30 | >>> 31 | >>> print(*tokens, sep=f"{'='*65}\n\n") 32 | text: "ཀུན་་་དགའི་" 33 | text_cleaned: "ཀུན་དགའི་" 34 | text_unaffixed: "ཀུན་དགའ་" 35 | syls: ["ཀུན", "དགའི"] 36 | senses: | pos: PROPN, freq: 2923, affixed: True | 37 | char_types: |CONS|VOW|CONS|TSEK|TSEK|TSEK|CONS|CONS|CONS|VOW|TSEK| 38 | chunk_type: TEXT 39 | syls_idx: [[0, 1, 2], [6, 7, 8, 9]] 40 | syls_start_end: [{'start': 0, 'end': 6}, {'start': 6, 'end': 11}] 41 | start: 0 42 | len: 11 43 | 44 | ================================================================= 45 | 46 | text: "དོན་གྲུབ" 47 | text_cleaned: "དོན་གྲུབ་" 48 | text_unaffixed: "དོན་གྲུབ་" 49 | syls: ["དོན", "གྲུབ"] 50 | senses: | pos: PROPN, freq: 1316, affixed: False | 51 | char_types: |CONS|VOW|CONS|TSEK|CONS|SUB_CONS|VOW|CONS| 52 | chunk_type: TEXT 53 | syls_idx: [[0, 1, 2], [4, 5, 6, 7]] 54 | syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 8}] 55 | start: 11 56 | len: 8 57 | 58 | ================================================================= 59 | 60 | text: "།" 61 | char_types: |NORMAL_PUNCT| 62 | chunk_type: PUNCT 63 | start: 19 64 | len: 1 65 | >>> 66 | >>> from botok import AdjustTokens 67 | >>> 68 | >>> adjust_tok = AdjustTokens(main=config.dictionary["rules"], custom=config.adjustments["rules"]) 69 | >>> adjusted_tokens = adjust_tok.adjust(tokens) 70 | >>> print(*adjusted_tokens, sep=f"{'='*65}\n\n") 71 | text: "ཀུན་་་དགའི་" 72 | text_cleaned: "ཀུན་དགའི་" 73 | text_unaffixed: "ཀུན་དགའ་" 74 | syls: ["ཀུན", "དགའི"] 75 | senses: | pos: PROPN, freq: 2923, affixed: True | 76 | char_types: |CONS|VOW|CONS|TSEK|TSEK|TSEK|CONS|CONS|CONS|VOW|TSEK| 77 | chunk_type: TEXT 78 | syls_idx: [[0, 1, 2], [6, 7, 8, 9]] 79 | syls_start_end: [{'start': 0, 'end': 6}, {'start': 6, 'end': 11}] 80 | start: 0 81 | len: 11 82 | 83 | ================================================================= 84 | 85 | text: "དོན་གྲུབ" 86 | text_cleaned: "དོན་གྲུབ་" 87 | text_unaffixed: "དོན་གྲུབ་" 88 | syls: ["དོན", "གྲུབ"] 89 | senses: | pos: PROPN, freq: 1316, affixed: False | 90 | char_types: |CONS|VOW|CONS|TSEK|CONS|SUB_CONS|VOW|CONS| 91 | chunk_type: TEXT 92 | syls_idx: [[0, 1, 2], [4, 5, 6, 7]] 93 | syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 8}] 94 | start: 11 95 | len: 8 96 | 97 | ================================================================= 98 | 99 | text: "།" 100 | char_types: |NORMAL_PUNCT| 101 | chunk_type: PUNCT 102 | start: 19 103 | len: 1 -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. 
If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | 16 | sys.path.insert(0, os.path.abspath("../../")) 17 | 18 | 19 | # -- Project information ----------------------------------------------------- 20 | 21 | project = "botok" 22 | copyright = "2020-2025, OpenPecha" 23 | author = "OpenPecha" 24 | 25 | 26 | # -- General configuration --------------------------------------------------- 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = [ 32 | "sphinx.ext.autodoc", 33 | "sphinx.ext.coverage", 34 | "sphinx.ext.napoleon", 35 | "recommonmark", 36 | "sphinx.ext.viewcode", 37 | "sphinx_markdown_tables", 38 | "sphinx_copybutton", 39 | ] 40 | 41 | # Add any paths that contain templates here, relative to this directory. 42 | templates_path = ["_templates"] 43 | 44 | # The suffix(es) of source filenames. 45 | # You can specify multiple suffix as a list of string: 46 | # 47 | source_suffix = [".rst", ".md"] 48 | 49 | # The master toctree document. 50 | master_doc = "index" 51 | 52 | # List of patterns, relative to source directory, that match files and 53 | # directories to ignore when looking for source files. 54 | # This pattern also affects html_static_path and html_extra_path. 55 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 56 | 57 | # Remove the prompt when copying examples 58 | copybutton_prompt_text = ">>> " 59 | 60 | # -- Options for HTML output ------------------------------------------------- 61 | 62 | # The theme to use for HTML and HTML Help pages. See the documentation for 63 | # a list of builtin themes. 64 | # 65 | html_theme = "sphinx_rtd_theme" 66 | 67 | # Theme options are theme-specific and customize the look and feel of a theme 68 | # further. For a list of options available for each theme, see the 69 | # documentation. 70 | # 71 | html_theme_options = {"analytics_id": "UA-83738774-2"} 72 | 73 | # Add any paths that contain custom static files (such as style sheets) here, 74 | # relative to this directory. They are copied after the builtin static files, 75 | # so a file named "default.css" will overwrite the builtin "default.css". 76 | html_static_path = ["_static"] 77 | 78 | # This must be the name of an image file (path relative to the configuration 79 | # directory) that is the favicon of the docs. Modern browsers use this as 80 | # the icon for tabs, windows and bookmarks. It should be a Windows-style 81 | # icon file (.ico). 
82 | # html_favicon = "favicon.ico" 83 | 84 | # -- Autodoc configuration -------------------------------------------------- 85 | 86 | # Autodoc settings 87 | autodoc_member_order = 'bysource' 88 | autoclass_content = 'both' 89 | autodoc_typehints = 'description' 90 | 91 | # Napoleon settings 92 | napoleon_google_docstring = True 93 | napoleon_numpy_docstring = False 94 | napoleon_include_init_with_doc = True 95 | napoleon_include_private_with_doc = False 96 | napoleon_include_special_with_doc = True 97 | napoleon_use_admonition_for_examples = False 98 | napoleon_use_admonition_for_notes = False 99 | napoleon_use_admonition_for_references = False 100 | napoleon_use_ivar = False 101 | napoleon_use_param = True 102 | napoleon_use_rtype = True 103 | napoleon_type_aliases = None 104 | -------------------------------------------------------------------------------- /docs/source/custom-dialect-pack.rst: -------------------------------------------------------------------------------- 1 | Custom Dialect Pack 2 | =================== 3 | 4 | Why Custom Dialect Pack 5 | ----------------------- 6 | 7 | - For domain specific tokenization 8 | - Improving tokenization accuracy 9 | 10 | 11 | Example 12 | ------- 13 | 14 | To use a custom dialect pack for tokenization, all we have to do is to create a `botok.Config` object with path to the custom dialect pack and use this config for creating word tokenizer. 15 | 16 | First, create config for the custom dialect pack. 17 | 18 | .. code:: 19 | 20 | >>> from botok import Config 21 | >>> config = Config.from_path('custom/dialect/pack/path') 22 | 23 | Then, create word tokenizer with that same config. 24 | 25 | .. code:: 26 | 27 | >>> from botok import WordTokenizer 28 | >>> wt = WordTokenizer(config=config) 29 | >>> wt.tokenize("མཐའི་བཀྲ་ཤིས། ཀཀ abc མཐའི་རྒྱ་མཚོ་") 30 | -------------------------------------------------------------------------------- /docs/source/getting-started.rst: -------------------------------------------------------------------------------- 1 | Getting Started with Botok 2 | ========================== 3 | 4 | Installation 5 | ------------ 6 | 7 | .. Caution:: 8 | 9 | Botok only supports Python 3.6 or higher 10 | 11 | Install pre-built botok with pip: 12 | 13 | .. code-block:: 14 | 15 | $ pip install botok 16 | 17 | Install from the latest Master branch of botok with pip: 18 | 19 | .. code-block:: 20 | 21 | $ pip install git+https://github.com/OpenPecha/botok.git 22 | 23 | Install for developers, build botok from source: 24 | 25 | .. code-block:: 26 | 27 | $ git clone https://github.com/OpenPecha/botok.git 28 | $ cd botok 29 | $ python -m venv .env 30 | $ source .env/bin/activate # On Windows: .env\Scripts\activate 31 | $ pip install -e . 32 | 33 | Usage 34 | ----- 35 | 36 | Here is the simple usage of botok to tokenize Tibetan text: 37 | 38 | Import the botok tokenizer called WordTokenizer: 39 | 40 | .. code-block:: 41 | 42 | >>> from botok import WordTokenizer 43 | >>> 44 | >>> tokenizer = WordTokenizer() 45 | Building Trie... (12 s.) 46 | 47 | Tokenize the given text: 48 | 49 | .. 
code-block:: 50 | 51 | >>> input_str = '༆ ཤི་བཀྲ་ཤིས་ tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་༡༢༣ཀཀ།མཐའི་རྒྱ་མཚོར་གནས་པའི་ཉས་ཆུ་འཐུང་།། །།མཁའ།' 52 | >>> tokens = tokenizer.tokenize(input_str) 53 | >>> print(f'The output is a {type(tokens)}') 54 | The output is a 55 | >>> print(f'The constituting elements are {type(tokens[0])}') 56 | The constituting elements are 57 | 58 | Now in 'tokens' you have an iterable where each token consists of several meta-data in attributes of Token Object: 59 | 60 | .. code-block:: 61 | 62 | >>> tokens[0] 63 | content: "༆ " 64 | char_types: |punct|space| 65 | type: punct 66 | start: 0 67 | len: 2 68 | tag: punct 69 | pos: punc 70 | 71 | 72 | Custom dialect pack 73 | ------------------ 74 | 75 | In order to use a custom dialect pack: 76 | 77 | 1. Prepare your dialect pack in the same folder structure as the `general dialect pack `_ 78 | 2. Instantiate a config object where you pass the dialect name and path 79 | 3. Instantiate your tokenizer object using that config object 80 | 4. Your tokenizer will use your custom dialect pack and will use a trie pickled file in the future to build the custom trie 81 | 82 | .. code-block:: 83 | 84 | from botok import WordTokenizer 85 | from botok.config import Config 86 | from pathlib import Path 87 | 88 | def get_tokens(wt, text): 89 | tokens = wt.tokenize(text, split_affixes=False) 90 | return tokens 91 | 92 | if __name__ == "__main__": 93 | config = Config(dialect_name="custom", base_path=Path.home()) 94 | wt = WordTokenizer(config=config) 95 | text = "བཀྲ་ཤིས་བདེ་ལེགས་ཞུས་རྒྱུ་ཡིན་ སེམས་པ་སྐྱིད་པོ་འདུག།" 96 | tokens = get_tokens(wt, text) 97 | for token in tokens: 98 | print(token) 99 | 100 | Advanced Usage 101 | ------------- 102 | 103 | For more advanced usage, including POS tagging and lemmatization, see the :doc:`advanced guides `. 104 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. botok documentation master file, created by 2 | sphinx-quickstart on Thu Jul 30 12:30:47 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Botok 7 | =================================================================================================== 8 | State-of-the-art tokenizers for Tibetan language. 9 | 10 | This is the documentation of our repository `botok `_. 11 | 12 | Features 13 | -------------------------------- 14 | 15 | - Support various dialects 16 | - Word segmentation with support for affixed particles 17 | - Multiple tokenization modes (chunks, spaces, words) 18 | - Rich token attributes (lemma, POS, clean form) 19 | - File and string input processing 20 | - Word frequency counting 21 | - Handles complex cases like double tseks and spaces within words 22 | 23 | Contents 24 | ---------------------------------- 25 | 26 | .. toctree:: 27 | :maxdepth: 2 28 | :caption: Overview 29 | 30 | getting-started 31 | acknowledgement 32 | 33 | .. toctree:: 34 | :maxdepth: 2 35 | :caption: Advanced guides 36 | 37 | architecture 38 | custom-dialect-pack 39 | 40 | .. 
toctree:: 41 | :maxdepth: 2 42 | :caption: Package Reference 43 | 44 | main_classes/configuration 45 | -------------------------------------------------------------------------------- /docs/source/main_classes/configuration.rst: -------------------------------------------------------------------------------- 1 | Configuration 2 | ------------------------------------------------------- 3 | 4 | ``Config`` 5 | ~~~~~~~~~~ 6 | 7 | .. autoclass:: botok.Config 8 | :members: -------------------------------------------------------------------------------- /python-3.13.2-amd64.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/python-3.13.2-amd64.exe -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | black 2 | isort 3 | pytest>=5.0.0 4 | coveralls 5 | covdefaults 6 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | 4 | [coverage:run] 5 | plugins = covdefaults 6 | omit = 7 | .env/* 8 | botok/third_party/pynpl/cql.py 9 | botok/third_party/pynpl/fsa.py 10 | 11 | [semantic_release] 12 | version_variable = botok/vars.py:__version__ 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # coding: utf8 3 | 4 | from __future__ import print_function 5 | 6 | import re 7 | from pathlib import Path 8 | 9 | import setuptools 10 | from pkg_resources import parse_version 11 | 12 | assert parse_version(setuptools.__version__) >= parse_version("38.6.0") 13 | 14 | 15 | def get_version(prop, project): 16 | project = Path(__file__).parent / project / "vars.py" 17 | result = re.search( 18 | r'{}\s*=\s*[\'"]([^\'"]*)[\'"]'.format(prop), 19 | project.read_text(encoding="utf-8-sig"), 20 | ) 21 | return result.group(1) 22 | 23 | 24 | def read(fname): 25 | p = Path(__file__).parent / fname 26 | with p.open(encoding="utf-8-sig") as f: 27 | return f.read() 28 | 29 | 30 | setuptools.setup( 31 | name="botok", 32 | version=get_version("__version__", "botok"), # edit version in botok/vars.py 33 | author="OpenPecha development team", 34 | author_email="openpecha@gmail.com", 35 | description="Tibetan Word Tokenizer", 36 | license="Apache2", 37 | keywords="nlp computational_linguistics tibetan tokenizer token", 38 | url="https://github.com/OpenPecha/botok", 39 | packages=setuptools.find_packages(), 40 | long_description=read("README.md"), 41 | long_description_content_type="text/markdown", 42 | project_urls={ 43 | "Source": "https://github.com/OpenPecha/botok", 44 | "Tracker": "https://github.com/OpenPecha/botok/issues", 45 | }, 46 | classifiers=[ 47 | "Development Status :: 3 - Alpha", 48 | "Topic :: Text Processing :: Linguistic", 49 | "Programming Language :: Python :: 3", 50 | "Operating System :: OS Independent", 51 | "Intended Audience :: Developers", 52 | "Intended Audience :: Science/Research", 53 | "License :: OSI Approved :: Apache Software License", 54 | "Natural Language :: Tibetan", 55 | ], 56 | package_data={ 57 | "botok": [ 58 | "resources/*", 59 | "resources/words_bo/*", 60 | "resources/entry_data/*", 61 | "resources/words_non_inflected/*", 62 | 
"resources/lemmas/*", 63 | "resources/rules/*", 64 | "resources/words_skrt/*", 65 | "resources/adjustment/*", 66 | ] 67 | }, 68 | python_requires=">=3.7", 69 | tests_require=["pytest>=5.0.0"], 70 | install_requires=["pyyaml", "requests"], 71 | ) 72 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/tests/__init__.py -------------------------------------------------------------------------------- /tests/chunks/test_chunkframework.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from botok import ChunkFramework 3 | from botok import ChunkMarkers as c 4 | 5 | 6 | def test_bo_nonbo(): 7 | string = 'བཀྲ་་ཤིས་བདེ་ལེགས། 23PIEIUZLDVéjoldvép«»("«»%=' 8 | cb = ChunkFramework(string) 9 | chunks = cb.chunk_bo_chars() 10 | 11 | output = cb.get_readable(chunks) 12 | assert output == [ 13 | ("BO", "བཀྲ་་ཤིས་བདེ་ལེགས། "), 14 | ("OTHER", '23PIEIUZLDVéjoldvép«»("«»%='), 15 | ] 16 | 17 | 18 | def test_punct_nonpunct(): 19 | string = "༆ བཀྲ་ཤིས་བདེ་ལེགས།། །།" 20 | cb = ChunkFramework(string) 21 | chunks = cb.chunk_punct() 22 | 23 | output = cb.get_readable(chunks) 24 | assert output == [ 25 | ("PUNCT", "༆ "), 26 | ("NON_PUNCT", "བཀྲ་ཤིས་བདེ་ལེགས"), 27 | ("PUNCT", "།། །།"), 28 | ] 29 | 30 | 31 | def test_sym_nonsym(): 32 | string = "བཀྲ་ཤིས་བདེ་ལེགས། ༪༫༝༜༛༚༇༆" 33 | cb = ChunkFramework(string) 34 | chunks = cb.chunk_symbol() 35 | 36 | output = cb.get_readable(chunks) 37 | assert output == [ 38 | ("NON_SYM", "བཀྲ་ཤིས་བདེ་ལེགས།"), 39 | ("SYM", " ༪༫༝༜༛༚"), 40 | ("NON_SYM", "༇༆"), 41 | ] 42 | 43 | 44 | def test_num_nonnum(): 45 | string = "བཀྲ་ཤིས་བདེ་ལེགས། ༡༢༣༠༩༨" 46 | cb = ChunkFramework(string) 47 | chunks = cb.chunk_number() 48 | 49 | output = cb.get_readable(chunks) 50 | assert output == [("NON_NUM", "བཀྲ་ཤིས་བདེ་ལེགས།"), ("NUM", " ༡༢༣༠༩༨")] 51 | 52 | 53 | def test_space_nonspace(): 54 | string = "བཀྲ་ཤིས་བདེ་ལེགས། །བཀྲ་ཤིས་བདེ་ལེགས།" 55 | cb = ChunkFramework(string) 56 | chunks = cb.chunk_spaces() 57 | 58 | output = cb.get_readable(chunks) 59 | assert output == [ 60 | ("NON_SPACE", "བཀྲ་ཤིས་བདེ་ལེགས།"), 61 | ("SPACE", " "), 62 | ("NON_SPACE", "།བཀྲ་ཤིས་བདེ་ལེགས།"), 63 | ] 64 | 65 | 66 | def test_text(): 67 | string = "བཀྲ་ཤིས་བདེ་ལེགས" 68 | cb = ChunkFramework(string) 69 | chunks = cb.syllabify() 70 | 71 | output = cb.get_readable(chunks) 72 | assert output == [ 73 | ("TEXT", "བཀྲ་"), 74 | ("TEXT", "ཤིས་"), 75 | ("TEXT", "བདེ་"), 76 | ("TEXT", "ལེགས"), 77 | ] 78 | 79 | 80 | def test_latin(): 81 | string = "བཀྲ་ཤིས་བདེ་ལེགས This is a test." 
82 | cb = ChunkFramework(string) 83 | chunks = cb.chunk_latin() 84 | 85 | output = cb.get_readable(chunks) 86 | assert output == [("OTHER", "བཀྲ་ཤིས་བདེ་ལེགས"), ("LATIN", " This is a test.")] 87 | 88 | 89 | def test_cjk(): 90 | string = "བཀྲ་ཤིས་བདེ་ལེགས 这是 什么" 91 | cb = ChunkFramework(string) 92 | chunks = cb.chunk_cjk() 93 | 94 | output = cb.get_readable(chunks) 95 | assert output == [("OTHER", "བཀྲ་ཤིས་བདེ་ལེགས"), ("CJK", " 这是 什么")] 96 | 97 | 98 | def test_other(): 99 | string = "བཀྲ་ཤིས་བདེ་ལེགས กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ" 100 | cb = ChunkFramework(string) 101 | chunks = cb.chunk_bo_chars() 102 | 103 | output = cb.get_readable(chunks) 104 | assert output == [ 105 | ("BO", "བཀྲ་ཤིས་བདེ་ལེགས "), 106 | ("OTHER", "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"), 107 | ] 108 | 109 | 110 | def test_full_example(): 111 | # Follows the order implemented in Chunks 112 | string = ( 113 | '༆ བཀྲ་ཤིས་བདེ་ལེགས།། །། 23PIEIUZLDVéjoldvép«»("«»%= ༪༫༝༜༛༚༇༆ ༡༢༣༠༩༨ ' 114 | "This is a test. 这是 什么 กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ" 115 | ) 116 | cb = ChunkFramework(string) 117 | 118 | # BO / OTHER 119 | chunks = cb.chunk_bo_chars() 120 | chunks = cb.clean_chunks(chunks) 121 | output = cb.get_readable(chunks) 122 | assert output == [ 123 | ("BO", "༆ བཀྲ་ཤིས་བདེ་ལེགས།། །། "), 124 | ("OTHER", '23PIEIUZLDVéjoldvép«»("«»%='), 125 | ("BO", " ༪༫༝༜༛༚༇༆ ༡༢༣༠༩༨ "), 126 | ("OTHER", "This is a test. 这是 什么 กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"), 127 | ] 128 | 129 | # BO / PUNCT 130 | chunks = cb.pipe_chunk(chunks, cb.chunk_punct, c.BO.value, c.PUNCT.value) 131 | chunks = cb.clean_chunks(chunks) 132 | output = cb.get_readable(chunks) 133 | assert output == [ 134 | ("PUNCT", "༆ "), 135 | ("BO", "བཀྲ་ཤིས་བདེ་ལེགས"), 136 | ("PUNCT", "།། །། "), # NEW 137 | ("OTHER", '23PIEIUZLDVéjoldvép«»("«»%= '), 138 | ("BO", "༪༫༝༜༛༚"), 139 | ("PUNCT", "༇༆ "), # NEW 140 | ("BO", "༡༢༣༠༩༨ "), 141 | ("OTHER", "This is a test. 这是 什么 กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"), 142 | ] 143 | 144 | # BO / NUM 145 | chunks = cb.pipe_chunk(chunks, cb.chunk_number, c.BO.value, c.NUM.value) 146 | chunks = cb.clean_chunks(chunks) 147 | output = cb.get_readable(chunks) 148 | assert output == [ 149 | ("PUNCT", "༆ "), 150 | ("BO", "བཀྲ་ཤིས་བདེ་ལེགས"), 151 | ("PUNCT", "།། །། "), 152 | ("OTHER", '23PIEIUZLDVéjoldvép«»("«»%= '), 153 | ("BO", "༪༫༝༜༛༚"), 154 | ("PUNCT", "༇༆ "), 155 | ("NUM", "༡༢༣༠༩༨ "), # NEW 156 | ("OTHER", "This is a test. 这是 什么 กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"), 157 | ] 158 | 159 | # BO / SYM 160 | chunks = cb.pipe_chunk(chunks, cb.chunk_symbol, c.BO.value, c.SYM.value) 161 | chunks = cb.clean_chunks(chunks) 162 | output = cb.get_readable(chunks) 163 | assert output == [ 164 | ("PUNCT", "༆ "), 165 | ("BO", "བཀྲ་ཤིས་བདེ་ལེགས"), 166 | ("PUNCT", "།། །། "), 167 | ("OTHER", '23PIEIUZLDVéjoldvép«»("«»%= '), 168 | ("SYM", "༪༫༝༜༛༚"), # NEW 169 | ("PUNCT", "༇༆ "), 170 | ("NUM", "༡༢༣༠༩༨ "), 171 | ("OTHER", "This is a test. 这是 什么 กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"), 172 | ] 173 | 174 | # TEXT 175 | chunks = cb.pipe_chunk(chunks, cb.syllabify, c.BO.value, c.TEXT.value) 176 | chunks = cb.clean_chunks(chunks) 177 | output = cb.get_readable(chunks) 178 | assert output == [ 179 | ("PUNCT", "༆ "), 180 | ("TEXT", "བཀྲ་"), # NEW 181 | ("TEXT", "ཤིས་"), # NEW 182 | ("TEXT", "བདེ་"), # NEW 183 | ("TEXT", "ལེགས"), # NEW 184 | ("PUNCT", "།། །། "), 185 | ("OTHER", '23PIEIUZLDVéjoldvép«»("«»%= '), 186 | ("SYM", "༪༫༝༜༛༚"), 187 | ("PUNCT", "༇༆ "), 188 | ("NUM", "༡༢༣༠༩༨ "), 189 | ("OTHER", "This is a test. 
这是 什么 กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"), 190 | ] 191 | 192 | # OTHER / CJK 193 | chunks = cb.pipe_chunk(chunks, cb.chunk_cjk, c.OTHER.value, c.CJK.value) 194 | chunks = cb.clean_chunks(chunks) 195 | output = cb.get_readable(chunks) 196 | assert output == [ 197 | ("PUNCT", "༆ "), 198 | ("TEXT", "བཀྲ་"), 199 | ("TEXT", "ཤིས་"), 200 | ("TEXT", "བདེ་"), 201 | ("TEXT", "ལེགས"), 202 | ("PUNCT", "།། །། "), 203 | ("OTHER", '23PIEIUZLDVéjoldvép«»("«»%= '), 204 | ("SYM", "༪༫༝༜༛༚"), 205 | ("PUNCT", "༇༆ "), 206 | ("NUM", "༡༢༣༠༩༨ "), 207 | ("OTHER", "This is a test."), 208 | ("CJK", " 这是 什么 "), # NEW 209 | ("OTHER", "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"), 210 | ] 211 | 212 | # OTHER / LATIN 213 | chunks = cb.pipe_chunk(chunks, cb.chunk_latin, c.OTHER.value, c.LATIN.value) 214 | chunks = cb.clean_chunks(chunks) 215 | output = cb.get_readable(chunks) 216 | assert output == [ 217 | ("PUNCT", "༆ "), 218 | ("TEXT", "བཀྲ་"), 219 | ("TEXT", "ཤིས་"), 220 | ("TEXT", "བདེ་"), 221 | ("TEXT", "ལེགས"), 222 | ("PUNCT", "།། །། "), 223 | ("LATIN", '23PIEIUZLDVéjoldvép«»("«»%= '), # NEW 224 | ("SYM", "༪༫༝༜༛༚"), 225 | ("PUNCT", "༇༆ "), 226 | ("NUM", "༡༢༣༠༩༨ "), 227 | ("LATIN", "This is a test."), # NEW 228 | ("CJK", " 这是 什么 "), 229 | ("OTHER", "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"), 230 | ] 231 | -------------------------------------------------------------------------------- /tests/chunks/test_chunks.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from botok import Chunks, TokChunks, TSEK 3 | 4 | string = ( 5 | '༆ བཀྲ་ཤིས་བདེ་ལེགས།། །། 23PIEIUZLDVéjoldvép«»("«»%= ༪༫༝༜༛༚༇༆ ༡༢༣༠༩༨ ' 6 | "This is a test. 这是 什么 กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ" 7 | ) 8 | 9 | 10 | def test_chunks(): 11 | c = Chunks(string) 12 | chunks = c.make_chunks() 13 | output = c.get_readable(chunks) 14 | assert output == [ 15 | ("PUNCT", "༆ "), 16 | ("TEXT", "བཀྲ་"), 17 | ("TEXT", "ཤིས་"), 18 | ("TEXT", "བདེ་"), 19 | ("TEXT", "ལེགས"), 20 | ("PUNCT", "།། །། "), 21 | ("LATIN", '23PIEIUZLDVéjoldvép«»("«»%= '), # NEW 22 | ("SYM", "༪༫༝༜༛༚"), 23 | ("PUNCT", "༇༆ "), 24 | ("NUM", "༡༢༣༠༩༨ "), 25 | ("LATIN", "This is a test."), # NEW 26 | ("CJK", " 这是 什么 "), 27 | ("OTHER", "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"), 28 | ] 29 | 30 | 31 | def test_tokchunks(): 32 | c = TokChunks(string) 33 | c.serve_syls_to_trie() 34 | # generate what the tokenizer will ingest 35 | chunks = c.chunks 36 | assert chunks == [ 37 | (None, (105, 0, 2)), 38 | ([2, 3, 4], (104, 2, 4)), # syllable 1 39 | ([6, 7, 8], (104, 6, 4)), # syllable 2 40 | ([10, 11, 12], (104, 10, 4)), # syllable 3 41 | ([14, 15, 16, 17], (104, 14, 4)), # syllable 4 42 | (None, (105, 18, 6)), 43 | (None, (101, 24, 28)), 44 | (None, (109, 52, 6)), 45 | (None, (105, 58, 3)), 46 | (None, (111, 61, 7)), 47 | (None, (101, 68, 15)), 48 | (None, (102, 83, 8)), 49 | (None, (103, 91, 24)), 50 | ] 51 | 52 | # the second element of each tuple is the chunk from Chunks 53 | readable = [(a[0], c.get_readable([a[1]])[0]) for a in chunks] 54 | assert readable == [ 55 | (None, ("PUNCT", "༆ ")), 56 | ([2, 3, 4], ("TEXT", "བཀྲ་")), 57 | ([6, 7, 8], ("TEXT", "ཤིས་")), 58 | ([10, 11, 12], ("TEXT", "བདེ་")), 59 | ([14, 15, 16, 17], ("TEXT", "ལེགས")), 60 | (None, ("PUNCT", "།། །། ")), 61 | (None, ("LATIN", '23PIEIUZLDVéjoldvép«»("«»%= ')), 62 | (None, ("SYM", "༪༫༝༜༛༚")), 63 | (None, ("PUNCT", "༇༆ ")), 64 | (None, ("NUM", "༡༢༣༠༩༨ ")), 65 | (None, ("LATIN", "This is a test.")), 66 | (None, ("CJK", " 这是 什么 ")), 67 | (None, ("OTHER", "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ")), 68 | ] 69 | 70 | # just for the fun of it: get the cleaned syllable 
as it is done in the Tokenizer 71 | chunks = [ 72 | "".join([string[c] for c in chars]) + TSEK for chars, chunk in chunks if chars 73 | ] 74 | assert chunks == ["བཀྲ་", "ཤིས་", "བདེ་", "ལེགས་"] 75 | -------------------------------------------------------------------------------- /tests/chunks/test_chunktokenizer.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from botok import * 3 | 4 | 5 | def test_chunktokenizer(): 6 | input_str = ( 7 | " ཤི་བཀྲ་ཤིས་ བདེ་་ལ ེ གས་ " 8 | 'བཀྲ་ཤིས་བདེ་ལེགས ༆ བཀྲ་ཤིས་བདེ་ལེགས།། །། 23PIEIUZLDVéjoldvép«»("«»%= ༪༫༝༜༛༚༇༆ ༡༢༣༠༩༨ ' 9 | "This is a test. 这是 什么 กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ" 10 | ) 11 | st = ChunkTokenizer(input_str) 12 | tokens = st.tokenize() 13 | assert tokens == [ 14 | ("TEXT", " ཤི་"), 15 | ("TEXT", "བཀྲ་"), 16 | ("TEXT", "ཤིས་ "), 17 | ("TEXT", "བདེ་་"), 18 | ("TEXT", "ལ ེ གས་ "), 19 | ("TEXT", "བཀྲ་"), 20 | ("TEXT", "ཤིས་"), 21 | ("TEXT", "བདེ་"), 22 | ("TEXT", "ལེགས"), 23 | ("PUNCT", " ༆ "), 24 | ("TEXT", "བཀྲ་"), 25 | ("TEXT", "ཤིས་"), 26 | ("TEXT", "བདེ་"), 27 | ("TEXT", "ལེགས"), 28 | ("PUNCT", "།། །། "), 29 | ("LATIN", '23PIEIUZLDVéjoldvép«»("«»%= '), 30 | ("SYM", "༪༫༝༜༛༚"), 31 | ("PUNCT", "༇༆ "), 32 | ("NUM", "༡༢༣༠༩༨ "), 33 | ("LATIN", "This is a test."), 34 | ("CJK", " 这是 什么 "), 35 | ("OTHER", "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"), 36 | ] 37 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from botok import Config, Tokenize, WordTokenizer 4 | 5 | 6 | @pytest.fixture(scope="session") 7 | def empty_wt(): 8 | """Return empty word tokenizer.""" 9 | config = Config.from_path("./tests/data/empty_dialect_pack") 10 | return WordTokenizer(config=config) 11 | 12 | 13 | @pytest.fixture(scope="session") 14 | def wt(): 15 | """Return default word tokenizer.""" 16 | return WordTokenizer() 17 | -------------------------------------------------------------------------------- /tests/data/empty_dialect_pack/adjustments/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/tests/data/empty_dialect_pack/adjustments/.keep -------------------------------------------------------------------------------- /tests/data/empty_dialect_pack/dictionary/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/tests/data/empty_dialect_pack/dictionary/.keep -------------------------------------------------------------------------------- /tests/data/trie_dialect_pack/adjustments/remove/test.tsv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/tests/data/trie_dialect_pack/adjustments/remove/test.tsv -------------------------------------------------------------------------------- /tests/data/trie_dialect_pack/adjustments/rules/adjust_rules.tsv: -------------------------------------------------------------------------------- 1 | # Syntax for the possible adjustment 2 | # =================================== 3 | # - CQL rules: "" can be used without specifying that there is "text_cleaned=" 4 | # - Index format: either "" or "-" 5 | # - Adjustment format: 6 | # - "+" for merge 7 | # - ":" for split (default: 
syllable mode) 8 | # - "::" for split in character mode 9 | # - "=" for replace 10 | # - Constraint: "-" is only allowed if adjustment is ":" or "::" 11 | 12 | ["ལ་ལ་"] ["ལ་ལ་"] 1 = [pos="PART"] 13 | ["ལ་ལ་"] ["ལ་ལ་"] 2 = [pos="PART"] 14 | ["ལ་ལ་"] ["ལ་ལ་"] 1-2 :: [pos="NOUN"] [pos="PART"] 15 | ["ལ་"] ["ལ་"] ["ལ་ལ་"] 3-2 :: [pos="PART"] [pos="PART"] 16 | ["ལ་"] ["ལ་"] ["ལ་"] ["ལ་"] 2 + [pos="DET"] -------------------------------------------------------------------------------- /tests/data/trie_dialect_pack/adjustments/words/test.tsv: -------------------------------------------------------------------------------- 1 | ཀཀ 2 | ཁཁ 3 | ལྟ། VERB ལྟ། 123 4 | ལྟར། ADV ལྟར། 456 5 | བཀྲ་ཤིས། བཀྲ་ཤིས། 6 | བཀྲིས། བཀྲ་ཤིས། -------------------------------------------------------------------------------- /tests/data/trie_dialect_pack/adjustments/words/test_comma_sep.tsv: -------------------------------------------------------------------------------- 1 | ང་།,PRON 2 | -------------------------------------------------------------------------------- /tests/data/trie_dialect_pack/adjustments/words_skrt/test.tsv: -------------------------------------------------------------------------------- 1 | ཀ་ར། -------------------------------------------------------------------------------- /tests/data/trie_dialect_pack/dictionary/words/empty.tsv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/tests/data/trie_dialect_pack/dictionary/words/empty.tsv -------------------------------------------------------------------------------- /tests/modifytokens/test_matchers.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from pathlib import Path 3 | 4 | import pytest 5 | 6 | from botok import * 7 | 8 | 9 | input_str = " མཐའི་རྒྱ་མཚོའི་གླིང་། ཤི་བཀྲ་ཤིས་ tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་ཀཀ" 10 | 11 | 12 | @pytest.fixture 13 | def tokens_affix_split(wt): 14 | return wt.tokenize(input_str) 15 | 16 | 17 | @pytest.fixture 18 | def tokens(wt): 19 | return wt.tokenize(input_str, split_affixes=False) 20 | 21 | 22 | # IMPORTANT: all the tests have merely been adapted after refactorisation. 23 | # They should be split in tests per file that also show the expected behaviour of every matcher. 
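# --- Editor's note on the adjustment-rule syntax shown above (illustrative) ---
# test_adjust_tokens() below appends
# tests/data/trie_dialect_pack/adjustments/rules/adjust_rules.tsv to the config.
# Per that file's own header, each rule is a CQL match pattern followed by an
# index ("i" or "i-j"), an operator ("+" merge, ":" split by syllable,
# "::" split by character, "=" replace) and a replacement pattern. Read together
# with the assertions in test_adjust_tokens, the rules appear to first split a
# "ལ་ལ་" token apart ("::") and then merge two "ལ་" tokens back into a "ལ་ལ་"
# tagged DET ("+"). This is an inferred reading kept here for orientation only;
# the authoritative behaviour is whatever AdjustTokens asserts in that test.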
24 | 25 | 26 | def test_cql_query(): 27 | query = '[text="ན"] []' 28 | q = Query(query) 29 | assert q 30 | 31 | 32 | def test_dummy_cql(): 33 | test = [ 34 | {"word": "This", "lemma": "this", "tag": "Det"}, 35 | {"word": "is", "lemma": "be", "tag": "Verb"}, 36 | {"word": "it", "lemma": "it", "tag": "Pron"}, 37 | {"word": ".", "lemma": ".", "tag": "Punct"}, 38 | ] 39 | q = '[lemma="this" & tag="Det"] [tag!="ADJ"]' 40 | 41 | matcher = CQLMatcher(q) 42 | matched = matcher.match(test) 43 | assert matched == [(0, 1)] 44 | 45 | 46 | def test_regex_in_cql_query(): 47 | test = [ 48 | {"word": "This", "lemma": "this", "tag": "Det"}, 49 | {"word": "is", "lemma": "be", "tag": "Verb"}, 50 | {"word": "it", "lemma": "it", "tag": "Pron"}, 51 | {"word": ".", "lemma": ".", "tag": "Punct"}, 52 | ] 53 | q = r'[lemma="[^\n\s]+s" & tag="Det"] [tag!="ADJ"]' 54 | 55 | matcher = CQLMatcher(q) 56 | matched = matcher.match(test) 57 | expected = [test[m]["word"] for match in matched for m in match] 58 | assert expected == ["This", "is"] 59 | 60 | 61 | def test_cql(tokens): 62 | query = '[pos="NOUN" & text!=""] []' 63 | matcher = CQLMatcher(query) 64 | slices = matcher.match(tokens) 65 | slice_strings = [ 66 | tuple([tokens[i].text for i in range(start, end + 1)]) for start, end in slices 67 | ] 68 | assert slices == [(0, 1), (1, 2), (2, 3), (5, 6), (7, 8), (9, 10), (10, 11)] 69 | assert slice_strings == [ 70 | (" མཐའི་", "རྒྱ་མཚོའི་"), 71 | ("རྒྱ་མཚོའི་", "གླིང་"), 72 | ("གླིང་", "། "), 73 | ("བཀྲ་ཤིས་ ", "tr "), 74 | ("བདེ་་ལེ གས", "། "), 75 | ("བཀྲ་ཤིས་", "བདེ་ལེགས་"), 76 | ("བདེ་ལེགས་", "ཀཀ"), 77 | ] 78 | 79 | 80 | def test_token_split(tokens): 81 | ts = TokenSplit( 82 | tokens[3], 83 | 1, 84 | token_changes='[chunk_type="SPACE" & pos="PUNCT" & affix_host="False"] []', 85 | ) 86 | first, second = ts.split() 87 | assert first.chunk_type == "SPACE" 88 | assert first.pos == "PUNCT" 89 | 90 | 91 | def test_token_merge(tokens_affix_split): 92 | tm = TokenMerge(tokens_affix_split[0], tokens_affix_split[1]) 93 | merged = tm.merge() 94 | assert merged 95 | 96 | 97 | def test_match_split_char(tokens): 98 | match_query = '[pos="NOUN" & text!=""] []' 99 | replace_idx = 1 # slot number in match query 100 | split_idx = 1 # char index in token.content where split should occur 101 | replace = '[chunk_type="XXX" & pos="xxx"] []' 102 | 103 | sm = SplittingMatcher(match_query, replace_idx, split_idx, tokens, replace) 104 | split_tokens = sm.split_on_matches() 105 | assert len(tokens) == 12 106 | assert len(split_tokens) == 19 107 | 108 | 109 | def test_match_split_syl(tokens): 110 | match_query = '[pos="NOUN" & text!=""] []' 111 | replace_idx = 1 # slot number in match query 112 | split_idx = 1 # char index in token.content where split should occur 113 | replace = '[chunk_type="XXX" & pos="xxx"] []' 114 | 115 | sm = SplittingMatcher(match_query, replace_idx, split_idx, tokens, replace) 116 | split_tokens = sm.split_on_matches(mode="syl") 117 | assert len(tokens) == 12 118 | assert len(split_tokens) == 17 119 | 120 | 121 | def test_match_merge(tokens, tokens_affix_split): 122 | match_query = '[pos="NOUN" & text!=""] []' 123 | replace_idx = 1 # slot number in match query 124 | replace = '[chunk_type="XXX" & pos="xxx"]' 125 | 126 | mm = MergingMatcher(match_query, replace_idx, tokens_affix_split, replace) 127 | merged_tokens = mm.merge_on_matches() 128 | assert len(tokens) == 12 129 | assert len(merged_tokens) == 8 130 | 131 | 132 | def test_match_replace(tokens): 133 | match_query = '[pos="NOUN" & text!=""] []' 134 | replace_idx = 1 
135 | replace = '[chunk_type="XXX" & pos="xxx"]' 136 | 137 | ReplacingMatcher(match_query, replace_idx, tokens, replace).replace_on_matches() 138 | assert len(tokens) == 12 139 | assert tokens[1].pos == "xxx" 140 | assert tokens[4].pos == "VERB" 141 | 142 | 143 | def test_adjust_tokens(wt): 144 | string = "ལ་ལ་ལ་ལ་ལ་བ་ཡོད།" 145 | token_list = wt.tokenize(string, split_affixes=False) 146 | 147 | # add test adjust rule to adjustments rules 148 | wt.config.adjustments["rules"].append( 149 | Path("./tests/data/trie_dialect_pack/adjustments/rules/adjust_rules.tsv") 150 | ) 151 | 152 | at = AdjustTokens( 153 | main=wt.config.dictionary["rules"], custom=wt.config.adjustments["rules"] 154 | ) 155 | adjusted = at.adjust(token_list) 156 | assert token_list[0].text == "ལ་ལ་" 157 | assert token_list[1].text == "ལ་ལ་" 158 | 159 | assert adjusted[0].text == "ལ་" 160 | assert adjusted[0].pos == "NOUN" 161 | assert adjusted[1].text == "ལ་ལ་" 162 | assert adjusted[1].pos == "DET" 163 | assert adjusted[2].text == "ལ་" 164 | assert adjusted[2].pos == "PART" 165 | 166 | 167 | def test_last_token(): 168 | token1 = Token() 169 | token1.pos = "NOUN" 170 | 171 | token2 = Token() 172 | token2.pos = "VERB" 173 | 174 | matcher = CQLMatcher('[pos="NOUN"]') 175 | slices = matcher.match([token1, token2]) 176 | assert slices == [(0, 0)] 177 | 178 | matcher = CQLMatcher('[pos="VERB"]') 179 | slices = matcher.match([token1, token2]) 180 | assert slices == [(1, 1)] 181 | 182 | 183 | def test_merge_dagdra(wt): 184 | token_list = wt.tokenize("བཀྲ་ཤིས་-པ་") 185 | token_list = [ 186 | t for t in token_list if t.text != "-" 187 | ] # remove the "-" inserted to ensure we have two tokens 188 | mp = MergeDagdra() 189 | mp.merge(token_list) 190 | assert len(token_list) == 1 and token_list[0].text == "བཀྲ་ཤིས་པ་" 191 | 192 | token_list = wt.tokenize("བཀྲ་ཤིས་-པའོ།") 193 | token_list = [ 194 | t for t in token_list if t.text != "-" 195 | ] # remove the "-" inserted to ensure we have two tokens 196 | mp.merge(token_list) 197 | assert len(token_list) == 3 and token_list[0].text == "བཀྲ་ཤིས་པ" 198 | -------------------------------------------------------------------------------- /tests/resources/rdr_rules.txt: -------------------------------------------------------------------------------- 1 | object.tag == "SCONJ" : object.conclusion = "SCONJ" 2 | object.prevTag1 == "DET" : object.conclusion = "ADP" 3 | object.word == "སྟེ་" : object.conclusion = "SCONJ" 4 | object.word == "ཅིང་" : object.conclusion = "SCONJ" 5 | object.word == "ཞིང་" : object.conclusion = "NOUN" 6 | object.prevTag1 == "DET" and object.word == "ཤིང་" : object.conclusion = "NOUN" 7 | object.prevTag1 == "NOUN" : object.conclusion = "ADP" 8 | object.word == "སྟེ་" : object.conclusion = "SCONJ" 9 | object.word == "ཏེ་" : object.conclusion = "SCONJ" 10 | object.prevTag1 == "NOUN" and object.word == "ཞིང་" : object.conclusion = "SCONJ" 11 | object.word == "ཤིང་" : object.conclusion = "SCONJ" 12 | object.nextTag2 == "ADP" : object.conclusion = "NOUN" 13 | object.prevTag1 == "PRON" and object.word == "ནས་" : object.conclusion = "ADP" 14 | object.prevWord2 == "སུ་" and object.word == "ནས་" : object.conclusion = "SCONJ" 15 | object.prevTag1 == "NUM" : object.conclusion = "ADP" 16 | object.word == "སྟེ་" : object.conclusion = "SCONJ" 17 | object.prevTag1 == "ADP" : object.conclusion = "NOUN" 18 | object.prevTag2 == "NOUN" and object.prevTag1 == "ADP" and object.word == "ནས་" : object.conclusion = "ADP" 19 | object.word == "ཏེ་" : object.conclusion = "SCONJ" 
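[Editor's illustration — not part of rdr_rules.txt] The file above stores condition → conclusion POS-correction rules in a Ripple-Down-Rules style: each line tests attributes of a token in context (object.word, object.prevTag1, object.nextTag2, …) and, when the condition holds, appears to override the predicted tag. The sources shown in this dump do not include an interpreter for these rules (the file is test data), so the sketch below is only a hedged illustration of evaluating two of the flat rules; TokenContext and apply_flat_rules are hypothetical names, and the real format's nested exception structure is ignored here.

from dataclasses import dataclass


@dataclass
class TokenContext:
    """Hypothetical container mirroring the attributes referenced by the rules."""
    word: str
    tag: str
    prevTag1: str = ""
    nextTag2: str = ""
    conclusion: str = ""


def apply_flat_rules(obj: TokenContext) -> str:
    # Two rules transcribed from the file above (evaluation order here is arbitrary):
    #   object.word == "སྟེ་"    : object.conclusion = "SCONJ"
    #   object.prevTag1 == "DET" : object.conclusion = "ADP"
    if obj.word == "སྟེ་":
        obj.conclusion = "SCONJ"
    elif obj.prevTag1 == "DET":
        obj.conclusion = "ADP"
    else:
        obj.conclusion = obj.tag  # no rule fired: keep the original tag
    return obj.conclusion


# A token that follows a DET is re-tagged ADP by the second rule.
print(apply_flat_rules(TokenContext(word="ནས་", tag="NOUN", prevTag1="DET")))  # -> ADP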
-------------------------------------------------------------------------------- /tests/resources/test.txt: -------------------------------------------------------------------------------- 1 | # this is a test line, followed by an empty line, that should be ignored 2 | 3 | བཀྲ་,NOUN # lines can have either a comma 4 | ཤིས་ NOUN # a space 5 | བཀྲ་ཤིས་ NOUN # or a tab as separator 6 | བདེ་,NOUN 7 | # this line, being empty after removing the comment, should be ignored 8 | ལེགས་,ADJ # a comment preceded by a space 9 | བདེ་ལེགས་,NOUN 10 | བཀྲ་ཤིས་བདེ་ལེགས་,EXCLS # Not so sure about this POS. 11 | -------------------------------------------------------------------------------- /tests/resources/test_file_to_tokenize.txt: -------------------------------------------------------------------------------- 1 | ལེ གས། བཀྲ་ཤིས་མཐའི་ ༆ ཤི་བཀྲ་ཤིས་ tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་༡༢༣ཀཀ། མཐའི་རྒྱ་མཚོར་གནས་པའི་ཉས་ཆུ་འཐུང་།། །།མཁའ། -------------------------------------------------------------------------------- /tests/resources/test_file_to_tokenize_pybo.txt: -------------------------------------------------------------------------------- 1 | ལེ_གས །_ བཀྲ་ ཤིས་ མཐའི་ _༆_ ཤི་ བཀྲ་ ཤིས་__ tr_ བདེ་་ ལེ_གས །_ བཀྲ་ ཤིས་ བདེ་ ལེགས་ ༡༢༣ ཀཀ །_ མཐའི་ རྒྱ་ མཚོར་ གནས་ པའི་ ཉས་ ཆུ་ འཐུང་ །།_།། མཁའ ། -------------------------------------------------------------------------------- /tests/test_bugs.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | import sys 3 | 4 | import pytest 5 | 6 | from botok import TokChunks, Config, Trie, BoSyl, Tokenize, Chunks, ChunkFramework 7 | 8 | 9 | sys.path.append("../") 10 | 11 | 12 | def test_syl_tokenize(): 13 | instr = " མཐའི་རྒྱ་མཚོའི་གླིང་། ཤི་བཀྲ་ཤིས་ tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་ཀཀ" 14 | preprocessed = TokChunks(instr) 15 | preprocessed.serve_syls_to_trie() 16 | config = Config() 17 | trie = Trie(BoSyl, config.profile, config.dictionary, config.adjustments) 18 | tok = Tokenize(trie) 19 | tokens = tok.tokenize(preprocessed) 20 | texts = [t.text for t in tokens] 21 | expected = [ 22 | " མཐའི་", 23 | "རྒྱ་མཚོའི་", 24 | "གླིང་", 25 | "། ", 26 | "ཤི་", 27 | "བཀྲ་ཤིས་ ", 28 | "tr ", 29 | "བདེ་་ལེ གས", 30 | "། ", 31 | "བཀྲ་ཤིས་", 32 | "བདེ་ལེགས་", 33 | "ཀཀ", 34 | ] 35 | # current: [' མཐའི་', 'རྒྱ་མཚོའི་', '། ', 'གླིང་', 'བཀྲ་', 'ཤི་', 'tr ', 'ཤིས་ ', 'བདེ་་ལེ གས', '། ', 'བདེ་', 36 | # 'བཀྲ་ཤིས་', 'ཀཀ', 'ལེགས་'] 37 | assert texts == expected 38 | 39 | 40 | def test_num_lemmas_missing(wt): 41 | in_str = "སྟོང་ཕྲག་བརྒྱ་པ་སུམ་བརྒྱ་པ་བཅུ་པ་ལྔ་པ་" 42 | tokens = wt.tokenize(in_str) 43 | assert [t.lemma for t in tokens] == [ 44 | "སྟོང་ཕྲག་", 45 | "བརྒྱ་པ་", 46 | "སུམ་བརྒྱ་པ་", 47 | "བཅུ་པ་", 48 | "ལྔ་པ་", 49 | ] 50 | 51 | 52 | def test_no_shad_syllable(): 53 | in_str = "ཀ འདི་ ཤི དེ་ག རེད་དོ།" 54 | bo_string = Chunks(in_str) 55 | chunks = bo_string.make_chunks() 56 | chunks = bo_string.get_readable(chunks) 57 | assert chunks == [ 58 | ("TEXT", "ཀ "), 59 | ("TEXT", "འདི་ "), 60 | ("TEXT", "ཤི "), 61 | ("TEXT", "དེ་"), 62 | ("TEXT", "ག "), 63 | ("TEXT", "རེད་"), 64 | ("TEXT", "དོ"), 65 | ("PUNCT", "།"), 66 | ] 67 | 68 | 69 | def test_segmentation_bug(wt): 70 | tokens = wt.tokenize("ལ་པོ་ལ་པོ་ལ་པོ་") 71 | assert len(tokens) == 3 72 | 73 | tokens = wt.tokenize("ལ་མོ་ལ་མོ་ལ་མོ་") 74 | assert len(tokens) == 3 75 | 76 | tokens = wt.tokenize("གྲོགས་པོ་གྲོགས་པོ་གྲོགས་པོ་") 77 | assert len(tokens) == 3 78 | 79 | tokens = wt.tokenize("བདག་པོ་བདག་པོ་བདག་པོ་དང་") 80 | assert len(tokens) == 4 81 | 82 | tokens = wt.tokenize("བདག་པོ་བདག་པོ་བདག་པོ་") 83 
| assert len(tokens) == 3 84 | 85 | tokens = wt.tokenize( 86 | "བདག་པོ་བདག་པོ་བདག་པོ་བདག་པོ་བདག་པོ་བདག་པོ་བདག་པོ་བདག་པོ་བདག་པོ་" 87 | ) 88 | assert len(tokens) == 9 89 | 90 | 91 | def test_keyerror_part_lemma(wt): 92 | tokens = wt.tokenize("ཕའིའོ།") 93 | assert len(tokens) == 3 94 | 95 | 96 | def test_split_token(empty_wt): 97 | wt = empty_wt 98 | wt.tok.trie.rebuild_trie() 99 | wt.tok.trie.inflect_n_modify_trie("འ་") 100 | assert not wt.tok.trie.has_word("ར་")["exists"] 101 | 102 | 103 | def test_missing_entries_n_bad_unaffixed(wt): 104 | input_str = "ཤུ་ཀ་ར་" 105 | tokens = wt.tokenize(input_str, split_affixes=False) 106 | assert [t.text for t in tokens] == ["ཤུ་", "ཀ་ར་"] 107 | assert tokens[0].senses 108 | assert tokens[1].text_unaffixed == "ཀ་ར་" 109 | 110 | 111 | def test_multiple_spaces(): 112 | bo_string = Chunks("ཁྱོ ད་ད ང་") 113 | chunks = bo_string.make_chunks() 114 | chunks = bo_string.get_readable(chunks) 115 | assert chunks[0] == ("TEXT", "ཁྱོ ད་") 116 | assert chunks[1] == ("TEXT", "ད ང་") 117 | assert len(chunks) == 2 118 | 119 | 120 | def test_bug1(wt): 121 | string = "བ་ཀུ་" 122 | tokens = wt.tokenize(string, debug=True) 123 | assert tokens 124 | 125 | 126 | def test_bug2(wt): 127 | string = "བྲ་གྲྀ་" 128 | tokens = wt.tokenize(string, debug=True) 129 | assert tokens 130 | 131 | 132 | def test_many_tseks_in_syllable(): 133 | input_str = " ཤི་བཀྲ་ཤིས་ བདེ་་ལ ེ གས་ བཀྲ་ཤིས་བདེ་ལེགས" 134 | cb = ChunkFramework(input_str) 135 | chunks = cb.syllabify() 136 | readable = cb.get_readable(chunks) 137 | assert readable == [ 138 | ("TEXT", " ཤི་"), 139 | ("TEXT", "བཀྲ་"), 140 | ("TEXT", "ཤིས་"), 141 | ("TEXT", " བདེ་་"), 142 | ("TEXT", "ལ ེ གས་"), 143 | ("TEXT", " བཀྲ་"), 144 | ("TEXT", "ཤིས་"), 145 | ("TEXT", "བདེ་"), 146 | ("TEXT", "ལེགས"), 147 | ] 148 | 149 | chunks = cb.chunk_punct() 150 | chunks = cb.merge_skippable_punct(chunks) 151 | readable = cb.get_readable(chunks) 152 | assert readable == [ 153 | ("NON_PUNCT", " ཤི་བཀྲ་ཤིས་ བདེ་་ལ ེ གས་ བཀྲ་ཤིས་བདེ་ལེགས") 154 | ] 155 | 156 | ck = Chunks(input_str) 157 | chunks = ck.make_chunks() 158 | readable = ck.get_readable(chunks) 159 | assert readable == [ 160 | ("TEXT", " ཤི་"), 161 | ("TEXT", "བཀྲ་"), 162 | ("TEXT", "ཤིས་ "), 163 | ("TEXT", "བདེ་་"), 164 | ("TEXT", "ལ ེ གས་ "), 165 | ("TEXT", "བཀྲ་"), 166 | ("TEXT", "ཤིས་"), 167 | ("TEXT", "བདེ་"), 168 | ("TEXT", "ལེགས"), 169 | ] 170 | 171 | 172 | def test_shad_in_syllable(): 173 | input_str = " tr བདེ་་ལེ གས། བཀྲ་" 174 | ck = Chunks(input_str) 175 | chunks = ck.make_chunks() 176 | readable = ck.get_readable(chunks) 177 | assert readable == [ 178 | ("LATIN", " tr "), 179 | ("TEXT", "བདེ་་"), 180 | ("TEXT", "ལེ གས"), 181 | ("PUNCT", "། "), 182 | ("TEXT", "བཀྲ་"), 183 | ] 184 | 185 | def test_unexpected_skip_syl(wt): 186 | input_strs = ["དེའི་སྒོ་ནས་བསྟན་པ་དང་སེམས་ཅན་ལ་ཕན་ཐོགས་མཛད་ཚུལ།", "དེ་ཁོ་རང་ཡིན་མོད།"] 187 | wt.tok.trie.inflect_n_modify_trie("དང་སེམས་", deactivate=True) # To remove དང་སེམས་ from trie 188 | wt.tok.trie.inflect_n_modify_trie("ཡིན་མོད", deactivate=True) 189 | wt.tok.trie.inflect_n_modify_trie("ཕན་ཐོགས་") 190 | expected_strs = ["དེའི་ སྒོ་ ནས་ བསྟན་པ་ དང་ སེམས་ཅན་ ལ་ ཕན་ཐོགས་ མཛད་ ཚུལ ། ", "དེ་ ཁོ་རང་ ཡིན་ མོད ། "] 191 | result_strs = [] 192 | for input_str in input_strs: 193 | tokens = wt.tokenize(input_str, split_affixes = False) 194 | result_str = '' 195 | for token in tokens: 196 | result_str += f'{token.text} ' 197 | result_strs.append(result_str) 198 | assert expected_strs == result_strs 199 | 200 | 201 | if __name__ == "__main__": 202 | test_split_token() 
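The wt and empty_wt fixtures used throughout test_bugs.py (and the other test modules below) come from tests/conftest.py, which appears in the tree but is not reproduced in this section. Below is a minimal sketch of what such fixtures could look like; it assumes only the WordTokenizer(config=...) and Config.from_path(...) calls already exercised in this test suite, and the real conftest.py may well differ (for instance in fixture scope).

# Hypothetical conftest.py sketch -- not the repository's actual fixtures.
import pytest

from botok import Config, WordTokenizer


@pytest.fixture
def wt():
    # tokenizer backed by the default "general" dialect pack
    return WordTokenizer(config=Config())


@pytest.fixture
def empty_wt():
    # tokenizer backed by the empty dialect pack used by test_split_token()
    return WordTokenizer(config=Config.from_path("./tests/data/empty_dialect_pack"))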
203 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | import copy 3 | from pathlib import Path 4 | 5 | import pytest 6 | 7 | from botok import Config 8 | from botok.config import DEFAULT_BASE_PATH 9 | 10 | 11 | @pytest.fixture(scope="module") 12 | def base_path(): 13 | return DEFAULT_BASE_PATH 14 | 15 | 16 | def test_defaults(base_path): 17 | config = Config() 18 | 19 | # default dialect pach path 20 | assert config.dialect_pack_path == base_path / "general" 21 | assert config.dialect_pack_path.is_dir() 22 | 23 | # Trie data should be .tsv file 24 | for data_type in ["words", "rules"]: 25 | assert data_type in config.dictionary 26 | for data_fn in config.dictionary[data_type]: 27 | assert data_fn.suffix == ".tsv" 28 | 29 | # Segmentation adjustment 30 | for data_type in ["remove", "rules", "words", "words_skrt"]: 31 | assert data_type in config.adjustments 32 | for data_fn in config.adjustments[data_type]: 33 | if data_fn.suffix: 34 | assert data_fn.suffix == ".tsv" 35 | 36 | 37 | def test_custome_dialect_pack(base_path): 38 | config = Config(dialect_name="kangyur") 39 | assert config.dialect_pack_path == base_path / "kangyur" 40 | assert config.dialect_pack_path.is_dir() 41 | 42 | 43 | def test_reset(base_path): 44 | custome_pack_name = "kangyur" 45 | config = Config(dialect_name=custome_pack_name) 46 | assert config.dialect_pack_path == base_path / custome_pack_name 47 | 48 | config.reset() 49 | 50 | assert config.dialect_pack_path == base_path / "general" 51 | 52 | 53 | def test_empty_config(): 54 | config = Config.from_path("./tests/data/empty_dialect_pack") 55 | 56 | assert not config.dictionary 57 | assert not config.adjustments 58 | 59 | 60 | def test_add_dialect_pack(): 61 | config = Config() 62 | old_dictionary = copy.deepcopy(config.dictionary) 63 | old_adjustments = copy.deepcopy(config.adjustments) 64 | 65 | config.add_dialect_pack(Path("./tests/data/trie_dialect_pack")) 66 | 67 | assert config.dictionary != old_dictionary 68 | assert config.adjustments != old_adjustments 69 | -------------------------------------------------------------------------------- /tests/text/test_text_tokenize.py: -------------------------------------------------------------------------------- 1 | from botok.text.tokenize import space_tok, word_tok, sentence_tok, paragraph_tok 2 | from botok.config import Config 3 | 4 | 5 | def test_text_space_tokenizer(): 6 | """Test the space tokenizer functionality.""" 7 | text = "ཀཿ ཐོག་ འབྱམ་ པའཱི་ རོ།" 8 | tokens = space_tok(text) 9 | assert len(tokens) == 5 10 | assert tokens[0] == "ཀཿ" 11 | assert tokens[1] == "ཐོག་" 12 | 13 | 14 | def test_text_word_tokenizer(): 15 | """Test the word tokenizer functionality.""" 16 | text = "ཀཿཐོག་འབྱམ་པའཱི་རོ།" 17 | tokens = word_tok(text) 18 | assert len(tokens) > 0 19 | assert hasattr(tokens[0], "text") 20 | 21 | 22 | def test_text_sentence_tokenizer(): 23 | """Test the sentence tokenizer functionality.""" 24 | text = "ཀཿཐོག་འབྱམ་པའཱི་རོ། འདི་ནི་ཚིག་གྲུབ་གཉིས་པ་ཡིན།" 25 | sentences = sentence_tok(text) 26 | # It should have at least one sentence with tokens 27 | assert len(sentences) > 0 28 | # The expected output format is a dictionary with sentence data 29 | assert isinstance(sentences[0], dict) 30 | assert 'tokens' in sentences[0] 31 | assert isinstance(sentences[0]['tokens'], list) 32 | assert len(sentences[0]['tokens']) > 0 33 | 34 | 35 | def 
test_text_paragraph_tokenizer(): 36 | """Test the paragraph tokenizer functionality.""" 37 | text = "ཀཿཐོག་འབྱམ་པའཱི་རོ།\n\nའདི་ནི་དུམ་བུ་གཉིས་པ་ཡིན།" 38 | paragraphs = paragraph_tok(text) 39 | # It should have at least one paragraph with tokens 40 | assert len(paragraphs) > 0 41 | assert isinstance(paragraphs[0], tuple) 42 | assert len(paragraphs[0]) == 2 43 | # The first element is the paragraph index 44 | assert isinstance(paragraphs[0][0], int) 45 | # The second element is the list of tokens 46 | assert isinstance(paragraphs[0][1], list) 47 | 48 | 49 | def test_text_tokenizers_with_config(): 50 | """Test tokenizers with custom configuration.""" 51 | config = Config() 52 | text = "ཀཿཐོག་འབྱམ་པའཱི་རོ།" 53 | 54 | # Test word tokenizer with config 55 | tokens = word_tok(text, config=config) 56 | assert len(tokens) > 0 57 | 58 | # Test sentence tokenizer with config 59 | sentences = sentence_tok(text, config=config) 60 | assert len(sentences) > 0 61 | 62 | # Test paragraph tokenizer with config 63 | paragraphs = paragraph_tok(text, config=config) 64 | assert len(paragraphs) > 0 65 | -------------------------------------------------------------------------------- /tests/textunits/test_bostring.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | import warnings 3 | 4 | from botok import BoString 5 | from botok import CharMarkers as m 6 | 7 | bo_str = "བཀྲ་ཤིས་ ༡༢༣ tr 就到 郊外玩བདེ་ལེགས།" 8 | 9 | 10 | def test_string(): 11 | """Testing whether, at a given index, the char category corresponds to what is expected.""" 12 | bs = BoString(bo_str) 13 | 14 | idx = 0 15 | assert "བ" == bo_str[idx] 16 | assert m.CONS == bs.base_structure[idx] 17 | 18 | idx = 2 19 | assert "ྲ" == bo_str[idx] 20 | assert m.SUB_CONS == bs.base_structure[idx] 21 | 22 | idx = 7 23 | assert "་" == bo_str[idx] 24 | assert m.TSEK == bs.base_structure[idx] 25 | 26 | idx = 9 27 | assert "༡" == bo_str[idx] 28 | assert m.NUMERAL == bs.base_structure[idx] 29 | 30 | idx = 13 31 | assert "t" == bo_str[idx] 32 | assert m.LATIN == bs.base_structure[idx] 33 | 34 | idx = 17 35 | assert "就" == bo_str[idx] 36 | assert m.CJK == bs.base_structure[idx] 37 | 38 | 39 | def test_warning(): 40 | with warnings.catch_warnings(record=True) as w: 41 | BoString("ༀ་པ་ཊུ་") 42 | assert len(w) == 1 43 | assert ( 44 | str(w[0].message) 45 | == 'Beware of unexpected results: input string contains the non-expanded char "ༀ", found in "ༀ་པ་ཊུ".' 46 | ) 47 | -------------------------------------------------------------------------------- /tests/textunits/test_bosyl.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from botok import BoSyl 3 | 4 | bs = BoSyl() 5 | 6 | 7 | def test_bosyl(): 8 | # is_affixable() Vs. 
SylComponents.is_thame() 9 | assert bs.is_thame("ཀུན") is False and bs.is_affixable("ཀུན") is False 10 | assert bs.is_thame("དེའིའམ") is True and bs.is_affixable("དེའིའམ") is False 11 | assert bs.is_thame("དེའི") is True and bs.is_affixable("དེའི") is False 12 | assert bs.is_thame("ང") is True and bs.is_affixable("ང") is True 13 | 14 | # get_all_affixed() 15 | affixed = bs.get_all_affixed("ང") 16 | assert affixed == [ 17 | ("ངར", {"len": 1, "type": "la", "aa": False}), 18 | ("ངས", {"len": 1, "type": "gis", "aa": False}), 19 | ("ངའི", {"len": 2, "type": "gi", "aa": False}), 20 | ("ངའམ", {"len": 2, "type": "am", "aa": False}), 21 | ("ངའང", {"len": 2, "type": "ang", "aa": False}), 22 | ("ངའོ", {"len": 2, "type": "o", "aa": False}), 23 | ("ངའིའོ", {"len": 4, "type": "gi+o", "aa": False}), 24 | ("ངའིའམ", {"len": 4, "type": "gi+am", "aa": False}), 25 | ("ངའིའང", {"len": 4, "type": "gi+ang", "aa": False}), 26 | ("ངའོའམ", {"len": 4, "type": "o+am", "aa": False}), 27 | ("ངའོའང", {"len": 4, "type": "o+ang", "aa": False}), 28 | ] 29 | 30 | affixed = bs.get_all_affixed("མཐའ") 31 | assert affixed == [ 32 | ("མཐར", {"len": 1, "type": "la", "aa": True}), 33 | ("མཐས", {"len": 1, "type": "gis", "aa": True}), 34 | ("མཐའི", {"len": 2, "type": "gi", "aa": True}), 35 | ("མཐའམ", {"len": 2, "type": "am", "aa": True}), 36 | ("མཐའང", {"len": 2, "type": "ang", "aa": True}), 37 | ("མཐའོ", {"len": 2, "type": "o", "aa": True}), 38 | ("མཐའིའོ", {"len": 4, "type": "gi+o", "aa": True}), 39 | ("མཐའིའམ", {"len": 4, "type": "gi+am", "aa": True}), 40 | ("མཐའིའང", {"len": 4, "type": "gi+ang", "aa": True}), 41 | ("མཐའོའམ", {"len": 4, "type": "o+am", "aa": True}), 42 | ("མཐའོའང", {"len": 4, "type": "o+ang", "aa": True}), 43 | ] 44 | 45 | affixed = bs.get_all_affixed("ཀུན") 46 | assert affixed is None 47 | -------------------------------------------------------------------------------- /tests/textunits/test_sylcomponents.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from botok import SylComponents 3 | 4 | 5 | def test_components(): 6 | sc = SylComponents() 7 | 8 | # A) get_parts() 9 | # 1. (prefix+main-stack, vowel+suffixes) 10 | assert sc.get_parts("བཀྲིས") == ("བཀྲ", "ིས") 11 | # 2. (exceptions, 'x') 12 | assert sc.get_parts("མདྲོན") == ("མདྲོན", "x") 13 | # 3. a list of solutions if there is more than one (not yet encountered) 14 | # 4. None if the syllable is not wellformed 15 | assert sc.get_parts("ཀཀ") is None 16 | 17 | # B) get_mingzhi() 18 | assert sc.get_mingzhi("བསྒྲུབས") == "ྒ" 19 | # the mingzhi that will serve for the particle agreement: 20 | assert sc.get_mingzhi("ཁྱེའུར") == "འ" 21 | # None if more than one solution from get_parts() (not yet encountered) 22 | 23 | # support for dadrag 24 | assert sc.get_mingzhi("ཀུནད") == "ཀ" 25 | 26 | # dadrag normalize 27 | assert sc.normalize_dadrag("ཀུནད") == "ཀུན" 28 | 29 | # C) get_info() 30 | # 1. 'dadrag' 31 | # A syllable that historically received a "da" second suffix. 32 | # As for now, the list contains ["ཀུན", "ཤིན", "འོན"] (See pybo/resources/SylComponents.json) 33 | assert sc.get_info("ཀུན") == "dadrag" 34 | # 2. 'thame' 35 | # A syllable that has the potential of hosting an affixed particle. 36 | # Will be returned for all such syls, whether or not a particle is affixed. 
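    # e.g. in the asserts below, both the affixed form དེའིའམ and the bare affixable syllable ང are reported as 'thame'.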
37 | assert sc.get_info("དེའིའམ") == "thame" 38 | assert sc.get_info("དེའི") == "thame" 39 | # 3 the syllable itself in all other cases 40 | assert sc.get_info("ང") == "thame" 41 | assert sc.get_info("རྒྱལ") == "རྒྱལ" 42 | 43 | # D) is_thame() 44 | # True if the syllabe is affixable or is already affixed, False otherwise 45 | assert sc.is_thame("ཀུན") is False 46 | assert sc.is_thame("དེའིའམ") is True 47 | assert sc.is_thame("དེའི") is True 48 | assert sc.is_thame("ང") is True 49 | -------------------------------------------------------------------------------- /tests/tokenizers/test_sent_par_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pathlib import Path 4 | 5 | from botok import paragraph_tokenizer, sentence_tokenizer 6 | 7 | 8 | text = ( 9 | "བཀུར་བར་མི་འགྱུར་ཞིང༌། །བརྙས་བཅོས་མི་སྙན་རྗོད་པར་བྱེད། །དབང་དང་འབྱོར་པ་ལྡན་པ་ཡི། །རྒྱལ་རིགས་ཕལ་ཆེར་བག་མེད་པས། །" 10 | "མྱོས་པའི་གླང་ཆེན་བཞིན་དུ་འཁྱམས། །དེ་ཡི་འཁོར་ཀྱང་དེ་འདྲར་འགྱུར། །གཞན་ཡང་རྒྱལ་པོ་རྒྱལ་རིགས་ཀྱི། །སྤྱོད་པ་བཟང་ངན་ཅི་འདྲ་བ། །" 11 | "དེ་འདྲའི་ཚུལ་ལ་བལྟས་ནས་སུ། །འབངས་རྣམས་དེ་དང་དེ་འདྲ་སྟེ། །རྒྱལ་པོ་ནོར་ལ་བརྐམས་གྱུར་ན། །ནོར་གྱིས་རྒྱལ་ཁྲིམས་བསླུ་བར་རྩོམ། །" 12 | "མི་བདག་གཡེམ་ལ་དགའ་གྱུར་ན། །འཕྱོན་མའི་ཚོགས་རྣམས་མགོ་འཕང་མཐོ། །ཕྲ་མར་ཉན་ན་དབྱེན་གྱིས་གཏོར། །བརྟག་དཔྱད་མི་ཤེས་རྫུན་གྱིས་སླུ། །" 13 | "ང་ལོ་ཡང་ན་ཀུན་གྱིས་བསྐྱོད། །ངོ་དགར་བརྩི་ན་ཟོལ་ཚིག་སྨྲ། །དེ་དང་དེ་ལ་སོགས་པ་ཡི། །མི་བདག་དེ་ལ་གང་གང་གིས། །" 14 | "བསླུ་བར་རུང་བའི་སྐབས་མཐོང་ན། །གཡོན་ཅན་ཚོགས་ཀྱིས་ཐབས་དེ་སེམས། །མི་རྣམས་རང་འདོད་སྣ་ཚོགས་ལ། །རྒྱལ་པོ་ཀུན་གྱི་ཐུན་མོང་ཕྱིར། །" 15 | "རྒྱལ་པོས་བསམ་གཞིགས་མ་བྱས་ན། །ཐ་མར་རྒྱལ་སྲིད་འཇིག་པར་འགྱུར། །ཆེན་པོའི་གོ་སར་གནས་པ་ལ། །སྐྱོན་ཀྱང་ཡོན་ཏན་ཡིན་ཚུལ་དུ། །" 16 | "འཁོར་ངན་རྣམས་ཀྱིས་ངོ་བསྟོད་སྨྲ། །དེ་ཕྱིར་སྐྱོན་ཡོན་ཤེས་པ་དཀའ། །ལྷག་པར་རྩོད་ལྡན་སྙིགས་མའི་ཚེ། །འཁོར་གྱི་ནང་ན་མ་རབས་མང༌། །" 17 | "སྐྱོན་ཡང་ཡོན་ཏན་ལྟར་མཐོང་ལ། །རང་འདོད་ཆེ་ཞིང་རྒྱལ་པོ་བསླུ། །ཆུས་དང་འཁོར་གྱི་བདེ་ཐབས་ལ། །བསམ་གཞིགས་བྱེད་པ་དཀོན་པའི་ཕྱིར། །" 18 | "རྒྱལ་པོས་ལེགས་པར་དཔྱད་ནས་སུ། །བདེན་པའི་ངག་ལས་" 19 | ) 20 | 21 | 22 | @pytest.fixture 23 | def tokens(wt): 24 | return wt.tokenize(text, split_affixes=True) 25 | 26 | 27 | @pytest.mark.skip(reason="not a config bug") 28 | def test_sent_tokenizer(tokens): 29 | sents = sentence_tokenizer(tokens) 30 | 31 | out = ["".join([word.text for word in s['tokens']]) for s in sents] 32 | expected = [ 33 | "བཀུར་བར་མི་འགྱུར་ཞིང༌། །བརྙས་བཅོས་མི་སྙན་རྗོད་པར་བྱེད། །", 34 | "དབང་དང་འབྱོར་པ་ལྡན་པ་ཡི། །རྒྱལ་རིགས་ཕལ་ཆེར་བག་མེད་པས། །མྱོས་པའི་གླང་ཆེན་བཞིན་དུ་འཁྱམས། །དེ་ཡི་འཁོར་ཀྱང་དེ་འདྲར་འགྱུར། །", 35 | "གཞན་ཡང་རྒྱལ་པོ་རྒྱལ་རིགས་ཀྱི། །སྤྱོད་པ་བཟང་ངན་ཅི་འདྲ་བ། །དེ་འདྲའི་ཚུལ་ལ་བལྟས་ནས་སུ། །འབངས་རྣམས་དེ་དང་དེ་འདྲ་སྟེ། །", 36 | "རྒྱལ་པོ་ནོར་ལ་བརྐམས་གྱུར་ན། །", 37 | "ནོར་གྱིས་རྒྱལ་ཁྲིམས་བསླུ་བར་རྩོམ། །", 38 | "མི་བདག་གཡེམ་ལ་དགའ་གྱུར་ན། །", 39 | "འཕྱོན་མའི་ཚོགས་རྣམས་མགོ་འཕང་མཐོ། །", 40 | "ཕྲ་མར་ཉན་ན་དབྱེན་གྱིས་གཏོར། །", 41 | "བརྟག་དཔྱད་མི་ཤེས་རྫུན་གྱིས་སླུ། །ང་ལོ་ཡང་ན་ཀུན་གྱིས་བསྐྱོད། །", 42 | "ངོ་དགར་བརྩི་ན་ཟོལ་ཚིག་སྨྲ། །", 43 | "དེ་དང་དེ་ལ་སོགས་པ་ཡི། །མི་བདག་དེ་ལ་གང་གང་གིས། །བསླུ་བར་རུང་བའི་སྐབས་མཐོང་ན། །", 44 | "གཡོན་ཅན་ཚོགས་ཀྱིས་ཐབས་དེ་སེམས། །མི་རྣམས་རང་འདོད་སྣ་ཚོགས་ལ། །རྒྱལ་པོ་ཀུན་གྱི་ཐུན་མོང་ཕྱིར། །རྒྱལ་པོས་བསམ་གཞིགས་མ་བྱས་ན། །", 45 | "ཐ་མར་རྒྱལ་སྲིད་འཇིག་པར་འགྱུར། །", 46 | "ཆེན་པོའི་གོ་སར་གནས་པ་ལ། །སྐྱོན་ཀྱང་ཡོན་ཏན་ཡིན་ཚུལ་དུ། །འཁོར་ངན་རྣམས་ཀྱིས་ངོ་བསྟོད་སྨྲ། །", 47 | "དེ་ཕྱིར་སྐྱོན་ཡོན་ཤེས་པ་དཀའ། །", 48 | "ལྷག་པར་རྩོད་ལྡན་སྙིགས་མའི་ཚེ། །འཁོར་གྱི་ནང་ན་མ་རབས་མང༌། །", 49 | "སྐྱོན་ཡང་ཡོན་ཏན་ལྟར་མཐོང་ལ། །རང་འདོད་ཆེ་ཞིང་རྒྱལ་པོ་བསླུ། །ཆུས་དང་འཁོར་གྱི་བདེ་ཐབས་ལ། 
།བསམ་གཞིགས་བྱེད་པ་དཀོན་པའི་ཕྱིར། །རྒྱལ་པོས་ལེགས་པར་དཔྱད་ནས་", 50 | "སུ། །བདེན་པའི་ངག་ལས་", 51 | ] 52 | assert out == expected 53 | 54 | def test_normalized_sentence(tokens): 55 | sents = sentence_tokenizer(tokens) 56 | 57 | norm_sentences = [sentence['norm_sent'] for sentence in sents] 58 | expected = [ 59 | "བཀུར་བ་ -ར་ མི་ འགྱུར་ ཞིང་ ། །", 60 | "བརྙས་བཅོས་ མི་ སྙན་ རྗོད་པ་ -ར་ བྱེད་ ། །", 61 | "དབང་ དང་ འབྱོར་པ་ ལྡན་པ་ ཡི་ ། ། རྒྱལ་རིགས་ ཕལ་ཆེར་ བག་མེད་པ་ -ས་ ། ། མྱོས་པ་ -འི་ གླང་ཆེན་ བཞིན་ དུ་ འཁྱམས་ ། ། དེ་ ཡི་ འཁོར་ ཀྱང་ དེ་ འདྲ་ -ར་ འགྱུར་ ། །", 62 | "གཞན་ ཡང་ རྒྱལ་པོ་ རྒྱལ་རིགས་ ཀྱི་ ། ། སྤྱོད་པ་ བཟང་ངན་ ཅི་འདྲ་བ་ ། ། དེ་ འདྲ་ -འི་ ཚུལ་ ལ་ བལྟས་ ནས་ སུ་ ། ། འབངས་ རྣམས་ དེ་ དང་ དེ་ འདྲ་ སྟེ་ ། །", 63 | "རྒྱལ་པོ་ ནོར་ ལ་ བརྐམས་ གྱུར་ ན་ ། །", 64 | "ནོར་ གྱིས་ རྒྱལ་ཁྲིམས་ བསླུ་བ་ -ར་ རྩོམ་ ། །", 65 | "མི་བདག་ གཡེམ་ ལ་ དགའ་ གྱུར་ ན་ ། །", 66 | "འཕྱོན་མ་ -འི་ ཚོགས་ རྣམས་ མགོ་འཕང་ མཐོ་ ། །", 67 | "ཕྲ་མ་ -ར་ ཉན་ ན་ དབྱེན་ གྱིས་ གཏོར་ ། །", 68 | "བརྟག་དཔྱད་ མི་ ཤེས་ རྫུན་ གྱིས་ སླུ་ ། ། ང་ ལོ་ ཡང་ན་ ཀུན་ གྱིས་ བསྐྱོད་ ། །", 69 | "ངོ་དགའ་ -ར་ བརྩི་ ན་ ཟོལ་ཚིག་ སྨྲ་ ། །", 70 | "དེ་ དང་ དེ་ ལ་སོགས་པ་ ཡི་ ། ། མི་བདག་ དེ་ ལ་ གང་ གང་ གིས་ ། ། བསླུ་བ་ -ར་ རུང་བ་ -འི་ སྐབས་ མཐོང་ ན་ ། །", 71 | "གཡོན་ཅན་ ཚོགས་ ཀྱིས་ ཐབས་ དེ་ སེམས་ ། ། མི་ རྣམས་ རང་འདོད་ སྣ་ཚོགས་ ལ་ ། །", 72 | "རྒྱལ་པོ་ ཀུན་ གྱི་ ཐུན་མོང་ ཕྱིར་ ། ། རྒྱལ་པོ་ -ས་ བསམ་ གཞིགས་ མ་ བྱས་ ན་ ། །", 73 | "ཐ་མ་ -ར་ རྒྱལ་སྲིད་ འཇིག་པ་ -ར་ འགྱུར་ ། །", 74 | "ཆེན་པོ་ -འི་ གོ་ས་ -ར་ གནས་པ་ ལ་ ། །", 75 | "སྐྱོན་ ཀྱང་ ཡོན་ཏན་ ཡིན་ཚུལ་ དུ་ ། ། འཁོར་ ངན་ རྣམས་ ཀྱིས་ ངོ་བསྟོད་ སྨྲ་ ། །", 76 | "དེ་ཕྱིར་ སྐྱོན་ཡོན་ ཤེས་པ་ དཀའ་ ། །", 77 | "ལྷག་པར་ རྩོད་ ལྡན་ སྙིགས་མ་ -འི་ ཚེ་ ། ། འཁོར་ གྱི་ ནང་ ན་མ་ རབས་ མང་ ། །", 78 | "སྐྱོན་ ཡང་ ཡོན་ཏན་ ལྟར་ མཐོང་ ལ་ ། །", 79 | "རང་འདོད་ ཆེ་ ཞིང་ རྒྱལ་པོ་ བསླུ་ ། ། ཆུས་ དང་ འཁོར་ གྱི་ བདེ་ ཐབས་ ལ་ ། །", 80 | "བསམ་ གཞིགས་ བྱེད་པ་ དཀོན་པ་ -འི་ ཕྱིར་ ། ། རྒྱལ་པོ་ -ས་ ལེགས་པ་ -ར་ དཔྱད་ ནས་ སུ་ ། ། བདེན་པ་ -འི་ ངག་ ལས་", 81 | ] 82 | assert norm_sentences == expected 83 | 84 | 85 | def test_par_tokenizer(tokens): 86 | pars = paragraph_tokenizer(tokens) 87 | 88 | out = ["".join([word.text for word in p[1]]) for p in pars] 89 | expected = [ 90 | "བཀུར་བར་མི་འགྱུར་ཞིང༌། །བརྙས་བཅོས་མི་སྙན་རྗོད་པར་བྱེད། །དབང་དང་འབྱོར་པ་ལྡན་པ་ཡི། །རྒྱལ་རིགས་ཕལ་ཆེར་བག་མེད་པས། །" 91 | "མྱོས་པའི་གླང་ཆེན་བཞིན་དུ་འཁྱམས། །དེ་ཡི་འཁོར་ཀྱང་དེ་འདྲར་འགྱུར། །གཞན་ཡང་རྒྱལ་པོ་རྒྱལ་རིགས་ཀྱི། །སྤྱོད་པ་བཟང་ངན་ཅི་འདྲ་བ། །" 92 | "དེ་འདྲའི་ཚུལ་ལ་བལྟས་ནས་སུ། །འབངས་རྣམས་དེ་དང་དེ་འདྲ་སྟེ། །རྒྱལ་པོ་ནོར་ལ་བརྐམས་གྱུར་ན། །ནོར་གྱིས་རྒྱལ་ཁྲིམས་བསླུ་བར་རྩོམ། །" 93 | "མི་བདག་གཡེམ་ལ་དགའ་གྱུར་ན། །འཕྱོན་མའི་ཚོགས་རྣམས་མགོ་འཕང་མཐོ། །ཕྲ་མར་ཉན་ན་དབྱེན་གྱིས་གཏོར། །བརྟག་དཔྱད་མི་ཤེས་རྫུན་གྱིས་སླུ། །" 94 | "ང་ལོ་ཡང་ན་ཀུན་གྱིས་བསྐྱོད། །ངོ་དགར་བརྩི་ན་ཟོལ་ཚིག་སྨྲ། །དེ་དང་དེ་ལ་སོགས་པ་ཡི། །མི་བདག་དེ་ལ་གང་གང་གིས། །བསླུ་བར་རུང་བའི་སྐབས་མཐོང་ན། །", 95 | "གཡོན་ཅན་ཚོགས་ཀྱིས་ཐབས་དེ་སེམས། །མི་རྣམས་རང་འདོད་སྣ་ཚོགས་ལ། །རྒྱལ་པོ་ཀུན་གྱི་ཐུན་མོང་ཕྱིར། །རྒྱལ་པོས་བསམ་གཞིགས་མ་བྱས་ན། །" 96 | "ཐ་མར་རྒྱལ་སྲིད་འཇིག་པར་འགྱུར། །ཆེན་པོའི་གོ་སར་གནས་པ་ལ། །སྐྱོན་ཀྱང་ཡོན་ཏན་ཡིན་ཚུལ་དུ། །འཁོར་ངན་རྣམས་ཀྱིས་ངོ་བསྟོད་སྨྲ། །" 97 | "དེ་ཕྱིར་སྐྱོན་ཡོན་ཤེས་པ་དཀའ། །ལྷག་པར་རྩོད་ལྡན་སྙིགས་མའི་ཚེ། །འཁོར་གྱི་ནང་ན་མ་རབས་མང༌། །སྐྱོན་ཡང་ཡོན་ཏན་ལྟར་མཐོང་ལ། །" 98 | "རང་འདོད་ཆེ་ཞིང་རྒྱལ་པོ་བསླུ། །ཆུས་དང་འཁོར་གྱི་བདེ་ཐབས་ལ། །བསམ་གཞིགས་བྱེད་པ་དཀོན་པའི་ཕྱིར། །རྒྱལ་པོས་ལེགས་པར་དཔྱད་ནས་སུ། །བདེན་པའི་ངག་ལས་", 99 | ] 100 | assert out == expected 101 | -------------------------------------------------------------------------------- /tests/tokenizers/test_splitaffixed.py: 
-------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from botok import * 3 | 4 | 5 | def test_split_token(): 6 | config = Config.from_path("./tests/data/empty_dialect_pack") 7 | wt = WordTokenizer(config=config) 8 | wt.tok.trie.rebuild_trie() 9 | wt.tok.trie.inflect_n_modify_trie("བདེ་བ་") 10 | wt.tok.trie.inflect_n_add_data("བདེ་བ་\t\tNOUN") 11 | wt.tok.trie.inflect_n_modify_trie("གཏན་") 12 | wt.tok.trie.inflect_n_add_data("གཏན་\t\tNOUN") 13 | wt.tok.trie.inflect_n_modify_trie("གྱི་") 14 | wt.tok.trie.inflect_n_add_data("གྱི་\tགི\tPART") 15 | tokens = wt.tokenize("གཏན་གྱི་བདེ་བའི་རྒྱུ།", split_affixes=False) 16 | assert len(tokens) == 5 17 | assert tokens[2].text == "བདེ་བའི་" 18 | tokens = wt.tokenize("གཏན་གྱི་བདེ་བའི་རྒྱུ།") 19 | assert len(tokens) == 6 20 | assert tokens[2].text == "བདེ་བ" 21 | assert tokens[3].text == "འི་" 22 | -------------------------------------------------------------------------------- /tests/tokenizers/test_stack_tokenizer.py: -------------------------------------------------------------------------------- 1 | from botok.tokenizers.stacktokenizer import tokenize_in_stacks 2 | 3 | 4 | def test_stack_tokenizer(): 5 | """Test the stack tokenizer functionality.""" 6 | # Test with standard Tibetan text 7 | assert tokenize_in_stacks("ཀཿཐོག་འབྱམ་པའཱི་རོ།") == ["ཀ", "\u0f7f", "ཐོ", "ག", "་", "འ", "བྱ", "མ", "་", "པ", "འཱི", "་", "རོ", "།"] 8 | 9 | # Test with special character at the beginning 10 | assert tokenize_in_stacks("\u0f7fཀཿ") == ["\u0f7f", "ཀ", "\u0f7f"] 11 | 12 | # Test with empty string 13 | assert tokenize_in_stacks("") == [] 14 | 15 | # Test with single character 16 | assert tokenize_in_stacks("ཀ") == ["ཀ"] 17 | -------------------------------------------------------------------------------- /tests/tokenizers/test_token.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from textwrap import dedent 3 | from pytest import raises 4 | 5 | from botok import * 6 | 7 | 8 | def test_token(): 9 | t = Token() 10 | t.text = "test" 11 | # Token supports access to attributes in two ways (required for CQL found in third_party/cql.py) 12 | assert t.text == t["text"] 13 | assert t._ == t["_"] 14 | 15 | # setting existing attributes like dicts is supported 16 | attrs = {"pos": "NOUN", "freq": "123", "len": 4} 17 | for k, v in attrs.items(): 18 | t[k] = v 19 | assert str(t) == dedent( 20 | """\ 21 | text: "test" 22 | pos: NOUN 23 | freq: 123 24 | start: 0 25 | len: 4 26 | 27 | """ 28 | ) 29 | 30 | # raises an error when trying to add a new attribute 31 | with raises(AttributeError, match=r"Token objects don't have .* as attribute"): 32 | t["non_attr"] = "test" 33 | -------------------------------------------------------------------------------- /tests/tokenizers/test_tokenize.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from textwrap import dedent 3 | 4 | import pytest 5 | 6 | from botok import * 7 | 8 | 9 | @pytest.fixture(scope="module") 10 | def empty_config(): 11 | return Config.from_path("./tests/data/empty_dialect_pack") 12 | 13 | 14 | def test_tokenize(empty_config, wt): 15 | profile = "empty" 16 | config = empty_config 17 | tok = Tokenize(Trie(BoSyl, profile, config.dictionary, config.adjustments)) 18 | tok.trie.inflect_n_modify_trie("བཀྲ་ཤིས་") 19 | tok.trie.inflect_n_add_data("བཀྲ་ཤིས་\tNOUN\t\tབཀྲ་ཤིས་\t17500") 20 | tok.trie.inflect_n_modify_trie("མཐའ་") 21 | 
tok.trie.inflect_n_add_data("མཐའ་\tNOUN") 22 | in_str = "མཐའི་བཀྲ་ཤིས། ཀཀ abc མཐའི་རྒྱ་མཚོ་" 23 | preproc = TokChunks(in_str) 24 | preproc.serve_syls_to_trie() 25 | tokens = tok.tokenize(preproc) 26 | expected = dedent( 27 | """\ 28 | text: "བཀྲ་ཤིས" 29 | text_cleaned: "བཀྲ་ཤིས་" 30 | text_unaffixed: "བཀྲ་ཤིས་" 31 | syls: ["བཀྲ", "ཤིས"] 32 | senses: | pos: NOUN, freq: 17500, sense: བཀྲ་ཤིས་, affixed: False | 33 | char_types: |CONS|CONS|SUB_CONS|TSEK|CONS|VOW|CONS| 34 | chunk_type: TEXT 35 | syls_idx: [[0, 1, 2], [4, 5, 6]] 36 | syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}] 37 | start: 5 38 | len: 7 39 | 40 | """ 41 | ) 42 | str(tokens[0]) 43 | assert str(tokens[1]) == expected 44 | assert tokens[2].text == "། " 45 | assert tokens[2].chunk_type == "PUNCT" 46 | # add sense to བཀྲ་ཤིས་ 47 | wt.tok.trie.inflect_n_add_data("བཀྲ་ཤིས་\tNOUN\t\tབཀྲ་ཤིས་\t17500") 48 | tokens = wt.tokenize(in_str) 49 | expected = dedent( 50 | """\ 51 | text: "བཀྲ་ཤིས" 52 | text_cleaned: "བཀྲ་ཤིས་" 53 | text_unaffixed: "བཀྲ་ཤིས་" 54 | syls: ["བཀྲ", "ཤིས"] 55 | pos: NOUN 56 | lemma: བཀྲ་ཤིས་ 57 | sense: བཀྲ་ཤིས་ 58 | senses: | pos: NOUN, freq: 17204, affixed: False, lemma: བཀྲ་ཤིས་ | pos: NOUN, freq: 17500, sense: བཀྲ་ཤིས་, affixed: False, lemma: བཀྲ་ཤིས་ | 59 | char_types: |CONS|CONS|SUB_CONS|TSEK|CONS|VOW|CONS| 60 | chunk_type: TEXT 61 | freq: 17500 62 | syls_idx: [[0, 1, 2], [4, 5, 6]] 63 | syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}] 64 | start: 5 65 | len: 7 66 | 67 | """ 68 | ) 69 | assert str(tokens[2]) == expected 70 | 71 | 72 | def test_non_max2(empty_config): 73 | profile = "empty" 74 | config = empty_config 75 | tok = Tokenize(Trie(BoSyl, profile, config.dictionary, config.adjustments)) 76 | tok.trie.inflect_n_modify_trie("བཀྲ་ཤིས་") 77 | tok.trie.inflect_n_add_data("བཀྲ་ཤིས་\tNOUN") 78 | tok.trie.inflect_n_modify_trie( 79 | "བཀྲ་ཤིས་བདེ་ལེགས།" 80 | ) # to ensure we're not in a maximal match 81 | preproc = TokChunks("བཀྲ་ཤིས་བདེ་བཀྲ་") 82 | preproc.serve_syls_to_trie() 83 | tokens = tok.tokenize(preproc) 84 | assert tokens[0].text == "བཀྲ་ཤིས་" 85 | assert tokens[0]["senses"][0]["pos"] == "NOUN" 86 | assert tokens[1].text == "བདེ་" 87 | assert tokens[1]["senses"][0]["pos"] == "NON_WORD" 88 | assert tokens[2].text == "བཀྲ་" 89 | assert tokens[2]["senses"][0]["pos"] == "NO_POS" 90 | 91 | 92 | def test_non_max_end_of_string(empty_config): 93 | profile = "empty" 94 | config = empty_config 95 | tok = Tokenize(Trie(BoSyl, profile, config.dictionary, config.adjustments)) 96 | tok.trie.inflect_n_modify_trie("བཀྲ་ཤིས་") 97 | tok.trie.inflect_n_modify_trie( 98 | "བཀྲ་ཤིས་བདེ་ལེགས།" 99 | ) # to ensure we're not in a maximal match 100 | preproc = TokChunks("བཀྲ་ཤིས་བདེ་") 101 | preproc.serve_syls_to_trie() 102 | tokens = tok.tokenize(preproc) 103 | assert tokens[0].text == "བཀྲ་ཤིས་" 104 | assert tokens[1].text == "བདེ་" 105 | 106 | 107 | if __name__ == "__main__": 108 | test_non_max2() 109 | -------------------------------------------------------------------------------- /tests/tokenizers/test_wordtokenizer.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from textwrap import dedent 3 | 4 | from botok import * 5 | 6 | 7 | def test_get_default_lemma(wt): 8 | input_str = "བཀྲ་ཤིས་བདེ་ལེགས། མཐའི་རྒྱ་མཚོར་གནས་སོ།། །།ཀཀ" 9 | config = Config() 10 | profile = config.dialect_pack_path.name 11 | 12 | # reconstitute all the pieces that WordTokenizer gathers 13 | tok = Tokenize(Trie(BoSyl, profile, config.dictionary, 
config.adjustments)) 14 | preproc = TokChunks(input_str) 15 | preproc.serve_syls_to_trie() 16 | tokens = tok.tokenize(preproc) 17 | split_affixed(tokens) 18 | 19 | # if __get_default_lemma() is not run, only the lemmas coming from the lemma folder will be included 20 | # in the Token objects. 21 | assert str(tokens[3]) == dedent( 22 | """\ 23 | text: "མཐ" 24 | text_cleaned: "མཐ" 25 | text_unaffixed: "མཐ" 26 | syls: ["མཐ"] 27 | senses: | pos: NOUN, freq: 45097, affixed: True | 28 | char_types: |CONS|CONS| 29 | chunk_type: TEXT 30 | affix_host: True 31 | syls_idx: [[0, 1]] 32 | syls_start_end: [{'start': 0, 'end': 2}] 33 | start: 18 34 | len: 2 35 | 36 | """ 37 | ) 38 | assert "lemma" not in tokens[3]["senses"][0] 39 | 40 | assert str(tokens[4]) == dedent( 41 | """\ 42 | text: "འི་" 43 | text_cleaned: "འི་" 44 | text_unaffixed: "འི་" 45 | syls: ["འི"] 46 | pos: PART 47 | char_types: |CONS|VOW|TSEK| 48 | chunk_type: TEXT 49 | affix: True 50 | syls_idx: [[0, 1]] 51 | syls_start_end: [{'start': 2, 'end': 5}] 52 | start: 20 53 | len: 3 54 | 55 | """ 56 | ) 57 | 58 | # regular words also have no lemmas 59 | assert "lemma" not in tokens[0]["senses"][0] 60 | 61 | # doing the same thing using WordTokenizer, which will apply its __get_default_lemma() method 62 | # the profile is the same, so no lemma comes from the trie content files. 63 | tokens = wt.tokenize(input_str) 64 | 65 | # the lemma is Token.text_unaffixed with an extra འ and/or a tsek where required 66 | assert str(tokens[3]) == dedent( 67 | """\ 68 | text: "མཐ" 69 | text_cleaned: "མཐ" 70 | text_unaffixed: "མཐ" 71 | syls: ["མཐ"] 72 | pos: NOUN 73 | lemma: མཐའ་ 74 | senses: | pos: NOUN, freq: 45097, affixed: True, lemma: མཐའ་ | 75 | char_types: |CONS|CONS| 76 | chunk_type: TEXT 77 | freq: 45097 78 | affix_host: True 79 | syls_idx: [[0, 1]] 80 | syls_start_end: [{'start': 0, 'end': 2}] 81 | start: 18 82 | len: 2 83 | 84 | """ 85 | ) 86 | assert tokens[3]["senses"][0]["lemma"] == "མཐའ་" 87 | 88 | # for particles, WordTokenizer reads the lemmas from a file and attributes them 89 | assert str(tokens[4]) == dedent( 90 | """\ 91 | text: "འི་" 92 | text_cleaned: "འི་" 93 | text_unaffixed: "འི་" 94 | syls: ["འི"] 95 | pos: PART 96 | lemma: གི་ 97 | senses: | lemma: གི་ | 98 | char_types: |CONS|VOW|TSEK| 99 | chunk_type: TEXT 100 | affix: True 101 | syls_idx: [[0, 1]] 102 | syls_start_end: [{'start': 2, 'end': 5}] 103 | start: 20 104 | len: 3 105 | 106 | """ 107 | ) 108 | 109 | # for regular words, Token.text_unaffixed is simply copied 110 | assert tokens[0]["senses"][0]["lemma"] == "བཀྲ་ཤིས་" 111 | 112 | # non-words do not have lemmas 113 | assert "lemma" not in tokens[10]["senses"][0] 114 | assert tokens[10].text_cleaned == "ཀཀ་" 115 | assert tokens[10].text_unaffixed == "ཀཀ་" 116 | 117 | # Token objects whose chunk_type is not 'TEXT' will be attributed no lemma. 118 | # text_unaffixed and text_cleaned are also empty. 
Token.text must be retrieved 119 | assert tokens[2].text_unaffixed == "" == tokens[2].text_cleaned 120 | 121 | 122 | def test_spaces_as_punct(wt): 123 | input_str = "བ ཀྲ་ཤིས་ བདེ་ལེགས། \nམཐའི་རྒྱ་མཚོར་ག ནས་སོ།། །།ཀཀ" 124 | tokens = wt.tokenize(input_str, spaces_as_punct=True) 125 | assert tokens[0].text == "བ" 126 | assert tokens[1].text == " " 127 | assert tokens[2].text == "ཀྲ་" 128 | assert tokens[8].text == " \n" 129 | 130 | def test_particle_bug(wt): 131 | input_str = "བོད་གིས" 132 | tokens = wt.tokenize(input_str) 133 | assert tokens[1].pos == "PART" -------------------------------------------------------------------------------- /tests/tries/test_basictrie.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from botok import BasicTrie 3 | 4 | 5 | def test_trie(): 6 | trie = BasicTrie() 7 | 8 | # populate the basic trie 9 | words = "hello goo good goodbye help gerald gold tea ted team to too tom stan standard money" 10 | for w in words.split(): 11 | trie.add(w) 12 | 13 | # test word existence. has_word() is not used in pybo. it is only there for testing purposes 14 | assert trie.has_word("goodbye") == {"data": {"_": {}}, "exists": True} 15 | 16 | # add content to data 17 | trie.add_data("goodbye", {"pos": "NOUN"}) 18 | assert trie.has_word("goodbye") == { 19 | "exists": True, 20 | "data": {"_": {}, "senses": [{"pos": "NOUN"}]}, 21 | } 22 | 23 | # adding an empty dict to show it does not replace existing content but updates it 24 | trie.add_data("goodbye", {}) 25 | assert trie.has_word("goodbye") == { 26 | "exists": True, 27 | "data": {"_": {}, "senses": [{"pos": "NOUN"}]}, 28 | } 29 | 30 | # by default, overwrites existing dict values 31 | trie.add_data("goodbye", {"pos": "VERB", "lemma": "goodbye"}) 32 | assert trie.has_word("goodbye") == { 33 | "exists": True, 34 | "data": { 35 | "_": {}, 36 | "senses": [{"pos": "NOUN"}, {"pos": "VERB", "lemma": "goodbye"}], 37 | }, 38 | } 39 | 40 | # deactivates an entry, only modifying the Node.leaf value (bool) instead of removing it from the trie. 
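    # after deactivation, has_word() reports exists=False while the node's data (the senses added above) is left untouched.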
41 | trie.deactivate("goodbye") 42 | assert trie.has_word("goodbye") == { 43 | "exists": False, 44 | "data": { 45 | "_": {}, 46 | "senses": [{"pos": "NOUN"}, {"pos": "VERB", "lemma": "goodbye"}], 47 | }, 48 | } 49 | 50 | # reactivates the entry 51 | trie.deactivate("goodbye", rev=True) 52 | assert trie.has_word("goodbye") == { 53 | "exists": True, 54 | "data": { 55 | "_": {}, 56 | "senses": [{"pos": "NOUN"}, {"pos": "VERB", "lemma": "goodbye"}], 57 | }, 58 | } 59 | 60 | # walk() is used to externalize the walking of the trie 61 | current_node = None # setting an empty variable for the current node 62 | for char in "goodbye": 63 | current_node = trie.walk(char, current_node) 64 | 65 | assert current_node.label == "e" # last char of the word 66 | assert current_node.leaf is True # we reached the end of a word 67 | assert current_node.data == { 68 | "_": {}, 69 | "senses": [{"pos": "NOUN"}, {"pos": "VERB", "lemma": "goodbye"}], 70 | } 71 | -------------------------------------------------------------------------------- /tests/tries/test_trie.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from collections import defaultdict 3 | from pathlib import Path 4 | 5 | from botok import BoSyl, Config, TokChunks, Trie 6 | 7 | config = Config() 8 | 9 | 10 | def syls(string): 11 | return TokChunks(string).get_syls() 12 | 13 | 14 | def test_createtrie(): 15 | profile = "empty" 16 | config = Config.from_path("./tests/data/trie_dialect_pack") 17 | bt = Trie(BoSyl, profile, config.dictionary, config.adjustments) 18 | 19 | # the trie works as expected. but the add() method should never be used directly: 20 | # it does not inflect entries, so the tokenizer won't work as expected. 21 | # be careful only to use it with words that can't ever be inflected, like case particles. 
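    # the assert below shows the consequence: after add("གྲུབ་མཐའ་") alone, the inflected form གྲུབ་མཐའི་ is not found.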
22 | bt.add(syls("གྲུབ་མཐའ་"), {"pos": "NOUN"}) 23 | assert bt.has_word(syls("གྲུབ་མཐའི་")) == {"exists": False, "data": {"_": {}}} 24 | 25 | # use inflect_n_modify_trie() instead, to add entries 26 | bt.inflect_n_modify_trie("གྲུབ་མཐའ་") 27 | assert bt.has_word(syls("གྲུབ་མཐའི་")) == { 28 | "exists": True, 29 | "data": {"_": {}, "affixation": {"len": 2, "type": "gi", "aa": True}}, 30 | } 31 | 32 | bt.inflect_n_modify_trie("ཀ་ར་", skrt=True) 33 | assert bt.has_word(syls("ཀ་རར་")) == { 34 | "exists": True, 35 | "data": { 36 | "_": {}, 37 | "affixation": {"len": 1, "type": "la", "aa": False}, 38 | "skrt": True, 39 | "senses": [{"lemma": "", "affixed": True}], 40 | }, 41 | } # arrives here because skrt was True 42 | 43 | bt.inflect_n_add_data( 44 | "གྲུབ་མཐའ་\t\t\t\t532" 45 | ) # 'freq' is hard-coded in Trie, just as 'lemma' and 'pos' are 46 | assert bt.has_word(syls("གྲུབ་མཐའི་")) == { 47 | "exists": True, 48 | "data": { 49 | "_": {}, 50 | "affixation": {"len": 2, "type": "gi", "aa": True}, 51 | "senses": [{"freq": 532, "affixed": True}], 52 | }, 53 | } # freq is an int 54 | 55 | # just like add() was not meant to be used directly, deactivate() is not 56 | # instead, use bt.inflect_n_modify_trie("word", deactivate=True) 57 | bt.deactivate(syls("ཀ་ར་")) 58 | assert ( 59 | bt.has_word(syls("ཀ་ར་"))["exists"] is False 60 | ) # since 'ཀ་ར་' has been deactivated 61 | 62 | 63 | def test_multiple_words_per_entry(): 64 | profile = "POS" 65 | config = Config.from_path("./tests/data/trie_dialect_pack") 66 | bt = Trie(BoSyl, profile, config.dictionary, config.adjustments) 67 | 68 | res = bt.has_word(syls("ལྟར་")) 69 | assert {"lemma": "ལྟ་", "pos": "VERB", "freq": 123, "affixed": True} in res["data"][ 70 | "senses" 71 | ] 72 | assert {"lemma": "ལྟར་", "pos": "ADV", "freq": 456, "affixed": False} in res[ 73 | "data" 74 | ]["senses"] 75 | -------------------------------------------------------------------------------- /usage.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from botok import WordTokenizer, Text, Config 4 | 5 | ########################################### 6 | in_str = "ལེ གས། བཀྲ་ཤིས་མཐའི་ ༆ ཤི་བཀྲ་ཤིས་ tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་༡༢༣ཀཀ། མཐའི་རྒྱ་མཚོར་གནས་པའི་ཉས་ཆུ་འཐུང་།། །།མཁའ།" 7 | WT = WordTokenizer() 8 | tokens = WT.tokenize(in_str) 9 | 10 | in_str = "ལ་པོ་ལ་པོ་ལ་པོ་" 11 | t = Text(in_str, tok_params={"config": Config()}) 12 | tokens = t.tokenize_words_raw_text 13 | tt = Text( 14 | in_str, tok_params={"config": Config.from_path("./tests/data/trie_dialect_pack")}, 15 | ) 16 | ttokens = tt.tokenize_words_raw_text 17 | print(tokens) 18 | print(ttokens) 19 | ########################################### 20 | 21 | # 22 | # ### Extract token-string / POS pairs ######## 23 | # 24 | # tagged = ['"{}"/{}'.format(w.text, w.pos) for w in tokens] 25 | # print(', '.join(tagged)) 26 | # 27 | # 28 | # ### Extract the cleaned version of the tokens 29 | # 30 | # cleaned = [w.text_cleaned for w in tokens if w.text_cleaned] 31 | # print(' '.join(cleaned)) 32 | --------------------------------------------------------------------------------
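To complement usage.py, which only exercises WordTokenizer and Text, here is a small sketch of the sentence- and paragraph-level helpers covered by tests/text/test_text_tokenize.py. It relies only on the call signatures and output shapes those tests assert (sentence_tok returns dicts carrying a 'tokens' list, paragraph_tok returns (index, tokens) tuples); treat it as an illustration under those assumptions rather than code taken from the repository.

# Sketch: sentence- and paragraph-level tokenization, mirroring tests/text/test_text_tokenize.py.
from botok.config import Config
from botok.text.tokenize import paragraph_tok, sentence_tok, word_tok

config = Config()
text = "ཀཿཐོག་འབྱམ་པའཱི་རོ། འདི་ནི་ཚིག་གྲུབ་གཉིས་པ་ཡིན།"

words = word_tok(text, config=config)            # a list of Token objects
sentences = sentence_tok(text, config=config)    # a list of dicts, each holding a 'tokens' list
paragraphs = paragraph_tok(text, config=config)  # a list of (paragraph_index, tokens) tuples

print(len(words))
print("".join(t.text for t in sentences[0]["tokens"]))
for idx, toks in paragraphs:
    print(idx, "".join(t.text for t in toks))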