├── .coveragerc ├── .github └── workflows │ ├── publish.yaml │ ├── python-package.yml │ └── test.yml ├── .gitignore ├── .readthedocs.yml ├── .travis.yml ├── CHANGELOG.md ├── LICENSE ├── README.md ├── _config.yml ├── botok ├── __init__.py ├── chunks │ ├── __init__.py │ ├── chunkframework.py │ ├── chunkframeworkbase.py │ └── chunks.py ├── config.py ├── modifytokens │ ├── __init__.py │ ├── adjusttokens.py │ ├── cqlmatcher.py │ ├── mergedagdra.py │ ├── mergingmatcher.py │ ├── replacingmatcher.py │ ├── splitaffixed.py │ ├── splittingmatcher.py │ ├── tokenmerge.py │ └── tokensplit.py ├── resources │ ├── README.md │ ├── SylComponents.json │ ├── bo_punct_position.csv │ ├── bo_uni_table.csv │ └── particles.tsv ├── text │ ├── __init__.py │ ├── format.py │ ├── modify.py │ ├── pipelinebase.py │ ├── preprocess.py │ ├── text.py │ └── tokenize.py ├── textunits │ ├── __init__.py │ ├── bostring.py │ ├── bosyl.py │ ├── charcategories.py │ └── sylcomponents.py ├── third_party │ ├── __init__.py │ ├── cqlparser.py │ ├── has_skrt_syl.py │ └── pynpl │ │ ├── __init__.py │ │ ├── cql.py │ │ └── fsa.py ├── tokenizers │ ├── __init__.py │ ├── chunktokenizer.py │ ├── paragraphtokenizer.py │ ├── sentencetokenizer.py │ ├── stacktokenizer.py │ ├── token.py │ ├── tokenize.py │ └── wordtokenizer.py ├── tries │ ├── __init__.py │ ├── basictrie.py │ └── trie.py ├── utils │ ├── __init__.py │ ├── expose_data.py │ ├── helpers.py │ ├── lenient_normalization.py │ └── unicode_normalization.py └── vars.py ├── docs ├── Makefile ├── README.md ├── old-docs │ ├── Behind BoTokenizer.ipynb │ ├── Preprocessing.ipynb │ ├── README.md │ ├── Using BoTokenizer.ipynb │ └── cql_readme.md ├── requirements-docs.txt └── source │ ├── acknowledgement.rst │ ├── architecture.rst │ ├── conf.py │ ├── custom-dialect-pack.rst │ ├── getting-started.rst │ ├── imgs │ └── botok_architecture.svg │ ├── index.rst │ └── main_classes │ └── configuration.rst ├── python-3.13.2-amd64.exe ├── requirements.txt ├── setup.cfg ├── setup.py ├── tests ├── __init__.py ├── chunks │ ├── test_chunkframework.py │ ├── test_chunks.py │ └── test_chunktokenizer.py ├── conftest.py ├── data │ ├── empty_dialect_pack │ │ ├── adjustments │ │ │ └── .keep │ │ └── dictionary │ │ │ └── .keep │ └── trie_dialect_pack │ │ ├── adjustments │ │ ├── remove │ │ │ └── test.tsv │ │ ├── rules │ │ │ └── adjust_rules.tsv │ │ ├── words │ │ │ ├── test.tsv │ │ │ └── test_comma_sep.tsv │ │ └── words_skrt │ │ │ └── test.tsv │ │ └── dictionary │ │ └── words │ │ └── empty.tsv ├── modifytokens │ └── test_matchers.py ├── resources │ ├── rdr_rules.txt │ ├── test.txt │ ├── test_file_to_tokenize.txt │ └── test_file_to_tokenize_pybo.txt ├── test_bugs.py ├── test_config.py ├── text │ ├── test_text.py │ └── test_text_tokenize.py ├── textunits │ ├── test_bostring.py │ ├── test_bosyl.py │ └── test_sylcomponents.py ├── tokenizers │ ├── test_bugs_missing_tokens.py │ ├── test_sent_par_tokenizer.py │ ├── test_splitaffixed.py │ ├── test_stack_tokenizer.py │ ├── test_token.py │ ├── test_tokenize.py │ └── test_wordtokenizer.py └── tries │ ├── test_basictrie.py │ └── test_trie.py └── usage.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = botok 3 | omit = */tests/*,*/test_*,setup.py,botok/utils/expose_data.py,botok/utils/lenient_normalization.py,botok/utils/unicode_normalization.py 4 | 5 | [report] 6 | exclude_lines = 7 | pragma: no cover 8 | def __repr__ 9 | raise NotImplementedError 10 | if __name__ == .__main__.: 11 | pass 12 | raise ImportError 13 | except 
ImportError 14 | fail_under = 80 15 | show_missing = True 16 | -------------------------------------------------------------------------------- /.github/workflows/publish.yaml: -------------------------------------------------------------------------------- 1 | name: Publish 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | permissions: 9 | contents: write 10 | 11 | jobs: 12 | test: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | max-parallel: 4 16 | matrix: 17 | python-version: ["3.8", "3.9", "3.10", "3.11"] 18 | 19 | steps: 20 | - uses: actions/checkout@v1 21 | 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v4 24 | 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install -r requirements.txt 29 | pip install -e . 30 | - name: Run Test 31 | run: | 32 | pytest tests/ 33 | 34 | publish: 35 | needs: test 36 | runs-on: ubuntu-latest 37 | 38 | steps: 39 | - uses: actions/checkout@v2 40 | with: 41 | fetch-depth: 0 42 | 43 | - name: Python Semantic Release 44 | uses: relekang/python-semantic-release@v7.34.6 45 | with: 46 | github_token: ${{ secrets.GITHUB_TOKEN }} 47 | pypi_token: ${{ secrets.PYPI_TOKEN }} 48 | -------------------------------------------------------------------------------- /.github/workflows/python-package.yml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | python-version: ['3.10', '3.12'] 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | cache: 'pip' 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | python -m pip install flake8 pytest pytest-cov codecov 28 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 29 | pip install -e . 30 | - name: Lint with flake8 31 | run: | 32 | # stop the build if there are Python syntax errors or undefined names 33 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 34 | # exit-zero treats all errors as warnings 35 | flake8 . 
--count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 36 | - name: Test with pytest 37 | run: | 38 | pytest -xvs 39 | - name: Generate coverage report 40 | run: | 41 | pytest --cov=botok --cov-report=xml --cov-fail-under=80 42 | - name: Upload coverage to Codecov 43 | uses: codecov/codecov-action@v3 44 | with: 45 | file: ./coverage.xml 46 | fail_ci_if_error: false 47 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | name: Test 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | test: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | max-parallel: 4 16 | matrix: 17 | python-version: ["3.8", "3.9", "3.10", "3.11"] 18 | 19 | steps: 20 | - uses: actions/checkout@v1 21 | 22 | - name: Set up Python ${{ matrix.python-version }} 23 | uses: actions/setup-python@v4 24 | with: 25 | python-version: ${{ matrix.python-version }} 26 | 27 | - name: Install dependencies 28 | run: | 29 | python -m pip install --upgrade pip 30 | pip install -r requirements.txt 31 | pip install -e . 32 | 33 | - name: Run Test 34 | run: | 35 | pytest -vv 36 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.9" 7 | 8 | python: 9 | install: 10 | - requirements: docs/requirements-docs.txt 11 | - method: pip 12 | path: . 13 | 14 | sphinx: 15 | configuration: docs/source/conf.py 16 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: trusty 2 | sudo: required 3 | 4 | before_install: 5 | language: python 6 | python: 7 | - '3.6' 8 | install: 9 | - pip3 install -r requirements.txt 10 | - pip3 show attrs 11 | - pip3 show pytest 12 | - pip3 install -U setuptools 13 | - python3 setup.py install 14 | - pip3 install coveralls 15 | before_script: 16 | - sleep 1 # this is just a placeholder 17 | script: 18 | - coverage run --source=botok -m pytest tests/ 19 | after_success: coveralls 20 | -------------------------------------------------------------------------------- /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-architect -------------------------------------------------------------------------------- /botok/__init__.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from pathlib import Path 3 | 4 | from .chunks.chunkframework import ChunkFramework 5 | from .chunks.chunkframeworkbase import ChunkFrameworkBase 6 | from .chunks.chunks import Chunks, TokChunks 7 | from .config import Config 8 | from .modifytokens.adjusttokens import AdjustTokens 9 | from .modifytokens.cqlmatcher import CQLMatcher 10 | from .modifytokens.mergedagdra import MergeDagdra 11 | from .modifytokens.mergingmatcher import MergingMatcher 12 | from .modifytokens.replacingmatcher import ReplacingMatcher 13 | from .modifytokens.splitaffixed import split_affixed 14 | from .modifytokens.splittingmatcher import SplittingMatcher 15 | from .modifytokens.tokenmerge import TokenMerge 16 | from .modifytokens.tokensplit import TokenSplit 17 | from .text.pipelinebase import PipelineBase 18 | 
from .text.text import Text 19 | from .textunits.bostring import BoString 20 | from .textunits.bosyl import BoSyl 21 | from .textunits.sylcomponents import SylComponents 22 | from .third_party.cqlparser import Query, parse_cql_query, replace_token_attributes 23 | from .tokenizers.chunktokenizer import ChunkTokenizer 24 | from .tokenizers.paragraphtokenizer import paragraph_tokenizer 25 | from .tokenizers.sentencetokenizer import sentence_tokenizer 26 | from .tokenizers.stacktokenizer import tokenize_in_stacks 27 | from .tokenizers.token import Token 28 | from .tokenizers.tokenize import Tokenize 29 | from .tokenizers.wordtokenizer import WordTokenizer 30 | from .tries.basictrie import BasicTrie 31 | from .tries.trie import Trie 32 | from .utils.expose_data import expose_data 33 | from .utils.unicode_normalization import normalize_unicode 34 | 35 | # from .utils.get_data import get_data 36 | from .vars import * 37 | from .vars import __version__ 38 | -------------------------------------------------------------------------------- /botok/chunks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/botok/chunks/__init__.py -------------------------------------------------------------------------------- /botok/chunks/chunks.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from .chunkframework import ChunkFramework 3 | from ..vars import ChunkMarkers as c 4 | from ..vars import CharMarkers as a 5 | 6 | 7 | class Chunks(ChunkFramework): 8 | """ 9 | Produces chunks of the following types: bo, non-bo, punct and syl chunks 10 | 11 | Implements the following chunking pipeline: 12 | chunk "input_str" into BO / OTHER 13 | | chunk BO into PUNCT / BO 14 | | chunk BO into SYM / BO 15 | | chunk BO into NUM / BO 16 | | chunk BO into TEXT (syllables) 17 | | chunk OTHER into CJK / OTHER 18 | | chunk OTHER into LATIN / OTHER 19 | 20 | .. note:: Following Tibetan usage, it does not consider space as a punctuation mark. 21 | Spaces get attached to the chunk preceding them. 
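:Example: a minimal usage sketch; the exact content of ``chunks`` depends on
        ``ChunkFramework.get_chunked()`` and the ``ChunkMarkers`` constants in ``vars.py``,
        so the comments below are indicative rather than verified output.

            >>> from botok import Chunks, TokChunks
            >>> chunks = Chunks("བཀྲ་ཤིས་ tr བདེ་ལེགས།").make_chunks(indices=False)  # marker/substring chunks
            >>> syls = TokChunks("བཀྲ་ཤིས་").get_syls()  # cleaned syllables: ['བཀྲ', 'ཤིས'] (spaces and tseks stripped)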
22 | """ 23 | 24 | def __init__(self, string, ignore_chars=None): 25 | ChunkFramework.__init__(self, string, ignore_chars=ignore_chars) 26 | 27 | def make_chunks(self, indices=True, gen=False, space_as_punct=False): 28 | chunks = self.chunk_bo_chars() 29 | if space_as_punct: 30 | chunks = self.pipe_chunk( 31 | chunks, self.chunk_spaces, to_chunk_marker=c.BO.value, yes=c.PUNCT.value 32 | ) 33 | chunks = self.pipe_chunk( 34 | chunks, self.chunk_punct, to_chunk_marker=c.BO.value, yes=c.PUNCT.value 35 | ) 36 | chunks = self.pipe_chunk(chunks, self.chunk_symbol, c.BO.value, c.SYM.value) 37 | chunks = self.pipe_chunk(chunks, self.chunk_number, c.BO.value, c.NUM.value) 38 | if not space_as_punct: 39 | chunks = self.merge_skippable_punct( 40 | chunks 41 | ) # ensure we have correctly built syls 42 | chunks = self.pipe_chunk(chunks, self.syllabify, c.BO.value, c.TEXT.value) 43 | chunks = self.pipe_chunk(chunks, self.adjust_syls, c.TEXT.value, c.TEXT.value) 44 | chunks = self.pipe_chunk(chunks, self.chunk_cjk, c.OTHER.value, c.CJK.value) 45 | chunks = self.pipe_chunk(chunks, self.chunk_latin, c.OTHER.value, c.LATIN.value) 46 | if not space_as_punct: 47 | chunks = self.merge_skippable_punct(chunks) 48 | if not indices: 49 | return self.get_chunked(chunks, gen=gen) 50 | return chunks 51 | 52 | 53 | class TokChunks(Chunks): 54 | """ 55 | This class uses the chunks produced by ``Chunks`` to identify Tibetan syllables and clean them. 56 | Thus produces pre-processed Tibetan text that can be further processed. 57 | 58 | Every chunk produced by ``Chunks`` is wrapped into a tuple containing: 59 | - either None or a list containing the cleaned syllable 60 | (the indices to every non-space and non-tsek char in every syllable chunk) 61 | - the chunk itself 62 | 63 | """ 64 | 65 | def __init__(self, string, ignore_chars=None, space_as_punct=False): 66 | super().__init__(string, ignore_chars=ignore_chars) 67 | self.chunks = None 68 | self.space_as_punct = space_as_punct 69 | 70 | def serve_syls_to_trie(self): 71 | chunks = [] 72 | for chunk in self.make_chunks(space_as_punct=self.space_as_punct): 73 | if chunk[0] == c.TEXT: 74 | syl = self.__get_text_chars(chunk[1], chunk[1] + chunk[2]) 75 | chunks.append((syl, chunk)) 76 | else: 77 | chunks.append((None, chunk)) 78 | self.chunks = chunks 79 | 80 | def get_syls(self): 81 | syls = [] 82 | for chunk in self.make_chunks(space_as_punct=self.space_as_punct): 83 | if chunk[0] == c.TEXT: 84 | char_idxs = self.__get_text_chars(chunk[1], chunk[1] + chunk[2]) 85 | syls.append("".join([self.bs.string[i] for i in char_idxs])) 86 | return syls 87 | 88 | def __get_text_chars(self, start_idx, end_idx): 89 | """ 90 | Removes all the spaces and tseks from a given syllable by only keeping the characters that 91 | pass ``__is_syl_text()``. 92 | 93 | :param start_idx: starting index of the syllable-chunk to clean 94 | :param end_idx: its ending index 95 | :type start_idx: int 96 | :type end_idx: int 97 | :return: a list of indices corresponding to the chars of the cleaned syllable 98 | """ 99 | return [i for i in range(start_idx, end_idx) if self.__is_syl_text(i)] 100 | 101 | def __is_syl_text(self, char_idx): 102 | """ 103 | Tests whether the character at the given index is part of the cleaned syllable or not. 
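Concretely, every character is kept except tsek marks and transparent
        (ignored) characters; Sanskrit long vowels are explicitly kept as part
        of the syllable.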
104 | """ 105 | return ( 106 | self.bs.base_structure[char_idx] != a.TSEK 107 | and self.bs.base_structure[char_idx] != a.TRANSPARENT 108 | and self.bs.base_structure[char_idx] != a.SKRT_LONG_VOW 109 | ) or self.bs.base_structure[char_idx] == a.SKRT_LONG_VOW 110 | -------------------------------------------------------------------------------- /botok/config.py: -------------------------------------------------------------------------------- 1 | import io 2 | import zipfile 3 | from collections import defaultdict 4 | from pathlib import Path 5 | 6 | import requests 7 | 8 | # Defaults 9 | DEFAULT_BASE_PATH = Path.home() / "Documents" / "pybo" / "dialect_packs" 10 | DEFAULT_DIALECT_PACK = "general" 11 | 12 | 13 | def get_dialect_pack_url(dialect_name, version=None): 14 | # Try 50 times 15 | attempts = 0 16 | 17 | while not version and attempts < 50: 18 | try: 19 | response = requests.get( 20 | "https://api.github.com/repos/Esukhia/botok-data/releases/latest", 21 | timeout=50 22 | ) 23 | version = response.json()["tag_name"] 24 | except (requests.RequestException, KeyError): 25 | pass 26 | 27 | attempts += 1 28 | 29 | return f"https://github.com/Esukhia/botok-data/releases/download/{version}/{dialect_name}.zip" 30 | 31 | 32 | def get_dialect_pack(dialect_name, out_dir, version=None): 33 | out_dir = Path(out_dir) 34 | out_dir.mkdir(exist_ok=True, parents=True) 35 | dialect_pack_path = out_dir / dialect_name 36 | if dialect_pack_path.is_dir(): 37 | return dialect_pack_path 38 | 39 | print(f"[INFO] Downloading {dialect_name} dialect pack ...") 40 | # Download the dialect pack 41 | url = get_dialect_pack_url(dialect_name, version) 42 | r = requests.get(url, stream=True, timeout=50) 43 | 44 | # attempt 50 times to download the zip 45 | check = zipfile.is_zipfile(io.BytesIO(r.content)) 46 | attempts = 0 47 | while not check and attempts < 50: 48 | r = requests.get(url, stream=True, timeout=50) 49 | check = zipfile.is_zipfile(io.BytesIO(r.content)) 50 | attempts += 1 51 | 52 | if not check: 53 | raise IOError("the .zip file couldn't be downloaded.") 54 | 55 | # extract the zip in the current folder 56 | with zipfile.ZipFile(io.BytesIO(r.content)) as z: 57 | z.extractall(path=str(out_dir)) 58 | 59 | print("[INFO] Download completed!") 60 | 61 | return dialect_pack_path 62 | 63 | 64 | class Config: 65 | """botok config for Tibetan dialect pack. 66 | 67 | Each dialect pack has two components: 68 | 1. Dictionary: 69 | - contains all the data required to construct the Trie. 70 | - It should in the directory called `dictionary` inside the dialect pack directory. 71 | 2. Adjustment: 72 | - Contains all the data required to adjust the text segmentation rules. 
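A minimal usage sketch: the first call downloads the default "general" pack from
    the Esukhia/botok-data GitHub releases into ``~/Documents/pybo/dialect_packs``.
    The "words" key below assumes the conventional sub-directory name under
    ``dictionary/`` (as in the test dialect packs) and may differ in a given pack::

        config = Config()                          # default "general" dialect pack
        assert config.profile == "general"
        word_files = config.dictionary["words"]    # list of Path objects to .tsv files
        config.add_dialect_pack(Path("my_pack"))   # merge a custom pack with the same layout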
73 | """ 74 | 75 | def __init__(self, dialect_name=None, base_path=None): 76 | """Create config for given `dialect_name` and stored in `base_path`""" 77 | if not dialect_name: 78 | dialect_name = DEFAULT_DIALECT_PACK 79 | if not base_path: 80 | base_path = DEFAULT_BASE_PATH 81 | dialect_pack_path = get_dialect_pack(dialect_name, base_path) 82 | self.reset(dialect_pack_path) 83 | 84 | def reset(self, dialect_pack_path=None): 85 | """Reset the config to default bo_general_pack.""" 86 | if dialect_pack_path: 87 | self.dialect_pack_path = dialect_pack_path 88 | else: 89 | self.dialect_pack_path = get_dialect_pack( 90 | DEFAULT_DIALECT_PACK, DEFAULT_BASE_PATH 91 | ) 92 | self.dictionary = self._get_pack_component("dictionary") 93 | self.adjustments = self._get_pack_component("adjustments") 94 | 95 | def _get_pack_component(self, pack_component_name, pack_component=None): 96 | """Return all the data_paths of the `pack_component. 97 | 98 | data_paths stored in python `dict` as per the directory 99 | structure of the pack component. 100 | """ 101 | if not pack_component: 102 | pack_component = defaultdict(list) 103 | for path in (self.dialect_pack_path / pack_component_name).iterdir(): 104 | if not path.is_dir(): 105 | continue 106 | data_type = path.name 107 | pack_component[data_type].extend(list(path.rglob("*.tsv"))) 108 | return pack_component 109 | 110 | @classmethod 111 | def from_path(cls, dialect_pack_path): 112 | """Creates config from ``dialect_pack_path``. 113 | 114 | Returns: 115 | :class: `Config`: An instance of a Configuration object 116 | 117 | Examples:: 118 | 119 | config = Config.from_path(path_to_dialect_pack) 120 | assert config.dictionary == True 121 | assert config.adjustments == True 122 | 123 | """ 124 | path = Path(dialect_pack_path) 125 | dialect_name = path.name 126 | base_path = path.parent 127 | return cls(dialect_name, base_path) 128 | 129 | @property 130 | def profile(self): 131 | """Returns profile name of the dialect_pack.""" 132 | return self.dialect_pack_path.name 133 | 134 | def add_dialect_pack(self, path): 135 | """"Merge given dialect_pack at `path` to current dialect_pack.""" 136 | self.dialect_pack_path = path 137 | self._get_pack_component("dictionary", self.dictionary) 138 | self._get_pack_component("adjustments", self.adjustments) 139 | -------------------------------------------------------------------------------- /botok/modifytokens/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/botok/modifytokens/__init__.py -------------------------------------------------------------------------------- /botok/modifytokens/adjusttokens.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import csv 3 | import re 4 | 5 | from .splittingmatcher import SplittingMatcher 6 | from .mergingmatcher import MergingMatcher 7 | from .replacingmatcher import ReplacingMatcher 8 | from ..utils.helpers import decomment_file 9 | 10 | 11 | class AdjustTokens: 12 | """ 13 | Syntax for the .tsv adjustment rules 14 | =================================== 15 | - each rule should be as follows: "\t\t\t" 16 | - comments with # and empty lines are allowed 17 | - CQL rules: "" can be used without specifying that there is "text_cleaned=" 18 | - Index format: either "" or "-" 19 | - Adjustment format: 20 | - "+" for merge 21 | - ":" for split (default: syllable mode) 22 | - "::" for split in character 
mode 23 | - "=" for replace 24 | - Constraint: "-" is only allowed if adjustment is ":" or "::" 25 | """ 26 | 27 | def __init__(self, main=None, custom=None): 28 | self.paths = [] 29 | if custom: 30 | self.paths.extend(custom) 31 | elif main: 32 | self.paths.extend(main) 33 | self.rules = [] 34 | self.parse_rules() 35 | 36 | def no_token_matched(self, matchcql): 37 | matched_tokens = [token for token in re.split(r'(\[.+?\])', matchcql) if token != " " and token != ""] 38 | return len(matched_tokens) 39 | 40 | def adjust(self, token_list): 41 | for rule in self.rules: 42 | if rule["operation"] == "split": 43 | if rule["matchidx"] <= self.no_token_matched(rule['matchcql']): 44 | sm = SplittingMatcher( 45 | rule["matchcql"], 46 | rule["matchidx"], 47 | rule["splitidx"], 48 | token_list, 49 | rule["replacecql"], 50 | ) 51 | token_list = sm.split_on_matches(mode=rule["splitmode"]) 52 | else: 53 | print(f'[ERROR]: No token to spilt with token number {rule["matchidx"]} found in rule {" ".join(rule)}') 54 | elif rule["operation"] == "merge": 55 | if rule["matchidx"] < self.no_token_matched(rule['matchcql']): 56 | mm = MergingMatcher( 57 | rule["matchcql"], rule["matchidx"], token_list, rule["replacecql"] 58 | ) 59 | token_list = mm.merge_on_matches() 60 | else: 61 | print(f'[ERROR]: No token to merge with token number {rule["matchidx"]} found in rule {" ".join(rule)}') 62 | elif rule["operation"] == "repl": 63 | rm = ReplacingMatcher( 64 | rule["matchcql"], rule["matchidx"], token_list, rule["replacecql"] 65 | ) 66 | rm.replace_on_matches() 67 | return token_list 68 | 69 | def parse_rules(self): 70 | """ 71 | Files are sorted before being applied. Thus, filenames 72 | :return: 73 | """ 74 | for rule_file in sorted(self.paths): 75 | for rule in csv.reader( 76 | decomment_file(rule_file.open(encoding="utf-8-sig")), delimiter="\t" 77 | ): 78 | self.rules.append(self.parse_rule(rule)) 79 | 80 | @staticmethod 81 | def parse_rule(rule): 82 | idx_sep = "-" 83 | 84 | # sanity checks 85 | if len(rule) != 4: 86 | raise SyntaxError("There can't be more than three columns per rule.") 87 | if not rule[1]: 88 | raise SyntaxError("There needs to be an index for every rule.") 89 | if idx_sep in rule[1] and rule[2] not in [":", "::"]: 90 | raise SyntaxError( 91 | "The double index in only intended for split adjustments." 92 | ) 93 | if rule[2] not in ["+", "=", ":", "::"]: 94 | raise SyntaxError( 95 | 'The supported operations are either of ["+", "=", ":", "::"].' 
96 | ) 97 | 98 | # parse 99 | rule_dict = { 100 | "matchcql": None, 101 | "matchidx": None, 102 | "operation": None, 103 | "splitidx": None, 104 | "splitmode": None, 105 | "replacecql": None, 106 | } 107 | rule_dict["matchcql"] = rule[0] 108 | if idx_sep in rule[1]: 109 | match_idx, split_idx = rule[1].split("-") 110 | rule_dict["matchidx"] = int(match_idx) 111 | rule_dict["splitidx"] = int(split_idx) 112 | else: 113 | rule_dict["matchidx"] = int(rule[1]) 114 | if rule[2] == "=": 115 | rule_dict["operation"] = "repl" 116 | elif rule[2] == "+": 117 | rule_dict["operation"] = "merge" 118 | elif rule[2] == ":": 119 | rule_dict["operation"] = "split" 120 | rule_dict["splitmode"] = "syl" 121 | elif rule[2] == "::": 122 | rule_dict["operation"] = "split" 123 | rule_dict["splitmode"] = "char" 124 | rule_dict["replacecql"] = rule[3] 125 | 126 | return rule_dict 127 | -------------------------------------------------------------------------------- /botok/modifytokens/cqlmatcher.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from ..third_party.cqlparser import Query 3 | 4 | 5 | class CQLMatcher: 6 | def __init__(self, query): 7 | """ 8 | Creates a matcher object to be later executed against a list of tokens with BoMatcher.match() 9 | 10 | :param query: CQL compliant query string 11 | :type query: string 12 | 13 | """ 14 | self.query = Query(query) 15 | 16 | def match(self, tokens_list): 17 | """ 18 | Runs cql.Query on a slice of the list of tokens for every index in the list. 19 | 20 | :param tokens_list: output of BoTokenizer 21 | :type tokens_list: list of Token objects 22 | :return: a list of matching slices of tokens_list 23 | :rtype: list of tuples with each two values: beginning and end indices 24 | """ 25 | slice_len = len(self.query.tokenexprs) - 1 26 | matches = [] 27 | for i in range(len(tokens_list)): 28 | if i + slice_len <= len(tokens_list) and self.query( 29 | tokens_list[i : i + slice_len + 1] 30 | ): 31 | matches.append((i, i + slice_len)) 32 | return matches 33 | -------------------------------------------------------------------------------- /botok/modifytokens/mergedagdra.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from .tokenmerge import TokenMerge 3 | from ..vars import TSEK, DAGDRA 4 | 5 | 6 | class MergeDagdra: 7 | """ 8 | A class to merge pa/po/ba/bo tokens in a token list produced by BoTokenizer 9 | 10 | """ 11 | 12 | def __init__(self): 13 | pass 14 | 15 | def merge(self, tokens): 16 | """ 17 | Merges the tokens containing either pa/po/ba/bo 18 | 19 | :param tokens: list of Token objects 20 | """ 21 | if len(tokens) <= 1: 22 | pass 23 | elif len(tokens) == 2: 24 | token0, token1 = tokens 25 | if token1.text_cleaned in DAGDRA: 26 | # split token containing the affixed particle 27 | merged = self.merge_with_previous_token(token0, token1) 28 | del tokens[1] 29 | tokens[0] = merged 30 | else: 31 | t = 0 32 | while t <= len(tokens) - 1: 33 | if t + 1 > len(tokens) - 1: 34 | break 35 | token0, token1 = tokens[t], tokens[t + 1] 36 | clean_word = ( 37 | token1.text_cleaned + TSEK 38 | if not token1.text_cleaned.endswith(TSEK) 39 | else token1.text_cleaned 40 | ) 41 | if ( 42 | token0.chunk_type == "TEXT" 43 | and token1.chunk_type == "TEXT" 44 | and clean_word in DAGDRA 45 | ): 46 | # split token containing the affixed particle 47 | merged = self.merge_with_previous_token(token0, token1) 48 | 49 | # replace the original token with the two new ones 50 | 
tokens[t : t + 2] = [merged] 51 | t += 1 52 | 53 | def merge_with_previous_token(self, token0, token1): 54 | merged = TokenMerge(token0, token1).merge() 55 | merged.has_merged_dagdra = True 56 | merged.lemma = merged.text_cleaned 57 | return merged 58 | -------------------------------------------------------------------------------- /botok/modifytokens/mergingmatcher.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from ..third_party.cqlparser import Query 3 | from .tokenmerge import TokenMerge 4 | 5 | 6 | class MergingMatcher: 7 | def __init__(self, query, replace_idx, token_list, token_changes=None): 8 | self.matcher = Query(query) 9 | self.span = len(self.matcher.tokenexprs) - 1 10 | self.token_list = token_list 11 | 12 | self.replace_idx = replace_idx - 1 13 | self.token_changes = token_changes 14 | 15 | def merge_on_matches(self): 16 | merged_list = [] 17 | i = 0 18 | while i < len(self.token_list): 19 | if self.__matches(i): 20 | # find the index of the token to split 21 | idx = i + self.replace_idx 22 | 23 | # add new tokens that precede the one to split 24 | for r in range(i, idx): 25 | merged_list.append(self.token_list[r]) 26 | i += 1 27 | 28 | # split the token and add them to the new list 29 | merged_list.append( 30 | self.__merge(self.token_list[idx], self.token_list[idx + 1]) 31 | ) 32 | i += 1 33 | else: 34 | merged_list.append(self.token_list[i]) 35 | 36 | i += 1 37 | 38 | return merged_list 39 | 40 | def __matches(self, i): 41 | return i + self.span <= len(self.token_list) and self.matcher( 42 | self.token_list[i : i + self.span + 1] 43 | ) 44 | 45 | def __merge(self, token1, token2): 46 | ts = TokenMerge(token1, token2, self.token_changes) 47 | return ts.merge() 48 | -------------------------------------------------------------------------------- /botok/modifytokens/replacingmatcher.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from ..third_party.cqlparser import Query, replace_token_attributes 3 | 4 | 5 | class ReplacingMatcher: 6 | def __init__(self, query, replace_idx, token_list, token_changes=None): 7 | self.matcher = Query(query) 8 | self.span = len(self.matcher.tokenexprs) - 1 9 | self.replace_idx = replace_idx - 1 10 | self.token_list = token_list 11 | self.token_changes = token_changes 12 | 13 | def replace_on_matches(self): 14 | i = 0 15 | while i < len(self.token_list): 16 | if self.__matches(i): 17 | # find the index of the token to split 18 | idx = i + self.replace_idx 19 | replace_token_attributes(self.token_list[idx], self.token_changes) 20 | i += 1 21 | 22 | def __matches(self, i): 23 | return i + self.span <= len(self.token_list) and self.matcher( 24 | self.token_list[i : i + self.span + 1] 25 | ) 26 | -------------------------------------------------------------------------------- /botok/modifytokens/splitaffixed.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from .tokensplit import TokenSplit 3 | 4 | 5 | def split_affixed(tokens): 6 | """ 7 | Splits in place the tokens containing affixed particles 8 | tokens have to be Token objects produced by BoTokenizer.Tokenizer 9 | 10 | :param tokens: list of Token objects 11 | """ 12 | t = 0 13 | while t <= len(tokens) - 1: 14 | # check that splitting is possible (affixation attribute exists) 15 | # and that there is no meaning that has "affixed: False". 
16 | # ie, check that the inflected form can't be the affixed form of a word and the unaffixed form of another word 17 | if tokens[t].affixation and not [ 18 | True for m in tokens[t].senses if "affixed" in m and not m["affixed"] 19 | ]: 20 | # split token containing the affixed particle 21 | split_idx = tokens[t].syls_idx[-1][-tokens[t].affixation["len"]] 22 | changes = ( 23 | '[affix_host="True"] ' 24 | '[pos="PART" & affix="True" & skrt="False" & freq="None" & senses="None"]' 25 | ) 26 | ts = TokenSplit(tokens[t], split_idx, token_changes=changes) 27 | token1, token2 = ts.split() 28 | if token2.senses is None: 29 | token2.senses = [] 30 | 31 | # replace the original token with the two new ones 32 | tokens[t : t + 1] = [token1, token2] 33 | 34 | t += 1 # increment once more to account for the newly split token 35 | t += 1 36 | -------------------------------------------------------------------------------- /botok/modifytokens/splittingmatcher.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from ..third_party.cqlparser import Query 3 | from .tokensplit import TokenSplit 4 | 5 | 6 | class SplittingMatcher: 7 | def __init__(self, query, replace_idx, split_idx, token_list, token_changes=None): 8 | self.matcher = Query(query) 9 | self.span = len(self.matcher.tokenexprs) - 1 10 | self.token_list = token_list 11 | 12 | self.replace_idx = replace_idx - 1 13 | self.split_idx = split_idx 14 | self.token_changes = token_changes 15 | 16 | def split_on_matches(self, mode="char"): 17 | """ 18 | :param mode: can either be "char" or "syl" 19 | """ 20 | split_list = [] 21 | 22 | i = 0 23 | while i < len(self.token_list): 24 | if self.__matches(i, self.token_list): 25 | # find the index of the token to split 26 | idx = i + self.replace_idx 27 | 28 | # add new tokens that precede the one to split 29 | for r in range(i, idx): 30 | split_list.append(self.token_list[r]) 31 | i += 1 32 | 33 | # split the token and add them to the new list 34 | split_list.extend(self.__split(self.token_list[idx], mode=mode)) 35 | 36 | else: 37 | split_list.append(self.token_list[i]) 38 | i += 1 39 | 40 | return split_list 41 | 42 | def __matches(self, i, token_list): 43 | return i + self.span <= len(token_list) and self.matcher( 44 | self.token_list[i : i + self.span + 1] 45 | ) 46 | 47 | def __split(self, token, mode): 48 | ts = TokenSplit(token, self.split_idx, self.token_changes) 49 | return ts.split(mode=mode) 50 | -------------------------------------------------------------------------------- /botok/modifytokens/tokenmerge.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import copy 3 | 4 | from ..third_party.cqlparser import replace_token_attributes 5 | 6 | 7 | class TokenMerge: 8 | """ 9 | 10 | """ 11 | 12 | def __init__(self, token1, token2, token_changes=None): 13 | self.token1 = token1 14 | self.token2 = token2 15 | self.merged = copy.deepcopy(token1) 16 | if not self.merged.syls_idx: 17 | self.merged.syls_idx = [] 18 | if not self.merged.syls: 19 | self.merged.syls = [] 20 | self.token_changes = token_changes 21 | 22 | def merge(self): 23 | self.merge_attrs() 24 | self.replace_attrs() 25 | return self.merged 26 | 27 | def replace_attrs(self): 28 | """ 29 | Replaces the content of attributes that were found in the cql query. 30 | If no query is provided, the values of the first token are kept. 
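For example, ``token_changes='[pos="NOUN"]'`` — following the CQL attribute
        syntax used elsewhere in botok (e.g. in ``splitaffixed.py``) — would overwrite
        the merged token's ``pos``; the value shown is purely illustrative.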
31 | """ 32 | if self.token_changes: 33 | replace_token_attributes(self.merged, self.token_changes) 34 | 35 | def merge_attrs(self): 36 | self.__merge_texts() 37 | self.__merge_indices() 38 | self.__merge_syls_idx() 39 | self.__merge_syls_start_end() 40 | self.__del_lemma() 41 | 42 | def __merge_texts(self): 43 | self.merged.text += self.token2.text 44 | 45 | def __merge_indices(self): 46 | self.merged.len += self.token2.len 47 | 48 | def __merge_syls_start_end(self): 49 | # token1 is a host syllable and token2 its affixed syllable 50 | if ( 51 | not self.merged.syls_start_end 52 | or not self.token1.syls_start_end 53 | or not self.token2.syls_start_end 54 | ): 55 | return 56 | 57 | if ( 58 | self.token1.affix_host 59 | and not self.token1.affix 60 | and not self.token2.affix_host 61 | and self.token2.affix 62 | ): 63 | self.merged.syls_start_end[-1]["end"] = self.token2.syls_start_end[0]["end"] 64 | self.merged.syls_start_end.extend(self.token2.syls_start_end[1:]) 65 | else: 66 | self.merged.syls_start_end.extend(self.token2.syls_start_end) 67 | 68 | def __merge_syls_idx(self): 69 | """ 70 | Updates indices and add the syls to the merged object 71 | Re-joins the host-syllable and affixed particle syllables into a single one; 72 | then, affix is True and affixed also, so cleaned_content gets its tsek. 73 | """ 74 | first_syl = True 75 | if self.token2.syls_idx: 76 | for syl in self.token2.syls_idx: 77 | if syl: 78 | new_syl = [i + self.token1.len for i in syl] 79 | 80 | # token1 is a host syllable and token2 its affixed syllable 81 | if ( 82 | first_syl 83 | and (self.token1.affix_host and not self.token1.affix) 84 | and (not self.token2.affix_host and self.token2.affix) 85 | ): 86 | self.merged.syls_idx[-1] += new_syl 87 | self.merged.affix = True 88 | first_syl = False 89 | else: 90 | self.merged.syls_idx.append(new_syl) 91 | 92 | def __del_lemma(self): 93 | """ 94 | Simply deletes any lemma in merged since the lemma of the merged token can't be guessed. 95 | """ 96 | if self.token1["lemma"]: 97 | self.merged.lemma = None 98 | -------------------------------------------------------------------------------- /botok/modifytokens/tokensplit.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import copy 3 | 4 | from ..third_party.cqlparser import replace_token_attributes 5 | 6 | 7 | class TokenSplit: 8 | """ 9 | Takes a token object and divide it into two using an index of the content. 10 | 11 | The affected attributes are: 12 | - token.content : the string is split at the index 13 | - token.char_groups : the dict items are redistributed 14 | - token.start : second token only. now equals "start + index" 15 | - token.len : length of new content 16 | - token.syls : syls are redistributed and split if necessary 17 | 18 | """ 19 | 20 | def __init__(self, token, split_idx, token_changes=None): 21 | self.token = token 22 | self.first = None 23 | self.second = None 24 | self.token_changes = token_changes 25 | self.idx = split_idx 26 | 27 | def split(self, mode="char"): 28 | """ 29 | :param mode: can either be "syl" or "char" to split on a syllable index or a character index. 30 | """ 31 | if mode != "char" and mode != "syl": 32 | raise SyntaxError("splitting mode should either be 'syl' or 'char'. ") 33 | 34 | # in syllable-mode, if there is only one syllable, return the word without splitting it. 
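# note: in "syl" mode, split_on_idx() converts the 1-based syllable index into a
        # character index via syls_start_end before performing the split.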
35 | if mode == "syl" and len(self.token.syls) == 1: 36 | return [self.token] 37 | 38 | self.split_on_idx(mode=mode) 39 | self.replace_attrs() 40 | 41 | return [self.first, self.second] 42 | 43 | def replace_attrs(self): 44 | if self.token_changes: 45 | tokens = [self.first, self.second] 46 | replace_token_attributes(tokens, self.token_changes) 47 | self.first, self.second = tokens 48 | 49 | def split_on_idx(self, mode): 50 | self.first = copy.deepcopy(self.token) 51 | self.second = copy.deepcopy(self.token) 52 | 53 | if mode == "syl": 54 | self.idx = self.token.syls_start_end[self.idx - 1]["end"] 55 | 56 | self.__split_contents() 57 | self.__split_indices() 58 | self.__split_syls_idx() 59 | self.__split_syls_start_end(mode) 60 | self.__split_char_types() 61 | self.__split_affixation() 62 | 63 | def __split_contents(self): 64 | text = self.first.text 65 | self.first.text = text[0 : self.idx] 66 | self.second.text = text[self.idx :] 67 | 68 | def __split_char_types(self): 69 | char_types = self.first.char_types 70 | self.first.char_types = char_types[: self.idx] 71 | self.second.char_types = char_types[self.idx :] 72 | 73 | def __split_indices(self): 74 | self.first.len = len(self.first.text) 75 | self.second.len = len(self.second.text) 76 | self.second.start = self.second.start + self.idx 77 | 78 | def __split_syls_start_end(self, mode): 79 | if not self.token.syls_start_end: 80 | return 81 | 82 | to_split_idx = 0 83 | for num, s in enumerate(self.token.syls_start_end): 84 | if s["start"] <= self.idx <= s["end"]: 85 | to_split_idx = num 86 | break # ensure to exit on first match 87 | 88 | start = self.token.syls_start_end[:to_split_idx] 89 | end = self.token.syls_start_end[to_split_idx + 1 :] 90 | to_split = self.token.syls_start_end[to_split_idx] 91 | 92 | if mode == "char": 93 | start.append({"start": to_split["start"], "end": self.idx}) 94 | end.append({"start": self.idx, "end": to_split["end"]}) 95 | 96 | if mode == "syl": 97 | start.append(to_split) 98 | 99 | self.first.syls_start_end = start 100 | self.second.syls_start_end = end 101 | 102 | def __split_syls_idx(self): 103 | syls = self.first.syls_idx 104 | # empty syls 105 | self.first.syls_idx = [] 106 | self.second.syls_idx = [] 107 | 108 | if syls: 109 | for syl in syls: 110 | if syl[-1] < self.idx: 111 | self.first.syls_idx.append(syl) 112 | 113 | else: 114 | # separate the syl in two 115 | part1, part2 = [], [] 116 | for i in syl: 117 | if i < self.idx: 118 | part1.append(i) 119 | else: 120 | part2.append(i - self.idx) 121 | 122 | # add them if non-empty 123 | if part1: 124 | self.first.syls_idx.append(part1) 125 | if part2: 126 | self.second.syls_idx.append(part2) 127 | 128 | def __split_affixation(self): 129 | if self.token.affixation: 130 | self.first.affixation.pop("len", '') 131 | self.first.affixation.pop("type", '') 132 | self.second.affixation.pop("aa", '') 133 | -------------------------------------------------------------------------------- /botok/resources/README.md: -------------------------------------------------------------------------------- 1 | # Resource files 2 | 3 | This document lists the files used by pybo, their format, origin and usage. Unless indicated otherwise they are in the public domain. 4 | 5 | ### SylComponents.json 6 | 7 | This file has been compiled by hand based on the list presented in *TODO: ref. to HL article* 8 | 9 | It is used to check if a syllable is correct according to Classical Tibetan norms, and find the root letter. 
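For illustration, a minimal sketch of how this file is consumed through the `SylComponents` class that botok exposes. The constructor and the method names `get_parts` and `get_info` are assumptions about the current API in `botok/textunits/sylcomponents.py` and may differ:

```python
from botok import SylComponents

sc = SylComponents()           # assumed to load SylComponents.json when instantiated
parts = sc.get_parts("བཀྲས")    # assumed: decompose a syllable into its structural parts
info = sc.get_info("བཀྲས")      # assumed: classify the syllable (well-formed or not, etc.)
```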
10 | 11 | ### frequency/mgd.txt 12 | 13 | XXX 14 | 15 | ### frequency/tc.txt 16 | 17 | XXX 18 | 19 | ### lemmas/particles.yaml 20 | 21 | XXX 22 | 23 | ### trie/ancient.txt , trie/exceptions.txt 24 | 25 | These files come from [tibetan-spellchecker](https://github.com/eroux/tibetan-spellchecker) and indicate exceptions to Classical Tibetan norms. 26 | 27 | ### trie/particles.txt 28 | 29 | This is a list of all particles, compiled by hand, with the `PART` POS tag. 30 | 31 | ### trie/Tibetan.DICT 32 | 33 | This file has been extracted from: 34 | 35 | Meelen, Marieke, Hill, Nathan, & Handy, Christopher. (2017). The Annotated Corpus of Classical Tibetan (ACTib), Part II - POS-tagged version, based on the BDRC digitised text collection, tagged with the Memory-Based Tagger from TiMBL [Data set]. Zenodo. [http://doi.org/10.5281/zenodo.822537](https://doi.org/10.5281/zenodo.822537) 36 | 37 | It is available under the [CC-BY 4.0 license](https://creativecommons.org/licenses/by/4.0/). 38 | 39 | ### trie/tsikchen.txt 40 | 41 | This file has been extracted from a digitized version of: 42 | 43 | [Yisun, Zhang. 1985. བོད་རྒྱ་ཚིག་མཛོད་ཆེན་མོ།. Beijing: མི་རིགས་དཔེ་སྐྲུན་ཁང་།](http://tbrc.org/link?RID=W29329) 44 | 45 | Although the book is under copyright, we consider that the bare list of words we provide is not. 46 | 47 | ### trie/mgd.txt 48 | 49 | XXX 50 | 51 | 52 | ### trie/recordings_4.txt , trie/oral_corpus_0.txt , trie/oral_corpus_1.txt , trie/oral_corpus_2.txt , trie/oral_corpus_3.txt 53 | 54 | XXX 55 | -------------------------------------------------------------------------------- /botok/resources/bo_punct_position.csv: -------------------------------------------------------------------------------- 1 | Char,unicode repr,punct_position 2 | 0F01,—༁—,opening_punct 3 | 0F02,—༂—,opening_punct 4 | 0F03,—༃—,opening_punct 5 | 0F04,—༄—,opening_punct 6 | 0F05,—༅—,opening_punct 7 | 0F06,—༆—,opening_punct 8 | 0F07,—༇—,opening_punct 9 | 0F08,—༈—,opening_punct 10 | 0F09,—༉—,opening_punct 11 | 0F0A,—༊—,opening_punct 12 | 0F0D,—།—,closing_punct 13 | 0F0E,—༎—,closing_punct 14 | 0F0F,—༏—,closing_punct 15 | 0F10,—༐—,closing_punct 16 | 0F11,—༑—,opening_punct 17 | 0F12,—༒—,opening_punct 18 | 0F14,—༔—,closing_punct 19 | 0F34,—༴—,closing_punct 20 | 0F3A,—༺—,opening_punct 21 | 0F3B,—༻—,closing_punct 22 | 0F3C,—༼—,opening_punct 23 | 0F3D,—༽—,closing_punct 24 | 0F3E,—༾—,closing_punct 25 | 0F3F,—༿—,opening_punct 26 | 0FD0,—࿐—,opening_punct 27 | 0FD1,—࿑—,opening_punct 28 | 0FD3,—࿓—,opening_punct 29 | 0FD4,—࿔—,opening_punct 30 | 0FD9,—࿙—,opening_punct 31 | 0FDA,—࿚—,closing_punct 32 | -------------------------------------------------------------------------------- /botok/resources/particles.tsv: -------------------------------------------------------------------------------- 1 | # form pos lemma sense freq 2 | གི PART གི 3 | ཀྱི PART གི 4 | གྱི PART གི 5 | འི PART གི 6 | ཡི PART གི 7 | གིས PART གིས 8 | ཀྱིས PART གིས 9 | གྱིས PART གིས 10 | ཡིས PART གིས 11 | ས PART གིས 12 | སུ PART ལ 13 | ར PART ལ 14 | རུ PART ལ 15 | ཏུ PART ལ 16 | ན PART ལ 17 | ལ PART ལ 18 | དུ PART ལ 19 | སྟེ PART སྟེ 20 | ཏེ PART སྟེ 21 | དེ PART སྟེ 22 | ཀྱང PART ཀྱང 23 | ཡང PART ཀྱང 24 | འང PART ཀྱང 25 | གམ PART གམ 26 | ངམ PART གམ 27 | དམ PART གམ 28 | ནམ PART གམ 29 | བམ PART གམ 30 | མམ PART གམ 31 | འམ PART གམ 32 | རམ PART གམ 33 | ལམ PART གམ 34 | སམ PART གམ 35 | ཏམ PART གམ 36 | པ PART པ 37 | བ PART པ 38 | པོ PART པོ 39 | བོ PART པོ 40 | གོ PART གོ 41 | ངོ PART གོ 42 | དོ PART གོ 43 | ནོ PART གོ 44 | བོ PART གོ 45 | མོ PART གོ 46 | འོ PART གོ 47 | རོ PART 
གོ 48 | ལོ PART གོ 49 | སོ PART གོ 50 | ཏོ PART གོ 51 | ཅིང PART ཅིང 52 | ཤིང PART ཅིང 53 | ཞིང PART ཅིང 54 | ཅེས PART ཅེས 55 | ཞེས PART ཅེས 56 | ཅེའོ PART ཅེའོ 57 | ཤེའོ PART ཅེའོ 58 | ཞེའོ PART ཅེའོ 59 | ཅེ་ན PART ཅེ་ན 60 | ཤེ་ན PART ཅེ་ན 61 | ཞེ་ན PART ཅེ་ན 62 | ཅིག PART ཅིག 63 | ཤིག PART ཅིག 64 | ཞིག PART ཅིག 65 | ཀྱིན PART གིན 66 | གིན PART གིན 67 | གྱིན PART གིན 68 | ནས PART ནས 69 | -------------------------------------------------------------------------------- /botok/text/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/botok/text/__init__.py -------------------------------------------------------------------------------- /botok/text/format.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from typing import DefaultDict, List, Tuple 3 | 4 | from .tokenize import BoToken 5 | 6 | 7 | def plaintext(tokens: List[str], sep=" ") -> str: 8 | tokens = [t.replace(" ", "_") for t in tokens] 9 | return sep.join(tokens) 10 | 11 | 12 | def plaintext_sent_par(units: List[Tuple[int, List[BoToken]]], sep="\n") -> str: 13 | out = [] 14 | for u in units: 15 | unit = "".join([word.text for word in u['tokens']]) 16 | out.append(unit) 17 | return sep.join(out) 18 | 19 | 20 | def basic_conc(concs: DefaultDict[str, List[str]], sep="\t", esc_context=True) -> str: 21 | out = [] 22 | for occ, LR in concs.items(): 23 | for left, right in LR: 24 | 25 | if esc_context: 26 | left, right = f'"{left}"', f'"{right}"' 27 | 28 | line = f"{left}{sep}{occ}{sep}{right}" 29 | out.append(line) 30 | 31 | return "\n".join(out) 32 | 33 | 34 | def stats_types(total_mistakes: DefaultDict[str, int], sep="\t") -> str: 35 | total = [(mis, freq) for mis, freq in total_mistakes.items()] 36 | total = sorted(total, reverse=True, key=lambda x: x[1]) 37 | total = [f"{mis}{sep}{freq}" for mis, freq in total] 38 | return "\n".join(total) 39 | -------------------------------------------------------------------------------- /botok/text/modify.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from collections import defaultdict 3 | from typing import DefaultDict, List, NewType, Tuple 4 | 5 | from ..third_party.has_skrt_syl import has_skrt_syl 6 | from .tokenize import BoToken 7 | 8 | 9 | def is_mistake(token): 10 | exceptions = ["\n"] 11 | if token.chunk_type == "TEXT" or ( 12 | token.chunk_type == "LATIN" 13 | or token.chunk_type == "CJK" 14 | or token.chunk_type == "OTHER" 15 | ): 16 | if ( 17 | (not token.skrt and not has_skrt_syl(token.text_cleaned)) 18 | and ( 19 | token.senses 20 | and len( 21 | [ 22 | True 23 | for m in token.senses 24 | if "pos" in m 25 | and (m["pos"] == "NO_POS" or m["pos"] == "NON_WORD") 26 | ] 27 | ) 28 | > 0 29 | or ( 30 | token.chunk_type == "LATIN" 31 | or token.chunk_type == "CJK" 32 | or token.chunk_type == "OTHER" 33 | ) 34 | ) 35 | and token.text not in exceptions 36 | ): 37 | return True 38 | return False 39 | 40 | 41 | def words_error_concs( 42 | tokens: List[BoToken], left=5, right=5 43 | ) -> DefaultDict[str, List[str]]: 44 | mistakes = defaultdict(list) 45 | for num, t in enumerate(tokens): 46 | if is_mistake(t): 47 | if num - left < 0: 48 | l = tokens[:num] 49 | else: 50 | l = tokens[num - left : num] 51 | if num + right > len(tokens) - 1: 52 | r = tokens[num + 1 :] 53 | else: 54 | r = tokens[num + 1 : num + 1 + right] 55 | 56 | l_context = [t.text 
for t in l] 57 | r_context = [t.text for t in r] 58 | mis = t.text.replace("\n", "\\n") 59 | mistakes[mis].append(["".join(l_context), "".join(r_context)]) 60 | return mistakes 61 | 62 | 63 | def words_error_types(tokens: List[BoToken]) -> DefaultDict[str, int]: 64 | mistakes = defaultdict(int) 65 | for num, t in enumerate(tokens): 66 | if is_mistake(t): 67 | mis = t.text.replace("\n", "\\n") 68 | mistakes[mis] += 1 69 | return mistakes 70 | 71 | 72 | def words_raw_types(tokens: List[BoToken]) -> DefaultDict[str, int]: 73 | types = defaultdict(int) 74 | for t in tokens: 75 | occ = t.text.replace("\n", "\\n") 76 | types[occ] += 1 77 | return types 78 | 79 | 80 | def words_raw_text(tokens: List[BoToken]) -> List[str]: 81 | return [t.text for t in tokens] 82 | 83 | 84 | def chunks_raw_text(tokens: List[Tuple[str, str]]) -> List[str]: 85 | return [chunk for _, chunk in tokens] 86 | -------------------------------------------------------------------------------- /botok/text/pipelinebase.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from pathlib import Path 3 | 4 | 5 | class PipelineBase: 6 | def __init__(self, profile, pipes=None): 7 | self.pipes = pipes 8 | 9 | self.prep = None 10 | self.tok = None 11 | self.mod = None 12 | self.form = None 13 | 14 | self.left = 5 15 | self.right = 5 16 | self.tok_params = None 17 | self.filename = ( 18 | None # for an advanced mode, to show what conc comes from which file 19 | ) 20 | 21 | self.args_list = { 22 | "prep", 23 | "tok", 24 | "mod", 25 | "form", # components 26 | "tok_params", # pybo 27 | "left", 28 | "right", # concs 29 | "filename", 30 | } # others 31 | 32 | self.parse_profile(profile) 33 | 34 | def pipe_str(self, text: str) -> str: 35 | # a. preprocessing 36 | if self.prep: 37 | text = self.pipes["prep"][self.prep](text) 38 | 39 | # b. tokenizing 40 | if ( 41 | isinstance(self.tok, str) 42 | and ( 43 | "word" in self.tok or "sentence" in self.tok or "paragraph" in self.tok 44 | ) 45 | and self.tok_params 46 | ): 47 | elts = self.pipes["tok"][self.tok](text, config=self.tok_params["config"]) 48 | else: 49 | elts = self.pipes["tok"][self.tok](text) 50 | 51 | # c. modifying 52 | mod = self.pipes["mod"][self.mod] 53 | if isinstance(self.mod, str) and self.mod.endswith("concs"): 54 | elts = mod(elts, left=self.left, right=self.right) 55 | else: 56 | elts = mod(elts) 57 | 58 | # d. 
formatting 59 | elts = self.pipes["form"][self.form](elts) 60 | 61 | return elts 62 | 63 | def pipe_file(self, filename: str, out_file: str): 64 | in_file = Path(filename) 65 | out_file = Path(out_file) 66 | assert in_file.is_file() 67 | 68 | with in_file.open(encoding="utf-8-sig") as f: 69 | dump = f.read() 70 | 71 | output = self.pipe_str(dump) 72 | 73 | with out_file.open("w", encoding="utf-8-sig") as g: 74 | g.write(output) 75 | 76 | def parse_profile(self, pipeline): 77 | self.is_valid_params(pipeline) 78 | for arg, v in pipeline.items(): 79 | if arg == "prep": 80 | self.prep = v 81 | elif arg == "tok": 82 | self.tok = v 83 | elif arg == "mod": 84 | self.mod = v 85 | elif arg == "form": 86 | self.form = v 87 | elif arg == "tok_params": 88 | self.tok_params = v 89 | elif arg == "left": 90 | self.left = v 91 | elif arg == "right": 92 | self.right = v 93 | elif arg == "filename": 94 | self.filename = v 95 | self.is_valid_pipeline() 96 | 97 | def is_valid_params(self, pipeline): 98 | for arg, val in pipeline.items(): 99 | # ensure all arguments are valid attributes 100 | if arg not in self.args_list: 101 | raise SyntaxError( 102 | f'{arg} is not a valid argument\nvalid options are {" ".join(self.map)}' 103 | ) 104 | 105 | # ensure arguments have valid values 106 | if arg in self.pipes and val not in self.pipes[arg]: 107 | raise SyntaxError( 108 | f'{val} is not a valid value for {arg}\nvalid options are {" ".join(self.pipes[arg])}' 109 | ) 110 | 111 | def is_valid_pipeline(self): 112 | # missing pipes 113 | if not self.tok or not self.mod or not self.form: 114 | raise BrokenPipeError( 115 | "A valid pipeline must have a tokenizer, a processor and a formatter." 116 | ) 117 | -------------------------------------------------------------------------------- /botok/text/preprocess.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import re 3 | 4 | 5 | def basic_cleanup(text: str) -> str: 6 | text = text.strip() 7 | text = re.sub(r"\n+", " ", text) 8 | text = re.sub(r"\s+", " ", text) 9 | return text 10 | 11 | 12 | def basic_keeps_lines(text: str) -> str: 13 | text = text.strip() 14 | # text = re.sub(r'\s+', ' ', text) 15 | return text 16 | -------------------------------------------------------------------------------- /botok/text/text.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from pathlib import Path 3 | from types import FunctionType 4 | 5 | from ..vars import Ids 6 | from .format import * 7 | from .modify import * 8 | from .pipelinebase import PipelineBase 9 | from .preprocess import * 10 | from .tokenize import * 11 | 12 | builtin_pipes = { 13 | # a. Preprocessing 14 | "prep": { 15 | "dummy": lambda x: x, 16 | "basic_cleanup": basic_cleanup, 17 | "basic_keeps_lines": basic_keeps_lines, 18 | }, 19 | # b. Tokenizers 20 | "tok": { 21 | "space_tok": space_tok, 22 | "word_tok": word_tok, 23 | "chunk_tok": chunk_tok, 24 | "sentence_tok": sentence_tok, 25 | "paragraph_tok": paragraph_tok, 26 | }, 27 | # c. Modifiers 28 | "mod": { 29 | "dummy": lambda x: x, 30 | "words_raw_text": words_raw_text, 31 | "words_raw_types": words_raw_types, 32 | "words_error_types": words_error_types, 33 | "words_error_concs": words_error_concs, 34 | "chunks_raw_text": chunks_raw_text, 35 | }, 36 | # d. 
Formatters 37 | "form": { 38 | "dummy": lambda x: x, 39 | "plaintext": plaintext, 40 | "plaintext_sent_par": plaintext_sent_par, 41 | "basic_concs": basic_conc, 42 | "stats_types": stats_types, 43 | }, 44 | } 45 | 46 | 47 | class Text: 48 | """ 49 | Takes as input: 50 | - a string to process 51 | - the Path object of a file to process 52 | 53 | including a custom pipeline is as simple as: 54 | - subclassing Text class 55 | - creating a new @property method like the built in ones while providing your own arguments to self.__process() 56 | """ 57 | 58 | def __init__(self, input, out_file=None, tok_params=None): 59 | """ 60 | if input == str: return a string 61 | if input == Path: 62 | 1. out_file != None: write to given Path object 63 | 2. out_file == None: write to cwd and append "_pybo" to file name 64 | 65 | custom_tok: settings for building the custom tokenizer: see docstring of Config class in config.py 66 | """ 67 | self.input = input 68 | self.tok_params = tok_params 69 | 70 | if isinstance(input, str): 71 | if out_file: 72 | assert isinstance(out_file, Path) 73 | self.out_file = out_file 74 | else: 75 | self.out_file = None 76 | elif isinstance(input, Path): 77 | if not out_file: 78 | self.out_file = input.parent / f"{input.stem}_pybo{input.suffix}" 79 | else: 80 | self.out_file = out_file 81 | else: 82 | raise TypeError("input should either be a string, or a Path object") 83 | 84 | @property 85 | def tokenize_on_spaces(self): 86 | return self.__process("basic_cleanup", "space_tok", "dummy", "plaintext") 87 | 88 | @property 89 | def tokenize_words_raw_text(self): 90 | return self.__process( 91 | "basic_cleanup", "word_tok", "words_raw_text", "plaintext", 92 | ) 93 | 94 | @property 95 | def tokenize_words_raw_lines(self): 96 | return self.__process( 97 | "basic_keeps_lines", "word_tok", "words_raw_text", "plaintext", 98 | ) 99 | 100 | @property 101 | def tokenize_chunks_plaintext(self): 102 | return self.__process( 103 | "basic_keeps_lines", "chunk_tok", "chunks_raw_text", "plaintext" 104 | ) 105 | 106 | @property 107 | def tokenize_sentences_plaintext(self): 108 | return self.__process( 109 | "basic_cleanup", "sentence_tok", "dummy", "plaintext_sent_par", 110 | ) 111 | 112 | @property 113 | def tokenize_paragraph_plaintext(self): 114 | return self.__process( 115 | "basic_cleanup", "paragraph_tok", "dummy", "plaintext_sent_par", 116 | ) 117 | 118 | @property 119 | def list_word_types(self): 120 | return self.__process( 121 | "basic_keeps_lines", "word_tok", "words_raw_types", "stats_types", 122 | ) 123 | 124 | def custom_pipeline( 125 | self, preprocessor, tokenizer, modifier, formatter, tok_params=None 126 | ): 127 | """ 128 | every pipe should be either the name of an existing pipe as found in builtin_pipes or a function 129 | """ 130 | return self.__process(preprocessor, tokenizer, modifier, formatter, tok_params) 131 | 132 | def __process(self, preprocessor, tokenizer, modifier, formatter, tok_params=None): 133 | if tok_params: 134 | for k, v in tok_params.items(): 135 | if k not in self.tok_params or self.tok_params[k] is None: 136 | self.tok_params[k] = v 137 | 138 | profile, pipes = self.__create_pipeline( 139 | preprocessor, tokenizer, modifier, formatter, self.tok_params 140 | ) 141 | pipeline = PipelineBase(profile, pipes=pipes) 142 | 143 | if self.out_file: 144 | return pipeline.pipe_file(self.input, self.out_file) 145 | else: 146 | return pipeline.pipe_str(self.input) 147 | 148 | @staticmethod 149 | def __create_pipeline( 150 | preprocessor, tokenizer, modifier, formatter, 
tok_params=None 151 | ): 152 | profile = {} 153 | pipes = {"prep": {}, "tok": {}, "mod": {}, "form": {}} 154 | for a, b, c in [ 155 | ("prep", Ids.prep, preprocessor), 156 | ("tok", Ids.tok, tokenizer), 157 | ("mod", Ids.mod, modifier), 158 | ("form", Ids.form, formatter), 159 | ]: 160 | if isinstance(c, FunctionType): 161 | pipes[a].update({b: c}) 162 | profile[a] = b 163 | elif isinstance(c, str): 164 | profile[a] = c 165 | assert c in builtin_pipes[a] 166 | pipes[a][c] = builtin_pipes[a][c] 167 | else: 168 | raise SyntaxError("Should be either a function or a string") 169 | 170 | if tok_params: 171 | profile["tok_params"] = tok_params 172 | return profile, pipes 173 | -------------------------------------------------------------------------------- /botok/text/tokenize.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from functools import lru_cache 3 | from typing import List, NewType, Tuple 4 | 5 | from ..tokenizers.chunktokenizer import ChunkTokenizer 6 | from ..tokenizers.paragraphtokenizer import paragraph_tokenizer 7 | from ..tokenizers.sentencetokenizer import sentence_tokenizer 8 | from ..tokenizers.token import Token 9 | from ..tokenizers.wordtokenizer import WordTokenizer 10 | 11 | BoToken = NewType("BoToken", Token) 12 | 13 | 14 | def space_tok(text: str) -> List[str]: 15 | """Tokenizes string on spaces 16 | 17 | """ 18 | return text.split(" ") 19 | 20 | 21 | def word_tok(text: str, config=None) -> List[BoToken]: 22 | tok = get_wordtokenizer(config=config) 23 | return tok.tokenize(text) 24 | 25 | 26 | def sentence_tok(text: str, config=None) -> List[Tuple[int, List[BoToken]]]: 27 | tok = get_wordtokenizer(config=config) 28 | tokens = tok.tokenize(text) 29 | return sentence_tokenizer(tokens) 30 | 31 | 32 | def paragraph_tok(text: str, config=None) -> List[Tuple[int, List[BoToken]]]: 33 | tok = get_wordtokenizer(config=config) 34 | tokens = tok.tokenize(text) 35 | return paragraph_tokenizer(tokens) 36 | 37 | 38 | @lru_cache( 39 | maxsize=None 40 | ) # <--- make sure that the trie is only built once then kept in memory 41 | def get_wordtokenizer(config=None): 42 | return WordTokenizer(config=config) 43 | 44 | 45 | def chunk_tok(text: str) -> List[str]: 46 | return ChunkTokenizer(text).tokenize() 47 | -------------------------------------------------------------------------------- /botok/textunits/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/botok/textunits/__init__.py -------------------------------------------------------------------------------- /botok/textunits/bostring.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from warnings import warn 3 | 4 | from .charcategories import get_char_category 5 | from ..vars import CharMarkers as a 6 | from ..vars import char_values 7 | 8 | 9 | class BoString: 10 | """ 11 | This class is the foundational building block of pre-processing. 12 | 13 | It implements the natural groups of characters a user makes when looking at 14 | a text in his native language. 15 | 16 | Implementation: 17 | --------------- 18 | 19 | - all the characters in the Unicode Tables for Tibetan are organized in lists 20 | hard-coded as string variables in ``__attribute_basic_types()``. 
21 | - upon instanciation, ``__init__().base_structure`` is populated with the indices of every 22 | char in the input string(key) and the group constant to which it belongs(values) 23 | - human-readable description of the group constant can be accessed in ``__init__().char_markers`` 24 | 25 | :Example: 26 | 27 | >>> from botok.textunits.bostring import BoString 28 | >>> from botok.vars import CharMarkers 29 | 30 | >>> bo_str = ' བཀྲ་ཤིས་ tr བདེ་ལེགས།' 31 | >>> bs = BoString(bo_str) 32 | 33 | >>> bs.base_structure # key: character index, value: character group 34 | {0: 15, 1: 1, 2: 1, 3: 2, 4: 4, 5: 1, 6: 3, 7: 1, 8: 4, 9: 15, 10: 15, 11: 14, 35 | 12: 14, 13: 15, 14: 1, 15: 1, 16: 3, 17: 4, 18: 1, 19: 3, 20: 1, 21: 1, 22: 8} 36 | 37 | >>> bs.get_categories() 38 | {0: 'space', 1: 'cons', 2: 'cons', 3: 'sub-cons', 4: 'tsek', 5: 'cons', 6: 'vow', 39 | 7: 'cons', 8: 'tsek', 9: 'space', 10: 'space', 11: 'other', 12: 'other', 40 | 13: 'space', 14: 'cons', 15: 'cons', 16: 'vow', 17: 'tsek', 18: 'cons', 19: 'vow', 41 | 20: 'cons', 21: 'cons', 22: 'punct'} 42 | 43 | .. note:: You may want to refine the groups that are implemented to have a finer analysis. 44 | Be sure to create the corresponding constants in ``__init__()`` and the corresponding 45 | entries in ``__init__().char_markers``. 46 | """ 47 | 48 | def __init__(self, string, ignore_chars=None): 49 | if ignore_chars is None: 50 | ignore_chars = [] 51 | self.ignore_chars = ignore_chars 52 | self.string = string 53 | self.len = len(string) 54 | self.base_structure = {} 55 | self.__attribute_basic_types() 56 | 57 | def __attribute_basic_types(self): 58 | """ 59 | Populates ``__init__().base_structure``. 60 | """ 61 | for i in range(len(self.string)): 62 | char = self.string[i] 63 | cat = get_char_category(char) 64 | self.__nfc_check(cat, i) 65 | if char in self.ignore_chars: 66 | self.base_structure[ 67 | i 68 | ] = ( 69 | a.TRANSPARENT.value 70 | ) # spaces chars are allowed anywhere, thus ignored 71 | else: 72 | self.base_structure[i] = cat 73 | 74 | def __nfc_check(self, cat, idx): 75 | if cat == a.NFC: 76 | slice_start = 10 77 | slice_end = 10 78 | while idx - slice_start < 0: 79 | slice_start -= 1 80 | while idx + slice_end >= self.len: 81 | slice_end -= 1 82 | warn( 83 | f'Beware of unexpected results: input string contains the non-expanded char "{self.string[idx]}", ' 84 | f'found in "{self.string[slice_start: slice_end]}".' 85 | ) 86 | 87 | def export_groups(self, start_idx, slice_len, for_substring=True): 88 | """ 89 | Export the base groups for a slice of the input string 90 | 91 | :param start_idx: starting index of the slice 92 | :param slice_len: length of the slice we want to export 93 | :param for_substring: if True, indices start at 0, Else the indices of the original string are kept. 
94 | :type start_idx: int 95 | :type slice_len: int 96 | :return: the slice of ``__init__().base_structure`` described in the parameters 97 | :rtype: dict 98 | 99 | :Example: 100 | 101 | >>> bo_str = ' བཀྲ་ཤིས་ tr བདེ་ལེགས།' 102 | >>> bs = BoString(bo_str) 103 | 104 | >>> bs.export_groups(2, 5) 105 | {0: 1, 1: 2, 2: 4, 3: 1, 4: 3} 106 | 107 | >>> bs.export_groups(2, 5, for_substring=False) 108 | {2: 1, 3: 2, 4: 4, 5: 1, 6: 3} 109 | 110 | """ 111 | if for_substring: 112 | return { 113 | n: self.base_structure[i] 114 | for n, i in enumerate(range(start_idx, start_idx + slice_len)) 115 | } 116 | else: 117 | return { 118 | i: self.base_structure[i] 119 | for i in range(start_idx, start_idx + slice_len) 120 | } 121 | 122 | def get_categories(self, struct=None): 123 | if struct is None or not isinstance(struct, dict): 124 | return {k: char_values[v] for k, v in self.base_structure.items()} 125 | else: 126 | return {k: char_values[v] for k, v in struct.items()} 127 | -------------------------------------------------------------------------------- /botok/textunits/bosyl.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from .sylcomponents import SylComponents 3 | 4 | 5 | class BoSyl(SylComponents): 6 | def __init__(self): 7 | SylComponents.__init__(self) 8 | self.affixes = { 9 | "ར": {"len": 1, "type": "la"}, 10 | "ས": {"len": 1, "type": "gis"}, 11 | "འི": {"len": 2, "type": "gi"}, 12 | "འམ": {"len": 2, "type": "am"}, 13 | "འང": {"len": 2, "type": "ang"}, 14 | "འོ": {"len": 2, "type": "o"}, 15 | "འིའོ": {"len": 4, "type": "gi+o"}, 16 | "འིའམ": {"len": 4, "type": "gi+am"}, 17 | "འིའང": {"len": 4, "type": "gi+ang"}, 18 | "འོའམ": {"len": 4, "type": "o+am"}, 19 | "འོའང": {"len": 4, "type": "o+ang"}, 20 | } 21 | 22 | def is_affixable(self, syl): 23 | """expects a clean syllable without ending tsek""" 24 | affixable = False 25 | if self.is_thame(syl): 26 | affixable = True 27 | for ending in ["ར", "ས", "འི", "འོ", "མ", "ང"]: 28 | if len(syl) > len(ending) and syl.endswith(ending): 29 | affixable = False 30 | return affixable 31 | 32 | def get_all_affixed(self, syl): 33 | """ 34 | :param syl: syl to be affixed 35 | :return: if affixable: [(, {'len': int, 'type': str, 'aa': bool}), (..., ...)] 36 | otherwise : 37 | """ 38 | if self.is_affixable(syl): 39 | aa = False 40 | if syl.endswith("འ") and len(syl) > 1: 41 | syl = syl[:-1] 42 | aa = True 43 | 44 | affixed = [] 45 | for a in self.affixes.keys(): 46 | metadata = {} 47 | metadata.update(self.affixes[a]) 48 | metadata.update({"aa": aa}) 49 | affixed.append((syl + a, metadata)) 50 | return affixed 51 | 52 | else: 53 | return None 54 | -------------------------------------------------------------------------------- /botok/textunits/charcategories.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import csv 3 | from collections import defaultdict 4 | 5 | from ..vars import CharMarkers as c 6 | 7 | # Get the categories of Tibetan characters from the csv file 8 | categories = defaultdict(list) 9 | table_path = Path(__file__).parent.parent / "resources/bo_uni_table.csv" 10 | for row in list(csv.reader(table_path.open(encoding="utf-8-sig")))[1:]: 11 | char = row[1].replace("—", "") 12 | cat = c[row[2]].value 13 | categories[cat].append(char) 14 | 15 | # all unicode chars liable to be used as spaces or that allowed in a valid Tibetan string 16 | # yet that will be ignored when read by a human. 
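# an ordinary space (" "), for example, is in this list, so get_char_category() further down in
# this file returns CharMarkers.TRANSPARENT.value for it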
17 | transparent = [ 18 | " ", # \U32 SPACE 19 | "᠎", # \U6158 MONGOLIAN VOWEL SEPARATOR 20 | " ", # \U8192 EN QUAD 21 | " ", # \U8193 EM QUAD 22 | " ", # \U8194 EN SPACE 23 | " ", # \U8195 EM SPACE 24 | " ", # \U8196 THREE-PER-EM SPACE 25 | " ", # \U8197 FOUR-PER-EM SPACE 26 | " ", # \U8198 SIX-PER-EM SPACE 27 | " ", # \U8199 FIGURE SPACE 28 | " ", # \U8200 PUNCTUATION SPACE 29 | " ", # \U8201 THIN SPACE 30 | " ", # \U8202 HAIR SPACE 31 | "​", # \U8203 ZERO WIDTH SPACE 32 | " ", # \U8239 NARROW NO-BREAK SPACE 33 | " ", # \U8287 MEDIUM MATHEMATICAL SPACE 34 | " ", # \U12288 IDEOGRAPHIC SPACE 35 | "", # \U65279 ZERO WIDTH NO-BREAK SPACE 36 | "\t", # Tabulation 37 | "\n", # carriage return can happen in the middle of a word 38 | ] 39 | 40 | 41 | def get_char_category(char): 42 | # source for codepoints: https://jrgraphix.net/research/unicode.php 43 | if char in transparent: 44 | return c.TRANSPARENT.value 45 | 46 | # Tibetan range 47 | if "\u0f00" <= char <= "\u0fff": 48 | for cat, chars in categories.items(): 49 | if char in chars: 50 | return cat 51 | raise ValueError( 52 | f'The char "{char}" is expected to be in the tibetan table, but is not.' 53 | ) 54 | 55 | # CJK range 56 | elif ( 57 | "\u2e80" <= char <= "\ufaff" 58 | or "\ufe30" <= char <= "\ufe4f" 59 | or eval('"\u20000"') <= char <= eval('"\u2fa1f"') 60 | ): 61 | return c.CJK.value 62 | 63 | # LATIN range 64 | # 1. 0020 - 036f: Latin Basic + Latin-1 Supplement + Latin Extended-A + Latin Extended-B 65 | # IPA Extensions + Spacing Modifier Letters + Combining Diacritical Marks 66 | # 2. 1e00 - 20cf: Latin Extended Additional + Superscripts and Subscripts + Currency Symbols 67 | elif "\u0020" <= char <= "\u036f" or "\u1e00" <= char <= "\u20cf": 68 | return c.LATIN.value 69 | 70 | else: 71 | return c.OTHER.value 72 | -------------------------------------------------------------------------------- /botok/third_party/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/botok/third_party/__init__.py -------------------------------------------------------------------------------- /botok/third_party/cqlparser.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from .pynpl.cql import Query 3 | 4 | 5 | __all__ = ["Query", "parse_cql_query", "replace_token_attributes"] 6 | 7 | 8 | def parse_cql_query(query, numerals=True, booleans=True): 9 | """ 10 | CQL parser for replacing the content of Token.attributes. 11 | From the CQL syntax, only the attribute names and the values 12 | are taken into account. 
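For example (an illustrative query), '[pos="NOUN" & freq="1000"]' would yield [{"pos": "NOUN", "freq": 1000}].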
13 | 14 | :param query: CQL query string 15 | :param numerals: if True, gives the Python int instead of the string 16 | :param booleans: if True, gives the Python bool instead of the string 17 | :return: a list of dicts, one per token slot, where 18 | keys == Token.attributes and 19 | values == content of the expected Token.attributes 20 | """ 21 | 22 | def str2int(string): 23 | try: 24 | return int(string) 25 | except ValueError: 26 | return string 27 | 28 | def str2bool(string): 29 | b = {"True": True, "False": False, "None": None} 30 | if string in b: 31 | return b[string] 32 | else: 33 | return string 34 | 35 | def cql2pattern(tokenexpr, numerals, booleans): 36 | """ 37 | Expects the following syntax: 38 | '[attribute1="value1" & attribute2="value2" (& ...)]' 39 | """ 40 | changes = {} 41 | for attrexprs in tokenexpr: 42 | key = attrexprs.attribute 43 | value = attrexprs.valueexpr[0] 44 | if numerals: 45 | value = str2int(value) 46 | if booleans: 47 | value = str2bool(value) 48 | changes[key] = value 49 | return changes 50 | 51 | if query: 52 | parsed = Query(query) 53 | pattern = [] 54 | for tokenexpr in parsed.tokenexprs: 55 | pattern.append(cql2pattern(tokenexpr, numerals, booleans)) 56 | return pattern 57 | else: 58 | return None 59 | 60 | 61 | def replace_token_attributes(tokens, token_changes): 62 | """ 63 | Applies in place the replacements found in the CQL query (token_changes) 64 | the number of tokens in the list and the number of token slots in the query 65 | must be even. 66 | 67 | :param tokens: list of tokens 68 | :param token_changes: CQL query 69 | """ 70 | changes = parse_cql_query(token_changes) 71 | if type(tokens) == list: 72 | assert len(tokens) == len(changes) 73 | for i in range(len(tokens)): 74 | for attr, value in changes[i].items(): 75 | setattr(tokens[i], attr, value) 76 | else: 77 | assert len(changes) == 1 78 | for attr, value in changes[0].items(): 79 | setattr(tokens, attr, value) 80 | -------------------------------------------------------------------------------- /botok/third_party/has_skrt_syl.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import re 3 | 4 | from ..vars import TSEK 5 | 6 | 7 | def is_skrt(syl): 8 | """Source for regexes : Paul Hackett Visual Basic script 9 | 10 | regex1: Now do Sanskrit: Skt.vowels, [g|d|b|dz]+_h, hr, shr, Skt 11 | regex2: more Sanskrit: invalid superscript-subscript pairs 12 | regex3: tsa-phru mark used in Chinese transliteration 13 | :param syl: syllable to assert 14 | :return: True if matches either of the regexes, False otherwise 15 | """ 16 | regex1 = ( 17 | r"([ཀ-ཬཱ-྅ྐ-ྼ]{0,}[ཱཱཱིུ-ཹཻཽ-ྃ][ཀ-ཬཱ-྅ྐ-ྼ]{0,}|[ཀ-ཬཱ-྅ྐ-ྼ]{0,}" 18 | r"[གཌདབཛྒྜྡྦྫ][ྷ][ཀ-ཬཱ-྅ྐ-ྼ]{0,}|[ཀ-ཬཱ-྅ྐ-ྼ]{0,}[ཤཧ][ྲ][ཀ-ཬཱ-྅ྐ-ྼ]{0,}|[ཀ-ཬཱ-྅ྐ-ྼ]{0,}" 19 | r"[གྷཊ-ཎདྷབྷཛྷཥཀྵ-ཬཱཱཱིུ-ཹཻཽ-ྃྒྷྚ-ྞྡྷྦྷྫྷྵྐྵ-ྼ][ཀ-ཬཱ-྅ྐ-ྼ]{0,})" 20 | ) 21 | regex2 = r"([ཀ-ཬཱ-྅ྐ-ྼ]{0,}[ཀཁགང-ཉཏ-དན-བམ-ཛཝ-ཡཤཧཨ][ྐ-ྫྷྮ-ྰྴ-ྼ][ཀ-ཬཱ-྅ྐ-ྼ]{0,})" 22 | regex3 = r"([ཀ-ཬཱ-྅ྐ-ྼ]{0,}[༹][ཀ-ཬཱ-྅ྐ-ྼ]{0,})" 23 | return re.search(regex1, syl) or re.search(regex2, syl) or re.search(regex3, syl) 24 | 25 | 26 | def has_skrt_syl(word): 27 | """Uses is_skrt() to check for sanskrit syllables 28 | 29 | """ 30 | skrt = False 31 | syls = word.strip(TSEK).split(TSEK) 32 | for s in syls: 33 | if is_skrt(s): 34 | skrt = True 35 | 36 | return skrt 37 | -------------------------------------------------------------------------------- /botok/third_party/pynpl/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/botok/third_party/pynpl/__init__.py -------------------------------------------------------------------------------- /botok/third_party/pynpl/fsa.py: -------------------------------------------------------------------------------- 1 | # 2 | # Original copyright notice: 3 | # 4 | # --------------------------------------------------------------- 5 | # PyNLPl - Finite State Automata 6 | # by Maarten van Gompel 7 | # Centre for Language Studies 8 | # Radboud University Nijmegen 9 | # http://proycon.github.com/folia 10 | # http://www.github.com/proycon/pynlpl 11 | # proycon AT anaproy DOT nl 12 | # 13 | # Partially based/inspired on code by Xiayun Sun (https://github.com/xysun/regex) 14 | # 15 | # Licensed under GPLv3 16 | # 17 | # ---------------------------------------------------------------- 18 | # 19 | # This file is modified and reditstributed here under APL2 with 20 | # with written permission from the original author 21 | 22 | from __future__ import print_function, unicode_literals, division, absolute_import 23 | import sys 24 | 25 | 26 | class State(object): 27 | def __init__(self, **kwargs): 28 | if "epsilon" in kwargs: 29 | self.epsilon = kwargs["epsilon"] # epsilon-closure (lis of states) 30 | else: 31 | self.epsilon = [] # epsilon-closure 32 | if "transitions" in kwargs: 33 | self.transitions = kwargs["transitions"] 34 | else: 35 | self.transitions = [] # (matchitem, matchfunction(value), state) 36 | if "final" in kwargs: 37 | self.final = bool(kwargs["final"]) # ending state 38 | else: 39 | self.final = False 40 | self.transitioned = ( 41 | None 42 | ) # will be a tuple (state, matchitem) indicating how this state was reached 43 | 44 | 45 | class NFA(object): 46 | """Non-deterministic finite state automaton. 
Can be used to model DFAs as well if your state transitions are not ambiguous and epsilon is empty.""" 47 | 48 | def __init__(self, initialstate): 49 | self.initialstate = initialstate 50 | 51 | def run(self, sequence, mustmatchall=False, debug=False): 52 | def add(state, states): 53 | """add state and recursively add epsilon transitions""" 54 | assert isinstance(state, State) 55 | if state in states: 56 | return 57 | states.add(state) 58 | for eps in state.epsilon: # recurse into epsilon transitions 59 | add(eps, states) 60 | 61 | current_states = set() 62 | add(self.initialstate, current_states) 63 | if debug: 64 | print( 65 | "Starting run, current states: ", repr(current_states), file=sys.stderr 66 | ) 67 | 68 | for offset, value in enumerate(sequence): 69 | if not current_states: 70 | break 71 | if debug: 72 | print("Value: ", repr(value), file=sys.stderr) 73 | next_states = set() 74 | for state in current_states: 75 | for matchitem, matchfunction, trans_state in state.transitions: 76 | if matchfunction(value): 77 | trans_state.transitioned = (state, matchitem) 78 | add(trans_state, next_states) 79 | 80 | current_states = next_states 81 | if debug: 82 | print("Current states: ", repr(current_states), file=sys.stderr) 83 | if not mustmatchall: 84 | for s in current_states: 85 | if s.final: 86 | if debug: 87 | print("Final state reached", file=sys.stderr) 88 | yield offset + 1 89 | 90 | if mustmatchall: 91 | for s in current_states: 92 | if s.final: 93 | if debug: 94 | print("Final state reached", file=sys.stderr) 95 | yield offset + 1 96 | 97 | def match(self, sequence): 98 | try: 99 | return next(self.run(sequence, True)) == len(sequence) 100 | except StopIteration: 101 | return False 102 | 103 | def find(self, sequence, debug=False): 104 | l = len(sequence) 105 | for i in range(0, l): 106 | for length in self.run(sequence[i:], False, debug): 107 | yield sequence[i : i + length] 108 | -------------------------------------------------------------------------------- /botok/tokenizers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/botok/tokenizers/__init__.py -------------------------------------------------------------------------------- /botok/tokenizers/chunktokenizer.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from ..chunks.chunks import TokChunks 3 | 4 | 5 | class ChunkTokenizer(TokChunks): 6 | def __init__(self, string): 7 | super().__init__(string) 8 | 9 | def tokenize(self): 10 | tokens = self.make_chunks() 11 | return self.get_readable(tokens) 12 | -------------------------------------------------------------------------------- /botok/tokenizers/paragraphtokenizer.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from .sentencetokenizer import get_sentence_indices 3 | 4 | 5 | def paragraph_tokenizer(tokens): 6 | # a paragraph is defined as a group of sentences that does not have more words than a given threshold 7 | threshold = 70 8 | paragraph_max = 150 9 | par_indices = get_sentence_indices(tokens) 10 | 11 | # join small sentences to form paragraphs 12 | i = 0 13 | while i < len(par_indices): 14 | start, end, l = ( 15 | par_indices[i]["start"], 16 | par_indices[i]["end"], 17 | par_indices[i]["len"], 18 | ) 19 | if i > 0 and l < threshold: 20 | previous_len = par_indices[i - 1]["len"] 21 | if l + previous_len < 
paragraph_max: 22 | par_indices[i - 1]["end"] = par_indices[i]["end"] 23 | par_indices[i - 1]["len"] += par_indices[i]["len"] 24 | del par_indices[i] 25 | i -= 1 26 | i += 1 27 | 28 | # get tokens for each paragraph 29 | pars = [] 30 | for par in par_indices: 31 | start, end, l = par["start"], par["end"], par["len"] 32 | pars.append((l, tokens[start : end + 1])) 33 | 34 | return pars 35 | -------------------------------------------------------------------------------- /botok/tokenizers/stacktokenizer.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | #STACK_PARTS = re.compile(r"(?:[^\u0f18\u0f19\u0f35\u0f37\u0f71-\u0f7e\u0f80-\u0f84\u0f86\u0f87\u0f8d-\u0fbc][\u0f18\u0f19\u0f35\u0f37\u0f71-\u0f7e\u0f80-\u0f84\u0f86\u0f87\u0f8d-\u0fbc]*|^[\u0f18\u0f19\u0f35\u0f37\u0f71-\u0f7e\u0f80-\u0f84\u0f86\u0f87\u0f8d-\u0fbc]+)") 4 | STACK_PARTS = re.compile(r"[^\u0f18\u0f19\u0f35\u0f37\u0f71-\u0f7e\u0f80-\u0f84\u0f86\u0f87\u0f8d-\u0fbc][\u0f18\u0f19\u0f35\u0f37\u0f71-\u0f7e\u0f80-\u0f84\u0f86\u0f87\u0f8d-\u0fbc]*") 5 | 6 | def tokenize_in_stacks(str): 7 | return STACK_PARTS.findall(str) 8 | 9 | def test_stack_tokenizer(): 10 | assert(tokenize_in_stacks("ཀཿཐོག་འབྱམ་པའཱི་རོ།") == ["ཀ", "\u0f7f", "ཐོ", "ག", "་", "འ", "བྱ", "མ", "་", "པ", "འཱི", "་", "རོ", "།"]) 11 | assert(tokenize_in_stacks("\u0f7fཀཿ") == ["\u0f7f", "ཀ", "\u0f7f"]) 12 | 13 | if __name__ == "__main__": 14 | test_stack_tokenizer() -------------------------------------------------------------------------------- /botok/tokenizers/token.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from ..vars import TSEK, AA 3 | 4 | 5 | class Token: 6 | def __init__(self): 7 | self.text = "" 8 | self.char_types = [] 9 | self.has_merged_dagdra = None 10 | self.lemma = "" 11 | self.sense = "" 12 | self.chunk_type = None 13 | self.start = 0 14 | self.len = None 15 | self.syls_idx = None 16 | self.syls_start_end = None 17 | self.pos = "" 18 | self.affixation = {} 19 | self.senses = None 20 | self.affix = False 21 | self.affix_host = False 22 | self.form_freq = None 23 | self.freq = None 24 | self.skrt = False 25 | self._ = {} # dict for any user specific data 26 | 27 | def __getitem__(self, attr): 28 | # allows to access attributes with the Token['attr'] syntax, besides the Token.attr default 29 | try: 30 | return self.__getattribute__(attr) 31 | except AttributeError: 32 | raise AttributeError("does not have attribute: " + attr) 33 | 34 | def __setitem__(self, key, value): 35 | # enforces not to add any extra attribute. Token._ should be used for any custom data 36 | if hasattr(self, key): 37 | if key != "_": 38 | self.__dict__[key] = value 39 | else: 40 | if not isinstance(value, dict): 41 | raise TypeError("only dicts are accepted for Token._") 42 | self.__dict__[key].update(value) 43 | else: 44 | raise AttributeError("Token objects don't have " + key + " as attribute") 45 | 46 | @property 47 | def syls(self): 48 | return ( 49 | [[self.text[s] for s in syl] for syl in self.syls_idx] 50 | if self.syls_idx 51 | else "" 52 | ) 53 | 54 | @property 55 | def text_cleaned(self): 56 | """ 57 | Will append a TSEK to every syllable except syllables that host 58 | an affix. 
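For a regular token (one that does not host an affix), syls such as [["བ", "ཀ", "ྲ"], ["ཤ", "ི", "ས"]] would thus give "བཀྲ་ཤིས་" (illustrative values).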
59 | 60 | """ 61 | if self.syls: 62 | cleaned = TSEK.join(["".join(syl) for syl in self.syls]) 63 | if self.affix_host and not self.affix: 64 | return cleaned 65 | else: 66 | return cleaned + TSEK 67 | else: 68 | return "" 69 | 70 | @property 71 | def text_unaffixed(self): 72 | unaffixed = TSEK.join(["".join(syl) for syl in self.syls]) if self.syls else "" 73 | if ( 74 | self.affixation 75 | and not self.affix 76 | and "len" in self.affixation 77 | and len([True for m in self.senses if "affixed" in m and m["affixed"]]) > 0 78 | ): 79 | unaffixed = unaffixed[: -self.affixation["len"]] 80 | 81 | if unaffixed and "aa" in self.affixation and self.affixation["aa"]: 82 | unaffixed += AA 83 | 84 | if self.affixation and self.affix_host and not self.affix: 85 | return unaffixed 86 | elif unaffixed: 87 | return unaffixed + TSEK 88 | else: 89 | return "" 90 | 91 | def __repr__(self): 92 | out = 'text: "{}"\n'.format(self.text) 93 | if self.text_cleaned: 94 | out += 'text_cleaned: "{}"\n'.format(self.text_cleaned) 95 | if self.text_unaffixed: 96 | out += 'text_unaffixed: "{}"\n'.format(self.text_unaffixed) 97 | if self.syls and self.syls != []: 98 | out += ( 99 | 'syls: ["' + '", "'.join(["".join(syl) for syl in self.syls]) + '"]\n' 100 | ) 101 | if self.pos: 102 | out += "pos: {}\n".format(self.pos) 103 | if self.lemma: 104 | out += "lemma: {}\n".format(self.lemma) 105 | if self.sense: 106 | out += "sense: {}\n".format(self.sense) 107 | if self.senses: 108 | out += ( 109 | "senses: | " 110 | + " | ".join( 111 | [ 112 | ", ".join([f"{k}: {v}" for k, v in m.items()]) 113 | for m in self.senses 114 | ] 115 | ) 116 | + " |\n" 117 | ) 118 | if self.char_types: 119 | out += "char_types: |" + "|".join(self.char_types) + "|\n" 120 | if self.chunk_type: 121 | out += "chunk_type: {}\n".format(self.chunk_type) 122 | if self.form_freq: 123 | out += "form_freq: {}\n".format(self.form_freq) 124 | if self.freq: 125 | out += "freq: {}\n".format(self.freq) 126 | if self.skrt: 127 | out += "skrt: {}\n".format(self.skrt) 128 | if self.affix: 129 | out += "affix: {}\n".format(self.affix) 130 | if self.affix_host: 131 | out += "affix_host: {}\n".format(self.affix_host) 132 | if self.has_merged_dagdra: 133 | out += "has_merged_dagdra: {}\n".format(self.has_merged_dagdra) 134 | if self.syls_idx: 135 | out += "syls_idx: {}\n".format(self.syls_idx) 136 | if self.syls_start_end: 137 | out += "syls_start_end: {}\n".format(self.syls_start_end) 138 | out += "start: {}\n".format(self.start) 139 | out += "len: {}\n".format(self.len) 140 | if self._: 141 | out += "\n" 142 | for k, v in self._.items(): 143 | out += "_{}: {}\n".format(k, v) 144 | out += "\n" 145 | return out 146 | -------------------------------------------------------------------------------- /botok/tokenizers/wordtokenizer.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | import csv 3 | from pathlib import Path 4 | 5 | from ..chunks.chunks import TokChunks 6 | from ..config import Config 7 | from ..modifytokens.adjusttokens import AdjustTokens 8 | from ..modifytokens.mergedagdra import MergeDagdra 9 | from ..modifytokens.splitaffixed import split_affixed 10 | from ..textunits.bosyl import BoSyl 11 | from ..tries.trie import Trie 12 | from ..vars import AA, TSEK 13 | from .tokenize import Tokenize 14 | 15 | 16 | def get_part_lemmas(path): 17 | part_lemmas = {} 18 | if not path.is_file(): 19 | return part_lemmas 20 | with path.open("r", encoding="utf-8-sig") as f: 21 | reader = csv.reader(f, delimiter="\t") 
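# the [1:] below skips the header row; each remaining row is expected to hold five tab-separated
# cells (presumably form, POS, lemma, sense, freq), of which only the form and the lemma are used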
22 | for row in list(reader)[1:]: 23 | form, _, lemma, _, _ = row 24 | part_lemmas[form] = lemma 25 | return part_lemmas 26 | 27 | 28 | class WordTokenizer: 29 | """ 30 | Convenience class to tokenize a given string. 31 | 32 | """ 33 | 34 | def __init__( 35 | self, config=None, ignore_chars=None, build_trie=False, 36 | ): 37 | """ 38 | :param tok_profile: profile for building the trie. (see config.yaml) 39 | """ 40 | if not config: 41 | # if config is not given then use default config 42 | config = Config() 43 | 44 | self.config = config 45 | self.ignore_chars = ignore_chars 46 | self.tok = Tokenize( 47 | Trie( 48 | BoSyl, 49 | config.profile, 50 | main_data=config.dictionary, 51 | custom_data=config.adjustments, 52 | pickle_path=config.dialect_pack_path.parent, 53 | build=build_trie, 54 | ) 55 | ) 56 | 57 | self.adj = AdjustTokens( 58 | main=config.dictionary["rules"], custom=config.adjustments["rules"] 59 | ) 60 | 61 | self.part_lemmas = get_part_lemmas( 62 | config.dialect_pack_path 63 | / "dictionary" 64 | / "words_non_inflected" 65 | / "particles.tsv" 66 | ) 67 | 68 | def tokenize(self, string, split_affixes=True, spaces_as_punct=False, debug=False): 69 | """ 70 | :param string: to be tokenized 71 | :param split_affixes: separates the affixed particles into seperate tokens if True 72 | :param debug: print debug info while parsing 73 | :return: list of pybo.tokenizers.Token objects 74 | """ 75 | preprocessed = TokChunks( 76 | string, ignore_chars=self.ignore_chars, space_as_punct=spaces_as_punct 77 | ) 78 | preprocessed.serve_syls_to_trie() 79 | tokens = self.tok.tokenize(preprocessed, debug=debug) 80 | 81 | if split_affixes: 82 | split_affixed(tokens) 83 | 84 | self._get_default_lemma(tokens) 85 | self._choose_default_entry(tokens) 86 | 87 | # merge pa/po/ba/bo tokens with previous ones 88 | MergeDagdra().merge(tokens) 89 | 90 | # do adjustments 91 | tokens = self.adj.adjust(tokens) 92 | 93 | return tokens 94 | 95 | def _get_default_lemma(self, token_list): 96 | for t in token_list: 97 | # pass any token that is not a word 98 | if not t.text_unaffixed: 99 | continue 100 | 101 | # otherwise, check whether the aa needs to be added and if a tsek should be added 102 | if t.affix and not t.affix_host: 103 | part = "".join(["".join(syl) for syl in t.syls]) 104 | lemma = self.part_lemmas[part] if part in self.part_lemmas else part 105 | lemma += TSEK 106 | elif not t.affix and t.affix_host: 107 | lemma = ( 108 | t.text_unaffixed + AA + TSEK 109 | if t.affixation["aa"] 110 | else t.text_unaffixed + TSEK 111 | ) 112 | else: 113 | lemma = ( 114 | t.text_unaffixed 115 | if t.text_unaffixed.endswith(TSEK) 116 | else t.text_unaffixed + TSEK 117 | ) 118 | 119 | for m in t.senses: 120 | if "lemma" not in m and ("pos" in m and m["pos"] != "NON_WORD"): 121 | m["lemma"] = lemma 122 | if not t.senses: 123 | t.senses.append({"lemma": lemma}) 124 | 125 | @staticmethod 126 | def _choose_default_entry(token_list): 127 | def choose_n_apply(senses, t): 128 | s = sorted(senses, key=len, reverse=True) 129 | for a in ["pos", "lemma", "freq", "sense"]: 130 | if a in s[0]: 131 | t[a] = s[0][a] 132 | 133 | for t in token_list: 134 | if t.senses: 135 | # Categorize all meanings in three groups 136 | affixed, non_affixed, no = [], [], [] 137 | for m in t.senses: 138 | if "affixed" in m: 139 | if m["affixed"]: 140 | affixed.append(m) 141 | else: 142 | non_affixed.append(m) 143 | else: 144 | no.append(m) 145 | 146 | # Decide what meaning to use as default 147 | # get a meaning from either group in the following order: 
non_affixed, no, affixed 148 | # take the one with the highest amount of attrs 149 | if non_affixed: 150 | choose_n_apply(non_affixed, t) 151 | elif no: 152 | choose_n_apply(no, t) 153 | elif affixed: 154 | choose_n_apply(affixed, t) 155 | else: 156 | raise ValueError("This should never happen.") 157 | -------------------------------------------------------------------------------- /botok/tries/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/botok/tries/__init__.py -------------------------------------------------------------------------------- /botok/tries/basictrie.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | # inspired from https://gist.github.com/nickstanisha/733c134a0171a00f66d4 4 | # and https://github.com/eroux/tibetan-phonetics-py 5 | 6 | 7 | class Node: 8 | def __init__(self, label=None, leaf=False, data=None): 9 | if data is None: 10 | data = {'_': {}} # the dict in '_' is for user-data 11 | self.label = label 12 | self.leaf = leaf 13 | self.data = data 14 | self.children = dict() 15 | 16 | def add_child(self, key, leaf=False): 17 | if not isinstance(key, Node): 18 | self.children[key] = Node(key, leaf) 19 | else: 20 | self.children[key.leaf] = key 21 | 22 | def can_walk(self): 23 | return self.children != dict() 24 | 25 | def is_match(self): 26 | return self.leaf 27 | 28 | def __getitem__(self, key): 29 | return self.children[key] 30 | 31 | 32 | class BasicTrie: 33 | def __init__(self): 34 | self.head = Node() 35 | 36 | def __getitem__(self, key): 37 | return self.head.children[key] 38 | 39 | def add(self, word, data=None): 40 | # adding the word 41 | current_node = self.head 42 | word_finished = True 43 | 44 | i = 0 45 | for i in range(len(word)): 46 | if word[i] in current_node.children: 47 | current_node = current_node.children[word[i]] 48 | else: 49 | word_finished = False 50 | break 51 | 52 | if not word_finished: 53 | while i < len(word): 54 | current_node.add_child(word[i]) 55 | current_node = current_node.children[word[i]] 56 | i += 1 57 | 58 | current_node.leaf = True 59 | 60 | # adding data to the node 61 | if data: 62 | assert isinstance(data, dict) 63 | current_node.data.update(data) 64 | 65 | def walk(self, char, current_node=None): 66 | # logic of walking the trie adapted to be done outside the trie class (for Tokenize) 67 | if not current_node: 68 | current_node = self.head 69 | 70 | if char in current_node.children: 71 | next_node = current_node[char] 72 | else: 73 | next_node = None 74 | 75 | return next_node 76 | 77 | def has_word(self, word): 78 | if not word: 79 | raise ValueError('"word" must be non-null string') 80 | 81 | # parse the word 82 | current_node = self.head 83 | exists = True 84 | for syl in word: 85 | if syl in current_node.children: 86 | current_node = current_node.children[syl] 87 | else: 88 | exists = False 89 | break 90 | else: 91 | # reached a word like 't', not a full word in our dictionary 92 | if exists and not current_node.leaf: 93 | exists = False 94 | 95 | if exists: 96 | return {"exists": exists, "data": current_node.data} 97 | else: 98 | return {"exists": exists, "data": current_node.data} 99 | 100 | def add_data(self, word, data): 101 | """Adds data to words. 
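An int is stored as the word's form frequency ("form_freq"); a dict is treated as one sense and appended to the node's "senses" list if it is not already present.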
102 | 103 | :param word: word to add 104 | :param data: dict of content to add 105 | :return: True if any content added, False otherwise 106 | """ 107 | if not word: 108 | raise ValueError('"word" must be non-null string') 109 | 110 | # parse word 111 | current_node = self.head 112 | for syl in word: 113 | if syl in current_node.children: 114 | current_node = current_node.children[syl] 115 | else: 116 | return False 117 | 118 | # not a complete word 119 | if not current_node.leaf: 120 | return False 121 | 122 | # adding data 123 | if isinstance(data, int): 124 | current_node.data["form_freq"] = data 125 | added = True 126 | else: 127 | if "senses" not in current_node.data: 128 | current_node.data["senses"] = [] 129 | added = self.add_meaning(current_node.data["senses"], data) 130 | return added 131 | 132 | def add_meaning(self, meanings, meaning): 133 | if meanings: 134 | for m in meanings: 135 | if self.is_diff_meaning(meaning, m): 136 | meanings.append(meaning) 137 | return True 138 | return False 139 | else: 140 | meanings.append(meaning) 141 | return True 142 | 143 | @staticmethod 144 | def is_diff_meaning(m1, m2): 145 | is_diff = False 146 | for k, v in m1.items(): 147 | if k not in m2 or k in m2 and m2[k] != v: 148 | is_diff = True 149 | return is_diff 150 | 151 | def deactivate(self, word, rev=False): 152 | """Makes word not findable (words are found only when the leaf value is True) 153 | 154 | :param word: word to deactivate 155 | :param rev: reverse the deactivation 156 | :return True if the word exists, False otherwise 157 | """ 158 | current_node = self.head 159 | for syl in word: 160 | if syl in current_node.children: 161 | current_node = current_node.children[syl] 162 | else: 163 | return False 164 | if isinstance(current_node.data, dict): 165 | if not rev: 166 | current_node.leaf = False 167 | else: 168 | current_node.leaf = True 169 | return True 170 | else: 171 | return False 172 | -------------------------------------------------------------------------------- /botok/tries/trie.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import csv 3 | import pickle 4 | import time 5 | from pathlib import Path 6 | import logging 7 | 8 | from ..chunks.chunks import TokChunks 9 | from ..vars import HASH, NAMCHE, NO_POS, TSEK, __version__ 10 | from .basictrie import BasicTrie, Node 11 | 12 | 13 | class Trie(BasicTrie): 14 | def __init__( 15 | self, bosyl, profile, main_data, custom_data, build=False, pickle_path=None 16 | ): 17 | BasicTrie.__init__(self) 18 | self.bosyl = bosyl() 19 | self.main_data = main_data 20 | self.custom_data = custom_data 21 | self.pickled_file = Path(profile + "_trie.pickled") 22 | if pickle_path: 23 | self.pickled_file = Path(pickle_path) / self.pickled_file 24 | self.tmp_inflected = ( 25 | dict() 26 | ) # tmp to inflect only once, even if a word appears in many files. 
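# load the pickled trie or (re)build it from the main dialect-pack data, then overlay the custom
# entries in memory (see load_or_build_trie() just below)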
27 | self.load_or_build_trie(build) 28 | 29 | def rebuild_trie(self): 30 | self.head = Node() 31 | self.load_or_build_trie(build=True) 32 | 33 | def load_or_build_trie(self, build=False): 34 | if build or not self.pickled_file.exists(): 35 | self._build_trie() 36 | else: 37 | self._load_trie() 38 | 39 | # add and deactivate the custom entries in memory (will not be written) 40 | self._populate_trie(self.custom_data) 41 | self.tmp_inflected = dict() 42 | 43 | def _load_trie(self): 44 | with self.pickled_file.open("rb") as f: 45 | self.head = pickle.load(f) 46 | version = self.head.data["_"]["version"] 47 | if version != __version__: 48 | print( 49 | f"\nThe trie was build for botok {version}. Current version: {__version__}" 50 | ) 51 | self._build_trie() 52 | 53 | def _build_trie(self): 54 | """ 55 | """ 56 | logging.debug("Building Trie:") 57 | start = time.time() 58 | self.head.data["_"]["version"] = __version__ # add version in trie 59 | self._populate_trie(self.main_data) 60 | 61 | with self.pickled_file.open("wb") as f: 62 | pickle.dump(self.head, f, pickle.HIGHEST_PROTOCOL) 63 | end = time.time() 64 | logging.debug("({:.0f} s.)".format(end - start)) 65 | 66 | def _populate_trie(self, files): 67 | # first populate the trie with words 68 | lexica = (d for d in files if d.startswith("lexica")) 69 | for l in lexica: 70 | for f in files[l]: 71 | self._add_one_file(f, l) 72 | 73 | # then add data to the added words 74 | rest = ( 75 | d for d in files if not d.startswith("lexica") and not d.startswith("rules") 76 | ) 77 | for r in rest: 78 | for f in files[r]: 79 | self._add_one_file(f, r) 80 | 81 | def _add_one_file(self, in_file, category): 82 | """ 83 | files can have comments starting with # 84 | spaces and empty lines are trimmed 85 | a single space(breaks if more than one), a comma or a tab can be used as separators 86 | """ 87 | logging.debug("\t" + str(in_file)) 88 | with in_file.open("r", encoding="utf-8-sig") as f: 89 | lines = self.__clean_lines(f) 90 | for l in lines: 91 | word = l.split("\t", 1)[0] 92 | if category == "words": 93 | self.inflect_n_modify_trie(word) 94 | self.inflect_n_add_data(l) 95 | 96 | elif category == "words_non_inflected": 97 | self.add_non_inflectible(word) 98 | self.inflect_n_add_data(l) 99 | 100 | elif category == "words_skrt": 101 | self.inflect_n_modify_trie(word, skrt=True) 102 | self.inflect_n_add_data(l) 103 | 104 | elif category == "remove": 105 | self.inflect_n_modify_trie(l, deactivate=True) 106 | 107 | else: 108 | raise SyntaxError( 109 | "'category' is: '" 110 | + category 111 | + "'. 
Valid answers: words_bo, words_skrt," 112 | "words_non_inflected, entry_data, remove" 113 | ) 114 | 115 | def add_non_inflectible(self, word): 116 | syls = TokChunks(word).get_syls() 117 | if not syls: 118 | return None 119 | 120 | # infl = self.__join_syls(syls) 121 | self.add(syls) 122 | 123 | def inflect_n_modify_trie(self, word, deactivate=False, skrt=False): 124 | """ 125 | Add or deactivate to the trie all the affixed versions of the word 126 | :param word: a word without ending tsek 127 | :param deactivate: switch to add or deactivate a word 128 | """ 129 | inflected = self._get_inflected(word) 130 | if not inflected: 131 | return 132 | 133 | for infl, data in inflected: 134 | if deactivate: 135 | self.deactivate(infl) 136 | else: 137 | if skrt: 138 | if data is None: 139 | data = {"skrt": True} 140 | else: 141 | data.update({"skrt": True}) 142 | self.add(infl, data=data) 143 | else: 144 | self.add(infl, data=data) 145 | 146 | def inflect_n_add_data(self, line): 147 | form, pos, lemma, sense, freq = self.__parse_line(line) 148 | freq = int(freq) if freq else None 149 | lemma = self.__join_syls(TokChunks(lemma).get_syls()) if lemma else None 150 | 151 | inflected = self._get_inflected(form) 152 | if not inflected: 153 | return # The entry is not Tibetan, so return doing nothing 154 | 155 | for infl, _ in inflected: 156 | affixed = True if _ else False 157 | data = { 158 | k: v 159 | for k, v in [ 160 | ("lemma", lemma), 161 | ("pos", pos), 162 | ("freq", freq), 163 | ("sense", sense), 164 | ("affixed", affixed), 165 | ] 166 | if v is not None 167 | } 168 | self.add_data(infl, data) 169 | 170 | def _get_inflected(self, word): 171 | """ 172 | gets the clean syls using TokChunks(), then inflects the last syl using BoSyl.get_all_affixed() 173 | 174 | :return: list of (, ) 175 | """ 176 | if word in self.tmp_inflected: 177 | return self.tmp_inflected[word] 178 | 179 | syls = TokChunks(word).get_syls() 180 | if not syls: 181 | return None 182 | 183 | inflected = [(syls, None)] 184 | affixed = self.bosyl.get_all_affixed(syls[-1]) 185 | if affixed: 186 | for infl, data in affixed: 187 | infl_word = syls[:-1] + [infl] 188 | inflected.append((infl_word, {"affixation": data})) 189 | 190 | self.tmp_inflected[word] = inflected 191 | return inflected 192 | 193 | @staticmethod 194 | def __join_syls(syls): 195 | return "".join([syl if syl.endswith(NAMCHE) else syl + TSEK for syl in syls]) 196 | 197 | @staticmethod 198 | def __clean_lines(f): 199 | # cuts off comments, then strips empty lines 200 | lines = ( 201 | line[: line.index(HASH)] if HASH in line else line for line in f.readlines() 202 | ) 203 | return (l for l in lines if l) 204 | 205 | @staticmethod 206 | def __parse_line(line): 207 | """ 208 | enables support of '\t' and ',' as separator. 
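Returns a list of five cells in the order form, POS, lemma, sense, freq (missing cells default to None).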
209 | """ 210 | fields = [None, None, None, None, None] 211 | if "\t" in line: 212 | sep = "\t" 213 | elif "," in line: 214 | sep = "," 215 | else: 216 | fields[0] = line 217 | fields[2] = NO_POS 218 | return fields 219 | 220 | for num, cell in enumerate(list(csv.reader([line], delimiter=sep))[0]): 221 | fields[num] = cell if cell else None 222 | return fields 223 | -------------------------------------------------------------------------------- /botok/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/botok/utils/__init__.py -------------------------------------------------------------------------------- /botok/utils/expose_data.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from pathlib import Path 3 | 4 | from ..config import Config 5 | 6 | 7 | def expose_data(out_path, profile=None): 8 | """ Copies all the trie and adjustment data to out_path 9 | :param out_path: must be an existing empty folder 10 | """ 11 | if profile not in ["POS", "empty"]: 12 | raise SyntaxError('profile should be either one of ["POS", "empty"]') 13 | 14 | out_path = Path(out_path) 15 | if not out_path.is_dir() or list(out_path.glob("*")): 16 | raise IOError("out_path should be an empty folder") 17 | 18 | resources = Path(__file__).parent / "../resources" 19 | resources = resources.resolve() 20 | res_dirs = [r for r in resources.glob("*") if r.is_dir()] 21 | 22 | if profile: 23 | # export profile data 24 | for f in Config().config["tokenizers"]["profiles"][profile]: 25 | Path(out_path / Path(f).parent).mkdir( 26 | parents=True, exist_ok=True 27 | ) # create dir 28 | shutil.copy(resources / f, out_path / f) 29 | 30 | shutil.copytree(resources / "adjustment", out_path / "adjustment") 31 | 32 | else: 33 | # export all data 34 | for r in res_dirs: 35 | shutil.copytree(r, out_path / r.name) 36 | -------------------------------------------------------------------------------- /botok/utils/helpers.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | 3 | 4 | def decomment_file(file): 5 | for row in file: 6 | raw = row.split("#")[0].strip() 7 | if raw: 8 | yield raw 9 | -------------------------------------------------------------------------------- /botok/utils/unicode_normalization.py: -------------------------------------------------------------------------------- 1 | import re 2 | from enum import Enum 3 | 4 | class Cats(Enum): 5 | Other = 0 6 | Base = 1 7 | Subscript = 2 8 | BottomVowel = 3 9 | BottomMark = 4 10 | TopVowel = 5 11 | TopMark = 6 12 | RightMark = 7 13 | 14 | 15 | CATEGORIES = ( 16 | [Cats.Other] # 0F00 17 | + [Cats.Base] # 0F01, often followed by 0f083 18 | + [Cats.Other] * 22 # 0F02-0F17 19 | + [Cats.BottomVowel] * 2 # 0F18-0F19 20 | + [Cats.Other] * 6 # 0F1A-0F1F 21 | + [Cats.Base] 22 | * 20 # 0F20-0F33, numbers can be followed by 0f18, 0f19 or exceptionally by vowels 23 | + [Cats.Other] # 0F34 24 | + [Cats.BottomMark] # 0F35 25 | + [Cats.Other] # 0F36 26 | + [Cats.BottomMark] # OF37 27 | + [Cats.Other] # 0F38 28 | + [Cats.Subscript] # 0F39, kind of cheating but works 29 | + [Cats.Other] * 4 # 0F3A-0F3D 30 | + [Cats.RightMark] # 0F3E 31 | + [Cats.Other] # 0F3F, not quite sure 32 | + [Cats.Base] * 45 # 0F40-0F6C 33 | + [Cats.Other] * 4 # 0F6D-0F70 34 | + [Cats.BottomVowel] # 0F71 35 | + [Cats.TopVowel] # 0F72 36 | + [Cats.TopVowel] # 0F73 37 | + 
[Cats.BottomVowel] * 2 # 0F74-0F75 38 | + [Cats.TopVowel] * 8 # 0F76-0F7D 39 | + [Cats.TopMark] # 0F7E 40 | + [Cats.RightMark] # 0F7F 41 | + [Cats.TopVowel] * 2 # 0F80-0F81 42 | + [Cats.TopMark] * 2 # 0F82-0F83 43 | + [Cats.BottomMark] # 0F84 44 | + [Cats.Other] # 0F85 45 | + [Cats.TopMark] * 2 # 0F86-0F87 46 | + [Cats.Base] * 2 # 0F88-0F89 47 | + [Cats.Base] # 0F8A always followed by 0f82 (required by the Unicode spec) 48 | + [Cats.Other] # 0F8B 49 | + [Cats.Base] # 0F8C 50 | + [Cats.Subscript] * 48 # 0F8D-0FBC 51 | ) 52 | 53 | 54 | def charcat(c): 55 | """Returns the category for a single char string""" 56 | o = ord(c) 57 | if 0x0F00 <= o <= 0x0FBC: 58 | return CATEGORIES[o - 0x0F00] 59 | return Cats.Other 60 | 61 | 62 | # debug: 63 | # for i, c in enumerate(CATEGORIES): 64 | # print("%x : %d" % (0x0F00 + i , c.value)) 65 | 66 | 67 | def unicode_reorder(txt): 68 | # case of a syllable starting with a diacritic (ex: a vowel or subscript) 69 | # we push it after the first main letter 70 | # txt = re.sub(r"^([\u0f71-\u0f84\u0f8d-\u0fbc]+)([\u0f40-\u0f6c])", r"\2", txt) 71 | # return txt, True 72 | # inpired from code for Khmer Unicode provided by SIL 73 | # https://docs.microsoft.com/en-us/typography/script-development/tibetan#reor 74 | # https://docs.microsoft.com/en-us/typography/script-development/use#glyph-reordering 75 | charcats = [charcat(c) for c in txt] 76 | # find subranges of base+non other and sort components in the subrange 77 | i = 0 78 | res = [] 79 | valid = True 80 | while i < len(charcats): 81 | c = charcats[i] 82 | if c != Cats.Base: 83 | if c.value > Cats.Base.value: 84 | valid = False 85 | res.append(txt[i]) 86 | i += 1 87 | continue 88 | # scan for end of component 89 | j = i + 1 90 | while j < len(charcats) and charcats[j].value > Cats.Base.value: 91 | j += 1 92 | # sort syllable based on character categories 93 | # sort the char indices by category then position in string 94 | newindices = sorted(range(i, j), key=lambda e: (charcats[e].value, e)) 95 | replaces = "".join(txt[n] for n in newindices) 96 | res.append(replaces) 97 | i = j 98 | return "".join(res), valid 99 | 100 | 101 | def normalize_unicode(s, form="nfd"): 102 | # first, unify Unicode form: 103 | # http://www.unicode.org/faq/normalization.html 104 | # https://unicode.org/reports/tr15/ 105 | # https://unicode.org/charts/normalization/chart_Tibetan.html 106 | # although for some reason this chart considers 0f0c -> 0f0b in NFD 107 | # 108 | # deprecated or discouraged characters 109 | s = s.replace("\u0f73", "\u0f71\u0f72") # use is discouraged 110 | s = s.replace("\u0f75", "\u0f71\u0f74") # use is discouraged 111 | s = s.replace("\u0f77", "\u0fb2\u0f71\u0f80") # deprecated 112 | s = s.replace("\u0f79", "\u0fb3\u0f71\u0f80") # deprecated 113 | s = s.replace("\u0f81", "\u0f71\u0f80") # use is discouraged 114 | if form == "nfd": 115 | s = s.replace("\u0f43", "\u0f42\u0fb7") 116 | s = s.replace("\u0f4d", "\u0f4c\u0fb7") 117 | s = s.replace("\u0f52", "\u0f51\u0fb7") 118 | s = s.replace("\u0f57", "\u0f56\u0fb7") 119 | s = s.replace("\u0f5c", "\u0f5b\u0fb7") 120 | s = s.replace("\u0f69", "\u0f40\u0fb5") 121 | s = s.replace("\u0f76", "\u0fb2\u0f80") 122 | s = s.replace("\u0f78", "\u0fb3\u0f80") 123 | s = s.replace("\u0f93", "\u0f92\u0fb7") 124 | s = s.replace("\u0f9d", "\u0f9c\u0fb7") 125 | s = s.replace("\u0fa2", "\u0fa1\u0fb7") 126 | s = s.replace("\u0fa7", "\u0fa6\u0fb7") 127 | s = s.replace("\u0fac", "\u0fab\u0fb7") 128 | s = s.replace("\u0fb9", "\u0f90\u0fb5") 129 | else: 130 | s = s.replace("\u0f42\u0fb7", 
"\u0f43") 131 | s = s.replace("\u0f4c\u0fb7", "\u0f4d") 132 | s = s.replace("\u0f51\u0fb7", "\u0f52") 133 | s = s.replace("\u0f56\u0fb7", "\u0f57") 134 | s = s.replace("\u0f5b\u0fb7", "\u0f5c") 135 | s = s.replace("\u0f40\u0fb5", "\u0f69") 136 | s = s.replace("\u0fb2\u0f80", "\u0f76") 137 | s = s.replace("\u0fb3\u0f80", "\u0f78") 138 | s = s.replace("\u0f92\u0fb7", "\u0f93") 139 | s = s.replace("\u0f9c\u0fb7", "\u0f9d") 140 | s = s.replace("\u0fa1\u0fb7", "\u0fa2") 141 | s = s.replace("\u0fa6\u0fb7", "\u0fa7") 142 | s = s.replace("\u0fab\u0fb7", "\u0fac") 143 | s = s.replace("\u0f90\u0fb5", "\u0fb9") 144 | # 0f00 has not been marked as a composed character in Unicode 145 | # This is something that is now seen as a mistake, but it cannot be 146 | # changed because of Unicode change policies. 147 | s = s.replace("\u0f00", "\u0f68\u0f7c\u0f7e") 148 | s, valid = unicode_reorder(s) 149 | # ra doesn't transform into a small rago before anything else than (most) subjoined, 150 | # so 0f65 should be replaced with 0f62 in that case 151 | s = re.sub("\u0f65([^\u0f90-\u0f97\u0f9a-\u0fac\u0fae\u0faf\u0fb4-\u0fbc])", r"ར\1", s) 152 | s = normalize_invalid_start_string(s) 153 | return s 154 | 155 | def debug_to_unicode(s): 156 | res = "" 157 | for c in s: 158 | res += "\\u%x " % ord(c) 159 | return res 160 | 161 | 162 | def assert_conv(orig, expected, expectedValid=True): 163 | resultStr = normalize_unicode(orig) 164 | assert resultStr == expected, "{} -> {} but {} expected".format( 165 | debug_to_unicode(orig), debug_to_unicode(resultStr), debug_to_unicode(expected) 166 | ) 167 | #assert resultValid == expectedValid, "{} valid? -> {} but {} expected".format( 168 | # debug_to_unicode(orig), resultValid, expectedValid 169 | #) 170 | 171 | 172 | def is_vowel(char): 173 | if re.search(r"[\u0f71-\u0f84]", char): 174 | return True 175 | return False 176 | 177 | 178 | def is_suffix(char): 179 | if re.search(r"[\u0f90-\u0fbc]", char): 180 | return True 181 | return False 182 | 183 | 184 | def normalize_invalid_start_string(s): 185 | if len(s) < 2: 186 | return s 187 | # we put the vowel in second place if the string starts with a vowel 188 | if is_vowel(s[0]) and not is_vowel(s[1]) and not is_suffix(s[1]): 189 | return s[1] + s[0] + (s[2:] if len(s) > 2 else "") 190 | if is_suffix(s[0]): 191 | return s[1:] 192 | return s 193 | 194 | 195 | def test_normalize_unicode(): 196 | assert_conv("\u0F7B\u0F56", "\u0F56\u0F7B", False) 197 | assert_conv("\u0f40\u0f77", "\u0f40\u0fb2\u0f71\u0f80", False) 198 | assert_conv("\u0f40\u0f7e\u0f7c\u0f74\u0f71", "\u0f40\u0f74\u0f71\u0f7c\u0f7e") 199 | assert_conv("\u0f58\u0f74\u0fb0\u0f83", "\u0f58\u0fb0\u0f74\u0f83") 200 | assert_conv("\u0F51\u0FB7\u0F74\u0FB0", "\u0F51\u0FB7\u0fb0\u0F74") 201 | assert_conv("\u0F66\u0F7C\u0FB1", "\u0F66\u0FB1\u0F7C") 202 | assert_conv("\u0F0B\u0F7E", "\u0F0B\u0F7E", False) 203 | assert_conv("\u0f65\u0f99\u0f7a\u0f7a", "\u0f62\u0f99\u0f7a\u0f7a") 204 | assert_conv("\u0f01\u0f83", "\u0f01\u0f83") # should be valid 205 | 206 | 207 | if __name__ == "__main__": 208 | test_normalize_unicode() -------------------------------------------------------------------------------- /botok/vars.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | from enum import Enum, IntEnum 3 | 4 | __version__ = "0.9.0" 5 | 6 | NO_POS = "NOPOS" 7 | TSEK = "་" 8 | NAMCHE = "ཿ" 9 | SHAD = "།" 10 | AA = "འ" 11 | HASH = "#" 12 | VOWELS = ["ི"] 13 | NO_SHAD_CONS = ["ཀ", "ག", "ཤ"] 14 | DAGDRA = ["པ་", "པོ་", "བ་", "བོ་"] 15 | 
16 | CharMarkers = IntEnum( 17 | "CharMarkers", 18 | [ 19 | # regular Tibetan 20 | "CONS", 21 | "SUB_CONS", 22 | "VOW", 23 | "TSEK", 24 | # punctuation 25 | "NORMAL_PUNCT", 26 | "SPECIAL_PUNCT", 27 | # others 28 | "NUMERAL", 29 | "SYMBOL", 30 | "IN_SYL_MARK", 31 | "NON_BO_NON_SKRT", 32 | # lexica_skrt 33 | "SKRT_CONS", 34 | "SKRT_SUB_CONS", 35 | "SKRT_VOW", 36 | "SKRT_LONG_VOW", 37 | # other languages 38 | "CJK", 39 | "LATIN", 40 | # misc 41 | "OTHER", 42 | "TRANSPARENT", 43 | "NFC", 44 | ], 45 | start=1, 46 | ) 47 | char_values = {c.value: c.name for c in CharMarkers} 48 | 49 | ChunkMarkers = IntEnum( 50 | "ChunkMarkers", 51 | [ 52 | # languages 53 | "BO", 54 | "LATIN", 55 | "CJK", 56 | "OTHER", 57 | # tibetan textual content 58 | "TEXT", 59 | # tibetan non-textual content 60 | "PUNCT", 61 | "NON_PUNCT", 62 | "SPACE", 63 | "NON_SPACE", 64 | "SYM", 65 | "NON_SYM", 66 | "NUM", 67 | "NON_NUM", 68 | ], 69 | start=100, 70 | ) 71 | chunk_values = {c.value: c.name for c in ChunkMarkers} 72 | 73 | WordMarkers = IntEnum("WordMarkers", ["WORD", "NO_POS", "NON_WORD"], start=1000) 74 | word_values = {w.value: w.name for w in WordMarkers} 75 | 76 | Ids = Enum("Ids", ["profile", "prep", "tok", "mod", "form"]) 77 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # Generating the documentation 2 | 3 | To generate the documentation, you first have to build it. Several packages are necessary to build the doc, 4 | you can install them with the following command, at the root of the code repository: 5 | 6 | ```bash 7 | pip install -e ".[docs]" 8 | ``` 9 | 10 | --- 11 | **NOTE** 12 | 13 | You only need to generate the documentation to inspect it locally (if you're planning changes and want to 14 | check how they look like before committing for instance). You don't have to commit the built documentation. 15 | 16 | --- 17 | 18 | ## Packages installed 19 | 20 | Here's an overview of all the packages installed. If you ran the previous command installing all packages from 21 | `requirements.txt`, you do not need to run the following commands. 22 | 23 | Building it requires the package `sphinx` that you can 24 | install using: 25 | 26 | ```bash 27 | pip install -U sphinx 28 | ``` 29 | 30 | You would also need the custom installed [theme](https://github.com/readthedocs/sphinx_rtd_theme) by 31 | [Read The Docs](https://readthedocs.org/). 
You can install it using the following command: 32 | 33 | ```bash 34 | pip install sphinx_rtd_theme 35 | ``` 36 | 37 | The third necessary package is the `recommonmark` package to accept Markdown as well as Restructured text: 38 | 39 | ```bash 40 | pip install recommonmark 41 | ``` 42 | 43 | ## Building the documentation 44 | 45 | Once you have setup `sphinx`, you can build the documentation by running the following command in the `/docs` folder: 46 | 47 | ```bash 48 | make html 49 | ``` 50 | 51 | A folder called ``_build/html`` should have been created. You can now open the file ``_build/html/index.html`` in your 52 | browser. 53 | 54 | --- 55 | **NOTE** 56 | 57 | If you are adding/removing elements from the toc-tree or from any structural item, it is recommended to clean the build 58 | directory before rebuilding. Run the following command to clean and build: 59 | 60 | ```bash 61 | make clean && make html 62 | ``` 63 | 64 | --- 65 | 66 | It should build the static app that will be available under `/docs/_build/html` 67 | 68 | ## Adding a new element to the tree (toc-tree) 69 | 70 | Accepted files are reStructuredText (.rst) and Markdown (.md). Create a file with its extension and put it 71 | in the source directory. You can then link it to the toc-tree by putting the filename without the extension. 72 | 73 | ## Preview the documentation in a pull request 74 | 75 | Once you have made your pull request, you can check what the documentation will look like after it's merged by 76 | following these steps: 77 | 78 | - Look at the checks at the bottom of the conversation page of your PR (you may need to click on "show all checks" to 79 | expand them). 80 | - Click on "details" next to the `ci/circleci: build_doc` check. 81 | - In the new window, click on the "Artifacts" tab. 82 | - Locate the file "docs/_build/html/index.html" (or any specific page you want to check) and click on it to get a 83 | preview. 84 | 85 | ## Writing Documentation - Specification 86 | 87 | The `huggingface/transformers` documentation follows the 88 | [Google documentation](https://sphinxcontrib-napoleon.readthedocs.io/en/latest/example_google.html) style. It is 89 | mostly written in ReStructuredText 90 | ([Sphinx simple documentation](https://www.sphinx-doc.org/en/master/usage/restructuredtext/index.html), 91 | [Sourceforge complete documentation](https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html)) 92 | 93 | ### Adding a new section 94 | 95 | A section is a page held in the `Notes` toc-tree on the documentation. Adding a new section is done in two steps: 96 | 97 | - Add a new file under `./source`. This file can either be ReStructuredText (.rst) or Markdown (.md). 98 | - Link that file in `./source/index.rst` on the correct toc-tree. 99 | 100 | ### Adding a new model 101 | 102 | When adding a new model: 103 | 104 | - Create a file `xxx.rst` under `./source/model_doc`. 105 | - Link that file in `./source/index.rst` on the `model_doc` toc-tree. 106 | - Write a short overview of the model: 107 | - Overview with paper & authors 108 | - Paper abstract 109 | - Tips and tricks and how to use it best 110 | - Add the classes that should be linked in the model. This generally includes the configuration, the tokenizer, and 111 | every model of that class (the base model, alongside models with additional heads), both in PyTorch and TensorFlow. 
112 | The order is generally: 113 | - Configuration, 114 | - Tokenizer 115 | - PyTorch base model 116 | - PyTorch head models 117 | - TensorFlow base model 118 | - TensorFlow head models 119 | 120 | These classes should be added using the RST syntax, usually as follows: 121 | ``` 122 | XXXConfig 123 | ~~~~~~~~~~~~~~~~~~~~~ 124 | 125 | .. autoclass:: transformers.XXXConfig 126 | :members: 127 | ``` 128 | 129 | This will include every public method of the configuration. If for some reason you wish for a method not to be 130 | displayed in the documentation, you can do so by specifying which methods should be in the docs: 131 | 132 | ``` 133 | XXXTokenizer 134 | ~~~~~~~~~~~~~~~~~~~~~ 135 | 136 | .. autoclass:: transformers.XXXTokenizer 137 | :members: build_inputs_with_special_tokens, get_special_tokens_mask, 138 | create_token_type_ids_from_sequences, save_vocabulary 139 | 140 | ``` 141 | 142 | ### Writing source documentation 143 | 144 | Values that should be put in `code` should either be surrounded by double backticks: \`\`like so\`\` or be written as 145 | an object using the :obj: syntax: :obj:\`like so\`. 146 | 147 | When mentioning a class, it is recommended to use the :class: syntax as the mentioned class will be automatically 148 | linked by Sphinx: :class:\`transformers.XXXClass\` 149 | 150 | When mentioning a function, it is recommended to use the :func: syntax as the mentioned method will be automatically 151 | linked by Sphinx: :func:\`transformers.XXXClass.method\` 152 | 153 | Links should be done as follows (note the double underscore at the end): \`text for the link <./local-link-or-global-link#loc>\`__ 154 | 155 | #### Defining arguments in a method 156 | 157 | Arguments should be defined with the `Args:` prefix, followed by a line return and an indentation. 158 | The argument should be followed by its type, with its shape if it is a tensor, and a line return. 159 | Another indentation is necessary before writing the description of the argument. 160 | 161 | Here's an example showcasing everything so far: 162 | 163 | ``` 164 | Args: 165 | input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`): 166 | Indices of input sequence tokens in the vocabulary. 167 | 168 | Indices can be obtained using :class:`transformers.AlbertTokenizer`. 169 | See :func:`transformers.PreTrainedTokenizer.encode` and 170 | :func:`transformers.PreTrainedTokenizer.__call__` for details. 171 | 172 | `What are input IDs? <../glossary.html#input-ids>`__ 173 | ``` 174 | 175 | #### Writing a multi-line code block 176 | 177 | Multi-line code blocks can be useful for displaying examples. They are done like so: 178 | 179 | ``` 180 | Example:: 181 | 182 | # first line of code 183 | # second line 184 | # etc 185 | ``` 186 | 187 | The `Example` string at the beginning can be replaced by anything as long as there are two colons following it. 188 | 189 | #### Writing a return block 190 | 191 | The return block should be introduced with the `Returns:` prefix, followed by a line return and an indentation. 192 | The first line should be the type of the return, followed by a line return. No need to indent further for the elements 193 | building the return.
194 | 195 | Here's an example for tuple return, comprising several objects: 196 | 197 | ``` 198 | Returns: 199 | :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs: 200 | loss (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: 201 | Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss. 202 | prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`) 203 | Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 204 | ``` 205 | 206 | Here's an example for a single value return: 207 | 208 | ``` 209 | Returns: 210 | A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. 211 | ``` 212 | -------------------------------------------------------------------------------- /docs/old-docs/README.md: -------------------------------------------------------------------------------- 1 | ## Overview 2 | 3 | pybo is a word tokenizer for the Tibetan language written in Python. pybo takes in chunks of text and returns lists of words. It provides an easy-to-use, high-performance tokenization pipeline that can serve as a stand-alone solution or be adapted as a compliment. 4 | 5 | 6 | ## Getting started 7 | 8 | pip install pybo 9 | 10 | Or to install from the latest master branch: 11 | 12 | pip install git+https://github.com/Esukhia/pybo.git 13 | 14 | ## How to use pybo 15 | 16 | #### To initiate the tokenizer together with part-of-speech capability: 17 | 18 | # Initialize the tokenizer 19 | tok = bo.BoTokenizer('POS') 20 | 21 | # Feed it some Tibetan text 22 | input_str = '༄༅། །རྒྱ་གར་སྐད་དུ། བོ་དྷི་སཏྭ་ཙརྻ་ཨ་བ་ཏ་ར། བོད་སྐད་དུ། བྱང་ཆུབ་སེམས་དཔའི་སྤྱོད་པ་ལ་འཇུག་པ། །སངས་རྒྱས་དང་བྱང་ཆུབ་སེམས་དཔའ་ཐམས་ཅད་ལ་ཕྱག་འཚལ་ལོ། །བདེ་གཤེགས་ཆོས་ཀྱི་སྐུ་མངའ་སྲས་བཅས་དང༌། །ཕྱག་འོས་ཀུན་ལའང་གུས་པར་ཕྱག་འཚལ་ཏེ། །བདེ་གཤེགས་སྲས་ཀྱི་སྡོམ་ལ་འཇུག་པ་ནི། །ལུང་བཞིན་མདོར་བསྡུས་ནས་ནི་བརྗོད་པར་བྱ། །' 23 | 24 | # Run the tokenizer 25 | tokens = tok.tokenize(input_str) 26 | 27 | #### Now in 'tokens' you have an iterable where each token consist of several meta-data: 28 | 29 | # Access the first token in the iterable 30 | tokens[0] 31 | 32 | This yields: 33 | 34 | content: "༄༅། " 35 | char_types: |punct|punct|punct|space| 36 | type: punct 37 | start: 0 38 | len: 4 39 | syls: None 40 | tag: punct 41 | pos: punct 42 | skr: "False" 43 | freq: 0 44 | 45 | notes: 46 | - `start` is the starting index of the current token in the input string. 47 | - `syls` is a list of cleaned syllables, each syllable being represented as a list of indices. 48 | Each index leads to a constituting character within the input string. 49 | 50 | #### How to access all the words in a list 51 | 52 | # iterate through the tokens object to get all the words in a list 53 | [t.content for t in tokens] 54 | 55 | #### How to get all the nouns in a text 56 | 57 | # extract nouns from the tokens 58 | [t.content for t in tokens if t.tag == 'NOUNᛃᛃᛃ'] 59 | 60 | These examples highlight the basic principle of accessing attributes within each token object. 61 | 62 | ## Acknowledgements 63 | 64 | **pybo** is an open source library for Tibetan NLP. 65 | 66 | We are always open to cooperation in introducing new features, tool integrations and testing solutions. 
67 | 68 | Many thanks to the companies and organizations who have supported pybo's development, especially: 69 | 70 | * [Khyentse Foundation](https://khyentsefoundation.org) for contributing USD22,000 to kickstart the project 71 | * The [Barom/Esukhia canon project](http://www.barom.org) for sponsoring training data curation 72 | * [BDRC](https://tbrc.org) for contributing 2 staff for 6 months for data curation 73 | 74 | ## Maintenance 75 | 76 | Build the source dist: 77 | 78 | ``` 79 | rm -rf dist/ 80 | python3 setup.py clean sdist 81 | ``` 82 | 83 | and upload on twine (version >= `1.11.0`) with: 84 | 85 | ``` 86 | twine upload dist/* 87 | ``` 88 | 89 | ## License 90 | 91 | The Python code is Copyright (C) 2019 Esukhia, provided under [Apache 2](LICENSE). 92 | 93 | author: [Drupchen](https://github.com/drupchen) 94 | 95 | contributors: 96 | * [Élie Roux](https://github.com/eroux) 97 | * [Thubten Rinzin](https://github.com/thubtenrigzin) 98 | * [Ngawang Trinley](https://github.com/ngawangtrinley) 99 | * [Mikko Kotila](https://github.com/mikkokotila) 100 | * [Tenzin](https://github.com/10zinten) 101 | -------------------------------------------------------------------------------- /docs/old-docs/cql_readme.md: -------------------------------------------------------------------------------- 1 | # CQL basics 2 | 3 | To use CQL, go to the corpus search and select the CQL option. CQL will not work anywhere else in the interface. Expert users will use CQL for the writing of Word Sketch grammars and term grammars. 4 | 5 | ## Syntax 6 | 7 | With CQL, complex criteria can be set to find one or many tokens. Criteria for each token must be between a pair of square brackets [ ]. The format is: 8 | 9 | [attribute="value"] 10 | 11 | To find the lemma teapot, use 12 | 13 | [lemma="teapot"] 14 | 15 | Each token must be inside its own pair of square brackets. To search for the phrase refill the teapot, use 16 | 17 | [lemma="refill"][lemma="the"][lemma="teapot"] 18 | 19 | ## Spaces 20 | 21 | Spaces have no function in CQL. Feel free to use spaces to make the code more readable. This code is equivalent to the previous one. 22 | 23 | 24 | [ lemma = "refill" ] [ lemma = "the" ] [ lemma= "teapot" ] 25 | 26 | ## Careful with values! 27 | 28 | There should not be any spaces inside quotes. This finds nothing because a lemma cannot start with spaces. 29 | 30 | [lemma=" the"] 31 | 32 | More examples 33 | 34 | | TASK | CQL CODE | RESULT | 35 | | -- | -- | -- | 36 | | find examples of “went” | [word="went"] | concordance of the word went 37 | | find examples of all forms of go | [lemma="go"] | concordance of go, goes, going, gone, went 38 | | find examples of all words tagged with the tag NP | [tag="NP"] | concordance of various words tagged as NP 39 | 40 | 41 | 42 | * Matching on token annotations (properties or attributes), using regular expressions and =, !=, !. Example: [word="bank"] (or just "bank") 43 | * Combining criteria using &, | and !. Parentheses can also be used for grouping. Example: [lemma="bank" & pos="V"] 44 | * Matchall pattern [] matches any token. Example: "a" [] "day" 45 | * Regular expression operators +, *, ?, {n}, {n,m} at the token level. Example: [pos="ADJ"]+ 46 | * Sequences of token constraints. Example: [pos="ADJ"] "cow" 47 | * Operators |, & and parentheses can be used to build complex sequence queries. Example: "happy" "dog" | "sad" "cat" 48 | * Querying with tag positions using e.g. (start of sentence), (end of sentence), (whole sentence) or ... (equivalent to containing ...). Example: "The".
XML attribute values may be used as well, e.g. (“named entities that are persons”). 49 | * Using within and containing operators to find hits inside another set of hits. Example: "you" "are" within 50 | * Using an anchor to capture a token position. Example: "big" A:[]. Captured matches can be used in global constraints (see next item) or processed separately later (using the Java interface; capture information is not yet returned by BlackLab Server). Note that BlackLab can actually capture entire groups of tokens as well, similarly to regular expression engines. 51 | * Global constraints on captured tokens, such as requiring them to contain the same word. Example: "big" A:[] "or" "small" B:[] :: A.word = B.word 52 | -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | sphinx==3.1.2 2 | sphinx_rtd_theme==0.5.0 3 | recommonmark==0.6.0 4 | sphinx_markdown_tables==0.0.15 5 | sphinx_copybutton==0.3.0 -------------------------------------------------------------------------------- /docs/source/acknowledgement.rst: -------------------------------------------------------------------------------- 1 | Acknowledgement 2 | ---------------------------------------------- 3 | botok is an open source library for Tibetan NLP. 4 | 5 | We are always open to cooperation in introducing new features, tool integrations and testing solutions. 6 | 7 | Many thanks to the companies and organizations who have supported botok's development, especially: 8 | 9 | - `Khyentse Foundation <https://khyentsefoundation.org>`_ for contributing USD22,000 to kickstart the project 10 | - The `Barom/OpenPecha canon project <http://www.barom.org>`_ for sponsoring training data curation 11 | - `BDRC <https://tbrc.org>`_ for contributing 2 staff for 6 months for data curation 12 | -------------------------------------------------------------------------------- /docs/source/architecture.rst: -------------------------------------------------------------------------------- 1 | Architecture 2 | ============ 3 | 4 | This document explains the architecture of botok. 5 | 6 | WordTokenizer architecture 7 | -------------------------- 8 | 9 | Following is the architecture diagram of the `WordTokenizer `_ class. 10 | 11 | .. image:: imgs/botok_architecture.svg 12 | :align: center 13 | 14 | 15 | Tokenization workflow 16 | --------------------- 17 | 18 | Here is the botok tokenization workflow with an example. 19 | 20 | ..
code:: 21 | 22 | >>> input_string = "ཀུན་་་དགའི་དོན་གྲུབ།" 23 | >>> from botok import BoSyl, Config, TokChunks, Tokenize, Trie 24 | >>> config = Config() 25 | >>> trie = Trie(BoSyl, profile=config.profile, main_data=config.dictionary, custom_data=config.adjustments) 26 | >>> tok = Tokenize(trie) 27 | >>> preproc = TokChunks(input_string) 28 | >>> preproc.serve_syls_to_trie() 29 | >>> tokens = tok.tokenize(preproc) 30 | >>> 31 | >>> print(*tokens, sep=f"{'='*65}\n\n") 32 | text: "ཀུན་་་དགའི་" 33 | text_cleaned: "ཀུན་དགའི་" 34 | text_unaffixed: "ཀུན་དགའ་" 35 | syls: ["ཀུན", "དགའི"] 36 | senses: | pos: PROPN, freq: 2923, affixed: True | 37 | char_types: |CONS|VOW|CONS|TSEK|TSEK|TSEK|CONS|CONS|CONS|VOW|TSEK| 38 | chunk_type: TEXT 39 | syls_idx: [[0, 1, 2], [6, 7, 8, 9]] 40 | syls_start_end: [{'start': 0, 'end': 6}, {'start': 6, 'end': 11}] 41 | start: 0 42 | len: 11 43 | 44 | ================================================================= 45 | 46 | text: "དོན་གྲུབ" 47 | text_cleaned: "དོན་གྲུབ་" 48 | text_unaffixed: "དོན་གྲུབ་" 49 | syls: ["དོན", "གྲུབ"] 50 | senses: | pos: PROPN, freq: 1316, affixed: False | 51 | char_types: |CONS|VOW|CONS|TSEK|CONS|SUB_CONS|VOW|CONS| 52 | chunk_type: TEXT 53 | syls_idx: [[0, 1, 2], [4, 5, 6, 7]] 54 | syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 8}] 55 | start: 11 56 | len: 8 57 | 58 | ================================================================= 59 | 60 | text: "།" 61 | char_types: |NORMAL_PUNCT| 62 | chunk_type: PUNCT 63 | start: 19 64 | len: 1 65 | >>> 66 | >>> from botok import AdjustTokens 67 | >>> 68 | >>> adjust_tok = AdjustTokens(main=config.dictionary["rules"], custom=config.adjustments["rules"]) 69 | >>> adjusted_tokens = adjust_tok.adjust(tokens) 70 | >>> print(*adjusted_tokens, sep=f"{'='*65}\n\n") 71 | text: "ཀུན་་་དགའི་" 72 | text_cleaned: "ཀུན་དགའི་" 73 | text_unaffixed: "ཀུན་དགའ་" 74 | syls: ["ཀུན", "དགའི"] 75 | senses: | pos: PROPN, freq: 2923, affixed: True | 76 | char_types: |CONS|VOW|CONS|TSEK|TSEK|TSEK|CONS|CONS|CONS|VOW|TSEK| 77 | chunk_type: TEXT 78 | syls_idx: [[0, 1, 2], [6, 7, 8, 9]] 79 | syls_start_end: [{'start': 0, 'end': 6}, {'start': 6, 'end': 11}] 80 | start: 0 81 | len: 11 82 | 83 | ================================================================= 84 | 85 | text: "དོན་གྲུབ" 86 | text_cleaned: "དོན་གྲུབ་" 87 | text_unaffixed: "དོན་གྲུབ་" 88 | syls: ["དོན", "གྲུབ"] 89 | senses: | pos: PROPN, freq: 1316, affixed: False | 90 | char_types: |CONS|VOW|CONS|TSEK|CONS|SUB_CONS|VOW|CONS| 91 | chunk_type: TEXT 92 | syls_idx: [[0, 1, 2], [4, 5, 6, 7]] 93 | syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 8}] 94 | start: 11 95 | len: 8 96 | 97 | ================================================================= 98 | 99 | text: "།" 100 | char_types: |NORMAL_PUNCT| 101 | chunk_type: PUNCT 102 | start: 19 103 | len: 1 -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. 
If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | 16 | sys.path.insert(0, os.path.abspath("../../")) 17 | 18 | 19 | # -- Project information ----------------------------------------------------- 20 | 21 | project = "botok" 22 | copyright = "2020-2025, OpenPecha" 23 | author = "OpenPecha" 24 | 25 | 26 | # -- General configuration --------------------------------------------------- 27 | 28 | # Add any Sphinx extension module names here, as strings. They can be 29 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 30 | # ones. 31 | extensions = [ 32 | "sphinx.ext.autodoc", 33 | "sphinx.ext.coverage", 34 | "sphinx.ext.napoleon", 35 | "recommonmark", 36 | "sphinx.ext.viewcode", 37 | "sphinx_markdown_tables", 38 | "sphinx_copybutton", 39 | ] 40 | 41 | # Add any paths that contain templates here, relative to this directory. 42 | templates_path = ["_templates"] 43 | 44 | # The suffix(es) of source filenames. 45 | # You can specify multiple suffix as a list of string: 46 | # 47 | source_suffix = [".rst", ".md"] 48 | 49 | # The master toctree document. 50 | master_doc = "index" 51 | 52 | # List of patterns, relative to source directory, that match files and 53 | # directories to ignore when looking for source files. 54 | # This pattern also affects html_static_path and html_extra_path. 55 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 56 | 57 | # Remove the prompt when copying examples 58 | copybutton_prompt_text = ">>> " 59 | 60 | # -- Options for HTML output ------------------------------------------------- 61 | 62 | # The theme to use for HTML and HTML Help pages. See the documentation for 63 | # a list of builtin themes. 64 | # 65 | html_theme = "sphinx_rtd_theme" 66 | 67 | # Theme options are theme-specific and customize the look and feel of a theme 68 | # further. For a list of options available for each theme, see the 69 | # documentation. 70 | # 71 | html_theme_options = {"analytics_id": "UA-83738774-2"} 72 | 73 | # Add any paths that contain custom static files (such as style sheets) here, 74 | # relative to this directory. They are copied after the builtin static files, 75 | # so a file named "default.css" will overwrite the builtin "default.css". 76 | html_static_path = ["_static"] 77 | 78 | # This must be the name of an image file (path relative to the configuration 79 | # directory) that is the favicon of the docs. Modern browsers use this as 80 | # the icon for tabs, windows and bookmarks. It should be a Windows-style 81 | # icon file (.ico). 
82 | # html_favicon = "favicon.ico" 83 | 84 | # -- Autodoc configuration -------------------------------------------------- 85 | 86 | # Autodoc settings 87 | autodoc_member_order = 'bysource' 88 | autoclass_content = 'both' 89 | autodoc_typehints = 'description' 90 | 91 | # Napoleon settings 92 | napoleon_google_docstring = True 93 | napoleon_numpy_docstring = False 94 | napoleon_include_init_with_doc = True 95 | napoleon_include_private_with_doc = False 96 | napoleon_include_special_with_doc = True 97 | napoleon_use_admonition_for_examples = False 98 | napoleon_use_admonition_for_notes = False 99 | napoleon_use_admonition_for_references = False 100 | napoleon_use_ivar = False 101 | napoleon_use_param = True 102 | napoleon_use_rtype = True 103 | napoleon_type_aliases = None 104 | -------------------------------------------------------------------------------- /docs/source/custom-dialect-pack.rst: -------------------------------------------------------------------------------- 1 | Custom Dialect Pack 2 | =================== 3 | 4 | Why Custom Dialect Pack 5 | ----------------------- 6 | 7 | - For domain specific tokenization 8 | - Improving tokenization accuracy 9 | 10 | 11 | Example 12 | ------- 13 | 14 | To use a custom dialect pack for tokenization, all we have to do is to create a `botok.Config` object with path to the custom dialect pack and use this config for creating word tokenizer. 15 | 16 | First, create config for the custom dialect pack. 17 | 18 | .. code:: 19 | 20 | >>> from botok import Config 21 | >>> config = Config.from_path('custom/dialect/pack/path') 22 | 23 | Then, create word tokenizer with that same config. 24 | 25 | .. code:: 26 | 27 | >>> from botok import WordTokenizer 28 | >>> wt = WordTokenizer(config=config) 29 | >>> wt.tokenize("མཐའི་བཀྲ་ཤིས། ཀཀ abc མཐའི་རྒྱ་མཚོ་") 30 | -------------------------------------------------------------------------------- /docs/source/getting-started.rst: -------------------------------------------------------------------------------- 1 | Getting Started with Botok 2 | ========================== 3 | 4 | Installation 5 | ------------ 6 | 7 | .. Caution:: 8 | 9 | Botok only supports Python 3.6 or higher 10 | 11 | Install pre-built botok with pip: 12 | 13 | .. code-block:: 14 | 15 | $ pip install botok 16 | 17 | Install from the latest Master branch of botok with pip: 18 | 19 | .. code-block:: 20 | 21 | $ pip install git+https://github.com/OpenPecha/botok.git 22 | 23 | Install for developers, build botok from source: 24 | 25 | .. code-block:: 26 | 27 | $ git clone https://github.com/OpenPecha/botok.git 28 | $ cd botok 29 | $ python -m venv .env 30 | $ source .env/bin/activate # On Windows: .env\Scripts\activate 31 | $ pip install -e . 32 | 33 | Usage 34 | ----- 35 | 36 | Here is the simple usage of botok to tokenize Tibetan text: 37 | 38 | Import the botok tokenizer called WordTokenizer: 39 | 40 | .. code-block:: 41 | 42 | >>> from botok import WordTokenizer 43 | >>> 44 | >>> tokenizer = WordTokenizer() 45 | Building Trie... (12 s.) 46 | 47 | Tokenize the given text: 48 | 49 | .. 
code-block:: 50 | 51 | >>> input_str = '༆ ཤི་བཀྲ་ཤིས་ tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་༡༢༣ཀཀ།མཐའི་རྒྱ་མཚོར་གནས་པའི་ཉས་ཆུ་འཐུང་།། །།མཁའ།' 52 | >>> tokens = tokenizer.tokenize(input_str) 53 | >>> print(f'The output is a {type(tokens)}') 54 | The output is a 55 | >>> print(f'The constituting elements are {type(tokens[0])}') 56 | The constituting elements are 57 | 58 | Now in 'tokens' you have an iterable where each token consists of several meta-data in attributes of Token Object: 59 | 60 | .. code-block:: 61 | 62 | >>> tokens[0] 63 | content: "༆ " 64 | char_types: |punct|space| 65 | type: punct 66 | start: 0 67 | len: 2 68 | tag: punct 69 | pos: punc 70 | 71 | 72 | Custom dialect pack 73 | ------------------ 74 | 75 | In order to use a custom dialect pack: 76 | 77 | 1. Prepare your dialect pack in the same folder structure as the `general dialect pack `_ 78 | 2. Instantiate a config object where you pass the dialect name and path 79 | 3. Instantiate your tokenizer object using that config object 80 | 4. Your tokenizer will use your custom dialect pack and will use a trie pickled file in the future to build the custom trie 81 | 82 | .. code-block:: 83 | 84 | from botok import WordTokenizer 85 | from botok.config import Config 86 | from pathlib import Path 87 | 88 | def get_tokens(wt, text): 89 | tokens = wt.tokenize(text, split_affixes=False) 90 | return tokens 91 | 92 | if __name__ == "__main__": 93 | config = Config(dialect_name="custom", base_path=Path.home()) 94 | wt = WordTokenizer(config=config) 95 | text = "བཀྲ་ཤིས་བདེ་ལེགས་ཞུས་རྒྱུ་ཡིན་ སེམས་པ་སྐྱིད་པོ་འདུག།" 96 | tokens = get_tokens(wt, text) 97 | for token in tokens: 98 | print(token) 99 | 100 | Advanced Usage 101 | ------------- 102 | 103 | For more advanced usage, including POS tagging and lemmatization, see the :doc:`advanced guides `. 104 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. botok documentation master file, created by 2 | sphinx-quickstart on Thu Jul 30 12:30:47 2020. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Botok 7 | =================================================================================================== 8 | State-of-the-art tokenizers for Tibetan language. 9 | 10 | This is the documentation of our repository `botok `_. 11 | 12 | Features 13 | -------------------------------- 14 | 15 | - Support various dialects 16 | - Word segmentation with support for affixed particles 17 | - Multiple tokenization modes (chunks, spaces, words) 18 | - Rich token attributes (lemma, POS, clean form) 19 | - File and string input processing 20 | - Word frequency counting 21 | - Handles complex cases like double tseks and spaces within words 22 | 23 | Contents 24 | ---------------------------------- 25 | 26 | .. toctree:: 27 | :maxdepth: 2 28 | :caption: Overview 29 | 30 | getting-started 31 | acknowledgement 32 | 33 | .. toctree:: 34 | :maxdepth: 2 35 | :caption: Advanced guides 36 | 37 | architecture 38 | custom-dialect-pack 39 | 40 | .. 
toctree:: 41 | :maxdepth: 2 42 | :caption: Package Reference 43 | 44 | main_classes/configuration 45 | -------------------------------------------------------------------------------- /docs/source/main_classes/configuration.rst: -------------------------------------------------------------------------------- 1 | Configuration 2 | ------------------------------------------------------- 3 | 4 | ``Config`` 5 | ~~~~~~~~~~ 6 | 7 | .. autoclass:: botok.Config 8 | :members: -------------------------------------------------------------------------------- /python-3.13.2-amd64.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/python-3.13.2-amd64.exe -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | black 2 | isort 3 | pytest>=5.0.0 4 | coveralls 5 | covdefaults 6 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | 4 | [coverage:run] 5 | plugins = covdefaults 6 | omit = 7 | .env/* 8 | botok/third_party/pynpl/cql.py 9 | botok/third_party/pynpl/fsa.py 10 | 11 | [semantic_release] 12 | version_variable = botok/vars.py:__version__ 13 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | # coding: utf8 3 | 4 | from __future__ import print_function 5 | 6 | import re 7 | from pathlib import Path 8 | 9 | import setuptools 10 | from pkg_resources import parse_version 11 | 12 | assert parse_version(setuptools.__version__) >= parse_version("38.6.0") 13 | 14 | 15 | def get_version(prop, project): 16 | project = Path(__file__).parent / project / "vars.py" 17 | result = re.search( 18 | r'{}\s*=\s*[\'"]([^\'"]*)[\'"]'.format(prop), 19 | project.read_text(encoding="utf-8-sig"), 20 | ) 21 | return result.group(1) 22 | 23 | 24 | def read(fname): 25 | p = Path(__file__).parent / fname 26 | with p.open(encoding="utf-8-sig") as f: 27 | return f.read() 28 | 29 | 30 | setuptools.setup( 31 | name="botok", 32 | version=get_version("__version__", "botok"), # edit version in botok/vars.py 33 | author="OpenPecha development team", 34 | author_email="openpecha@gmail.com", 35 | description="Tibetan Word Tokenizer", 36 | license="Apache2", 37 | keywords="nlp computational_linguistics tibetan tokenizer token", 38 | url="https://github.com/OpenPecha/botok", 39 | packages=setuptools.find_packages(), 40 | long_description=read("README.md"), 41 | long_description_content_type="text/markdown", 42 | project_urls={ 43 | "Source": "https://github.com/OpenPecha/botok", 44 | "Tracker": "https://github.com/OpenPecha/botok/issues", 45 | }, 46 | classifiers=[ 47 | "Development Status :: 3 - Alpha", 48 | "Topic :: Text Processing :: Linguistic", 49 | "Programming Language :: Python :: 3", 50 | "Operating System :: OS Independent", 51 | "Intended Audience :: Developers", 52 | "Intended Audience :: Science/Research", 53 | "License :: OSI Approved :: Apache Software License", 54 | "Natural Language :: Tibetan", 55 | ], 56 | package_data={ 57 | "botok": [ 58 | "resources/*", 59 | "resources/words_bo/*", 60 | "resources/entry_data/*", 61 | "resources/words_non_inflected/*", 62 | 
"resources/lemmas/*", 63 | "resources/rules/*", 64 | "resources/words_skrt/*", 65 | "resources/adjustment/*", 66 | ] 67 | }, 68 | python_requires=">=3.7", 69 | tests_require=["pytest>=5.0.0"], 70 | install_requires=["pyyaml", "requests"], 71 | ) 72 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/tests/__init__.py -------------------------------------------------------------------------------- /tests/chunks/test_chunkframework.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from botok import ChunkFramework 3 | from botok import ChunkMarkers as c 4 | 5 | 6 | def test_bo_nonbo(): 7 | string = 'བཀྲ་་ཤིས་བདེ་ལེགས། 23PIEIUZLDVéjoldvép«»("«»%=' 8 | cb = ChunkFramework(string) 9 | chunks = cb.chunk_bo_chars() 10 | 11 | output = cb.get_readable(chunks) 12 | assert output == [ 13 | ("BO", "བཀྲ་་ཤིས་བདེ་ལེགས། "), 14 | ("OTHER", '23PIEIUZLDVéjoldvép«»("«»%='), 15 | ] 16 | 17 | 18 | def test_punct_nonpunct(): 19 | string = "༆ བཀྲ་ཤིས་བདེ་ལེགས།། །།" 20 | cb = ChunkFramework(string) 21 | chunks = cb.chunk_punct() 22 | 23 | output = cb.get_readable(chunks) 24 | assert output == [ 25 | ("PUNCT", "༆ "), 26 | ("NON_PUNCT", "བཀྲ་ཤིས་བདེ་ལེགས"), 27 | ("PUNCT", "།། །།"), 28 | ] 29 | 30 | 31 | def test_sym_nonsym(): 32 | string = "བཀྲ་ཤིས་བདེ་ལེགས། ༪༫༝༜༛༚༇༆" 33 | cb = ChunkFramework(string) 34 | chunks = cb.chunk_symbol() 35 | 36 | output = cb.get_readable(chunks) 37 | assert output == [ 38 | ("NON_SYM", "བཀྲ་ཤིས་བདེ་ལེགས།"), 39 | ("SYM", " ༪༫༝༜༛༚"), 40 | ("NON_SYM", "༇༆"), 41 | ] 42 | 43 | 44 | def test_num_nonnum(): 45 | string = "བཀྲ་ཤིས་བདེ་ལེགས། ༡༢༣༠༩༨" 46 | cb = ChunkFramework(string) 47 | chunks = cb.chunk_number() 48 | 49 | output = cb.get_readable(chunks) 50 | assert output == [("NON_NUM", "བཀྲ་ཤིས་བདེ་ལེགས།"), ("NUM", " ༡༢༣༠༩༨")] 51 | 52 | 53 | def test_space_nonspace(): 54 | string = "བཀྲ་ཤིས་བདེ་ལེགས། །བཀྲ་ཤིས་བདེ་ལེགས།" 55 | cb = ChunkFramework(string) 56 | chunks = cb.chunk_spaces() 57 | 58 | output = cb.get_readable(chunks) 59 | assert output == [ 60 | ("NON_SPACE", "བཀྲ་ཤིས་བདེ་ལེགས།"), 61 | ("SPACE", " "), 62 | ("NON_SPACE", "།བཀྲ་ཤིས་བདེ་ལེགས།"), 63 | ] 64 | 65 | 66 | def test_text(): 67 | string = "བཀྲ་ཤིས་བདེ་ལེགས" 68 | cb = ChunkFramework(string) 69 | chunks = cb.syllabify() 70 | 71 | output = cb.get_readable(chunks) 72 | assert output == [ 73 | ("TEXT", "བཀྲ་"), 74 | ("TEXT", "ཤིས་"), 75 | ("TEXT", "བདེ་"), 76 | ("TEXT", "ལེགས"), 77 | ] 78 | 79 | 80 | def test_latin(): 81 | string = "བཀྲ་ཤིས་བདེ་ལེགས This is a test." 
82 | cb = ChunkFramework(string) 83 | chunks = cb.chunk_latin() 84 | 85 | output = cb.get_readable(chunks) 86 | assert output == [("OTHER", "བཀྲ་ཤིས་བདེ་ལེགས"), ("LATIN", " This is a test.")] 87 | 88 | 89 | def test_cjk(): 90 | string = "བཀྲ་ཤིས་བདེ་ལེགས 这是 什么" 91 | cb = ChunkFramework(string) 92 | chunks = cb.chunk_cjk() 93 | 94 | output = cb.get_readable(chunks) 95 | assert output == [("OTHER", "བཀྲ་ཤིས་བདེ་ལེགས"), ("CJK", " 这是 什么")] 96 | 97 | 98 | def test_other(): 99 | string = "བཀྲ་ཤིས་བདེ་ལེགས กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ" 100 | cb = ChunkFramework(string) 101 | chunks = cb.chunk_bo_chars() 102 | 103 | output = cb.get_readable(chunks) 104 | assert output == [ 105 | ("BO", "བཀྲ་ཤིས་བདེ་ལེགས "), 106 | ("OTHER", "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"), 107 | ] 108 | 109 | 110 | def test_full_example(): 111 | # Follows the order implemented in Chunks 112 | string = ( 113 | '༆ བཀྲ་ཤིས་བདེ་ལེགས།། །། 23PIEIUZLDVéjoldvép«»("«»%= ༪༫༝༜༛༚༇༆ ༡༢༣༠༩༨ ' 114 | "This is a test. 这是 什么 กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ" 115 | ) 116 | cb = ChunkFramework(string) 117 | 118 | # BO / OTHER 119 | chunks = cb.chunk_bo_chars() 120 | chunks = cb.clean_chunks(chunks) 121 | output = cb.get_readable(chunks) 122 | assert output == [ 123 | ("BO", "༆ བཀྲ་ཤིས་བདེ་ལེགས།། །། "), 124 | ("OTHER", '23PIEIUZLDVéjoldvép«»("«»%='), 125 | ("BO", " ༪༫༝༜༛༚༇༆ ༡༢༣༠༩༨ "), 126 | ("OTHER", "This is a test. 这是 什么 กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"), 127 | ] 128 | 129 | # BO / PUNCT 130 | chunks = cb.pipe_chunk(chunks, cb.chunk_punct, c.BO.value, c.PUNCT.value) 131 | chunks = cb.clean_chunks(chunks) 132 | output = cb.get_readable(chunks) 133 | assert output == [ 134 | ("PUNCT", "༆ "), 135 | ("BO", "བཀྲ་ཤིས་བདེ་ལེགས"), 136 | ("PUNCT", "།། །། "), # NEW 137 | ("OTHER", '23PIEIUZLDVéjoldvép«»("«»%= '), 138 | ("BO", "༪༫༝༜༛༚"), 139 | ("PUNCT", "༇༆ "), # NEW 140 | ("BO", "༡༢༣༠༩༨ "), 141 | ("OTHER", "This is a test. 这是 什么 กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"), 142 | ] 143 | 144 | # BO / NUM 145 | chunks = cb.pipe_chunk(chunks, cb.chunk_number, c.BO.value, c.NUM.value) 146 | chunks = cb.clean_chunks(chunks) 147 | output = cb.get_readable(chunks) 148 | assert output == [ 149 | ("PUNCT", "༆ "), 150 | ("BO", "བཀྲ་ཤིས་བདེ་ལེགས"), 151 | ("PUNCT", "།། །། "), 152 | ("OTHER", '23PIEIUZLDVéjoldvép«»("«»%= '), 153 | ("BO", "༪༫༝༜༛༚"), 154 | ("PUNCT", "༇༆ "), 155 | ("NUM", "༡༢༣༠༩༨ "), # NEW 156 | ("OTHER", "This is a test. 这是 什么 กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"), 157 | ] 158 | 159 | # BO / SYM 160 | chunks = cb.pipe_chunk(chunks, cb.chunk_symbol, c.BO.value, c.SYM.value) 161 | chunks = cb.clean_chunks(chunks) 162 | output = cb.get_readable(chunks) 163 | assert output == [ 164 | ("PUNCT", "༆ "), 165 | ("BO", "བཀྲ་ཤིས་བདེ་ལེགས"), 166 | ("PUNCT", "།། །། "), 167 | ("OTHER", '23PIEIUZLDVéjoldvép«»("«»%= '), 168 | ("SYM", "༪༫༝༜༛༚"), # NEW 169 | ("PUNCT", "༇༆ "), 170 | ("NUM", "༡༢༣༠༩༨ "), 171 | ("OTHER", "This is a test. 这是 什么 กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"), 172 | ] 173 | 174 | # TEXT 175 | chunks = cb.pipe_chunk(chunks, cb.syllabify, c.BO.value, c.TEXT.value) 176 | chunks = cb.clean_chunks(chunks) 177 | output = cb.get_readable(chunks) 178 | assert output == [ 179 | ("PUNCT", "༆ "), 180 | ("TEXT", "བཀྲ་"), # NEW 181 | ("TEXT", "ཤིས་"), # NEW 182 | ("TEXT", "བདེ་"), # NEW 183 | ("TEXT", "ལེགས"), # NEW 184 | ("PUNCT", "།། །། "), 185 | ("OTHER", '23PIEIUZLDVéjoldvép«»("«»%= '), 186 | ("SYM", "༪༫༝༜༛༚"), 187 | ("PUNCT", "༇༆ "), 188 | ("NUM", "༡༢༣༠༩༨ "), 189 | ("OTHER", "This is a test. 
这是 什么 กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"), 190 | ] 191 | 192 | # OTHER / CJK 193 | chunks = cb.pipe_chunk(chunks, cb.chunk_cjk, c.OTHER.value, c.CJK.value) 194 | chunks = cb.clean_chunks(chunks) 195 | output = cb.get_readable(chunks) 196 | assert output == [ 197 | ("PUNCT", "༆ "), 198 | ("TEXT", "བཀྲ་"), 199 | ("TEXT", "ཤིས་"), 200 | ("TEXT", "བདེ་"), 201 | ("TEXT", "ལེགས"), 202 | ("PUNCT", "།། །། "), 203 | ("OTHER", '23PIEIUZLDVéjoldvép«»("«»%= '), 204 | ("SYM", "༪༫༝༜༛༚"), 205 | ("PUNCT", "༇༆ "), 206 | ("NUM", "༡༢༣༠༩༨ "), 207 | ("OTHER", "This is a test."), 208 | ("CJK", " 这是 什么 "), # NEW 209 | ("OTHER", "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"), 210 | ] 211 | 212 | # OTHER / LATIN 213 | chunks = cb.pipe_chunk(chunks, cb.chunk_latin, c.OTHER.value, c.LATIN.value) 214 | chunks = cb.clean_chunks(chunks) 215 | output = cb.get_readable(chunks) 216 | assert output == [ 217 | ("PUNCT", "༆ "), 218 | ("TEXT", "བཀྲ་"), 219 | ("TEXT", "ཤིས་"), 220 | ("TEXT", "བདེ་"), 221 | ("TEXT", "ལེགས"), 222 | ("PUNCT", "།། །། "), 223 | ("LATIN", '23PIEIUZLDVéjoldvép«»("«»%= '), # NEW 224 | ("SYM", "༪༫༝༜༛༚"), 225 | ("PUNCT", "༇༆ "), 226 | ("NUM", "༡༢༣༠༩༨ "), 227 | ("LATIN", "This is a test."), # NEW 228 | ("CJK", " 这是 什么 "), 229 | ("OTHER", "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"), 230 | ] 231 | -------------------------------------------------------------------------------- /tests/chunks/test_chunks.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from botok import Chunks, TokChunks, TSEK 3 | 4 | string = ( 5 | '༆ བཀྲ་ཤིས་བདེ་ལེགས།། །། 23PIEIUZLDVéjoldvép«»("«»%= ༪༫༝༜༛༚༇༆ ༡༢༣༠༩༨ ' 6 | "This is a test. 这是 什么 กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ" 7 | ) 8 | 9 | 10 | def test_chunks(): 11 | c = Chunks(string) 12 | chunks = c.make_chunks() 13 | output = c.get_readable(chunks) 14 | assert output == [ 15 | ("PUNCT", "༆ "), 16 | ("TEXT", "བཀྲ་"), 17 | ("TEXT", "ཤིས་"), 18 | ("TEXT", "བདེ་"), 19 | ("TEXT", "ལེགས"), 20 | ("PUNCT", "།། །། "), 21 | ("LATIN", '23PIEIUZLDVéjoldvép«»("«»%= '), # NEW 22 | ("SYM", "༪༫༝༜༛༚"), 23 | ("PUNCT", "༇༆ "), 24 | ("NUM", "༡༢༣༠༩༨ "), 25 | ("LATIN", "This is a test."), # NEW 26 | ("CJK", " 这是 什么 "), 27 | ("OTHER", "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"), 28 | ] 29 | 30 | 31 | def test_tokchunks(): 32 | c = TokChunks(string) 33 | c.serve_syls_to_trie() 34 | # generate what the tokenizer will ingest 35 | chunks = c.chunks 36 | assert chunks == [ 37 | (None, (105, 0, 2)), 38 | ([2, 3, 4], (104, 2, 4)), # syllable 1 39 | ([6, 7, 8], (104, 6, 4)), # syllable 2 40 | ([10, 11, 12], (104, 10, 4)), # syllable 3 41 | ([14, 15, 16, 17], (104, 14, 4)), # syllable 4 42 | (None, (105, 18, 6)), 43 | (None, (101, 24, 28)), 44 | (None, (109, 52, 6)), 45 | (None, (105, 58, 3)), 46 | (None, (111, 61, 7)), 47 | (None, (101, 68, 15)), 48 | (None, (102, 83, 8)), 49 | (None, (103, 91, 24)), 50 | ] 51 | 52 | # the second element of each tuple is the chunk from Chunks 53 | readable = [(a[0], c.get_readable([a[1]])[0]) for a in chunks] 54 | assert readable == [ 55 | (None, ("PUNCT", "༆ ")), 56 | ([2, 3, 4], ("TEXT", "བཀྲ་")), 57 | ([6, 7, 8], ("TEXT", "ཤིས་")), 58 | ([10, 11, 12], ("TEXT", "བདེ་")), 59 | ([14, 15, 16, 17], ("TEXT", "ལེགས")), 60 | (None, ("PUNCT", "།། །། ")), 61 | (None, ("LATIN", '23PIEIUZLDVéjoldvép«»("«»%= ')), 62 | (None, ("SYM", "༪༫༝༜༛༚")), 63 | (None, ("PUNCT", "༇༆ ")), 64 | (None, ("NUM", "༡༢༣༠༩༨ ")), 65 | (None, ("LATIN", "This is a test.")), 66 | (None, ("CJK", " 这是 什么 ")), 67 | (None, ("OTHER", "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ")), 68 | ] 69 | 70 | # just for the fun of it: get the cleaned syllable 
as it is done in the Tokenizer 71 | chunks = [ 72 | "".join([string[c] for c in chars]) + TSEK for chars, chunk in chunks if chars 73 | ] 74 | assert chunks == ["བཀྲ་", "ཤིས་", "བདེ་", "ལེགས་"] 75 | -------------------------------------------------------------------------------- /tests/chunks/test_chunktokenizer.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from botok import * 3 | 4 | 5 | def test_chunktokenizer(): 6 | input_str = ( 7 | " ཤི་བཀྲ་ཤིས་ བདེ་་ལ ེ གས་ " 8 | 'བཀྲ་ཤིས་བདེ་ལེགས ༆ བཀྲ་ཤིས་བདེ་ལེགས།། །། 23PIEIUZLDVéjoldvép«»("«»%= ༪༫༝༜༛༚༇༆ ༡༢༣༠༩༨ ' 9 | "This is a test. 这是 什么 กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ" 10 | ) 11 | st = ChunkTokenizer(input_str) 12 | tokens = st.tokenize() 13 | assert tokens == [ 14 | ("TEXT", " ཤི་"), 15 | ("TEXT", "བཀྲ་"), 16 | ("TEXT", "ཤིས་ "), 17 | ("TEXT", "བདེ་་"), 18 | ("TEXT", "ལ ེ གས་ "), 19 | ("TEXT", "བཀྲ་"), 20 | ("TEXT", "ཤིས་"), 21 | ("TEXT", "བདེ་"), 22 | ("TEXT", "ལེགས"), 23 | ("PUNCT", " ༆ "), 24 | ("TEXT", "བཀྲ་"), 25 | ("TEXT", "ཤིས་"), 26 | ("TEXT", "བདེ་"), 27 | ("TEXT", "ལེགས"), 28 | ("PUNCT", "།། །། "), 29 | ("LATIN", '23PIEIUZLDVéjoldvép«»("«»%= '), 30 | ("SYM", "༪༫༝༜༛༚"), 31 | ("PUNCT", "༇༆ "), 32 | ("NUM", "༡༢༣༠༩༨ "), 33 | ("LATIN", "This is a test."), 34 | ("CJK", " 这是 什么 "), 35 | ("OTHER", "กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธ"), 36 | ] 37 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from botok import Config, Tokenize, WordTokenizer 4 | 5 | 6 | @pytest.fixture(scope="session") 7 | def empty_wt(): 8 | """Return empty word tokenizer.""" 9 | config = Config.from_path("./tests/data/empty_dialect_pack") 10 | return WordTokenizer(config=config) 11 | 12 | 13 | @pytest.fixture(scope="session") 14 | def wt(): 15 | """Return default word tokenizer.""" 16 | return WordTokenizer() 17 | -------------------------------------------------------------------------------- /tests/data/empty_dialect_pack/adjustments/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/tests/data/empty_dialect_pack/adjustments/.keep -------------------------------------------------------------------------------- /tests/data/empty_dialect_pack/dictionary/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/tests/data/empty_dialect_pack/dictionary/.keep -------------------------------------------------------------------------------- /tests/data/trie_dialect_pack/adjustments/remove/test.tsv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/tests/data/trie_dialect_pack/adjustments/remove/test.tsv -------------------------------------------------------------------------------- /tests/data/trie_dialect_pack/adjustments/rules/adjust_rules.tsv: -------------------------------------------------------------------------------- 1 | # Syntax for the possible adjustment 2 | # =================================== 3 | # - CQL rules: "" can be used without specifying that there is "text_cleaned=" 4 | # - Index format: either "" or "-" 5 | # - Adjustment format: 6 | # - "+" for merge 7 | # - ":" for split (default: 
syllable mode) 8 | # - "::" for split in character mode 9 | # - "=" for replace 10 | # - Constraint: "-" is only allowed if adjustment is ":" or "::" 11 | 12 | ["ལ་ལ་"] ["ལ་ལ་"] 1 = [pos="PART"] 13 | ["ལ་ལ་"] ["ལ་ལ་"] 2 = [pos="PART"] 14 | ["ལ་ལ་"] ["ལ་ལ་"] 1-2 :: [pos="NOUN"] [pos="PART"] 15 | ["ལ་"] ["ལ་"] ["ལ་ལ་"] 3-2 :: [pos="PART"] [pos="PART"] 16 | ["ལ་"] ["ལ་"] ["ལ་"] ["ལ་"] 2 + [pos="DET"] -------------------------------------------------------------------------------- /tests/data/trie_dialect_pack/adjustments/words/test.tsv: -------------------------------------------------------------------------------- 1 | ཀཀ 2 | ཁཁ 3 | ལྟ། VERB ལྟ། 123 4 | ལྟར། ADV ལྟར། 456 5 | བཀྲ་ཤིས། བཀྲ་ཤིས། 6 | བཀྲིས། བཀྲ་ཤིས། -------------------------------------------------------------------------------- /tests/data/trie_dialect_pack/adjustments/words/test_comma_sep.tsv: -------------------------------------------------------------------------------- 1 | ང་།,PRON 2 | -------------------------------------------------------------------------------- /tests/data/trie_dialect_pack/adjustments/words_skrt/test.tsv: -------------------------------------------------------------------------------- 1 | ཀ་ར། -------------------------------------------------------------------------------- /tests/data/trie_dialect_pack/dictionary/words/empty.tsv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OpenPecha/Botok/b934119c6ea6a9166ba60f340775d40408c27e06/tests/data/trie_dialect_pack/dictionary/words/empty.tsv -------------------------------------------------------------------------------- /tests/modifytokens/test_matchers.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from pathlib import Path 3 | 4 | import pytest 5 | 6 | from botok import * 7 | 8 | 9 | input_str = " མཐའི་རྒྱ་མཚོའི་གླིང་། ཤི་བཀྲ་ཤིས་ tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་ཀཀ" 10 | 11 | 12 | @pytest.fixture 13 | def tokens_affix_split(wt): 14 | return wt.tokenize(input_str) 15 | 16 | 17 | @pytest.fixture 18 | def tokens(wt): 19 | return wt.tokenize(input_str, split_affixes=False) 20 | 21 | 22 | # IMPORTANT: all the tests have merely been adapted after refactorisation. 23 | # They should be split in tests per file that also show the expected behaviour of every matcher. 
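# --- Editor's note on the adjustment-rule syntax shown above (illustrative) ---
# test_adjust_tokens() below appends
# tests/data/trie_dialect_pack/adjustments/rules/adjust_rules.tsv to the config.
# Per that file's own header, each rule is a CQL match pattern followed by an
# index ("i" or "i-j"), an operator ("+" merge, ":" split by syllable,
# "::" split by character, "=" replace) and a replacement pattern. Read together
# with the assertions in test_adjust_tokens, the rules appear to first split a
# "ལ་ལ་" token apart ("::") and then merge two "ལ་" tokens back into a "ལ་ལ་"
# tagged DET ("+"). This is an inferred reading kept here for orientation only;
# the authoritative behaviour is whatever AdjustTokens asserts in that test.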
24 | 25 | 26 | def test_cql_query(): 27 | query = '[text="ན"] []' 28 | q = Query(query) 29 | assert q 30 | 31 | 32 | def test_dummy_cql(): 33 | test = [ 34 | {"word": "This", "lemma": "this", "tag": "Det"}, 35 | {"word": "is", "lemma": "be", "tag": "Verb"}, 36 | {"word": "it", "lemma": "it", "tag": "Pron"}, 37 | {"word": ".", "lemma": ".", "tag": "Punct"}, 38 | ] 39 | q = '[lemma="this" & tag="Det"] [tag!="ADJ"]' 40 | 41 | matcher = CQLMatcher(q) 42 | matched = matcher.match(test) 43 | assert matched == [(0, 1)] 44 | 45 | 46 | def test_regex_in_cql_query(): 47 | test = [ 48 | {"word": "This", "lemma": "this", "tag": "Det"}, 49 | {"word": "is", "lemma": "be", "tag": "Verb"}, 50 | {"word": "it", "lemma": "it", "tag": "Pron"}, 51 | {"word": ".", "lemma": ".", "tag": "Punct"}, 52 | ] 53 | q = r'[lemma="[^\n\s]+s" & tag="Det"] [tag!="ADJ"]' 54 | 55 | matcher = CQLMatcher(q) 56 | matched = matcher.match(test) 57 | expected = [test[m]["word"] for match in matched for m in match] 58 | assert expected == ["This", "is"] 59 | 60 | 61 | def test_cql(tokens): 62 | query = '[pos="NOUN" & text!=""] []' 63 | matcher = CQLMatcher(query) 64 | slices = matcher.match(tokens) 65 | slice_strings = [ 66 | tuple([tokens[i].text for i in range(start, end + 1)]) for start, end in slices 67 | ] 68 | assert slices == [(0, 1), (1, 2), (2, 3), (5, 6), (7, 8), (9, 10), (10, 11)] 69 | assert slice_strings == [ 70 | (" མཐའི་", "རྒྱ་མཚོའི་"), 71 | ("རྒྱ་མཚོའི་", "གླིང་"), 72 | ("གླིང་", "། "), 73 | ("བཀྲ་ཤིས་ ", "tr "), 74 | ("བདེ་་ལེ གས", "། "), 75 | ("བཀྲ་ཤིས་", "བདེ་ལེགས་"), 76 | ("བདེ་ལེགས་", "ཀཀ"), 77 | ] 78 | 79 | 80 | def test_token_split(tokens): 81 | ts = TokenSplit( 82 | tokens[3], 83 | 1, 84 | token_changes='[chunk_type="SPACE" & pos="PUNCT" & affix_host="False"] []', 85 | ) 86 | first, second = ts.split() 87 | assert first.chunk_type == "SPACE" 88 | assert first.pos == "PUNCT" 89 | 90 | 91 | def test_token_merge(tokens_affix_split): 92 | tm = TokenMerge(tokens_affix_split[0], tokens_affix_split[1]) 93 | merged = tm.merge() 94 | assert merged 95 | 96 | 97 | def test_match_split_char(tokens): 98 | match_query = '[pos="NOUN" & text!=""] []' 99 | replace_idx = 1 # slot number in match query 100 | split_idx = 1 # char index in token.content where split should occur 101 | replace = '[chunk_type="XXX" & pos="xxx"] []' 102 | 103 | sm = SplittingMatcher(match_query, replace_idx, split_idx, tokens, replace) 104 | split_tokens = sm.split_on_matches() 105 | assert len(tokens) == 12 106 | assert len(split_tokens) == 19 107 | 108 | 109 | def test_match_split_syl(tokens): 110 | match_query = '[pos="NOUN" & text!=""] []' 111 | replace_idx = 1 # slot number in match query 112 | split_idx = 1 # char index in token.content where split should occur 113 | replace = '[chunk_type="XXX" & pos="xxx"] []' 114 | 115 | sm = SplittingMatcher(match_query, replace_idx, split_idx, tokens, replace) 116 | split_tokens = sm.split_on_matches(mode="syl") 117 | assert len(tokens) == 12 118 | assert len(split_tokens) == 17 119 | 120 | 121 | def test_match_merge(tokens, tokens_affix_split): 122 | match_query = '[pos="NOUN" & text!=""] []' 123 | replace_idx = 1 # slot number in match query 124 | replace = '[chunk_type="XXX" & pos="xxx"]' 125 | 126 | mm = MergingMatcher(match_query, replace_idx, tokens_affix_split, replace) 127 | merged_tokens = mm.merge_on_matches() 128 | assert len(tokens) == 12 129 | assert len(merged_tokens) == 8 130 | 131 | 132 | def test_match_replace(tokens): 133 | match_query = '[pos="NOUN" & text!=""] []' 134 | replace_idx = 1 
135 | replace = '[chunk_type="XXX" & pos="xxx"]' 136 | 137 | ReplacingMatcher(match_query, replace_idx, tokens, replace).replace_on_matches() 138 | assert len(tokens) == 12 139 | assert tokens[1].pos == "xxx" 140 | assert tokens[4].pos == "VERB" 141 | 142 | 143 | def test_adjust_tokens(wt): 144 | string = "ལ་ལ་ལ་ལ་ལ་བ་ཡོད།" 145 | token_list = wt.tokenize(string, split_affixes=False) 146 | 147 | # add test adjust rule to adjustments rules 148 | wt.config.adjustments["rules"].append( 149 | Path("./tests/data/trie_dialect_pack/adjustments/rules/adjust_rules.tsv") 150 | ) 151 | 152 | at = AdjustTokens( 153 | main=wt.config.dictionary["rules"], custom=wt.config.adjustments["rules"] 154 | ) 155 | adjusted = at.adjust(token_list) 156 | assert token_list[0].text == "ལ་ལ་" 157 | assert token_list[1].text == "ལ་ལ་" 158 | 159 | assert adjusted[0].text == "ལ་" 160 | assert adjusted[0].pos == "NOUN" 161 | assert adjusted[1].text == "ལ་ལ་" 162 | assert adjusted[1].pos == "DET" 163 | assert adjusted[2].text == "ལ་" 164 | assert adjusted[2].pos == "PART" 165 | 166 | 167 | def test_last_token(): 168 | token1 = Token() 169 | token1.pos = "NOUN" 170 | 171 | token2 = Token() 172 | token2.pos = "VERB" 173 | 174 | matcher = CQLMatcher('[pos="NOUN"]') 175 | slices = matcher.match([token1, token2]) 176 | assert slices == [(0, 0)] 177 | 178 | matcher = CQLMatcher('[pos="VERB"]') 179 | slices = matcher.match([token1, token2]) 180 | assert slices == [(1, 1)] 181 | 182 | 183 | def test_merge_dagdra(wt): 184 | token_list = wt.tokenize("བཀྲ་ཤིས་-པ་") 185 | token_list = [ 186 | t for t in token_list if t.text != "-" 187 | ] # remove the "-" inserted to ensure we have two tokens 188 | mp = MergeDagdra() 189 | mp.merge(token_list) 190 | assert len(token_list) == 1 and token_list[0].text == "བཀྲ་ཤིས་པ་" 191 | 192 | token_list = wt.tokenize("བཀྲ་ཤིས་-པའོ།") 193 | token_list = [ 194 | t for t in token_list if t.text != "-" 195 | ] # remove the "-" inserted to ensure we have two tokens 196 | mp.merge(token_list) 197 | assert len(token_list) == 3 and token_list[0].text == "བཀྲ་ཤིས་པ" 198 | -------------------------------------------------------------------------------- /tests/resources/rdr_rules.txt: -------------------------------------------------------------------------------- 1 | object.tag == "SCONJ" : object.conclusion = "SCONJ" 2 | object.prevTag1 == "DET" : object.conclusion = "ADP" 3 | object.word == "སྟེ་" : object.conclusion = "SCONJ" 4 | object.word == "ཅིང་" : object.conclusion = "SCONJ" 5 | object.word == "ཞིང་" : object.conclusion = "NOUN" 6 | object.prevTag1 == "DET" and object.word == "ཤིང་" : object.conclusion = "NOUN" 7 | object.prevTag1 == "NOUN" : object.conclusion = "ADP" 8 | object.word == "སྟེ་" : object.conclusion = "SCONJ" 9 | object.word == "ཏེ་" : object.conclusion = "SCONJ" 10 | object.prevTag1 == "NOUN" and object.word == "ཞིང་" : object.conclusion = "SCONJ" 11 | object.word == "ཤིང་" : object.conclusion = "SCONJ" 12 | object.nextTag2 == "ADP" : object.conclusion = "NOUN" 13 | object.prevTag1 == "PRON" and object.word == "ནས་" : object.conclusion = "ADP" 14 | object.prevWord2 == "སུ་" and object.word == "ནས་" : object.conclusion = "SCONJ" 15 | object.prevTag1 == "NUM" : object.conclusion = "ADP" 16 | object.word == "སྟེ་" : object.conclusion = "SCONJ" 17 | object.prevTag1 == "ADP" : object.conclusion = "NOUN" 18 | object.prevTag2 == "NOUN" and object.prevTag1 == "ADP" and object.word == "ནས་" : object.conclusion = "ADP" 19 | object.word == "ཏེ་" : object.conclusion = "SCONJ" 
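[Editor's illustration — not part of rdr_rules.txt] The file above stores condition → conclusion POS-correction rules in a Ripple-Down-Rules style: each line tests attributes of a token in context (object.word, object.prevTag1, object.nextTag2, …) and, when the condition holds, appears to override the predicted tag. The sources shown in this dump do not include an interpreter for these rules (the file is test data), so the sketch below is only a hedged illustration of evaluating two of the flat rules; TokenContext and apply_flat_rules are hypothetical names, and the real format's nested exception structure is ignored here.

from dataclasses import dataclass


@dataclass
class TokenContext:
    """Hypothetical container mirroring the attributes referenced by the rules."""
    word: str
    tag: str
    prevTag1: str = ""
    nextTag2: str = ""
    conclusion: str = ""


def apply_flat_rules(obj: TokenContext) -> str:
    # Two rules transcribed from the file above (evaluation order here is arbitrary):
    #   object.word == "སྟེ་"    : object.conclusion = "SCONJ"
    #   object.prevTag1 == "DET" : object.conclusion = "ADP"
    if obj.word == "སྟེ་":
        obj.conclusion = "SCONJ"
    elif obj.prevTag1 == "DET":
        obj.conclusion = "ADP"
    else:
        obj.conclusion = obj.tag  # no rule fired: keep the original tag
    return obj.conclusion


# A token that follows a DET is re-tagged ADP by the second rule.
print(apply_flat_rules(TokenContext(word="ནས་", tag="NOUN", prevTag1="DET")))  # -> ADP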
-------------------------------------------------------------------------------- /tests/resources/test.txt: -------------------------------------------------------------------------------- 1 | # this is a test line, followed by an empty line, that should be ignored 2 | 3 | བཀྲ་,NOUN # lines can have either a comma 4 | ཤིས་ NOUN # a space 5 | བཀྲ་ཤིས་ NOUN # or a tab as separator 6 | བདེ་,NOUN 7 | # this line, being empty after removing the comment, should be ignored 8 | ལེགས་,ADJ # a comment preceded by a space 9 | བདེ་ལེགས་,NOUN 10 | བཀྲ་ཤིས་བདེ་ལེགས་,EXCLS # Not so sure about this POS. 11 | -------------------------------------------------------------------------------- /tests/resources/test_file_to_tokenize.txt: -------------------------------------------------------------------------------- 1 | ལེ གས། བཀྲ་ཤིས་མཐའི་ ༆ ཤི་བཀྲ་ཤིས་ tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་༡༢༣ཀཀ། མཐའི་རྒྱ་མཚོར་གནས་པའི་ཉས་ཆུ་འཐུང་།། །།མཁའ། -------------------------------------------------------------------------------- /tests/resources/test_file_to_tokenize_pybo.txt: -------------------------------------------------------------------------------- 1 | ལེ_གས །_ བཀྲ་ ཤིས་ མཐའི་ _༆_ ཤི་ བཀྲ་ ཤིས་__ tr_ བདེ་་ ལེ_གས །_ བཀྲ་ ཤིས་ བདེ་ ལེགས་ ༡༢༣ ཀཀ །_ མཐའི་ རྒྱ་ མཚོར་ གནས་ པའི་ ཉས་ ཆུ་ འཐུང་ །།_།། མཁའ ། -------------------------------------------------------------------------------- /tests/test_bugs.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | import sys 3 | 4 | import pytest 5 | 6 | from botok import TokChunks, Config, Trie, BoSyl, Tokenize, Chunks, ChunkFramework 7 | 8 | 9 | sys.path.append("../") 10 | 11 | 12 | def test_syl_tokenize(): 13 | instr = " མཐའི་རྒྱ་མཚོའི་གླིང་། ཤི་བཀྲ་ཤིས་ tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་ཀཀ" 14 | preprocessed = TokChunks(instr) 15 | preprocessed.serve_syls_to_trie() 16 | config = Config() 17 | trie = Trie(BoSyl, config.profile, config.dictionary, config.adjustments) 18 | tok = Tokenize(trie) 19 | tokens = tok.tokenize(preprocessed) 20 | texts = [t.text for t in tokens] 21 | expected = [ 22 | " མཐའི་", 23 | "རྒྱ་མཚོའི་", 24 | "གླིང་", 25 | "། ", 26 | "ཤི་", 27 | "བཀྲ་ཤིས་ ", 28 | "tr ", 29 | "བདེ་་ལེ གས", 30 | "། ", 31 | "བཀྲ་ཤིས་", 32 | "བདེ་ལེགས་", 33 | "ཀཀ", 34 | ] 35 | # current: [' མཐའི་', 'རྒྱ་མཚོའི་', '། ', 'གླིང་', 'བཀྲ་', 'ཤི་', 'tr ', 'ཤིས་ ', 'བདེ་་ལེ གས', '། ', 'བདེ་', 36 | # 'བཀྲ་ཤིས་', 'ཀཀ', 'ལེགས་'] 37 | assert texts == expected 38 | 39 | 40 | def test_num_lemmas_missing(wt): 41 | in_str = "སྟོང་ཕྲག་བརྒྱ་པ་སུམ་བརྒྱ་པ་བཅུ་པ་ལྔ་པ་" 42 | tokens = wt.tokenize(in_str) 43 | assert [t.lemma for t in tokens] == [ 44 | "སྟོང་ཕྲག་", 45 | "བརྒྱ་པ་", 46 | "སུམ་བརྒྱ་པ་", 47 | "བཅུ་པ་", 48 | "ལྔ་པ་", 49 | ] 50 | 51 | 52 | def test_no_shad_syllable(): 53 | in_str = "ཀ འདི་ ཤི དེ་ག རེད་དོ།" 54 | bo_string = Chunks(in_str) 55 | chunks = bo_string.make_chunks() 56 | chunks = bo_string.get_readable(chunks) 57 | assert chunks == [ 58 | ("TEXT", "ཀ "), 59 | ("TEXT", "འདི་ "), 60 | ("TEXT", "ཤི "), 61 | ("TEXT", "དེ་"), 62 | ("TEXT", "ག "), 63 | ("TEXT", "རེད་"), 64 | ("TEXT", "དོ"), 65 | ("PUNCT", "།"), 66 | ] 67 | 68 | 69 | def test_segmentation_bug(wt): 70 | tokens = wt.tokenize("ལ་པོ་ལ་པོ་ལ་པོ་") 71 | assert len(tokens) == 3 72 | 73 | tokens = wt.tokenize("ལ་མོ་ལ་མོ་ལ་མོ་") 74 | assert len(tokens) == 3 75 | 76 | tokens = wt.tokenize("གྲོགས་པོ་གྲོགས་པོ་གྲོགས་པོ་") 77 | assert len(tokens) == 3 78 | 79 | tokens = wt.tokenize("བདག་པོ་བདག་པོ་བདག་པོ་དང་") 80 | assert len(tokens) == 4 81 | 82 | tokens = wt.tokenize("བདག་པོ་བདག་པོ་བདག་པོ་") 83 
| assert len(tokens) == 3 84 | 85 | tokens = wt.tokenize( 86 | "བདག་པོ་བདག་པོ་བདག་པོ་བདག་པོ་བདག་པོ་བདག་པོ་བདག་པོ་བདག་པོ་བདག་པོ་" 87 | ) 88 | assert len(tokens) == 9 89 | 90 | 91 | def test_keyerror_part_lemma(wt): 92 | tokens = wt.tokenize("ཕའིའོ།") 93 | assert len(tokens) == 3 94 | 95 | 96 | def test_split_token(empty_wt): 97 | wt = empty_wt 98 | wt.tok.trie.rebuild_trie() 99 | wt.tok.trie.inflect_n_modify_trie("འ་") 100 | assert not wt.tok.trie.has_word("ར་")["exists"] 101 | 102 | 103 | def test_missing_entries_n_bad_unaffixed(wt): 104 | input_str = "ཤུ་ཀ་ར་" 105 | tokens = wt.tokenize(input_str, split_affixes=False) 106 | assert [t.text for t in tokens] == ["ཤུ་", "ཀ་ར་"] 107 | assert tokens[0].senses 108 | assert tokens[1].text_unaffixed == "ཀ་ར་" 109 | 110 | 111 | def test_multiple_spaces(): 112 | bo_string = Chunks("ཁྱོ ད་ད ང་") 113 | chunks = bo_string.make_chunks() 114 | chunks = bo_string.get_readable(chunks) 115 | assert chunks[0] == ("TEXT", "ཁྱོ ད་") 116 | assert chunks[1] == ("TEXT", "ད ང་") 117 | assert len(chunks) == 2 118 | 119 | 120 | def test_bug1(wt): 121 | string = "བ་ཀུ་" 122 | tokens = wt.tokenize(string, debug=True) 123 | assert tokens 124 | 125 | 126 | def test_bug2(wt): 127 | string = "བྲ་གྲྀ་" 128 | tokens = wt.tokenize(string, debug=True) 129 | assert tokens 130 | 131 | 132 | def test_many_tseks_in_syllable(): 133 | input_str = " ཤི་བཀྲ་ཤིས་ བདེ་་ལ ེ གས་ བཀྲ་ཤིས་བདེ་ལེགས" 134 | cb = ChunkFramework(input_str) 135 | chunks = cb.syllabify() 136 | readable = cb.get_readable(chunks) 137 | assert readable == [ 138 | ("TEXT", " ཤི་"), 139 | ("TEXT", "བཀྲ་"), 140 | ("TEXT", "ཤིས་"), 141 | ("TEXT", " བདེ་་"), 142 | ("TEXT", "ལ ེ གས་"), 143 | ("TEXT", " བཀྲ་"), 144 | ("TEXT", "ཤིས་"), 145 | ("TEXT", "བདེ་"), 146 | ("TEXT", "ལེགས"), 147 | ] 148 | 149 | chunks = cb.chunk_punct() 150 | chunks = cb.merge_skippable_punct(chunks) 151 | readable = cb.get_readable(chunks) 152 | assert readable == [ 153 | ("NON_PUNCT", " ཤི་བཀྲ་ཤིས་ བདེ་་ལ ེ གས་ བཀྲ་ཤིས་བདེ་ལེགས") 154 | ] 155 | 156 | ck = Chunks(input_str) 157 | chunks = ck.make_chunks() 158 | readable = ck.get_readable(chunks) 159 | assert readable == [ 160 | ("TEXT", " ཤི་"), 161 | ("TEXT", "བཀྲ་"), 162 | ("TEXT", "ཤིས་ "), 163 | ("TEXT", "བདེ་་"), 164 | ("TEXT", "ལ ེ གས་ "), 165 | ("TEXT", "བཀྲ་"), 166 | ("TEXT", "ཤིས་"), 167 | ("TEXT", "བདེ་"), 168 | ("TEXT", "ལེགས"), 169 | ] 170 | 171 | 172 | def test_shad_in_syllable(): 173 | input_str = " tr བདེ་་ལེ གས། བཀྲ་" 174 | ck = Chunks(input_str) 175 | chunks = ck.make_chunks() 176 | readable = ck.get_readable(chunks) 177 | assert readable == [ 178 | ("LATIN", " tr "), 179 | ("TEXT", "བདེ་་"), 180 | ("TEXT", "ལེ གས"), 181 | ("PUNCT", "། "), 182 | ("TEXT", "བཀྲ་"), 183 | ] 184 | 185 | def test_unexpected_skip_syl(wt): 186 | input_strs = ["དེའི་སྒོ་ནས་བསྟན་པ་དང་སེམས་ཅན་ལ་ཕན་ཐོགས་མཛད་ཚུལ།", "དེ་ཁོ་རང་ཡིན་མོད།"] 187 | wt.tok.trie.inflect_n_modify_trie("དང་སེམས་", deactivate=True) # To remove དང་སེམས་ from trie 188 | wt.tok.trie.inflect_n_modify_trie("ཡིན་མོད", deactivate=True) 189 | wt.tok.trie.inflect_n_modify_trie("ཕན་ཐོགས་") 190 | expected_strs = ["དེའི་ སྒོ་ ནས་ བསྟན་པ་ དང་ སེམས་ཅན་ ལ་ ཕན་ཐོགས་ མཛད་ ཚུལ ། ", "དེ་ ཁོ་རང་ ཡིན་ མོད ། "] 191 | result_strs = [] 192 | for input_str in input_strs: 193 | tokens = wt.tokenize(input_str, split_affixes = False) 194 | result_str = '' 195 | for token in tokens: 196 | result_str += f'{token.text} ' 197 | result_strs.append(result_str) 198 | assert expected_strs == result_strs 199 | 200 | 201 | if __name__ == "__main__": 202 | test_split_token() 
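The wt and empty_wt fixtures used throughout test_bugs.py (and the other test modules below) come from tests/conftest.py, which appears in the tree but is not reproduced in this section. Below is a minimal sketch of what such fixtures could look like; it assumes only the WordTokenizer(config=...) and Config.from_path(...) calls already exercised in this test suite, and the real conftest.py may well differ (for instance in fixture scope).

# Hypothetical conftest.py sketch -- not the repository's actual fixtures.
import pytest

from botok import Config, WordTokenizer


@pytest.fixture
def wt():
    # tokenizer backed by the default "general" dialect pack
    return WordTokenizer(config=Config())


@pytest.fixture
def empty_wt():
    # tokenizer backed by the empty dialect pack used by test_split_token()
    return WordTokenizer(config=Config.from_path("./tests/data/empty_dialect_pack"))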
203 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | import copy 3 | from pathlib import Path 4 | 5 | import pytest 6 | 7 | from botok import Config 8 | from botok.config import DEFAULT_BASE_PATH 9 | 10 | 11 | @pytest.fixture(scope="module") 12 | def base_path(): 13 | return DEFAULT_BASE_PATH 14 | 15 | 16 | def test_defaults(base_path): 17 | config = Config() 18 | 19 | # default dialect pach path 20 | assert config.dialect_pack_path == base_path / "general" 21 | assert config.dialect_pack_path.is_dir() 22 | 23 | # Trie data should be .tsv file 24 | for data_type in ["words", "rules"]: 25 | assert data_type in config.dictionary 26 | for data_fn in config.dictionary[data_type]: 27 | assert data_fn.suffix == ".tsv" 28 | 29 | # Segmentation adjustment 30 | for data_type in ["remove", "rules", "words", "words_skrt"]: 31 | assert data_type in config.adjustments 32 | for data_fn in config.adjustments[data_type]: 33 | if data_fn.suffix: 34 | assert data_fn.suffix == ".tsv" 35 | 36 | 37 | def test_custome_dialect_pack(base_path): 38 | config = Config(dialect_name="kangyur") 39 | assert config.dialect_pack_path == base_path / "kangyur" 40 | assert config.dialect_pack_path.is_dir() 41 | 42 | 43 | def test_reset(base_path): 44 | custome_pack_name = "kangyur" 45 | config = Config(dialect_name=custome_pack_name) 46 | assert config.dialect_pack_path == base_path / custome_pack_name 47 | 48 | config.reset() 49 | 50 | assert config.dialect_pack_path == base_path / "general" 51 | 52 | 53 | def test_empty_config(): 54 | config = Config.from_path("./tests/data/empty_dialect_pack") 55 | 56 | assert not config.dictionary 57 | assert not config.adjustments 58 | 59 | 60 | def test_add_dialect_pack(): 61 | config = Config() 62 | old_dictionary = copy.deepcopy(config.dictionary) 63 | old_adjustments = copy.deepcopy(config.adjustments) 64 | 65 | config.add_dialect_pack(Path("./tests/data/trie_dialect_pack")) 66 | 67 | assert config.dictionary != old_dictionary 68 | assert config.adjustments != old_adjustments 69 | -------------------------------------------------------------------------------- /tests/text/test_text_tokenize.py: -------------------------------------------------------------------------------- 1 | from botok.text.tokenize import space_tok, word_tok, sentence_tok, paragraph_tok 2 | from botok.config import Config 3 | 4 | 5 | def test_text_space_tokenizer(): 6 | """Test the space tokenizer functionality.""" 7 | text = "ཀཿ ཐོག་ འབྱམ་ པའཱི་ རོ།" 8 | tokens = space_tok(text) 9 | assert len(tokens) == 5 10 | assert tokens[0] == "ཀཿ" 11 | assert tokens[1] == "ཐོག་" 12 | 13 | 14 | def test_text_word_tokenizer(): 15 | """Test the word tokenizer functionality.""" 16 | text = "ཀཿཐོག་འབྱམ་པའཱི་རོ།" 17 | tokens = word_tok(text) 18 | assert len(tokens) > 0 19 | assert hasattr(tokens[0], "text") 20 | 21 | 22 | def test_text_sentence_tokenizer(): 23 | """Test the sentence tokenizer functionality.""" 24 | text = "ཀཿཐོག་འབྱམ་པའཱི་རོ། འདི་ནི་ཚིག་གྲུབ་གཉིས་པ་ཡིན།" 25 | sentences = sentence_tok(text) 26 | # It should have at least one sentence with tokens 27 | assert len(sentences) > 0 28 | # The expected output format is a dictionary with sentence data 29 | assert isinstance(sentences[0], dict) 30 | assert 'tokens' in sentences[0] 31 | assert isinstance(sentences[0]['tokens'], list) 32 | assert len(sentences[0]['tokens']) > 0 33 | 34 | 35 | def 
test_text_paragraph_tokenizer(): 36 | """Test the paragraph tokenizer functionality.""" 37 | text = "ཀཿཐོག་འབྱམ་པའཱི་རོ།\n\nའདི་ནི་དུམ་བུ་གཉིས་པ་ཡིན།" 38 | paragraphs = paragraph_tok(text) 39 | # It should have at least one paragraph with tokens 40 | assert len(paragraphs) > 0 41 | assert isinstance(paragraphs[0], tuple) 42 | assert len(paragraphs[0]) == 2 43 | # The first element is the paragraph index 44 | assert isinstance(paragraphs[0][0], int) 45 | # The second element is the list of tokens 46 | assert isinstance(paragraphs[0][1], list) 47 | 48 | 49 | def test_text_tokenizers_with_config(): 50 | """Test tokenizers with custom configuration.""" 51 | config = Config() 52 | text = "ཀཿཐོག་འབྱམ་པའཱི་རོ།" 53 | 54 | # Test word tokenizer with config 55 | tokens = word_tok(text, config=config) 56 | assert len(tokens) > 0 57 | 58 | # Test sentence tokenizer with config 59 | sentences = sentence_tok(text, config=config) 60 | assert len(sentences) > 0 61 | 62 | # Test paragraph tokenizer with config 63 | paragraphs = paragraph_tok(text, config=config) 64 | assert len(paragraphs) > 0 65 | -------------------------------------------------------------------------------- /tests/textunits/test_bostring.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | import warnings 3 | 4 | from botok import BoString 5 | from botok import CharMarkers as m 6 | 7 | bo_str = "བཀྲ་ཤིས་ ༡༢༣ tr 就到 郊外玩བདེ་ལེགས།" 8 | 9 | 10 | def test_string(): 11 | """Testing whether, at a given index, the char category corresponds to what is expected.""" 12 | bs = BoString(bo_str) 13 | 14 | idx = 0 15 | assert "བ" == bo_str[idx] 16 | assert m.CONS == bs.base_structure[idx] 17 | 18 | idx = 2 19 | assert "ྲ" == bo_str[idx] 20 | assert m.SUB_CONS == bs.base_structure[idx] 21 | 22 | idx = 7 23 | assert "་" == bo_str[idx] 24 | assert m.TSEK == bs.base_structure[idx] 25 | 26 | idx = 9 27 | assert "༡" == bo_str[idx] 28 | assert m.NUMERAL == bs.base_structure[idx] 29 | 30 | idx = 13 31 | assert "t" == bo_str[idx] 32 | assert m.LATIN == bs.base_structure[idx] 33 | 34 | idx = 17 35 | assert "就" == bo_str[idx] 36 | assert m.CJK == bs.base_structure[idx] 37 | 38 | 39 | def test_warning(): 40 | with warnings.catch_warnings(record=True) as w: 41 | BoString("ༀ་པ་ཊུ་") 42 | assert len(w) == 1 43 | assert ( 44 | str(w[0].message) 45 | == 'Beware of unexpected results: input string contains the non-expanded char "ༀ", found in "ༀ་པ་ཊུ".' 46 | ) 47 | -------------------------------------------------------------------------------- /tests/textunits/test_bosyl.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from botok import BoSyl 3 | 4 | bs = BoSyl() 5 | 6 | 7 | def test_bosyl(): 8 | # is_affixable() Vs. 
SylComponents.is_thame() 9 | assert bs.is_thame("ཀུན") is False and bs.is_affixable("ཀུན") is False 10 | assert bs.is_thame("དེའིའམ") is True and bs.is_affixable("དེའིའམ") is False 11 | assert bs.is_thame("དེའི") is True and bs.is_affixable("དེའི") is False 12 | assert bs.is_thame("ང") is True and bs.is_affixable("ང") is True 13 | 14 | # get_all_affixed() 15 | affixed = bs.get_all_affixed("ང") 16 | assert affixed == [ 17 | ("ངར", {"len": 1, "type": "la", "aa": False}), 18 | ("ངས", {"len": 1, "type": "gis", "aa": False}), 19 | ("ངའི", {"len": 2, "type": "gi", "aa": False}), 20 | ("ངའམ", {"len": 2, "type": "am", "aa": False}), 21 | ("ངའང", {"len": 2, "type": "ang", "aa": False}), 22 | ("ངའོ", {"len": 2, "type": "o", "aa": False}), 23 | ("ངའིའོ", {"len": 4, "type": "gi+o", "aa": False}), 24 | ("ངའིའམ", {"len": 4, "type": "gi+am", "aa": False}), 25 | ("ངའིའང", {"len": 4, "type": "gi+ang", "aa": False}), 26 | ("ངའོའམ", {"len": 4, "type": "o+am", "aa": False}), 27 | ("ངའོའང", {"len": 4, "type": "o+ang", "aa": False}), 28 | ] 29 | 30 | affixed = bs.get_all_affixed("མཐའ") 31 | assert affixed == [ 32 | ("མཐར", {"len": 1, "type": "la", "aa": True}), 33 | ("མཐས", {"len": 1, "type": "gis", "aa": True}), 34 | ("མཐའི", {"len": 2, "type": "gi", "aa": True}), 35 | ("མཐའམ", {"len": 2, "type": "am", "aa": True}), 36 | ("མཐའང", {"len": 2, "type": "ang", "aa": True}), 37 | ("མཐའོ", {"len": 2, "type": "o", "aa": True}), 38 | ("མཐའིའོ", {"len": 4, "type": "gi+o", "aa": True}), 39 | ("མཐའིའམ", {"len": 4, "type": "gi+am", "aa": True}), 40 | ("མཐའིའང", {"len": 4, "type": "gi+ang", "aa": True}), 41 | ("མཐའོའམ", {"len": 4, "type": "o+am", "aa": True}), 42 | ("མཐའོའང", {"len": 4, "type": "o+ang", "aa": True}), 43 | ] 44 | 45 | affixed = bs.get_all_affixed("ཀུན") 46 | assert affixed is None 47 | -------------------------------------------------------------------------------- /tests/textunits/test_sylcomponents.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from botok import SylComponents 3 | 4 | 5 | def test_components(): 6 | sc = SylComponents() 7 | 8 | # A) get_parts() 9 | # 1. (prefix+main-stack, vowel+suffixes) 10 | assert sc.get_parts("བཀྲིས") == ("བཀྲ", "ིས") 11 | # 2. (exceptions, 'x') 12 | assert sc.get_parts("མདྲོན") == ("མདྲོན", "x") 13 | # 3. a list of solutions if there is more than one (not yet encountered) 14 | # 4. None if the syllable is not wellformed 15 | assert sc.get_parts("ཀཀ") is None 16 | 17 | # B) get_mingzhi() 18 | assert sc.get_mingzhi("བསྒྲུབས") == "ྒ" 19 | # the mingzhi that will serve for the particle agreement: 20 | assert sc.get_mingzhi("ཁྱེའུར") == "འ" 21 | # None if more than one solution from get_parts() (not yet encountered) 22 | 23 | # support for dadrag 24 | assert sc.get_mingzhi("ཀུནད") == "ཀ" 25 | 26 | # dadrag normalize 27 | assert sc.normalize_dadrag("ཀུནད") == "ཀུན" 28 | 29 | # C) get_info() 30 | # 1. 'dadrag' 31 | # A syllable that historically received a "da" second suffix. 32 | # As for now, the list contains ["ཀུན", "ཤིན", "འོན"] (See pybo/resources/SylComponents.json) 33 | assert sc.get_info("ཀུན") == "dadrag" 34 | # 2. 'thame' 35 | # A syllable that has the potential of hosting an affixed particle. 36 | # Will be returned for all such syls, whether or not a particle is affixed. 
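    # e.g. in the asserts below, both the affixed form དེའིའམ and the bare affixable syllable ང are reported as 'thame'.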
37 | assert sc.get_info("དེའིའམ") == "thame" 38 | assert sc.get_info("དེའི") == "thame" 39 | # 3 the syllable itself in all other cases 40 | assert sc.get_info("ང") == "thame" 41 | assert sc.get_info("རྒྱལ") == "རྒྱལ" 42 | 43 | # D) is_thame() 44 | # True if the syllabe is affixable or is already affixed, False otherwise 45 | assert sc.is_thame("ཀུན") is False 46 | assert sc.is_thame("དེའིའམ") is True 47 | assert sc.is_thame("དེའི") is True 48 | assert sc.is_thame("ང") is True 49 | -------------------------------------------------------------------------------- /tests/tokenizers/test_sent_par_tokenizer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from pathlib import Path 4 | 5 | from botok import paragraph_tokenizer, sentence_tokenizer 6 | 7 | 8 | text = ( 9 | "བཀུར་བར་མི་འགྱུར་ཞིང༌། །བརྙས་བཅོས་མི་སྙན་རྗོད་པར་བྱེད། །དབང་དང་འབྱོར་པ་ལྡན་པ་ཡི། །རྒྱལ་རིགས་ཕལ་ཆེར་བག་མེད་པས། །" 10 | "མྱོས་པའི་གླང་ཆེན་བཞིན་དུ་འཁྱམས། །དེ་ཡི་འཁོར་ཀྱང་དེ་འདྲར་འགྱུར། །གཞན་ཡང་རྒྱལ་པོ་རྒྱལ་རིགས་ཀྱི། །སྤྱོད་པ་བཟང་ངན་ཅི་འདྲ་བ། །" 11 | "དེ་འདྲའི་ཚུལ་ལ་བལྟས་ནས་སུ། །འབངས་རྣམས་དེ་དང་དེ་འདྲ་སྟེ། །རྒྱལ་པོ་ནོར་ལ་བརྐམས་གྱུར་ན། །ནོར་གྱིས་རྒྱལ་ཁྲིམས་བསླུ་བར་རྩོམ། །" 12 | "མི་བདག་གཡེམ་ལ་དགའ་གྱུར་ན། །འཕྱོན་མའི་ཚོགས་རྣམས་མགོ་འཕང་མཐོ། །ཕྲ་མར་ཉན་ན་དབྱེན་གྱིས་གཏོར། །བརྟག་དཔྱད་མི་ཤེས་རྫུན་གྱིས་སླུ། །" 13 | "ང་ལོ་ཡང་ན་ཀུན་གྱིས་བསྐྱོད། །ངོ་དགར་བརྩི་ན་ཟོལ་ཚིག་སྨྲ། །དེ་དང་དེ་ལ་སོགས་པ་ཡི། །མི་བདག་དེ་ལ་གང་གང་གིས། །" 14 | "བསླུ་བར་རུང་བའི་སྐབས་མཐོང་ན། །གཡོན་ཅན་ཚོགས་ཀྱིས་ཐབས་དེ་སེམས། །མི་རྣམས་རང་འདོད་སྣ་ཚོགས་ལ། །རྒྱལ་པོ་ཀུན་གྱི་ཐུན་མོང་ཕྱིར། །" 15 | "རྒྱལ་པོས་བསམ་གཞིགས་མ་བྱས་ན། །ཐ་མར་རྒྱལ་སྲིད་འཇིག་པར་འགྱུར། །ཆེན་པོའི་གོ་སར་གནས་པ་ལ། །སྐྱོན་ཀྱང་ཡོན་ཏན་ཡིན་ཚུལ་དུ། །" 16 | "འཁོར་ངན་རྣམས་ཀྱིས་ངོ་བསྟོད་སྨྲ། །དེ་ཕྱིར་སྐྱོན་ཡོན་ཤེས་པ་དཀའ། །ལྷག་པར་རྩོད་ལྡན་སྙིགས་མའི་ཚེ། །འཁོར་གྱི་ནང་ན་མ་རབས་མང༌། །" 17 | "སྐྱོན་ཡང་ཡོན་ཏན་ལྟར་མཐོང་ལ། །རང་འདོད་ཆེ་ཞིང་རྒྱལ་པོ་བསླུ། །ཆུས་དང་འཁོར་གྱི་བདེ་ཐབས་ལ། །བསམ་གཞིགས་བྱེད་པ་དཀོན་པའི་ཕྱིར། །" 18 | "རྒྱལ་པོས་ལེགས་པར་དཔྱད་ནས་སུ། །བདེན་པའི་ངག་ལས་" 19 | ) 20 | 21 | 22 | @pytest.fixture 23 | def tokens(wt): 24 | return wt.tokenize(text, split_affixes=True) 25 | 26 | 27 | @pytest.mark.skip(reason="not a config bug") 28 | def test_sent_tokenizer(tokens): 29 | sents = sentence_tokenizer(tokens) 30 | 31 | out = ["".join([word.text for word in s['tokens']]) for s in sents] 32 | expected = [ 33 | "བཀུར་བར་མི་འགྱུར་ཞིང༌། །བརྙས་བཅོས་མི་སྙན་རྗོད་པར་བྱེད། །", 34 | "དབང་དང་འབྱོར་པ་ལྡན་པ་ཡི། །རྒྱལ་རིགས་ཕལ་ཆེར་བག་མེད་པས། །མྱོས་པའི་གླང་ཆེན་བཞིན་དུ་འཁྱམས། །དེ་ཡི་འཁོར་ཀྱང་དེ་འདྲར་འགྱུར། །", 35 | "གཞན་ཡང་རྒྱལ་པོ་རྒྱལ་རིགས་ཀྱི། །སྤྱོད་པ་བཟང་ངན་ཅི་འདྲ་བ། །དེ་འདྲའི་ཚུལ་ལ་བལྟས་ནས་སུ། །འབངས་རྣམས་དེ་དང་དེ་འདྲ་སྟེ། །", 36 | "རྒྱལ་པོ་ནོར་ལ་བརྐམས་གྱུར་ན། །", 37 | "ནོར་གྱིས་རྒྱལ་ཁྲིམས་བསླུ་བར་རྩོམ། །", 38 | "མི་བདག་གཡེམ་ལ་དགའ་གྱུར་ན། །", 39 | "འཕྱོན་མའི་ཚོགས་རྣམས་མགོ་འཕང་མཐོ། །", 40 | "ཕྲ་མར་ཉན་ན་དབྱེན་གྱིས་གཏོར། །", 41 | "བརྟག་དཔྱད་མི་ཤེས་རྫུན་གྱིས་སླུ། །ང་ལོ་ཡང་ན་ཀུན་གྱིས་བསྐྱོད། །", 42 | "ངོ་དགར་བརྩི་ན་ཟོལ་ཚིག་སྨྲ། །", 43 | "དེ་དང་དེ་ལ་སོགས་པ་ཡི། །མི་བདག་དེ་ལ་གང་གང་གིས། །བསླུ་བར་རུང་བའི་སྐབས་མཐོང་ན། །", 44 | "གཡོན་ཅན་ཚོགས་ཀྱིས་ཐབས་དེ་སེམས། །མི་རྣམས་རང་འདོད་སྣ་ཚོགས་ལ། །རྒྱལ་པོ་ཀུན་གྱི་ཐུན་མོང་ཕྱིར། །རྒྱལ་པོས་བསམ་གཞིགས་མ་བྱས་ན། །", 45 | "ཐ་མར་རྒྱལ་སྲིད་འཇིག་པར་འགྱུར། །", 46 | "ཆེན་པོའི་གོ་སར་གནས་པ་ལ། །སྐྱོན་ཀྱང་ཡོན་ཏན་ཡིན་ཚུལ་དུ། །འཁོར་ངན་རྣམས་ཀྱིས་ངོ་བསྟོད་སྨྲ། །", 47 | "དེ་ཕྱིར་སྐྱོན་ཡོན་ཤེས་པ་དཀའ། །", 48 | "ལྷག་པར་རྩོད་ལྡན་སྙིགས་མའི་ཚེ། །འཁོར་གྱི་ནང་ན་མ་རབས་མང༌། །", 49 | "སྐྱོན་ཡང་ཡོན་ཏན་ལྟར་མཐོང་ལ། །རང་འདོད་ཆེ་ཞིང་རྒྱལ་པོ་བསླུ། །ཆུས་དང་འཁོར་གྱི་བདེ་ཐབས་ལ། 
།བསམ་གཞིགས་བྱེད་པ་དཀོན་པའི་ཕྱིར། །རྒྱལ་པོས་ལེགས་པར་དཔྱད་ནས་", 50 | "སུ། །བདེན་པའི་ངག་ལས་", 51 | ] 52 | assert out == expected 53 | 54 | def test_normalized_sentence(tokens): 55 | sents = sentence_tokenizer(tokens) 56 | 57 | norm_sentences = [sentence['norm_sent'] for sentence in sents] 58 | expected = [ 59 | "བཀུར་བ་ -ར་ མི་ འགྱུར་ ཞིང་ ། །", 60 | "བརྙས་བཅོས་ མི་ སྙན་ རྗོད་པ་ -ར་ བྱེད་ ། །", 61 | "དབང་ དང་ འབྱོར་པ་ ལྡན་པ་ ཡི་ ། ། རྒྱལ་རིགས་ ཕལ་ཆེར་ བག་མེད་པ་ -ས་ ། ། མྱོས་པ་ -འི་ གླང་ཆེན་ བཞིན་ དུ་ འཁྱམས་ ། ། དེ་ ཡི་ འཁོར་ ཀྱང་ དེ་ འདྲ་ -ར་ འགྱུར་ ། །", 62 | "གཞན་ ཡང་ རྒྱལ་པོ་ རྒྱལ་རིགས་ ཀྱི་ ། ། སྤྱོད་པ་ བཟང་ངན་ ཅི་འདྲ་བ་ ། ། དེ་ འདྲ་ -འི་ ཚུལ་ ལ་ བལྟས་ ནས་ སུ་ ། ། འབངས་ རྣམས་ དེ་ དང་ དེ་ འདྲ་ སྟེ་ ། །", 63 | "རྒྱལ་པོ་ ནོར་ ལ་ བརྐམས་ གྱུར་ ན་ ། །", 64 | "ནོར་ གྱིས་ རྒྱལ་ཁྲིམས་ བསླུ་བ་ -ར་ རྩོམ་ ། །", 65 | "མི་བདག་ གཡེམ་ ལ་ དགའ་ གྱུར་ ན་ ། །", 66 | "འཕྱོན་མ་ -འི་ ཚོགས་ རྣམས་ མགོ་འཕང་ མཐོ་ ། །", 67 | "ཕྲ་མ་ -ར་ ཉན་ ན་ དབྱེན་ གྱིས་ གཏོར་ ། །", 68 | "བརྟག་དཔྱད་ མི་ ཤེས་ རྫུན་ གྱིས་ སླུ་ ། ། ང་ ལོ་ ཡང་ན་ ཀུན་ གྱིས་ བསྐྱོད་ ། །", 69 | "ངོ་དགའ་ -ར་ བརྩི་ ན་ ཟོལ་ཚིག་ སྨྲ་ ། །", 70 | "དེ་ དང་ དེ་ ལ་སོགས་པ་ ཡི་ ། ། མི་བདག་ དེ་ ལ་ གང་ གང་ གིས་ ། ། བསླུ་བ་ -ར་ རུང་བ་ -འི་ སྐབས་ མཐོང་ ན་ ། །", 71 | "གཡོན་ཅན་ ཚོགས་ ཀྱིས་ ཐབས་ དེ་ སེམས་ ། ། མི་ རྣམས་ རང་འདོད་ སྣ་ཚོགས་ ལ་ ། །", 72 | "རྒྱལ་པོ་ ཀུན་ གྱི་ ཐུན་མོང་ ཕྱིར་ ། ། རྒྱལ་པོ་ -ས་ བསམ་ གཞིགས་ མ་ བྱས་ ན་ ། །", 73 | "ཐ་མ་ -ར་ རྒྱལ་སྲིད་ འཇིག་པ་ -ར་ འགྱུར་ ། །", 74 | "ཆེན་པོ་ -འི་ གོ་ས་ -ར་ གནས་པ་ ལ་ ། །", 75 | "སྐྱོན་ ཀྱང་ ཡོན་ཏན་ ཡིན་ཚུལ་ དུ་ ། ། འཁོར་ ངན་ རྣམས་ ཀྱིས་ ངོ་བསྟོད་ སྨྲ་ ། །", 76 | "དེ་ཕྱིར་ སྐྱོན་ཡོན་ ཤེས་པ་ དཀའ་ ། །", 77 | "ལྷག་པར་ རྩོད་ ལྡན་ སྙིགས་མ་ -འི་ ཚེ་ ། ། འཁོར་ གྱི་ ནང་ ན་མ་ རབས་ མང་ ། །", 78 | "སྐྱོན་ ཡང་ ཡོན་ཏན་ ལྟར་ མཐོང་ ལ་ ། །", 79 | "རང་འདོད་ ཆེ་ ཞིང་ རྒྱལ་པོ་ བསླུ་ ། ། ཆུས་ དང་ འཁོར་ གྱི་ བདེ་ ཐབས་ ལ་ ། །", 80 | "བསམ་ གཞིགས་ བྱེད་པ་ དཀོན་པ་ -འི་ ཕྱིར་ ། ། རྒྱལ་པོ་ -ས་ ལེགས་པ་ -ར་ དཔྱད་ ནས་ སུ་ ། ། བདེན་པ་ -འི་ ངག་ ལས་", 81 | ] 82 | assert norm_sentences == expected 83 | 84 | 85 | def test_par_tokenizer(tokens): 86 | pars = paragraph_tokenizer(tokens) 87 | 88 | out = ["".join([word.text for word in p[1]]) for p in pars] 89 | expected = [ 90 | "བཀུར་བར་མི་འགྱུར་ཞིང༌། །བརྙས་བཅོས་མི་སྙན་རྗོད་པར་བྱེད། །དབང་དང་འབྱོར་པ་ལྡན་པ་ཡི། །རྒྱལ་རིགས་ཕལ་ཆེར་བག་མེད་པས། །" 91 | "མྱོས་པའི་གླང་ཆེན་བཞིན་དུ་འཁྱམས། །དེ་ཡི་འཁོར་ཀྱང་དེ་འདྲར་འགྱུར། །གཞན་ཡང་རྒྱལ་པོ་རྒྱལ་རིགས་ཀྱི། །སྤྱོད་པ་བཟང་ངན་ཅི་འདྲ་བ། །" 92 | "དེ་འདྲའི་ཚུལ་ལ་བལྟས་ནས་སུ། །འབངས་རྣམས་དེ་དང་དེ་འདྲ་སྟེ། །རྒྱལ་པོ་ནོར་ལ་བརྐམས་གྱུར་ན། །ནོར་གྱིས་རྒྱལ་ཁྲིམས་བསླུ་བར་རྩོམ། །" 93 | "མི་བདག་གཡེམ་ལ་དགའ་གྱུར་ན། །འཕྱོན་མའི་ཚོགས་རྣམས་མགོ་འཕང་མཐོ། །ཕྲ་མར་ཉན་ན་དབྱེན་གྱིས་གཏོར། །བརྟག་དཔྱད་མི་ཤེས་རྫུན་གྱིས་སླུ། །" 94 | "ང་ལོ་ཡང་ན་ཀུན་གྱིས་བསྐྱོད། །ངོ་དགར་བརྩི་ན་ཟོལ་ཚིག་སྨྲ། །དེ་དང་དེ་ལ་སོགས་པ་ཡི། །མི་བདག་དེ་ལ་གང་གང་གིས། །བསླུ་བར་རུང་བའི་སྐབས་མཐོང་ན། །", 95 | "གཡོན་ཅན་ཚོགས་ཀྱིས་ཐབས་དེ་སེམས། །མི་རྣམས་རང་འདོད་སྣ་ཚོགས་ལ། །རྒྱལ་པོ་ཀུན་གྱི་ཐུན་མོང་ཕྱིར། །རྒྱལ་པོས་བསམ་གཞིགས་མ་བྱས་ན། །" 96 | "ཐ་མར་རྒྱལ་སྲིད་འཇིག་པར་འགྱུར། །ཆེན་པོའི་གོ་སར་གནས་པ་ལ། །སྐྱོན་ཀྱང་ཡོན་ཏན་ཡིན་ཚུལ་དུ། །འཁོར་ངན་རྣམས་ཀྱིས་ངོ་བསྟོད་སྨྲ། །" 97 | "དེ་ཕྱིར་སྐྱོན་ཡོན་ཤེས་པ་དཀའ། །ལྷག་པར་རྩོད་ལྡན་སྙིགས་མའི་ཚེ། །འཁོར་གྱི་ནང་ན་མ་རབས་མང༌། །སྐྱོན་ཡང་ཡོན་ཏན་ལྟར་མཐོང་ལ། །" 98 | "རང་འདོད་ཆེ་ཞིང་རྒྱལ་པོ་བསླུ། །ཆུས་དང་འཁོར་གྱི་བདེ་ཐབས་ལ། །བསམ་གཞིགས་བྱེད་པ་དཀོན་པའི་ཕྱིར། །རྒྱལ་པོས་ལེགས་པར་དཔྱད་ནས་སུ། །བདེན་པའི་ངག་ལས་", 99 | ] 100 | assert out == expected 101 | -------------------------------------------------------------------------------- /tests/tokenizers/test_splitaffixed.py: 
-------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from botok import * 3 | 4 | 5 | def test_split_token(): 6 | config = Config.from_path("./tests/data/empty_dialect_pack") 7 | wt = WordTokenizer(config=config) 8 | wt.tok.trie.rebuild_trie() 9 | wt.tok.trie.inflect_n_modify_trie("བདེ་བ་") 10 | wt.tok.trie.inflect_n_add_data("བདེ་བ་\t\tNOUN") 11 | wt.tok.trie.inflect_n_modify_trie("གཏན་") 12 | wt.tok.trie.inflect_n_add_data("གཏན་\t\tNOUN") 13 | wt.tok.trie.inflect_n_modify_trie("གྱི་") 14 | wt.tok.trie.inflect_n_add_data("གྱི་\tགི\tPART") 15 | tokens = wt.tokenize("གཏན་གྱི་བདེ་བའི་རྒྱུ།", split_affixes=False) 16 | assert len(tokens) == 5 17 | assert tokens[2].text == "བདེ་བའི་" 18 | tokens = wt.tokenize("གཏན་གྱི་བདེ་བའི་རྒྱུ།") 19 | assert len(tokens) == 6 20 | assert tokens[2].text == "བདེ་བ" 21 | assert tokens[3].text == "འི་" 22 | -------------------------------------------------------------------------------- /tests/tokenizers/test_stack_tokenizer.py: -------------------------------------------------------------------------------- 1 | from botok.tokenizers.stacktokenizer import tokenize_in_stacks 2 | 3 | 4 | def test_stack_tokenizer(): 5 | """Test the stack tokenizer functionality.""" 6 | # Test with standard Tibetan text 7 | assert tokenize_in_stacks("ཀཿཐོག་འབྱམ་པའཱི་རོ།") == ["ཀ", "\u0f7f", "ཐོ", "ག", "་", "འ", "བྱ", "མ", "་", "པ", "འཱི", "་", "རོ", "།"] 8 | 9 | # Test with special character at the beginning 10 | assert tokenize_in_stacks("\u0f7fཀཿ") == ["\u0f7f", "ཀ", "\u0f7f"] 11 | 12 | # Test with empty string 13 | assert tokenize_in_stacks("") == [] 14 | 15 | # Test with single character 16 | assert tokenize_in_stacks("ཀ") == ["ཀ"] 17 | -------------------------------------------------------------------------------- /tests/tokenizers/test_token.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from textwrap import dedent 3 | from pytest import raises 4 | 5 | from botok import * 6 | 7 | 8 | def test_token(): 9 | t = Token() 10 | t.text = "test" 11 | # Token supports access to attributes in two ways (required for CQL found in third_party/cql.py) 12 | assert t.text == t["text"] 13 | assert t._ == t["_"] 14 | 15 | # setting existing attributes like dicts is supported 16 | attrs = {"pos": "NOUN", "freq": "123", "len": 4} 17 | for k, v in attrs.items(): 18 | t[k] = v 19 | assert str(t) == dedent( 20 | """\ 21 | text: "test" 22 | pos: NOUN 23 | freq: 123 24 | start: 0 25 | len: 4 26 | 27 | """ 28 | ) 29 | 30 | # raises an error when trying to add a new attribute 31 | with raises(AttributeError, match=r"Token objects don't have .* as attribute"): 32 | t["non_attr"] = "test" 33 | -------------------------------------------------------------------------------- /tests/tokenizers/test_tokenize.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from textwrap import dedent 3 | 4 | import pytest 5 | 6 | from botok import * 7 | 8 | 9 | @pytest.fixture(scope="module") 10 | def empty_config(): 11 | return Config.from_path("./tests/data/empty_dialect_pack") 12 | 13 | 14 | def test_tokenize(empty_config, wt): 15 | profile = "empty" 16 | config = empty_config 17 | tok = Tokenize(Trie(BoSyl, profile, config.dictionary, config.adjustments)) 18 | tok.trie.inflect_n_modify_trie("བཀྲ་ཤིས་") 19 | tok.trie.inflect_n_add_data("བཀྲ་ཤིས་\tNOUN\t\tབཀྲ་ཤིས་\t17500") 20 | tok.trie.inflect_n_modify_trie("མཐའ་") 21 | 
tok.trie.inflect_n_add_data("མཐའ་\tNOUN") 22 | in_str = "མཐའི་བཀྲ་ཤིས། ཀཀ abc མཐའི་རྒྱ་མཚོ་" 23 | preproc = TokChunks(in_str) 24 | preproc.serve_syls_to_trie() 25 | tokens = tok.tokenize(preproc) 26 | expected = dedent( 27 | """\ 28 | text: "བཀྲ་ཤིས" 29 | text_cleaned: "བཀྲ་ཤིས་" 30 | text_unaffixed: "བཀྲ་ཤིས་" 31 | syls: ["བཀྲ", "ཤིས"] 32 | senses: | pos: NOUN, freq: 17500, sense: བཀྲ་ཤིས་, affixed: False | 33 | char_types: |CONS|CONS|SUB_CONS|TSEK|CONS|VOW|CONS| 34 | chunk_type: TEXT 35 | syls_idx: [[0, 1, 2], [4, 5, 6]] 36 | syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}] 37 | start: 5 38 | len: 7 39 | 40 | """ 41 | ) 42 | str(tokens[0]) 43 | assert str(tokens[1]) == expected 44 | assert tokens[2].text == "། " 45 | assert tokens[2].chunk_type == "PUNCT" 46 | # add sense to བཀྲ་ཤིས་ 47 | wt.tok.trie.inflect_n_add_data("བཀྲ་ཤིས་\tNOUN\t\tབཀྲ་ཤིས་\t17500") 48 | tokens = wt.tokenize(in_str) 49 | expected = dedent( 50 | """\ 51 | text: "བཀྲ་ཤིས" 52 | text_cleaned: "བཀྲ་ཤིས་" 53 | text_unaffixed: "བཀྲ་ཤིས་" 54 | syls: ["བཀྲ", "ཤིས"] 55 | pos: NOUN 56 | lemma: བཀྲ་ཤིས་ 57 | sense: བཀྲ་ཤིས་ 58 | senses: | pos: NOUN, freq: 17204, affixed: False, lemma: བཀྲ་ཤིས་ | pos: NOUN, freq: 17500, sense: བཀྲ་ཤིས་, affixed: False, lemma: བཀྲ་ཤིས་ | 59 | char_types: |CONS|CONS|SUB_CONS|TSEK|CONS|VOW|CONS| 60 | chunk_type: TEXT 61 | freq: 17500 62 | syls_idx: [[0, 1, 2], [4, 5, 6]] 63 | syls_start_end: [{'start': 0, 'end': 4}, {'start': 4, 'end': 7}] 64 | start: 5 65 | len: 7 66 | 67 | """ 68 | ) 69 | assert str(tokens[2]) == expected 70 | 71 | 72 | def test_non_max2(empty_config): 73 | profile = "empty" 74 | config = empty_config 75 | tok = Tokenize(Trie(BoSyl, profile, config.dictionary, config.adjustments)) 76 | tok.trie.inflect_n_modify_trie("བཀྲ་ཤིས་") 77 | tok.trie.inflect_n_add_data("བཀྲ་ཤིས་\tNOUN") 78 | tok.trie.inflect_n_modify_trie( 79 | "བཀྲ་ཤིས་བདེ་ལེགས།" 80 | ) # to ensure we're not in a maximal match 81 | preproc = TokChunks("བཀྲ་ཤིས་བདེ་བཀྲ་") 82 | preproc.serve_syls_to_trie() 83 | tokens = tok.tokenize(preproc) 84 | assert tokens[0].text == "བཀྲ་ཤིས་" 85 | assert tokens[0]["senses"][0]["pos"] == "NOUN" 86 | assert tokens[1].text == "བདེ་" 87 | assert tokens[1]["senses"][0]["pos"] == "NON_WORD" 88 | assert tokens[2].text == "བཀྲ་" 89 | assert tokens[2]["senses"][0]["pos"] == "NO_POS" 90 | 91 | 92 | def test_non_max_end_of_string(empty_config): 93 | profile = "empty" 94 | config = empty_config 95 | tok = Tokenize(Trie(BoSyl, profile, config.dictionary, config.adjustments)) 96 | tok.trie.inflect_n_modify_trie("བཀྲ་ཤིས་") 97 | tok.trie.inflect_n_modify_trie( 98 | "བཀྲ་ཤིས་བདེ་ལེགས།" 99 | ) # to ensure we're not in a maximal match 100 | preproc = TokChunks("བཀྲ་ཤིས་བདེ་") 101 | preproc.serve_syls_to_trie() 102 | tokens = tok.tokenize(preproc) 103 | assert tokens[0].text == "བཀྲ་ཤིས་" 104 | assert tokens[1].text == "བདེ་" 105 | 106 | 107 | if __name__ == "__main__": 108 | test_non_max2() 109 | -------------------------------------------------------------------------------- /tests/tokenizers/test_wordtokenizer.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from textwrap import dedent 3 | 4 | from botok import * 5 | 6 | 7 | def test_get_default_lemma(wt): 8 | input_str = "བཀྲ་ཤིས་བདེ་ལེགས། མཐའི་རྒྱ་མཚོར་གནས་སོ།། །།ཀཀ" 9 | config = Config() 10 | profile = config.dialect_pack_path.name 11 | 12 | # reconstitute all the pieces that WordTokenizer gathers 13 | tok = Tokenize(Trie(BoSyl, profile, config.dictionary, 
config.adjustments)) 14 | preproc = TokChunks(input_str) 15 | preproc.serve_syls_to_trie() 16 | tokens = tok.tokenize(preproc) 17 | split_affixed(tokens) 18 | 19 | # if __get_default_lemma() is not run, only the lemmas coming from the lemma folder will be included 20 | # in the Token objects. 21 | assert str(tokens[3]) == dedent( 22 | """\ 23 | text: "མཐ" 24 | text_cleaned: "མཐ" 25 | text_unaffixed: "མཐ" 26 | syls: ["མཐ"] 27 | senses: | pos: NOUN, freq: 45097, affixed: True | 28 | char_types: |CONS|CONS| 29 | chunk_type: TEXT 30 | affix_host: True 31 | syls_idx: [[0, 1]] 32 | syls_start_end: [{'start': 0, 'end': 2}] 33 | start: 18 34 | len: 2 35 | 36 | """ 37 | ) 38 | assert "lemma" not in tokens[3]["senses"][0] 39 | 40 | assert str(tokens[4]) == dedent( 41 | """\ 42 | text: "འི་" 43 | text_cleaned: "འི་" 44 | text_unaffixed: "འི་" 45 | syls: ["འི"] 46 | pos: PART 47 | char_types: |CONS|VOW|TSEK| 48 | chunk_type: TEXT 49 | affix: True 50 | syls_idx: [[0, 1]] 51 | syls_start_end: [{'start': 2, 'end': 5}] 52 | start: 20 53 | len: 3 54 | 55 | """ 56 | ) 57 | 58 | # regular words also have no lemmas 59 | assert "lemma" not in tokens[0]["senses"][0] 60 | 61 | # doing the same thing using WordTokenizer, which will apply its __get_default_lemma() method 62 | # the profile is the same, so no lemma comes from the trie content files. 63 | tokens = wt.tokenize(input_str) 64 | 65 | # the lemma is Token.text_unaffixed with an extra འ and/or a tsek where required 66 | assert str(tokens[3]) == dedent( 67 | """\ 68 | text: "མཐ" 69 | text_cleaned: "མཐ" 70 | text_unaffixed: "མཐ" 71 | syls: ["མཐ"] 72 | pos: NOUN 73 | lemma: མཐའ་ 74 | senses: | pos: NOUN, freq: 45097, affixed: True, lemma: མཐའ་ | 75 | char_types: |CONS|CONS| 76 | chunk_type: TEXT 77 | freq: 45097 78 | affix_host: True 79 | syls_idx: [[0, 1]] 80 | syls_start_end: [{'start': 0, 'end': 2}] 81 | start: 18 82 | len: 2 83 | 84 | """ 85 | ) 86 | assert tokens[3]["senses"][0]["lemma"] == "མཐའ་" 87 | 88 | # for particles, WordTokenizer reads the lemmas from a file and attributes them 89 | assert str(tokens[4]) == dedent( 90 | """\ 91 | text: "འི་" 92 | text_cleaned: "འི་" 93 | text_unaffixed: "འི་" 94 | syls: ["འི"] 95 | pos: PART 96 | lemma: གི་ 97 | senses: | lemma: གི་ | 98 | char_types: |CONS|VOW|TSEK| 99 | chunk_type: TEXT 100 | affix: True 101 | syls_idx: [[0, 1]] 102 | syls_start_end: [{'start': 2, 'end': 5}] 103 | start: 20 104 | len: 3 105 | 106 | """ 107 | ) 108 | 109 | # for regular words, Token.text_unaffixed is simply copied 110 | assert tokens[0]["senses"][0]["lemma"] == "བཀྲ་ཤིས་" 111 | 112 | # non-words do not have lemmas 113 | assert "lemma" not in tokens[10]["senses"][0] 114 | assert tokens[10].text_cleaned == "ཀཀ་" 115 | assert tokens[10].text_unaffixed == "ཀཀ་" 116 | 117 | # Token objects whose chunk_type is not 'TEXT' will be attributed no lemma. 118 | # text_unaffixed and text_cleaned are also empty. 
Token.text must be retrieved 119 | assert tokens[2].text_unaffixed == "" == tokens[2].text_cleaned 120 | 121 | 122 | def test_spaces_as_punct(wt): 123 | input_str = "བ ཀྲ་ཤིས་ བདེ་ལེགས། \nམཐའི་རྒྱ་མཚོར་ག ནས་སོ།། །།ཀཀ" 124 | tokens = wt.tokenize(input_str, spaces_as_punct=True) 125 | assert tokens[0].text == "བ" 126 | assert tokens[1].text == " " 127 | assert tokens[2].text == "ཀྲ་" 128 | assert tokens[8].text == " \n" 129 | 130 | def test_particle_bug(wt): 131 | input_str = "བོད་གིས" 132 | tokens = wt.tokenize(input_str) 133 | assert tokens[1].pos == "PART" -------------------------------------------------------------------------------- /tests/tries/test_basictrie.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from botok import BasicTrie 3 | 4 | 5 | def test_trie(): 6 | trie = BasicTrie() 7 | 8 | # populate the basic trie 9 | words = "hello goo good goodbye help gerald gold tea ted team to too tom stan standard money" 10 | for w in words.split(): 11 | trie.add(w) 12 | 13 | # test word existence. has_word() is not used in pybo. it is only there for testing purposes 14 | assert trie.has_word("goodbye") == {"data": {"_": {}}, "exists": True} 15 | 16 | # add content to data 17 | trie.add_data("goodbye", {"pos": "NOUN"}) 18 | assert trie.has_word("goodbye") == { 19 | "exists": True, 20 | "data": {"_": {}, "senses": [{"pos": "NOUN"}]}, 21 | } 22 | 23 | # adding an empty dict to show it does not replace existing content but updates it 24 | trie.add_data("goodbye", {}) 25 | assert trie.has_word("goodbye") == { 26 | "exists": True, 27 | "data": {"_": {}, "senses": [{"pos": "NOUN"}]}, 28 | } 29 | 30 | # by default, overwrites existing dict values 31 | trie.add_data("goodbye", {"pos": "VERB", "lemma": "goodbye"}) 32 | assert trie.has_word("goodbye") == { 33 | "exists": True, 34 | "data": { 35 | "_": {}, 36 | "senses": [{"pos": "NOUN"}, {"pos": "VERB", "lemma": "goodbye"}], 37 | }, 38 | } 39 | 40 | # deactivates an entry, only modifying the Node.leaf value (bool) instead of removing it from the trie. 
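    # after deactivation, has_word() reports exists=False while the node's data (the senses added above) is left untouched.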
41 | trie.deactivate("goodbye") 42 | assert trie.has_word("goodbye") == { 43 | "exists": False, 44 | "data": { 45 | "_": {}, 46 | "senses": [{"pos": "NOUN"}, {"pos": "VERB", "lemma": "goodbye"}], 47 | }, 48 | } 49 | 50 | # reactivates the entry 51 | trie.deactivate("goodbye", rev=True) 52 | assert trie.has_word("goodbye") == { 53 | "exists": True, 54 | "data": { 55 | "_": {}, 56 | "senses": [{"pos": "NOUN"}, {"pos": "VERB", "lemma": "goodbye"}], 57 | }, 58 | } 59 | 60 | # walk() is used to externalize the walking of the trie 61 | current_node = None # setting an empty variable for the current node 62 | for char in "goodbye": 63 | current_node = trie.walk(char, current_node) 64 | 65 | assert current_node.label == "e" # last char of the word 66 | assert current_node.leaf is True # we reached the end of a word 67 | assert current_node.data == { 68 | "_": {}, 69 | "senses": [{"pos": "NOUN"}, {"pos": "VERB", "lemma": "goodbye"}], 70 | } 71 | -------------------------------------------------------------------------------- /tests/tries/test_trie.py: -------------------------------------------------------------------------------- 1 | # coding: utf8 2 | from collections import defaultdict 3 | from pathlib import Path 4 | 5 | from botok import BoSyl, Config, TokChunks, Trie 6 | 7 | config = Config() 8 | 9 | 10 | def syls(string): 11 | return TokChunks(string).get_syls() 12 | 13 | 14 | def test_createtrie(): 15 | profile = "empty" 16 | config = Config.from_path("./tests/data/trie_dialect_pack") 17 | bt = Trie(BoSyl, profile, config.dictionary, config.adjustments) 18 | 19 | # the trie works as expected. but the add() method should never be used directly: 20 | # it does not inflect entries, so the tokenizer won't work as expected. 21 | # be careful only to use it with words that can't ever be inflected, like case particles. 
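    # the assert below shows the consequence: after add("གྲུབ་མཐའ་") alone, the inflected form གྲུབ་མཐའི་ is not found.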
22 | bt.add(syls("གྲུབ་མཐའ་"), {"pos": "NOUN"}) 23 | assert bt.has_word(syls("གྲུབ་མཐའི་")) == {"exists": False, "data": {"_": {}}} 24 | 25 | # use inflect_n_modify_trie() instead, to add entries 26 | bt.inflect_n_modify_trie("གྲུབ་མཐའ་") 27 | assert bt.has_word(syls("གྲུབ་མཐའི་")) == { 28 | "exists": True, 29 | "data": {"_": {}, "affixation": {"len": 2, "type": "gi", "aa": True}}, 30 | } 31 | 32 | bt.inflect_n_modify_trie("ཀ་ར་", skrt=True) 33 | assert bt.has_word(syls("ཀ་རར་")) == { 34 | "exists": True, 35 | "data": { 36 | "_": {}, 37 | "affixation": {"len": 1, "type": "la", "aa": False}, 38 | "skrt": True, 39 | "senses": [{"lemma": "", "affixed": True}], 40 | }, 41 | } # arrives here because skrt was True 42 | 43 | bt.inflect_n_add_data( 44 | "གྲུབ་མཐའ་\t\t\t\t532" 45 | ) # 'freq' is hard-coded in Trie, just as 'lemma' and 'pos' are 46 | assert bt.has_word(syls("གྲུབ་མཐའི་")) == { 47 | "exists": True, 48 | "data": { 49 | "_": {}, 50 | "affixation": {"len": 2, "type": "gi", "aa": True}, 51 | "senses": [{"freq": 532, "affixed": True}], 52 | }, 53 | } # freq is an int 54 | 55 | # just like add() was not meant to be used directly, deactivate() is not 56 | # instead, use bt.inflect_n_modify_trie("word", deactivate=True) 57 | bt.deactivate(syls("ཀ་ར་")) 58 | assert ( 59 | bt.has_word(syls("ཀ་ར་"))["exists"] is False 60 | ) # since 'ཀ་ར་' has been deactivated 61 | 62 | 63 | def test_multiple_words_per_entry(): 64 | profile = "POS" 65 | config = Config.from_path("./tests/data/trie_dialect_pack") 66 | bt = Trie(BoSyl, profile, config.dictionary, config.adjustments) 67 | 68 | res = bt.has_word(syls("ལྟར་")) 69 | assert {"lemma": "ལྟ་", "pos": "VERB", "freq": 123, "affixed": True} in res["data"][ 70 | "senses" 71 | ] 72 | assert {"lemma": "ལྟར་", "pos": "ADV", "freq": 456, "affixed": False} in res[ 73 | "data" 74 | ]["senses"] 75 | -------------------------------------------------------------------------------- /usage.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from botok import WordTokenizer, Text, Config 4 | 5 | ########################################### 6 | in_str = "ལེ གས། བཀྲ་ཤིས་མཐའི་ ༆ ཤི་བཀྲ་ཤིས་ tr བདེ་་ལེ གས། བཀྲ་ཤིས་བདེ་ལེགས་༡༢༣ཀཀ། མཐའི་རྒྱ་མཚོར་གནས་པའི་ཉས་ཆུ་འཐུང་།། །།མཁའ།" 7 | WT = WordTokenizer() 8 | tokens = WT.tokenize(in_str) 9 | 10 | in_str = "ལ་པོ་ལ་པོ་ལ་པོ་" 11 | t = Text(in_str, tok_params={"config": Config()}) 12 | tokens = t.tokenize_words_raw_text 13 | tt = Text( 14 | in_str, tok_params={"config": Config.from_path("./tests/data/trie_dialect_pack")}, 15 | ) 16 | ttokens = tt.tokenize_words_raw_text 17 | print(tokens) 18 | print(ttokens) 19 | ########################################### 20 | 21 | # 22 | # ### Extract token-string / POS pairs ######## 23 | # 24 | # tagged = ['"{}"/{}'.format(w.text, w.pos) for w in tokens] 25 | # print(', '.join(tagged)) 26 | # 27 | # 28 | # ### Extract the cleaned version of the tokens 29 | # 30 | # cleaned = [w.text_cleaned for w in tokens if w.text_cleaned] 31 | # print(' '.join(cleaned)) 32 | --------------------------------------------------------------------------------
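To complement usage.py, which only exercises WordTokenizer and Text, here is a small sketch of the sentence- and paragraph-level helpers covered by tests/text/test_text_tokenize.py. It relies only on the call signatures and output shapes those tests assert (sentence_tok returns dicts carrying a 'tokens' list, paragraph_tok returns (index, tokens) tuples); treat it as an illustration under those assumptions rather than code taken from the repository.

# Sketch: sentence- and paragraph-level tokenization, mirroring tests/text/test_text_tokenize.py.
from botok.config import Config
from botok.text.tokenize import paragraph_tok, sentence_tok, word_tok

config = Config()
text = "ཀཿཐོག་འབྱམ་པའཱི་རོ། འདི་ནི་ཚིག་གྲུབ་གཉིས་པ་ཡིན།"

words = word_tok(text, config=config)            # a list of Token objects
sentences = sentence_tok(text, config=config)    # a list of dicts, each holding a 'tokens' list
paragraphs = paragraph_tok(text, config=config)  # a list of (paragraph_index, tokens) tuples

print(len(words))
print("".join(t.text for t in sentences[0]["tokens"]))
for idx, toks in paragraphs:
    print(idx, "".join(t.text for t in toks))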