├── requirements.txt ├── MANIFEST.in ├── docs ├── reference │ ├── textprocessor.md │ ├── lstm.md │ ├── bert.md │ └── g2p.md ├── contributing.md ├── index.md └── algorithm.md ├── requirements_test.txt ├── g2p_id ├── models │ ├── bert │ │ ├── bert_mlm.onnx │ │ ├── config.json │ │ └── token2id.json │ └── lstm │ │ ├── decoder_model.onnx │ │ ├── encoder_model.onnx │ │ ├── config.json │ │ ├── g2id.json │ │ └── p2id.json ├── resources │ ├── id_posp_tagger.pickle │ ├── timezones.tsv │ ├── currency.tsv │ ├── measurements.tsv │ └── homographs_id.tsv ├── __init__.py ├── onnx_utils.py ├── bert.py ├── lstm.py ├── g2p.py └── text_processor.py ├── NOTICE.md ├── tests ├── conftest.py ├── test_text_processor.py └── test_g2p.py ├── .github └── workflows │ ├── docs.yml │ └── tests.yml ├── tox.ini ├── setup.py ├── mkdocs.yml ├── PROJECT_CHARTER.md ├── .gitignore ├── CONTRIBUTING.md ├── README.md ├── CODE_OF_CONDUCT.md └── LICENSE.md /requirements.txt: -------------------------------------------------------------------------------- 1 | num2words 2 | nltk==3.9.1 3 | onnxruntime -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | include g2p_id/resources/* 3 | include g2p_id/models/*/* -------------------------------------------------------------------------------- /docs/reference/textprocessor.md: -------------------------------------------------------------------------------- 1 | # TextProcessor 2 | 3 | ::: g2p_id.text_processor.TextProcessor 4 | -------------------------------------------------------------------------------- /requirements_test.txt: -------------------------------------------------------------------------------- 1 | importlib_metadata<5 2 | flake8 3 | tox 4 | pytest 5 | pytest-cov 6 | mypy 7 | pylint -------------------------------------------------------------------------------- /g2p_id/models/bert/bert_mlm.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bookbot-kids/g2p_id/HEAD/g2p_id/models/bert/bert_mlm.onnx -------------------------------------------------------------------------------- /g2p_id/models/bert/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "mask_token": "[mask]", 3 | "pad_token": "", 4 | "max_seq_length": 32 5 | } 6 | -------------------------------------------------------------------------------- /g2p_id/models/lstm/decoder_model.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bookbot-kids/g2p_id/HEAD/g2p_id/models/lstm/decoder_model.onnx -------------------------------------------------------------------------------- /g2p_id/models/lstm/encoder_model.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bookbot-kids/g2p_id/HEAD/g2p_id/models/lstm/encoder_model.onnx -------------------------------------------------------------------------------- /g2p_id/resources/id_posp_tagger.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bookbot-kids/g2p_id/HEAD/g2p_id/resources/id_posp_tagger.pickle -------------------------------------------------------------------------------- /g2p_id/resources/timezones.tsv: -------------------------------------------------------------------------------- 1 | 
WITA Waktu Indonesia Tengah 2 | WIB Waktu Indonesia Barat 3 | WIT Waktu Indonesia Timur 4 | GMT Greenwich Mean Time 5 | -------------------------------------------------------------------------------- /NOTICE.md: -------------------------------------------------------------------------------- 1 | g2p ID 2 | Copyright 2023 [PT BOOKBOT INDONESIA](https://bookbot.id/) 3 | 4 | This product includes software developed at 5 | PT BOOKBOT INDONESIA (https://bookbot.id/). 6 | -------------------------------------------------------------------------------- /g2p_id/models/lstm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "latent_dim": 256, 3 | "bos_token": "\t", 4 | "eos_token": "\n", 5 | "pad_token": " ", 6 | "num_encoder_tokens": 28, 7 | "num_decoder_tokens": 32, 8 | "max_encoder_seq_length": 24, 9 | "max_decoder_seq_length": 25 10 | } 11 | -------------------------------------------------------------------------------- /docs/reference/lstm.md: -------------------------------------------------------------------------------- 1 | # LSTM 2 | 3 | ::: g2p_id.lstm.LSTM 4 | 5 | ## Usage 6 | 7 | ```py 8 | texts = ["mengembangkannya", "merdeka", "pecel", "lele"] 9 | lstm = LSTM() 10 | for text in texts: 11 | print(lstm.predict(text)) 12 | ``` 13 | 14 | ```py 15 | >> məŋəmbaŋkanɲa 16 | >> mərdeka 17 | >> pətʃəl 18 | >> lele 19 | ``` -------------------------------------------------------------------------------- /docs/reference/bert.md: -------------------------------------------------------------------------------- 1 | # BERT 2 | 3 | ::: g2p_id.bert.BERT 4 | 5 | ## Usage 6 | 7 | ```py 8 | texts = ["mengembangkannya", "merdeka", "pecel", "lele"] 9 | bert = BERT() 10 | for text in texts: 11 | print(bert.predict(text)) 12 | ``` 13 | 14 | ```py 15 | >> məngəmbangkannya 16 | >> mərdeka 17 | >> pəcel 18 | >> lele 19 | ``` 20 | -------------------------------------------------------------------------------- /g2p_id/models/lstm/g2id.json: -------------------------------------------------------------------------------- 1 | { 2 | " ": 27, 3 | "'": 0, 4 | "-": 1, 5 | "a": 2, 6 | "b": 3, 7 | "c": 4, 8 | "d": 5, 9 | "e": 6, 10 | "f": 7, 11 | "g": 8, 12 | "h": 9, 13 | "i": 10, 14 | "j": 11, 15 | "k": 12, 16 | "l": 13, 17 | "m": 14, 18 | "n": 15, 19 | "o": 16, 20 | "p": 17, 21 | "q": 18, 22 | "r": 19, 23 | "s": 20, 24 | "t": 21, 25 | "u": 22, 26 | "v": 23, 27 | "w": 24, 28 | "y": 25, 29 | "z": 26 30 | } 31 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from g2p_id import BERT, LSTM, G2p, TextProcessor 4 | 5 | 6 | @pytest.fixture(scope="session") 7 | def g2p(): 8 | return G2p() 9 | 10 | 11 | @pytest.fixture(scope="session") 12 | def lstm(): 13 | return LSTM() 14 | 15 | 16 | @pytest.fixture(scope="session") 17 | def bert(): 18 | return BERT() 19 | 20 | 21 | @pytest.fixture(scope="session") 22 | def text_processor(): 23 | return TextProcessor() 24 | -------------------------------------------------------------------------------- /g2p_id/models/bert/token2id.json: -------------------------------------------------------------------------------- 1 | { 2 | "": 0, 3 | "'": 28, 4 | "-": 26, 5 | "[UNK]": 1, 6 | "[mask]": 30, 7 | "a": 2, 8 | "b": 13, 9 | "c": 20, 10 | "d": 16, 11 | "e": 18, 12 | "f": 24, 13 | "g": 11, 14 | "h": 19, 15 | "i": 5, 16 | "j": 22, 17 | "k": 7, 18 | "l": 15, 19 | "m": 8, 20 | "n": 3, 21 | 
"o": 17, 22 | "p": 14, 23 | "q": 29, 24 | "r": 6, 25 | "s": 12, 26 | "t": 9, 27 | "u": 10, 28 | "v": 25, 29 | "w": 23, 30 | "y": 21, 31 | "z": 27, 32 | "ə": 4 33 | } 34 | -------------------------------------------------------------------------------- /g2p_id/models/lstm/p2id.json: -------------------------------------------------------------------------------- 1 | { 2 | "\t": 0, 3 | "\n": 1, 4 | " ": 31, 5 | "-": 2, 6 | "a": 3, 7 | "b": 4, 8 | "d": 5, 9 | "e": 6, 10 | "f": 7, 11 | "g": 8, 12 | "h": 9, 13 | "i": 10, 14 | "j": 11, 15 | "k": 12, 16 | "l": 13, 17 | "m": 14, 18 | "n": 15, 19 | "o": 16, 20 | "p": 17, 21 | "r": 18, 22 | "s": 19, 23 | "t": 20, 24 | "u": 21, 25 | "v": 22, 26 | "w": 23, 27 | "z": 24, 28 | "ŋ": 25, 29 | "ə": 26, 30 | "ɲ": 27, 31 | "ʃ": 28, 32 | "ʒ": 29, 33 | "ʔ": 30 34 | } 35 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Deploy docs to Github Pages 2 | on: 3 | push: 4 | branches: 5 | - main 6 | jobs: 7 | deploy: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout master branch 11 | uses: actions/checkout@v2 12 | - name: Setup Python 13 | uses: actions/setup-python@v2 14 | with: 15 | python-version: 3.9 16 | - name: Install dependencies 17 | run: pip3 install mkdocs-material mkdocstrings mkdocstrings-python-legacy 18 | - name: Install package 19 | run: pip3 install . 20 | - name: Deploy docs 21 | run: mkdocs gh-deploy --force 22 | -------------------------------------------------------------------------------- /g2p_id/resources/currency.tsv: -------------------------------------------------------------------------------- 1 | US$ dollar amerika serikat 2 | nzd dollar new zealand 3 | rs rupee 4 | chf franc swiss 5 | dkk kroner denmark 6 | fim markka finland 7 | aed dirham arab 8 | czk koruna ceko 9 | mro ouguiya mauritania 10 | pkr rupee pakistan 11 | crc colon costa rica 12 | hk$ dollar hong kong 13 | npr rupee nepal 14 | awg florin aruban 15 | nok kroner norwegia 16 | tzs shilling tanzania 17 | sek kronor swedish 18 | cyp pounds cypriot 19 | sar riyal saudi 20 | cve escudo cape verde 21 | rsd dinar serbia 22 | dm mark jerman 23 | shp pounds saint helena 24 | php peso philipina 25 | cad dollar canada 26 | ssp pounds sudan selatan 27 | scr rupee seychell 28 | mvr rufiyaa maldivia 29 | Rp rupiah 30 | r real 31 | $ dollar 32 | € euro 33 | £ pounds 34 | ₩ won 35 | ¥ yen -------------------------------------------------------------------------------- /docs/reference/g2p.md: -------------------------------------------------------------------------------- 1 | # G2p 2 | 3 | ::: g2p_id.g2p.G2p 4 | 5 | ## Usage 6 | 7 | ```py 8 | texts = [ 9 | "Apel itu berwarna merah.", 10 | "Rahel bersekolah di Jakarta.", 11 | "Mereka sedang bermain bola di lapangan.", 12 | ] 13 | g2p = G2p(model_type="BERT") 14 | for text in texts: 15 | print(g2p(text)) 16 | ``` 17 | 18 | ```py 19 | >> [['a', 'p', 'ə', 'l'], ['i', 't', 'u'], ['b', 'ə', 'r', 'w', 'a', 'r', 'n', 'a'], ['m', 'e', 'r', 'a', 'h'], ['.']] 20 | >> [['r', 'a', 'h', 'e', 'l'], ['b', 'ə', 'r', 's', 'ə', 'k', 'o', 'l', 'a', 'h'], ['d', 'i'], ['dʒ', 'a', 'k', 'a', 'r', 't', 'a'], ['.']] 21 | >> [['m', 'ə', 'r', 'e', 'k', 'a'], ['s', 'ə', 'd', 'a', 'ŋ'], ['b', 'ə', 'r', 'm', 'a', 'i', 'n'], ['b', 'o', 'l', 'a'], ['d', 'i'], ['l', 'a', 'p', 'a', 'ŋ', 'a', 'n'], ['.']] 22 | ``` -------------------------------------------------------------------------------- 
/.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | jobs: 11 | test: 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | matrix: 15 | os: [ubuntu-latest, windows-latest] 16 | python-version: ["3.8", "3.9"] 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install tox tox-gh-actions 28 | - name: Install package 29 | run: pip install . 30 | - name: Test with tox 31 | run: tox 32 | - name: Upload coverage reports to Codecov 33 | uses: codecov/codecov-action@v3 34 | -------------------------------------------------------------------------------- /g2p_id/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 [PT BOOKBOT INDONESIA](https://bookbot.id/) 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | from .bert import BERT 18 | from .g2p import G2p 19 | from .lstm import LSTM 20 | from .onnx_utils import WrapInferenceSession 21 | from .text_processor import TextProcessor 22 | 23 | __version__ = "0.4.2" 24 | __all__ = ["G2p", "LSTM", "BERT", "WrapInferenceSession", "TextProcessor"] 25 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | minversion = 3.8.0 3 | envlist = python3.8, python3.9, flake8, mypy 4 | isolated_build = true 5 | 6 | [gh-actions] 7 | python = 8 | 3.8: python3.8, flake8, mypy, pylint 9 | 3.9: python3.9, flake8, mypy, pylint 10 | 11 | [testenv] 12 | setenv = 13 | PYTHONPATH = {toxinidir} 14 | deps = 15 | -r{toxinidir}/requirements.txt 16 | -r{toxinidir}/requirements_test.txt 17 | commands = 18 | coverage erase 19 | coverage run --branch -m pytest 20 | coverage report 21 | coverage xml -i -o coverage.xml 22 | flake8 g2p_id tests 23 | mypy g2p_id --ignore-missing-imports 24 | pylint --rcfile=tox.ini g2p_id 25 | 26 | [flake8] 27 | extend-ignore = E203 28 | max-line-length = 120 29 | 30 | [pylint] 31 | ; R0902: Too many instance attribute 32 | ; R0903: Too few public methods 33 | ; R0914: Too many local variables 34 | disable = 35 | R0902, 36 | R0903, 37 | R0914 38 | max-line-length = 120 39 | 40 | [coverage:run] 41 | source=g2p_id 42 | 43 | [coverage:report] 44 | exclude_lines = 45 | except -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | from pathlib import Path 3 | 4 | this_path = Path(__file__).parent 5 | 6 | readme_path = this_path / "README.md" 7 | requirements_path = 
this_path / "requirements.txt" 8 | 9 | long_description = readme_path.read_text(encoding="utf-8") 10 | 11 | with open(requirements_path, "r", encoding="utf-8") as requirements_file: 12 | requirements = requirements_file.read().splitlines() 13 | 14 | if __name__ == "__main__": 15 | setup( 16 | name="g2p_id_py", 17 | version="0.4.2", 18 | description="Indonesian G2P.", 19 | long_description=long_description, 20 | long_description_content_type="text/markdown", 21 | author="w11wo", 22 | author_email="wilson@bookbotkids.com", 23 | url="https://github.com/bookbot-kids/g2p_id", 24 | license="Apache License", 25 | packages=find_packages(), 26 | install_requires=requirements, 27 | include_package_data=True, 28 | platforms=["linux", "unix", "windows"], 29 | python_requires=">=3.8", 30 | ) 31 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: g2p ID 2 | repo_url: https://github.com/bookbot-kids/g2p_id 3 | docs_dir: docs 4 | 5 | theme: 6 | name: material 7 | palette: 8 | - media: "(prefers-color-scheme: light)" 9 | scheme: default 10 | primary: indigo 11 | accent: indigo 12 | toggle: 13 | icon: material/weather-night 14 | name: Switch to dark mode 15 | - media: "(prefers-color-scheme: dark)" 16 | scheme: slate 17 | primary: red 18 | accent: red 19 | toggle: 20 | icon: material/weather-sunny 21 | name: Switch to light mode 22 | features: 23 | - navigation.sections 24 | 25 | plugins: 26 | - search 27 | - mkdocstrings: 28 | handlers: 29 | python: 30 | options: 31 | show_source: true 32 | show_root_heading: true 33 | heading_level: 2 34 | 35 | markdown_extensions: 36 | - tables 37 | - pymdownx.highlight: 38 | anchor_linenums: true 39 | - pymdownx.inlinehilite 40 | - pymdownx.snippets 41 | - pymdownx.superfences 42 | - def_list 43 | - pymdownx.tasklist: 44 | custom_checkbox: true 45 | 46 | watch: 47 | - g2p_id 48 | -------------------------------------------------------------------------------- /tests/test_text_processor.py: -------------------------------------------------------------------------------- 1 | def test_text_processor(text_processor): 2 | # URLs 3 | assert text_processor.normalize("Situs: https://www.google.com") == "Situs: " 4 | # measurements 5 | assert ( 6 | text_processor.normalize("123,1 kg") 7 | == "seratus dua puluh tiga koma satu kilogram" 8 | ) 9 | assert text_processor.normalize("500 cm") == "lima ratus centimeter" 10 | # currency/money 11 | assert text_processor.normalize("$100") == "seratus dollar" 12 | assert text_processor.normalize("Rp 3,000,000") == "tiga juta rupiah" 13 | # dates 14 | assert ( 15 | text_processor.normalize("(17/8/1945)").strip() 16 | == "tujuh belas Agustus seribu sembilan ratus empat puluh lima" 17 | ) 18 | assert text_processor.normalize("(1/13)").strip() == "satu Januari" 19 | # time/time zone 20 | assert ( 21 | text_processor.normalize("19.45 WIB") 22 | == "sembilan belas lewat empat puluh lima menit Waktu Indonesia Barat" 23 | ) 24 | assert ( 25 | text_processor.normalize("19.00 WIB") == "sembilan belas Waktu Indonesia Barat" 26 | ) 27 | # numerics 28 | assert text_processor.normalize("105.000") == "seratus lima ribu" 29 | assert text_processor.normalize("0,5") == "nol koma lima" 30 | -------------------------------------------------------------------------------- /PROJECT_CHARTER.md: -------------------------------------------------------------------------------- 1 | # Project Charter 2 | 3 | ## Vision statement 4 
| Literacy is fundamental, not only for our personal and social development, but also for our ability to function effectively in society. Our vision at Bookbot is that every child should have the opportunity to develop their reading, writing and communication skills to create a happy and successful life. 5 | 6 | 7 | ## Mission statement 8 | Deliver the Bookbot app, which combines speech recognition with a scientifically designed reading program, to help school children achieve greater literacy and to provide better tools for educators to monitor a child’s reading progress. 9 | 10 | 11 | ## Community (Impact) statement 12 | 13 | Bookbot is founded on the grounds of building a community of learners. Members of the Bookbot community consist of software developers, educators, students, writers, editors, linguists, people with disabilities, and more. We exist to ensure that every child, regardless of their situation, is able to develop their literacy skills. 14 | 15 | 16 | ## Licensing strategy 17 | Open source (Creative Commons), with a reseller model for the app. Parts of the code are licensed under Apache 2.0. 18 | 19 | ## Identification of key trademarks 20 | No key trademarks 21 | -------------------------------------------------------------------------------- /g2p_id/onnx_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 [PT BOOKBOT INDONESIA](https://bookbot.id/) 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | import onnxruntime as ort 18 | 19 | 20 | class WrapInferenceSession: 21 | """Wrapper class for serializing ONNX InferenceSession objects. 22 | Based on: https://github.com/microsoft/onnxruntime/pull/800#issuecomment-844326099 23 | """ 24 | 25 | def __init__(self, onnx_bytes, sess_options=None, providers=None): 26 | self.sess = ort.InferenceSession(onnx_bytes, sess_options=sess_options, providers=providers) 27 | self.onnx_bytes = onnx_bytes 28 | self.providers = providers 29 | 30 | def run(self, *args): 31 | """Wrapper for ONNX InferenceSession run method. 32 | 33 | Returns: 34 | Any: Inference result.
35 | """ 36 | return self.sess.run(*args) 37 | 38 | def __getstate__(self): 39 | return {"onnx_bytes": self.onnx_bytes} 40 | 41 | def __setstate__(self, values): 42 | self.onnx_bytes = values["onnx_bytes"] 43 | self.providers = values.get("providers", None) 44 | self.sess = ort.InferenceSession(self.onnx_bytes, self.providers) 45 | -------------------------------------------------------------------------------- /g2p_id/resources/measurements.tsv: -------------------------------------------------------------------------------- 1 | sq mi mil kuadrat 2 | sq ft kaki kuadrat 3 | kbps kilobit per detik 4 | mbps megabit per detik 5 | kcal kilo kalori 6 | ghz gigahertz 7 | khz kilohertz 8 | mhz megahertz 9 | lbs pound 10 | rpm revolution per menit 11 | kwh kilo watt jam 12 | min menit 13 | mph mil per jam 14 | mol mol 15 | gpa giga pascal 16 | km² kilometer kuadrat 17 | km2 kilometer kuadrat 18 | rad radian 19 | kgf kilogram force 20 | mm² millimeter kuadrat 21 | mm2 millimeter kuadrat 22 | cm² centimeter kuadrat 23 | cm2 centimeter kuadrat 24 | dm³ desimeter kubik 25 | dm3 desimeter kubik 26 | amu atomic mass unit 27 | gwh giga watt jam 28 | kpa kilopascal 29 | cwt hundredweight 30 | atm atmosphere 31 | bar bar 32 | km kilometer 33 | cm centimeter 34 | mm millimeter 35 | ha hectare 36 | mi mil 37 | m² meter kuadrat 38 | m2 meter kuadrat 39 | ft kaki 40 | hz hertz 41 | kw kilowatt 42 | hp tenaga kuda 43 | mg milligram 44 | kg kilogram 45 | lb pound 46 | mc mega coulomb 47 | nm nanometer 48 | mA milli ampere 49 | m³ meter kubik 50 | m3 meter kubik 51 | tw tera watt 52 | mv milli volt 53 | mw megawatt 54 | μm mikrometer 55 | " inch 56 | TB terabyte 57 | cc c c 58 | da dalton 59 | db desibel 60 | ps peta detik 61 | oz ounce 62 | hl hecto liter 63 | μg mikrogram 64 | pg petagram 65 | GB gigabyte 66 | kb kilobit 67 | ev electron volt 68 | MB megabyte 69 | KB kilobyte 70 | kl kilo liter 71 | tj tera joule 72 | kv kilo volt 73 | mv mega volt 74 | kn kilonewton 75 | mm megameter 76 | au astronomical unit 77 | yd yard 78 | lm lumen 79 | hs hecto detik 80 | ml milliliter 81 | gw gigawatt 82 | ma mega ampere 83 | kt knot 84 | ng nano gram 85 | ns nano detik 86 | ms mega siemens 87 | gl giga liter 88 | μs mikro detik 89 | da desi ampere 90 | pa pascal 91 | ds desi detik 92 | ms milli detik 93 | dm desimeter 94 | mb megabit 95 | mf mega farad 96 | bq becquerel 97 | pb petabit 98 | cd candela 99 | tl tera liter 100 | ms mega detik 101 | mpa megapascal 102 | pb peta byte 103 | gy gray 104 | sv sievert 105 | cc c c 106 | °F derajat fahrenheit 107 | °f derajat fahrenheit 108 | °C derajat celsius 109 | °c derajat celsius 110 | m meter 111 | % percent 112 | v volt 113 | h jam 114 | g gram 115 | s detik 116 | ω ohm -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | Hi there! Thanks for taking your time to contribute! 3 | 4 | We welcome everyone to contribute and we value each contribution, even the smallest ones! We want to make contributing to this project as easy and transparent as possible, whether it's: 5 | 6 | - Reporting a bug 7 | - Discussing the current state of the code 8 | - Submitting a fix 9 | - Proposing new features 10 | - Becoming a maintainer 11 | 12 | ## Code of Conduct 13 | 14 | Please be mindful to respect our [Code of Conduct](https://github.com/bookbot-kids/g2p_id/blob/main/CODE_OF_CONDUCT.md). 15 | 16 | ## We Develop with Github 17 | We use github to host code, to track issues and feature requests, as well as accept pull requests. 18 | 19 | ## We Use Github, So All Code Changes Happen Through Pull Requests 20 | Pull requests are the best way to propose changes to the codebase. We actively welcome your pull requests: 21 | 22 | 1. Fork the repo and create your branch from `main`. 23 | 2. If you've added code that should be tested, add tests. 24 | 3. If you've changed APIs, update the documentation. 25 | 4. Ensure the test suite passes. 26 | 5. Make sure your code lints. 27 | 6. Issue that pull request! 28 | 29 | ## Any contributions you make will be under the Apache 2.0 License 30 | In short, when you submit code changes, your submissions are understood to be under the same [Apache 2.0 License](https://www.apache.org/licenses/LICENSE-2.0) that covers the project. Feel free to contact the maintainers if that's a concern. 
31 | 32 | ## Report bugs using Github's [issues](https://github.com/bookbot-kids/g2p_id/issues) 33 | We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/bookbot-kids/g2p_id/issues/new). 34 | 35 | ## Write bug reports with detail, background, and sample code 36 | [This is an example](http://stackoverflow.com/q/12488905/180626) of a good and thorough bug report. 37 | 38 | **Great Bug Reports** tend to have: 39 | 40 | - A quick summary and/or background 41 | - Steps to reproduce 42 | - Be specific! 43 | - Give sample code if you can. 44 | - What you expected would happen 45 | - What actually happens 46 | - Notes (possibly including why you think this might be happening, or stuff you tried that didn't work) 47 | 48 | ## License 49 | By contributing, you agree that your contributions will be licensed under its Apache 2.0 License. 50 | 51 | ## References 52 | This document was adapted from the open-source contribution guidelines for [Facebook's Draft](https://github.com/facebook/draft-js/blob/a9316a723f9e918afde44dea68b5f9f39b7d9b00/CONTRIBUTING.md) -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to g2p ID 2 | Hi there! Thanks for taking your time to contribute! 3 | 4 | We welcome everyone to contribute and we value each contribution, even the smallest ones! We want to make contributing to this project as easy and transparent as possible, whether it's: 5 | 6 | - Reporting a bug 7 | - Discussing the current state of the code 8 | - Submitting a fix 9 | - Proposing new features 10 | - Becoming a maintainer 11 | 12 | ## Code of Conduct 13 | 14 | Please be mindful to respect our [Code of Conduct](https://github.com/bookbot-kids/g2p_id/blob/main/CODE_OF_CONDUCT.md). 15 | 16 | ## We Develop with Github 17 | We use github to host code, to track issues and feature requests, as well as accept pull requests. 18 | 19 | ## We Use Github, So All Code Changes Happen Through Pull Requests 20 | Pull requests are the best way to propose changes to the codebase. We actively welcome your pull requests: 21 | 22 | 1. Fork the repo and create your branch from `main`. 23 | 2. If you've added code that should be tested, add tests. 24 | 3. If you've changed APIs, update the documentation. 25 | 4. Ensure the test suite passes. 26 | 5. Make sure your code lints. 27 | 6. Issue that pull request! 28 | 29 | ## Any contributions you make will be under the Apache 2.0 License 30 | In short, when you submit code changes, your submissions are understood to be under the same [Apache 2.0 License](https://www.apache.org/licenses/LICENSE-2.0) that covers the project. Feel free to contact the maintainers if that's a concern. 31 | 32 | ## Report bugs using Github's [issues](https://github.com/bookbot-kids/g2p_id/issues) 33 | We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/bookbot-kids/g2p_id/issues/new). 34 | 35 | ## Write bug reports with detail, background, and sample code 36 | [This is an example](http://stackoverflow.com/q/12488905/180626) of a good and thorough bug report. 37 | 38 | **Great Bug Reports** tend to have: 39 | 40 | - A quick summary and/or background 41 | - Steps to reproduce 42 | - Be specific! 43 | - Give sample code if you can. 
44 | - What you expected would happen 45 | - What actually happens 46 | - Notes (possibly including why you think this might be happening, or stuff you tried that didn't work) 47 | 48 | ## License 49 | By contributing, you agree that your contributions will be licensed under its Apache 2.0 License. 50 | 51 | ## References 52 | This document was adapted from the open-source contribution guidelines for [Facebook's Draft](https://github.com/facebook/draft-js/blob/a9316a723f9e918afde44dea68b5f9f39b7d9b00/CONTRIBUTING.md) -------------------------------------------------------------------------------- /g2p_id/bert.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 [PT BOOKBOT INDONESIA](https://bookbot.id/) 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | import json 18 | import os 19 | 20 | import numpy as np 21 | import onnxruntime 22 | 23 | from g2p_id.onnx_utils import WrapInferenceSession 24 | 25 | model_path = os.path.join(os.path.dirname(__file__), "models", "bert") 26 | 27 | 28 | class BERT: 29 | """Phoneme-level BERT model for predicting the correct phoneme for the letter `e`. 30 | Trained with [Keras](https://keras.io/examples/nlp/masked_language_modeling/), 31 | and exported to ONNX. ONNX Runtime engine used during inference. 32 | """ 33 | 34 | def __init__(self): 35 | bert_model_path = os.path.join(model_path, "bert_mlm.onnx") 36 | token2id = os.path.join(model_path, "token2id.json") 37 | config_path = os.path.join(model_path, "config.json") 38 | self.model = WrapInferenceSession(bert_model_path, providers=onnxruntime.get_available_providers()) 39 | with open(config_path, encoding="utf-8") as file: 40 | self.config = json.load(file) 41 | with open(token2id, encoding="utf-8") as file: 42 | self.token2id = json.load(file) 43 | self.id2token = {v: k for k, v in self.token2id.items()} 44 | 45 | def predict(self, text: str) -> str: 46 | """Performs BERT inference, predicting the correct phoneme for the letter `e`. 47 | 48 | Args: 49 | text (str): Word to predict from. 50 | 51 | Returns: 52 | str: Word after prediction. 
53 | """ 54 | # `x` is currently OOV, we replace with 55 | text = text.replace("x", "ks") 56 | # mask `e`'s 57 | text = " ".join([c if c != "e" else "[mask]" for c in text]) 58 | 59 | # tokenize and pad to max length 60 | tokens = [self.token2id[c] for c in text.split()] 61 | padding = [self.token2id[self.config["pad_token"]] for _ in range(self.config["max_seq_length"] - len(tokens))] 62 | tokens = tokens + padding 63 | 64 | input_ids = np.array([tokens], dtype="int64") 65 | inputs = {"input_1": input_ids} 66 | prediction = self.model.run(None, inputs) 67 | 68 | # find masked idx token 69 | mask_token_id = self.token2id[self.config["mask_token"]] 70 | masked_index = np.where(input_ids == mask_token_id)[1] 71 | 72 | # get prediction at masked indices 73 | mask_prediction = prediction[0][0][masked_index] 74 | predicted_ids = np.argmax(mask_prediction, axis=1) 75 | 76 | # replace mask with predicted token 77 | for i, idx in enumerate(masked_index): 78 | tokens[idx] = predicted_ids[i] 79 | 80 | return "".join([self.id2token[t] for t in tokens if t != 0]) 81 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Home 2 | 3 | ## g2p ID: Indonesian Grapheme-to-Phoneme Converter 4 | 5 |

6 | [Badges: GitHub, Documentation, GitHub release, Contributor Covenant, Tests, Code Coverage, chat on Discord, contributing guidelines] 30 |

31 | 32 | This library is developed to convert Indonesian (Bahasa Indonesia) graphemes (words) to phonemes in IPA. We followed the methods and designs used in the English equivalent library, [g2p](https://github.com/Kyubyong/g2p). 33 | 34 | ## Installation 35 | 36 | ```bash 37 | pip install g2p_id_py 38 | ``` 39 | 40 | ## How to Use 41 | 42 | ```py 43 | from g2p_id import G2p 44 | 45 | texts = [ 46 | "Apel itu berwarna merah.", 47 | "Rahel bersekolah di Jakarta.", 48 | "Mereka sedang bermain bola di lapangan.", 49 | ] 50 | 51 | g2p = G2p() 52 | for text in texts: 53 | print(g2p(text)) 54 | 55 | >> [['a', 'p', 'ə', 'l'], ['i', 't', 'u'], ['b', 'ə', 'r', 'w', 'a', 'r', 'n', 'a'], ['m', 'e', 'r', 'a', 'h'], ['.']] 56 | >> [['r', 'a', 'h', 'e', 'l'], ['b', 'ə', 'r', 's', 'ə', 'k', 'o', 'l', 'a', 'h'], ['d', 'i'], ['dʒ', 'a', 'k', 'a', 'r', 't', 'a'], ['.']] 57 | >> [['m', 'ə', 'r', 'e', 'k', 'a'], ['s', 'ə', 'd', 'a', 'ŋ'], ['b', 'ə', 'r', 'm', 'a', 'i', 'n'], ['b', 'o', 'l', 'a'], ['d', 'i'], ['l', 'a', 'p', 'a', 'ŋ', 'a', 'n'], ['.']] 58 | ``` 59 | 60 | ## References 61 | 62 | ```bib 63 | @misc{g2pE2019, 64 | author = {Park, Kyubyong & Kim, Jongseok}, 65 | title = {g2pE}, 66 | year = {2019}, 67 | publisher = {GitHub}, 68 | journal = {GitHub repository}, 69 | howpublished = {\url{https://github.com/Kyubyong/g2p}} 70 | } 71 | ``` 72 | 73 | ```bib 74 | @misc{TextProcessor2021, 75 | author = {Cahya Wirawan}, 76 | title = {Text Processor}, 77 | year = {2021}, 78 | publisher = {GitHub}, 79 | journal = {GitHub repository}, 80 | howpublished = {\url{https://github.com/cahya-wirawan/text_processor}} 81 | } 82 | ``` 83 | 84 | ## Contributors 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /tests/test_g2p.py: -------------------------------------------------------------------------------- 1 | def test_g2p(g2p): 2 | assert g2p("Apel itu berwarna merah.") == [ 3 | ["a", "p", "ə", "l"], 4 | ["i", "t", "u"], 5 | ["b", "ə", "r", "w", "a", "r", "n", "a"], 6 | ["m", "e", "r", "a", "h"], 7 | ["."], 8 | ] 9 | assert g2p("Rahel bersekolah di S M A Jakarta 17.") == [ 10 | ["r", "a", "h", "e", "l"], 11 | ["b", "ə", "r", "s", "ə", "k", "o", "l", "a", "h"], 12 | ["d", "i"], 13 | ["e", "s"], 14 | ["e", "m"], 15 | ["a"], 16 | ["dʒ", "a", "k", "a", "r", "t", "a"], 17 | ["t", "u", "dʒ", "u", "h"], 18 | ["b", "ə", "l", "a", "s"], 19 | ["."], 20 | ] 21 | assert g2p("Mereka sedang bermain bola di lapangan.") == [ 22 | ["m", "ə", "r", "e", "k", "a"], 23 | ["s", "ə", "d", "a", "ŋ"], 24 | ["b", "ə", "r", "m", "a", "ʔ", "i", "n"], 25 | ["b", "o", "l", "a"], 26 | ["d", "i"], 27 | ["l", "a", "p", "a", "ŋ", "a", "n"], 28 | ["."], 29 | ] 30 | assert g2p("Ini rumahnya Aisyah dan Ceri.") == [ 31 | ["i", "n", "i"], 32 | ["r", "u", "m", "a", "h", "ɲ", "a"], 33 | ["a", "ʔ", "i", "ʃ", "a", "h"], 34 | ["d", "a", "n"], 35 | ["tʃ", "e", "r", "i"], 36 | ["."], 37 | ] 38 | assert g2p("keset selamat datang") == [ 39 | ["k", "e", "s", "e", "t"], 40 | ["s", "ə", "l", "a", "m", "a", "t"], 41 | ["d", "a", "t", "a", "ŋ"], 42 | ] 43 | assert g2p("kakak layak") == [["k", "a", "k", "a", "k"], ["l", "a", "j", "a", "k"]] 44 | 45 | 46 | def test_rule_based_g2p(g2p): 47 | assert g2p._rule_based_g2p("berakhirnya") == "b e r a x i r ɲ a" 48 | assert g2p._rule_based_g2p("bermaaf-maafan") == "b e r m a ʔ a f - m a ʔ a f a n" 49 | assert g2p._rule_based_g2p("kecolongan") == "k e tʃ o l o ŋ a n" 50 | assert g2p._rule_based_g2p("jayapura") == "dʒ a j a p u r a" 51 | assert g2p._rule_based_g2p("xenon") == "s e 
n o n" 52 | assert g2p._rule_based_g2p("layak") == "l a j a k" 53 | 54 | 55 | def test_lstm(lstm): 56 | assert lstm.predict("mengembangkannya") == "məŋəmbaŋkanɲa" 57 | assert lstm.predict("merdeka") == "mərdeka" 58 | assert lstm.predict("pecel") == "pətʃəl" 59 | assert lstm.predict("lele") == "lele" 60 | 61 | 62 | def test_bert(bert): 63 | assert bert.predict("mengembangkannya") == "məngəmbangkannya" 64 | assert bert.predict("merdeka") == "mərdeka" 65 | assert bert.predict("pecel") == "pəcel" 66 | assert bert.predict("lele") == "lele" 67 | assert bert.predict("banyak") == "banyak" 68 | 69 | 70 | def test_ps(g2p): 71 | assert g2p("psikologi") == [["s", "i", "k", "o", "l", "o", "ɡ", "i"]] 72 | assert g2p("psikometri") == [["s", "i", "k", "o", "m", "e", "t", "r", "i"]] 73 | assert g2p("psikotes") == [["s", "i", "k", "o", "t", "e", "s"]] 74 | 75 | 76 | def test_sticking_dot(g2p): 77 | assert g2p("Seniornya Brigadir Jendral A.Yani mengambil alih pimpinan.") == [ 78 | ["s", "ə", "n", "i", "ʔ", "o", "r", "ɲ", "a"], 79 | ["b", "r", "i", "ɡ", "a", "d", "i", "r"], 80 | ["dʒ", "ə", "n", "d", "r", "a", "l"], 81 | ["a"], 82 | ["j", "a", "n", "i"], 83 | ["m", "ə", "ŋ", "a", "m", "b", "i", "l"], 84 | ["a", "l", "i", "h"], 85 | ["p", "i", "m", "p", "i", "n", "a", "n"], 86 | ["."], 87 | ] 88 | 89 | 90 | def test_onnx_wrapper(bert): 91 | assert bert.predict("mengembangkannya") == "məngəmbangkannya" 92 | model_state = bert.model.__getstate__() 93 | bert.model.__setstate__(model_state) 94 | assert bert.predict("mengembangkannya") == "məngəmbangkannya" 95 | -------------------------------------------------------------------------------- /g2p_id/resources/homographs_id.tsv: -------------------------------------------------------------------------------- 1 | angel a ŋ e l a ŋ ə l A A 2 | apel a p ə l a p e l N V 3 | begar b e ɡ a r b ə ɡ a r V A 4 | begu b e ɡ u b ə ɡ u N N 5 | bekel b e k ə l b e k ə l N N 6 | belek b ə l e ʔ b e l e ʔ V N 7 | belok b e l o ʔ b ə l o ʔ V A 8 | bena b e n a b ə n a A N 9 | berak b e r a k b ə r a k V A 10 | berang b e r a ŋ b ə r a ŋ A N 11 | berok b e r o ʔ b ə r o ʔ N N 12 | berpendar b ə r p e n d a r b ə r p ə n d a r V V 13 | berseri b ə r s ə r i b ə r s e r i V V 14 | boreh b o r e h b o r ə h N N 15 | cegak tʃ e ɡ a ʔ tʃ ə ɡ a ʔ A A 16 | cela tʃ ə l a tʃ e l a N N 17 | celak tʃ e l a ʔ tʃ ə l a ʔ N N 18 | cetok tʃ e t o ʔ tʃ ə t o ʔ N N 19 | debut d e b u t d ə b u t N N 20 | dekan d e k a n d ə k a n N N 21 | dendang d e n d a ŋ d ə n d a ŋ N N 22 | depak d e p a ʔ d ə p a ʔ V M 23 | dera d e r a d ə r a N N 24 | embel e m b e l ə m b ə l N N 25 | erang ə r a ŋ e r a ŋ N A 26 | ganteng ɡ a n t ə ŋ ɡ a n t e ŋ A V 27 | gedek ɡ ə d ə ʔ ɡ ə d e ʔ N V 28 | gelang ɡ ə l a ŋ ɡ e l a ŋ N N 29 | genggang ɡ ə ŋ ɡ a ŋ ɡ e ŋ ɡ a ŋ N N 30 | helat h ə l a t h e l a t A N 31 | jejer dʒ e dʒ e r dʒ ə dʒ ə r V N 32 | jeli dʒ e l i dʒ ə l i N A 33 | kecap k e tʃ a p k ə tʃ a p N V 34 | keder k ə d e r k e d ə r N A 35 | kedi k e d i k ə d i N N 36 | kekel k e k e l k ə k ə l A N 37 | kelah k e l a h k ə l a h V N 38 | kelentang k ə l ə n t a ŋ k ə l e n t a ŋ N N 39 | kelenteng k ə l ə n t e ŋ k ə l e n t e ŋ N N 40 | kelepak k ə l ə p a ʔ k ə l e p a ʔ N A 41 | kelesa k ə l e s a k ə l ə s a A N 42 | kena k ə n a k e n a V N 43 | kepang k e p a ŋ k ə p a ŋ N N 44 | kepar k e p a r k ə p a r N N 45 | kere k e r e k ə r e A N 46 | keset k ə s ə t k e s e t A N 47 | ketek k e t e ʔ k ə t e ʔ N N 48 | ketel k e t e l k ə t ə l N A 49 | lebam l e b a m l ə b a m N A 50 | leding l e d i ŋ l 
ə d i ŋ N V 51 | legar l e ɡ a r l ə ɡ a r V N 52 | lembang l e m b a ŋ l ə m b a ŋ N A 53 | lempeng l e m p e ŋ l ə m p ə ŋ A A 54 | lenggang l e ŋ ɡ a ŋ l ə ŋ ɡ a ŋ N A 55 | letak l e t a ʔ l ə t a ʔ A N 56 | leter l e t e r l e t ə r A N 57 | mejan m e dʒ a n m ə dʒ a n N N 58 | memepet m ə m e p e t m ə m ə p e t V N 59 | memerah m ə m e r a h m ə m ə r a h V V 60 | mendera m ə n d e r a m ə n d e r a V A 61 | mental m e n t a l m ə n t a l V N 62 | pelak p e l a k p ə l a k N A 63 | pelang p e l a ŋ p ə l a ŋ N N 64 | pelat p ə l a t p e l a t N A 65 | pelekat p ə l ə k a t p ə l e k a t N N 66 | penggemblengan p ə ŋ ɡ ə m b l e ŋ a n p ə ŋ ɡ ə m b l ə ŋ a n N N 67 | pening p ə n i ŋ p e n i ŋ A N 68 | pentil p e n t i l p ə n t i l N N 69 | pepet p e p e t p ə p ə t V N 70 | per p e r p ə r N P 71 | rebak r e b a ʔ r ə b a ʔ A V 72 | relai r e l a i r ə l a i V N 73 | remah r e m a h r ə m a h N N 74 | rembes r e m b e s r e m b ə s A N 75 | samseng s a m s e ŋ s a m s ə ŋ N V 76 | seba s e b a s ə b a V V 77 | sebat s e b a t s ə b a t A V 78 | sedan s e d a n s ə d a n N N 79 | sela s ə l a s e l a N N 80 | selak s e l a ʔ s ə l a ʔ V V 81 | selempang s ə l e m p a ŋ s ə l ə m p a ŋ N A 82 | semen s e m ə n s e m e n N N 83 | semi s ə m i s e m i N A 84 | senggang s e ŋ ɡ a ŋ s ə ŋ ɡ a ŋ A A 85 | sengkang s e ŋ k a ŋ s ə ŋ k a ŋ N N 86 | sengkelat s ə ŋ k e l a t s ə ŋ k ə l a t A V 87 | sepak s e p a k s ə p a k N N 88 | serak s ə r a ʔ s e r a ʔ A V 89 | serang s ə r a ŋ s e r a ŋ V N 90 | seret s e r e t s ə r ə t V A 91 | seri s e r i s ə r i N A 92 | sertu s e r t u s ə r t u N V 93 | tekek t ə k e ʔ t e k e ʔ N V 94 | teken t ə k ə n t e k ə n V V 95 | tela t e l a t ə l a N N 96 | telan t e l a n t ə l a n N V 97 | teleng t e l e ŋ t ə l ə ŋ A N 98 | telor t ə l o r t e l o r N A 99 | tepak t e p a ʔ t ə p a ʔ N N 100 | tepok t ə p o ʔ t e p o ʔ V A 101 | terapi t e r a p i t ə r a p i N A 102 | teras t e r a s t ə r a s N N 103 | -------------------------------------------------------------------------------- /g2p_id/lstm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 [PT BOOKBOT INDONESIA](https://bookbot.id/) 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | import json 18 | import os 19 | 20 | import numpy as np 21 | import onnxruntime 22 | 23 | from g2p_id.onnx_utils import WrapInferenceSession 24 | 25 | model_path = os.path.join(os.path.dirname(__file__), "models", "lstm") 26 | 27 | 28 | class LSTM: 29 | """Phoneme-level LSTM model for sequence-to-sequence phonemization. 30 | Trained with [Keras](https://keras.io/examples/nlp/lstm_seq2seq/), 31 | and exported to ONNX. ONNX Runtime engine used during inference. 
32 | """ 33 | 34 | def __init__(self): 35 | encoder_model_path = os.path.join(model_path, "encoder_model.onnx") 36 | decoder_model_path = os.path.join(model_path, "decoder_model.onnx") 37 | g2id_path = os.path.join(model_path, "g2id.json") 38 | p2id_path = os.path.join(model_path, "p2id.json") 39 | config_path = os.path.join(model_path, "config.json") 40 | self.encoder = WrapInferenceSession( 41 | encoder_model_path, 42 | providers=onnxruntime.get_available_providers(), 43 | ) 44 | self.decoder = WrapInferenceSession( 45 | decoder_model_path, 46 | providers=onnxruntime.get_available_providers(), 47 | ) 48 | with open(g2id_path, encoding="utf-8") as file: 49 | self.g2id = json.load(file) 50 | with open(p2id_path, encoding="utf-8") as file: 51 | self.p2id = json.load(file) 52 | self.id2p = {v: k for k, v in self.p2id.items()} 53 | with open(config_path, encoding="utf-8") as file: 54 | self.config = json.load(file) 55 | 56 | def predict(self, text: str) -> str: 57 | """Performs LSTM inference, predicting phonemes of a given word. 58 | 59 | Args: 60 | text (str): Word to convert to phonemes. 61 | 62 | Returns: 63 | str: Word in phonemes. 64 | """ 65 | input_seq = np.zeros( 66 | ( 67 | 1, 68 | self.config["max_encoder_seq_length"], 69 | self.config["num_encoder_tokens"], 70 | ), 71 | dtype="float32", 72 | ) 73 | 74 | for idx, char in enumerate(text): 75 | input_seq[0, idx, self.g2id[char]] = 1.0 76 | input_seq[0, len(text) :, self.g2id[self.config["pad_token"]]] = 1.0 77 | 78 | encoder_inputs = {"input_1": input_seq} 79 | states_value = self.encoder.run(None, encoder_inputs) 80 | 81 | target_seq = np.zeros((1, 1, self.config["num_decoder_tokens"]), dtype="float32") 82 | target_seq[0, 0, self.p2id[self.config["bos_token"]]] = 1.0 83 | 84 | stop_condition = False 85 | decoded_sentence = "" 86 | while not stop_condition: 87 | decoder_inputs = { 88 | "input_2": target_seq, 89 | "input_3": states_value[0], 90 | "input_4": states_value[1], 91 | } 92 | output_tokens, state_memory, state_carry = self.decoder.run(None, decoder_inputs) 93 | 94 | sampled_token_index = np.argmax(output_tokens[0, -1, :]) 95 | sampled_char = self.id2p[sampled_token_index] 96 | decoded_sentence += sampled_char 97 | 98 | if ( 99 | sampled_char == self.config["eos_token"] 100 | or len(decoded_sentence) > self.config["max_decoder_seq_length"] 101 | ): 102 | stop_condition = True 103 | 104 | target_seq = np.zeros((1, 1, self.config["num_decoder_tokens"]), dtype="float32") 105 | target_seq[0, 0, sampled_token_index] = 1.0 106 | 107 | states_value = [state_memory, state_carry] 108 | 109 | return decoded_sentence.replace(self.config["eos_token"], "") 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # g2p ID: Indonesian Grapheme-to-Phoneme Converter 2 | 3 |

4 | [Badges: GitHub, Documentation, GitHub release, Contributor Covenant, Tests, Code Coverage, chat on Discord, contributing guidelines] 28 |

29 | 30 | This library is developed to convert Indonesian (Bahasa Indonesia) graphemes (words) to phonemes in IPA. We followed the methods and designs used in the English equivalent library, [g2p](https://github.com/Kyubyong/g2p). 31 | 32 | ## Installation 33 | 34 | ```bash 35 | pip install g2p_id_py 36 | ``` 37 | 38 | ## How to Use 39 | 40 | ```py 41 | from g2p_id import G2p 42 | 43 | texts = [ 44 | "Apel itu berwarna merah.", 45 | "Rahel bersekolah di S M A Jakarta 17.", 46 | "Mereka sedang bermain bola di lapangan.", 47 | ] 48 | 49 | g2p = G2p() 50 | for text in texts: 51 | print(g2p(text)) 52 | 53 | >> [['a', 'p', 'ə', 'l'], ['i', 't', 'u'], ['b', 'ə', 'r', 'w', 'a', 'r', 'n', 'a'], ['m', 'e', 'r', 'a', 'h'], ['.']] 54 | >> [['r', 'a', 'h', 'e', 'l'], ['b', 'ə', 'r', 's', 'ə', 'k', 'o', 'l', 'a', 'h'], ['d', 'i'], ['e', 's'], ['e', 'm'], ['a'], ['dʒ', 'a', 'k', 'a', 'r', 't', 'a'], ['t', 'u', 'dʒ', 'u', 'h'], ['b', 'ə', 'l', 'a', 's'], ['.']] 55 | >> [['m', 'ə', 'r', 'e', 'k', 'a'], ['s', 'ə', 'd', 'a', 'ŋ'], ['b', 'ə', 'r', 'm', 'a', 'i', 'n'], ['b', 'o', 'l', 'a'], ['d', 'i'], ['l', 'a', 'p', 'a', 'ŋ', 'a', 'n'], ['.']] 56 | ``` 57 | 58 | ## Algorithm 59 | 60 | This is heavily inspired by the English [g2p](https://github.com/Kyubyong/g2p). 61 | 62 | 1. Spells out Arabic numerals and some currency symbols, e.g. `Rp 200,000 -> dua ratus ribu rupiah`. This is borrowed from [Cahya's code](https://github.com/cahya-wirawan/text_processor). 63 | 2. Attempts to retrieve the correct pronunciation for homographs based on their [POS (part-of-speech) tags](#pos-tagging). 64 | 3. Looks up a lexicon (pronunciation dictionary) for non-homographs. This list is originally from [ipa-dict](https://github.com/open-dict-data/ipa-dict/blob/master/data/ma.txt), and we later made a modified version. 65 | 4. For OOVs, we predict their pronunciations using either a [BERT model](https://huggingface.co/bookbot/id-g2p-bert) or an [LSTM model](https://huggingface.co/bookbot/id-g2p-lstm). 66 | 67 | ## Phoneme and Grapheme Sets 68 | 69 | ```python 70 | graphemes = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] 71 | phonemes = ['a', 'b', 'd', 'e', 'f', 'ɡ', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'z', 'ŋ', 'ə', 'ɲ', 'tʃ', 'ʃ', 'dʒ', 'x', 'ʔ'] 72 | ``` 73 | 74 | ## Implementation Details 75 | 76 | You can find more details on how we handled homographs and out-of-vocabulary prediction on our [documentation](https://bookbot-kids.github.io/g2p_id/algorithm/) page. 77 | 78 | ## References 79 | 80 | ```bib 81 | @misc{g2pE2019, 82 | author = {Park, Kyubyong & Kim, Jongseok}, 83 | title = {g2pE}, 84 | year = {2019}, 85 | publisher = {GitHub}, 86 | journal = {GitHub repository}, 87 | howpublished = {\url{https://github.com/Kyubyong/g2p}} 88 | } 89 | ``` 90 | 91 | ```bib 92 | @misc{TextProcessor2021, 93 | author = {Cahya Wirawan}, 94 | title = {Text Processor}, 95 | year = {2021}, 96 | publisher = {GitHub}, 97 | journal = {GitHub repository}, 98 | howpublished = {\url{https://github.com/cahya-wirawan/text_processor}} 99 | } 100 | ``` 101 | 102 | ## Contributors 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /docs/algorithm.md: -------------------------------------------------------------------------------- 1 | # Algorithm 2 | 3 | This is heavily inspired by the English [g2p](https://github.com/Kyubyong/g2p). 4 | 5 | 1.
Spells out Arabic numerals and some currency symbols, e.g. `Rp 200,000 -> dua ratus ribu rupiah`. This is borrowed from [Cahya's code](https://github.com/cahya-wirawan/text_processor). 6 | 2. Attempts to retrieve the correct pronunciation for homographs based on their [POS (part-of-speech) tags](#pos-tagging). 7 | 3. Looks up a lexicon (pronunciation dictionary) for non-homographs. This list is originally from [ipa-dict](https://github.com/open-dict-data/ipa-dict/blob/master/data/ma.txt), and we later made a modified version. 8 | 4. For OOVs, we predict their pronunciations using either a [BERT model](https://huggingface.co/bookbot/id-g2p-bert) or an [LSTM model](https://huggingface.co/bookbot/id-g2p-lstm). 9 | 10 | ## Phoneme and Grapheme Sets 11 | 12 | ```python 13 | graphemes = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] 14 | phonemes = ['a', 'b', 'd', 'e', 'f', 'ɡ', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'z', 'ŋ', 'ə', 'ɲ', 'tʃ', 'ʃ', 'dʒ', 'x', 'ʔ'] 15 | ``` 16 | 17 | ## Homographs 18 | 19 | Indonesian words (as far as we know) have only one source of homography: differing ways to pronounce the letter `e`. For instance, in the word `apel` (meaning: apple), the letter `e` is a mid central vowel `ə`. On the other hand, the letter `e` in the word `apel` (meaning: going to a significant other's house; courting) is a closed-mid front unrounded vowel `e`. Sometimes a word has more than one `e`, pronounced in both ways; for instance, `mereka` (meaning: they) is pronounced as `məreka`. Because of this, there needs to be a way to disambiguate homographs, and in our case, we used their POS (part-of-speech) tags. However, this is not a foolproof method, since homographs may even share the same POS tag. We are considering a contextual model to handle this better. 20 | 21 | ## OOV Prediction 22 | 23 | Initially, we relied on a sequence-to-sequence LSTM model for OOV (out-of-vocabulary) prediction. This was a natural choice, given that it can "automatically" learn the rules of grapheme-to-phoneme conversion without having to determine the rules by hand. However, we soon noticed that despite its validation results, the model performed poorly on unseen words, especially on longer ones. We needed a more controllable model that makes predictions only on the characters that require them. We ended up with a customized BERT that predicts the correct pronunciation of the letter `e` while keeping the rest of the string unchanged. We then apply a hand-written g2p conversion algorithm that handles the other characters. 24 | 25 | You can find more detail in [this blog post](https://wilsonwongso.dev/posts/2022/04/predicting-phonemes-with-bert/). 26 | 27 | ## POS Tagging 28 | 29 | We trained an [NLTK PerceptronTagger](https://www.nltk.org/_modules/nltk/tag/perceptron.html) on the [POSP](https://huggingface.co/datasets/indonlu) dataset, which achieved 0.956 and 0.945 F1-scores on the validation and test sets, respectively. Given its performance and speed, we decided to adopt this model as the POS tagger for disambiguating homographs, just as the English g2p library does.
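As an illustration of step 2 of the algorithm above (selecting a homograph's pronunciation by its POS tag), here is a minimal sketch of the lookup. It assumes the five-column layout of `g2p_id/resources/homographs_id.tsv` (word, two phoneme strings, then the POS tag paired with each pronunciation); the `load_homographs` and `disambiguate` helpers are hypothetical and not part of the library's API, whose actual logic lives in `g2p_id/g2p.py`.

```python
import csv


def load_homographs(path="g2p_id/resources/homographs_id.tsv"):
    """Map each homograph to its (POS tag, phoneme string) pairs."""
    homographs = {}
    with open(path, encoding="utf-8") as file:
        for word, phon1, phon2, tag1, tag2 in csv.reader(file, delimiter="\t"):
            homographs[word] = [(tag1, phon1), (tag2, phon2)]
    return homographs


def disambiguate(word, pos_tag, homographs):
    """Return the pronunciation whose stored POS tag matches the predicted tag.

    Falls back to the first pronunciation when no tag matches; when both
    stored tags are identical, the match stays ambiguous, which is exactly
    the case a contextual model would handle better.
    """
    for tag, phonemes in homographs[word]:
        if tag == pos_tag:
            return phonemes
    return homographs[word][0][1]
```

Under these assumptions, `disambiguate("apel", "N", load_homographs())` returns `a p ə l` (apple), while tag `V` selects `a p e l` (courting). The per-tag metrics of the tagger are listed in the table below.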
30 | 31 | | tag | precision | recall | f1-score | 32 | | --------- | --------- | -------- | -------- | 33 | | B-$$$ | 1.000000 | 1.000000 | 1.000000 | 34 | | B-ADJ | 0.904132 | 0.864139 | 0.883683 | 35 | | B-ADK | 1.000000 | 0.986667 | 0.993289 | 36 | | B-ADV | 0.966874 | 0.976987 | 0.971904 | 37 | | B-ART | 0.988920 | 0.978082 | 0.983471 | 38 | | B-CCN | 0.997934 | 0.997934 | 0.997934 | 39 | | B-CSN | 0.986395 | 0.963455 | 0.974790 | 40 | | B-INT | 1.000000 | 1.000000 | 1.000000 | 41 | | B-KUA | 0.976744 | 0.976744 | 0.976744 | 42 | | B-NEG | 0.992857 | 0.972028 | 0.982332 | 43 | | B-NNO | 0.919917 | 0.941288 | 0.930480 | 44 | | B-NNP | 0.917685 | 0.914703 | 0.916192 | 45 | | B-NUM | 0.997358 | 0.954488 | 0.975452 | 46 | | B-PAR | 1.000000 | 0.851064 | 0.919540 | 47 | | B-PPO | 0.991206 | 0.991829 | 0.991517 | 48 | | B-PRI | 1.000000 | 0.928571 | 0.962963 | 49 | | B-PRK | 0.793103 | 0.851852 | 0.821429 | 50 | | B-PRN | 0.988327 | 0.988327 | 0.988327 | 51 | | B-PRR | 0.995465 | 1.000000 | 0.997727 | 52 | | B-SYM | 0.999662 | 0.999323 | 0.999492 | 53 | | B-UNS | 0.916667 | 0.733333 | 0.814815 | 54 | | B-VBE | 1.000000 | 0.985714 | 0.992806 | 55 | | B-VBI | 0.929119 | 0.877034 | 0.902326 | 56 | | B-VBL | 1.000000 | 1.000000 | 1.000000 | 57 | | B-VBP | 0.926606 | 0.933457 | 0.930018 | 58 | | B-VBT | 0.939759 | 0.953333 | 0.946498 | 59 | | --------- | --------- | -------- | -------- | 60 | | macro avg | 0.966490 | 0.946937 | 0.955913 | 61 | 62 | ## Attempts that Failed 63 | 64 | - Parsed [online PDF KBBI](https://oldi.lipi.go.id/public/Kamus%20Indonesia.pdf), but it turns out that it has very little phoneme descriptions. 65 | - Scraped [online Web KBBI](https://github.com/laymonage/kbbi-python), but it had a daily bandwidth which was too low to be used at this level. 66 | 67 | ## Potential Improvements 68 | 69 | There is a ton of room for improvements, both from the technical and the linguistic side of the approaches. Consider that a failure of one component may cascade to an incorrect conclusion. For instance, an incorrect POS tag can lead to the wrong phoneme, ditto for incorrect OOV prediction. We propose the following future improvements. 70 | 71 | - [ ] Use a larger pronunciation lexicon instead of having to guess. 72 | - [x] Find a larger homograph list. 73 | - [x] Use contextual model instead of character-level RNNs. 74 | - [x] Consider hand-written rules for g2p conversion. 75 | - [x] Add to PyPI. -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | 2 | # Contributor Covenant Code of Conduct 3 | 4 | ## Our Pledge 5 | 6 | We as members, contributors, and leaders pledge to make participation in our 7 | community a harassment-free experience for everyone, regardless of age, body 8 | size, visible or invisible disability, ethnicity, sex characteristics, gender 9 | identity and expression, level of experience, education, socio-economic status, 10 | nationality, personal appearance, race, caste, color, religion, or sexual 11 | identity and orientation. 12 | 13 | We pledge to act and interact in ways that contribute to an open, welcoming, 14 | diverse, inclusive, and healthy community. 
15 | 16 | ## Our Standards 17 | 18 | Examples of behavior that contributes to a positive environment for our 19 | community include: 20 | 21 | * Demonstrating empathy and kindness toward other people 22 | * Being respectful of differing opinions, viewpoints, and experiences 23 | * Giving and gracefully accepting constructive feedback 24 | * Accepting responsibility and apologizing to those affected by our mistakes, 25 | and learning from the experience 26 | * Focusing on what is best not just for us as individuals, but for the overall 27 | community 28 | 29 | Examples of unacceptable behavior include: 30 | 31 | * The use of sexualized language or imagery, and sexual attention or advances of 32 | any kind 33 | * Trolling, insulting or derogatory comments, and personal or political attacks 34 | * Public or private harassment 35 | * Publishing others' private information, such as a physical or email address, 36 | without their explicit permission 37 | * Other conduct which could reasonably be considered inappropriate in a 38 | professional setting 39 | 40 | ## Enforcement Responsibilities 41 | 42 | Community leaders are responsible for clarifying and enforcing our standards of 43 | acceptable behavior and will take appropriate and fair corrective action in 44 | response to any behavior that they deem inappropriate, threatening, offensive, 45 | or harmful. 46 | 47 | Community leaders have the right and responsibility to remove, edit, or reject 48 | comments, commits, code, wiki edits, issues, and other contributions that are 49 | not aligned to this Code of Conduct, and will communicate reasons for moderation 50 | decisions when appropriate. 51 | 52 | ## Scope 53 | 54 | This Code of Conduct applies within all community spaces, and also applies when 55 | an individual is officially representing the community in public spaces. 56 | Examples of representing our community include using an official e-mail address, 57 | posting via an official social media account, or acting as an appointed 58 | representative at an online or offline event. 59 | 60 | ## Enforcement 61 | 62 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 63 | reported to the community leaders responsible for enforcement at [team@bookbotkids.com](mailto:team@bookbotkids.com). 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series of 86 | actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. 
This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or permanent 93 | ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within the 113 | community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.1, available at 119 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. 120 | 121 | Community Impact Guidelines were inspired by 122 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. 123 | 124 | For answers to common questions about this code of conduct, see the FAQ at 125 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at 126 | [https://www.contributor-covenant.org/translations][translations]. 127 | 128 | [homepage]: https://www.contributor-covenant.org 129 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html 130 | [Mozilla CoC]: https://github.com/mozilla/diversity 131 | [FAQ]: https://www.contributor-covenant.org/faq 132 | [translations]: https://www.contributor-covenant.org/translations 133 | -------------------------------------------------------------------------------- /g2p_id/g2p.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 [PT BOOKBOT INDONESIA](https://bookbot.id/) 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | 17 | import os 18 | import re 19 | import pickle 20 | import unicodedata 21 | from builtins import str as unicode 22 | from itertools import permutations 23 | from typing import Dict, List, Tuple, Union 24 | 25 | import nltk 26 | from nltk.tag.perceptron import PerceptronTagger 27 | from nltk.tokenize import TweetTokenizer 28 | 29 | from g2p_id.bert import BERT 30 | from g2p_id.lstm import LSTM 31 | from g2p_id.text_processor import TextProcessor 32 | 33 | nltk.download("wordnet") 34 | resources_path = os.path.join(os.path.dirname(__file__), "resources") 35 | 36 | 37 | def construct_homographs_dictionary() -> Dict[str, Tuple[str, str, str, str]]: 38 | """Creates a dictionary of homographs 39 | 40 | Returns: 41 | Dict[str, Tuple[str, str, str, str]]: 42 | Key: WORD 43 | Value: (PH1, PH2, POS1, POS2) 44 | """ 45 | homograph_path = os.path.join(resources_path, "homographs_id.tsv") 46 | homograph2features = {} 47 | with open(homograph_path, encoding="utf-8") as file: 48 | lines = file.readlines() 49 | for line in lines: 50 | grapheme, phone_1, phone_2, pos_1, pos_2 = line.strip("\n").split("\t") 51 | homograph2features[grapheme.lower()] = (phone_1, phone_2, pos_1, pos_2) 52 | 53 | return homograph2features 54 | 55 | 56 | def construct_lexicon_dictionary() -> Dict[str, str]: 57 | """Creates a lexicon dictionary. 58 | 59 | Returns: 60 | Dict[str, str]: 61 | Key: WORD 62 | Value: Phoneme (IPA) 63 | """ 64 | lexicon_path = os.path.join(resources_path, "lexicon_id.tsv") 65 | lexicon2features = {} 66 | with open(lexicon_path, encoding="utf-8") as file: 67 | lines = file.readlines() 68 | for line in lines: 69 | grapheme, phoneme = line.strip("\n").split("\t") 70 | lexicon2features[grapheme.lower()] = phoneme 71 | return lexicon2features 72 | 73 | 74 | class G2p: 75 | """Grapheme-to-phoneme (g2p) main class for phonemization. 76 | This class provides a high-level API for grapheme-to-phoneme conversion. 77 | 78 | 1. Preprocess and normalize text 79 | 2. Word tokenizes text 80 | 3. Predict POS for every word 81 | 4. If word is non-alphabetic, add to list (i.e. punctuation) 82 | 5. If word is a homograph, check POS and use matching word's phonemes 83 | 6. If word is a non-homograph, lookup lexicon 84 | 7. Otherwise, predict with a neural network 85 | """ 86 | 87 | def __init__(self, model_type="BERT"): 88 | """Constructor for G2p. 89 | 90 | Args: 91 | model_type (str, optional): 92 | Type of neural network to use for prediction. 93 | Choices are "LSTM" or "BERT". Defaults to "BERT". 94 | """ 95 | self.homograph2features = construct_homographs_dictionary() 96 | self.lexicon2features = construct_lexicon_dictionary() 97 | self.normalizer = TextProcessor() 98 | self.tagger = PerceptronTagger(load=False) 99 | tagger_path = os.path.join(resources_path, "id_posp_tagger.pickle") 100 | with open(tagger_path, "rb") as f: 101 | self.tagger = self.tagger.decode_json_obj(pickle.load(f)) 102 | self.model: Union[BERT, LSTM] = BERT() if model_type == "BERT" else LSTM() 103 | self.tokenizer = TweetTokenizer() 104 | self.pos_dict = { 105 | "N": ["B-NNO", "B-NNP", "B-PRN", "B-PRN", "B-PRK"], 106 | "V": ["B-VBI", "B-VBT", "B-VBP", "B-VBL", "B-VBE"], 107 | "A": ["B-ADJ"], 108 | "P": ["B-PAR"], 109 | } 110 | 111 | def _preprocess(self, text: str) -> str: 112 | """Performs preprocessing. 
113 | (1) Adds spaces in between tokens 114 | (2) Normalizes unicode and accents 115 | (3) Normalizes numbers 116 | (4) Lower case texts 117 | (5) Removes unwanted tokens 118 | 119 | Arguments: 120 | text (str): Text to preprocess. 121 | 122 | Returns: 123 | str: Preprocessed text. 124 | """ 125 | text = text.replace("-", " ") 126 | text = re.sub(r"\.(?=.*\.)", " ", text) 127 | text = " ".join(self.tokenizer.tokenize(text)) 128 | text = unicode(text) 129 | text = "".join(char for char in unicodedata.normalize("NFD", text) if unicodedata.category(char) != "Mn") 130 | text = self.normalizer.normalize(text).strip() 131 | text = text.lower() 132 | text = re.sub(r"[^ a-z'.,?!\-]", "", text) 133 | return text 134 | 135 | def _rule_based_g2p(self, text: str) -> str: 136 | """Applies rule-based Indonesian grapheme2phoneme conversion. 137 | 138 | Args: 139 | text (str): Grapheme text to convert to phoneme. 140 | 141 | Returns: 142 | str: Phoneme string. 143 | """ 144 | phonetic_mapping = { 145 | "ny": "ɲ", 146 | "ng": "ŋ", 147 | "sy": "ʃ", 148 | "aa": "aʔa", 149 | "ii": "iʔi", 150 | "oo": "oʔo", 151 | "əə": "əʔə", 152 | "uu": "uʔu", 153 | "'": "ʔ", 154 | "g": "ɡ", 155 | "q": "k", 156 | "j": "dʒ", 157 | "y": "j", 158 | "x": "ks", 159 | "c": "tʃ", 160 | "kh": "x", 161 | } 162 | 163 | if text.startswith("x"): 164 | text = "s" + text[1:] 165 | 166 | if text.startswith("ps"): 167 | text = text[1:] 168 | 169 | for graph, phone in phonetic_mapping.items(): 170 | text = text.replace(graph, phone) 171 | 172 | phonemes = [list(phn) if phn not in ("dʒ", "tʃ") else [phn] for phn in re.split("(tʃ|dʒ)", text)] 173 | return " ".join([p for phn in phonemes for p in phn]) 174 | 175 | def __call__(self, text: str) -> List[List[str]]: 176 | """Grapheme-to-phoneme converter. 177 | 178 | 1. Preprocess and normalize text 179 | 2. Word tokenizes text 180 | 3. Predict POS for every word 181 | 4. If word is non-alphabetic, add to list (i.e. punctuation) 182 | 5. If word is a homograph, check POS and use matching word's phonemes 183 | 6. If word is a non-homograph, lookup lexicon 184 | 7. Otherwise, predict with a neural network 185 | 186 | Args: 187 | text (str): Grapheme text to convert to phoneme. 188 | 189 | Returns: 190 | List[List[str]]: List of strings in phonemes. 
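        Examples (illustrative; exact output depends on the bundled lexicon):
            >>> g2p = G2p()
            >>> g2p("Ibu makan nasi.")
            [['i', 'b', 'u'], ['m', 'a', 'k', 'a', 'n'], ['n', 'a', 's', 'i'], ['.']]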
191 | """ 192 | text = self._preprocess(text) 193 | words = self.tokenizer.tokenize(text) 194 | tokens = self.tagger.tag(words) 195 | 196 | prons = [] 197 | for word, pos in tokens: 198 | pron = "" 199 | if re.search("[a-z]", word) is None: # non-alphabetic 200 | pron = word 201 | 202 | elif word in self.homograph2features: # check if homograph 203 | pron1, pron2, pos1, _ = self.homograph2features[word] 204 | 205 | # check for the matching POS 206 | if pos in self.pos_dict[pos1]: 207 | pron = pron1 208 | else: 209 | pron = pron2 210 | 211 | elif word in self.lexicon2features: # non-homographs 212 | pron = self.lexicon2features[word] 213 | 214 | else: # predict for OOV 215 | pron = self.model.predict(word) 216 | if isinstance(self.model, BERT): 217 | pron = self._rule_based_g2p(pron) 218 | 219 | if pron.endswith("ʔ"): 220 | pron = pron[:-1] + "k" 221 | 222 | consonants = "bdjklmnprstwɲ" 223 | vowels = "aeiouə" 224 | 225 | for letter in consonants: 226 | pron = pron.replace(f"ʔ {letter}", f"k {letter}") 227 | 228 | # add a glottal stop in between consecutive vowels 229 | for v1, v2 in permutations(vowels, 2): 230 | pron = pron.replace(f"{v1} {v2}", f"{v1} ʔ {v2}") 231 | 232 | prons.append(pron.split()) 233 | 234 | return prons 235 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | Copyright 2023 [PT BOOKBOT INDONESIA](https://bookbot.id/) 179 | 180 | Licensed under the Apache License, Version 2.0 (the "License"); 181 | you may not use this file except in compliance with the License. 182 | You may obtain a copy of the License at 183 | 184 | http://www.apache.org/licenses/LICENSE-2.0 185 | 186 | Unless required by applicable law or agreed to in writing, software 187 | distributed under the License is distributed on an "AS IS" BASIS, 188 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 189 | See the License for the specific language governing permissions and 190 | limitations under the License. 191 | -------------------------------------------------------------------------------- /g2p_id/text_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | MIT License 3 | 4 | Copyright (c) 2021 Cahya Wirawan 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | """ 24 | 25 | 26 | import os 27 | import re 28 | from typing import Any 29 | 30 | from num2words import num2words 31 | 32 | resources_path = os.path.join(os.path.dirname(__file__), "resources") 33 | 34 | 35 | class TextProcessor: 36 | """Indonesian text processor to normalize numerics, currencies, and timezones.""" 37 | 38 | def __init__(self): 39 | self.measurements = {} 40 | self.thousands = ["ratus", "ribu", "juta", "miliar", "milyar", "triliun"] 41 | self.months = [ 42 | "Januari", 43 | "Februari", 44 | "Maret", 45 | "April", 46 | "Mei", 47 | "Juni", 48 | "Juli", 49 | "Agustus", 50 | "September", 51 | "Oktober", 52 | "November", 53 | "Desember", 54 | ] 55 | measurements_path = os.path.join(resources_path, "measurements.tsv") 56 | currencies_path = os.path.join(resources_path, "currency.tsv") 57 | timezones_path = os.path.join(resources_path, "timezones.tsv") 58 | 59 | with open(measurements_path, "r", encoding="utf-8") as file: 60 | for lines in file: 61 | line = lines.strip().split("\t") 62 | self.measurements[line[0]] = line[1] 63 | 64 | self.currencies = {} 65 | with open(currencies_path, "r", encoding="utf-8") as file: 66 | for lines in file: 67 | line = lines.strip().split("\t") 68 | self.currencies[line[0]] = line[1] 69 | 70 | self.timezones = {} 71 | with open(timezones_path, "r", encoding="utf-8") as file: 72 | for lines in file: 73 | line = lines.strip().split("\t") 74 | self.timezones[line[0]] = line[1] 75 | 76 | self.re_thousands = "|".join(self.thousands) 77 | self.re_currencies = r"\b" + re.sub( 78 | r"\|([^|$£€¥₩]+)", r"|\\b\1", "|".join(list(self.currencies)) 79 | ) 80 | self.re_currencies = re.sub(r"([$£€¥₩])", r"\\\1", self.re_currencies) 81 | self.re_moneys = ( 82 | rf"(({self.re_currencies}) ?([\d\.\,]+)( ({self.re_thousands})?(an)?)?)" 83 | ) 84 | self.re_measurements = "|".join(list(self.measurements)) 85 | self.re_measurements = rf"(\b([\d\.\,]+) ?({self.re_measurements})\b)" 86 | self.re_timezones = "|".join(list(self.timezones)) 87 | self.re_timezones = ( 88 | r"((\d{1,2})[\.:](\d{1,2}) " + rf"\b({self.re_timezones})\b)" 89 | ) 90 | self.re_http = re.compile( 91 | r""" 92 | (https?://(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\. 93 | [a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9()@:%_\+.~#?&//=]*) 94 | """, 95 | re.X, 96 | ) 97 | 98 | @staticmethod 99 | def is_integer(number: Any) -> bool: 100 | """Check if integer by type-casting. 101 | 102 | Args: 103 | number (Any): Number to check. 104 | 105 | Returns: 106 | bool: Is a valid integer. 107 | """ 108 | try: 109 | int(number) 110 | return True 111 | except ValueError: 112 | return False 113 | 114 | @staticmethod 115 | def is_float(number: Any) -> bool: 116 | """Check if float by type-casting. 117 | 118 | Args: 119 | number (Any): Number to check. 120 | 121 | Returns: 122 | bool: Is a valid float. 123 | """ 124 | try: 125 | float(number) 126 | return True 127 | except ValueError: 128 | return False 129 | 130 | def normalize_url(self, text: str) -> str: 131 | """Removes URL from text. 132 | 133 | Args: 134 | text (str): Text with URL to normalize. 135 | 136 | Returns: 137 | str: Normalized text with URLs removed. 
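        Examples (illustrative):
            - `"Kunjungi https://bookbot.id sekarang."` -> `"Kunjungi  sekarang."`
              (the leftover double space is collapsed later by `normalize`)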
138 | """ 139 | urls = re.findall(self.re_http, text) 140 | for url in urls: 141 | text = text.replace(url[0], "") 142 | return text 143 | 144 | def normalize_currency(self, text: str) -> str: 145 | """Normalizes international and Indonesian (Rupiah) currencies. 146 | 147 | Examples: 148 | - `"$250"` -> `"dua ratus lima puluh dollar"` 149 | - `"Rp 3,000,000"` -> `"tiga juta rupiah"` 150 | 151 | Args: 152 | text (str): Text with currency to normalize. 153 | 154 | Returns: 155 | str: Normalized text with currency transliterated. 156 | """ 157 | moneys = re.findall(self.re_moneys, text) 158 | for money in moneys: 159 | number: Any = re.sub(",", ".", re.sub(r"\.", "", money[2].strip(" ,."))) 160 | try: 161 | if number == "": 162 | continue 163 | if self.is_integer(number): 164 | number = int(number) 165 | elif self.is_float(number): 166 | number = float(number) 167 | else: 168 | number = re.sub(r"[.,]", "", number) 169 | number = int(number) 170 | number = num2words(number, to="cardinal", lang="id") 171 | text = text.replace( 172 | money[0].strip(" ,."), 173 | f"{number} {money[3]} {self.currencies[money[1]]}", 174 | ) 175 | except NotImplementedError as error: 176 | print(error) 177 | print(f"Problem with money: <{text}>: {number}") 178 | return text 179 | 180 | def normalize_measurement(self, text: str) -> str: 181 | """Normalizes measurement units, including its scalar value. 182 | 183 | Examples: 184 | - `"10,5 km"` -> `"sepuluh koma lima kilometer"` 185 | - `"5°C"` -> `"lima derajat celsius"` 186 | 187 | Args: 188 | text (str): Text with measurements to normalize. 189 | 190 | Returns: 191 | str: Normalized text with measurements transliterated. 192 | """ 193 | units = re.findall(self.re_measurements, text) 194 | for unit in units: 195 | number: Any = re.sub(",", ".", re.sub(r"\.", "", unit[1].strip(" ,."))) 196 | try: 197 | if number == "": 198 | continue 199 | if re.search(r"\.", number): 200 | number = float(number) 201 | else: 202 | number = int(number) 203 | number = num2words(number, to="cardinal", lang="id") 204 | text = text.replace( 205 | unit[0].strip(" ,."), f"{number} {self.measurements[unit[2]]}" 206 | ) 207 | except NotImplementedError as error: 208 | print(error) 209 | print(f"Problem with measurements: <{text}>: {number}") 210 | return text 211 | 212 | def normalize_date(self, text: str) -> str: 213 | """Normalizes dates. 214 | 215 | Examples: 216 | - `"(12/3/2021)"` -> `"dua belas Maret dua ribu dua puluh satu"` 217 | 218 | Args: 219 | text (str): Text with dates to normalize. 220 | 221 | Returns: 222 | str: Normalized text with dates transliterated. 223 | """ 224 | dates = re.findall(r"(\((\d{1,2})/(\d{1,2})(/(\d+))?\))", text) 225 | for date in dates: 226 | try: 227 | day = num2words(int(date[1]), to="cardinal", lang="id") 228 | month: Any = int(date[2]) - 1 229 | if month >= 12: 230 | month = 0 231 | month = self.months[month] 232 | if date[4] != "": 233 | year = num2words(int(date[4]), to="cardinal", lang="id") 234 | date_string = f"{day} {month} {year}" 235 | else: 236 | date_string = f"{day} {month}" 237 | text = text.replace(date[0], f" {date_string} ") 238 | except NotImplementedError as error: 239 | print(error) 240 | print(f"Problem with dates: <{text}>: {date}") 241 | return text 242 | 243 | def normalize_timezone(self, text: str) -> str: 244 | """Normalizes Indonesian time with timezones. 
245 | 246 | Examples: 247 | - `"22.30 WITA"` 248 | -> `"dua puluh dua lewat tiga puluh menit Waktu Indonesia Tengah"` 249 | 250 | Args: 251 | text (str): Text with timezones to normalize. 252 | 253 | Returns: 254 | str: Normalized text with timezones transliterated. 255 | """ 256 | timezones = re.findall(self.re_timezones, text) 257 | for timezone in timezones: 258 | try: 259 | hour = num2words(int(timezone[1]), to="cardinal", lang="id") 260 | minute = num2words(int(timezone[2]), to="cardinal", lang="id") 261 | zone = self.timezones[timezone[3]] 262 | if minute == "nol": 263 | time_string = f"{hour} {zone}" 264 | else: 265 | time_string = f"{hour} lewat {minute} menit {zone}" 266 | text = text.replace(timezone[0], f"{time_string}") 267 | except NotImplementedError as error: 268 | print(error) 269 | print(f"Problem with timezones: <{text}>: {timezone}") 270 | return text 271 | 272 | def normalize_number(self, text: str) -> str: 273 | """Normalizes Arabic numbers to Indonesian. 274 | 275 | Examples: 276 | - `"1.000"` -> `"seribu"` 277 | - `"10,5"` -> `"sepuluh koma lima"` 278 | 279 | Args: 280 | text (str): Text with numbers to normalize. 281 | 282 | Returns: 283 | str: Normalized text with numbers transliterated. 284 | """ 285 | re_numbers = [r"([\d.,]+)", r"\d+"] 286 | for re_number in re_numbers: 287 | number_len = 0 288 | for i in re.finditer(re_number, text): 289 | start = i.start() + number_len 290 | end = i.end() + number_len 291 | number: Any = text[start:end] 292 | number = re.sub(",", ".", re.sub(r"\.", "", number.strip(" ,."))) 293 | if number == "": 294 | continue 295 | if self.is_float(number) or self.is_integer(number): 296 | try: 297 | if self.is_integer(number): 298 | number = int(number) 299 | else: 300 | number = float(number) 301 | number = num2words(number, to="cardinal", lang="id") 302 | text = text[:start] + number + text[end:] 303 | number_len += len(number) - (end - start) 304 | except NotImplementedError as error: 305 | print(error) 306 | print(f"Problem with number: <{text}>: {number}") 307 | return text 308 | 309 | def normalize(self, text: str) -> str: 310 | """Normalizes Indonesian text by expanding: 311 | 312 | - URL 313 | - Currency 314 | - Measurements 315 | - Dates 316 | - Timezones 317 | - Arabic Numerals 318 | 319 | Args: 320 | text (str): Text to normalize. 321 | 322 | Returns: 323 | str: Normalized text. 324 | """ 325 | # Remove URL 326 | text = self.normalize_url(text) 327 | # Currency 328 | text = self.normalize_currency(text) 329 | # Measurements 330 | text = self.normalize_measurement(text) 331 | # Date 332 | text = self.normalize_date(text) 333 | # Timezones 334 | text = self.normalize_timezone(text) 335 | # Any number 336 | text = self.normalize_number(text) 337 | # collapse consecutive whitespaces 338 | text = re.sub(r"\s+", " ", text) 339 | return text 340 | --------------------------------------------------------------------------------