├── requirements.txt
├── MANIFEST.in
├── docs
│   ├── reference
│   │   ├── textprocessor.md
│   │   ├── lstm.md
│   │   ├── bert.md
│   │   └── g2p.md
│   ├── contributing.md
│   ├── index.md
│   └── algorithm.md
├── requirements_test.txt
├── g2p_id
│   ├── models
│   │   ├── bert
│   │   │   ├── bert_mlm.onnx
│   │   │   ├── config.json
│   │   │   └── token2id.json
│   │   └── lstm
│   │       ├── decoder_model.onnx
│   │       ├── encoder_model.onnx
│   │       ├── config.json
│   │       ├── g2id.json
│   │       └── p2id.json
│   ├── resources
│   │   ├── id_posp_tagger.pickle
│   │   ├── timezones.tsv
│   │   ├── currency.tsv
│   │   ├── measurements.tsv
│   │   └── homographs_id.tsv
│   ├── __init__.py
│   ├── onnx_utils.py
│   ├── bert.py
│   ├── lstm.py
│   ├── g2p.py
│   └── text_processor.py
├── NOTICE.md
├── tests
│   ├── conftest.py
│   ├── test_text_processor.py
│   └── test_g2p.py
├── .github
│   └── workflows
│       ├── docs.yml
│       └── tests.yml
├── tox.ini
├── setup.py
├── mkdocs.yml
├── PROJECT_CHARTER.md
├── .gitignore
├── CONTRIBUTING.md
├── README.md
├── CODE_OF_CONDUCT.md
└── LICENSE.md
/requirements.txt:
--------------------------------------------------------------------------------
1 | num2words
2 | nltk==3.9.1
3 | onnxruntime
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
2 | include g2p_id/resources/*
3 | include g2p_id/models/*/*
--------------------------------------------------------------------------------
/docs/reference/textprocessor.md:
--------------------------------------------------------------------------------
1 | # TextProcessor
2 |
3 | ::: g2p_id.text_processor.TextProcessor
4 |
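 5 | ## Usage
 6 |
 7 | A minimal sketch, mirroring the expectations in `tests/test_text_processor.py`:
 8 |
 9 | ```py
10 | from g2p_id import TextProcessor
11 |
12 | text_processor = TextProcessor()
13 | print(text_processor.normalize("Rp 3,000,000"))
14 | print(text_processor.normalize("123,1 kg"))
15 | ```
16 |
17 | ```py
18 | >> tiga juta rupiah
19 | >> seratus dua puluh tiga koma satu kilogram
20 | ```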
--------------------------------------------------------------------------------
/requirements_test.txt:
--------------------------------------------------------------------------------
1 | importlib_metadata<5
2 | flake8
3 | tox
4 | pytest
5 | pytest-cov
6 | mypy
7 | pylint
--------------------------------------------------------------------------------
/g2p_id/models/bert/bert_mlm.onnx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bookbot-kids/g2p_id/HEAD/g2p_id/models/bert/bert_mlm.onnx
--------------------------------------------------------------------------------
/g2p_id/models/bert/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "mask_token": "[mask]",
3 | "pad_token": "",
4 | "max_seq_length": 32
5 | }
6 |
--------------------------------------------------------------------------------
/g2p_id/models/lstm/decoder_model.onnx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bookbot-kids/g2p_id/HEAD/g2p_id/models/lstm/decoder_model.onnx
--------------------------------------------------------------------------------
/g2p_id/models/lstm/encoder_model.onnx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bookbot-kids/g2p_id/HEAD/g2p_id/models/lstm/encoder_model.onnx
--------------------------------------------------------------------------------
/g2p_id/resources/id_posp_tagger.pickle:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bookbot-kids/g2p_id/HEAD/g2p_id/resources/id_posp_tagger.pickle
--------------------------------------------------------------------------------
/g2p_id/resources/timezones.tsv:
--------------------------------------------------------------------------------
1 | WITA Waktu Indonesia Tengah
2 | WIB Waktu Indonesia Barat
3 | WIT Waktu Indonesia Timur
4 | GMT Greenwich Mean Time
5 |
--------------------------------------------------------------------------------
/NOTICE.md:
--------------------------------------------------------------------------------
1 | g2p ID
2 | Copyright 2023 [PT BOOKBOT INDONESIA](https://bookbot.id/)
3 |
4 | This product includes software developed at
5 | PT BOOKBOT INDONESIA (https://bookbot.id/).
6 |
--------------------------------------------------------------------------------
/g2p_id/models/lstm/config.json:
--------------------------------------------------------------------------------
1 | {
2 | "latent_dim": 256,
3 | "bos_token": "\t",
4 | "eos_token": "\n",
5 | "pad_token": " ",
6 | "num_encoder_tokens": 28,
7 | "num_decoder_tokens": 32,
8 | "max_encoder_seq_length": 24,
9 | "max_decoder_seq_length": 25
10 | }
11 |
--------------------------------------------------------------------------------
/docs/reference/lstm.md:
--------------------------------------------------------------------------------
1 | # LSTM
2 |
3 | ::: g2p_id.lstm.LSTM
4 |
5 | ## Usage
6 |
 7 | ```py
 8 | from g2p_id import LSTM
 9 |
10 | texts = ["mengembangkannya", "merdeka", "pecel", "lele"]
11 | lstm = LSTM()
12 | for text in texts:
13 |     print(lstm.predict(text))
14 | ```
15 |
16 | ```py
17 | >> məŋəmbaŋkanɲa
18 | >> mərdeka
19 | >> pətʃəl
20 | >> lele
21 | ```
--------------------------------------------------------------------------------
/docs/reference/bert.md:
--------------------------------------------------------------------------------
1 | # BERT
2 |
3 | ::: g2p_id.bert.BERT
4 |
5 | ## Usage
6 |
 7 | ```py
 8 | from g2p_id import BERT
 9 |
10 | texts = ["mengembangkannya", "merdeka", "pecel", "lele"]
11 | bert = BERT()
12 | for text in texts:
13 |     print(bert.predict(text))
14 | ```
15 |
16 | ```py
17 | >> məngəmbangkannya
18 | >> mərdeka
19 | >> pəcel
20 | >> lele
21 | ```
22 |
--------------------------------------------------------------------------------
/g2p_id/models/lstm/g2id.json:
--------------------------------------------------------------------------------
1 | {
2 | " ": 27,
3 | "'": 0,
4 | "-": 1,
5 | "a": 2,
6 | "b": 3,
7 | "c": 4,
8 | "d": 5,
9 | "e": 6,
10 | "f": 7,
11 | "g": 8,
12 | "h": 9,
13 | "i": 10,
14 | "j": 11,
15 | "k": 12,
16 | "l": 13,
17 | "m": 14,
18 | "n": 15,
19 | "o": 16,
20 | "p": 17,
21 | "q": 18,
22 | "r": 19,
23 | "s": 20,
24 | "t": 21,
25 | "u": 22,
26 | "v": 23,
27 | "w": 24,
28 | "y": 25,
29 | "z": 26
30 | }
31 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | import pytest
2 |
3 | from g2p_id import BERT, LSTM, G2p, TextProcessor
4 |
5 |
6 | @pytest.fixture(scope="session")
7 | def g2p():
 8 |     return G2p()
9 |
10 |
11 | @pytest.fixture(scope="session")
12 | def lstm():
13 |     return LSTM()
14 |
15 |
16 | @pytest.fixture(scope="session")
17 | def bert():
18 |     return BERT()
19 |
20 |
21 | @pytest.fixture(scope="session")
22 | def text_processor():
23 |     return TextProcessor()
24 |
--------------------------------------------------------------------------------
/g2p_id/models/bert/token2id.json:
--------------------------------------------------------------------------------
1 | {
2 | "": 0,
3 | "'": 28,
4 | "-": 26,
5 | "[UNK]": 1,
6 | "[mask]": 30,
7 | "a": 2,
8 | "b": 13,
9 | "c": 20,
10 | "d": 16,
11 | "e": 18,
12 | "f": 24,
13 | "g": 11,
14 | "h": 19,
15 | "i": 5,
16 | "j": 22,
17 | "k": 7,
18 | "l": 15,
19 | "m": 8,
20 | "n": 3,
21 | "o": 17,
22 | "p": 14,
23 | "q": 29,
24 | "r": 6,
25 | "s": 12,
26 | "t": 9,
27 | "u": 10,
28 | "v": 25,
29 | "w": 23,
30 | "y": 21,
31 | "z": 27,
32 | "ə": 4
33 | }
34 |
--------------------------------------------------------------------------------
/g2p_id/models/lstm/p2id.json:
--------------------------------------------------------------------------------
1 | {
2 | "\t": 0,
3 | "\n": 1,
4 | " ": 31,
5 | "-": 2,
6 | "a": 3,
7 | "b": 4,
8 | "d": 5,
9 | "e": 6,
10 | "f": 7,
11 | "g": 8,
12 | "h": 9,
13 | "i": 10,
14 | "j": 11,
15 | "k": 12,
16 | "l": 13,
17 | "m": 14,
18 | "n": 15,
19 | "o": 16,
20 | "p": 17,
21 | "r": 18,
22 | "s": 19,
23 | "t": 20,
24 | "u": 21,
25 | "v": 22,
26 | "w": 23,
27 | "z": 24,
28 | "ŋ": 25,
29 | "ə": 26,
30 | "ɲ": 27,
31 | "ʃ": 28,
32 | "ʒ": 29,
33 | "ʔ": 30
34 | }
35 |
--------------------------------------------------------------------------------
/.github/workflows/docs.yml:
--------------------------------------------------------------------------------
 1 | name: Deploy docs to GitHub Pages
 2 | on:
 3 |   push:
 4 |     branches:
 5 |       - main
 6 | jobs:
 7 |   deploy:
 8 |     runs-on: ubuntu-latest
 9 |     steps:
10 |       - name: Checkout main branch
11 |         uses: actions/checkout@v2
12 |       - name: Setup Python
13 |         uses: actions/setup-python@v2
14 |         with:
15 |           python-version: 3.9
16 |       - name: Install dependencies
17 |         run: pip3 install mkdocs-material mkdocstrings mkdocstrings-python-legacy
18 |       - name: Install package
19 |         run: pip3 install .
20 |       - name: Deploy docs
21 |         run: mkdocs gh-deploy --force
22 |
--------------------------------------------------------------------------------
/g2p_id/resources/currency.tsv:
--------------------------------------------------------------------------------
1 | US$ dollar amerika serikat
2 | nzd dollar new zealand
3 | rs rupee
4 | chf franc swiss
5 | dkk kroner denmark
6 | fim markka finland
7 | aed dirham arab
8 | czk koruna ceko
9 | mro ouguiya mauritania
10 | pkr rupee pakistan
11 | crc colon costa rica
12 | hk$ dollar hong kong
13 | npr rupee nepal
14 | awg florin aruban
15 | nok kroner norwegia
16 | tzs shilling tanzania
17 | sek kronor swedish
18 | cyp pounds cypriot
19 | sar riyal saudi
20 | cve escudo cape verde
21 | rsd dinar serbia
22 | dm mark jerman
23 | shp pounds saint helena
24 | php peso philipina
25 | cad dollar canada
26 | ssp pounds sudan selatan
27 | scr rupee seychell
28 | mvr rufiyaa maldivia
29 | Rp rupiah
30 | r real
31 | $ dollar
32 | € euro
33 | £ pounds
34 | ₩ won
35 | ¥ yen
--------------------------------------------------------------------------------
/docs/reference/g2p.md:
--------------------------------------------------------------------------------
1 | # G2p
2 |
3 | ::: g2p_id.g2p.G2p
4 |
5 | ## Usage
6 |
 7 | ```py
 8 | from g2p_id import G2p
 9 |
10 | texts = [
11 |     "Apel itu berwarna merah.",
12 |     "Rahel bersekolah di Jakarta.",
13 |     "Mereka sedang bermain bola di lapangan.",
14 | ]
15 | g2p = G2p(model_type="BERT")
16 | for text in texts:
17 |     print(g2p(text))
18 | ```
19 |
20 | ```py
21 | >> [['a', 'p', 'ə', 'l'], ['i', 't', 'u'], ['b', 'ə', 'r', 'w', 'a', 'r', 'n', 'a'], ['m', 'e', 'r', 'a', 'h'], ['.']]
22 | >> [['r', 'a', 'h', 'e', 'l'], ['b', 'ə', 'r', 's', 'ə', 'k', 'o', 'l', 'a', 'h'], ['d', 'i'], ['dʒ', 'a', 'k', 'a', 'r', 't', 'a'], ['.']]
23 | >> [['m', 'ə', 'r', 'e', 'k', 'a'], ['s', 'ə', 'd', 'a', 'ŋ'], ['b', 'ə', 'r', 'm', 'a', 'i', 'n'], ['b', 'o', 'l', 'a'], ['d', 'i'], ['l', 'a', 'p', 'a', 'ŋ', 'a', 'n'], ['.']]
24 | ```
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
 1 | name: Tests
 2 |
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - main
 7 |   pull_request:
 8 |     branches:
 9 |       - main
10 | jobs:
11 |   test:
12 |     runs-on: ${{ matrix.os }}
13 |     strategy:
14 |       matrix:
15 |         os: [ubuntu-latest, windows-latest]
16 |         python-version: ["3.8", "3.9"]
17 |
18 |     steps:
19 |       - uses: actions/checkout@v2
20 |       - name: Set up Python ${{ matrix.python-version }}
21 |         uses: actions/setup-python@v2
22 |         with:
23 |           python-version: ${{ matrix.python-version }}
24 |       - name: Install dependencies
25 |         run: |
26 |           python -m pip install --upgrade pip
27 |           pip install tox tox-gh-actions
28 |       - name: Install package
29 |         run: pip install .
30 |       - name: Test with tox
31 |         run: tox
32 |       - name: Upload coverage reports to Codecov
33 |         uses: codecov/codecov-action@v3
34 |
--------------------------------------------------------------------------------
/g2p_id/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright 2023 [PT BOOKBOT INDONESIA](https://bookbot.id/)
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | """
16 |
17 | from .bert import BERT
18 | from .g2p import G2p
19 | from .lstm import LSTM
20 | from .onnx_utils import WrapInferenceSession
21 | from .text_processor import TextProcessor
22 |
23 | __version__ = "0.4.2"
24 | __all__ = ["G2p", "LSTM", "BERT", "WrapInferenceSession", "TextProcessor"]
25 |
--------------------------------------------------------------------------------
/tox.ini:
--------------------------------------------------------------------------------
 1 | [tox]
 2 | minversion = 3.8.0
 3 | envlist = python3.8, python3.9, flake8, mypy
 4 | isolated_build = true
 5 |
 6 | [gh-actions]
 7 | python =
 8 |     3.8: python3.8, flake8, mypy, pylint
 9 |     3.9: python3.9, flake8, mypy, pylint
10 |
11 | [testenv]
12 | setenv =
13 |     PYTHONPATH = {toxinidir}
14 | deps =
15 |     -r{toxinidir}/requirements.txt
16 |     -r{toxinidir}/requirements_test.txt
17 | commands =
18 |     coverage erase
19 |     coverage run --branch -m pytest
20 |     coverage report
21 |     coverage xml -i -o coverage.xml
22 |     flake8 g2p_id tests
23 |     mypy g2p_id --ignore-missing-imports
24 |     pylint --rcfile=tox.ini g2p_id
25 |
26 | [flake8]
27 | extend-ignore = E203
28 | max-line-length = 120
29 |
30 | [pylint]
31 | ; R0902: Too many instance attributes
32 | ; R0903: Too few public methods
33 | ; R0914: Too many local variables
34 | disable =
35 |     R0902,
36 |     R0903,
37 |     R0914
38 | max-line-length = 120
39 |
40 | [coverage:run]
41 | source = g2p_id
42 |
43 | [coverage:report]
44 | exclude_lines =
45 |     except
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 | from pathlib import Path
3 |
4 | this_path = Path(__file__).parent
5 |
6 | readme_path = this_path / "README.md"
7 | requirements_path = this_path / "requirements.txt"
8 |
9 | long_description = readme_path.read_text(encoding="utf-8")
10 |
11 | with open(requirements_path, "r", encoding="utf-8") as requirements_file:
12 |     requirements = requirements_file.read().splitlines()
13 |
14 | if __name__ == "__main__":
15 |     setup(
16 |         name="g2p_id_py",
17 |         version="0.4.2",
18 |         description="Indonesian G2P.",
19 |         long_description=long_description,
20 |         long_description_content_type="text/markdown",
21 |         author="w11wo",
22 |         author_email="wilson@bookbotkids.com",
23 |         url="https://github.com/bookbot-kids/g2p_id",
24 |         license="Apache License",
25 |         packages=find_packages(),
26 |         install_requires=requirements,
27 |         include_package_data=True,
28 |         platforms=["linux", "unix", "windows"],
29 |         python_requires=">=3.8",
30 |     )
31 |
--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
 1 | site_name: g2p ID
 2 | repo_url: https://github.com/bookbot-kids/g2p_id
 3 | docs_dir: docs
 4 |
 5 | theme:
 6 |   name: material
 7 |   palette:
 8 |     - media: "(prefers-color-scheme: light)"
 9 |       scheme: default
10 |       primary: indigo
11 |       accent: indigo
12 |       toggle:
13 |         icon: material/weather-night
14 |         name: Switch to dark mode
15 |     - media: "(prefers-color-scheme: dark)"
16 |       scheme: slate
17 |       primary: red
18 |       accent: red
19 |       toggle:
20 |         icon: material/weather-sunny
21 |         name: Switch to light mode
22 |   features:
23 |     - navigation.sections
24 |
25 | plugins:
26 |   - search
27 |   - mkdocstrings:
28 |       handlers:
29 |         python:
30 |           options:
31 |             show_source: true
32 |             show_root_heading: true
33 |             heading_level: 2
34 |
35 | markdown_extensions:
36 |   - tables
37 |   - pymdownx.highlight:
38 |       anchor_linenums: true
39 |   - pymdownx.inlinehilite
40 |   - pymdownx.snippets
41 |   - pymdownx.superfences
42 |   - def_list
43 |   - pymdownx.tasklist:
44 |       custom_checkbox: true
45 |
46 | watch:
47 |   - g2p_id
48 |
--------------------------------------------------------------------------------
/tests/test_text_processor.py:
--------------------------------------------------------------------------------
1 | def test_text_processor(text_processor):
 2 |     # URLs
 3 |     assert text_processor.normalize("Situs: https://www.google.com") == "Situs: "
 4 |     # measurements
 5 |     assert (
 6 |         text_processor.normalize("123,1 kg")
 7 |         == "seratus dua puluh tiga koma satu kilogram"
 8 |     )
 9 |     assert text_processor.normalize("500 cm") == "lima ratus centimeter"
10 |     # currency/money
11 |     assert text_processor.normalize("$100") == "seratus dollar"
12 |     assert text_processor.normalize("Rp 3,000,000") == "tiga juta rupiah"
13 |     # dates
14 |     assert (
15 |         text_processor.normalize("(17/8/1945)").strip()
16 |         == "tujuh belas Agustus seribu sembilan ratus empat puluh lima"
17 |     )
18 |     assert text_processor.normalize("(1/13)").strip() == "satu Januari"
19 |     # time/time zone
20 |     assert (
21 |         text_processor.normalize("19.45 WIB")
22 |         == "sembilan belas lewat empat puluh lima menit Waktu Indonesia Barat"
23 |     )
24 |     assert (
25 |         text_processor.normalize("19.00 WIB") == "sembilan belas Waktu Indonesia Barat"
26 |     )
27 |     # numerics
28 |     assert text_processor.normalize("105.000") == "seratus lima ribu"
29 |     assert text_processor.normalize("0,5") == "nol koma lima"
30 |
--------------------------------------------------------------------------------
/PROJECT_CHARTER.md:
--------------------------------------------------------------------------------
1 | # Project Charter
2 |
3 | ## Vision statement
4 | Literacy is fundamental, not only for our personal and social development, but also for our ability to function effectively in society. Our vision at Bookbot is that every child should have the opportunity to develop their reading, writing and communication skills to create a happy and successful life.
5 |
6 |
7 | ## Mission statement
8 | Deliver the Bookbot app, which combines speech recognition with a scientifically designed reading program, to help school children achieve greater literacy and to provide educators with better tools to monitor a child’s reading progress.
9 |
10 |
11 | ## Community (Impact) statement
12 |
13 | Bookbot is founded on the grounds of building a community of learners. Members of the Bookbot community consist of software developers, educators, students, writers, editors, linguists, people with disabilities, and more. We exist to ensure that every child, regardless of their situation, is able to develop their literacy skills.
14 |
15 |
16 | ## Licensing strategy
17 | Open source (Creative Commons), with a reseller model for the app. Parts of the code are licensed under Apache 2.0.
18 |
19 | ## Identification of key trademarks
20 | No key trademarks
21 |
--------------------------------------------------------------------------------
/g2p_id/onnx_utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright 2023 [PT BOOKBOT INDONESIA](https://bookbot.id/)
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | """
16 |
17 | import onnxruntime as ort
18 |
19 |
20 | class WrapInferenceSession:
21 |     """Wrapper class for serializing ONNX InferenceSession objects.
22 |     Based on: https://github.com/microsoft/onnxruntime/pull/800#issuecomment-844326099
23 |     """
24 |
25 |     def __init__(self, onnx_bytes, sess_options=None, providers=None):
26 |         self.sess = ort.InferenceSession(onnx_bytes, sess_options=sess_options, providers=providers)
27 |         self.onnx_bytes = onnx_bytes
28 |         self.providers = providers
29 |
30 |     def run(self, *args):
31 |         """Wrapper for ONNX InferenceSession run method.
32 |
33 |         Returns:
34 |             Any: Inference result.
35 |         """
36 |         return self.sess.run(*args)
37 |
38 |     def __getstate__(self):
39 |         return {"onnx_bytes": self.onnx_bytes}
40 |
41 |     def __setstate__(self, values):
42 |         self.onnx_bytes = values["onnx_bytes"]
43 |         self.providers = values.get("providers", None)
44 |         self.sess = ort.InferenceSession(self.onnx_bytes, providers=self.providers)
45 |
--------------------------------------------------------------------------------
/g2p_id/resources/measurements.tsv:
--------------------------------------------------------------------------------
1 | sq mi mil kuadrat
2 | sq ft kaki kuadrat
3 | kbps kilobit per detik
4 | mbps megabit per detik
5 | kcal kilo kalori
6 | ghz gigahertz
7 | khz kilohertz
8 | mhz megahertz
9 | lbs pound
10 | rpm revolution per menit
11 | kwh kilo watt jam
12 | min menit
13 | mph mil per jam
14 | mol mol
15 | gpa giga pascal
16 | km² kilometer kuadrat
17 | km2 kilometer kuadrat
18 | rad radian
19 | kgf kilogram force
20 | mm² millimeter kuadrat
21 | mm2 millimeter kuadrat
22 | cm² centimeter kuadrat
23 | cm2 centimeter kuadrat
24 | dm³ desimeter kubik
25 | dm3 desimeter kubik
26 | amu atomic mass unit
27 | gwh giga watt jam
28 | kpa kilopascal
29 | cwt hundredweight
30 | atm atmosphere
31 | bar bar
32 | km kilometer
33 | cm centimeter
34 | mm millimeter
35 | ha hectare
36 | mi mil
37 | m² meter kuadrat
38 | m2 meter kuadrat
39 | ft kaki
40 | hz hertz
41 | kw kilowatt
42 | hp tenaga kuda
43 | mg milligram
44 | kg kilogram
45 | lb pound
46 | mc mega coulomb
47 | nm nanometer
48 | mA milli ampere
49 | m³ meter kubik
50 | m3 meter kubik
51 | tw tera watt
52 | mv milli volt
53 | mw megawatt
54 | μm mikrometer
55 | " inch
56 | TB terabyte
57 | cc c c
58 | da dalton
59 | db desibel
60 | ps peta detik
61 | oz ounce
62 | hl hecto liter
63 | μg mikrogram
64 | pg petagram
65 | GB gigabyte
66 | kb kilobit
67 | ev electron volt
68 | MB megabyte
69 | KB kilobyte
70 | kl kilo liter
71 | tj tera joule
72 | kv kilo volt
73 | mv mega volt
74 | kn kilonewton
75 | mm megameter
76 | au astronomical unit
77 | yd yard
78 | lm lumen
79 | hs hecto detik
80 | ml milliliter
81 | gw gigawatt
82 | ma mega ampere
83 | kt knot
84 | ng nano gram
85 | ns nano detik
86 | ms mega siemens
87 | gl giga liter
88 | μs mikro detik
89 | da desi ampere
90 | pa pascal
91 | ds desi detik
92 | ms milli detik
93 | dm desimeter
94 | mb megabit
95 | mf mega farad
96 | bq becquerel
97 | pb petabit
98 | cd candela
99 | tl tera liter
100 | ms mega detik
101 | mpa megapascal
102 | pb peta byte
103 | gy gray
104 | sv sievert
105 | cc c c
106 | °F derajat fahrenheit
107 | °f derajat fahrenheit
108 | °C derajat celsius
109 | °c derajat celsius
110 | m meter
111 | % percent
112 | v volt
113 | h jam
114 | g gram
115 | s detik
116 | ω ohm
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 |
3 | # Byte-compiled / optimized / DLL files
4 | __pycache__/
5 | *.py[cod]
6 | *$py.class
7 |
8 | # C extensions
9 | *.so
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | pip-wheel-metadata/
26 | share/python-wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 |
32 | # PyInstaller
33 | # Usually these files are written by a python script from a template
34 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
35 | *.manifest
36 | *.spec
37 |
38 | # Installer logs
39 | pip-log.txt
40 | pip-delete-this-directory.txt
41 |
42 | # Unit test / coverage reports
43 | htmlcov/
44 | .tox/
45 | .nox/
46 | .coverage
47 | .coverage.*
48 | .cache
49 | nosetests.xml
50 | coverage.xml
51 | *.cover
52 | *.py,cover
53 | .hypothesis/
54 | .pytest_cache/
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 | local_settings.py
63 | db.sqlite3
64 | db.sqlite3-journal
65 |
66 | # Flask stuff:
67 | instance/
68 | .webassets-cache
69 |
70 | # Scrapy stuff:
71 | .scrapy
72 |
73 | # Sphinx documentation
74 | docs/_build/
75 |
76 | # PyBuilder
77 | target/
78 |
79 | # Jupyter Notebook
80 | .ipynb_checkpoints
81 |
82 | # IPython
83 | profile_default/
84 | ipython_config.py
85 |
86 | # pyenv
87 | .python-version
88 |
89 | # pipenv
90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
93 | # install all needed dependencies.
94 | #Pipfile.lock
95 |
96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
97 | __pypackages__/
98 |
99 | # Celery stuff
100 | celerybeat-schedule
101 | celerybeat.pid
102 |
103 | # SageMath parsed files
104 | *.sage.py
105 |
106 | # Environments
107 | .env
108 | .venv
109 | env/
110 | venv/
111 | ENV/
112 | env.bak/
113 | venv.bak/
114 |
115 | # Spyder project settings
116 | .spyderproject
117 | .spyproject
118 |
119 | # Rope project settings
120 | .ropeproject
121 |
122 | # mkdocs documentation
123 | /site
124 |
125 | # mypy
126 | .mypy_cache/
127 | .dmypy.json
128 | dmypy.json
129 |
130 | # Pyre type checker
131 | .pyre/
132 |
--------------------------------------------------------------------------------
/docs/contributing.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 | Hi there! Thanks for taking the time to contribute!
3 |
4 | We welcome everyone to contribute and we value each contribution, even the smallest ones! We want to make contributing to this project as easy and transparent as possible, whether it's:
5 |
6 | - Reporting a bug
7 | - Discussing the current state of the code
8 | - Submitting a fix
9 | - Proposing new features
10 | - Becoming a maintainer
11 |
12 | ## Code of Conduct
13 |
14 | Please be mindful to respect our [Code of Conduct](https://github.com/bookbot-kids/g2p_id/blob/main/CODE_OF_CONDUCT.md).
15 |
16 | ## We Develop with GitHub
17 | We use GitHub to host code, track issues and feature requests, and accept pull requests.
18 |
19 | ## We Use GitHub, So All Code Changes Happen Through Pull Requests
20 | Pull requests are the best way to propose changes to the codebase. We actively welcome your pull requests:
21 |
22 | 1. Fork the repo and create your branch from `main`.
23 | 2. If you've added code that should be tested, add tests.
24 | 3. If you've changed APIs, update the documentation.
25 | 4. Ensure the test suite passes.
26 | 5. Make sure your code lints.
27 | 6. Issue that pull request!
28 |
29 | ## Any contributions you make will be under the Apache 2.0 License
30 | In short, when you submit code changes, your submissions are understood to be under the same [Apache 2.0 License](https://www.apache.org/licenses/LICENSE-2.0) that covers the project. Feel free to contact the maintainers if that's a concern.
31 |
32 | ## Report bugs using Github's [issues](https://github.com/bookbot-kids/g2p_id/issues)
33 | We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/bookbot-kids/g2p_id/issues/new).
34 |
35 | ## Write bug reports with detail, background, and sample code
36 | [This is an example](http://stackoverflow.com/q/12488905/180626) of a good and thorough bug report.
37 |
38 | **Great Bug Reports** tend to have:
39 |
40 | - A quick summary and/or background
41 | - Steps to reproduce
42 | - Be specific!
43 | - Give sample code if you can.
44 | - What you expected would happen
45 | - What actually happens
46 | - Notes (possibly including why you think this might be happening, or stuff you tried that didn't work)
47 |
48 | ## License
49 | By contributing, you agree that your contributions will be licensed under the project's Apache 2.0 License.
50 |
51 | ## References
52 | This document was adapted from the open-source contribution guidelines for [Facebook's Draft](https://github.com/facebook/draft-js/blob/a9316a723f9e918afde44dea68b5f9f39b7d9b00/CONTRIBUTING.md).
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to g2p ID
2 | Hi there! Thanks for taking the time to contribute!
3 |
4 | We welcome everyone to contribute and we value each contribution, even the smallest ones! We want to make contributing to this project as easy and transparent as possible, whether it's:
5 |
6 | - Reporting a bug
7 | - Discussing the current state of the code
8 | - Submitting a fix
9 | - Proposing new features
10 | - Becoming a maintainer
11 |
12 | ## Code of Conduct
13 |
14 | Please be mindful to respect our [Code of Conduct](https://github.com/bookbot-kids/g2p_id/blob/main/CODE_OF_CONDUCT.md).
15 |
16 | ## We Develop with GitHub
17 | We use GitHub to host code, track issues and feature requests, and accept pull requests.
18 |
19 | ## We Use GitHub, So All Code Changes Happen Through Pull Requests
20 | Pull requests are the best way to propose changes to the codebase. We actively welcome your pull requests:
21 |
22 | 1. Fork the repo and create your branch from `main`.
23 | 2. If you've added code that should be tested, add tests.
24 | 3. If you've changed APIs, update the documentation.
25 | 4. Ensure the test suite passes.
26 | 5. Make sure your code lints.
27 | 6. Issue that pull request!
28 |
29 | ## Any contributions you make will be under the Apache 2.0 License
30 | In short, when you submit code changes, your submissions are understood to be under the same [Apache 2.0 License](https://www.apache.org/licenses/LICENSE-2.0) that covers the project. Feel free to contact the maintainers if that's a concern.
31 |
32 | ## Report bugs using Github's [issues](https://github.com/bookbot-kids/g2p_id/issues)
33 | We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/bookbot-kids/g2p_id/issues/new).
34 |
35 | ## Write bug reports with detail, background, and sample code
36 | [This is an example](http://stackoverflow.com/q/12488905/180626) of a good and thorough bug report.
37 |
38 | **Great Bug Reports** tend to have:
39 |
40 | - A quick summary and/or background
41 | - Steps to reproduce
42 | - Be specific!
43 | - Give sample code if you can.
44 | - What you expected would happen
45 | - What actually happens
46 | - Notes (possibly including why you think this might be happening, or stuff you tried that didn't work)
47 |
48 | ## License
49 | By contributing, you agree that your contributions will be licensed under the project's Apache 2.0 License.
50 |
51 | ## References
52 | This document was adapted from the open-source contribution guidelines for [Facebook's Draft](https://github.com/facebook/draft-js/blob/a9316a723f9e918afde44dea68b5f9f39b7d9b00/CONTRIBUTING.md).
--------------------------------------------------------------------------------
/g2p_id/bert.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright 2023 [PT BOOKBOT INDONESIA](https://bookbot.id/)
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | """
16 |
17 | import json
18 | import os
19 |
20 | import numpy as np
21 | import onnxruntime
22 |
23 | from g2p_id.onnx_utils import WrapInferenceSession
24 |
25 | model_path = os.path.join(os.path.dirname(__file__), "models", "bert")
26 |
27 |
28 | class BERT:
29 |     """Phoneme-level BERT model for predicting the correct phoneme for the letter `e`.
30 |     Trained with [Keras](https://keras.io/examples/nlp/masked_language_modeling/),
31 |     and exported to ONNX. ONNX Runtime engine used during inference.
32 |     """
33 |
34 |     def __init__(self):
35 |         bert_model_path = os.path.join(model_path, "bert_mlm.onnx")
36 |         token2id = os.path.join(model_path, "token2id.json")
37 |         config_path = os.path.join(model_path, "config.json")
38 |         self.model = WrapInferenceSession(bert_model_path, providers=onnxruntime.get_available_providers())
39 |         with open(config_path, encoding="utf-8") as file:
40 |             self.config = json.load(file)
41 |         with open(token2id, encoding="utf-8") as file:
42 |             self.token2id = json.load(file)
43 |         self.id2token = {v: k for k, v in self.token2id.items()}
44 |
45 |     def predict(self, text: str) -> str:
46 |         """Performs BERT inference, predicting the correct phoneme for the letter `e`.
47 |
48 |         Args:
49 |             text (str): Word to predict from.
50 |
51 |         Returns:
52 |             str: Word after prediction.
53 |         """
54 |         # `x` is currently OOV; we replace it with `ks`
55 |         text = text.replace("x", "ks")
56 |         # mask `e`'s
57 |         text = " ".join([c if c != "e" else "[mask]" for c in text])
58 |
59 |         # tokenize and pad to max length
60 |         tokens = [self.token2id[c] for c in text.split()]
61 |         padding = [self.token2id[self.config["pad_token"]] for _ in range(self.config["max_seq_length"] - len(tokens))]
62 |         tokens = tokens + padding
63 |
64 |         input_ids = np.array([tokens], dtype="int64")
65 |         inputs = {"input_1": input_ids}
66 |         prediction = self.model.run(None, inputs)
67 |
68 |         # find masked idx token
69 |         mask_token_id = self.token2id[self.config["mask_token"]]
70 |         masked_index = np.where(input_ids == mask_token_id)[1]
71 |
72 |         # get prediction at masked indices
73 |         mask_prediction = prediction[0][0][masked_index]
74 |         predicted_ids = np.argmax(mask_prediction, axis=1)
75 |
76 |         # replace mask with predicted token
77 |         for i, idx in enumerate(masked_index):
78 |             tokens[idx] = predicted_ids[i]
79 |
80 |         return "".join([self.id2token[t] for t in tokens if t != 0])
81 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # Home
2 |
3 | ## g2p ID: Indonesian Grapheme-to-Phoneme Converter
4 |
32 | This library is developed to convert Indonesian (Bahasa Indonesia) graphemes (words) to phonemes in IPA. We followed the methods and designs used in the English equivalent library, [g2p](https://github.com/Kyubyong/g2p).
33 |
34 | ## Installation
35 |
36 | ```bash
37 | pip install g2p_id_py
38 | ```
39 |
40 | ## How to Use
41 |
42 | ```py
43 | from g2p_id import G2p
44 |
45 | texts = [
46 |     "Apel itu berwarna merah.",
47 |     "Rahel bersekolah di Jakarta.",
48 |     "Mereka sedang bermain bola di lapangan.",
49 | ]
50 |
51 | g2p = G2p()
52 | for text in texts:
53 |     print(g2p(text))
54 |
55 | >> [['a', 'p', 'ə', 'l'], ['i', 't', 'u'], ['b', 'ə', 'r', 'w', 'a', 'r', 'n', 'a'], ['m', 'e', 'r', 'a', 'h'], ['.']]
56 | >> [['r', 'a', 'h', 'e', 'l'], ['b', 'ə', 'r', 's', 'ə', 'k', 'o', 'l', 'a', 'h'], ['d', 'i'], ['dʒ', 'a', 'k', 'a', 'r', 't', 'a'], ['.']]
57 | >> [['m', 'ə', 'r', 'e', 'k', 'a'], ['s', 'ə', 'd', 'a', 'ŋ'], ['b', 'ə', 'r', 'm', 'a', 'i', 'n'], ['b', 'o', 'l', 'a'], ['d', 'i'], ['l', 'a', 'p', 'a', 'ŋ', 'a', 'n'], ['.']]
58 | ```
59 |
60 | ## References
61 |
62 | ```bib
63 | @misc{g2pE2019,
64 |   author = {Park, Kyubyong and Kim, Jongseok},
65 |   title = {g2pE},
66 |   year = {2019},
67 |   publisher = {GitHub},
68 |   journal = {GitHub repository},
69 |   howpublished = {\url{https://github.com/Kyubyong/g2p}}
70 | }
71 | ```
72 |
73 | ```bib
74 | @misc{TextProcessor2021,
75 |   author = {Cahya Wirawan},
76 |   title = {Text Processor},
77 |   year = {2021},
78 |   publisher = {GitHub},
79 |   journal = {GitHub repository},
80 |   howpublished = {\url{https://github.com/cahya-wirawan/text_processor}}
81 | }
82 | ```
83 |
84 | ## Contributors
85 |
--------------------------------------------------------------------------------
/tests/test_g2p.py:
--------------------------------------------------------------------------------
 1 | def test_g2p(g2p):
 2 |     assert g2p("Apel itu berwarna merah.") == [
 3 |         ["a", "p", "ə", "l"],
 4 |         ["i", "t", "u"],
 5 |         ["b", "ə", "r", "w", "a", "r", "n", "a"],
 6 |         ["m", "e", "r", "a", "h"],
 7 |         ["."],
 8 |     ]
 9 |     assert g2p("Rahel bersekolah di S M A Jakarta 17.") == [
10 |         ["r", "a", "h", "e", "l"],
11 |         ["b", "ə", "r", "s", "ə", "k", "o", "l", "a", "h"],
12 |         ["d", "i"],
13 |         ["e", "s"],
14 |         ["e", "m"],
15 |         ["a"],
16 |         ["dʒ", "a", "k", "a", "r", "t", "a"],
17 |         ["t", "u", "dʒ", "u", "h"],
18 |         ["b", "ə", "l", "a", "s"],
19 |         ["."],
20 |     ]
21 |     assert g2p("Mereka sedang bermain bola di lapangan.") == [
22 |         ["m", "ə", "r", "e", "k", "a"],
23 |         ["s", "ə", "d", "a", "ŋ"],
24 |         ["b", "ə", "r", "m", "a", "ʔ", "i", "n"],
25 |         ["b", "o", "l", "a"],
26 |         ["d", "i"],
27 |         ["l", "a", "p", "a", "ŋ", "a", "n"],
28 |         ["."],
29 |     ]
30 |     assert g2p("Ini rumahnya Aisyah dan Ceri.") == [
31 |         ["i", "n", "i"],
32 |         ["r", "u", "m", "a", "h", "ɲ", "a"],
33 |         ["a", "ʔ", "i", "ʃ", "a", "h"],
34 |         ["d", "a", "n"],
35 |         ["tʃ", "e", "r", "i"],
36 |         ["."],
37 |     ]
38 |     assert g2p("keset selamat datang") == [
39 |         ["k", "e", "s", "e", "t"],
40 |         ["s", "ə", "l", "a", "m", "a", "t"],
41 |         ["d", "a", "t", "a", "ŋ"],
42 |     ]
43 |     assert g2p("kakak layak") == [["k", "a", "k", "a", "k"], ["l", "a", "j", "a", "k"]]
44 |
45 |
46 | def test_rule_based_g2p(g2p):
47 |     assert g2p._rule_based_g2p("berakhirnya") == "b e r a x i r ɲ a"
48 |     assert g2p._rule_based_g2p("bermaaf-maafan") == "b e r m a ʔ a f - m a ʔ a f a n"
49 |     assert g2p._rule_based_g2p("kecolongan") == "k e tʃ o l o ŋ a n"
50 |     assert g2p._rule_based_g2p("jayapura") == "dʒ a j a p u r a"
51 |     assert g2p._rule_based_g2p("xenon") == "s e n o n"
52 |     assert g2p._rule_based_g2p("layak") == "l a j a k"
53 |
54 |
55 | def test_lstm(lstm):
56 |     assert lstm.predict("mengembangkannya") == "məŋəmbaŋkanɲa"
57 |     assert lstm.predict("merdeka") == "mərdeka"
58 |     assert lstm.predict("pecel") == "pətʃəl"
59 |     assert lstm.predict("lele") == "lele"
60 |
61 |
62 | def test_bert(bert):
63 |     assert bert.predict("mengembangkannya") == "məngəmbangkannya"
64 |     assert bert.predict("merdeka") == "mərdeka"
65 |     assert bert.predict("pecel") == "pəcel"
66 |     assert bert.predict("lele") == "lele"
67 |     assert bert.predict("banyak") == "banyak"
68 |
69 |
70 | def test_ps(g2p):
71 |     assert g2p("psikologi") == [["s", "i", "k", "o", "l", "o", "ɡ", "i"]]
72 |     assert g2p("psikometri") == [["s", "i", "k", "o", "m", "e", "t", "r", "i"]]
73 |     assert g2p("psikotes") == [["s", "i", "k", "o", "t", "e", "s"]]
74 |
75 |
76 | def test_sticking_dot(g2p):
77 |     assert g2p("Seniornya Brigadir Jendral A.Yani mengambil alih pimpinan.") == [
78 |         ["s", "ə", "n", "i", "ʔ", "o", "r", "ɲ", "a"],
79 |         ["b", "r", "i", "ɡ", "a", "d", "i", "r"],
80 |         ["dʒ", "ə", "n", "d", "r", "a", "l"],
81 |         ["a"],
82 |         ["j", "a", "n", "i"],
83 |         ["m", "ə", "ŋ", "a", "m", "b", "i", "l"],
84 |         ["a", "l", "i", "h"],
85 |         ["p", "i", "m", "p", "i", "n", "a", "n"],
86 |         ["."],
87 |     ]
88 |
89 |
90 | def test_onnx_wrapper(bert):
91 |     assert bert.predict("mengembangkannya") == "məngəmbangkannya"
92 |     model_state = bert.model.__getstate__()
93 |     bert.model.__setstate__(model_state)
94 |     assert bert.predict("mengembangkannya") == "məngəmbangkannya"
95 |
--------------------------------------------------------------------------------
/g2p_id/resources/homographs_id.tsv:
--------------------------------------------------------------------------------
1 | angel a ŋ e l a ŋ ə l A A
2 | apel a p ə l a p e l N V
3 | begar b e ɡ a r b ə ɡ a r V A
4 | begu b e ɡ u b ə ɡ u N N
5 | bekel b e k ə l b e k ə l N N
6 | belek b ə l e ʔ b e l e ʔ V N
7 | belok b e l o ʔ b ə l o ʔ V A
8 | bena b e n a b ə n a A N
9 | berak b e r a k b ə r a k V A
10 | berang b e r a ŋ b ə r a ŋ A N
11 | berok b e r o ʔ b ə r o ʔ N N
12 | berpendar b ə r p e n d a r b ə r p ə n d a r V V
13 | berseri b ə r s ə r i b ə r s e r i V V
14 | boreh b o r e h b o r ə h N N
15 | cegak tʃ e ɡ a ʔ tʃ ə ɡ a ʔ A A
16 | cela tʃ ə l a tʃ e l a N N
17 | celak tʃ e l a ʔ tʃ ə l a ʔ N N
18 | cetok tʃ e t o ʔ tʃ ə t o ʔ N N
19 | debut d e b u t d ə b u t N N
20 | dekan d e k a n d ə k a n N N
21 | dendang d e n d a ŋ d ə n d a ŋ N N
22 | depak d e p a ʔ d ə p a ʔ V M
23 | dera d e r a d ə r a N N
24 | embel e m b e l ə m b ə l N N
25 | erang ə r a ŋ e r a ŋ N A
26 | ganteng ɡ a n t ə ŋ ɡ a n t e ŋ A V
27 | gedek ɡ ə d ə ʔ ɡ ə d e ʔ N V
28 | gelang ɡ ə l a ŋ ɡ e l a ŋ N N
29 | genggang ɡ ə ŋ ɡ a ŋ ɡ e ŋ ɡ a ŋ N N
30 | helat h ə l a t h e l a t A N
31 | jejer dʒ e dʒ e r dʒ ə dʒ ə r V N
32 | jeli dʒ e l i dʒ ə l i N A
33 | kecap k e tʃ a p k ə tʃ a p N V
34 | keder k ə d e r k e d ə r N A
35 | kedi k e d i k ə d i N N
36 | kekel k e k e l k ə k ə l A N
37 | kelah k e l a h k ə l a h V N
38 | kelentang k ə l ə n t a ŋ k ə l e n t a ŋ N N
39 | kelenteng k ə l ə n t e ŋ k ə l e n t e ŋ N N
40 | kelepak k ə l ə p a ʔ k ə l e p a ʔ N A
41 | kelesa k ə l e s a k ə l ə s a A N
42 | kena k ə n a k e n a V N
43 | kepang k e p a ŋ k ə p a ŋ N N
44 | kepar k e p a r k ə p a r N N
45 | kere k e r e k ə r e A N
46 | keset k ə s ə t k e s e t A N
47 | ketek k e t e ʔ k ə t e ʔ N N
48 | ketel k e t e l k ə t ə l N A
49 | lebam l e b a m l ə b a m N A
50 | leding l e d i ŋ l ə d i ŋ N V
51 | legar l e ɡ a r l ə ɡ a r V N
52 | lembang l e m b a ŋ l ə m b a ŋ N A
53 | lempeng l e m p e ŋ l ə m p ə ŋ A A
54 | lenggang l e ŋ ɡ a ŋ l ə ŋ ɡ a ŋ N A
55 | letak l e t a ʔ l ə t a ʔ A N
56 | leter l e t e r l e t ə r A N
57 | mejan m e dʒ a n m ə dʒ a n N N
58 | memepet m ə m e p e t m ə m ə p e t V N
59 | memerah m ə m e r a h m ə m ə r a h V V
60 | mendera m ə n d e r a m ə n d e r a V A
61 | mental m e n t a l m ə n t a l V N
62 | pelak p e l a k p ə l a k N A
63 | pelang p e l a ŋ p ə l a ŋ N N
64 | pelat p ə l a t p e l a t N A
65 | pelekat p ə l ə k a t p ə l e k a t N N
66 | penggemblengan p ə ŋ ɡ ə m b l e ŋ a n p ə ŋ ɡ ə m b l ə ŋ a n N N
67 | pening p ə n i ŋ p e n i ŋ A N
68 | pentil p e n t i l p ə n t i l N N
69 | pepet p e p e t p ə p ə t V N
70 | per p e r p ə r N P
71 | rebak r e b a ʔ r ə b a ʔ A V
72 | relai r e l a i r ə l a i V N
73 | remah r e m a h r ə m a h N N
74 | rembes r e m b e s r e m b ə s A N
75 | samseng s a m s e ŋ s a m s ə ŋ N V
76 | seba s e b a s ə b a V V
77 | sebat s e b a t s ə b a t A V
78 | sedan s e d a n s ə d a n N N
79 | sela s ə l a s e l a N N
80 | selak s e l a ʔ s ə l a ʔ V V
81 | selempang s ə l e m p a ŋ s ə l ə m p a ŋ N A
82 | semen s e m ə n s e m e n N N
83 | semi s ə m i s e m i N A
84 | senggang s e ŋ ɡ a ŋ s ə ŋ ɡ a ŋ A A
85 | sengkang s e ŋ k a ŋ s ə ŋ k a ŋ N N
86 | sengkelat s ə ŋ k e l a t s ə ŋ k ə l a t A V
87 | sepak s e p a k s ə p a k N N
88 | serak s ə r a ʔ s e r a ʔ A V
89 | serang s ə r a ŋ s e r a ŋ V N
90 | seret s e r e t s ə r ə t V A
91 | seri s e r i s ə r i N A
92 | sertu s e r t u s ə r t u N V
93 | tekek t ə k e ʔ t e k e ʔ N V
94 | teken t ə k ə n t e k ə n V V
95 | tela t e l a t ə l a N N
96 | telan t e l a n t ə l a n N V
97 | teleng t e l e ŋ t ə l ə ŋ A N
98 | telor t ə l o r t e l o r N A
99 | tepak t e p a ʔ t ə p a ʔ N N
100 | tepok t ə p o ʔ t e p o ʔ V A
101 | terapi t e r a p i t ə r a p i N A
102 | teras t e r a s t ə r a s N N
103 |
--------------------------------------------------------------------------------
/g2p_id/lstm.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright 2023 [PT BOOKBOT INDONESIA](https://bookbot.id/)
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | """
16 |
17 | import json
18 | import os
19 |
20 | import numpy as np
21 | import onnxruntime
22 |
23 | from g2p_id.onnx_utils import WrapInferenceSession
24 |
25 | model_path = os.path.join(os.path.dirname(__file__), "models", "lstm")
26 |
27 |
28 | class LSTM:
29 |     """Phoneme-level LSTM model for sequence-to-sequence phonemization.
30 |     Trained with [Keras](https://keras.io/examples/nlp/lstm_seq2seq/),
31 |     and exported to ONNX. ONNX Runtime engine used during inference.
32 |     """
33 |
34 |     def __init__(self):
35 |         encoder_model_path = os.path.join(model_path, "encoder_model.onnx")
36 |         decoder_model_path = os.path.join(model_path, "decoder_model.onnx")
37 |         g2id_path = os.path.join(model_path, "g2id.json")
38 |         p2id_path = os.path.join(model_path, "p2id.json")
39 |         config_path = os.path.join(model_path, "config.json")
40 |         self.encoder = WrapInferenceSession(
41 |             encoder_model_path,
42 |             providers=onnxruntime.get_available_providers(),
43 |         )
44 |         self.decoder = WrapInferenceSession(
45 |             decoder_model_path,
46 |             providers=onnxruntime.get_available_providers(),
47 |         )
48 |         with open(g2id_path, encoding="utf-8") as file:
49 |             self.g2id = json.load(file)
50 |         with open(p2id_path, encoding="utf-8") as file:
51 |             self.p2id = json.load(file)
52 |         self.id2p = {v: k for k, v in self.p2id.items()}
53 |         with open(config_path, encoding="utf-8") as file:
54 |             self.config = json.load(file)
55 |
56 |     def predict(self, text: str) -> str:
57 |         """Performs LSTM inference, predicting phonemes of a given word.
58 |
59 |         Args:
60 |             text (str): Word to convert to phonemes.
61 |
62 |         Returns:
63 |             str: Word in phonemes.
64 |         """
65 |         input_seq = np.zeros(
66 |             (
67 |                 1,
68 |                 self.config["max_encoder_seq_length"],
69 |                 self.config["num_encoder_tokens"],
70 |             ),
71 |             dtype="float32",
72 |         )
73 |
74 |         for idx, char in enumerate(text):
75 |             input_seq[0, idx, self.g2id[char]] = 1.0
76 |         input_seq[0, len(text) :, self.g2id[self.config["pad_token"]]] = 1.0
77 |
78 |         encoder_inputs = {"input_1": input_seq}
79 |         states_value = self.encoder.run(None, encoder_inputs)
80 |
81 |         target_seq = np.zeros((1, 1, self.config["num_decoder_tokens"]), dtype="float32")
82 |         target_seq[0, 0, self.p2id[self.config["bos_token"]]] = 1.0
83 |
84 |         stop_condition = False
85 |         decoded_sentence = ""
86 |         while not stop_condition:
87 |             decoder_inputs = {
88 |                 "input_2": target_seq,
89 |                 "input_3": states_value[0],
90 |                 "input_4": states_value[1],
91 |             }
92 |             output_tokens, state_memory, state_carry = self.decoder.run(None, decoder_inputs)
93 |
94 |             sampled_token_index = np.argmax(output_tokens[0, -1, :])
95 |             sampled_char = self.id2p[sampled_token_index]
96 |             decoded_sentence += sampled_char
97 |
98 |             if (
99 |                 sampled_char == self.config["eos_token"]
100 |                 or len(decoded_sentence) > self.config["max_decoder_seq_length"]
101 |             ):
102 |                 stop_condition = True
103 |
104 |             target_seq = np.zeros((1, 1, self.config["num_decoder_tokens"]), dtype="float32")
105 |             target_seq[0, 0, sampled_token_index] = 1.0
106 |
107 |             states_value = [state_memory, state_carry]
108 |
109 |         return decoded_sentence.replace(self.config["eos_token"], "")
110 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # g2p ID: Indonesian Grapheme-to-Phoneme Converter
2 |
30 | This library is developed to convert Indonesian (Bahasa Indonesia) graphemes (words) to phonemes in IPA. We followed the methods and designs used in the English equivalent library, [g2p](https://github.com/Kyubyong/g2p).
31 |
32 | ## Installation
33 |
34 | ```bash
35 | pip install g2p_id_py
36 | ```
37 |
38 | ## How to Use
39 |
40 | ```py
41 | from g2p_id import G2p
42 |
43 | texts = [
44 |     "Apel itu berwarna merah.",
45 |     "Rahel bersekolah di S M A Jakarta 17.",
46 |     "Mereka sedang bermain bola di lapangan.",
47 | ]
48 |
49 | g2p = G2p()
50 | for text in texts:
51 |     print(g2p(text))
52 |
53 | >> [['a', 'p', 'ə', 'l'], ['i', 't', 'u'], ['b', 'ə', 'r', 'w', 'a', 'r', 'n', 'a'], ['m', 'e', 'r', 'a', 'h'], ['.']]
54 | >> [['r', 'a', 'h', 'e', 'l'], ['b', 'ə', 'r', 's', 'ə', 'k', 'o', 'l', 'a', 'h'], ['d', 'i'], ['e', 's'], ['e', 'm'], ['a'], ['dʒ', 'a', 'k', 'a', 'r', 't', 'a'], ['t', 'u', 'dʒ', 'u', 'h'], ['b', 'ə', 'l', 'a', 's'], ['.']]
55 | >> [['m', 'ə', 'r', 'e', 'k', 'a'], ['s', 'ə', 'd', 'a', 'ŋ'], ['b', 'ə', 'r', 'm', 'a', 'i', 'n'], ['b', 'o', 'l', 'a'], ['d', 'i'], ['l', 'a', 'p', 'a', 'ŋ', 'a', 'n'], ['.']]
56 | ```
57 |
58 | ## Algorithm
59 |
60 | This is heavily inspired by the English [g2p](https://github.com/Kyubyong/g2p).
61 |
62 | 1. Spells out Arabic numerals and some currency symbols, e.g. `Rp 200,000 -> dua ratus ribu rupiah`. This is borrowed from [Cahya's code](https://github.com/cahya-wirawan/text_processor).
63 | 2. Attempts to retrieve the correct pronunciation for homographs based on their [POS (part-of-speech) tags](#pos-tagging).
64 | 3. Looks up a lexicon (pronunciation dictionary) for non-homographs. This list is originally from [ipa-dict](https://github.com/open-dict-data/ipa-dict/blob/master/data/ma.txt), and we later made a modified version.
65 | 4. For OOVs, we predict their pronunciations using either a [BERT model](https://huggingface.co/bookbot/id-g2p-bert) or an [LSTM model](https://huggingface.co/bookbot/id-g2p-lstm) (see the sketch below).
66 |
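The OOV model in step 4 is selectable when constructing `G2p`. A minimal sketch of the switch; `model_type="BERT"` appears in the reference docs, while the `"LSTM"` value here is an assumption inferred from the class names:

```py
from g2p_id import G2p

g2p_bert = G2p(model_type="BERT")  # BERT-based OOV prediction (documented)
g2p_lstm = G2p(model_type="LSTM")  # assumed value selecting the LSTM model
print(g2p_bert("pecel"))
```
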
67 | ## Phoneme and Grapheme Sets
68 |
69 | ```python
70 | graphemes = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
71 | phonemes = ['a', 'b', 'd', 'e', 'f', 'ɡ', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'z', 'ŋ', 'ə', 'ɲ', 'tʃ', 'ʃ', 'dʒ', 'x', 'ʔ']
72 | ```
73 |
74 | ## Implementation Details
75 |
76 | You can find more details on how we handled homographs and out-of-vocabulary prediction on our [documentation](https://bookbot-kids.github.io/g2p_id/algorithm/) page.
77 |
78 | ## References
79 |
80 | ```bib
81 | @misc{g2pE2019,
82 |   author = {Park, Kyubyong and Kim, Jongseok},
83 |   title = {g2pE},
84 |   year = {2019},
85 |   publisher = {GitHub},
86 |   journal = {GitHub repository},
87 |   howpublished = {\url{https://github.com/Kyubyong/g2p}}
88 | }
89 | ```
90 |
91 | ```bib
92 | @misc{TextProcessor2021,
93 |   author = {Cahya Wirawan},
94 |   title = {Text Processor},
95 |   year = {2021},
96 |   publisher = {GitHub},
97 |   journal = {GitHub repository},
98 |   howpublished = {\url{https://github.com/cahya-wirawan/text_processor}}
99 | }
100 | ```
101 |
102 | ## Contributors
103 |
--------------------------------------------------------------------------------
/docs/algorithm.md:
--------------------------------------------------------------------------------
1 | # Algorithm
2 |
3 | This is heavily inspired by the English [g2p](https://github.com/Kyubyong/g2p).
4 |
5 | 1. Spells out Arabic numerals and some currency symbols, e.g. `Rp 200,000 -> dua ratus ribu rupiah`. This is borrowed from [Cahya's code](https://github.com/cahya-wirawan/text_processor).
6 | 2. Attempts to retrieve the correct pronunciation for homographs based on their [POS (part-of-speech) tags](#pos-tagging).
7 | 3. Looks up a lexicon (pronunciation dictionary) for non-homographs. This list is originally from [ipa-dict](https://github.com/open-dict-data/ipa-dict/blob/master/data/ma.txt), and we later made a modified version.
8 | 4. For OOVs, we predict their pronunciations using either a [BERT model](https://huggingface.co/bookbot/id-g2p-bert) or an [LSTM model](https://huggingface.co/bookbot/id-g2p-lstm).
9 |
10 | ## Phoneme and Grapheme Sets
11 |
12 | ```python
13 | graphemes = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
14 | phonemes = ['a', 'b', 'd', 'e', 'f', 'ɡ', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'z', 'ŋ', 'ə', 'ɲ', 'tʃ', 'ʃ', 'dʒ', 'x', 'ʔ']
15 | ```
16 |
17 | ## Homographs
18 |
19 | Indonesian words (as far as we know) have only one class of homographs: words that differ in how the letter `e` is pronounced. For instance, in the word `apel` (meaning: apple), the letter `e` is the mid central vowel `ə`. On the other hand, the letter `e` in the word `apel` (meaning: visiting a significant other's house; courting) is the close-mid front unrounded vowel `e`. A word may also contain more than one `e`, pronounced both ways; for instance, `mereka` (meaning: they) is pronounced `məreka`. Because of this, there needs to be a way to disambiguate homographs, and in our case, we used their POS (part-of-speech) tags. However, this is not a foolproof method, since homographs may share the same POS tag. We are considering a contextual model to handle this better.
20 |
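To make the disambiguation concrete, below is a minimal sketch of a POS-keyed lookup (illustrative only, not the library's internal code; the column layout is inferred from `homographs_id.tsv`):

```py
# Each row of homographs_id.tsv looks like:
# word <TAB> phonemes_1 <TAB> phonemes_2 <TAB> pos_1 <TAB> pos_2
homographs = {}
with open("g2p_id/resources/homographs_id.tsv", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        word, ph1, ph2, pos1, pos2 = line.rstrip("\n").split("\t")
        homographs[word] = {pos1: ph1, pos2: ph2}


def lookup(word, pos):
    """Pick the pronunciation matching the POS tag, else fall back to the first.
    Note: rows where both POS tags are equal collapse to a single entry here."""
    entry = homographs[word]
    return entry.get(pos, next(iter(entry.values())))


print(lookup("apel", "N"))  # a p ə l (apple)
print(lookup("apel", "V"))  # a p e l (courting)
```
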
21 | ## OOV Prediction
22 |
23 | Initially, we relied on a sequence-to-sequence LSTM model for OOV (out-of-vocabulary) prediction. This was a natural choice, given that it can "automatically" learn the rules of grapheme-to-phoneme conversion without having to determine the rules by hand. However, we soon noticed that despite its promising validation results, the model performed poorly on unseen words, especially longer ones. We needed a more controllable model that makes predictions only on the characters that require them. We ended up with a customized BERT that predicts the correct pronunciation of the letter `e` while keeping the rest of the string unchanged. We then apply a hand-written g2p conversion algorithm that handles the other characters.
24 |
25 | You can find more detail in [this blog post](https://wilsonwongso.dev/posts/2022/04/predicting-phonemes-with-bert/).
26 |
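The contrast between the two models is easy to see side by side: the LSTM emits a complete phoneme string, while BERT only resolves each `e` (to `e` or `ə`) and leaves the remaining graphemes for the rule-based step. The expected outputs below come straight from the test suite:

```py
from g2p_id import BERT, LSTM

lstm, bert = LSTM(), BERT()
print(lstm.predict("pecel"))  # pətʃəl -- full phoneme sequence
print(bert.predict("pecel"))  # pəcel  -- only `e` resolved; `c` stays a grapheme
```
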
27 | ## POS Tagging
28 |
29 | We trained an [NLTK PerceptronTagger](https://www.nltk.org/_modules/nltk/tag/perceptron.html) on the [POSP](https://huggingface.co/datasets/indonlu) dataset, which achieved F1-scores of 0.956 and 0.945 on the validation and test sets, respectively. Given its performance and speed, we adopted this model as the POS tagger for disambiguating homographs, mirroring the approach of the English g2p library.
30 |
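For illustration, loading and querying the bundled tagger could look like the sketch below (the pickle path is the real resource file, but this loading code is an assumption, not the library's internals):

```py
import pickle

# id_posp_tagger.pickle ships with the package under g2p_id/resources/
with open("g2p_id/resources/id_posp_tagger.pickle", "rb") as f:
    tagger = pickle.load(f)  # an nltk.tag.PerceptronTagger

# PerceptronTagger.tag maps a token list to (token, POS tag) pairs
print(tagger.tag(["Mereka", "sedang", "bermain", "bola"]))
```
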
31 | | tag | precision | recall | f1-score |
32 | | --------- | --------- | -------- | -------- |
33 | | B-$$$ | 1.000000 | 1.000000 | 1.000000 |
34 | | B-ADJ | 0.904132 | 0.864139 | 0.883683 |
35 | | B-ADK | 1.000000 | 0.986667 | 0.993289 |
36 | | B-ADV | 0.966874 | 0.976987 | 0.971904 |
37 | | B-ART | 0.988920 | 0.978082 | 0.983471 |
38 | | B-CCN | 0.997934 | 0.997934 | 0.997934 |
39 | | B-CSN | 0.986395 | 0.963455 | 0.974790 |
40 | | B-INT | 1.000000 | 1.000000 | 1.000000 |
41 | | B-KUA | 0.976744 | 0.976744 | 0.976744 |
42 | | B-NEG | 0.992857 | 0.972028 | 0.982332 |
43 | | B-NNO | 0.919917 | 0.941288 | 0.930480 |
44 | | B-NNP | 0.917685 | 0.914703 | 0.916192 |
45 | | B-NUM | 0.997358 | 0.954488 | 0.975452 |
46 | | B-PAR | 1.000000 | 0.851064 | 0.919540 |
47 | | B-PPO | 0.991206 | 0.991829 | 0.991517 |
48 | | B-PRI | 1.000000 | 0.928571 | 0.962963 |
49 | | B-PRK | 0.793103 | 0.851852 | 0.821429 |
50 | | B-PRN | 0.988327 | 0.988327 | 0.988327 |
51 | | B-PRR | 0.995465 | 1.000000 | 0.997727 |
52 | | B-SYM | 0.999662 | 0.999323 | 0.999492 |
53 | | B-UNS | 0.916667 | 0.733333 | 0.814815 |
54 | | B-VBE | 1.000000 | 0.985714 | 0.992806 |
55 | | B-VBI | 0.929119 | 0.877034 | 0.902326 |
56 | | B-VBL | 1.000000 | 1.000000 | 1.000000 |
57 | | B-VBP | 0.926606 | 0.933457 | 0.930018 |
58 | | B-VBT | 0.939759 | 0.953333 | 0.946498 |
59 | | --------- | --------- | -------- | -------- |
60 | | macro avg | 0.966490 | 0.946937 | 0.955913 |
61 |
62 | ## Attempts that Failed
63 |
64 | - Parsed the [online PDF KBBI](https://oldi.lipi.go.id/public/Kamus%20Indonesia.pdf), but it turns out to contain very few phoneme descriptions.
65 | - Scraped the [online Web KBBI](https://github.com/laymonage/kbbi-python), but its daily bandwidth quota was too low for use at this scale.
66 |
67 | ## Potential Improvements
68 |
69 | There is plenty of room for improvement, on both the technical and the linguistic side of the approach. Note that a failure in one component may cascade into an incorrect final result: an incorrect POS tag can lead to the wrong phoneme, and so can an incorrect OOV prediction. We propose the following future improvements.
70 |
71 | - [ ] Use a larger pronunciation lexicon instead of having to guess.
72 | - [x] Find a larger homograph list.
73 | - [x] Use a contextual model instead of character-level RNNs.
74 | - [x] Consider hand-written rules for g2p conversion.
75 | - [x] Add to PyPI.
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 |
2 | # Contributor Covenant Code of Conduct
3 |
4 | ## Our Pledge
5 |
6 | We as members, contributors, and leaders pledge to make participation in our
7 | community a harassment-free experience for everyone, regardless of age, body
8 | size, visible or invisible disability, ethnicity, sex characteristics, gender
9 | identity and expression, level of experience, education, socio-economic status,
10 | nationality, personal appearance, race, caste, color, religion, or sexual
11 | identity and orientation.
12 |
13 | We pledge to act and interact in ways that contribute to an open, welcoming,
14 | diverse, inclusive, and healthy community.
15 |
16 | ## Our Standards
17 |
18 | Examples of behavior that contributes to a positive environment for our
19 | community include:
20 |
21 | * Demonstrating empathy and kindness toward other people
22 | * Being respectful of differing opinions, viewpoints, and experiences
23 | * Giving and gracefully accepting constructive feedback
24 | * Accepting responsibility and apologizing to those affected by our mistakes,
25 | and learning from the experience
26 | * Focusing on what is best not just for us as individuals, but for the overall
27 | community
28 |
29 | Examples of unacceptable behavior include:
30 |
31 | * The use of sexualized language or imagery, and sexual attention or advances of
32 | any kind
33 | * Trolling, insulting or derogatory comments, and personal or political attacks
34 | * Public or private harassment
35 | * Publishing others' private information, such as a physical or email address,
36 | without their explicit permission
37 | * Other conduct which could reasonably be considered inappropriate in a
38 | professional setting
39 |
40 | ## Enforcement Responsibilities
41 |
42 | Community leaders are responsible for clarifying and enforcing our standards of
43 | acceptable behavior and will take appropriate and fair corrective action in
44 | response to any behavior that they deem inappropriate, threatening, offensive,
45 | or harmful.
46 |
47 | Community leaders have the right and responsibility to remove, edit, or reject
48 | comments, commits, code, wiki edits, issues, and other contributions that are
49 | not aligned to this Code of Conduct, and will communicate reasons for moderation
50 | decisions when appropriate.
51 |
52 | ## Scope
53 |
54 | This Code of Conduct applies within all community spaces, and also applies when
55 | an individual is officially representing the community in public spaces.
56 | Examples of representing our community include using an official e-mail address,
57 | posting via an official social media account, or acting as an appointed
58 | representative at an online or offline event.
59 |
60 | ## Enforcement
61 |
62 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
63 | reported to the community leaders responsible for enforcement at [team@bookbotkids.com](mailto:team@bookbotkids.com).
64 | All complaints will be reviewed and investigated promptly and fairly.
65 |
66 | All community leaders are obligated to respect the privacy and security of the
67 | reporter of any incident.
68 |
69 | ## Enforcement Guidelines
70 |
71 | Community leaders will follow these Community Impact Guidelines in determining
72 | the consequences for any action they deem in violation of this Code of Conduct:
73 |
74 | ### 1. Correction
75 |
76 | **Community Impact**: Use of inappropriate language or other behavior deemed
77 | unprofessional or unwelcome in the community.
78 |
79 | **Consequence**: A private, written warning from community leaders, providing
80 | clarity around the nature of the violation and an explanation of why the
81 | behavior was inappropriate. A public apology may be requested.
82 |
83 | ### 2. Warning
84 |
85 | **Community Impact**: A violation through a single incident or series of
86 | actions.
87 |
88 | **Consequence**: A warning with consequences for continued behavior. No
89 | interaction with the people involved, including unsolicited interaction with
90 | those enforcing the Code of Conduct, for a specified period of time. This
91 | includes avoiding interactions in community spaces as well as external channels
92 | like social media. Violating these terms may lead to a temporary or permanent
93 | ban.
94 |
95 | ### 3. Temporary Ban
96 |
97 | **Community Impact**: A serious violation of community standards, including
98 | sustained inappropriate behavior.
99 |
100 | **Consequence**: A temporary ban from any sort of interaction or public
101 | communication with the community for a specified period of time. No public or
102 | private interaction with the people involved, including unsolicited interaction
103 | with those enforcing the Code of Conduct, is allowed during this period.
104 | Violating these terms may lead to a permanent ban.
105 |
106 | ### 4. Permanent Ban
107 |
108 | **Community Impact**: Demonstrating a pattern of violation of community
109 | standards, including sustained inappropriate behavior, harassment of an
110 | individual, or aggression toward or disparagement of classes of individuals.
111 |
112 | **Consequence**: A permanent ban from any sort of public interaction within the
113 | community.
114 |
115 | ## Attribution
116 |
117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118 | version 2.1, available at
119 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1].
120 |
121 | Community Impact Guidelines were inspired by
122 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC].
123 |
124 | For answers to common questions about this code of conduct, see the FAQ at
125 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at
126 | [https://www.contributor-covenant.org/translations][translations].
127 |
128 | [homepage]: https://www.contributor-covenant.org
129 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html
130 | [Mozilla CoC]: https://github.com/mozilla/diversity
131 | [FAQ]: https://www.contributor-covenant.org/faq
132 | [translations]: https://www.contributor-covenant.org/translations
133 |
--------------------------------------------------------------------------------
/g2p_id/g2p.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright 2023 [PT BOOKBOT INDONESIA](https://bookbot.id/)
3 |
4 | Licensed under the Apache License, Version 2.0 (the "License");
5 | you may not use this file except in compliance with the License.
6 | You may obtain a copy of the License at
7 |
8 | http://www.apache.org/licenses/LICENSE-2.0
9 |
10 | Unless required by applicable law or agreed to in writing, software
11 | distributed under the License is distributed on an "AS IS" BASIS,
12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | See the License for the specific language governing permissions and
14 | limitations under the License.
15 | """
16 |
17 | import os
18 | import re
19 | import pickle
20 | import unicodedata
21 | from builtins import str as unicode
22 | from itertools import permutations
23 | from typing import Dict, List, Tuple, Union
24 |
25 | import nltk
26 | from nltk.tag.perceptron import PerceptronTagger
27 | from nltk.tokenize import TweetTokenizer
28 |
29 | from g2p_id.bert import BERT
30 | from g2p_id.lstm import LSTM
31 | from g2p_id.text_processor import TextProcessor
32 |
33 | nltk.download("wordnet")
34 | resources_path = os.path.join(os.path.dirname(__file__), "resources")
35 |
36 |
37 | def construct_homographs_dictionary() -> Dict[str, Tuple[str, str, str, str]]:
38 | """Creates a dictionary of homographs
39 |
40 | Returns:
41 | Dict[str, Tuple[str, str, str, str]]:
42 | Key: WORD
43 | Value: (PH1, PH2, POS1, POS2)
44 | """
45 | homograph_path = os.path.join(resources_path, "homographs_id.tsv")
46 | homograph2features = {}
47 | with open(homograph_path, encoding="utf-8") as file:
48 | lines = file.readlines()
49 | for line in lines:
50 | grapheme, phone_1, phone_2, pos_1, pos_2 = line.strip("\n").split("\t")
51 | homograph2features[grapheme.lower()] = (phone_1, phone_2, pos_1, pos_2)
52 |
53 | return homograph2features
54 |
55 |
56 | def construct_lexicon_dictionary() -> Dict[str, str]:
57 | """Creates a lexicon dictionary.
58 |
59 | Returns:
60 | Dict[str, str]:
61 | Key: WORD
62 | Value: Phoneme (IPA)
63 | """
64 | lexicon_path = os.path.join(resources_path, "lexicon_id.tsv")
65 | lexicon2features = {}
66 | with open(lexicon_path, encoding="utf-8") as file:
67 | lines = file.readlines()
68 | for line in lines:
69 | grapheme, phoneme = line.strip("\n").split("\t")
70 | lexicon2features[grapheme.lower()] = phoneme
71 | return lexicon2features
72 |
73 |
74 | class G2p:
75 | """Grapheme-to-phoneme (g2p) main class for phonemization.
76 | This class provides a high-level API for grapheme-to-phoneme conversion.
77 |
78 | 1. Preprocess and normalize text
79 | 2. Word tokenizes text
80 | 3. Predict POS for every word
81 | 4. If word is non-alphabetic, add to list (i.e. punctuation)
82 | 5. If word is a homograph, check POS and use matching word's phonemes
83 | 6. If word is a non-homograph, lookup lexicon
84 | 7. Otherwise, predict with a neural network
85 | """
86 |
87 | def __init__(self, model_type="BERT"):
88 | """Constructor for G2p.
89 |
90 | Args:
91 | model_type (str, optional):
92 | Type of neural network to use for prediction.
93 | Choices are "LSTM" or "BERT". Defaults to "BERT".
94 | """
95 | self.homograph2features = construct_homographs_dictionary()
96 | self.lexicon2features = construct_lexicon_dictionary()
97 | self.normalizer = TextProcessor()
98 | self.tagger = PerceptronTagger(load=False)
99 | tagger_path = os.path.join(resources_path, "id_posp_tagger.pickle")
100 | with open(tagger_path, "rb") as f:
101 | self.tagger = self.tagger.decode_json_obj(pickle.load(f))
102 | self.model: Union[BERT, LSTM] = BERT() if model_type == "BERT" else LSTM()
103 | self.tokenizer = TweetTokenizer()
104 | self.pos_dict = {
105 |             "N": ["B-NNO", "B-NNP", "B-PRN", "B-PRK"],
106 | "V": ["B-VBI", "B-VBT", "B-VBP", "B-VBL", "B-VBE"],
107 | "A": ["B-ADJ"],
108 | "P": ["B-PAR"],
109 | }
110 |
111 | def _preprocess(self, text: str) -> str:
112 | """Performs preprocessing.
113 | (1) Adds spaces in between tokens
114 | (2) Normalizes unicode and accents
115 | (3) Normalizes numbers
116 | (4) Lower case texts
117 | (5) Removes unwanted tokens
118 |
119 | Arguments:
120 | text (str): Text to preprocess.
121 |
122 | Returns:
123 | str: Preprocessed text.
124 | """
125 | text = text.replace("-", " ")
126 |         text = re.sub(r"\.(?=.*\.)", " ", text)  # keep only the final period
127 | text = " ".join(self.tokenizer.tokenize(text))
128 | text = unicode(text)
129 | text = "".join(char for char in unicodedata.normalize("NFD", text) if unicodedata.category(char) != "Mn")
130 | text = self.normalizer.normalize(text).strip()
131 | text = text.lower()
132 | text = re.sub(r"[^ a-z'.,?!\-]", "", text)
133 | return text
134 |
135 | def _rule_based_g2p(self, text: str) -> str:
136 | """Applies rule-based Indonesian grapheme2phoneme conversion.
137 |
138 | Args:
139 | text (str): Grapheme text to convert to phoneme.
140 |
141 | Returns:
142 | str: Phoneme string.
143 | """
144 | phonetic_mapping = {
145 | "ny": "ɲ",
146 | "ng": "ŋ",
147 | "sy": "ʃ",
148 | "aa": "aʔa",
149 | "ii": "iʔi",
150 | "oo": "oʔo",
151 | "əə": "əʔə",
152 | "uu": "uʔu",
153 | "'": "ʔ",
154 | "g": "ɡ",
155 | "q": "k",
156 | "j": "dʒ",
157 | "y": "j",
158 | "x": "ks",
159 | "c": "tʃ",
160 | "kh": "x",
161 | }
162 |
163 | if text.startswith("x"):
164 | text = "s" + text[1:]
165 |
166 | if text.startswith("ps"):
167 | text = text[1:]
168 |
169 | for graph, phone in phonetic_mapping.items():
170 | text = text.replace(graph, phone)
171 |
172 | phonemes = [list(phn) if phn not in ("dʒ", "tʃ") else [phn] for phn in re.split("(tʃ|dʒ)", text)]
173 | return " ".join([p for phn in phonemes for p in phn])
174 |
175 | def __call__(self, text: str) -> List[List[str]]:
176 | """Grapheme-to-phoneme converter.
177 |
178 | 1. Preprocess and normalize text
179 | 2. Word tokenizes text
180 | 3. Predict POS for every word
181 | 4. If word is non-alphabetic, add to list (i.e. punctuation)
182 | 5. If word is a homograph, check POS and use matching word's phonemes
183 | 6. If word is a non-homograph, lookup lexicon
184 | 7. Otherwise, predict with a neural network
185 |
186 | Args:
187 | text (str): Grapheme text to convert to phoneme.
188 |
189 | Returns:
190 | List[List[str]]: List of strings in phonemes.
191 | """
192 | text = self._preprocess(text)
193 | words = self.tokenizer.tokenize(text)
194 | tokens = self.tagger.tag(words)
195 |
196 | prons = []
197 | for word, pos in tokens:
198 | pron = ""
199 | if re.search("[a-z]", word) is None: # non-alphabetic
200 | pron = word
201 |
202 | elif word in self.homograph2features: # check if homograph
203 | pron1, pron2, pos1, _ = self.homograph2features[word]
204 |
205 | # check for the matching POS
206 | if pos in self.pos_dict[pos1]:
207 | pron = pron1
208 | else:
209 | pron = pron2
210 |
211 | elif word in self.lexicon2features: # non-homographs
212 | pron = self.lexicon2features[word]
213 |
214 | else: # predict for OOV
215 | pron = self.model.predict(word)
216 | if isinstance(self.model, BERT):
217 | pron = self._rule_based_g2p(pron)
218 |
219 | if pron.endswith("ʔ"):
220 | pron = pron[:-1] + "k"
221 |
222 | consonants = "bdjklmnprstwɲ"
223 | vowels = "aeiouə"
224 |
225 | for letter in consonants:
226 | pron = pron.replace(f"ʔ {letter}", f"k {letter}")
227 |
228 | # add a glottal stop in between consecutive vowels
229 | for v1, v2 in permutations(vowels, 2):
230 | pron = pron.replace(f"{v1} {v2}", f"{v1} ʔ {v2}")
231 |
232 | prons.append(pron.split())
233 |
234 | return prons
235 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | Copyright 2023 [PT BOOKBOT INDONESIA](https://bookbot.id/)
179 |
180 | Licensed under the Apache License, Version 2.0 (the "License");
181 | you may not use this file except in compliance with the License.
182 | You may obtain a copy of the License at
183 |
184 | http://www.apache.org/licenses/LICENSE-2.0
185 |
186 | Unless required by applicable law or agreed to in writing, software
187 | distributed under the License is distributed on an "AS IS" BASIS,
188 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
189 | See the License for the specific language governing permissions and
190 | limitations under the License.
191 |
--------------------------------------------------------------------------------
/g2p_id/text_processor.py:
--------------------------------------------------------------------------------
1 | """
2 | MIT License
3 |
4 | Copyright (c) 2021 Cahya Wirawan
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 | """
24 |
25 |
26 | import os
27 | import re
28 | from typing import Any
29 |
30 | from num2words import num2words
31 |
32 | resources_path = os.path.join(os.path.dirname(__file__), "resources")
33 |
34 |
35 | class TextProcessor:
36 | """Indonesian text processor to normalize numerics, currencies, and timezones."""
37 |
38 | def __init__(self):
39 | self.measurements = {}
40 | self.thousands = ["ratus", "ribu", "juta", "miliar", "milyar", "triliun"]
41 | self.months = [
42 | "Januari",
43 | "Februari",
44 | "Maret",
45 | "April",
46 | "Mei",
47 | "Juni",
48 | "Juli",
49 | "Agustus",
50 | "September",
51 | "Oktober",
52 | "November",
53 | "Desember",
54 | ]
55 | measurements_path = os.path.join(resources_path, "measurements.tsv")
56 | currencies_path = os.path.join(resources_path, "currency.tsv")
57 | timezones_path = os.path.join(resources_path, "timezones.tsv")
58 |
59 | with open(measurements_path, "r", encoding="utf-8") as file:
60 | for lines in file:
61 | line = lines.strip().split("\t")
62 | self.measurements[line[0]] = line[1]
63 |
64 | self.currencies = {}
65 | with open(currencies_path, "r", encoding="utf-8") as file:
66 | for lines in file:
67 | line = lines.strip().split("\t")
68 | self.currencies[line[0]] = line[1]
69 |
70 | self.timezones = {}
71 | with open(timezones_path, "r", encoding="utf-8") as file:
72 | for lines in file:
73 | line = lines.strip().split("\t")
74 | self.timezones[line[0]] = line[1]
75 |
76 | self.re_thousands = "|".join(self.thousands)
77 |         self.re_currencies = r"\b" + re.sub(
78 |             r"\|([^|$£€¥₩]+)", r"|\\b\1", "|".join(list(self.currencies))
79 |         )  # prefix alphabetic currency markers such as "Rp" with \b
80 |         self.re_currencies = re.sub(r"([$£€¥₩])", r"\\\1", self.re_currencies)  # escape symbols
81 | self.re_moneys = (
82 | rf"(({self.re_currencies}) ?([\d\.\,]+)( ({self.re_thousands})?(an)?)?)"
83 | )
84 | self.re_measurements = "|".join(list(self.measurements))
85 | self.re_measurements = rf"(\b([\d\.\,]+) ?({self.re_measurements})\b)"
86 | self.re_timezones = "|".join(list(self.timezones))
87 | self.re_timezones = (
88 | r"((\d{1,2})[\.:](\d{1,2}) " + rf"\b({self.re_timezones})\b)"
89 | )
90 | self.re_http = re.compile(
91 | r"""
92 | (https?://(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.
93 | [a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9()@:%_\+.~#?&//=]*)
94 | """,
95 | re.X,
96 | )
97 |
98 | @staticmethod
99 | def is_integer(number: Any) -> bool:
100 | """Check if integer by type-casting.
101 |
102 | Args:
103 | number (Any): Number to check.
104 |
105 | Returns:
106 | bool: Is a valid integer.
107 | """
108 | try:
109 | int(number)
110 | return True
111 | except ValueError:
112 | return False
113 |
114 | @staticmethod
115 | def is_float(number: Any) -> bool:
116 | """Check if float by type-casting.
117 |
118 | Args:
119 | number (Any): Number to check.
120 |
121 | Returns:
122 | bool: Is a valid float.
123 | """
124 | try:
125 | float(number)
126 | return True
127 | except ValueError:
128 | return False
129 |
130 | def normalize_url(self, text: str) -> str:
131 | """Removes URL from text.
132 |
133 | Args:
134 | text (str): Text with URL to normalize.
135 |
136 | Returns:
137 | str: Normalized text with URLs removed.
138 | """
139 | urls = re.findall(self.re_http, text)
140 | for url in urls:
141 | text = text.replace(url[0], "")
142 | return text
143 |
144 | def normalize_currency(self, text: str) -> str:
145 | """Normalizes international and Indonesian (Rupiah) currencies.
146 |
147 | Examples:
148 | - `"$250"` -> `"dua ratus lima puluh dollar"`
149 | - `"Rp 3,000,000"` -> `"tiga juta rupiah"`
150 |
151 | Args:
152 | text (str): Text with currency to normalize.
153 |
154 | Returns:
155 | str: Normalized text with currency transliterated.
156 | """
157 | moneys = re.findall(self.re_moneys, text)
158 | for money in moneys:
159 | number: Any = re.sub(",", ".", re.sub(r"\.", "", money[2].strip(" ,.")))
160 | try:
161 | if number == "":
162 | continue
163 | if self.is_integer(number):
164 | number = int(number)
165 | elif self.is_float(number):
166 | number = float(number)
167 | else:
168 | number = re.sub(r"[.,]", "", number)
169 | number = int(number)
170 | number = num2words(number, to="cardinal", lang="id")
171 | text = text.replace(
172 | money[0].strip(" ,."),
173 | f"{number} {money[3]} {self.currencies[money[1]]}",
174 | )
175 | except NotImplementedError as error:
176 | print(error)
177 | print(f"Problem with money: <{text}>: {number}")
178 | return text
179 |
180 | def normalize_measurement(self, text: str) -> str:
181 | """Normalizes measurement units, including its scalar value.
182 |
183 | Examples:
184 | - `"10,5 km"` -> `"sepuluh koma lima kilometer"`
185 | - `"5°C"` -> `"lima derajat celsius"`
186 |
187 | Args:
188 | text (str): Text with measurements to normalize.
189 |
190 | Returns:
191 | str: Normalized text with measurements transliterated.
192 | """
193 | units = re.findall(self.re_measurements, text)
194 | for unit in units:
195 | number: Any = re.sub(",", ".", re.sub(r"\.", "", unit[1].strip(" ,.")))
196 | try:
197 | if number == "":
198 | continue
199 | if re.search(r"\.", number):
200 | number = float(number)
201 | else:
202 | number = int(number)
203 | number = num2words(number, to="cardinal", lang="id")
204 | text = text.replace(
205 | unit[0].strip(" ,."), f"{number} {self.measurements[unit[2]]}"
206 | )
207 | except NotImplementedError as error:
208 | print(error)
209 | print(f"Problem with measurements: <{text}>: {number}")
210 | return text
211 |
212 | def normalize_date(self, text: str) -> str:
213 | """Normalizes dates.
214 |
215 | Examples:
216 | - `"(12/3/2021)"` -> `"dua belas Maret dua ribu dua puluh satu"`
217 |
218 | Args:
219 | text (str): Text with dates to normalize.
220 |
221 | Returns:
222 | str: Normalized text with dates transliterated.
223 | """
224 | dates = re.findall(r"(\((\d{1,2})/(\d{1,2})(/(\d+))?\))", text)
225 | for date in dates:
226 | try:
227 | day = num2words(int(date[1]), to="cardinal", lang="id")
228 | month: Any = int(date[2]) - 1
229 | if month >= 12:
230 | month = 0
231 | month = self.months[month]
232 | if date[4] != "":
233 | year = num2words(int(date[4]), to="cardinal", lang="id")
234 | date_string = f"{day} {month} {year}"
235 | else:
236 | date_string = f"{day} {month}"
237 | text = text.replace(date[0], f" {date_string} ")
238 | except NotImplementedError as error:
239 | print(error)
240 | print(f"Problem with dates: <{text}>: {date}")
241 | return text
242 |
243 | def normalize_timezone(self, text: str) -> str:
244 | """Normalizes Indonesian time with timezones.
245 |
246 | Examples:
247 | - `"22.30 WITA"`
248 | -> `"dua puluh dua lewat tiga puluh menit Waktu Indonesia Tengah"`
249 |
250 | Args:
251 | text (str): Text with timezones to normalize.
252 |
253 | Returns:
254 | str: Normalized text with timezones transliterated.
255 | """
256 | timezones = re.findall(self.re_timezones, text)
257 | for timezone in timezones:
258 | try:
259 | hour = num2words(int(timezone[1]), to="cardinal", lang="id")
260 | minute = num2words(int(timezone[2]), to="cardinal", lang="id")
261 | zone = self.timezones[timezone[3]]
262 | if minute == "nol":
263 | time_string = f"{hour} {zone}"
264 | else:
265 | time_string = f"{hour} lewat {minute} menit {zone}"
266 | text = text.replace(timezone[0], f"{time_string}")
267 | except NotImplementedError as error:
268 | print(error)
269 | print(f"Problem with timezones: <{text}>: {timezone}")
270 | return text
271 |
272 | def normalize_number(self, text: str) -> str:
273 | """Normalizes Arabic numbers to Indonesian.
274 |
275 | Examples:
276 | - `"1.000"` -> `"seribu"`
277 | - `"10,5"` -> `"sepuluh koma lima"`
278 |
279 | Args:
280 | text (str): Text with numbers to normalize.
281 |
282 | Returns:
283 | str: Normalized text with numbers transliterated.
284 | """
285 | re_numbers = [r"([\d.,]+)", r"\d+"]
286 | for re_number in re_numbers:
287 | number_len = 0
288 | for i in re.finditer(re_number, text):
289 | start = i.start() + number_len
290 | end = i.end() + number_len
291 | number: Any = text[start:end]
292 | number = re.sub(",", ".", re.sub(r"\.", "", number.strip(" ,.")))
293 | if number == "":
294 | continue
295 | if self.is_float(number) or self.is_integer(number):
296 | try:
297 | if self.is_integer(number):
298 | number = int(number)
299 | else:
300 | number = float(number)
301 | number = num2words(number, to="cardinal", lang="id")
302 | text = text[:start] + number + text[end:]
303 | number_len += len(number) - (end - start)
304 | except NotImplementedError as error:
305 | print(error)
306 | print(f"Problem with number: <{text}>: {number}")
307 | return text
308 |
309 | def normalize(self, text: str) -> str:
310 | """Normalizes Indonesian text by expanding:
311 |
312 | - URL
313 | - Currency
314 | - Measurements
315 | - Dates
316 | - Timezones
317 | - Arabic Numerals
318 |
319 | Args:
320 | text (str): Text to normalize.
321 |
322 | Returns:
323 | str: Normalized text.
324 | """
325 | # Remove URL
326 | text = self.normalize_url(text)
327 | # Currency
328 | text = self.normalize_currency(text)
329 | # Measurements
330 | text = self.normalize_measurement(text)
331 | # Date
332 | text = self.normalize_date(text)
333 | # Timezones
334 | text = self.normalize_timezone(text)
335 | # Any number
336 | text = self.normalize_number(text)
337 | # collapse consecutive whitespaces
338 | text = re.sub(r"\s+", " ", text)
339 | return text
340 |
--------------------------------------------------------------------------------