├── requirements.txt ├── MANIFEST.in ├── docs ├── reference │ ├── textprocessor.md │ ├── lstm.md │ ├── bert.md │ └── g2p.md ├── contributing.md ├── index.md └── algorithm.md ├── requirements_test.txt ├── g2p_id ├── models │ ├── bert │ │ ├── bert_mlm.onnx │ │ ├── config.json │ │ └── token2id.json │ └── lstm │ │ ├── decoder_model.onnx │ │ ├── encoder_model.onnx │ │ ├── config.json │ │ ├── g2id.json │ │ └── p2id.json ├── resources │ ├── id_posp_tagger.pickle │ ├── timezones.tsv │ ├── currency.tsv │ ├── measurements.tsv │ └── homographs_id.tsv ├── __init__.py ├── onnx_utils.py ├── bert.py ├── lstm.py ├── g2p.py └── text_processor.py ├── NOTICE.md ├── tests ├── conftest.py ├── test_text_processor.py └── test_g2p.py ├── .github └── workflows │ ├── docs.yml │ └── tests.yml ├── tox.ini ├── setup.py ├── mkdocs.yml ├── PROJECT_CHARTER.md ├── .gitignore ├── CONTRIBUTING.md ├── README.md ├── CODE_OF_CONDUCT.md └── LICENSE.md /requirements.txt: -------------------------------------------------------------------------------- 1 | num2words 2 | nltk==3.9.1 3 | onnxruntime -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | include g2p_id/resources/* 3 | include g2p_id/models/*/* -------------------------------------------------------------------------------- /docs/reference/textprocessor.md: -------------------------------------------------------------------------------- 1 | # TextProcessor 2 | 3 | ::: g2p_id.text_processor.TextProcessor 4 | -------------------------------------------------------------------------------- /requirements_test.txt: -------------------------------------------------------------------------------- 1 | importlib_metadata<5 2 | flake8 3 | tox 4 | pytest 5 | pytest-cov 6 | mypy 7 | pylint -------------------------------------------------------------------------------- /g2p_id/models/bert/bert_mlm.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bookbot-kids/g2p_id/HEAD/g2p_id/models/bert/bert_mlm.onnx -------------------------------------------------------------------------------- /g2p_id/models/bert/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "mask_token": "[mask]", 3 | "pad_token": "", 4 | "max_seq_length": 32 5 | } 6 | -------------------------------------------------------------------------------- /g2p_id/models/lstm/decoder_model.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bookbot-kids/g2p_id/HEAD/g2p_id/models/lstm/decoder_model.onnx -------------------------------------------------------------------------------- /g2p_id/models/lstm/encoder_model.onnx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bookbot-kids/g2p_id/HEAD/g2p_id/models/lstm/encoder_model.onnx -------------------------------------------------------------------------------- /g2p_id/resources/id_posp_tagger.pickle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bookbot-kids/g2p_id/HEAD/g2p_id/resources/id_posp_tagger.pickle -------------------------------------------------------------------------------- /g2p_id/resources/timezones.tsv: -------------------------------------------------------------------------------- 1 | 
WITA Waktu Indonesia Tengah 2 | WIB Waktu Indonesia Barat 3 | WIT Waktu Indonesia Timur 4 | GMT Greenwich Mean Time 5 | -------------------------------------------------------------------------------- /NOTICE.md: -------------------------------------------------------------------------------- 1 | g2p ID 2 | Copyright 2023 [PT BOOKBOT INDONESIA](https://bookbot.id/) 3 | 4 | This product includes software developed at 5 | PT BOOKBOT INDONESIA (https://bookbot.id/). 6 | -------------------------------------------------------------------------------- /g2p_id/models/lstm/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "latent_dim": 256, 3 | "bos_token": "\t", 4 | "eos_token": "\n", 5 | "pad_token": " ", 6 | "num_encoder_tokens": 28, 7 | "num_decoder_tokens": 32, 8 | "max_encoder_seq_length": 24, 9 | "max_decoder_seq_length": 25 10 | } 11 | -------------------------------------------------------------------------------- /docs/reference/lstm.md: -------------------------------------------------------------------------------- 1 | # LSTM 2 | 3 | ::: g2p_id.lstm.LSTM 4 | 5 | ## Usage 6 | 7 | ```py 8 | texts = ["mengembangkannya", "merdeka", "pecel", "lele"] 9 | lstm = LSTM() 10 | for text in texts: 11 | print(lstm.predict(text)) 12 | ``` 13 | 14 | ```py 15 | >> məŋəmbaŋkanɲa 16 | >> mərdeka 17 | >> pətʃəl 18 | >> lele 19 | ``` -------------------------------------------------------------------------------- /docs/reference/bert.md: -------------------------------------------------------------------------------- 1 | # BERT 2 | 3 | ::: g2p_id.bert.BERT 4 | 5 | ## Usage 6 | 7 | ```py 8 | texts = ["mengembangkannya", "merdeka", "pecel", "lele"] 9 | bert = BERT() 10 | for text in texts: 11 | print(bert.predict(text)) 12 | ``` 13 | 14 | ```py 15 | >> məngəmbangkannya 16 | >> mərdeka 17 | >> pəcel 18 | >> lele 19 | ``` 20 | -------------------------------------------------------------------------------- /g2p_id/models/lstm/g2id.json: -------------------------------------------------------------------------------- 1 | { 2 | " ": 27, 3 | "'": 0, 4 | "-": 1, 5 | "a": 2, 6 | "b": 3, 7 | "c": 4, 8 | "d": 5, 9 | "e": 6, 10 | "f": 7, 11 | "g": 8, 12 | "h": 9, 13 | "i": 10, 14 | "j": 11, 15 | "k": 12, 16 | "l": 13, 17 | "m": 14, 18 | "n": 15, 19 | "o": 16, 20 | "p": 17, 21 | "q": 18, 22 | "r": 19, 23 | "s": 20, 24 | "t": 21, 25 | "u": 22, 26 | "v": 23, 27 | "w": 24, 28 | "y": 25, 29 | "z": 26 30 | } 31 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from g2p_id import BERT, LSTM, G2p, TextProcessor 4 | 5 | 6 | @pytest.fixture(scope="session") 7 | def g2p(): 8 | return G2p() 9 | 10 | 11 | @pytest.fixture(scope="session") 12 | def lstm(): 13 | return LSTM() 14 | 15 | 16 | @pytest.fixture(scope="session") 17 | def bert(): 18 | return BERT() 19 | 20 | 21 | @pytest.fixture(scope="session") 22 | def text_processor(): 23 | return TextProcessor() 24 | -------------------------------------------------------------------------------- /g2p_id/models/bert/token2id.json: -------------------------------------------------------------------------------- 1 | { 2 | "": 0, 3 | "'": 28, 4 | "-": 26, 5 | "[UNK]": 1, 6 | "[mask]": 30, 7 | "a": 2, 8 | "b": 13, 9 | "c": 20, 10 | "d": 16, 11 | "e": 18, 12 | "f": 24, 13 | "g": 11, 14 | "h": 19, 15 | "i": 5, 16 | "j": 22, 17 | "k": 7, 18 | "l": 15, 19 | "m": 8, 20 | "n": 3, 21 | 
"o": 17, 22 | "p": 14, 23 | "q": 29, 24 | "r": 6, 25 | "s": 12, 26 | "t": 9, 27 | "u": 10, 28 | "v": 25, 29 | "w": 23, 30 | "y": 21, 31 | "z": 27, 32 | "ə": 4 33 | } 34 | -------------------------------------------------------------------------------- /g2p_id/models/lstm/p2id.json: -------------------------------------------------------------------------------- 1 | { 2 | "\t": 0, 3 | "\n": 1, 4 | " ": 31, 5 | "-": 2, 6 | "a": 3, 7 | "b": 4, 8 | "d": 5, 9 | "e": 6, 10 | "f": 7, 11 | "g": 8, 12 | "h": 9, 13 | "i": 10, 14 | "j": 11, 15 | "k": 12, 16 | "l": 13, 17 | "m": 14, 18 | "n": 15, 19 | "o": 16, 20 | "p": 17, 21 | "r": 18, 22 | "s": 19, 23 | "t": 20, 24 | "u": 21, 25 | "v": 22, 26 | "w": 23, 27 | "z": 24, 28 | "ŋ": 25, 29 | "ə": 26, 30 | "ɲ": 27, 31 | "ʃ": 28, 32 | "ʒ": 29, 33 | "ʔ": 30 34 | } 35 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: Deploy docs to Github Pages 2 | on: 3 | push: 4 | branches: 5 | - main 6 | jobs: 7 | deploy: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout master branch 11 | uses: actions/checkout@v2 12 | - name: Setup Python 13 | uses: actions/setup-python@v2 14 | with: 15 | python-version: 3.9 16 | - name: Install dependencies 17 | run: pip3 install mkdocs-material mkdocstrings mkdocstrings-python-legacy 18 | - name: Install package 19 | run: pip3 install . 20 | - name: Deploy docs 21 | run: mkdocs gh-deploy --force 22 | -------------------------------------------------------------------------------- /g2p_id/resources/currency.tsv: -------------------------------------------------------------------------------- 1 | US$ dollar amerika serikat 2 | nzd dollar new zealand 3 | rs rupee 4 | chf franc swiss 5 | dkk kroner denmark 6 | fim markka finland 7 | aed dirham arab 8 | czk koruna ceko 9 | mro ouguiya mauritania 10 | pkr rupee pakistan 11 | crc colon costa rica 12 | hk$ dollar hong kong 13 | npr rupee nepal 14 | awg florin aruban 15 | nok kroner norwegia 16 | tzs shilling tanzania 17 | sek kronor swedish 18 | cyp pounds cypriot 19 | sar riyal saudi 20 | cve escudo cape verde 21 | rsd dinar serbia 22 | dm mark jerman 23 | shp pounds saint helena 24 | php peso philipina 25 | cad dollar canada 26 | ssp pounds sudan selatan 27 | scr rupee seychell 28 | mvr rufiyaa maldivia 29 | Rp rupiah 30 | r real 31 | $ dollar 32 | € euro 33 | £ pounds 34 | ₩ won 35 | ¥ yen -------------------------------------------------------------------------------- /docs/reference/g2p.md: -------------------------------------------------------------------------------- 1 | # G2p 2 | 3 | ::: g2p_id.g2p.G2p 4 | 5 | ## Usage 6 | 7 | ```py 8 | texts = [ 9 | "Apel itu berwarna merah.", 10 | "Rahel bersekolah di Jakarta.", 11 | "Mereka sedang bermain bola di lapangan.", 12 | ] 13 | g2p = G2p(model_type="BERT") 14 | for text in texts: 15 | print(g2p(text)) 16 | ``` 17 | 18 | ```py 19 | >> [['a', 'p', 'ə', 'l'], ['i', 't', 'u'], ['b', 'ə', 'r', 'w', 'a', 'r', 'n', 'a'], ['m', 'e', 'r', 'a', 'h'], ['.']] 20 | >> [['r', 'a', 'h', 'e', 'l'], ['b', 'ə', 'r', 's', 'ə', 'k', 'o', 'l', 'a', 'h'], ['d', 'i'], ['dʒ', 'a', 'k', 'a', 'r', 't', 'a'], ['.']] 21 | >> [['m', 'ə', 'r', 'e', 'k', 'a'], ['s', 'ə', 'd', 'a', 'ŋ'], ['b', 'ə', 'r', 'm', 'a', 'i', 'n'], ['b', 'o', 'l', 'a'], ['d', 'i'], ['l', 'a', 'p', 'a', 'ŋ', 'a', 'n'], ['.']] 22 | ``` -------------------------------------------------------------------------------- 
/.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | jobs: 11 | test: 12 | runs-on: ${{ matrix.os }} 13 | strategy: 14 | matrix: 15 | os: [ubuntu-latest, windows-latest] 16 | python-version: ["3.8", "3.9"] 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install tox tox-gh-actions 28 | - name: Install package 29 | run: pip install . 30 | - name: Test with tox 31 | run: tox 32 | - name: Upload coverage reports to Codecov 33 | uses: codecov/codecov-action@v3 34 | -------------------------------------------------------------------------------- /g2p_id/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 [PT BOOKBOT INDONESIA](https://bookbot.id/) 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | from .bert import BERT 18 | from .g2p import G2p 19 | from .lstm import LSTM 20 | from .onnx_utils import WrapInferenceSession 21 | from .text_processor import TextProcessor 22 | 23 | __version__ = "0.4.2" 24 | __all__ = ["G2p", "LSTM", "BERT", "WrapInferenceSession", "TextProcessor"] 25 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | minversion = 3.8.0 3 | envlist = python3.8, python3.9, flake8, mypy 4 | isolated_build = true 5 | 6 | [gh-actions] 7 | python = 8 | 3.8: python3.8, flake8, mypy, pylint 9 | 3.9: python3.9, flake8, mypy, pylint 10 | 11 | [testenv] 12 | setenv = 13 | PYTHONPATH = {toxinidir} 14 | deps = 15 | -r{toxinidir}/requirements.txt 16 | -r{toxinidir}/requirements_test.txt 17 | commands = 18 | coverage erase 19 | coverage run --branch -m pytest 20 | coverage report 21 | coverage xml -i -o coverage.xml 22 | flake8 g2p_id tests 23 | mypy g2p_id --ignore-missing-imports 24 | pylint --rcfile=tox.ini g2p_id 25 | 26 | [flake8] 27 | extend-ignore = E203 28 | max-line-length = 120 29 | 30 | [pylint] 31 | ; R0902: Too many instance attribute 32 | ; R0903: Too few public methods 33 | ; R0914: Too many local variables 34 | disable = 35 | R0902, 36 | R0903, 37 | R0914 38 | max-line-length = 120 39 | 40 | [coverage:run] 41 | source=g2p_id 42 | 43 | [coverage:report] 44 | exclude_lines = 45 | except -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | from pathlib import Path 3 | 4 | this_path = Path(__file__).parent 5 | 6 | readme_path = this_path / "README.md" 7 | requirements_path = 
this_path / "requirements.txt" 8 | 9 | long_description = readme_path.read_text(encoding="utf-8") 10 | 11 | with open(requirements_path, "r", encoding="utf-8") as requirements_file: 12 | requirements = requirements_file.read().splitlines() 13 | 14 | if __name__ == "__main__": 15 | setup( 16 | name="g2p_id_py", 17 | version="0.4.2", 18 | description="Indonesian G2P.", 19 | long_description=long_description, 20 | long_description_content_type="text/markdown", 21 | author="w11wo", 22 | author_email="wilson@bookbotkids.com", 23 | url="https://github.com/bookbot-kids/g2p_id", 24 | license="Apache License", 25 | packages=find_packages(), 26 | install_requires=requirements, 27 | include_package_data=True, 28 | platforms=["linux", "unix", "windows"], 29 | python_requires=">=3.8", 30 | ) 31 | -------------------------------------------------------------------------------- /mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: g2p ID 2 | repo_url: https://github.com/bookbot-kids/g2p_id 3 | docs_dir: docs 4 | 5 | theme: 6 | name: material 7 | palette: 8 | - media: "(prefers-color-scheme: light)" 9 | scheme: default 10 | primary: indigo 11 | accent: indigo 12 | toggle: 13 | icon: material/weather-night 14 | name: Switch to dark mode 15 | - media: "(prefers-color-scheme: dark)" 16 | scheme: slate 17 | primary: red 18 | accent: red 19 | toggle: 20 | icon: material/weather-sunny 21 | name: Switch to light mode 22 | features: 23 | - navigation.sections 24 | 25 | plugins: 26 | - search 27 | - mkdocstrings: 28 | handlers: 29 | python: 30 | options: 31 | show_source: true 32 | show_root_heading: true 33 | heading_level: 2 34 | 35 | markdown_extensions: 36 | - tables 37 | - pymdownx.highlight: 38 | anchor_linenums: true 39 | - pymdownx.inlinehilite 40 | - pymdownx.snippets 41 | - pymdownx.superfences 42 | - def_list 43 | - pymdownx.tasklist: 44 | custom_checkbox: true 45 | 46 | watch: 47 | - g2p_id 48 | -------------------------------------------------------------------------------- /tests/test_text_processor.py: -------------------------------------------------------------------------------- 1 | def test_text_processor(text_processor): 2 | # URLs 3 | assert text_processor.normalize("Situs: https://www.google.com") == "Situs: " 4 | # measurements 5 | assert ( 6 | text_processor.normalize("123,1 kg") 7 | == "seratus dua puluh tiga koma satu kilogram" 8 | ) 9 | assert text_processor.normalize("500 cm") == "lima ratus centimeter" 10 | # currency/money 11 | assert text_processor.normalize("$100") == "seratus dollar" 12 | assert text_processor.normalize("Rp 3,000,000") == "tiga juta rupiah" 13 | # dates 14 | assert ( 15 | text_processor.normalize("(17/8/1945)").strip() 16 | == "tujuh belas Agustus seribu sembilan ratus empat puluh lima" 17 | ) 18 | assert text_processor.normalize("(1/13)").strip() == "satu Januari" 19 | # time/time zone 20 | assert ( 21 | text_processor.normalize("19.45 WIB") 22 | == "sembilan belas lewat empat puluh lima menit Waktu Indonesia Barat" 23 | ) 24 | assert ( 25 | text_processor.normalize("19.00 WIB") == "sembilan belas Waktu Indonesia Barat" 26 | ) 27 | # numerics 28 | assert text_processor.normalize("105.000") == "seratus lima ribu" 29 | assert text_processor.normalize("0,5") == "nol koma lima" 30 | -------------------------------------------------------------------------------- /PROJECT_CHARTER.md: -------------------------------------------------------------------------------- 1 | # Project Charter 2 | 3 | ## Vision statement 4 
| Literacy is fundamental, not only for our personal and social development, but also for our ability to function effectively in society. Our vision at Bookbot is that every child should have the opportunity to develop their reading, writing and communication skills to create a happy and successful life. 5 | 6 | 7 | ## Mission statement 8 | Deliver the Bookbot app, which combines speech recognition with a scientifically designed reading program, to help school children achieve greater literacy and to provide better tools for educators to monitor a child’s reading progress. 9 | 10 | 11 | ## Community (Impact) statement 12 | 13 | Bookbot is founded on the grounds of building a community of learners. Members of the Bookbot community consist of software developers, educators, students, writers, editors, linguists, people with disabilities, and more. We exist to ensure that every child, regardless of their situation, is able to develop their literacy skills. 14 | 15 | 16 | ## Licensing strategy 17 | Open source (Creative Commons), with a reseller model for the app. Parts of the code are licensed under Apache 2.0. 18 | 19 | ## Identification of key trademarks 20 | No key trademarks 21 | -------------------------------------------------------------------------------- /g2p_id/onnx_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 [PT BOOKBOT INDONESIA](https://bookbot.id/) 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | import onnxruntime as ort 18 | 19 | 20 | class WrapInferenceSession: 21 | """Wrapper class for serializing ONNX InferenceSession objects. 22 | Based on: https://github.com/microsoft/onnxruntime/pull/800#issuecomment-844326099 23 | """ 24 | 25 | def __init__(self, onnx_bytes, sess_options=None, providers=None): 26 | self.sess = ort.InferenceSession(onnx_bytes, sess_options=sess_options, providers=providers) 27 | self.onnx_bytes = onnx_bytes 28 | self.providers = providers 29 | 30 | def run(self, *args): 31 | """Wrapper for ONNX InferenceSession run method. 32 | 33 | Returns: 34 | Any: Inference result.
35 | """ 36 | return self.sess.run(*args) 37 | 38 | def __getstate__(self): 39 | return {"onnx_bytes": self.onnx_bytes} 40 | 41 | def __setstate__(self, values): 42 | self.onnx_bytes = values["onnx_bytes"] 43 | self.providers = values.get("providers", None) 44 | self.sess = ort.InferenceSession(self.onnx_bytes, self.providers) 45 | -------------------------------------------------------------------------------- /g2p_id/resources/measurements.tsv: -------------------------------------------------------------------------------- 1 | sq mi mil kuadrat 2 | sq ft kaki kuadrat 3 | kbps kilobit per detik 4 | mbps megabit per detik 5 | kcal kilo kalori 6 | ghz gigahertz 7 | khz kilohertz 8 | mhz megahertz 9 | lbs pound 10 | rpm revolution per menit 11 | kwh kilo watt jam 12 | min menit 13 | mph mil per jam 14 | mol mol 15 | gpa giga pascal 16 | km² kilometer kuadrat 17 | km2 kilometer kuadrat 18 | rad radian 19 | kgf kilogram force 20 | mm² millimeter kuadrat 21 | mm2 millimeter kuadrat 22 | cm² centimeter kuadrat 23 | cm2 centimeter kuadrat 24 | dm³ desimeter kubik 25 | dm3 desimeter kubik 26 | amu atomic mass unit 27 | gwh giga watt jam 28 | kpa kilopascal 29 | cwt hundredweight 30 | atm atmosphere 31 | bar bar 32 | km kilometer 33 | cm centimeter 34 | mm millimeter 35 | ha hectare 36 | mi mil 37 | m² meter kuadrat 38 | m2 meter kuadrat 39 | ft kaki 40 | hz hertz 41 | kw kilowatt 42 | hp tenaga kuda 43 | mg milligram 44 | kg kilogram 45 | lb pound 46 | mc mega coulomb 47 | nm nanometer 48 | mA milli ampere 49 | m³ meter kubik 50 | m3 meter kubik 51 | tw tera watt 52 | mv milli volt 53 | mw megawatt 54 | μm mikrometer 55 | " inch 56 | TB terabyte 57 | cc c c 58 | da dalton 59 | db desibel 60 | ps peta detik 61 | oz ounce 62 | hl hecto liter 63 | μg mikrogram 64 | pg petagram 65 | GB gigabyte 66 | kb kilobit 67 | ev electron volt 68 | MB megabyte 69 | KB kilobyte 70 | kl kilo liter 71 | tj tera joule 72 | kv kilo volt 73 | mv mega volt 74 | kn kilonewton 75 | mm megameter 76 | au astronomical unit 77 | yd yard 78 | lm lumen 79 | hs hecto detik 80 | ml milliliter 81 | gw gigawatt 82 | ma mega ampere 83 | kt knot 84 | ng nano gram 85 | ns nano detik 86 | ms mega siemens 87 | gl giga liter 88 | μs mikro detik 89 | da desi ampere 90 | pa pascal 91 | ds desi detik 92 | ms milli detik 93 | dm desimeter 94 | mb megabit 95 | mf mega farad 96 | bq becquerel 97 | pb petabit 98 | cd candela 99 | tl tera liter 100 | ms mega detik 101 | mpa megapascal 102 | pb peta byte 103 | gy gray 104 | sv sievert 105 | cc c c 106 | °F derajat fahrenheit 107 | °f derajat fahrenheit 108 | °C derajat celsius 109 | °c derajat celsius 110 | m meter 111 | % percent 112 | v volt 113 | h jam 114 | g gram 115 | s detik 116 | ω ohm -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | target/ 78 | 79 | # Jupyter Notebook 80 | .ipynb_checkpoints 81 | 82 | # IPython 83 | profile_default/ 84 | ipython_config.py 85 | 86 | # pyenv 87 | .python-version 88 | 89 | # pipenv 90 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 91 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 92 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 93 | # install all needed dependencies. 94 | #Pipfile.lock 95 | 96 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 97 | __pypackages__/ 98 | 99 | # Celery stuff 100 | celerybeat-schedule 101 | celerybeat.pid 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | -------------------------------------------------------------------------------- /docs/contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | Hi there! Thanks for taking your time to contribute! 3 | 4 | We welcome everyone to contribute and we value each contribution, even the smallest ones! We want to make contributing to this project as easy and transparent as possible, whether it's: 5 | 6 | - Reporting a bug 7 | - Discussing the current state of the code 8 | - Submitting a fix 9 | - Proposing new features 10 | - Becoming a maintainer 11 | 12 | ## Code of Conduct 13 | 14 | Please be mindful to respect our [Code of Conduct](https://github.com/bookbot-kids/g2p_id/blob/main/CODE_OF_CONDUCT.md). 15 | 16 | ## We Develop with Github 17 | We use github to host code, to track issues and feature requests, as well as accept pull requests. 18 | 19 | ## We Use Github, So All Code Changes Happen Through Pull Requests 20 | Pull requests are the best way to propose changes to the codebase. We actively welcome your pull requests: 21 | 22 | 1. Fork the repo and create your branch from `main`. 23 | 2. If you've added code that should be tested, add tests. 24 | 3. If you've changed APIs, update the documentation. 25 | 4. Ensure the test suite passes. 26 | 5. Make sure your code lints. 27 | 6. Issue that pull request! 28 | 29 | ## Any contributions you make will be under the Apache 2.0 License 30 | In short, when you submit code changes, your submissions are understood to be under the same [Apache 2.0 License](https://www.apache.org/licenses/LICENSE-2.0) that covers the project. Feel free to contact the maintainers if that's a concern. 
31 | 32 | ## Report bugs using Github's [issues](https://github.com/bookbot-kids/g2p_id/issues) 33 | We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/bookbot-kids/g2p_id/issues/new). 34 | 35 | ## Write bug reports with detail, background, and sample code 36 | [This is an example](http://stackoverflow.com/q/12488905/180626) of a good and thorough bug report. 37 | 38 | **Great Bug Reports** tend to have: 39 | 40 | - A quick summary and/or background 41 | - Steps to reproduce 42 | - Be specific! 43 | - Give sample code if you can. 44 | - What you expected would happen 45 | - What actually happens 46 | - Notes (possibly including why you think this might be happening, or stuff you tried that didn't work) 47 | 48 | ## License 49 | By contributing, you agree that your contributions will be licensed under its Apache 2.0 License. 50 | 51 | ## References 52 | This document was adapted from the open-source contribution guidelines for [Facebook's Draft](https://github.com/facebook/draft-js/blob/a9316a723f9e918afde44dea68b5f9f39b7d9b00/CONTRIBUTING.md) -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to g2p ID 2 | Hi there! Thanks for taking your time to contribute! 3 | 4 | We welcome everyone to contribute and we value each contribution, even the smallest ones! We want to make contributing to this project as easy and transparent as possible, whether it's: 5 | 6 | - Reporting a bug 7 | - Discussing the current state of the code 8 | - Submitting a fix 9 | - Proposing new features 10 | - Becoming a maintainer 11 | 12 | ## Code of Conduct 13 | 14 | Please be mindful to respect our [Code of Conduct](https://github.com/bookbot-kids/g2p_id/blob/main/CODE_OF_CONDUCT.md). 15 | 16 | ## We Develop with Github 17 | We use github to host code, to track issues and feature requests, as well as accept pull requests. 18 | 19 | ## We Use Github, So All Code Changes Happen Through Pull Requests 20 | Pull requests are the best way to propose changes to the codebase. We actively welcome your pull requests: 21 | 22 | 1. Fork the repo and create your branch from `main`. 23 | 2. If you've added code that should be tested, add tests. 24 | 3. If you've changed APIs, update the documentation. 25 | 4. Ensure the test suite passes. 26 | 5. Make sure your code lints. 27 | 6. Issue that pull request! 28 | 29 | ## Any contributions you make will be under the Apache 2.0 License 30 | In short, when you submit code changes, your submissions are understood to be under the same [Apache 2.0 License](https://www.apache.org/licenses/LICENSE-2.0) that covers the project. Feel free to contact the maintainers if that's a concern. 31 | 32 | ## Report bugs using Github's [issues](https://github.com/bookbot-kids/g2p_id/issues) 33 | We use GitHub issues to track public bugs. Report a bug by [opening a new issue](https://github.com/bookbot-kids/g2p_id/issues/new). 34 | 35 | ## Write bug reports with detail, background, and sample code 36 | [This is an example](http://stackoverflow.com/q/12488905/180626) of a good and thorough bug report. 37 | 38 | **Great Bug Reports** tend to have: 39 | 40 | - A quick summary and/or background 41 | - Steps to reproduce 42 | - Be specific! 43 | - Give sample code if you can. 
44 | - What you expected would happen 45 | - What actually happens 46 | - Notes (possibly including why you think this might be happening, or stuff you tried that didn't work) 47 | 48 | ## License 49 | By contributing, you agree that your contributions will be licensed under its Apache 2.0 License. 50 | 51 | ## References 52 | This document was adapted from the open-source contribution guidelines for [Facebook's Draft](https://github.com/facebook/draft-js/blob/a9316a723f9e918afde44dea68b5f9f39b7d9b00/CONTRIBUTING.md) -------------------------------------------------------------------------------- /g2p_id/bert.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 [PT BOOKBOT INDONESIA](https://bookbot.id/) 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | import json 18 | import os 19 | 20 | import numpy as np 21 | import onnxruntime 22 | 23 | from g2p_id.onnx_utils import WrapInferenceSession 24 | 25 | model_path = os.path.join(os.path.dirname(__file__), "models", "bert") 26 | 27 | 28 | class BERT: 29 | """Phoneme-level BERT model for predicting the correct phoneme for the letter `e`. 30 | Trained with [Keras](https://keras.io/examples/nlp/masked_language_modeling/), 31 | and exported to ONNX. ONNX Runtime engine used during inference. 32 | """ 33 | 34 | def __init__(self): 35 | bert_model_path = os.path.join(model_path, "bert_mlm.onnx") 36 | token2id = os.path.join(model_path, "token2id.json") 37 | config_path = os.path.join(model_path, "config.json") 38 | self.model = WrapInferenceSession(bert_model_path, providers=onnxruntime.get_available_providers()) 39 | with open(config_path, encoding="utf-8") as file: 40 | self.config = json.load(file) 41 | with open(token2id, encoding="utf-8") as file: 42 | self.token2id = json.load(file) 43 | self.id2token = {v: k for k, v in self.token2id.items()} 44 | 45 | def predict(self, text: str) -> str: 46 | """Performs BERT inference, predicting the correct phoneme for the letter `e`. 47 | 48 | Args: 49 | text (str): Word to predict from. 50 | 51 | Returns: 52 | str: Word after prediction. 
53 | """ 54 | # `x` is currently OOV, we replace with 55 | text = text.replace("x", "ks") 56 | # mask `e`'s 57 | text = " ".join([c if c != "e" else "[mask]" for c in text]) 58 | 59 | # tokenize and pad to max length 60 | tokens = [self.token2id[c] for c in text.split()] 61 | padding = [self.token2id[self.config["pad_token"]] for _ in range(self.config["max_seq_length"] - len(tokens))] 62 | tokens = tokens + padding 63 | 64 | input_ids = np.array([tokens], dtype="int64") 65 | inputs = {"input_1": input_ids} 66 | prediction = self.model.run(None, inputs) 67 | 68 | # find masked idx token 69 | mask_token_id = self.token2id[self.config["mask_token"]] 70 | masked_index = np.where(input_ids == mask_token_id)[1] 71 | 72 | # get prediction at masked indices 73 | mask_prediction = prediction[0][0][masked_index] 74 | predicted_ids = np.argmax(mask_prediction, axis=1) 75 | 76 | # replace mask with predicted token 77 | for i, idx in enumerate(masked_index): 78 | tokens[idx] = predicted_ids[i] 79 | 80 | return "".join([self.id2token[t] for t in tokens if t != 0]) 81 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | # Home 2 | 3 | ## g2p ID: Indonesian Grapheme-to-Phoneme Converter 4 | 5 |

6 | [Badges: GitHub, Documentation, GitHub release, Contributor Covenant, Tests, Code Coverage, chat on Discord, contributing guidelines] 30 |

31 | 32 | This library is developed to convert Indonesian (Bahasa Indonesia) graphemes (words) to phonemes in IPA. We followed the methods and designs used in the English equivalent library, [g2p](https://github.com/Kyubyong/g2p). 33 | 34 | ## Installation 35 | 36 | ```bash 37 | pip install g2p_id_py 38 | ``` 39 | 40 | ## How to Use 41 | 42 | ```py 43 | from g2p_id import G2p 44 | 45 | texts = [ 46 | "Apel itu berwarna merah.", 47 | "Rahel bersekolah di Jakarta.", 48 | "Mereka sedang bermain bola di lapangan.", 49 | ] 50 | 51 | g2p = G2p() 52 | for text in texts: 53 | print(g2p(text)) 54 | 55 | >> [['a', 'p', 'ə', 'l'], ['i', 't', 'u'], ['b', 'ə', 'r', 'w', 'a', 'r', 'n', 'a'], ['m', 'e', 'r', 'a', 'h'], ['.']] 56 | >> [['r', 'a', 'h', 'e', 'l'], ['b', 'ə', 'r', 's', 'ə', 'k', 'o', 'l', 'a', 'h'], ['d', 'i'], ['dʒ', 'a', 'k', 'a', 'r', 't', 'a'], ['.']] 57 | >> [['m', 'ə', 'r', 'e', 'k', 'a'], ['s', 'ə', 'd', 'a', 'ŋ'], ['b', 'ə', 'r', 'm', 'a', 'i', 'n'], ['b', 'o', 'l', 'a'], ['d', 'i'], ['l', 'a', 'p', 'a', 'ŋ', 'a', 'n'], ['.']] 58 | ``` 59 | 60 | ## References 61 | 62 | ```bib 63 | @misc{g2pE2019, 64 | author = {Park, Kyubyong & Kim, Jongseok}, 65 | title = {g2pE}, 66 | year = {2019}, 67 | publisher = {GitHub}, 68 | journal = {GitHub repository}, 69 | howpublished = {\url{https://github.com/Kyubyong/g2p}} 70 | } 71 | ``` 72 | 73 | ```bib 74 | @misc{TextProcessor2021, 75 | author = {Cahya Wirawan}, 76 | title = {Text Processor}, 77 | year = {2021}, 78 | publisher = {GitHub}, 79 | journal = {GitHub repository}, 80 | howpublished = {\url{https://github.com/cahya-wirawan/text_processor}} 81 | } 82 | ``` 83 | 84 | ## Contributors 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /tests/test_g2p.py: -------------------------------------------------------------------------------- 1 | def test_g2p(g2p): 2 | assert g2p("Apel itu berwarna merah.") == [ 3 | ["a", "p", "ə", "l"], 4 | ["i", "t", "u"], 5 | ["b", "ə", "r", "w", "a", "r", "n", "a"], 6 | ["m", "e", "r", "a", "h"], 7 | ["."], 8 | ] 9 | assert g2p("Rahel bersekolah di S M A Jakarta 17.") == [ 10 | ["r", "a", "h", "e", "l"], 11 | ["b", "ə", "r", "s", "ə", "k", "o", "l", "a", "h"], 12 | ["d", "i"], 13 | ["e", "s"], 14 | ["e", "m"], 15 | ["a"], 16 | ["dʒ", "a", "k", "a", "r", "t", "a"], 17 | ["t", "u", "dʒ", "u", "h"], 18 | ["b", "ə", "l", "a", "s"], 19 | ["."], 20 | ] 21 | assert g2p("Mereka sedang bermain bola di lapangan.") == [ 22 | ["m", "ə", "r", "e", "k", "a"], 23 | ["s", "ə", "d", "a", "ŋ"], 24 | ["b", "ə", "r", "m", "a", "ʔ", "i", "n"], 25 | ["b", "o", "l", "a"], 26 | ["d", "i"], 27 | ["l", "a", "p", "a", "ŋ", "a", "n"], 28 | ["."], 29 | ] 30 | assert g2p("Ini rumahnya Aisyah dan Ceri.") == [ 31 | ["i", "n", "i"], 32 | ["r", "u", "m", "a", "h", "ɲ", "a"], 33 | ["a", "ʔ", "i", "ʃ", "a", "h"], 34 | ["d", "a", "n"], 35 | ["tʃ", "e", "r", "i"], 36 | ["."], 37 | ] 38 | assert g2p("keset selamat datang") == [ 39 | ["k", "e", "s", "e", "t"], 40 | ["s", "ə", "l", "a", "m", "a", "t"], 41 | ["d", "a", "t", "a", "ŋ"], 42 | ] 43 | assert g2p("kakak layak") == [["k", "a", "k", "a", "k"], ["l", "a", "j", "a", "k"]] 44 | 45 | 46 | def test_rule_based_g2p(g2p): 47 | assert g2p._rule_based_g2p("berakhirnya") == "b e r a x i r ɲ a" 48 | assert g2p._rule_based_g2p("bermaaf-maafan") == "b e r m a ʔ a f - m a ʔ a f a n" 49 | assert g2p._rule_based_g2p("kecolongan") == "k e tʃ o l o ŋ a n" 50 | assert g2p._rule_based_g2p("jayapura") == "dʒ a j a p u r a" 51 | assert g2p._rule_based_g2p("xenon") == "s e 
n o n" 52 | assert g2p._rule_based_g2p("layak") == "l a j a k" 53 | 54 | 55 | def test_lstm(lstm): 56 | assert lstm.predict("mengembangkannya") == "məŋəmbaŋkanɲa" 57 | assert lstm.predict("merdeka") == "mərdeka" 58 | assert lstm.predict("pecel") == "pətʃəl" 59 | assert lstm.predict("lele") == "lele" 60 | 61 | 62 | def test_bert(bert): 63 | assert bert.predict("mengembangkannya") == "məngəmbangkannya" 64 | assert bert.predict("merdeka") == "mərdeka" 65 | assert bert.predict("pecel") == "pəcel" 66 | assert bert.predict("lele") == "lele" 67 | assert bert.predict("banyak") == "banyak" 68 | 69 | 70 | def test_ps(g2p): 71 | assert g2p("psikologi") == [["s", "i", "k", "o", "l", "o", "ɡ", "i"]] 72 | assert g2p("psikometri") == [["s", "i", "k", "o", "m", "e", "t", "r", "i"]] 73 | assert g2p("psikotes") == [["s", "i", "k", "o", "t", "e", "s"]] 74 | 75 | 76 | def test_sticking_dot(g2p): 77 | assert g2p("Seniornya Brigadir Jendral A.Yani mengambil alih pimpinan.") == [ 78 | ["s", "ə", "n", "i", "ʔ", "o", "r", "ɲ", "a"], 79 | ["b", "r", "i", "ɡ", "a", "d", "i", "r"], 80 | ["dʒ", "ə", "n", "d", "r", "a", "l"], 81 | ["a"], 82 | ["j", "a", "n", "i"], 83 | ["m", "ə", "ŋ", "a", "m", "b", "i", "l"], 84 | ["a", "l", "i", "h"], 85 | ["p", "i", "m", "p", "i", "n", "a", "n"], 86 | ["."], 87 | ] 88 | 89 | 90 | def test_onnx_wrapper(bert): 91 | assert bert.predict("mengembangkannya") == "məngəmbangkannya" 92 | model_state = bert.model.__getstate__() 93 | bert.model.__setstate__(model_state) 94 | assert bert.predict("mengembangkannya") == "məngəmbangkannya" 95 | -------------------------------------------------------------------------------- /g2p_id/resources/homographs_id.tsv: -------------------------------------------------------------------------------- 1 | angel a ŋ e l a ŋ ə l A A 2 | apel a p ə l a p e l N V 3 | begar b e ɡ a r b ə ɡ a r V A 4 | begu b e ɡ u b ə ɡ u N N 5 | bekel b e k ə l b e k ə l N N 6 | belek b ə l e ʔ b e l e ʔ V N 7 | belok b e l o ʔ b ə l o ʔ V A 8 | bena b e n a b ə n a A N 9 | berak b e r a k b ə r a k V A 10 | berang b e r a ŋ b ə r a ŋ A N 11 | berok b e r o ʔ b ə r o ʔ N N 12 | berpendar b ə r p e n d a r b ə r p ə n d a r V V 13 | berseri b ə r s ə r i b ə r s e r i V V 14 | boreh b o r e h b o r ə h N N 15 | cegak tʃ e ɡ a ʔ tʃ ə ɡ a ʔ A A 16 | cela tʃ ə l a tʃ e l a N N 17 | celak tʃ e l a ʔ tʃ ə l a ʔ N N 18 | cetok tʃ e t o ʔ tʃ ə t o ʔ N N 19 | debut d e b u t d ə b u t N N 20 | dekan d e k a n d ə k a n N N 21 | dendang d e n d a ŋ d ə n d a ŋ N N 22 | depak d e p a ʔ d ə p a ʔ V M 23 | dera d e r a d ə r a N N 24 | embel e m b e l ə m b ə l N N 25 | erang ə r a ŋ e r a ŋ N A 26 | ganteng ɡ a n t ə ŋ ɡ a n t e ŋ A V 27 | gedek ɡ ə d ə ʔ ɡ ə d e ʔ N V 28 | gelang ɡ ə l a ŋ ɡ e l a ŋ N N 29 | genggang ɡ ə ŋ ɡ a ŋ ɡ e ŋ ɡ a ŋ N N 30 | helat h ə l a t h e l a t A N 31 | jejer dʒ e dʒ e r dʒ ə dʒ ə r V N 32 | jeli dʒ e l i dʒ ə l i N A 33 | kecap k e tʃ a p k ə tʃ a p N V 34 | keder k ə d e r k e d ə r N A 35 | kedi k e d i k ə d i N N 36 | kekel k e k e l k ə k ə l A N 37 | kelah k e l a h k ə l a h V N 38 | kelentang k ə l ə n t a ŋ k ə l e n t a ŋ N N 39 | kelenteng k ə l ə n t e ŋ k ə l e n t e ŋ N N 40 | kelepak k ə l ə p a ʔ k ə l e p a ʔ N A 41 | kelesa k ə l e s a k ə l ə s a A N 42 | kena k ə n a k e n a V N 43 | kepang k e p a ŋ k ə p a ŋ N N 44 | kepar k e p a r k ə p a r N N 45 | kere k e r e k ə r e A N 46 | keset k ə s ə t k e s e t A N 47 | ketek k e t e ʔ k ə t e ʔ N N 48 | ketel k e t e l k ə t ə l N A 49 | lebam l e b a m l ə b a m N A 50 | leding l e d i ŋ l 
ə d i ŋ N V 51 | legar l e ɡ a r l ə ɡ a r V N 52 | lembang l e m b a ŋ l ə m b a ŋ N A 53 | lempeng l e m p e ŋ l ə m p ə ŋ A A 54 | lenggang l e ŋ ɡ a ŋ l ə ŋ ɡ a ŋ N A 55 | letak l e t a ʔ l ə t a ʔ A N 56 | leter l e t e r l e t ə r A N 57 | mejan m e dʒ a n m ə dʒ a n N N 58 | memepet m ə m e p e t m ə m ə p e t V N 59 | memerah m ə m e r a h m ə m ə r a h V V 60 | mendera m ə n d e r a m ə n d e r a V A 61 | mental m e n t a l m ə n t a l V N 62 | pelak p e l a k p ə l a k N A 63 | pelang p e l a ŋ p ə l a ŋ N N 64 | pelat p ə l a t p e l a t N A 65 | pelekat p ə l ə k a t p ə l e k a t N N 66 | penggemblengan p ə ŋ ɡ ə m b l e ŋ a n p ə ŋ ɡ ə m b l ə ŋ a n N N 67 | pening p ə n i ŋ p e n i ŋ A N 68 | pentil p e n t i l p ə n t i l N N 69 | pepet p e p e t p ə p ə t V N 70 | per p e r p ə r N P 71 | rebak r e b a ʔ r ə b a ʔ A V 72 | relai r e l a i r ə l a i V N 73 | remah r e m a h r ə m a h N N 74 | rembes r e m b e s r e m b ə s A N 75 | samseng s a m s e ŋ s a m s ə ŋ N V 76 | seba s e b a s ə b a V V 77 | sebat s e b a t s ə b a t A V 78 | sedan s e d a n s ə d a n N N 79 | sela s ə l a s e l a N N 80 | selak s e l a ʔ s ə l a ʔ V V 81 | selempang s ə l e m p a ŋ s ə l ə m p a ŋ N A 82 | semen s e m ə n s e m e n N N 83 | semi s ə m i s e m i N A 84 | senggang s e ŋ ɡ a ŋ s ə ŋ ɡ a ŋ A A 85 | sengkang s e ŋ k a ŋ s ə ŋ k a ŋ N N 86 | sengkelat s ə ŋ k e l a t s ə ŋ k ə l a t A V 87 | sepak s e p a k s ə p a k N N 88 | serak s ə r a ʔ s e r a ʔ A V 89 | serang s ə r a ŋ s e r a ŋ V N 90 | seret s e r e t s ə r ə t V A 91 | seri s e r i s ə r i N A 92 | sertu s e r t u s ə r t u N V 93 | tekek t ə k e ʔ t e k e ʔ N V 94 | teken t ə k ə n t e k ə n V V 95 | tela t e l a t ə l a N N 96 | telan t e l a n t ə l a n N V 97 | teleng t e l e ŋ t ə l ə ŋ A N 98 | telor t ə l o r t e l o r N A 99 | tepak t e p a ʔ t ə p a ʔ N N 100 | tepok t ə p o ʔ t e p o ʔ V A 101 | terapi t e r a p i t ə r a p i N A 102 | teras t e r a s t ə r a s N N 103 | -------------------------------------------------------------------------------- /g2p_id/lstm.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 [PT BOOKBOT INDONESIA](https://bookbot.id/) 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 15 | """ 16 | 17 | import json 18 | import os 19 | 20 | import numpy as np 21 | import onnxruntime 22 | 23 | from g2p_id.onnx_utils import WrapInferenceSession 24 | 25 | model_path = os.path.join(os.path.dirname(__file__), "models", "lstm") 26 | 27 | 28 | class LSTM: 29 | """Phoneme-level LSTM model for sequence-to-sequence phonemization. 30 | Trained with [Keras](https://keras.io/examples/nlp/lstm_seq2seq/), 31 | and exported to ONNX. ONNX Runtime engine used during inference. 
32 | """ 33 | 34 | def __init__(self): 35 | encoder_model_path = os.path.join(model_path, "encoder_model.onnx") 36 | decoder_model_path = os.path.join(model_path, "decoder_model.onnx") 37 | g2id_path = os.path.join(model_path, "g2id.json") 38 | p2id_path = os.path.join(model_path, "p2id.json") 39 | config_path = os.path.join(model_path, "config.json") 40 | self.encoder = WrapInferenceSession( 41 | encoder_model_path, 42 | providers=onnxruntime.get_available_providers(), 43 | ) 44 | self.decoder = WrapInferenceSession( 45 | decoder_model_path, 46 | providers=onnxruntime.get_available_providers(), 47 | ) 48 | with open(g2id_path, encoding="utf-8") as file: 49 | self.g2id = json.load(file) 50 | with open(p2id_path, encoding="utf-8") as file: 51 | self.p2id = json.load(file) 52 | self.id2p = {v: k for k, v in self.p2id.items()} 53 | with open(config_path, encoding="utf-8") as file: 54 | self.config = json.load(file) 55 | 56 | def predict(self, text: str) -> str: 57 | """Performs LSTM inference, predicting phonemes of a given word. 58 | 59 | Args: 60 | text (str): Word to convert to phonemes. 61 | 62 | Returns: 63 | str: Word in phonemes. 64 | """ 65 | input_seq = np.zeros( 66 | ( 67 | 1, 68 | self.config["max_encoder_seq_length"], 69 | self.config["num_encoder_tokens"], 70 | ), 71 | dtype="float32", 72 | ) 73 | 74 | for idx, char in enumerate(text): 75 | input_seq[0, idx, self.g2id[char]] = 1.0 76 | input_seq[0, len(text) :, self.g2id[self.config["pad_token"]]] = 1.0 77 | 78 | encoder_inputs = {"input_1": input_seq} 79 | states_value = self.encoder.run(None, encoder_inputs) 80 | 81 | target_seq = np.zeros((1, 1, self.config["num_decoder_tokens"]), dtype="float32") 82 | target_seq[0, 0, self.p2id[self.config["bos_token"]]] = 1.0 83 | 84 | stop_condition = False 85 | decoded_sentence = "" 86 | while not stop_condition: 87 | decoder_inputs = { 88 | "input_2": target_seq, 89 | "input_3": states_value[0], 90 | "input_4": states_value[1], 91 | } 92 | output_tokens, state_memory, state_carry = self.decoder.run(None, decoder_inputs) 93 | 94 | sampled_token_index = np.argmax(output_tokens[0, -1, :]) 95 | sampled_char = self.id2p[sampled_token_index] 96 | decoded_sentence += sampled_char 97 | 98 | if ( 99 | sampled_char == self.config["eos_token"] 100 | or len(decoded_sentence) > self.config["max_decoder_seq_length"] 101 | ): 102 | stop_condition = True 103 | 104 | target_seq = np.zeros((1, 1, self.config["num_decoder_tokens"]), dtype="float32") 105 | target_seq[0, 0, sampled_token_index] = 1.0 106 | 107 | states_value = [state_memory, state_carry] 108 | 109 | return decoded_sentence.replace(self.config["eos_token"], "") 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # g2p ID: Indonesian Grapheme-to-Phoneme Converter 2 | 3 |

4 | [Badges: GitHub, Documentation, GitHub release, Contributor Covenant, Tests, Code Coverage, chat on Discord, contributing guidelines] 28 |

29 | 30 | This library is developed to convert Indonesian (Bahasa Indonesia) graphemes (words) to phonemes in IPA. We followed the methods and designs used in the English equivalent library, [g2p](https://github.com/Kyubyong/g2p). 31 | 32 | ## Installation 33 | 34 | ```bash 35 | pip install g2p_id_py 36 | ``` 37 | 38 | ## How to Use 39 | 40 | ```py 41 | from g2p_id import G2p 42 | 43 | texts = [ 44 | "Apel itu berwarna merah.", 45 | "Rahel bersekolah di S M A Jakarta 17.", 46 | "Mereka sedang bermain bola di lapangan.", 47 | ] 48 | 49 | g2p = G2p() 50 | for text in texts: 51 | print(g2p(text)) 52 | 53 | >> [['a', 'p', 'ə', 'l'], ['i', 't', 'u'], ['b', 'ə', 'r', 'w', 'a', 'r', 'n', 'a'], ['m', 'e', 'r', 'a', 'h'], ['.']] 54 | >> [['r', 'a', 'h', 'e', 'l'], ['b', 'ə', 'r', 's', 'ə', 'k', 'o', 'l', 'a', 'h'], ['d', 'i'], ['e', 's'], ['e', 'm'], ['a'], ['dʒ', 'a', 'k', 'a', 'r', 't', 'a'], ['t', 'u', 'dʒ', 'u', 'h'], ['b', 'ə', 'l', 'a', 's'], ['.']] 55 | >> [['m', 'ə', 'r', 'e', 'k', 'a'], ['s', 'ə', 'd', 'a', 'ŋ'], ['b', 'ə', 'r', 'm', 'a', 'i', 'n'], ['b', 'o', 'l', 'a'], ['d', 'i'], ['l', 'a', 'p', 'a', 'ŋ', 'a', 'n'], ['.']] 56 | ``` 57 | 58 | ## Algorithm 59 | 60 | This is heavily inspired by the English [g2p](https://github.com/Kyubyong/g2p). 61 | 62 | 1. Spells out Arabic numerals and some currency symbols, e.g. `Rp 200,000 -> dua ratus ribu rupiah`. This is borrowed from [Cahya's code](https://github.com/cahya-wirawan/text_processor). 63 | 2. Attempts to retrieve the correct pronunciation for homographs based on their [POS (part-of-speech) tags](#pos-tagging). 64 | 3. Looks up a lexicon (pronunciation dictionary) for non-homographs. This list is originally from [ipa-dict](https://github.com/open-dict-data/ipa-dict/blob/master/data/ma.txt), and we later made a modified version. 65 | 4. For OOVs, we predict their pronunciations using either a [BERT model](https://huggingface.co/bookbot/id-g2p-bert) or an [LSTM model](https://huggingface.co/bookbot/id-g2p-lstm). 66 | 67 | ## Phoneme and Grapheme Sets 68 | 69 | ```python 70 | graphemes = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] 71 | phonemes = ['a', 'b', 'd', 'e', 'f', 'ɡ', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'z', 'ŋ', 'ə', 'ɲ', 'tʃ', 'ʃ', 'dʒ', 'x', 'ʔ'] 72 | ``` 73 | 74 | ## Implementation Details 75 | 76 | You can find more details on how we handled homographs and out-of-vocabulary prediction on our [documentation](https://bookbot-kids.github.io/g2p_id/algorithm/) page. 77 | 78 | ## References 79 | 80 | ```bib 81 | @misc{g2pE2019, 82 | author = {Park, Kyubyong & Kim, Jongseok}, 83 | title = {g2pE}, 84 | year = {2019}, 85 | publisher = {GitHub}, 86 | journal = {GitHub repository}, 87 | howpublished = {\url{https://github.com/Kyubyong/g2p}} 88 | } 89 | ``` 90 | 91 | ```bib 92 | @misc{TextProcessor2021, 93 | author = {Cahya Wirawan}, 94 | title = {Text Processor}, 95 | year = {2021}, 96 | publisher = {GitHub}, 97 | journal = {GitHub repository}, 98 | howpublished = {\url{https://github.com/cahya-wirawan/text_processor}} 99 | } 100 | ``` 101 | 102 | ## Contributors 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /docs/algorithm.md: -------------------------------------------------------------------------------- 1 | # Algorithm 2 | 3 | This is heavily inspired by the English [g2p](https://github.com/Kyubyong/g2p). 4 | 5 | 1.
Spells out Arabic numerals and some currency symbols, e.g. `Rp 200,000 -> dua ratus ribu rupiah`. This is borrowed from [Cahya's code](https://github.com/cahya-wirawan/text_processor). 6 | 2. Attempts to retrieve the correct pronunciation for homographs based on their [POS (part-of-speech) tags](#pos-tagging). 7 | 3. Looks up a lexicon (pronunciation dictionary) for non-homographs. This list is originally from [ipa-dict](https://github.com/open-dict-data/ipa-dict/blob/master/data/ma.txt), and we later made a modified version. 8 | 4. For OOVs, we predict their pronunciations using either a [BERT model](https://huggingface.co/bookbot/id-g2p-bert) or an [LSTM model](https://huggingface.co/bookbot/id-g2p-lstm). 9 | 10 | ## Phoneme and Grapheme Sets 11 | 12 | ```python 13 | graphemes = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'] 14 | phonemes = ['a', 'b', 'd', 'e', 'f', 'ɡ', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'z', 'ŋ', 'ə', 'ɲ', 'tʃ', 'ʃ', 'dʒ', 'x', 'ʔ'] 15 | ``` 16 | 17 | ## Homographs 18 | 19 | Indonesian words (as far as we know) have only one source of homography: differing ways to pronounce the letter `e`. For instance, in the word `apel` (meaning: apple), the letter `e` is a mid central vowel `ə`. On the other hand, the letter `e` in the word `apel` (meaning: going to a significant other's house; courting) is a closed-mid front unrounded vowel `e`. Sometimes a word has more than one `e`, pronounced in both ways; for instance, `mereka` (meaning: they) is pronounced as `məreka`. Because of this, there needs to be a way to disambiguate homographs, and in our case, we used their POS (part-of-speech) tags. However, this is not a foolproof method, since homographs may even share the same POS tag. We are considering a contextual model to handle this better. 20 | 21 | ## OOV Prediction 22 | 23 | Initially, we relied on a sequence-to-sequence LSTM model for OOV (out-of-vocabulary) prediction. This was a natural choice, given that it can "automatically" learn the rules of grapheme-to-phoneme conversion without having to determine the rules by hand. However, we soon noticed that despite its validation results, the model performed poorly on unseen words, especially on longer ones. We needed a more controllable model that makes predictions only on the characters that require them. We ended up with a customized BERT that predicts the correct pronunciation of the letter `e` while keeping the rest of the string unchanged. We then apply a hand-written g2p conversion algorithm that handles the other characters. 24 | 25 | You can find more detail in [this blog post](https://wilsonwongso.dev/posts/2022/04/predicting-phonemes-with-bert/). 26 | 27 | ## POS Tagging 28 | 29 | We trained an [NLTK PerceptronTagger](https://www.nltk.org/_modules/nltk/tag/perceptron.html) on the [POSP](https://huggingface.co/datasets/indonlu) dataset, which achieved 0.956 and 0.945 F1-scores on the validation and test sets, respectively. Given its performance and speed, we decided to adopt this model as the POS tagger for disambiguating homographs, just as the English g2p library does.
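As an illustration of step 2 of the algorithm above (selecting a homograph's pronunciation by its POS tag), here is a minimal sketch of the lookup. It assumes the five-column layout of `g2p_id/resources/homographs_id.tsv` (word, two phoneme strings, then the POS tag paired with each pronunciation); the `load_homographs` and `disambiguate` helpers are hypothetical and not part of the library's API, whose actual logic lives in `g2p_id/g2p.py`.

```python
import csv


def load_homographs(path="g2p_id/resources/homographs_id.tsv"):
    """Map each homograph to its (POS tag, phoneme string) pairs."""
    homographs = {}
    with open(path, encoding="utf-8") as file:
        for word, phon1, phon2, tag1, tag2 in csv.reader(file, delimiter="\t"):
            homographs[word] = [(tag1, phon1), (tag2, phon2)]
    return homographs


def disambiguate(word, pos_tag, homographs):
    """Return the pronunciation whose stored POS tag matches the predicted tag.

    Falls back to the first pronunciation when no tag matches; when both
    stored tags are identical, the match stays ambiguous, which is exactly
    the case a contextual model would handle better.
    """
    for tag, phonemes in homographs[word]:
        if tag == pos_tag:
            return phonemes
    return homographs[word][0][1]
```

Under these assumptions, `disambiguate("apel", "N", load_homographs())` returns `a p ə l` (apple), while tag `V` selects `a p e l` (courting). The per-tag metrics of the tagger are listed in the table below.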
30 | 31 | | tag | precision | recall | f1-score | 32 | | --------- | --------- | -------- | -------- | 33 | | B-$$$ | 1.000000 | 1.000000 | 1.000000 | 34 | | B-ADJ | 0.904132 | 0.864139 | 0.883683 | 35 | | B-ADK | 1.000000 | 0.986667 | 0.993289 | 36 | | B-ADV | 0.966874 | 0.976987 | 0.971904 | 37 | | B-ART | 0.988920 | 0.978082 | 0.983471 | 38 | | B-CCN | 0.997934 | 0.997934 | 0.997934 | 39 | | B-CSN | 0.986395 | 0.963455 | 0.974790 | 40 | | B-INT | 1.000000 | 1.000000 | 1.000000 | 41 | | B-KUA | 0.976744 | 0.976744 | 0.976744 | 42 | | B-NEG | 0.992857 | 0.972028 | 0.982332 | 43 | | B-NNO | 0.919917 | 0.941288 | 0.930480 | 44 | | B-NNP | 0.917685 | 0.914703 | 0.916192 | 45 | | B-NUM | 0.997358 | 0.954488 | 0.975452 | 46 | | B-PAR | 1.000000 | 0.851064 | 0.919540 | 47 | | B-PPO | 0.991206 | 0.991829 | 0.991517 | 48 | | B-PRI | 1.000000 | 0.928571 | 0.962963 | 49 | | B-PRK | 0.793103 | 0.851852 | 0.821429 | 50 | | B-PRN | 0.988327 | 0.988327 | 0.988327 | 51 | | B-PRR | 0.995465 | 1.000000 | 0.997727 | 52 | | B-SYM | 0.999662 | 0.999323 | 0.999492 | 53 | | B-UNS | 0.916667 | 0.733333 | 0.814815 | 54 | | B-VBE | 1.000000 | 0.985714 | 0.992806 | 55 | | B-VBI | 0.929119 | 0.877034 | 0.902326 | 56 | | B-VBL | 1.000000 | 1.000000 | 1.000000 | 57 | | B-VBP | 0.926606 | 0.933457 | 0.930018 | 58 | | B-VBT | 0.939759 | 0.953333 | 0.946498 | 59 | | --------- | --------- | -------- | -------- | 60 | | macro avg | 0.966490 | 0.946937 | 0.955913 | 61 | 62 | ## Attempts that Failed 63 | 64 | - Parsed [online PDF KBBI](https://oldi.lipi.go.id/public/Kamus%20Indonesia.pdf), but it turns out that it has very little phoneme descriptions. 65 | - Scraped [online Web KBBI](https://github.com/laymonage/kbbi-python), but it had a daily bandwidth which was too low to be used at this level. 66 | 67 | ## Potential Improvements 68 | 69 | There is a ton of room for improvements, both from the technical and the linguistic side of the approaches. Consider that a failure of one component may cascade to an incorrect conclusion. For instance, an incorrect POS tag can lead to the wrong phoneme, ditto for incorrect OOV prediction. We propose the following future improvements. 70 | 71 | - [ ] Use a larger pronunciation lexicon instead of having to guess. 72 | - [x] Find a larger homograph list. 73 | - [x] Use contextual model instead of character-level RNNs. 74 | - [x] Consider hand-written rules for g2p conversion. 75 | - [x] Add to PyPI. -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | 2 | # Contributor Covenant Code of Conduct 3 | 4 | ## Our Pledge 5 | 6 | We as members, contributors, and leaders pledge to make participation in our 7 | community a harassment-free experience for everyone, regardless of age, body 8 | size, visible or invisible disability, ethnicity, sex characteristics, gender 9 | identity and expression, level of experience, education, socio-economic status, 10 | nationality, personal appearance, race, caste, color, religion, or sexual 11 | identity and orientation. 12 | 13 | We pledge to act and interact in ways that contribute to an open, welcoming, 14 | diverse, inclusive, and healthy community. 
15 | 16 | ## Our Standards 17 | 18 | Examples of behavior that contributes to a positive environment for our 19 | community include: 20 | 21 | * Demonstrating empathy and kindness toward other people 22 | * Being respectful of differing opinions, viewpoints, and experiences 23 | * Giving and gracefully accepting constructive feedback 24 | * Accepting responsibility and apologizing to those affected by our mistakes, 25 | and learning from the experience 26 | * Focusing on what is best not just for us as individuals, but for the overall 27 | community 28 | 29 | Examples of unacceptable behavior include: 30 | 31 | * The use of sexualized language or imagery, and sexual attention or advances of 32 | any kind 33 | * Trolling, insulting or derogatory comments, and personal or political attacks 34 | * Public or private harassment 35 | * Publishing others' private information, such as a physical or email address, 36 | without their explicit permission 37 | * Other conduct which could reasonably be considered inappropriate in a 38 | professional setting 39 | 40 | ## Enforcement Responsibilities 41 | 42 | Community leaders are responsible for clarifying and enforcing our standards of 43 | acceptable behavior and will take appropriate and fair corrective action in 44 | response to any behavior that they deem inappropriate, threatening, offensive, 45 | or harmful. 46 | 47 | Community leaders have the right and responsibility to remove, edit, or reject 48 | comments, commits, code, wiki edits, issues, and other contributions that are 49 | not aligned to this Code of Conduct, and will communicate reasons for moderation 50 | decisions when appropriate. 51 | 52 | ## Scope 53 | 54 | This Code of Conduct applies within all community spaces, and also applies when 55 | an individual is officially representing the community in public spaces. 56 | Examples of representing our community include using an official e-mail address, 57 | posting via an official social media account, or acting as an appointed 58 | representative at an online or offline event. 59 | 60 | ## Enforcement 61 | 62 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 63 | reported to the community leaders responsible for enforcement at [team@bookbotkids.com](mailto:team@bookbotkids.com). 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series of 86 | actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. 
This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or permanent 93 | ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within the 113 | community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.1, available at 119 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. 120 | 121 | Community Impact Guidelines were inspired by 122 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. 123 | 124 | For answers to common questions about this code of conduct, see the FAQ at 125 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at 126 | [https://www.contributor-covenant.org/translations][translations]. 127 | 128 | [homepage]: https://www.contributor-covenant.org 129 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html 130 | [Mozilla CoC]: https://github.com/mozilla/diversity 131 | [FAQ]: https://www.contributor-covenant.org/faq 132 | [translations]: https://www.contributor-covenant.org/translations 133 | -------------------------------------------------------------------------------- /g2p_id/g2p.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright 2023 [PT BOOKBOT INDONESIA](https://bookbot.id/) 3 | 4 | Licensed under the Apache License, Version 2.0 (the "License"); 5 | you may not use this file except in compliance with the License. 6 | You may obtain a copy of the License at 7 | 8 | http://www.apache.org/licenses/LICENSE-2.0 9 | 10 | Unless required by applicable law or agreed to in writing, software 11 | distributed under the License is distributed on an "AS IS" BASIS, 12 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | See the License for the specific language governing permissions and 14 | limitations under the License. 
15 | """ 16 | 17 | import os 18 | import re 19 | import pickle 20 | import unicodedata 21 | from builtins import str as unicode 22 | from itertools import permutations 23 | from typing import Dict, List, Tuple, Union 24 | 25 | import nltk 26 | from nltk.tag.perceptron import PerceptronTagger 27 | from nltk.tokenize import TweetTokenizer 28 | 29 | from g2p_id.bert import BERT 30 | from g2p_id.lstm import LSTM 31 | from g2p_id.text_processor import TextProcessor 32 | 33 | nltk.download("wordnet") 34 | resources_path = os.path.join(os.path.dirname(__file__), "resources") 35 | 36 | 37 | def construct_homographs_dictionary() -> Dict[str, Tuple[str, str, str, str]]: 38 | """Creates a dictionary of homographs 39 | 40 | Returns: 41 | Dict[str, Tuple[str, str, str, str]]: 42 | Key: WORD 43 | Value: (PH1, PH2, POS1, POS2) 44 | """ 45 | homograph_path = os.path.join(resources_path, "homographs_id.tsv") 46 | homograph2features = {} 47 | with open(homograph_path, encoding="utf-8") as file: 48 | lines = file.readlines() 49 | for line in lines: 50 | grapheme, phone_1, phone_2, pos_1, pos_2 = line.strip("\n").split("\t") 51 | homograph2features[grapheme.lower()] = (phone_1, phone_2, pos_1, pos_2) 52 | 53 | return homograph2features 54 | 55 | 56 | def construct_lexicon_dictionary() -> Dict[str, str]: 57 | """Creates a lexicon dictionary. 58 | 59 | Returns: 60 | Dict[str, str]: 61 | Key: WORD 62 | Value: Phoneme (IPA) 63 | """ 64 | lexicon_path = os.path.join(resources_path, "lexicon_id.tsv") 65 | lexicon2features = {} 66 | with open(lexicon_path, encoding="utf-8") as file: 67 | lines = file.readlines() 68 | for line in lines: 69 | grapheme, phoneme = line.strip("\n").split("\t") 70 | lexicon2features[grapheme.lower()] = phoneme 71 | return lexicon2features 72 | 73 | 74 | class G2p: 75 | """Grapheme-to-phoneme (g2p) main class for phonemization. 76 | This class provides a high-level API for grapheme-to-phoneme conversion. 77 | 78 | 1. Preprocess and normalize text 79 | 2. Word tokenizes text 80 | 3. Predict POS for every word 81 | 4. If word is non-alphabetic, add to list (i.e. punctuation) 82 | 5. If word is a homograph, check POS and use matching word's phonemes 83 | 6. If word is a non-homograph, lookup lexicon 84 | 7. Otherwise, predict with a neural network 85 | """ 86 | 87 | def __init__(self, model_type="BERT"): 88 | """Constructor for G2p. 89 | 90 | Args: 91 | model_type (str, optional): 92 | Type of neural network to use for prediction. 93 | Choices are "LSTM" or "BERT". Defaults to "BERT". 94 | """ 95 | self.homograph2features = construct_homographs_dictionary() 96 | self.lexicon2features = construct_lexicon_dictionary() 97 | self.normalizer = TextProcessor() 98 | self.tagger = PerceptronTagger(load=False) 99 | tagger_path = os.path.join(resources_path, "id_posp_tagger.pickle") 100 | with open(tagger_path, "rb") as f: 101 | self.tagger = self.tagger.decode_json_obj(pickle.load(f)) 102 | self.model: Union[BERT, LSTM] = BERT() if model_type == "BERT" else LSTM() 103 | self.tokenizer = TweetTokenizer() 104 | self.pos_dict = { 105 | "N": ["B-NNO", "B-NNP", "B-PRN", "B-PRN", "B-PRK"], 106 | "V": ["B-VBI", "B-VBT", "B-VBP", "B-VBL", "B-VBE"], 107 | "A": ["B-ADJ"], 108 | "P": ["B-PAR"], 109 | } 110 | 111 | def _preprocess(self, text: str) -> str: 112 | """Performs preprocessing. 
113 | (1) Adds spaces in between tokens 114 | (2) Normalizes unicode and accents 115 | (3) Normalizes numbers 116 | (4) Lower case texts 117 | (5) Removes unwanted tokens 118 | 119 | Arguments: 120 | text (str): Text to preprocess. 121 | 122 | Returns: 123 | str: Preprocessed text. 124 | """ 125 | text = text.replace("-", " ") 126 | text = re.sub(r"\.(?=.*\.)", " ", text) 127 | text = " ".join(self.tokenizer.tokenize(text)) 128 | text = unicode(text) 129 | text = "".join(char for char in unicodedata.normalize("NFD", text) if unicodedata.category(char) != "Mn") 130 | text = self.normalizer.normalize(text).strip() 131 | text = text.lower() 132 | text = re.sub(r"[^ a-z'.,?!\-]", "", text) 133 | return text 134 | 135 | def _rule_based_g2p(self, text: str) -> str: 136 | """Applies rule-based Indonesian grapheme2phoneme conversion. 137 | 138 | Args: 139 | text (str): Grapheme text to convert to phoneme. 140 | 141 | Returns: 142 | str: Phoneme string. 143 | """ 144 | phonetic_mapping = { 145 | "ny": "ɲ", 146 | "ng": "ŋ", 147 | "sy": "ʃ", 148 | "aa": "aʔa", 149 | "ii": "iʔi", 150 | "oo": "oʔo", 151 | "əə": "əʔə", 152 | "uu": "uʔu", 153 | "'": "ʔ", 154 | "g": "ɡ", 155 | "q": "k", 156 | "j": "dʒ", 157 | "y": "j", 158 | "x": "ks", 159 | "c": "tʃ", 160 | "kh": "x", 161 | } 162 | 163 | if text.startswith("x"): 164 | text = "s" + text[1:] 165 | 166 | if text.startswith("ps"): 167 | text = text[1:] 168 | 169 | for graph, phone in phonetic_mapping.items(): 170 | text = text.replace(graph, phone) 171 | 172 | phonemes = [list(phn) if phn not in ("dʒ", "tʃ") else [phn] for phn in re.split("(tʃ|dʒ)", text)] 173 | return " ".join([p for phn in phonemes for p in phn]) 174 | 175 | def __call__(self, text: str) -> List[List[str]]: 176 | """Grapheme-to-phoneme converter. 177 | 178 | 1. Preprocess and normalize text 179 | 2. Word tokenizes text 180 | 3. Predict POS for every word 181 | 4. If word is non-alphabetic, add to list (i.e. punctuation) 182 | 5. If word is a homograph, check POS and use matching word's phonemes 183 | 6. If word is a non-homograph, lookup lexicon 184 | 7. Otherwise, predict with a neural network 185 | 186 | Args: 187 | text (str): Grapheme text to convert to phoneme. 188 | 189 | Returns: 190 | List[List[str]]: List of strings in phonemes. 
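        Examples (illustrative; exact output depends on the bundled lexicon):
            >>> g2p = G2p()
            >>> g2p("Ibu makan nasi.")
            [['i', 'b', 'u'], ['m', 'a', 'k', 'a', 'n'], ['n', 'a', 's', 'i'], ['.']]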
191 | """ 192 | text = self._preprocess(text) 193 | words = self.tokenizer.tokenize(text) 194 | tokens = self.tagger.tag(words) 195 | 196 | prons = [] 197 | for word, pos in tokens: 198 | pron = "" 199 | if re.search("[a-z]", word) is None: # non-alphabetic 200 | pron = word 201 | 202 | elif word in self.homograph2features: # check if homograph 203 | pron1, pron2, pos1, _ = self.homograph2features[word] 204 | 205 | # check for the matching POS 206 | if pos in self.pos_dict[pos1]: 207 | pron = pron1 208 | else: 209 | pron = pron2 210 | 211 | elif word in self.lexicon2features: # non-homographs 212 | pron = self.lexicon2features[word] 213 | 214 | else: # predict for OOV 215 | pron = self.model.predict(word) 216 | if isinstance(self.model, BERT): 217 | pron = self._rule_based_g2p(pron) 218 | 219 | if pron.endswith("ʔ"): 220 | pron = pron[:-1] + "k" 221 | 222 | consonants = "bdjklmnprstwɲ" 223 | vowels = "aeiouə" 224 | 225 | for letter in consonants: 226 | pron = pron.replace(f"ʔ {letter}", f"k {letter}") 227 | 228 | # add a glottal stop in between consecutive vowels 229 | for v1, v2 in permutations(vowels, 2): 230 | pron = pron.replace(f"{v1} {v2}", f"{v1} ʔ {v2}") 231 | 232 | prons.append(pron.split()) 233 | 234 | return prons 235 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | Copyright 2023 [PT BOOKBOT INDONESIA](https://bookbot.id/) 179 | 180 | Licensed under the Apache License, Version 2.0 (the "License"); 181 | you may not use this file except in compliance with the License. 182 | You may obtain a copy of the License at 183 | 184 | http://www.apache.org/licenses/LICENSE-2.0 185 | 186 | Unless required by applicable law or agreed to in writing, software 187 | distributed under the License is distributed on an "AS IS" BASIS, 188 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 189 | See the License for the specific language governing permissions and 190 | limitations under the License. 191 | -------------------------------------------------------------------------------- /g2p_id/text_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | MIT License 3 | 4 | Copyright (c) 2021 Cahya Wirawan 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | """ 24 | 25 | 26 | import os 27 | import re 28 | from typing import Any 29 | 30 | from num2words import num2words 31 | 32 | resources_path = os.path.join(os.path.dirname(__file__), "resources") 33 | 34 | 35 | class TextProcessor: 36 | """Indonesian text processor to normalize numerics, currencies, and timezones.""" 37 | 38 | def __init__(self): 39 | self.measurements = {} 40 | self.thousands = ["ratus", "ribu", "juta", "miliar", "milyar", "triliun"] 41 | self.months = [ 42 | "Januari", 43 | "Februari", 44 | "Maret", 45 | "April", 46 | "Mei", 47 | "Juni", 48 | "Juli", 49 | "Agustus", 50 | "September", 51 | "Oktober", 52 | "November", 53 | "Desember", 54 | ] 55 | measurements_path = os.path.join(resources_path, "measurements.tsv") 56 | currencies_path = os.path.join(resources_path, "currency.tsv") 57 | timezones_path = os.path.join(resources_path, "timezones.tsv") 58 | 59 | with open(measurements_path, "r", encoding="utf-8") as file: 60 | for lines in file: 61 | line = lines.strip().split("\t") 62 | self.measurements[line[0]] = line[1] 63 | 64 | self.currencies = {} 65 | with open(currencies_path, "r", encoding="utf-8") as file: 66 | for lines in file: 67 | line = lines.strip().split("\t") 68 | self.currencies[line[0]] = line[1] 69 | 70 | self.timezones = {} 71 | with open(timezones_path, "r", encoding="utf-8") as file: 72 | for lines in file: 73 | line = lines.strip().split("\t") 74 | self.timezones[line[0]] = line[1] 75 | 76 | self.re_thousands = "|".join(self.thousands) 77 | self.re_currencies = r"\b" + re.sub( 78 | r"\|([^|$£€¥₩]+)", r"|\\b\1", "|".join(list(self.currencies)) 79 | ) 80 | self.re_currencies = re.sub(r"([$£€¥₩])", r"\\\1", self.re_currencies) 81 | self.re_moneys = ( 82 | rf"(({self.re_currencies}) ?([\d\.\,]+)( ({self.re_thousands})?(an)?)?)" 83 | ) 84 | self.re_measurements = "|".join(list(self.measurements)) 85 | self.re_measurements = rf"(\b([\d\.\,]+) ?({self.re_measurements})\b)" 86 | self.re_timezones = "|".join(list(self.timezones)) 87 | self.re_timezones = ( 88 | r"((\d{1,2})[\.:](\d{1,2}) " + rf"\b({self.re_timezones})\b)" 89 | ) 90 | self.re_http = re.compile( 91 | r""" 92 | (https?://(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\. 93 | [a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9()@:%_\+.~#?&//=]*) 94 | """, 95 | re.X, 96 | ) 97 | 98 | @staticmethod 99 | def is_integer(number: Any) -> bool: 100 | """Check if integer by type-casting. 101 | 102 | Args: 103 | number (Any): Number to check. 104 | 105 | Returns: 106 | bool: Is a valid integer. 107 | """ 108 | try: 109 | int(number) 110 | return True 111 | except ValueError: 112 | return False 113 | 114 | @staticmethod 115 | def is_float(number: Any) -> bool: 116 | """Check if float by type-casting. 117 | 118 | Args: 119 | number (Any): Number to check. 120 | 121 | Returns: 122 | bool: Is a valid float. 123 | """ 124 | try: 125 | float(number) 126 | return True 127 | except ValueError: 128 | return False 129 | 130 | def normalize_url(self, text: str) -> str: 131 | """Removes URL from text. 132 | 133 | Args: 134 | text (str): Text with URL to normalize. 135 | 136 | Returns: 137 | str: Normalized text with URLs removed. 
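        Examples (illustrative):
            - `"Kunjungi https://bookbot.id sekarang."` -> `"Kunjungi  sekarang."`
              (the leftover double space is collapsed later by `normalize`)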
138 | """ 139 | urls = re.findall(self.re_http, text) 140 | for url in urls: 141 | text = text.replace(url[0], "") 142 | return text 143 | 144 | def normalize_currency(self, text: str) -> str: 145 | """Normalizes international and Indonesian (Rupiah) currencies. 146 | 147 | Examples: 148 | - `"$250"` -> `"dua ratus lima puluh dollar"` 149 | - `"Rp 3,000,000"` -> `"tiga juta rupiah"` 150 | 151 | Args: 152 | text (str): Text with currency to normalize. 153 | 154 | Returns: 155 | str: Normalized text with currency transliterated. 156 | """ 157 | moneys = re.findall(self.re_moneys, text) 158 | for money in moneys: 159 | number: Any = re.sub(",", ".", re.sub(r"\.", "", money[2].strip(" ,."))) 160 | try: 161 | if number == "": 162 | continue 163 | if self.is_integer(number): 164 | number = int(number) 165 | elif self.is_float(number): 166 | number = float(number) 167 | else: 168 | number = re.sub(r"[.,]", "", number) 169 | number = int(number) 170 | number = num2words(number, to="cardinal", lang="id") 171 | text = text.replace( 172 | money[0].strip(" ,."), 173 | f"{number} {money[3]} {self.currencies[money[1]]}", 174 | ) 175 | except NotImplementedError as error: 176 | print(error) 177 | print(f"Problem with money: <{text}>: {number}") 178 | return text 179 | 180 | def normalize_measurement(self, text: str) -> str: 181 | """Normalizes measurement units, including its scalar value. 182 | 183 | Examples: 184 | - `"10,5 km"` -> `"sepuluh koma lima kilometer"` 185 | - `"5°C"` -> `"lima derajat celsius"` 186 | 187 | Args: 188 | text (str): Text with measurements to normalize. 189 | 190 | Returns: 191 | str: Normalized text with measurements transliterated. 192 | """ 193 | units = re.findall(self.re_measurements, text) 194 | for unit in units: 195 | number: Any = re.sub(",", ".", re.sub(r"\.", "", unit[1].strip(" ,."))) 196 | try: 197 | if number == "": 198 | continue 199 | if re.search(r"\.", number): 200 | number = float(number) 201 | else: 202 | number = int(number) 203 | number = num2words(number, to="cardinal", lang="id") 204 | text = text.replace( 205 | unit[0].strip(" ,."), f"{number} {self.measurements[unit[2]]}" 206 | ) 207 | except NotImplementedError as error: 208 | print(error) 209 | print(f"Problem with measurements: <{text}>: {number}") 210 | return text 211 | 212 | def normalize_date(self, text: str) -> str: 213 | """Normalizes dates. 214 | 215 | Examples: 216 | - `"(12/3/2021)"` -> `"dua belas Maret dua ribu dua puluh satu"` 217 | 218 | Args: 219 | text (str): Text with dates to normalize. 220 | 221 | Returns: 222 | str: Normalized text with dates transliterated. 223 | """ 224 | dates = re.findall(r"(\((\d{1,2})/(\d{1,2})(/(\d+))?\))", text) 225 | for date in dates: 226 | try: 227 | day = num2words(int(date[1]), to="cardinal", lang="id") 228 | month: Any = int(date[2]) - 1 229 | if month >= 12: 230 | month = 0 231 | month = self.months[month] 232 | if date[4] != "": 233 | year = num2words(int(date[4]), to="cardinal", lang="id") 234 | date_string = f"{day} {month} {year}" 235 | else: 236 | date_string = f"{day} {month}" 237 | text = text.replace(date[0], f" {date_string} ") 238 | except NotImplementedError as error: 239 | print(error) 240 | print(f"Problem with dates: <{text}>: {date}") 241 | return text 242 | 243 | def normalize_timezone(self, text: str) -> str: 244 | """Normalizes Indonesian time with timezones. 
245 | 246 | Examples: 247 | - `"22.30 WITA"` 248 | -> `"dua puluh dua lewat tiga puluh menit Waktu Indonesia Tengah"` 249 | 250 | Args: 251 | text (str): Text with timezones to normalize. 252 | 253 | Returns: 254 | str: Normalized text with timezones transliterated. 255 | """ 256 | timezones = re.findall(self.re_timezones, text) 257 | for timezone in timezones: 258 | try: 259 | hour = num2words(int(timezone[1]), to="cardinal", lang="id") 260 | minute = num2words(int(timezone[2]), to="cardinal", lang="id") 261 | zone = self.timezones[timezone[3]] 262 | if minute == "nol": 263 | time_string = f"{hour} {zone}" 264 | else: 265 | time_string = f"{hour} lewat {minute} menit {zone}" 266 | text = text.replace(timezone[0], f"{time_string}") 267 | except NotImplementedError as error: 268 | print(error) 269 | print(f"Problem with timezones: <{text}>: {timezone}") 270 | return text 271 | 272 | def normalize_number(self, text: str) -> str: 273 | """Normalizes Arabic numbers to Indonesian. 274 | 275 | Examples: 276 | - `"1.000"` -> `"seribu"` 277 | - `"10,5"` -> `"sepuluh koma lima"` 278 | 279 | Args: 280 | text (str): Text with numbers to normalize. 281 | 282 | Returns: 283 | str: Normalized text with numbers transliterated. 284 | """ 285 | re_numbers = [r"([\d.,]+)", r"\d+"] 286 | for re_number in re_numbers: 287 | number_len = 0 288 | for i in re.finditer(re_number, text): 289 | start = i.start() + number_len 290 | end = i.end() + number_len 291 | number: Any = text[start:end] 292 | number = re.sub(",", ".", re.sub(r"\.", "", number.strip(" ,."))) 293 | if number == "": 294 | continue 295 | if self.is_float(number) or self.is_integer(number): 296 | try: 297 | if self.is_integer(number): 298 | number = int(number) 299 | else: 300 | number = float(number) 301 | number = num2words(number, to="cardinal", lang="id") 302 | text = text[:start] + number + text[end:] 303 | number_len += len(number) - (end - start) 304 | except NotImplementedError as error: 305 | print(error) 306 | print(f"Problem with number: <{text}>: {number}") 307 | return text 308 | 309 | def normalize(self, text: str) -> str: 310 | """Normalizes Indonesian text by expanding: 311 | 312 | - URL 313 | - Currency 314 | - Measurements 315 | - Dates 316 | - Timezones 317 | - Arabic Numerals 318 | 319 | Args: 320 | text (str): Text to normalize. 321 | 322 | Returns: 323 | str: Normalized text. 324 | """ 325 | # Remove URL 326 | text = self.normalize_url(text) 327 | # Currency 328 | text = self.normalize_currency(text) 329 | # Measurements 330 | text = self.normalize_measurement(text) 331 | # Date 332 | text = self.normalize_date(text) 333 | # Timezones 334 | text = self.normalize_timezone(text) 335 | # Any number 336 | text = self.normalize_number(text) 337 | # collapse consecutive whitespaces 338 | text = re.sub(r"\s+", " ", text) 339 | return text 340 | --------------------------------------------------------------------------------