├── itn ├── __init__.py ├── chinese │ ├── __init__.py │ ├── rules │ │ ├── __init__.py │ │ ├── char.py │ │ ├── whitelist.py │ │ ├── postprocessor.py │ │ ├── math.py │ │ └── license_plate.py │ ├── data │ │ ├── number │ │ │ ├── dot.tsv │ │ │ ├── zero.tsv │ │ │ ├── sign.tsv │ │ │ ├── digit_zh.tsv │ │ │ ├── digit.tsv │ │ │ ├── special_dash.tsv │ │ │ └── special_tilde.tsv │ │ ├── default │ │ │ ├── blacklist.tsv │ │ │ └── whitelist.tsv │ │ ├── math │ │ │ └── operator.tsv │ │ ├── time │ │ │ ├── noon.tsv │ │ │ ├── hour.tsv │ │ │ ├── minute.tsv │ │ │ └── second.tsv │ │ ├── date │ │ │ ├── mm.tsv │ │ │ └── dd.tsv │ │ ├── license_plate │ │ │ └── province.tsv │ │ ├── money │ │ │ ├── symbol.tsv │ │ │ └── code.tsv │ │ └── measure │ │ │ ├── units_en.tsv │ │ │ └── units_zh.tsv │ └── test │ │ ├── data │ │ ├── char.txt │ │ ├── whitelist.txt │ │ ├── license_plate.txt │ │ ├── fraction.txt │ │ ├── math.txt │ │ ├── money.txt │ │ ├── normalizer_disable_standalone_number_enable_0_to_9.txt │ │ ├── date.txt │ │ ├── time.txt │ │ ├── number.txt │ │ ├── cardinal.txt │ │ └── normalizer.txt │ │ ├── utils.py │ │ └── __init__.py ├── japanese │ ├── __init__.py │ ├── rules │ │ ├── __init__.py │ │ ├── char.py │ │ ├── preprocessor.py │ │ ├── whitelist.py │ │ ├── ordinal.py │ │ ├── math.py │ │ └── money.py │ ├── test │ │ ├── __init__.py │ │ ├── data │ │ │ ├── char.txt │ │ │ ├── money.txt │ │ │ ├── whitelist.txt │ │ │ ├── math.txt │ │ │ ├── measure.txt │ │ │ ├── fraction.txt │ │ │ ├── time.txt │ │ │ ├── number.txt │ │ │ ├── date.txt │ │ │ ├── cardinal.txt │ │ │ ├── normalizer_disable_standalone_number_disable_0_to_9.txt │ │ │ ├── normalizer_disable_standalone_number_enable_0_to_9.txt │ │ │ └── normalizer_enable_standalone_number_disable_0_to_9.txt │ │ └── utils.py │ └── data │ │ ├── number │ │ ├── dot.tsv │ │ ├── hundred.tsv │ │ ├── zero.tsv │ │ ├── sign.tsv │ │ ├── thousands.tsv │ │ ├── digit.tsv │ │ ├── ties.tsv │ │ ├── hundred_digit.tsv │ │ └── teen.tsv │ │ ├── default │ │ ├── blacklist.tsv │ │ └── 
whitelist.tsv │ │ ├── char │ │ ├── oov_tags.tsv │ │ ├── punctuations_ja.tsv │ │ ├── fullwidth_to_halfwidth.tsv │ │ └── hiragana_and_katakana.tsv │ │ ├── math │ │ └── operator.tsv │ │ ├── money │ │ └── symbol.tsv │ │ ├── date │ │ ├── month.tsv │ │ ├── week.tsv │ │ └── day.tsv │ │ ├── measure │ │ ├── unit_ja.tsv │ │ └── unit_en.tsv │ │ └── time │ │ ├── hour.tsv │ │ ├── minute.tsv │ │ └── second.tsv └── __main__.py ├── tn ├── __init__.py ├── chinese │ ├── __init__.py │ ├── rules │ │ ├── __init__.py │ │ ├── char.py │ │ ├── preprocessor.py │ │ ├── math.py │ │ ├── fraction.py │ │ ├── whitelist.py │ │ └── money.py │ ├── data │ │ ├── number │ │ │ ├── dot.tsv │ │ │ ├── zero.tsv │ │ │ ├── sign.tsv │ │ │ ├── teen.tsv │ │ │ └── digit.tsv │ │ ├── default │ │ │ ├── blacklist.tsv │ │ │ └── whitelist.tsv │ │ ├── char │ │ │ ├── charset_extension.tsv │ │ │ ├── punctuations_zh.tsv │ │ │ └── fullwidth_to_halfwidth.tsv │ │ ├── math │ │ │ └── operator.tsv │ │ ├── time │ │ │ ├── noon.tsv │ │ │ ├── hour.tsv │ │ │ ├── second.tsv │ │ │ └── minute.tsv │ │ ├── date │ │ │ ├── m.tsv │ │ │ ├── mm.tsv │ │ │ ├── d.tsv │ │ │ └── dd.tsv │ │ ├── money │ │ │ ├── symbol.tsv │ │ │ └── code.tsv │ │ ├── erhua │ │ │ └── whitelist.tsv │ │ └── measure │ │ │ └── units_en.tsv │ └── test │ │ ├── data │ │ ├── char.txt │ │ ├── preprocessor.txt │ │ ├── fraction.txt │ │ ├── sport.txt │ │ ├── whitelist.txt │ │ ├── money.txt │ │ ├── postprocessor.txt │ │ ├── time.txt │ │ ├── math.txt │ │ ├── cardinal.txt │ │ ├── measure.txt │ │ ├── date.txt │ │ ├── number.txt │ │ └── normalizer.txt │ │ ├── __init__.py │ │ ├── time_test.py │ │ ├── char_test.py │ │ ├── date_test.py │ │ ├── math_test.py │ │ ├── money_test.py │ │ ├── sport_test.py │ │ ├── measure_test.py │ │ ├── fraction_test.py │ │ ├── whitelist_test.py │ │ ├── preprocessor_test.py │ │ ├── postprocessor_test.py │ │ ├── utils.py │ │ ├── cardinal_test.py │ │ └── normalizer_test.py ├── english │ ├── __init__.py │ ├── data │ │ ├── __init__.py │ │ ├── date │ │ │ ├── 
__init__.py │ │ │ ├── year_suffix.tsv │ │ │ ├── week.tsv │ │ │ ├── month_number.tsv │ │ │ ├── day.tsv │ │ │ ├── month_abbr.tsv │ │ │ └── month_name.tsv │ │ ├── money │ │ │ ├── __init__.py │ │ │ ├── per_unit.tsv │ │ │ ├── currency_minor_singular.tsv │ │ │ ├── currency_minor_plural.tsv │ │ │ └── currency_major.tsv │ │ ├── roman │ │ │ ├── __init__.py │ │ │ ├── key_word.tsv │ │ │ └── README.md │ │ ├── electronic │ │ │ ├── __init__.py │ │ │ ├── domain.tsv │ │ │ ├── words.tsv │ │ │ └── symbol.tsv │ │ ├── ordinal │ │ │ ├── __init__.py │ │ │ ├── teen.tsv │ │ │ └── digit.tsv │ │ ├── telephone │ │ │ ├── __init__.py │ │ │ ├── ip_prompt.tsv │ │ │ ├── ssn_prompt.tsv │ │ │ └── telephone_prompt.tsv │ │ ├── whitelist │ │ │ ├── __init__.py │ │ │ ├── alternatives_all_format.tsv │ │ │ ├── symbol.tsv │ │ │ ├── lj_speech.tsv │ │ │ ├── alternatives.tsv │ │ │ └── asr_with_pc.tsv │ │ ├── number │ │ │ ├── hundred.tsv │ │ │ ├── zero.tsv │ │ │ ├── digit.tsv │ │ │ ├── ty.tsv │ │ │ ├── cardinal_number_name.far │ │ │ ├── cardinal_number_name_au.far │ │ │ ├── teen.tsv │ │ │ ├── quantity_abbr.tsv │ │ │ ├── fraction.tsv │ │ │ ├── thousand.tsv │ │ │ └── __init__.py │ │ ├── measure │ │ │ ├── math_operation.tsv │ │ │ ├── __init__.py │ │ │ └── unit_alternatives.tsv │ │ ├── time │ │ │ ├── suffix.tsv │ │ │ ├── zone.tsv │ │ │ └── __init__.py │ │ └── address │ │ │ ├── address_word.tsv │ │ │ ├── __init__.py │ │ │ └── state.tsv │ ├── rules │ │ └── __init__.py │ └── test │ │ ├── __init__.py │ │ ├── data │ │ ├── roman.txt │ │ ├── range.txt │ │ ├── electronic.txt │ │ ├── word.txt │ │ ├── decimal.txt │ │ ├── telephone.txt │ │ ├── whitelist.txt │ │ ├── fraction.txt │ │ ├── ordinal.txt │ │ ├── time.txt │ │ ├── money.txt │ │ ├── measure.txt │ │ ├── date.txt │ │ └── cardinal.txt │ │ ├── utils.py │ │ ├── word_test.py │ │ ├── date_test.py │ │ ├── time_test.py │ │ ├── money_test.py │ │ ├── range_test.py │ │ ├── roman_test.py │ │ ├── decimal_test.py │ │ ├── measure_test.py │ │ ├── ordinal_test.py │ │ ├── 
normalizer_test.py │ │ ├── cardinal_test.py │ │ ├── fraction_test.py │ │ ├── telephone_test.py │ │ ├── whitelist_test.py │ │ └── electronic_test.py ├── japanese │ ├── __init__.py │ ├── test │ │ ├── __init__.py │ │ ├── data │ │ │ ├── char.txt │ │ │ ├── fraction.txt │ │ │ ├── whitelist.txt │ │ │ ├── money.txt │ │ │ ├── sport.txt │ │ │ ├── time.txt │ │ │ ├── measure.txt │ │ │ ├── math.txt │ │ │ ├── date.txt │ │ │ └── cardinal.txt │ │ ├── utils.py │ │ └── normalizer_test.py │ ├── data │ │ ├── default │ │ │ ├── blacklist.tsv │ │ │ └── whitelist.tsv │ │ ├── number │ │ │ ├── dot.tsv │ │ │ ├── zero.tsv │ │ │ ├── sign.tsv │ │ │ ├── teen.tsv │ │ │ ├── digit.tsv │ │ │ └── en_digit.tsv │ │ ├── char │ │ │ ├── oov_tags.tsv │ │ │ ├── punctuations_ja.tsv │ │ │ ├── fullwidth_to_halfwidth.tsv │ │ │ └── hiragana_and_katakana.tsv │ │ ├── date │ │ │ ├── date.tsv │ │ │ ├── dd.tsv │ │ │ ├── mm.tsv │ │ │ ├── m.tsv │ │ │ └── d.tsv │ │ ├── time │ │ │ ├── noon.tsv │ │ │ ├── hour.tsv │ │ │ ├── minute.tsv │ │ │ └── second.tsv │ │ ├── money │ │ │ ├── symbol.tsv │ │ │ └── code.tsv │ │ ├── math │ │ │ └── operator.tsv │ │ ├── measure │ │ │ ├── units_ja.tsv │ │ │ └── units_en.tsv │ │ └── sport │ │ │ └── club.tsv │ └── rules │ │ ├── char.py │ │ ├── preprocessor.py │ │ ├── transliteration.py │ │ ├── whitelist.py │ │ ├── math.py │ │ ├── fraction.py │ │ └── money.py └── __main__.py ├── runtime ├── patch │ ├── CPPLINT.cfg │ └── openfst │ │ └── src │ │ ├── CMakeLists.txt │ │ └── extensions │ │ └── special │ │ └── CMakeLists.txt ├── android │ ├── app │ │ ├── .gitignore │ │ ├── src │ │ │ ├── main │ │ │ │ ├── cpp │ │ │ │ │ ├── cmake │ │ │ │ │ ├── patch │ │ │ │ │ ├── utils │ │ │ │ │ ├── processor │ │ │ │ │ └── CMakeLists.txt │ │ │ │ ├── assets │ │ │ │ │ └── README.md │ │ │ │ ├── res │ │ │ │ │ ├── values │ │ │ │ │ │ ├── strings.xml │ │ │ │ │ │ ├── colors.xml │ │ │ │ │ │ ├── attrs.xml │ │ │ │ │ │ └── themes.xml │ │ │ │ │ ├── mipmap-hdpi │ │ │ │ │ │ ├── ic_launcher.png │ │ │ │ │ │ └── ic_launcher_round.png │ │ 
│ │ │ ├── mipmap-mdpi │ │ │ │ │ │ ├── ic_launcher.png │ │ │ │ │ │ └── ic_launcher_round.png │ │ │ │ │ ├── mipmap-xhdpi │ │ │ │ │ │ ├── ic_launcher.png │ │ │ │ │ │ └── ic_launcher_round.png │ │ │ │ │ ├── mipmap-xxhdpi │ │ │ │ │ │ ├── ic_launcher.png │ │ │ │ │ │ └── ic_launcher_round.png │ │ │ │ │ ├── mipmap-xxxhdpi │ │ │ │ │ │ ├── ic_launcher.png │ │ │ │ │ │ └── ic_launcher_round.png │ │ │ │ │ ├── mipmap-anydpi-v26 │ │ │ │ │ │ ├── ic_launcher.xml │ │ │ │ │ │ └── ic_launcher_round.xml │ │ │ │ │ ├── values-night │ │ │ │ │ │ └── themes.xml │ │ │ │ │ └── drawable-v24 │ │ │ │ │ │ └── ic_launcher_foreground.xml │ │ │ │ ├── java │ │ │ │ │ └── com │ │ │ │ │ │ └── mobvoi │ │ │ │ │ │ └── WeTextProcessing │ │ │ │ │ │ └── WeTextProcessing.java │ │ │ │ └── AndroidManifest.xml │ │ │ ├── test │ │ │ │ └── java │ │ │ │ │ └── com │ │ │ │ │ └── mobvoi │ │ │ │ │ └── WeTextProcessing │ │ │ │ │ └── ExampleUnitTest.java │ │ │ └── androidTest │ │ │ │ └── java │ │ │ │ └── com │ │ │ │ └── mobvoi │ │ │ │ └── WeTextProcessing │ │ │ │ └── ExampleInstrumentedTest.java │ │ ├── wenet.keystore │ │ └── proguard-rules.pro │ ├── settings.gradle │ ├── gradle │ │ └── wrapper │ │ │ ├── gradle-wrapper.jar │ │ │ └── gradle-wrapper.properties │ ├── .gitignore │ ├── build.gradle │ └── gradle.properties ├── utils │ ├── CMakeLists.txt │ ├── wetext_log.h │ ├── wetext_flags.h │ └── wetext_string.h ├── bin │ └── CMakeLists.txt ├── cmake │ ├── glog.cmake │ ├── gflags.cmake │ └── gtest.cmake ├── test │ ├── CMakeLists.txt │ └── string_test.cc ├── processor │ ├── CMakeLists.txt │ └── wetext_processor.h ├── README.md └── CMakeLists.txt ├── CPPLINT.cfg ├── requirements.txt ├── .flake8 ├── .pre-commit-config.yaml ├── .gitignore └── .github └── workflows ├── unittest.yml └── wheels.yml /itn/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tn/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /itn/chinese/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tn/chinese/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tn/english/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tn/japanese/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /itn/japanese/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tn/chinese/rules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tn/english/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tn/english/rules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tn/english/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tn/japanese/test/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /itn/chinese/rules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /itn/japanese/rules/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /itn/japanese/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tn/english/data/date/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tn/english/data/money/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tn/english/data/roman/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /itn/japanese/data/number/dot.tsv: -------------------------------------------------------------------------------- 1 | 点 . -------------------------------------------------------------------------------- /itn/japanese/data/number/hundred.tsv: -------------------------------------------------------------------------------- 1 | 百 -------------------------------------------------------------------------------- /tn/chinese/data/number/dot.tsv: -------------------------------------------------------------------------------- 1 | . 
点 2 | -------------------------------------------------------------------------------- /tn/english/data/electronic/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tn/english/data/ordinal/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tn/english/data/telephone/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tn/english/data/whitelist/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tn/japanese/data/default/blacklist.tsv: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /itn/japanese/data/default/blacklist.tsv: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tn/english/data/number/hundred.tsv: -------------------------------------------------------------------------------- 1 | hundred -------------------------------------------------------------------------------- /tn/english/data/number/zero.tsv: -------------------------------------------------------------------------------- 1 | zero 0 2 | -------------------------------------------------------------------------------- /tn/japanese/data/number/dot.tsv: -------------------------------------------------------------------------------- 1 | . 
点 2 | -------------------------------------------------------------------------------- /itn/chinese/data/number/dot.tsv: -------------------------------------------------------------------------------- 1 | 点 . 2 | 點 . 3 | -------------------------------------------------------------------------------- /itn/chinese/data/number/zero.tsv: -------------------------------------------------------------------------------- 1 | 零 0 2 | 洞 0 3 | -------------------------------------------------------------------------------- /itn/japanese/data/char/oov_tags.tsv: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /runtime/patch/CPPLINT.cfg: -------------------------------------------------------------------------------- 1 | exclude_files=.* 2 | -------------------------------------------------------------------------------- /tn/chinese/data/number/zero.tsv: -------------------------------------------------------------------------------- 1 | 0 零 2 | 0 零 3 | -------------------------------------------------------------------------------- /tn/english/data/ordinal/teen.tsv: -------------------------------------------------------------------------------- 1 | twelfth twelve -------------------------------------------------------------------------------- /tn/english/test/data/roman.txt: -------------------------------------------------------------------------------- 1 | IV => four 2 | -------------------------------------------------------------------------------- /tn/japanese/data/char/oov_tags.tsv: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tn/japanese/data/number/zero.tsv: -------------------------------------------------------------------------------- 1 | 0 〇 2 | 0 〇 3 | 
-------------------------------------------------------------------------------- /CPPLINT.cfg: -------------------------------------------------------------------------------- 1 | root=runtime 2 | filter=-build/c++11 3 | -------------------------------------------------------------------------------- /itn/chinese/data/default/blacklist.tsv: -------------------------------------------------------------------------------- 1 | 呃 2 | 啊 3 | -------------------------------------------------------------------------------- /itn/chinese/test/data/char.txt: -------------------------------------------------------------------------------- 1 | 中 => 中 2 | A => A 3 | -------------------------------------------------------------------------------- /itn/japanese/test/data/char.txt: -------------------------------------------------------------------------------- 1 | 中 => 中 2 | A => A 3 | -------------------------------------------------------------------------------- /tn/chinese/data/default/blacklist.tsv: -------------------------------------------------------------------------------- 1 | 呃 2 | 啊 3 | -------------------------------------------------------------------------------- /tn/chinese/test/data/char.txt: -------------------------------------------------------------------------------- 1 | 中 => 中 2 | A => A 3 | -------------------------------------------------------------------------------- /tn/chinese/test/data/preprocessor.txt: -------------------------------------------------------------------------------- 1 | 寶貝 => 宝贝 2 | -------------------------------------------------------------------------------- /tn/english/data/money/per_unit.tsv: -------------------------------------------------------------------------------- 1 | /ea each 2 | /dozen -------------------------------------------------------------------------------- /tn/english/test/utils.py: -------------------------------------------------------------------------------- 1 | ../../chinese/test/utils.py 
-------------------------------------------------------------------------------- /tn/japanese/test/data/char.txt: -------------------------------------------------------------------------------- 1 | 中 => 中 2 | A => A 3 | -------------------------------------------------------------------------------- /tn/japanese/test/utils.py: -------------------------------------------------------------------------------- 1 | ../../chinese/test/utils.py -------------------------------------------------------------------------------- /itn/chinese/test/utils.py: -------------------------------------------------------------------------------- 1 | ../../../tn/chinese/test/utils.py -------------------------------------------------------------------------------- /itn/japanese/test/data/money.txt: -------------------------------------------------------------------------------- 1 | 三千三百八十点五八ドル => $3380.58 -------------------------------------------------------------------------------- /runtime/android/app/.gitignore: -------------------------------------------------------------------------------- 1 | /build 2 | /release 3 | -------------------------------------------------------------------------------- /runtime/android/app/src/main/cpp/cmake: -------------------------------------------------------------------------------- 1 | ../../../../../cmake -------------------------------------------------------------------------------- /runtime/android/app/src/main/cpp/patch: -------------------------------------------------------------------------------- 1 | ../../../../../patch -------------------------------------------------------------------------------- /runtime/android/app/src/main/cpp/utils: -------------------------------------------------------------------------------- 1 | ../../../../../utils -------------------------------------------------------------------------------- /tn/chinese/data/number/sign.tsv: -------------------------------------------------------------------------------- 
1 | + 正 2 | ± 正负 3 | - 负 4 | -------------------------------------------------------------------------------- /tn/english/test/data/range.txt: -------------------------------------------------------------------------------- 1 | 2-3 => two to three 2 | -------------------------------------------------------------------------------- /itn/japanese/data/number/zero.tsv: -------------------------------------------------------------------------------- 1 | 〇 0 2 | 零 0 3 | ゼロ 0 4 | れい 0 -------------------------------------------------------------------------------- /itn/japanese/test/data/whitelist.txt: -------------------------------------------------------------------------------- 1 | 十三湖 => 十三湖 2 | 一月三舟 => 一月三舟 -------------------------------------------------------------------------------- /itn/japanese/test/utils.py: -------------------------------------------------------------------------------- 1 | ../../../tn/chinese/test/utils.py -------------------------------------------------------------------------------- /tn/english/data/telephone/ip_prompt.tsv: -------------------------------------------------------------------------------- 1 | IP address is 2 | IP is -------------------------------------------------------------------------------- /itn/chinese/data/number/sign.tsv: -------------------------------------------------------------------------------- 1 | 正 + 2 | 正负 ± 3 | 负 - 4 | 负的 - 5 | -------------------------------------------------------------------------------- /itn/chinese/test/data/whitelist.txt: -------------------------------------------------------------------------------- 1 | 三七二十一 => 三七二十一 2 | 一共 => 一共 3 | -------------------------------------------------------------------------------- /runtime/android/app/src/main/cpp/processor: -------------------------------------------------------------------------------- 1 | ../../../../../processor -------------------------------------------------------------------------------- 
/tn/japanese/data/number/sign.tsv: -------------------------------------------------------------------------------- 1 | + プラス 2 | ± プラスマイナス 3 | - マイナス 4 | -------------------------------------------------------------------------------- /itn/japanese/data/number/sign.tsv: -------------------------------------------------------------------------------- 1 | プラス + 2 | プラスマイナス ± 3 | マイナス - 4 | 负の - -------------------------------------------------------------------------------- /tn/chinese/data/char/charset_extension.tsv: -------------------------------------------------------------------------------- 1 | 吶 2 | 囧 3 | 屄 4 | 屌 5 | 诶 6 | 飚 7 | -------------------------------------------------------------------------------- /tn/english/data/money/currency_minor_singular.tsv: -------------------------------------------------------------------------------- 1 | $ cent 2 | € cent 3 | £ penny -------------------------------------------------------------------------------- /tn/english/test/data/electronic.txt: -------------------------------------------------------------------------------- 1 | cdf1@abc.edu => cdf one at abc dot edu 2 | -------------------------------------------------------------------------------- /itn/japanese/data/number/thousands.tsv: -------------------------------------------------------------------------------- 1 | 千 2 | 万 3 | 亿 4 | 兆 5 | 京 6 | 垓 7 | 秭 8 | 穰 9 | 沟 -------------------------------------------------------------------------------- /runtime/android/settings.gradle: -------------------------------------------------------------------------------- 1 | include ':app' 2 | rootProject.name = "WeTextProcessing" -------------------------------------------------------------------------------- /tn/chinese/test/data/fraction.txt: -------------------------------------------------------------------------------- 1 | 1/2 => 二分之一 2 | 3/16 => 十六分之三 3 | 1 / 2 => 二分之一 4 | -------------------------------------------------------------------------------- 
/tn/japanese/test/data/fraction.txt: -------------------------------------------------------------------------------- 1 | 1/100 => 百分の一 2 | -1/100 => 百分のマイナス一 3 | 1 / 2 => 二分の一 -------------------------------------------------------------------------------- /itn/chinese/data/math/operator.tsv: -------------------------------------------------------------------------------- 1 | 乘 × 2 | 减 - 3 | 到 ~ 4 | 加 + 5 | 比 : 6 | 等于 = 7 | 除 ÷ 8 | -------------------------------------------------------------------------------- /runtime/android/app/src/main/assets/README.md: -------------------------------------------------------------------------------- 1 | put tagger.fst and verbalizer.fst here. 2 | -------------------------------------------------------------------------------- /tn/chinese/test/data/sport.txt: -------------------------------------------------------------------------------- 1 | 中国1-2 => 中国一比二 2 | 爆冷0:1 => 爆冷零比一 3 | 拉齐奥 2/2 => 拉齐奥二比二 4 | -------------------------------------------------------------------------------- /tn/chinese/test/data/whitelist.txt: -------------------------------------------------------------------------------- 1 | 儿 => 2 | 婴儿 => 婴儿 3 | O2O => O to O 4 | 90后 => 九零后 5 | -------------------------------------------------------------------------------- /tn/english/data/money/currency_minor_plural.tsv: -------------------------------------------------------------------------------- 1 | $ cents 2 | US$ cents 3 | € cents 4 | £ pence -------------------------------------------------------------------------------- /tn/english/data/telephone/ssn_prompt.tsv: -------------------------------------------------------------------------------- 1 | ssn is SSN is 2 | ssn is SSN is 3 | SSN is 4 | SSN -------------------------------------------------------------------------------- /itn/__main__.py: -------------------------------------------------------------------------------- 1 | from itn.main import main 2 | 3 | if __name__ == "__main__": 4 | main() 5 | 
-------------------------------------------------------------------------------- /tn/__main__.py: -------------------------------------------------------------------------------- 1 | from tn.main import main 2 | 3 | if __name__ == "__main__": 4 | main() 5 | -------------------------------------------------------------------------------- /tn/chinese/test/data/money.txt: -------------------------------------------------------------------------------- 1 | ¥1.25 => 一点二五元 2 | CNY1.25 => 一点二五人民币 3 | CNY 1.25 => 一点二五人民币 4 | -------------------------------------------------------------------------------- /tn/japanese/data/date/date.tsv: -------------------------------------------------------------------------------- 1 | 月 月曜日 2 | 火 火曜日 3 | 水 水曜日 4 | 木 木曜日 5 | 金 金曜日 6 | 土 土曜日 7 | 日 日曜日 -------------------------------------------------------------------------------- /itn/chinese/data/time/noon.tsv: -------------------------------------------------------------------------------- 1 | 上午 a.m. 2 | 早上 a.m. 3 | 早晨 a.m. 4 | 下午 p.m. 5 | 晚上 p.m. 6 | 傍晚 p.m. 
7 | -------------------------------------------------------------------------------- /itn/japanese/data/number/digit.tsv: -------------------------------------------------------------------------------- 1 | 一 1 2 | 二 2 3 | 三 3 4 | 四 4 5 | 五 5 6 | 六 6 7 | 七 7 8 | 八 8 9 | 九 9 -------------------------------------------------------------------------------- /itn/japanese/data/number/ties.tsv: -------------------------------------------------------------------------------- 1 | 二十 2 2 | 三十 3 3 | 四十 4 4 | 五十 5 5 | 六十 6 6 | 七十 7 7 | 八十 8 8 | 九十 9 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | flake8 2 | importlib_resources 3 | pynini==2.1.6 4 | pytest 5 | pre-commit==3.5.0 6 | -------------------------------------------------------------------------------- /tn/english/data/roman/key_word.tsv: -------------------------------------------------------------------------------- 1 | chapter 2 | class 3 | part 4 | article 5 | section 6 | paragraph 7 | -------------------------------------------------------------------------------- /itn/chinese/data/number/digit_zh.tsv: -------------------------------------------------------------------------------- 1 | 一 2 | 二 3 | 两 4 | 三 5 | 四 6 | 五 7 | 六 8 | 七 9 | 八 10 | 九 11 | -------------------------------------------------------------------------------- /itn/japanese/data/math/operator.tsv: -------------------------------------------------------------------------------- 1 | カケル × 2 | 負 - 3 | マイナス - 4 | プラス + 5 | イコール = 6 | ワル ÷ 7 | から ~ 8 | 対 : -------------------------------------------------------------------------------- /tn/english/test/data/word.txt: -------------------------------------------------------------------------------- 1 | smile => smile 2 | 中国 => 中国 3 | 中 => 中 4 | 国 => 国 5 | A => A 6 | a => a 7 | -------------------------------------------------------------------------------- 
/itn/chinese/test/data/license_plate.txt: -------------------------------------------------------------------------------- 1 | 鄂a七l六二u => 鄂a7l62u 2 | 皖C九B三四E => 皖C9B34E 3 | 京A零七ZX三F => 京A07ZX3F 4 | -------------------------------------------------------------------------------- /tn/japanese/data/date/dd.tsv: -------------------------------------------------------------------------------- 1 | 01 一日 2 | 02 二日 3 | 03 三日 4 | 04 四日 5 | 05 五日 6 | 06 六日 7 | 07 七日 8 | 08 八日 9 | 09 九日 -------------------------------------------------------------------------------- /tn/japanese/data/date/mm.tsv: -------------------------------------------------------------------------------- 1 | 01 一月 2 | 02 二月 3 | 03 三月 4 | 04 四月 5 | 05 五月 6 | 06 六月 7 | 07 七月 8 | 08 八月 9 | 09 九月 -------------------------------------------------------------------------------- /tn/english/data/telephone/telephone_prompt.tsv: -------------------------------------------------------------------------------- 1 | call me at 2 | reach at 3 | reached at 4 | my number is 5 | hit me up at -------------------------------------------------------------------------------- /tn/japanese/test/data/whitelist.txt: -------------------------------------------------------------------------------- 1 | P2P => P to P 2 | B2B => B to B 3 | R-18 => R十八 4 | FOREVER21 => FOREVERトゥエンティーワン 5 | -------------------------------------------------------------------------------- /itn/japanese/data/money/symbol.tsv: -------------------------------------------------------------------------------- 1 | ドル $ 2 | ポンド £ 3 | ポンド £ 4 | バーツ ฿ 5 | ユーロ € 6 | インドルピー ₹ 7 | ルーブル ₽ 8 | スイスフラン CHF 9 | レアル R$ -------------------------------------------------------------------------------- /tn/english/data/number/digit.tsv: -------------------------------------------------------------------------------- 1 | one 1 2 | two 2 3 | three 3 4 | four 4 5 | five 5 6 | six 6 7 | seven 7 8 | eight 8 9 | nine 9 
-------------------------------------------------------------------------------- /tn/english/data/number/ty.tsv: -------------------------------------------------------------------------------- 1 | twenty 2 2 | thirty 3 3 | forty 4 4 | fifty 5 5 | sixty 6 6 | seventy 7 7 | eighty 8 8 | ninety 9 -------------------------------------------------------------------------------- /runtime/android/app/wenet.keystore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/runtime/android/app/wenet.keystore -------------------------------------------------------------------------------- /tn/chinese/data/math/operator.tsv: -------------------------------------------------------------------------------- 1 | × 乘 2 | - 减 3 | + 加 4 | = 等于 5 | ÷ 除 6 | ≥ 大于等于 7 | ≤ 小于等于 8 | >= 大于等于 9 | <= 小于等于 10 | -------------------------------------------------------------------------------- /itn/japanese/data/number/hundred_digit.tsv: -------------------------------------------------------------------------------- 1 | 百一 101 2 | 百二 102 3 | 百三 103 4 | 百四 104 5 | 百五 105 6 | 百六 106 7 | 百七 107 8 | 百八 108 9 | 百九 109 -------------------------------------------------------------------------------- /itn/japanese/data/number/teen.tsv: -------------------------------------------------------------------------------- 1 | 十 10 2 | 十一 11 3 | 十二 12 4 | 十三 13 5 | 十四 14 6 | 十五 15 7 | 十六 16 8 | 十七 17 9 | 十八 18 10 | 十九 19 -------------------------------------------------------------------------------- /runtime/android/app/src/main/res/values/strings.xml: -------------------------------------------------------------------------------- 1 | 2 | WeTextProcessing 3 | -------------------------------------------------------------------------------- /tn/english/data/measure/math_operation.tsv: -------------------------------------------------------------------------------- 1 | + plus 2 | - minus 3 | / divided 4 | ÷ divided 5 | : 
divided 6 | × times 7 | * times 8 | · times -------------------------------------------------------------------------------- /itn/chinese/test/data/fraction.txt: -------------------------------------------------------------------------------- 1 | 二分之一 => 1/2 2 | 十六分之三 => 3/16 3 | 现场有十七分之七的观众投出了赞成票可是最后唱票结果却是负十二分之七 => 现场有7/17的观众投出了赞成票可是最后唱票结果却是-7/12 4 | -------------------------------------------------------------------------------- /itn/japanese/data/date/month.tsv: -------------------------------------------------------------------------------- 1 | 一 1 2 | 二 2 3 | 三 3 4 | 四 4 5 | 五 5 6 | 六 6 7 | 七 7 8 | 八 8 9 | 九 9 10 | 十 10 11 | 十一 11 12 | 十二 12 -------------------------------------------------------------------------------- /runtime/utils/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(wetext_utils STATIC wetext_string.cc) 2 | 3 | target_link_libraries(wetext_utils PUBLIC glog) 4 | -------------------------------------------------------------------------------- /tn/chinese/test/data/postprocessor.txt: -------------------------------------------------------------------------------- 1 | 好! => 好! 
2 | 好啊 => 好 3 | 啊呃呃 => 4 | 我们안녕 => 我们 5 | 雪の花 => 雪花 6 | -------------------------------------------------------------------------------- /itn/chinese/test/data/math.txt: -------------------------------------------------------------------------------- 1 | 一加二 => 1+2 2 | 负一加二 => -1+2 3 | 一加二加三 => 1+2+3 4 | 二等于一加一 => 2=1+1 5 | 二十一到一千零一 => 21~1001 6 | 六百三到六百四 => 630~640 7 | -------------------------------------------------------------------------------- /runtime/bin/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(processor_main processor_main.cc) 2 | target_link_libraries(processor_main PUBLIC wetext_processor) 3 | -------------------------------------------------------------------------------- /tn/chinese/data/time/noon.tsv: -------------------------------------------------------------------------------- 1 | a m 上午 2 | a.m. 上午 3 | am 上午 4 | A M 上午 5 | AM 上午 6 | p m 下午 7 | p.m. 下午 8 | pm 下午 9 | P M 下午 10 | PM 下午 11 | -------------------------------------------------------------------------------- /tn/japanese/data/date/m.tsv: -------------------------------------------------------------------------------- 1 | 1 一月 2 | 2 二月 3 | 3 三月 4 | 4 四月 5 | 5 五月 6 | 6 六月 7 | 7 七月 8 | 8 八月 9 | 9 九月 10 | 10 十月 11 | 11 十一月 12 | 12 十二月 -------------------------------------------------------------------------------- /tn/chinese/data/date/m.tsv: -------------------------------------------------------------------------------- 1 | 1 一月 2 | 2 二月 3 | 3 三月 4 | 4 四月 5 | 5 五月 6 | 6 六月 7 | 7 七月 8 | 8 八月 9 | 9 九月 10 | 10 十月 11 | 11 十一月 12 | 12 十二月 13 | -------------------------------------------------------------------------------- /tn/chinese/data/money/symbol.tsv: -------------------------------------------------------------------------------- 1 | $ 美元 2 | £ 英镑 3 | £ 英镑 4 | ¥ 元 5 | ¥ 元 6 | ฿ 泰铢 7 | € 欧元 8 | ₹ 印度卢比 9 | ₽ 卢布 10 | CHF 瑞士法郎 11 | R$ 巴西雷亚尔 12 | 
-------------------------------------------------------------------------------- /tn/japanese/data/time/noon.tsv: -------------------------------------------------------------------------------- 1 | a m 午前 2 | a.m. 午前 3 | am 午前 4 | A M 午前 5 | AM 午前 6 | p m 午後 7 | p.m. 午後 8 | pm 午後 9 | P M 午後 10 | PM 午後 11 | -------------------------------------------------------------------------------- /tn/english/data/number/cardinal_number_name.far: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/tn/english/data/number/cardinal_number_name.far -------------------------------------------------------------------------------- /itn/chinese/data/date/mm.tsv: -------------------------------------------------------------------------------- 1 | 一月 01 2 | 二月 02 3 | 三月 03 4 | 四月 04 5 | 五月 05 6 | 六月 06 7 | 七月 07 8 | 八月 08 9 | 九月 09 10 | 十月 10 11 | 十一月 11 12 | 十二月 12 13 | -------------------------------------------------------------------------------- /runtime/android/gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/runtime/android/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /tn/chinese/data/date/mm.tsv: -------------------------------------------------------------------------------- 1 | 01 一月 2 | 02 二月 3 | 03 三月 4 | 04 四月 5 | 05 五月 6 | 06 六月 7 | 07 七月 8 | 08 八月 9 | 09 九月 10 | 10 十月 11 | 11 十一月 12 | 12 十二月 13 | -------------------------------------------------------------------------------- /tn/english/data/number/cardinal_number_name_au.far: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/tn/english/data/number/cardinal_number_name_au.far 
-------------------------------------------------------------------------------- /tn/japanese/data/money/symbol.tsv: -------------------------------------------------------------------------------- 1 | $ ドル 2 | £ ポンド 3 | £ ポンド 4 | ¥ 円 5 | ¥ 円 6 | ฿ バーツ 7 | € ユーロ 8 | ₹ インドルピー 9 | ₽ ルーブル 10 | CHF スイスフラン 11 | R$ レアル 12 | -------------------------------------------------------------------------------- /tn/japanese/test/data/money.txt: -------------------------------------------------------------------------------- 1 | USD1001 => 千一アメリカドル 2 | HKD1002 => 千二香港ドル 3 | ¥22 => 二十二円 4 | ¥ 22 => 二十二円 5 | $10000 => 一万ドル 6 | CAD => CAD 7 | CAD1001 => 千一カナダドル -------------------------------------------------------------------------------- /itn/japanese/test/data/math.txt: -------------------------------------------------------------------------------- 1 | 四百四マイナス二 => 404-2 2 | 一マイナス二プラス三十 => 1-2+30 3 | 三対二 => 3:2 4 | 一プラス一イコール二 => 1+1=2 5 | 一カケル二マイナス三プラス四ワル五イコール二 => 1×2-3+4÷5=2 6 | 六から七 => 6~7 -------------------------------------------------------------------------------- /tn/chinese/test/data/time.txt: -------------------------------------------------------------------------------- 1 | 2:02 => 两点零二分 2 | 11:00 => 十一点 3 | 22:58 => 二十二点五十八分 4 | 13:10:36 => 十三点十分三十六秒 5 | 1:02:36am => 上午一点零二分三十六秒 6 | 1:02:36 am => 上午一点零二分三十六秒 7 | -------------------------------------------------------------------------------- /tn/english/test/data/decimal.txt: -------------------------------------------------------------------------------- 1 | -12.5006 billion => minus twelve point five oh oh six billion 2 | 1 billion => one billion 3 | 1.5 million => one point five million 4 | -------------------------------------------------------------------------------- /itn/japanese/test/data/measure.txt: -------------------------------------------------------------------------------- 1 | 二千センチメートル每秒 => 2000cm/s 2 | 二万センチメートル每秒 => 20000cm/s 3 | 二万二千センチメートル每秒 => 22000cm/s 4 | 八百メガ秒 => 800ms 5 | 三点五千キロメートル => 
3.5km 6 | 百人 => 100人 -------------------------------------------------------------------------------- /tn/english/data/time/suffix.tsv: -------------------------------------------------------------------------------- 1 | p.m. PM 2 | p.m PM 3 | pm PM 4 | P.M. PM 5 | P.M PM 6 | PM PM 7 | a.m. AM 8 | a.m AM 9 | am AM 10 | A.M. AM 11 | A.M AM 12 | AM AM 13 | -------------------------------------------------------------------------------- /tn/english/test/data/telephone.txt: -------------------------------------------------------------------------------- 1 | +1 123-123-5678-1 => plus one, one two three, one two three, five six seven eight, one 2 | 1-800-GO-U-HAUL => one, eight hundred, GO U HAUL 3 | -------------------------------------------------------------------------------- /tn/english/test/data/whitelist.txt: -------------------------------------------------------------------------------- 1 | Ph.D. => PHD 2 | Hon. => honorable 3 | Mt. => Mount 4 | Maj. => Major 5 | Rev. => Reverend 6 | Stroudsburg, PA => Stroudsburg, Pennsylvania 7 | -------------------------------------------------------------------------------- /itn/japanese/data/date/week.tsv: -------------------------------------------------------------------------------- 1 | 月曜日 月 2 | 月曜 月 3 | 火曜日 火 4 | 火曜 火 5 | 水曜日 水 6 | 水曜 水 7 | 木曜日 木 8 | 木曜 木 9 | 金曜日 金 10 | 金曜 金 11 | 土曜日 土 12 | 土曜 土 13 | 日曜日 日 14 | 日曜 日 15 | -------------------------------------------------------------------------------- /runtime/android/app/src/main/res/mipmap-hdpi/ic_launcher.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/runtime/android/app/src/main/res/mipmap-hdpi/ic_launcher.png -------------------------------------------------------------------------------- /runtime/android/app/src/main/res/mipmap-mdpi/ic_launcher.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/runtime/android/app/src/main/res/mipmap-mdpi/ic_launcher.png -------------------------------------------------------------------------------- /tn/chinese/test/data/math.txt: -------------------------------------------------------------------------------- 1 | 1+2 => 一加二 2 | -1+2 => 负一加二 3 | 1+2+3 => 一加二加三 4 | 2 = 1 + 1 => 二等于一加一 5 | 2 ≤ 4 => 二小于等于四 6 | 2 ≥ 1 => 二大于等于一 7 | 2<=4 => 二小于等于四 8 | 2>=1 => 二大于等于一 9 | -------------------------------------------------------------------------------- /tn/english/data/ordinal/digit.tsv: -------------------------------------------------------------------------------- 1 | first one 2 | second two 3 | third three 4 | fourth four 5 | fifth five 6 | sixth six 7 | seventh seven 8 | eighth eight 9 | ninth nine 10 | -------------------------------------------------------------------------------- /tn/japanese/data/math/operator.tsv: -------------------------------------------------------------------------------- 1 | × カケル 2 | - マイナス 3 | + プラス 4 | = イコール 5 | > 大なり 6 | < 小なり 7 | ≥ 大なりイコール 8 | ≤ 小なりイコール 9 | >= 大なりイコール 10 | <= 小なりイコール 11 | ÷ ワル 12 | ~ から -------------------------------------------------------------------------------- /runtime/android/app/src/main/res/mipmap-xhdpi/ic_launcher.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/runtime/android/app/src/main/res/mipmap-xhdpi/ic_launcher.png -------------------------------------------------------------------------------- /runtime/android/app/src/main/res/mipmap-xxhdpi/ic_launcher.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/runtime/android/app/src/main/res/mipmap-xxhdpi/ic_launcher.png -------------------------------------------------------------------------------- 
/runtime/android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/runtime/android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.png -------------------------------------------------------------------------------- /tn/english/data/number/teen.tsv: -------------------------------------------------------------------------------- 1 | ten 10 2 | eleven 11 3 | twelve 12 4 | thirteen 13 5 | fourteen 14 6 | fifteen 15 7 | sixteen 16 8 | seventeen 17 9 | eighteen 18 10 | nineteen 19 -------------------------------------------------------------------------------- /itn/japanese/test/data/fraction.txt: -------------------------------------------------------------------------------- 1 | 四分の三 => 3/4 2 | 一分の一 => 1/1 3 | 二万分の三 => 3/20000 4 | 二万点三 => 20000.3 5 | ルート三分の一 => 1/√3 6 | 一点六五分の五十 => 50/1.65 7 | 二ルート六分の三 => 3/2√6 8 | 三千分の三 => 3/3000 -------------------------------------------------------------------------------- /itn/japanese/test/data/time.txt: -------------------------------------------------------------------------------- 1 | 一時三十分三秒 => 1時30分3秒 2 | 五時二十分過ぎ => 5時20分過ぎ 3 | 七分 => 7分 4 | 七秒 => 7秒 5 | 七時 => 7時 6 | 八時半頃 => 8時半頃 7 | 十時五分前 => 10時5分前 8 | 正午一分前 => 正午1分前 9 | 正午十分過ぎ => 正午10分過ぎ -------------------------------------------------------------------------------- /tn/chinese/data/number/teen.tsv: -------------------------------------------------------------------------------- 1 | 1 2 | 2 二 3 | 3 三 4 | 4 四 5 | 5 五 6 | 6 六 7 | 7 七 8 | 8 八 9 | 9 九 10 | 1 11 | 2 二 12 | 3 三 13 | 4 四 14 | 5 五 15 | 6 六 16 | 7 七 17 | 8 八 18 | 9 九 19 | -------------------------------------------------------------------------------- /tn/chinese/test/data/cardinal.txt: -------------------------------------------------------------------------------- 1 | 110 => 幺幺零 2 | 2% => 百分之二 3 | 127.0.0.1 => 一二七点零点零点一 4 | 010-64035547 => 零一零六四零三五五四七 5 | 尾号1702 => 尾号幺七零二 
6 | 尾号是3385 => 尾号是三三八五 7 | 尾号为2349 => 尾号为二三四九 8 | -------------------------------------------------------------------------------- /tn/english/data/number/quantity_abbr.tsv: -------------------------------------------------------------------------------- 1 | M million 2 | MLN million 3 | m million 4 | mln million 5 | B billion 6 | b billion 7 | BN billion 8 | bn billion 9 | K thousand 10 | k thousand -------------------------------------------------------------------------------- /tn/japanese/data/number/teen.tsv: -------------------------------------------------------------------------------- 1 | 1 2 | 2 二 3 | 3 三 4 | 4 四 5 | 5 五 6 | 6 六 7 | 7 七 8 | 8 八 9 | 9 九 10 | 1 11 | 2 二 12 | 3 三 13 | 4 四 14 | 5 五 15 | 6 六 16 | 7 七 17 | 8 八 18 | 9 九 19 | -------------------------------------------------------------------------------- /runtime/android/app/src/main/res/mipmap-hdpi/ic_launcher_round.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/runtime/android/app/src/main/res/mipmap-hdpi/ic_launcher_round.png -------------------------------------------------------------------------------- /runtime/android/app/src/main/res/mipmap-mdpi/ic_launcher_round.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/runtime/android/app/src/main/res/mipmap-mdpi/ic_launcher_round.png -------------------------------------------------------------------------------- /runtime/android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/runtime/android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.png -------------------------------------------------------------------------------- 
/runtime/android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/runtime/android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.png -------------------------------------------------------------------------------- /tn/chinese/data/number/digit.tsv: -------------------------------------------------------------------------------- 1 | 1 一 2 | 2 二 3 | 3 三 4 | 4 四 5 | 5 五 6 | 6 六 7 | 7 七 8 | 8 八 9 | 9 九 10 | 1 一 11 | 2 二 12 | 3 三 13 | 4 四 14 | 5 五 15 | 6 六 16 | 7 七 17 | 8 八 18 | 9 九 19 | -------------------------------------------------------------------------------- /tn/japanese/data/number/digit.tsv: -------------------------------------------------------------------------------- 1 | 1 一 2 | 2 二 3 | 3 三 4 | 4 四 5 | 5 五 6 | 6 六 7 | 7 七 8 | 8 八 9 | 9 九 10 | 1 一 11 | 2 二 12 | 3 三 13 | 4 四 14 | 5 五 15 | 6 六 16 | 7 七 17 | 8 八 18 | 9 九 19 | -------------------------------------------------------------------------------- /runtime/android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/runtime/android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.png -------------------------------------------------------------------------------- /tn/chinese/test/data/measure.txt: -------------------------------------------------------------------------------- 1 | 1年后 => 一年后 2 | 2年后 => 两年后 3 | 20年后 => 二十年后 4 | 100余 => 一百余 5 | 10几 => 十几 6 | 2两 => 二两 7 | 1kg => 一千克 8 | 1 kg => 一千克 9 | 10km/h => 每小时十公里 10 | 100兆 => 一百兆 11 | -------------------------------------------------------------------------------- /tn/japanese/data/number/en_digit.tsv: -------------------------------------------------------------------------------- 1 | 1 いち 2 | 2 に 3 | 3 さん 4 | 4 よん 5 | 5 ご 6 | 6 ろく 7 | 7 なな 8 | 8 はち 
9 | 9 きゅう 10 | 1 いち 11 | 2 に 12 | 3 さん 13 | 4 よん 14 | 5 ご 15 | 6 ろく 16 | 7 なな 17 | 8 はち 18 | 9 きゅう -------------------------------------------------------------------------------- /tn/english/data/time/zone.tsv: -------------------------------------------------------------------------------- 1 | cst CST 2 | c.s.t CST 3 | cet CET 4 | c.e.t CET 5 | pst PST 6 | p.s.t PST 7 | est EST 8 | e.s.t EST 9 | pt PT 10 | p.t PT 11 | et ET 12 | e.t ET 13 | gmt GMT 14 | g.m.t GMT 15 | -------------------------------------------------------------------------------- /itn/chinese/data/number/digit.tsv: -------------------------------------------------------------------------------- 1 | 一 1 2 | 幺 1 3 | 壹 1 4 | 二 2 5 | 两 2 6 | 贰 2 7 | 三 3 8 | 叁 3 9 | 四 4 10 | 肆 4 11 | 五 5 12 | 伍 5 13 | 六 6 14 | 陆 6 15 | 七 7 16 | 柒 7 17 | 拐 7 18 | 八 8 19 | 捌 8 20 | 九 9 21 | 玖 9 22 | -------------------------------------------------------------------------------- /tn/chinese/data/default/whitelist.tsv: -------------------------------------------------------------------------------- 1 | B2B B to B 2 | M.V.P M V P 3 | O2O O to O 4 | P2P P to P 5 | BY2 BY TWO 6 | By2 By Two 7 | 10后 一零后 8 | 00后 零零后 9 | 90后 九零后 10 | 80后 八零后 11 | 70后 七零后 12 | 60后 六零后 13 | 50后 五零后 14 | -------------------------------------------------------------------------------- /tn/japanese/test/data/sport.txt: -------------------------------------------------------------------------------- 1 | 中国韓国3:2 => 中国韓国三対二 2 | 中国韓国3-2 => 中国韓国三対二 3 | 中国韓国3-0 => 中国韓国三対〇 4 | ACミラン3:2 => ACミラン三対二 5 | ACミラン3-2 => ACミラン三対二 6 | 国韓3:2 => 国韓三対二 7 | 2:3 => 二対三 8 | 3:0 => 三対〇 9 | 1:1 => 一対一 10 | -------------------------------------------------------------------------------- /tn/english/data/electronic/domain.tsv: -------------------------------------------------------------------------------- 1 | .com dot com 2 | .org dot org 3 | .gov dot gov 4 | .uk dot UK 5 | .fr dot FR 6 | .net dot net 7 | .br dot BR 8 | .in dot IN 9 | .ru dot RU 10 | .de dot DE 11 | .it 
dot IT 12 | .jpg dot jpeg -------------------------------------------------------------------------------- /tn/english/data/number/fraction.tsv: -------------------------------------------------------------------------------- 1 | ¼ 1/4 2 | ½ 1/2 3 | ¾ 3/4 4 | ⅐ 1/7 5 | ⅑ 1/9 6 | ⅒ 1/10 7 | ⅓ 1/3 8 | ⅔ 2/3 9 | ⅕ 1/5 10 | ⅖ 2/5 11 | ⅗ 3/5 12 | ⅘ 4/5 13 | ⅙ 1/6 14 | ⅚ 5/6 15 | ⅛ 1/8 16 | ⅜ 3/8 17 | ⅝ 5/8 18 | ⅞ 7/8 19 | -------------------------------------------------------------------------------- /tn/english/data/date/year_suffix.tsv: -------------------------------------------------------------------------------- 1 | A. D AD 2 | A.D AD 3 | a. d AD 4 | a.d AD 5 | a. d. AD 6 | a.d. AD 7 | B. C BC 8 | B.C BC 9 | b. c BC 10 | b.c BC 11 | A. D. AD 12 | A.D. AD 13 | B. C. BC 14 | B.C. BC 15 | b. c. BC 16 | b.c. BC 17 | -------------------------------------------------------------------------------- /tn/english/data/electronic/words.tsv: -------------------------------------------------------------------------------- 1 | drive 2 | sim 3 | early 4 | access 5 | program 6 | rtx RTX 7 | developer 8 | basepod BASEPOD 9 | cuda CUDA 10 | cv 11 | enterprise 12 | services 13 | nvidia NVIDIA 14 | dgx DGX 15 | pro 16 | help 17 | -------------------------------------------------------------------------------- /tn/english/test/data/fraction.txt: -------------------------------------------------------------------------------- 1 | 23 4/5 => twenty three and four fifths 2 | 23 4/5th => twenty three and four fifths 3 | 1/3 => one third 4 | 1/2 => one half 5 | 1/4 => one quarter 6 | 2/4 => two quarters 7 | 23/44 => twenty three forty fourths 8 | -------------------------------------------------------------------------------- /tn/japanese/data/default/whitelist.tsv: -------------------------------------------------------------------------------- 1 | B2B B to B 2 | M.V.P M V P 3 | O2O O to O 4 | P2P P to P 5 | BY2 BY TWO 6 | By2 By Two 7 | R-18 R十八 8 | r-18 R十八 9 | M-1 Mワン 10 | M-1 Mワン 11 | 
M1 Mワン 12 | M1 Mワン 13 | FOREVER21 FOREVERトゥエンティーワン 14 | @ アットマーク -------------------------------------------------------------------------------- /itn/chinese/data/license_plate/province.tsv: -------------------------------------------------------------------------------- 1 | 京 2 | 津 3 | 沪 4 | 渝 5 | 冀 6 | 豫 7 | 云 8 | 辽 9 | 黑 10 | 湘 11 | 皖 12 | 鲁 13 | 新 14 | 苏 15 | 浙 16 | 赣 17 | 鄂 18 | 桂 19 | 甘 20 | 晋 21 | 蒙 22 | 陕 23 | 吉 24 | 闽 25 | 贵 26 | 粤 27 | 青 28 | 藏 29 | 川 30 | 宁 31 | 琼 32 | -------------------------------------------------------------------------------- /tn/japanese/test/data/time.txt: -------------------------------------------------------------------------------- 1 | 3:02 => 三時二分 2 | 3:40am => 午前三時四十分 3 | 3:40a.m. => 午前三時四十分 4 | 3:40AM => 午前三時四十分 5 | 3:40A M => 午前三時四十分 6 | 3:30pm => 午後三時三十分 7 | 3:30 => 三時三十分 8 | 3:30-4:34 => 三時三十分から四時三十四分 9 | 10p.m. => 午後十時 10 | 0:30くらいつく => 〇時三十分くらいつく -------------------------------------------------------------------------------- /tn/english/data/address/address_word.tsv: -------------------------------------------------------------------------------- 1 | st Street 2 | street Street 3 | expy Expressway 4 | fwy Freeway 5 | hwy Highway 6 | dr Drive 7 | ct Court 8 | ave Avenue 9 | av Avenue 10 | cir Circle 11 | blvd Boulevard 12 | alley Alley 13 | way Way 14 | jct Junction -------------------------------------------------------------------------------- /itn/japanese/data/default/whitelist.tsv: -------------------------------------------------------------------------------- 1 | 十三湖 2 | 一月三舟 3 | 一日之長 4 | 十八番 5 | 百人一首 6 | 二百十日 7 | 三度笠 8 | 千円札 9 | 二十面相 10 | 七つの海 11 | 四国八十八箇所 12 | 五箇山 13 | 千本鳥居 14 | 五月雨 15 | 六本木ヒルズ 16 | 七つの大罪 17 | 千本格子 18 | 二枚目俳優 19 | 六本木アートナイト 20 | 七人の侍 21 | 五月祭 22 | 七人の姉妹 23 | 十八番目の男 -------------------------------------------------------------------------------- /runtime/android/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | 
distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.2-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /itn/japanese/test/data/number.txt: -------------------------------------------------------------------------------- 1 | 三点一四一五九二六 => 3.1415926 2 | マイナス三点一四一五九二六 => -3.1415926 3 | 一百万千百十一 => 100万1111 4 | 一万千百十一 => 11111 5 | 一万二千三百 => 12300 6 | 二万 => 20000 7 | 三百万 => 300万 8 | 三兆三万 => 3兆3万 9 | 三千万 => 3000万 10 | 一兆三百二十万五千 => 1兆320万5000 11 | 三百二十万五千 => 320万5000 -------------------------------------------------------------------------------- /runtime/cmake/glog.cmake: -------------------------------------------------------------------------------- 1 | FetchContent_Declare(glog 2 | URL https://github.com/google/glog/archive/v0.4.0.zip 3 | URL_HASH MD5=2899b069b8229d49cd65eda5271315ad 4 | ) 5 | FetchContent_MakeAvailable(glog) 6 | include_directories(${glog_SOURCE_DIR}/src ${glog_BINARY_DIR}) 7 | -------------------------------------------------------------------------------- /tn/english/data/whitelist/alternatives_all_format.tsv: -------------------------------------------------------------------------------- 1 | st street 2 | st saint 3 | dr doctor 4 | dr drive 5 | mt mount 6 | sr senior 7 | prof professor 8 | mt mountain 9 | sr senior 10 | jr junior 11 | vol volume 12 | rd road 13 | ave avenue 14 | approx approximately 15 | -------------------------------------------------------------------------------- /tn/english/test/data/ordinal.txt: -------------------------------------------------------------------------------- 1 | 1st => first 2 | 2nd => second 3 | 3rd => third 4 | 5th => fifth 5 | 11th => eleventh 6 | 13th => thirteenth 7 | 20th => twentieth 8 | 21st => twenty first 9 | 30th => thirtieth 10 | 100th => one hundredth 11 | 1000th => one thousandth 12 | 
-------------------------------------------------------------------------------- /tn/japanese/test/data/measure.txt: -------------------------------------------------------------------------------- 1 | 2022-2023年 => 二千二十二から二千二十三年 2 | 1-3年 => 一から三年 3 | 22-26年 => 二十二から二十六年 4 | 1~3年 => 一から三年 5 | 1-3平方 => 一から三平方 6 | 0km/h => 〇キロメートル毎時 7 | 10km/h => 十キロメートル毎時 8 | 2m/s => 二メートル毎秒 9 | 100fph/s => 百フィート毎時毎秒 10 | 1-200キロ => 一から二百キロ 11 | 10-11月 => 十から十一月 -------------------------------------------------------------------------------- /itn/chinese/test/data/money.txt: -------------------------------------------------------------------------------- 1 | 一点二五元 => ¥1.25 2 | 一点二五人民币 => CNY1.25 3 | 三十四点五二一元 => ¥34.521 4 | 八九千美元 => $8000~9000 5 | 七八英镑 => £7~8 6 | 十五六卢布 => ₽15-6 7 | 四十五六新台币 => TWD45-6 8 | 七百三四十欧元 => €730-40 9 | 七百三四十马来西亚令吉 => RM730-40 10 | 三千三百八十元五角八分 => ¥3380.58 11 | 二十五元三毛 => ¥25.3 12 | -------------------------------------------------------------------------------- /runtime/cmake/gflags.cmake: -------------------------------------------------------------------------------- 1 | set(GFLAGS_NAMESPACE "gflags") 2 | 3 | FetchContent_Declare(gflags 4 | URL https://github.com/gflags/gflags/archive/v2.2.2.zip 5 | URL_HASH MD5=ff856ff64757f1381f7da260f79ba79b 6 | ) 7 | FetchContent_MakeAvailable(gflags) 8 | include_directories(${gflags_BINARY_DIR}/include) 9 | -------------------------------------------------------------------------------- /runtime/android/.gitignore: -------------------------------------------------------------------------------- 1 | *.iml 2 | .gradle 3 | /local.properties 4 | /.idea/caches 5 | /.idea/libraries 6 | /.idea/modules.xml 7 | /.idea/workspace.xml 8 | /.idea/navEditor.xml 9 | /.idea/assetWizardSettings.xml 10 | .DS_Store 11 | /build 12 | /captures 13 | .externalNativeBuild 14 | .cxx 15 | local.properties 16 | -------------------------------------------------------------------------------- /tn/english/test/data/time.txt: 
-------------------------------------------------------------------------------- 1 | 12:30 a.m. est => twelve thirty AM EST 2 | 2.30 a.m. => two thirty AM 3 | 02.30 a.m. => two thirty AM 4 | 2.00 a.m. => two AM 5 | 2 a.m. => two AM 6 | 02:00 => two o'clock 7 | 02:30 => two thirty 8 | 2:00 => two o'clock 9 | 10:00:05 a.m. => ten hours zero minutes and five seconds AM 10 | -------------------------------------------------------------------------------- /itn/japanese/test/data/date.txt: -------------------------------------------------------------------------------- 1 | 二千二十四年十月一日 => 2024年10月1日 2 | 二千二十四年十月 => 2024年10月 3 | 五から九日 => 5~9日 4 | 三から四月 => 3~4月 5 | 一月一日 => 1月1日 6 | 二十一世紀 => 21世紀 7 | 七十年代 => 70年代 8 | 七から八年 => 7~8年 9 | 二千九年 => 2009年 10 | 月曜日から金曜日 => 月曜日から金曜日 11 | 二十三年二月二十五日土曜日 => 23年2月25日土曜日 12 | 七月五から九日月曜日から金曜日 => 7月5〜9日月曜日から金曜日 13 | 今年はR六 => 今年は令和6 -------------------------------------------------------------------------------- /tn/english/test/data/money.txt: -------------------------------------------------------------------------------- 1 | $12.05 => twelve point oh five dollars 2 | $12.0500 => twelve point oh five dollars 3 | $1 => one dollar 4 | $1.00 => one dollar 5 | $0.05 => zero point oh five dollars 6 | $1 million => one million dollars 7 | $1.2 million => one point two million dollars 8 | $1.2320 => one point two three two dollars 9 | -------------------------------------------------------------------------------- /runtime/cmake/gtest.cmake: -------------------------------------------------------------------------------- 1 | FetchContent_Declare(googletest 2 | URL https://github.com/google/googletest/archive/release-1.12.1.zip 3 | URL_HASH MD5=2648d4138129812611cf6b6b4b497a3b 4 | ) 5 | if(MSVC) 6 | set(gtest_force_shared_crt ON CACHE BOOL "Always use msvcrt.dll" FORCE) 7 | endif() 8 | FetchContent_MakeAvailable(googletest) 9 | -------------------------------------------------------------------------------- /tn/chinese/data/money/code.tsv: 
-------------------------------------------------------------------------------- 1 | A$ 澳元 2 | AED 阿联酋迪拉姆 3 | ARS 阿根廷比索 4 | AUD 澳元 5 | CAD$ 加元 6 | CAD 加元 7 | CHF 瑞士法郎 8 | CNY 人民币 9 | EUR 欧元成员国 10 | GBP 英镑 11 | HK$ 港元 12 | HKD 港元 13 | INR 印度卢比 14 | J¥ 日元 15 | JPY¥ 日元 16 | JPY 日元 17 | KRW 韩元 18 | RUB 俄罗斯卢布 19 | SAR 沙特阿拉伯里亚尔 20 | SEK 瑞典克朗 21 | SGD 新加坡元 22 | THB 泰铢 23 | TRY 土耳其里拉 24 | USD 美元 25 | -------------------------------------------------------------------------------- /tn/english/data/date/week.tsv: -------------------------------------------------------------------------------- 1 | Mon Monday 2 | Mon. Monday 3 | Tu Tuesday 4 | Tu. Tuesday 5 | Wed Wednesday 6 | Wed. Wednesday 7 | Th Thursday 8 | Th. Thursday 9 | Thur Thursday 10 | Thur. Thursday 11 | Thurs Thursday 12 | Thurs. Thursday 13 | Fri Friday 14 | Fri. Friday 15 | Sat Saturday 16 | Sat. Saturday 17 | Sun Sunday 18 | Sun. Sunday 19 | -------------------------------------------------------------------------------- /tn/english/data/electronic/symbol.tsv: -------------------------------------------------------------------------------- 1 | . dot 2 | - dash 3 | _ underscore 4 | ! exclamation mark 5 | # number sign 6 | $ dollar sign 7 | % percent 8 | & ampersand 9 | ' quote 10 | * asterisk 11 | + plus 12 | / slash 13 | = equal sign 14 | ? 
question mark 15 | ^ circumflex 16 | ` right single quote 17 | | vertical bar 18 | ~ tilde 19 | , comma -------------------------------------------------------------------------------- /itn/japanese/data/date/day.tsv: -------------------------------------------------------------------------------- 1 | 一 1 2 | 二 2 3 | 三 3 4 | 四 4 5 | 五 5 6 | 六 6 7 | 七 7 8 | 八 8 9 | 九 9 10 | 十 10 11 | 十一 11 12 | 十二 12 13 | 十三 13 14 | 十四 14 15 | 十五 15 16 | 十六 16 17 | 十七 17 18 | 十八 18 19 | 十九 19 20 | 二十 20 21 | 二十一 21 22 | 二十二 22 23 | 二十三 23 24 | 二十四 24 25 | 二十五 25 26 | 二十六 26 27 | 二十七 27 28 | 二十八 28 29 | 二十九 29 30 | 三十 30 31 | 三十一 31 -------------------------------------------------------------------------------- /runtime/android/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /runtime/android/app/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /tn/chinese/data/erhua/whitelist.tsv: -------------------------------------------------------------------------------- 1 | 儿女 2 | 儿子 3 | 儿孙 4 | 女儿 5 | 儿媳 6 | 妻儿 7 | 胎儿 8 | 婴儿 9 | 新生儿 10 | 婴幼儿 11 | 幼儿 12 | 少儿 13 | 小儿 14 | 儿歌 15 | 儿童 16 | 儿科 17 | 托儿所 18 | 孤儿 19 | 儿戏 20 | 儿化 21 | 台儿庄 22 | 鹿儿岛 23 | 正儿八经 24 | 吊儿郎当 25 | 生儿育女 26 | 托儿带女 27 | 养儿防老 28 | 痴儿呆女 29 | 佳儿佳妇 30 | 儿怜兽扰 31 | 儿无常父 32 | 儿不嫌母丑 33 | 儿行千里母担忧 34 | 儿大不由爷 35 | 苏乞儿 36 | 容祖儿 37 | -------------------------------------------------------------------------------- /tn/japanese/data/measure/units_ja.tsv: -------------------------------------------------------------------------------- 1 | つ 2 | 枚 3 | 部 4 | 台 5 | 杯 6 | 匹 7 | 本 8 | 階 9 | 個 10 | 箇 11 | 个 12 | ヶ 13 | 面 14 | 名 15 | 人 16 | 歳 17 | 才 18 | 冊 19 | 話 20 | 秒 21 | 分 22 | 月 23 
| 泊 24 | 時 25 | 時間 26 | 日 27 | ヶ月 28 | 箇月 29 | 年 30 | 日 31 | 週 32 | 倍 33 | 番 34 | 度 35 | 畳 36 | 回 37 | 年前 38 | 年後 39 | 年以内 40 | 平方 41 | 平方メートル 42 | 立方 43 | 立方メートル 44 | キロ 45 | キロメトル -------------------------------------------------------------------------------- /tn/chinese/test/data/date.txt: -------------------------------------------------------------------------------- 1 | 2008-08-08 => 二零零八年八月八日 2 | 2008/08/08 => 二零零八年八月八日 3 | 2008.08.08 => 二零零八年八月八日 4 | 2008-8-8 => 二零零八年八月八日 5 | 08-08-2008 => 二零零八年八月八日 6 | 2008-08 => 二零零八年八月 7 | 2008/08 => 二零零八年八月 8 | 2008.08 => 二零零八年八月 9 | 08-2008 => 二零零八年八月 10 | 08/2008 => 二零零八年八月 11 | 08.2008 => 二零零八年八月 12 | 08-08 => 八月八日 13 | 08/08 => 八月八日 14 | 08.08 => 八月八日 15 | -------------------------------------------------------------------------------- /tn/english/data/whitelist/symbol.tsv: -------------------------------------------------------------------------------- 1 | & and 2 | # hash 3 | @ at 4 | § section 5 | ™ trademark 6 | ® registered trademark 7 | © copyright 8 | _ underscore 9 | % percent 10 | * asterisk 11 | + plus 12 | / slash 13 | = equal sign 14 | ^ circumflex 15 | | vertical bar 16 | ~ tilde 17 | $ dollar 18 | £ pound 19 | € euro 20 | ₩ won 21 | ¥ yen 22 | ° degree 23 | º degree 24 | -------------------------------------------------------------------------------- /tn/english/data/date/month_number.tsv: -------------------------------------------------------------------------------- 1 | 1 january 2 | 2 february 3 | 3 march 4 | 4 april 5 | 5 may 6 | 6 june 7 | 7 july 8 | 8 august 9 | 9 september 10 | 10 october 11 | 11 november 12 | 12 december 13 | 01 january 14 | 02 february 15 | 03 march 16 | 04 april 17 | 05 may 18 | 06 june 19 | 07 july 20 | 08 august 21 | 09 september 22 | 10 october 23 | 11 november 24 | 12 december -------------------------------------------------------------------------------- /tn/japanese/data/date/d.tsv: -------------------------------------------------------------------------------- 1 | 
1 一日 2 | 2 二日 3 | 3 三日 4 | 4 四日 5 | 5 五日 6 | 6 六日 7 | 7 七日 8 | 8 八日 9 | 9 九日 10 | 10 十日 11 | 11 十一日 12 | 12 十二日 13 | 13 十三日 14 | 14 十四日 15 | 15 十五日 16 | 16 十六日 17 | 17 十七日 18 | 18 十八日 19 | 19 十九日 20 | 20 二十日 21 | 21 二十一日 22 | 22 二十二日 23 | 23 二十三日 24 | 24 二十四日 25 | 25 二十五日 26 | 26 二十六日 27 | 27 二十七日 28 | 28 二十八日 29 | 29 二十九日 30 | 30 三十日 31 | 31 三十一日 -------------------------------------------------------------------------------- /tn/japanese/data/money/code.tsv: -------------------------------------------------------------------------------- 1 | A$ 豪ドル 2 | AED UAEディルハム 3 | ARS アルゼンチンペソ 4 | AUD 豪ドル 5 | CAD$ カナダドル 6 | CAD カナダドル 7 | CHF スイスフラン 8 | CNY 人民元 9 | EUR ユーロ 10 | GBP ポンド 11 | HK$ 香港ドル 12 | HKD 香港ドル 13 | INR インドルピー 14 | J¥ 円 15 | JPY¥ 円 16 | JPY 円 17 | KRW ウォン 18 | RUB ロシアルーブル 19 | SAR サウジリヤル 20 | SEK スウェーデンクローナ 21 | SGD シンガポールドル 22 | THB タイバーツ 23 | TRY トルコリラ 24 | USD アメリカドル 25 | -------------------------------------------------------------------------------- /tn/chinese/data/date/d.tsv: -------------------------------------------------------------------------------- 1 | 1 一日 2 | 2 二日 3 | 3 三日 4 | 4 四日 5 | 5 五日 6 | 6 六日 7 | 7 七日 8 | 8 八日 9 | 9 九日 10 | 10 十日 11 | 11 十一日 12 | 12 十二日 13 | 13 十三日 14 | 14 十四日 15 | 15 十五日 16 | 16 十六日 17 | 17 十七日 18 | 18 十八日 19 | 19 十九日 20 | 20 二十日 21 | 21 二十一日 22 | 22 二十二日 23 | 23 二十三日 24 | 24 二十四日 25 | 25 二十五日 26 | 26 二十六日 27 | 27 二十七日 28 | 28 二十八日 29 | 29 二十九日 30 | 30 三十日 31 | 31 三十一日 32 | -------------------------------------------------------------------------------- /itn/japanese/data/measure/unit_ja.tsv: -------------------------------------------------------------------------------- 1 | つ 2 | 枚 3 | 部 4 | 台 5 | 杯 6 | 匹 7 | 本 8 | 階 9 | 個 10 | 箇 11 | 円 12 | 个 13 | ヶ 14 | 面 15 | 名 16 | 人 17 | 歳 18 | 才 19 | 冊 20 | 話 21 | 秒 22 | 分 23 | 月 24 | 泊 25 | 時 26 | 時間 27 | 日 28 | ヶ月 29 | 箇月 30 | 年 31 | 日 32 | 週 33 | 倍 34 | 番 35 | 度 36 | 畳 37 | 回 38 | 年前 39 | 年後 40 | 年以内 41 | 平方 42 | 平方メートル 43 | 立方 44 | 立方メートル 45 | キロ 46 | キロメトル 47 | 世紀 48 | 
年代 -------------------------------------------------------------------------------- /itn/japanese/data/time/hour.tsv: -------------------------------------------------------------------------------- 1 | 一時 1 2 | 两時 2 3 | 三時 3 4 | 四時 4 5 | 五時 5 6 | 六時 6 7 | 七時 7 8 | 八時 8 9 | 九時 9 10 | 零時 0 11 | 一時 1 12 | 两時 2 13 | 三時 3 14 | 四時 4 15 | 五時 5 16 | 六時 6 17 | 七時 7 18 | 八時 8 19 | 九時 9 20 | 十時 10 21 | 十一時 11 22 | 十二時 12 23 | 十三時 13 24 | 十四時 14 25 | 十五時 15 26 | 十六時 16 27 | 十七時 17 28 | 十八時 18 29 | 十九時 19 30 | 二十時 20 31 | 二十一時 21 32 | 二十二時 22 33 | 二十三時 23 34 | 二十四時 24 -------------------------------------------------------------------------------- /tn/chinese/data/date/dd.tsv: -------------------------------------------------------------------------------- 1 | 01 一日 2 | 02 二日 3 | 03 三日 4 | 04 四日 5 | 05 五日 6 | 06 六日 7 | 07 七日 8 | 08 八日 9 | 09 九日 10 | 10 十日 11 | 11 十一日 12 | 12 十二日 13 | 13 十三日 14 | 14 十四日 15 | 15 十五日 16 | 16 十六日 17 | 17 十七日 18 | 18 十八日 19 | 19 十九日 20 | 20 二十日 21 | 21 二十一日 22 | 22 二十二日 23 | 23 二十三日 24 | 24 二十四日 25 | 25 二十五日 26 | 26 二十六日 27 | 27 二十七日 28 | 28 二十八日 29 | 29 二十九日 30 | 30 三十日 31 | 31 三十一日 32 | -------------------------------------------------------------------------------- /tn/english/data/number/thousand.tsv: -------------------------------------------------------------------------------- 1 | thousand 2 | million 3 | billion 4 | trillion 5 | quadrillion 6 | quintillion 7 | sextillion 8 | septillion 9 | octillion 10 | nonillion 11 | decillion 12 | undecillion 13 | duodecillion 14 | tredecillion 15 | quattuordecillion 16 | quindecillion 17 | sexdecillion 18 | septendecillion 19 | octodecillion 20 | novemdecillion 21 | vigintillion 22 | centillion -------------------------------------------------------------------------------- /tn/english/data/whitelist/lj_speech.tsv: -------------------------------------------------------------------------------- 1 | Mr. mister 2 | Mrs. misses 3 | Dr. doctor 4 | Drs. doctors 5 | Co. company 6 | Lt. lieutenant 7 | Sgt. 
sergeant 8 | St. saint 9 | Jr. junior 10 | Maj. major 11 | Hon. honorable 12 | Gov. governor 13 | Capt. captain 14 | Esq. esquire 15 | Gen. general 16 | Ltd. limited 17 | Rev. reverend 18 | Col. colonel 19 | Mt. mount 20 | Ft. fort 21 | etc. et cetera 22 | -------------------------------------------------------------------------------- /tn/chinese/data/time/hour.tsv: -------------------------------------------------------------------------------- 1 | 1 一点 2 | 2 两点 3 | 3 三点 4 | 4 四点 5 | 5 五点 6 | 6 六点 7 | 7 七点 8 | 8 八点 9 | 9 九点 10 | 00 零点 11 | 01 一点 12 | 02 两点 13 | 03 三点 14 | 04 四点 15 | 05 五点 16 | 06 六点 17 | 07 七点 18 | 08 八点 19 | 09 九点 20 | 10 十点 21 | 11 十一点 22 | 12 十二点 23 | 13 十三点 24 | 14 十四点 25 | 15 十五点 26 | 16 十六点 27 | 17 十七点 28 | 18 十八点 29 | 19 十九点 30 | 20 二十点 31 | 21 二十一点 32 | 22 二十二点 33 | 23 二十三点 34 | 24 二十四点 35 | -------------------------------------------------------------------------------- /tn/english/test/data/measure.txt: -------------------------------------------------------------------------------- 1 | -12kg => negative twelve kilograms 2 | 1kg => one kilogram 3 | .5kg => point five kilograms 4 | 3.5 cm² => three point five square centimeters 5 | 2788 San Tomas Expy, Santa Clara, CA 95051 => twenty seven eighty eight San Tomas Expressway, Santa Clara, California nine five oh five one 6 | 2-3 °C => two to three degrees Celsius 7 | 2 * 10 μm => two times ten micrometers 8 | -------------------------------------------------------------------------------- /itn/chinese/data/time/hour.tsv: -------------------------------------------------------------------------------- 1 | 一点 1 2 | 两点 2 3 | 三点 3 4 | 四点 4 5 | 五点 5 6 | 六点 6 7 | 七点 7 8 | 八点 8 9 | 九点 9 10 | 零点 00 11 | 一点 01 12 | 两点 02 13 | 三点 03 14 | 四点 04 15 | 五点 05 16 | 六点 06 17 | 七点 07 18 | 八点 08 19 | 九点 09 20 | 十点 10 21 | 十一点 11 22 | 十二点 12 23 | 十三点 13 24 | 十四点 14 25 | 十五点 15 26 | 十六点 16 27 | 十七点 17 28 | 十八点 18 29 | 十九点 19 30 | 二十点 20 31 | 二十一点 21 32 | 二十二点 22 33 | 二十三点 23 34 | 二十四点 24 35 | 
-------------------------------------------------------------------------------- /tn/japanese/data/time/hour.tsv: -------------------------------------------------------------------------------- 1 | 0 〇時 2 | 1 一時 3 | 2 二時 4 | 3 三時 5 | 4 四時 6 | 5 五時 7 | 6 六時 8 | 7 七時 9 | 8 八時 10 | 9 九時 11 | 00 〇時 12 | 01 一時 13 | 02 二時 14 | 03 三時 15 | 04 四時 16 | 05 五時 17 | 06 六時 18 | 07 七時 19 | 08 八時 20 | 09 九時 21 | 10 十時 22 | 11 十一時 23 | 12 十二時 24 | 13 十三時 25 | 14 十四時 26 | 15 十五時 27 | 16 十六時 28 | 17 十七時 29 | 18 十八時 30 | 19 十九時 31 | 20 二十時 32 | 21 二十一時 33 | 22 二十二時 34 | 23 二十三時 35 | 24 二十四時 36 | -------------------------------------------------------------------------------- /tn/english/data/date/day.tsv: -------------------------------------------------------------------------------- 1 | one 2 | two 3 | three 4 | four 5 | five 6 | six 7 | seven 8 | eight 9 | nine 10 | ten 11 | eleven 12 | twelve 13 | thirteen 14 | fourteen 15 | fifteen 16 | sixteen 17 | seventeen 18 | eighteen 19 | nineteen 20 | twenty 21 | twenty one 22 | twenty two 23 | twenty three 24 | twenty four 25 | twenty five 26 | twenty six 27 | twenty seven 28 | twenty eight 29 | twenty nine 30 | thirty 31 | thirty one -------------------------------------------------------------------------------- /runtime/android/app/src/main/java/com/mobvoi/WeTextProcessing/WeTextProcessing.java: -------------------------------------------------------------------------------- 1 | package com.mobvoi.WeTextProcessing; 2 | 3 | public class WeTextProcessing { 4 | 5 | static { 6 | System.loadLibrary("wetextprocessing"); 7 | } 8 | 9 | public static native void init(String modelDir); 10 | public static native String normalize(String input); 11 | public static native String inverse_normalize(String input); 12 | } 13 | -------------------------------------------------------------------------------- /itn/chinese/test/data/normalizer_disable_standalone_number_enable_0_to_9.txt: 
-------------------------------------------------------------------------------- 1 | 二点五平方电线,五,五十五,疑是银河落九天,十二块五 => 2.5平方电线,五,五十五,疑是银河落9天,12块五 2 | 三百九十九三盒 => 三百九十九3盒 3 | 这是八百一千 => 这是八百一千 4 | 这是二十九千 => 这是二十九千 5 | 这是九十九九千 => 这是九十九九千 6 | 这是十二一千 => 这是十二一千 7 | 这是零百 => 这是零百 8 | 这是零千 => 这是零千 9 | 这是一百一个,一千两位,一万三天 => 这是100 1个,1000 2位,10000 3天 10 | 这是九百九周,九千九月,九万九年 => 这是900 9周,9000 9月,90000 9年 11 | 十三五规划期间获得了十几万和几十万甚至二十几万的投资 => 十三五规划期间获得了十几万和几十万甚至二十几万的投资 12 | -------------------------------------------------------------------------------- /itn/chinese/test/data/date.txt: -------------------------------------------------------------------------------- 1 | 二零零八年八月八日 => 2008/08/08 2 | 二零零八年八月 => 2008/08 3 | 八月八日 => 08/08 4 | 二零零八年 => 2008年 5 | 二千年 => 2000年 6 | 两千零五年八月五号 => 2005年08/05 7 | 公元一九九六年 => 公元1996年 8 | 公元一六三年 => 公元163年 9 | 八五年二月二十七日 => 85/02/27 10 | 这件事发生在二零一九年九月十二日的晚上或者两千零五年八月五号的晚上或者在八五年二月二十七日的晚上或者在公元一九九六年或者在八六年八月十八日或者于一九九五年三月一日或者在公元一六三年或者在零六年一月二号 => 这件事发生在2019/09/12的晚上或者2005年08/05的晚上或者在85/02/27的晚上或者在公元1996年或者在86/08/18或者于1995/03/01或者在公元163年或者在06/01/02 11 | -------------------------------------------------------------------------------- /itn/chinese/test/data/time.txt: -------------------------------------------------------------------------------- 1 | 两点零二分 => 2:02 2 | 十三点十分三十六秒 => 13:10:36 3 | 上午一点零二分三十六秒 => 1:02:36a.m. 4 | 早上一点零二 => 1:02a.m. 5 | 早上一点零二分 => 1:02a.m. 6 | 晚上一点零二分 => 1:02p.m. 7 | 零点十分 => 00:10 8 | 下午八点零五分 => 8:05p.m. 
9 | 八点零五分 => 8:05 10 | 八点三十 => 8:30 11 | 八点半 => 8:30 12 | 时间上是零点十分登机,八点五分下飞机,八点三十去吃早饭.你是说八点零五分下飞机还是八点零五下飞机?我可能下午三点四十,或者上午十点半再或者下午三点四十一分去找你 => 时间上是00:10登机,8.5分下飞机,8:30去吃早饭.你是说8:05下飞机还是8:05下飞机?我可能3:40p.m.,或者10:30a.m.再或者3:41p.m.去找你 13 | -------------------------------------------------------------------------------- /runtime/android/build.gradle: -------------------------------------------------------------------------------- 1 | buildscript { 2 | repositories { 3 | google() 4 | jcenter() 5 | } 6 | dependencies { 7 | classpath 'com.android.tools.build:gradle:4.2.2' 8 | } 9 | } 10 | 11 | allprojects { 12 | repositories { 13 | google() 14 | jcenter() 15 | maven { url 'https://jitpack.io' } 16 | } 17 | } 18 | 19 | task clean(type: Delete) { 20 | delete rootProject.buildDir 21 | } -------------------------------------------------------------------------------- /tn/japanese/data/sport/club.tsv: -------------------------------------------------------------------------------- 1 | FCバルセロナ 2 | レアル・マドリード 3 | マンチェスター・ユナイテッド 4 | マンチェスター・シティ 5 | バイエルン・ミュンヘン 6 | リヴァプールFC 7 | ユヴェントスFC 8 | パリ・サンジェルマンFC 9 | ACミラン 10 | インテル・ミラノ 11 | チェルシーFC 12 | アーセナルFC 13 | ボルシア・ドルトムント 14 | トッテナム・ホットスパーFC 15 | ASローマ 16 | アトレティコ・マドリード 17 | SSCナポリ 18 | バレンシアCF 19 | オリンピック・リヨン 20 | オリンピック・マルセイユ 21 | SLベンフィカ 22 | FCポルト 23 | アヤックス・アムステルダム 24 | フェイエノールト 25 | SSラツィオ 26 | ビジャレアルCF 27 | セビージャFC 28 | スパルタク・モスクワ 29 | ゼニト・サンクトペテルブルク 30 | セルティックFC 31 | レンジャーズFC -------------------------------------------------------------------------------- /tn/english/data/whitelist/alternatives.tsv: -------------------------------------------------------------------------------- 1 | Hon. Honorable 2 | Mr. Mister 3 | Mrs. Misses 4 | Ms. Miss 5 | Mr Mister 6 | Mrs Misses 7 | Ms Miss 8 | &Co. and Co. 9 | &Co. and Company 10 | Mon Monday 11 | Tu Tuesday 12 | Wed Wednesday 13 | Th Thursday 14 | Thur Thursday 15 | Thurs Thursday 16 | Fri Friday 17 | Sat Saturday 18 | Sun Sunday 19 | = equals 20 | # number 21 | No. number 22 | NO. 
number 23 | NO. number 24 | No. number 25 | VOL. Volume 26 | Vol. Volume 27 | TV Television 28 | -------------------------------------------------------------------------------- /tn/chinese/test/data/number.txt: -------------------------------------------------------------------------------- 1 | -1 => 负一 2 | 0 => 零 3 | 1 => 一 4 | 2 => 二 5 | 10 => 十 6 | 11 => 十一 7 | 20 => 二十 8 | 100 => 一百 9 | 101 => 一百零一 10 | 111 => 一百一十一 11 | 200 => 两百 12 | 1000 => 一千 13 | 1001 => 一千零一 14 | 1011 => 一千零一十一 15 | 1111 => 一千一百一十一 16 | 2000 => 两千 17 | 10000 => 一万 18 | 10001 => 一万零一 19 | 10011 => 一万零一十一 20 | 10111 => 一万零一百一十一 21 | 11111 => 一万一千一百一十一 22 | 20000 => 两万 23 | 101111 => 十万一千一百一十一 24 | 1001111 => 一百万一千一百一十一 25 | 10001111 => 一千万一千一百一十一 26 | 1.01 => 一点零一 27 | 1.11 => 一点一一 28 | -------------------------------------------------------------------------------- /itn/japanese/data/char/punctuations_ja.tsv: -------------------------------------------------------------------------------- 1 | ! 2 | ? 3 | 。 4 | 。 5 | " 6 | # 7 | $ 8 | % 9 | & 10 | ' 11 | ( 12 | ) 13 | * 14 | + 15 | , 16 | - 17 | / 18 | : 19 | ; 20 | < 21 | = 22 | > 23 | @ 24 | [ 25 | \ 26 | ] 27 | ^ 28 | _ 29 | ` 30 | { 31 | | 32 | } 33 | ~ 34 | ⦅ 35 | ⦆ 36 | 「 37 | 」 38 | 、 39 | 、 40 | 〃 41 | 》 42 | 「 43 | 」 44 | 『 45 | 』 46 | 【 47 | 】 48 | 〔 49 | 〕 50 | 〖 51 | 〗 52 | 〘 53 | 〙 54 | 〚 55 | 〛 56 | 〜 57 | 〝 58 | 〞 59 | 〟 60 | 〰 61 | – 62 | — 63 | ‘ 64 | ’ 65 | ‛ 66 | “ 67 | ” 68 | „ 69 | ‟ 70 | … 71 | ‧ 72 | ﹏ -------------------------------------------------------------------------------- /tn/japanese/data/char/punctuations_ja.tsv: -------------------------------------------------------------------------------- 1 | ! 2 | ? 
3 | 。 4 | 。 5 | " 6 | # 7 | $ 8 | % 9 | & 10 | ' 11 | ( 12 | ) 13 | * 14 | + 15 | , 16 | - 17 | / 18 | : 19 | ; 20 | < 21 | = 22 | > 23 | @ 24 | [ 25 | \ 26 | ] 27 | ^ 28 | _ 29 | ` 30 | { 31 | | 32 | } 33 | ~ 34 | ⦅ 35 | ⦆ 36 | 「 37 | 」 38 | 、 39 | 、 40 | 〃 41 | 》 42 | 「 43 | 」 44 | 『 45 | 』 46 | 【 47 | 】 48 | 〔 49 | 〕 50 | 〖 51 | 〗 52 | 〘 53 | 〙 54 | 〚 55 | 〛 56 | 〜 57 | 〝 58 | 〞 59 | 〟 60 | 〰 61 | – 62 | — 63 | ‘ 64 | ’ 65 | ‛ 66 | “ 67 | ” 68 | „ 69 | ‟ 70 | … 71 | ‧ 72 | ﹏ -------------------------------------------------------------------------------- /runtime/android/app/src/test/java/com/mobvoi/WeTextProcessing/ExampleUnitTest.java: -------------------------------------------------------------------------------- 1 | package com.mobvoi.WeTextProcessing; 2 | 3 | import org.junit.Test; 4 | 5 | import static org.junit.Assert.*; 6 | 7 | /** 8 | * Example local unit test, which will execute on the development machine (host). 9 | * 10 | * @see Testing documentation 11 | */ 12 | public class ExampleUnitTest { 13 | @Test 14 | public void addition_isCorrect() { 15 | assertEquals(4, 2 + 2); 16 | } 17 | } -------------------------------------------------------------------------------- /tn/chinese/data/char/punctuations_zh.tsv: -------------------------------------------------------------------------------- 1 | – 2 | — 3 | ‘ 4 | ’ 5 | ‛ 6 | “ 7 | ” 8 | „ 9 | ‟ 10 | … 11 | ‧ 12 | 、 13 | 。 14 | 〃 15 | 《 16 | 》 17 | 「 18 | 」 19 | 『 20 | 』 21 | 【 22 | 】 23 | 〔 24 | 〕 25 | 〖 26 | 〗 27 | 〘 28 | 〙 29 | 〚 30 | 〛 31 | 〜 32 | 〝 33 | 〞 34 | 〟 35 | 〰 36 | ﹏ 37 | ! 38 | " 39 | # 40 | % 41 | & 42 | ' 43 | ( 44 | ) 45 | * 46 | + 47 | , 48 | - 49 | / 50 | : 51 | ; 52 | < 53 | = 54 | > 55 | ? 
56 | @ 57 | [ 58 | \ 59 | ] 60 | ^ 61 | _ 62 | ` 63 | { 64 | | 65 | } 66 | ~ 67 | ⦅ 68 | ⦆ 69 | 。 70 | 「 71 | 」 72 | 、 73 | $ 74 | -------------------------------------------------------------------------------- /tn/english/data/date/month_abbr.tsv: -------------------------------------------------------------------------------- 1 | jan january 2 | Jan january 3 | JAN january 4 | feb february 5 | Feb february 6 | FEB february 7 | mar march 8 | Mar march 9 | MAR march 10 | apr april 11 | Apr april 12 | APR april 13 | jun june 14 | Jun june 15 | JUN june 16 | jul july 17 | Jul july 18 | JUL july 19 | aug august 20 | Aug august 21 | AUG august 22 | sep september 23 | Sep september 24 | SEP september 25 | sept september 26 | Sept september 27 | SEPT september 28 | oct october 29 | Oct october 30 | OCT october 31 | nov november 32 | Nov november 33 | NOV november 34 | dec december 35 | Dec december 36 | DEC december 37 | -------------------------------------------------------------------------------- /tn/english/test/data/date.txt: -------------------------------------------------------------------------------- 1 | 1219 => twelve nineteen 2 | 2999 => twenty nine ninety nine 3 | '70s => seventies 4 | 2024 B.C => twenty twenty four BC 5 | 1H23 => the first half of twenty three 6 | 3Q22 => the third quarter of twenty two 7 | jan. 5, 2012 => the fifth of january , twenty twelve 8 | jan. 
5 => the fifth of january 9 | 5 january 2012 => the fifth of january twenty twelve 10 | 2012-01-05 => the fifth of january twenty twelve 11 | 2012.01.05 => the fifth of january twenty twelve 12 | 2012/01/05 => the fifth of january twenty twelve 13 | 2012 => twenty twelve 14 | 2024-05-06 => the sixth of may twenty twenty four 15 | -------------------------------------------------------------------------------- /itn/chinese/data/number/special_dash.tsv: -------------------------------------------------------------------------------- 1 | 一二 1-2 2 | 二三 2-3 3 | 三四 3-4 4 | 三五 3-5 5 | 四五 4-5 6 | 五六 5-6 7 | 六七 6-7 8 | 七八 7-8 9 | 八九 8-9 10 | 一二十 10-20 11 | 二三十 20-30 12 | 三四十 30-40 13 | 三五十 30-50 14 | 四五十 40-50 15 | 五六十 50-60 16 | 六七十 60-70 17 | 七八十 70-80 18 | 八九十 80-90 19 | 一二百 100-200 20 | 一两百 100-200 21 | 二三百 200-300 22 | 两三百 200-300 23 | 三四百 300-400 24 | 三五百 300-500 25 | 四五百 400-500 26 | 五六百 500-600 27 | 六七百 600-700 28 | 七八百 700-800 29 | 八九百 800-900 30 | 一二千 1000-2000 31 | 一两千 1000-2000 32 | 二三千 2000-3000 33 | 两三千 2000-3000 34 | 三四千 3000-4000 35 | 三五千 3000-5000 36 | 四五千 4000-5000 37 | 五六千 5000-6000 38 | 六七千 6000-7000 39 | 七八千 7000-8000 40 | 八九千 8000-9000 41 | -------------------------------------------------------------------------------- /itn/chinese/data/number/special_tilde.tsv: -------------------------------------------------------------------------------- 1 | 一二 1~2 2 | 二三 2~3 3 | 三四 3~4 4 | 三五 3~5 5 | 四五 4~5 6 | 五六 5~6 7 | 六七 6~7 8 | 七八 7~8 9 | 八九 8~9 10 | 一二十 10~20 11 | 二三十 20~30 12 | 三四十 30~40 13 | 三五十 30~50 14 | 四五十 40~50 15 | 五六十 50~60 16 | 六七十 60~70 17 | 七八十 70~80 18 | 八九十 80~90 19 | 一二百 100~200 20 | 一两百 100~200 21 | 二三百 200~300 22 | 两三百 200~300 23 | 三四百 300~400 24 | 三五百 300~500 25 | 四五百 400~500 26 | 五六百 500~600 27 | 六七百 600~700 28 | 七八百 700~800 29 | 八九百 800~900 30 | 一二千 1000~2000 31 | 一两千 1000~2000 32 | 二三千 2000~3000 33 | 两三千 2000~3000 34 | 三四千 3000~4000 35 | 三五千 3000~5000 36 | 四五千 4000~5000 37 | 五六千 5000~6000 38 | 六七千 6000~7000 39 | 七八千 7000~8000 40 | 八九千 
8000~9000 41 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | select = B,C,E,F,P,T4,W,B9 3 | max-line-length = 80 4 | # C408 ignored because we like the dict keyword argument syntax 5 | # E501 is not flexible enough, we're using B950 instead 6 | ignore = 7 | E203,E305,E402,E501,E721,E741,F403,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303, 8 | # shebang has extra meaning in fbcode lints, so I think it's not worth trying 9 | # to line this up with executable bit 10 | EXE001, EXE002, 11 | # these ignores are from flake8-bugbear; please fix! 12 | B007,B008, 13 | # these ignores are from flake8-comprehensions; please fix! 14 | C400,C401,C402,C403,C404,C405,C407,C411,C413,C414,C415 15 | -------------------------------------------------------------------------------- /tn/chinese/test/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /itn/chinese/test/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tn/english/data/date/month_name.tsv: -------------------------------------------------------------------------------- 1 | january january 2 | february february 3 | march march 4 | april april 5 | may may 6 | june june 7 | july july 8 | august august 9 | september september 10 | october october 11 | november november 12 | december december 13 | January january 14 | JANUARY january 15 | February february 16 | FEBRUARY february 17 | March march 18 | MARCH march 19 | April april 20 | APRIL april 21 | June june 22 | JUNE june 23 | July july 24 | JULY july 25 | August august 26 | AUGUST august 27 | September september 28 | SEPTEMBER september 29 | October october 30 | OCTOBER october 31 | November november 32 | NOVEMBER november 33 | December december 34 | DECEMBER december 35 | -------------------------------------------------------------------------------- /tn/english/data/time/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /runtime/android/app/src/main/res/values/colors.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | #FFBB86FC 4 | #FF6200EE 5 | #FF3700B3 6 | #FF03DAC5 7 | #FF018786 8 | #FF000000 9 | #FFFFFFFF 10 | 11 | #f16d7a 12 | #b7d28d 13 | #b8f1ed 14 | #b7d28d 15 | #b8f1ed 16 | -------------------------------------------------------------------------------- /tn/english/data/measure/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /tn/english/data/number/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tn/japanese/test/data/math.txt: -------------------------------------------------------------------------------- 1 | 1-2=-1 => 一マイナス二イコールマイナス一 2 | 1+2>2 => 一プラス二大なり二 3 | 4×5=20 => 四カケル五イコール二十 4 | 4x5=20 => 四x五イコール二十 5 | 1~100 => 一から百 6 | 3>=3 => 三大なりイコール三 7 | 3≥2≥1 => 三大なりイコール二大なりイコール一 8 | 3 ≥ 2 => 三大なりイコール二 9 | 2>1 => 二大なり一 10 | 2 > 1 => 二大なり一 11 | 1<2 => 一小なり二 12 | 5×4÷2+3-6 ≥ 7 => 五カケル四ワル二プラス三マイナス六大なりイコール七 13 | 1≥0 => 一大なりイコール〇 14 | 1+3+2+3>3 => 一プラス三プラス二プラス三大なり三 15 | 1-1 => 一マイナス一 16 | 1+1 => 一プラス一 17 | abc/3 => abc/三 18 | 1 + 1 => 一プラス一 19 | ±5 => プラスマイナス五 20 | abc+5 => abcプラス五 21 | 1+1=2 => 一プラス一イコール二 22 | 1+2+3=6 => 一プラス二プラス三イコール六 23 | 911-1234-5678 => 九百十一マイナス千二百三十四マイナス五千六百七十八 24 | 112-1234-5678 => 百十二マイナス千二百三十四マイナス五千六百七十八 25 | 5-15 => 五マイナス十五 26 | 4/900000000を切る => 九億分の四を切る -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | exclude: '.*\.(txt|tsv)$' 2 | repos: 3 | - repo: 
https://github.com/pre-commit/pre-commit-hooks 4 | rev: v4.5.0 5 | hooks: 6 | - id: trailing-whitespace 7 | - repo: https://github.com/pre-commit/mirrors-yapf 8 | rev: 'v0.32.0' 9 | hooks: 10 | - id: yapf 11 | - repo: https://github.com/pycqa/flake8 12 | rev: '3.8.2' 13 | hooks: 14 | - id: flake8 15 | - repo: https://github.com/pre-commit/mirrors-clang-format 16 | rev: 'v17.0.6' 17 | hooks: 18 | - id: clang-format 19 | exclude: '.*\.(json|java|js|m|mm|proto)' 20 | - repo: https://github.com/cpplint/cpplint 21 | rev: '1.6.1' 22 | hooks: 23 | - id: cpplint 24 | -------------------------------------------------------------------------------- /itn/chinese/data/time/minute.tsv: -------------------------------------------------------------------------------- 1 | 半 30 2 | 零一 01 3 | 零二 02 4 | 零三 03 5 | 零四 04 6 | 零五 05 7 | 零六 06 8 | 零七 07 9 | 零八 08 10 | 零九 09 11 | 十 10 12 | 十一 11 13 | 十二 12 14 | 十三 13 15 | 十四 14 16 | 十五 15 17 | 十六 16 18 | 十七 17 19 | 十八 18 20 | 十九 19 21 | 二十 20 22 | 二十一 21 23 | 二十二 22 24 | 二十三 23 25 | 二十四 24 26 | 二十五 25 27 | 二十六 26 28 | 二十七 27 29 | 二十八 28 30 | 二十九 29 31 | 三十 30 32 | 三十一 31 33 | 三十二 32 34 | 三十三 33 35 | 三十四 34 36 | 三十五 35 37 | 三十六 36 38 | 三十七 37 39 | 三十八 38 40 | 三十九 39 41 | 四十 40 42 | 四十一 41 43 | 四十二 42 44 | 四十三 43 45 | 四十四 44 46 | 四十五 45 47 | 四十六 46 48 | 四十七 47 49 | 四十八 48 50 | 四十九 49 51 | 五十 50 52 | 五十一 51 53 | 五十二 52 54 | 五十三 53 55 | 五十四 54 56 | 五十五 55 57 | 五十六 56 58 | 五十七 57 59 | 五十八 58 60 | 五十九 59 61 | -------------------------------------------------------------------------------- /runtime/test/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | enable_testing() 2 | link_libraries(gtest_main gmock) 3 | include(GoogleTest) 4 | 5 | add_executable(string_test string_test.cc) 6 | target_link_libraries(string_test PUBLIC wetext_utils) 7 | gtest_discover_tests(string_test) 8 | 9 | if(NOT MSVC) 10 | # token_parser_test uses the macro to access the private members 11 | 
add_executable(token_parser_test token_parser_test.cc) 12 | target_link_libraries(token_parser_test PUBLIC wetext_processor) 13 | gtest_discover_tests(token_parser_test) 14 | endif() 15 | 16 | add_executable(processor_test processor_test.cc) 17 | target_link_libraries(processor_test PUBLIC wetext_processor) 18 | gtest_discover_tests(processor_test) 19 | -------------------------------------------------------------------------------- /tn/english/data/address/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # Visual Studio Code files 7 | .vscode 8 | .vs 9 | 10 | # PyCharm files 11 | .idea 12 | 13 | # Eclipse Project settings 14 | *.*project 15 | .settings 16 | 17 | # Sublime Text settings 18 | *.sublime-workspace 19 | *.sublime-project 20 | 21 | # Editor temporaries 22 | *.swn 23 | *.swo 24 | *.swp 25 | *.swm 26 | *~ 27 | 28 | # IPython notebook checkpoints 29 | .ipynb_checkpoints 30 | 31 | # macOS dir files 32 | .DS_Store 33 | 34 | # Clangd files 35 | .cache 36 | compile_commands.json 37 | 38 | 39 | # Setup files 40 | WeTextProcessing.egg-info/ 41 | build/ 42 | dist/ 43 | tn/*.far 44 | itn/*.far 45 | -------------------------------------------------------------------------------- /itn/chinese/test/data/number.txt: -------------------------------------------------------------------------------- 1 | 负一 => -1 2 | 零 => 0 3 | 一 => 1 4 | 二 => 2 5 | 十 => 10 6 | 十一 => 11 7 | 二十 => 20 8 | 一百 => 100 9 | 一百零一 => 101 10 | 一百一十一 => 111 11 | 两百 => 200 12 | 一千 => 1000 13 | 一千零一 => 1001 14 | 一千零一十一 => 1011 15 | 一千一百一十一 => 1111 16 | 两千 => 2000 17 | 两千零十 => 2010 18 | 两千零一十 => 2010 19 | 两千零十二 => 2012 20 | 两千零一十二 => 2012 21 | 两千零二十 => 2020 22 | 一万 => 10000 23 | 一万零一 => 10001 24 | 一万零一十一 => 10011 25 | 一万零一百一十一 => 10111 26 | 一万一千一百一十一 => 11111 27 | 两万 => 20000 28 | 十万一千一百一十一 => 101111 29 | 一百万一千一百一十一 => 100万1111 30 | 一千万一千一百一十一 => 1000万1111 31 | 一点一一 => 1.11 32 | 三点一四一五九二六 => 3.1415926 33 | 负三点一四一五九二六 => -3.1415926 34 | 一万两千三百 => 12300 35 | 小数三点一四一五九二六和负三点一四一五九二六是不是经常见到 => 小数3.1415926和-3.1415926是不是经常见到 36 | -------------------------------------------------------------------------------- /runtime/patch/openfst/src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | 
#-DHAVE_CONFIG_H -I./../include -fno-exceptions -funsigned-char -std=c++11 -MT symbol-table.lo -MD -MP -MF .deps/symbol-table.Tpo -c symbol-table.cc -fno-common -DPIC -o .libs/symbol-table.o 3 | 4 | include_directories(./include/) 5 | install(DIRECTORY include/ DESTINATION include/ 6 | FILES_MATCHING PATTERN "*.h") 7 | 8 | add_subdirectory(lib) 9 | 10 | if(HAVE_SCRIPT) 11 | add_subdirectory(script) 12 | endif(HAVE_SCRIPT) 13 | 14 | if(HAVE_BIN) 15 | add_subdirectory(bin) 16 | endif(HAVE_BIN) 17 | 18 | add_subdirectory(extensions) 19 | 20 | if(BUILD_TESTING) 21 | enable_testing() 22 | add_subdirectory(test) 23 | endif(BUILD_TESTING) 24 | -------------------------------------------------------------------------------- /runtime/android/app/src/main/cpp/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.4.1) 2 | set(TARGET wetextprocessing) 3 | project(${TARGET} CXX) 4 | set(CMAKE_CXX_STANDARD 14) 5 | include(ExternalProject) 6 | 7 | set(CMAKE_VERBOSE_MAKEFILE on) 8 | set(build_DIR ${CMAKE_SOURCE_DIR}/../../../build) 9 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake) 10 | string(REPLACE "-Wl,--exclude-libs,libgcc_real.a" "" CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") 11 | 12 | include(openfst) 13 | 14 | include_directories( 15 | ${CMAKE_SOURCE_DIR} 16 | ) 17 | 18 | add_subdirectory(utils) 19 | add_subdirectory(processor) 20 | 21 | link_libraries(wetext_processor android) 22 | add_library(${TARGET} SHARED wetextprocessing.cc) 23 | -------------------------------------------------------------------------------- /itn/japanese/data/time/minute.tsv: -------------------------------------------------------------------------------- 1 | 一分 1 2 | 二分 2 3 | 三分 3 4 | 四分 4 5 | 五分 5 6 | 六分 6 7 | 七分 7 8 | 八分 8 9 | 九分 9 10 | 十分 10 11 | 十一分 11 12 | 十二分 12 13 | 十三分 13 14 | 十四分 14 15 | 十五分 15 16 | 十六分 16 17 | 十七分 17 18 | 十八分 18 19 | 十九分 19 20 | 二十分 20 21 | 二十一分 21 22 | 二十二分 22 23 | 
二十三分 23 24 | 二十四分 24 25 | 二十五分 25 26 | 二十六分 26 27 | 二十七分 27 28 | 二十八分 28 29 | 二十九分 29 30 | 三十分 30 31 | 三十一分 31 32 | 三十二分 32 33 | 三十三分 33 34 | 三十四分 34 35 | 三十五分 35 36 | 三十六分 36 37 | 三十七分 37 38 | 三十八分 38 39 | 三十九分 39 40 | 四十分 40 41 | 四十一分 41 42 | 四十二分 42 43 | 四十三分 43 44 | 四十四分 44 45 | 四十五分 45 46 | 四十六分 46 47 | 四十七分 47 48 | 四十八分 48 49 | 四十九分 49 50 | 五十分 50 51 | 五十一分 51 52 | 五十二分 52 53 | 五十三分 53 54 | 五十四分 54 55 | 五十五分 55 56 | 五十六分 56 57 | 五十七分 57 58 | 五十八分 58 59 | 五十九分 59 60 | 六十分 60 -------------------------------------------------------------------------------- /itn/japanese/data/time/second.tsv: -------------------------------------------------------------------------------- 1 | 一秒 1 2 | 二秒 2 3 | 三秒 3 4 | 四秒 4 5 | 五秒 5 6 | 六秒 6 7 | 七秒 7 8 | 八秒 8 9 | 九秒 9 10 | 十秒 10 11 | 十一秒 11 12 | 十二秒 12 13 | 十三秒 13 14 | 十四秒 14 15 | 十五秒 15 16 | 十六秒 16 17 | 十七秒 17 18 | 十八秒 18 19 | 十九秒 19 20 | 二十秒 20 21 | 二十一秒 21 22 | 二十二秒 22 23 | 二十三秒 23 24 | 二十四秒 24 25 | 二十五秒 25 26 | 二十六秒 26 27 | 二十七秒 27 28 | 二十八秒 28 29 | 二十九秒 29 30 | 三十秒 30 31 | 三十一秒 31 32 | 三十二秒 32 33 | 三十三秒 33 34 | 三十四秒 34 35 | 三十五秒 35 36 | 三十六秒 36 37 | 三十七秒 37 38 | 三十八秒 38 39 | 三十九秒 39 40 | 四十秒 40 41 | 四十一秒 41 42 | 四十二秒 42 43 | 四十三秒 43 44 | 四十四秒 44 45 | 四十五秒 45 46 | 四十六秒 46 47 | 四十七秒 47 48 | 四十八秒 48 49 | 四十九秒 49 50 | 五十秒 50 51 | 五十一秒 51 52 | 五十二秒 52 53 | 五十三秒 53 54 | 五十四秒 54 55 | 五十五秒 55 56 | 五十六秒 56 57 | 五十七秒 57 58 | 五十八秒 58 59 | 五十九秒 59 60 | 六十秒 60 -------------------------------------------------------------------------------- /itn/chinese/data/date/dd.tsv: -------------------------------------------------------------------------------- 1 | 一日 01 2 | 二日 02 3 | 三日 03 4 | 四日 04 5 | 五日 05 6 | 六日 06 7 | 七日 07 8 | 八日 08 9 | 九日 09 10 | 十日 10 11 | 十一日 11 12 | 十二日 12 13 | 十三日 13 14 | 十四日 14 15 | 十五日 15 16 | 十六日 16 17 | 十七日 17 18 | 十八日 18 19 | 十九日 19 20 | 二十日 20 21 | 二十一日 21 22 | 二十二日 22 23 | 二十三日 23 24 | 二十四日 24 25 | 二十五日 25 26 | 二十六日 26 27 | 二十七日 27 28 | 二十八日 28 29 | 二十九日 29 30 | 三十日 30 31 | 三十一日 31 32 | 一号 01 33 | 二号 02 34 | 三号 03
35 | 四号 04 36 | 五号 05 37 | 六号 06 38 | 七号 07 39 | 八号 08 40 | 九号 09 41 | 十号 10 42 | 十一号 11 43 | 十二号 12 44 | 十三号 13 45 | 十四号 14 46 | 十五号 15 47 | 十六号 16 48 | 十七号 17 49 | 十八号 18 50 | 十九号 19 51 | 二十号 20 52 | 二十一号 21 53 | 二十二号 22 54 | 二十三号 23 55 | 二十四号 24 56 | 二十五号 25 57 | 二十六号 26 58 | 二十七号 27 59 | 二十八号 28 60 | 二十九号 29 61 | 三十号 30 62 | 三十一号 31 63 | -------------------------------------------------------------------------------- /itn/chinese/data/time/second.tsv: -------------------------------------------------------------------------------- 1 | 00 2 | 一秒 01 3 | 二秒 02 4 | 三秒 03 5 | 四秒 04 6 | 五秒 05 7 | 六秒 06 8 | 七秒 07 9 | 八秒 08 10 | 九秒 09 11 | 十秒 10 12 | 十一秒 11 13 | 十二秒 12 14 | 十三秒 13 15 | 十四秒 14 16 | 十五秒 15 17 | 十六秒 16 18 | 十七秒 17 19 | 十八秒 18 20 | 十九秒 19 21 | 二十秒 20 22 | 二十一秒 21 23 | 二十二秒 22 24 | 二十三秒 23 25 | 二十四秒 24 26 | 二十五秒 25 27 | 二十六秒 26 28 | 二十七秒 27 29 | 二十八秒 28 30 | 二十九秒 29 31 | 三十秒 30 32 | 三十一秒 31 33 | 三十二秒 32 34 | 三十三秒 33 35 | 三十四秒 34 36 | 三十五秒 35 37 | 三十六秒 36 38 | 三十七秒 37 39 | 三十八秒 38 40 | 三十九秒 39 41 | 四十秒 40 42 | 四十一秒 41 43 | 四十二秒 42 44 | 四十三秒 43 45 | 四十四秒 44 46 | 四十五秒 45 47 | 四十六秒 46 48 | 四十七秒 47 49 | 四十八秒 48 50 | 四十九秒 49 51 | 五十秒 50 52 | 五十一秒 51 53 | 五十二秒 52 54 | 五十三秒 53 55 | 五十四秒 54 56 | 五十五秒 55 57 | 五十六秒 56 58 | 五十七秒 57 59 | 五十八秒 58 60 | 五十九秒 59 61 | -------------------------------------------------------------------------------- /itn/chinese/test/data/cardinal.txt: -------------------------------------------------------------------------------- 1 | 幺幺零 => 110 2 | 幺二七点零点零点幺 => 127.0.0.1 3 | 这是手机一八五四四一三九一二一 => 这是手机18544139121 4 | 三五百 => 300~500 5 | 三五千 => 3000~5000 6 | 三五万 => 3~5万 7 | 三四万 => 3~4万 8 | 五六十 => 50~60 9 | 三四十万 => 30~40万 10 | 三四十亿 => 30~40亿 11 | 十五六 => 15-6 12 | 四十五六 => 45-6 13 | 四十五六万 => 45-6万 14 | 七百三四十 => 730-40 15 | 十七八万 => 17-8万 16 | 六十三四万 => 63-4万 17 | 一万六七 => 16000-7000 18 | 三万四五 => 34000-5000 19 | 我的身份证号是三四零二零三一九三七零幺零幺零五幺七 => 我的身份证号是340203193701010517 20 | 我的身份证号是三四零二零三一九三七零幺零幺零五幺X => 我的身份证号是34020319370101051X 21 | 给一三三四五三一二二二一打电话 => 
给13345312221打电话 22 | 给一三三四五三一二二二一拨电话 => 给13345312221拨电话 23 | 一二三四 => 1234 24 | 二二三四 => 2234 25 | 拨打幺二三零六 => 拨打12306 26 | 九幺幺是报警电话 => 911是报警电话 27 | 尾号幺七零二 => 尾号1702 28 | 尾号一二三四 => 尾号1234 29 | 幺八五洞幺拐两零柒幺玖 => 18501720719 30 | -------------------------------------------------------------------------------- /tn/chinese/data/time/second.tsv: -------------------------------------------------------------------------------- 1 | 00 2 | 01 一秒 3 | 02 二秒 4 | 03 三秒 5 | 04 四秒 6 | 05 五秒 7 | 06 六秒 8 | 07 七秒 9 | 08 八秒 10 | 09 九秒 11 | 10 十秒 12 | 11 十一秒 13 | 12 十二秒 14 | 13 十三秒 15 | 14 十四秒 16 | 15 十五秒 17 | 16 十六秒 18 | 17 十七秒 19 | 18 十八秒 20 | 19 十九秒 21 | 20 二十秒 22 | 21 二十一秒 23 | 22 二十二秒 24 | 23 二十三秒 25 | 24 二十四秒 26 | 25 二十五秒 27 | 26 二十六秒 28 | 27 二十七秒 29 | 28 二十八秒 30 | 29 二十九秒 31 | 30 三十秒 32 | 31 三十一秒 33 | 32 三十二秒 34 | 33 三十三秒 35 | 34 三十四秒 36 | 35 三十五秒 37 | 36 三十六秒 38 | 37 三十七秒 39 | 38 三十八秒 40 | 39 三十九秒 41 | 40 四十秒 42 | 41 四十一秒 43 | 42 四十二秒 44 | 43 四十三秒 45 | 44 四十四秒 46 | 45 四十五秒 47 | 46 四十六秒 48 | 47 四十七秒 49 | 48 四十八秒 50 | 49 四十九秒 51 | 50 五十秒 52 | 51 五十一秒 53 | 52 五十二秒 54 | 53 五十三秒 55 | 54 五十四秒 56 | 55 五十五秒 57 | 56 五十六秒 58 | 57 五十七秒 59 | 58 五十八秒 60 | 59 五十九秒 61 | -------------------------------------------------------------------------------- /tn/japanese/data/time/minute.tsv: -------------------------------------------------------------------------------- 1 | 00 2 | 01 一分 3 | 02 二分 4 | 03 三分 5 | 04 四分 6 | 05 五分 7 | 06 六分 8 | 07 七分 9 | 08 八分 10 | 09 九分 11 | 10 十分 12 | 11 十一分 13 | 12 十二分 14 | 13 十三分 15 | 14 十四分 16 | 15 十五分 17 | 16 十六分 18 | 17 十七分 19 | 18 十八分 20 | 19 十九分 21 | 20 二十分 22 | 21 二十一分 23 | 22 二十二分 24 | 23 二十三分 25 | 24 二十四分 26 | 25 二十五分 27 | 26 二十六分 28 | 27 二十七分 29 | 28 二十八分 30 | 29 二十九分 31 | 30 三十分 32 | 31 三十一分 33 | 32 三十二分 34 | 33 三十三分 35 | 34 三十四分 36 | 35 三十五分 37 | 36 三十六分 38 | 37 三十七分 39 | 38 三十八分 40 | 39 三十九分 41 | 40 四十分 42 | 41 四十一分 43 | 42 四十二分 44 | 43 四十三分 45 | 44 四十四分 46 | 45 四十五分 47 | 46 四十六分 48 | 47 四十七分 49 | 48 四十八分 50 | 49 四十九分 51 | 50 五十分 52 | 51 五十一分 53 | 52 五十二分 54 
| 53 五十三分 55 | 54 五十四分 56 | 55 五十五分 57 | 56 五十六分 58 | 57 五十七分 59 | 58 五十八分 60 | 59 五十九分 61 | -------------------------------------------------------------------------------- /tn/japanese/data/time/second.tsv: -------------------------------------------------------------------------------- 1 | 00 2 | 01 一秒 3 | 02 二秒 4 | 03 三秒 5 | 04 四秒 6 | 05 五秒 7 | 06 六秒 8 | 07 七秒 9 | 08 八秒 10 | 09 九秒 11 | 10 十秒 12 | 11 十一秒 13 | 12 十二秒 14 | 13 十三秒 15 | 14 十四秒 16 | 15 十五秒 17 | 16 十六秒 18 | 17 十七秒 19 | 18 十八秒 20 | 19 十九秒 21 | 20 二十秒 22 | 21 二十一秒 23 | 22 二十二秒 24 | 23 二十三秒 25 | 24 二十四秒 26 | 25 二十五秒 27 | 26 二十六秒 28 | 27 二十七秒 29 | 28 二十八秒 30 | 29 二十九秒 31 | 30 三十秒 32 | 31 三十一秒 33 | 32 三十二秒 34 | 33 三十三秒 35 | 34 三十四秒 36 | 35 三十五秒 37 | 36 三十六秒 38 | 37 三十七秒 39 | 38 三十八秒 40 | 39 三十九秒 41 | 40 四十秒 42 | 41 四十一秒 43 | 42 四十二秒 44 | 43 四十三秒 45 | 44 四十四秒 46 | 45 四十五秒 47 | 46 四十六秒 48 | 47 四十七秒 49 | 48 四十八秒 50 | 49 四十九秒 51 | 50 五十秒 52 | 51 五十一秒 53 | 52 五十二秒 54 | 53 五十三秒 55 | 54 五十四秒 56 | 55 五十五秒 57 | 56 五十六秒 58 | 57 五十七秒 59 | 58 五十八秒 60 | 59 五十九秒 61 | -------------------------------------------------------------------------------- /tn/chinese/data/time/minute.tsv: -------------------------------------------------------------------------------- 1 | 00 2 | 01 零一分 3 | 02 零二分 4 | 03 零三分 5 | 04 零四分 6 | 05 零五分 7 | 06 零六分 8 | 07 零七分 9 | 08 零八分 10 | 09 零九分 11 | 10 十分 12 | 11 十一分 13 | 12 十二分 14 | 13 十三分 15 | 14 十四分 16 | 15 十五分 17 | 16 十六分 18 | 17 十七分 19 | 18 十八分 20 | 19 十九分 21 | 20 二十分 22 | 21 二十一分 23 | 22 二十二分 24 | 23 二十三分 25 | 24 二十四分 26 | 25 二十五分 27 | 26 二十六分 28 | 27 二十七分 29 | 28 二十八分 30 | 29 二十九分 31 | 30 三十分 32 | 31 三十一分 33 | 32 三十二分 34 | 33 三十三分 35 | 34 三十四分 36 | 35 三十五分 37 | 36 三十六分 38 | 37 三十七分 39 | 38 三十八分 40 | 39 三十九分 41 | 40 四十分 42 | 41 四十一分 43 | 42 四十二分 44 | 43 四十三分 45 | 44 四十四分 46 | 45 四十五分 47 | 46 四十六分 48 | 47 四十七分 49 | 48 四十八分 50 | 49 四十九分 51 | 50 五十分 52 | 51 五十一分 53 | 52 五十二分 54 | 53 五十三分 55 | 54 五十四分 56 | 55 五十五分 57 | 56 五十六分 58 | 57 五十七分 59 | 58 五十八分 60 | 59 五十九分 61 | 
-------------------------------------------------------------------------------- /tn/english/data/measure/unit_alternatives.tsv: -------------------------------------------------------------------------------- 1 | atm atmosphere 2 | bq becquerel 3 | cd candela 4 | da dalton 5 | eb exabyte 6 | f degree Fahrenheit 7 | gb gigabyte 8 | g gram 9 | gl gigaliter 10 | ha hectare 11 | h hour 12 | hl hectoliter 13 | hp horsepower 14 | hp horsepower 15 | kb kilobit 16 | kb kilobyte 17 | ma megaampere 18 | mA megaampere 19 | ma milliampere 20 | mA milliampere 21 | mb megabyte 22 | mc megacoulomb 23 | mf megafarad 24 | m meter 25 | m minute 26 | mm millimeter 27 | mm millimeter 28 | mm millimeter 29 | ms megasecond 30 | ms mega siemens 31 | ms millisecond 32 | mv millivolt 33 | mV millivolt 34 | mw megawatt 35 | mW megawatt 36 | pb petabyte 37 | pg petagram 38 | ps petasecond 39 | s second 40 | tb terabyte 41 | tb terabyte 42 | yb yottabyte 43 | zb zettabyte 44 | -------------------------------------------------------------------------------- /runtime/processor/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(wetext_processor STATIC 2 | wetext_processor.cc 3 | wetext_token_parser.cc 4 | ) 5 | if(ANDROID) 6 | target_link_libraries(wetext_processor PUBLIC fst wetext_utils) 7 | else() 8 | if(MSVC) 9 | target_link_libraries(wetext_processor PUBLIC fst wetext_utils) 10 | else() 11 | target_link_libraries(wetext_processor PUBLIC dl fst wetext_utils) 12 | endif() 13 | endif() 14 | 15 | # ---------------------------------------------------------------------------- 16 | # C API shared library (wetext_processor_c) 17 | # ---------------------------------------------------------------------------- 18 | add_library(wetext_processor_c SHARED 19 | wetext_processor_c_api.cc 20 | ) 21 | 22 | target_link_libraries(wetext_processor_c PUBLIC wetext_processor) 23 | 
-------------------------------------------------------------------------------- /itn/chinese/data/money/symbol.tsv: -------------------------------------------------------------------------------- 1 | 加纳塞地 ¢ 2 | 玻利维亚玻利维亚诺 $b 3 | 乌拉圭比索 $U 4 | 美元 $ 5 | 英镑 £ 6 | 元 ¥ 7 | 泰铢 ฿ 8 | 柬埔寨瑞尔 ៛ 9 | 哥斯达黎加科隆 ₡ 10 | 尼日利亚奈拉 ₦ 11 | 朝鲜园 ₩ 12 | 以色列谢克尔 ₪ 13 | 越南东 ₫ 14 | 欧元 € 15 | 老挝基普 ₭ 16 | 蒙古图格里克 ₮ 17 | 古巴比索 ₱ 18 | 菲律宾比索 ₱ 19 | 乌克兰格里夫纳 ₴ 20 | 印度卢比 ₹ 21 | 土耳其里拉 ₺ 22 | 卢布 ₽ 23 | 白俄罗斯卢布 Br 24 | 委内瑞拉玻利瓦尔 Bs 25 | 伯利兹元 BZ$ 26 | 巴拿马巴尔博亚 B/. 27 | 尼加拉瓜科尔多瓦 C$ 28 | 瑞士法郎 CHF 29 | 匈牙利福林 Ft 30 | 阿鲁巴盾 ƒ 31 | 巴拉圭瓜拉尼 Gs 32 | 牙买加元 J$ 33 | 捷克克朗 Kč 34 | 波斯尼亚和黑塞哥维那可兑换马克 KM 35 | 克罗地亚库纳 kn 36 | 丹麦克朗 kr 37 | 罗马尼亚列伊 lei 38 | 阿尔巴尼亚列克 Lek 39 | 洪都拉斯伦皮拉 L 40 | 莫桑比克梅蒂卡尔 MT 41 | 博茨瓦纳普拉 P 42 | 危地马拉格查尔 Q 43 | 巴西雷亚尔 R$ 44 | 多米尼加共和国比索 RD$ 45 | 马来西亚令吉 RM 46 | 印尼盾 Rp 47 | 巴基斯坦卢比 ₨ 48 | 毛里求斯卢比 ₨ 49 | 南非兰特 R 50 | 秘鲁索尔 S/. 51 | 索马里先令 S 52 | 特立尼达和多巴哥元 TT$ 53 | 津巴布韦元 Z$ 54 | 波兰兹罗提 zł 55 | 马其顿代纳尔 ден 56 | 塞尔维亚第纳尔 Дин. 57 | 乌兹别克斯坦索姆 лв 58 | 保加利亚列弗 лв 59 | 吉尔吉斯斯坦索姆 лв 60 | 哈萨克斯坦腾格 лв 61 | -------------------------------------------------------------------------------- /tn/english/data/money/currency_major.tsv: -------------------------------------------------------------------------------- 1 | $ dollar 2 | $ us dollar 3 | US$ us dollar 4 | ฿ Thai Baht 5 | £ pound 6 | € euro 7 | ₩ won 8 | nzd new zealand dollar 9 | rs rupee 10 | chf swiss franc 11 | dkk danish kroner 12 | fim finnish markka 13 | aed arab emirates dirham 14 | ¥ yen 15 | czk czech koruna 16 | mro mauritanian ouguiya 17 | pkr pakistani rupee 18 | crc costa rican colon 19 | hk$ hong kong dollar 20 | npr nepalese rupee 21 | awg aruban florin 22 | nok norwegian kroner 23 | tzs tanzanian shilling 24 | sek swedish kronor 25 | cyp cypriot pound 26 | r real 27 | sar saudi riyal 28 | cve cape verde escudo 29 | rsd serbian dinar 30 | dm german mark 31 | shp saint helena pounds 32 | php philippine peso 33 | cad canadian dollar 34 | ssp south sudanese pound 35 | scr 
seychelles rupee 36 | mvr maldivian rufiyaa 37 | DH dirham 38 | Dh dirham 39 | Dhs. dirham 40 | -------------------------------------------------------------------------------- /runtime/android/app/proguard-rules.pro: -------------------------------------------------------------------------------- 1 | # Add project specific ProGuard rules here. 2 | # You can control the set of applied configuration files using the 3 | # proguardFiles setting in build.gradle. 4 | # 5 | # For more details, see 6 | # http://developer.android.com/guide/developing/tools/proguard.html 7 | 8 | # If your project uses WebView with JS, uncomment the following 9 | # and specify the fully qualified class name to the JavaScript interface 10 | # class: 11 | #-keepclassmembers class fqcn.of.javascript.interface.for.webview { 12 | # public *; 13 | #} 14 | 15 | # Uncomment this to preserve the line number information for 16 | # debugging stack traces. 17 | #-keepattributes SourceFile,LineNumberTable 18 | 19 | # If you keep the line number information, uncomment this to 20 | # hide the original source file name. 
21 | #-renamesourcefileattribute SourceFile -------------------------------------------------------------------------------- /tn/english/data/address/state.tsv: -------------------------------------------------------------------------------- 1 | Alabama AL 2 | Alaska AK 3 | Arizona AZ 4 | Arkansas AR 5 | California CA 6 | Colorado CO 7 | Connecticut CT 8 | Delaware DE 9 | Florida FL 10 | Georgia GA 11 | Hawaii HI 12 | Idaho ID 13 | Illinois IL 14 | Indiana IND 15 | Iowa IA 16 | Kansas KS 17 | Kentucky KY 18 | Louisiana LA 19 | Maine ME 20 | Maryland MD 21 | Massachusetts MA 22 | Michigan MI 23 | Minnesota MN 24 | Mississippi MS 25 | Missouri MO 26 | Montana MT 27 | Nebraska NE 28 | Nevada NV 29 | New Hampshire NH 30 | New Jersey NJ 31 | New Mexico NM 32 | New York NY 33 | North Carolina NC 34 | North Dakota ND 35 | Oregon OR 36 | Pennsylvania PA 37 | Rhode Island RI 38 | South Carolina SC 39 | South Dakota SD 40 | Tennessee TN 41 | Tennessee TENN 42 | Texas TX 43 | Utah UT 44 | Vermont VT 45 | Virginia VA 46 | Washington WA 47 | West Virginia WV 48 | Wisconsin WI 49 | Wyoming WY -------------------------------------------------------------------------------- /tn/chinese/data/char/fullwidth_to_halfwidth.tsv: -------------------------------------------------------------------------------- 1 | , , 2 | 。 . 3 | . . 4 | “ " 5 | ” " 6 | ! ! 7 | " " 8 | # # 9 | % % 10 | & & 11 | ' ' 12 | ( ( 13 | ) ) 14 | * * 15 | + + 16 | - - 17 | / / 18 | : : 19 | ; ; 20 | < < 21 | = = 22 | > > 23 | ? ? 
24 | @ @ 25 | \ \ 26 | ^ ^ 27 | _ _ 28 | ` ` 29 | { { 30 | | | 31 | } } 32 | ~ ~ 33 | $ $ 34 | 0 0 35 | 1 1 36 | 2 2 37 | 3 3 38 | 4 4 39 | 5 5 40 | 6 6 41 | 7 7 42 | 8 8 43 | 9 9 44 | a a 45 | A A 46 | b b 47 | B B 48 | c c 49 | C C 50 | d d 51 | D D 52 | e e 53 | E E 54 | f f 55 | F F 56 | g g 57 | G G 58 | h h 59 | H H 60 | i i 61 | I I 62 | j j 63 | J J 64 | k k 65 | K K 66 | l l 67 | L L 68 | m m 69 | M M 70 | n n 71 | N N 72 | o o 73 | O O 74 | p p 75 | P P 76 | q q 77 | Q Q 78 | r r 79 | R R 80 | s s 81 | S S 82 | t t 83 | T T 84 | u u 85 | U U 86 | v v 87 | V V 88 | w w 89 | W W 90 | x x 91 | X X 92 | y y 93 | Y Y 94 | z z 95 | Z Z 96 | -------------------------------------------------------------------------------- /itn/japanese/data/char/fullwidth_to_halfwidth.tsv: -------------------------------------------------------------------------------- 1 | , , 2 | 。 . 3 | . . 4 | “ " 5 | ” " 6 | ! ! 7 | " " 8 | # # 9 | % % 10 | & & 11 | ' ' 12 | ( ( 13 | ) ) 14 | * * 15 | + + 16 | - - 17 | / / 18 | : : 19 | ; ; 20 | < < 21 | = = 22 | > > 23 | ? ? 
24 | @ @ 25 | \ \ 26 | ^ ^ 27 | _ _ 28 | ` ` 29 | { { 30 | | | 31 | } } 32 | ~ ~ 33 | $ $ 34 | 0 0 35 | 1 1 36 | 2 2 37 | 3 3 38 | 4 4 39 | 5 5 40 | 6 6 41 | 7 7 42 | 8 8 43 | 9 9 44 | a a 45 | A A 46 | b b 47 | B B 48 | c c 49 | C C 50 | d d 51 | D D 52 | e e 53 | E E 54 | f f 55 | F F 56 | g g 57 | G G 58 | h h 59 | H H 60 | i i 61 | I I 62 | j j 63 | J J 64 | k k 65 | K K 66 | l l 67 | L L 68 | m m 69 | M M 70 | n n 71 | N N 72 | o o 73 | O O 74 | p p 75 | P P 76 | q q 77 | Q Q 78 | r r 79 | R R 80 | s s 81 | S S 82 | t t 83 | T T 84 | u u 85 | U U 86 | v v 87 | V V 88 | w w 89 | W W 90 | x x 91 | X X 92 | y y 93 | Y Y 94 | z z 95 | Z Z 96 | -------------------------------------------------------------------------------- /runtime/android/app/src/main/res/values/attrs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /tn/japanese/data/char/fullwidth_to_halfwidth.tsv: -------------------------------------------------------------------------------- 1 | , , 2 | 。 . 3 | . . 4 | “ " 5 | ” " 6 | ! ! 7 | " " 8 | # # 9 | % % 10 | & & 11 | ' ' 12 | ( ( 13 | ) ) 14 | * * 15 | + + 16 | - - 17 | / / 18 | : : 19 | ; ; 20 | < < 21 | = = 22 | > > 23 | ? ? 
24 | @ @ 25 | \ \ 26 | ^ ^ 27 | _ _ 28 | ` ` 29 | { { 30 | | | 31 | } } 32 | ~ ~ 33 | $ $ 34 | 0 0 35 | 1 1 36 | 2 2 37 | 3 3 38 | 4 4 39 | 5 5 40 | 6 6 41 | 7 7 42 | 8 8 43 | 9 9 44 | a a 45 | A A 46 | b b 47 | B B 48 | c c 49 | C C 50 | d d 51 | D D 52 | e e 53 | E E 54 | f f 55 | F F 56 | g g 57 | G G 58 | h h 59 | H H 60 | i i 61 | I I 62 | j j 63 | J J 64 | k k 65 | K K 66 | l l 67 | L L 68 | m m 69 | M M 70 | n n 71 | N N 72 | o o 73 | O O 74 | p p 75 | P P 76 | q q 77 | Q Q 78 | r r 79 | R R 80 | s s 81 | S S 82 | t t 83 | T T 84 | u u 85 | U U 86 | v v 87 | V V 88 | w w 89 | W W 90 | x x 91 | X X 92 | y y 93 | Y Y 94 | z z 95 | Z Z 96 | -------------------------------------------------------------------------------- /tn/japanese/test/data/date.txt: -------------------------------------------------------------------------------- 1 | 1998/04/23 => 千九百九十八年四月二十三日 2 | 2023/11/14 => 二千二十三年十一月十四日 3 | 2023/11/01 => 二千二十三年十一月一日 4 | 2008/08 => 二千八年八月 5 | 08/2008 => 二千八年八月 6 | 08/08 => 八月八日 7 | 2008-08-23 => 二千八年八月二十三日 8 | 2008-8-8 => 二千八年八月八日 9 | 2008-08 => 二千八年八月 10 | 08-2008 => 二千八年八月 11 | 08-08 => 八月八日 12 | 2008.08.08 => 二千八年八月八日 13 | 2008.8.8 => 二千八年八月八日 14 | 2008.08 => 二千八年八月 15 | 08.2008 => 二千八年八月 16 | 08.08 => 八月八日 17 | 来月の旅行は12/15から12/20までです => 来月の旅行は十二月十五日から十二月二十日までです 18 | 次の週末は11/5から11/6です => 次の週末は十一月五日から十一月六日です 19 | 2008.08.08-2008.08.10 => 二千八年八月八日から二千八年八月十日 20 | 23/10 => 十分の二十三 21 | 1/3 => 三分の一 22 | 今日は2036-03-01です => 今日は二千三十六年三月一日です 23 | 今日は2036/03/01です => 今日は二千三十六年三月一日です 24 | 来週の月曜日は2036.2.28です => 来週の月曜日は二千三十六年二月二十八日です 25 | 次の週末は4/30から5/1です => 次の週末は四月三十日から五月一日です 26 | 来月の旅行は5-15から5-20です => 来月の旅行は五月十五日から五月二十日です 27 | 次の会議は2022-12-1に開催されます => 次の会議は二千二十二年十二月一日に開催されます 28 | 今日は2022.11.19です => 今日は二千二十二年十一月十九日です -------------------------------------------------------------------------------- /tn/english/test/data/cardinal.txt: -------------------------------------------------------------------------------- 1 | -23 => negative twenty three 2 | -1 => negative one 
3 | 0 => zero 4 | 1 => one 5 | 2 => two 6 | 10 => ten 7 | 11 => eleven 8 | 20 => twenty 9 | 100 => one hundred 10 | 101 => one hundred and one 11 | 111 => one hundred and eleven 12 | 200 => two hundred 13 | 1000 => thousand 14 | 1001 => thousand and one 15 | 1011 => thousand and eleven 16 | 1111 => thousand one hundred and eleven 17 | 2000 => two thousand 18 | 10000 => ten thousand 19 | 10001 => ten thousand and one 20 | 10011 => ten thousand and eleven 21 | 10111 => ten thousand one hundred and eleven 22 | 11111 => eleven thousand one hundred and eleven 23 | 20000 => twenty thousand 24 | 101111 => one hundred one thousand one hundred and eleven 25 | 1001111 => one million one thousand one hundred and eleven 26 | 10001111 => ten million one thousand one hundred and eleven 27 | 4567 => four thousand five hundred and sixty seven 28 | -------------------------------------------------------------------------------- /runtime/android/app/src/main/res/values/themes.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | -------------------------------------------------------------------------------- /runtime/android/app/src/main/res/values-night/themes.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 16 | -------------------------------------------------------------------------------- /runtime/android/app/src/main/AndroidManifest.xml: -------------------------------------------------------------------------------- 1 | 2 | 5 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | -------------------------------------------------------------------------------- /runtime/android/app/src/androidTest/java/com/mobvoi/WeTextProcessing/ExampleInstrumentedTest.java: -------------------------------------------------------------------------------- 1 | package com.mobvoi.WeTextProcessing; 2 | 3 | import android.content.Context; 4 | 5 | import 
androidx.test.platform.app.InstrumentationRegistry; 6 | import androidx.test.ext.junit.runners.AndroidJUnit4; 7 | 8 | import org.junit.Test; 9 | import org.junit.runner.RunWith; 10 | 11 | import static org.junit.Assert.*; 12 | 13 | /** 14 | * Instrumented test, which will execute on an Android device. 15 | * 16 | * @see Testing documentation 17 | */ 18 | @RunWith(AndroidJUnit4.class) 19 | public class ExampleInstrumentedTest { 20 | @Test 21 | public void useAppContext() { 22 | // Context of the app under test. 23 | Context appContext = InstrumentationRegistry.getInstrumentation().getTargetContext(); 24 | assertEquals("com.mobvoi.WeTextProcessing", appContext.getPackageName()); 25 | } 26 | } -------------------------------------------------------------------------------- /itn/chinese/test/data/normalizer.txt: -------------------------------------------------------------------------------- 1 | 一共有多少人 => 一共有多少人 2 | 呃这个呃啊我不知道 => 这个我不知道 3 | 呃呃啊 => 4 | 共四百六十五篇,约三百一十五万字 => 共465篇,约315万字 5 | 共计六点四二万人 => 共计6.42万人 6 | 同比升高零点六个百分点 => 同比升高0.6个百分点 7 | 总量的五分之一以上 => 总量的1/5以上 8 | 相当于头发丝的十六分之一 => 相当于头发丝的1/16 9 | 二分之三是一个假分数 => 3/2是一个假分数 10 | 同比增长百分之六点三 => 同比增长6.3% 11 | 增幅百分之零点四 => 增幅0.4% 12 | 二零零二年一月二十八日 => 2002/01/28 13 | 二零零二年一月 => 2002/01 14 | 八月十六号的十二点之前 => 08/16的12点之前 15 | 我是五点零二分开始的 => 我是5:02开始的 16 | 于五点三十五分三十六秒发射 => 于5:35:36发射 17 | 上午八点半准时开会 => 8:30a.m.准时开会 18 | 比分定格在七十八比九十六 => 比分定格在78:96 19 | 计算负二的绝对值是二 => 计算-2的绝对值是2 20 | 正负二的平方都是四 => ±2的平方都是4 21 | 价格是十三点五元 => 价格是¥13.5 22 | 价格是十三点五美元 => 价格是$13.5 23 | 价格是十三点五澳元 => 价格是A$13.5 24 | 价格是十三点五港元 => 价格是HKD13.5 25 | 重达二十五千克 => 重达25kg 26 | 最高气温三十八摄氏度 => 最高气温38°C 27 | 实际面积一百二十平方米 => 实际面积120m² 28 | 渲染速度十毫秒一帧 => 渲染速度10ms1帧 29 | 可以打我手机幺三五零幺二三四五六七 => 可以打我手机13501234567 30 | 可以拨打幺二三零六来咨询 => 可以拨打12306来咨询 31 | 二点五平方电线,五,五十五,疑是银河落九天,十二块五 => 2.5平方电线,5,55,疑是银河落9天,12块5 32 | 三百九十九三盒 => 3993盒 33 | 十三五规划期间获得了十几万和几十万甚至二十几万的投资 => 十三五规划期间获得了十几万和几十万甚至二十几万的投资 34 | -------------------------------------------------------------------------------- 
/runtime/utils/wetext_log.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef UTILS_WETEXT_LOG_H_ 16 | #define UTILS_WETEXT_LOG_H_ 17 | 18 | // Because openfst is a dynamic library compiled with gflags/glog, we must use 19 | // the gflags/glog from openfst to avoid them linked both statically and 20 | // dynamically into the executable. 
21 | #include "fst/log.h" 22 | 23 | #endif // UTILS_WETEXT_LOG_H_ 24 | -------------------------------------------------------------------------------- /itn/japanese/test/data/cardinal.txt: -------------------------------------------------------------------------------- 1 | 一 => 1 2 | 四 => 4 3 | 十 => 10 4 | 十四 => 14 5 | 四十四 => 44 6 | 四十 => 40 7 | 百一 => 101 8 | 百十二 => 112 9 | 四百四 => 404 10 | 九千百二十三 => 9123 11 | 一千二百三十四 => 1234 12 | 五千六百七十八 => 5678 13 | 二千二十 => 2020 14 | 二千二 => 2002 15 | 二千十 => 2010 16 | 二千百 => 2100 17 | 九千 => 9000 18 | 九千二 => 9002 19 | 十 => 10 20 | 百 => 100 21 | 千 => 1000 22 | 万 => 万 23 | 兆 => 兆 24 | 千百 => 1100 25 | 千三百 => 1300 26 | 千三百十 => 1310 27 | 千十 => 1010 28 | 千二十 => 1020 29 | 千二十一 => 1021 30 | 千一 => 1001 31 | 千百十 => 1110 32 | 千百一 => 1101 33 | マイナス百十二 => -112 34 | プラス百十二 => +112 35 | 二十万二 => 200002 36 | 一万二 => 10002 37 | 二十万二千百 => 202100 38 | 四百万 => 400万 39 | 四百四万 => 404万 40 | 五千万 => 5000万 41 | 二万 => 20000 42 | 一億五千万 => 1億5000万 43 | 一億五万 => 1億5万 44 | 一億一百万 => 1億100万 45 | 一億一千万 => 1億1000万 46 | 二千億一千万 => 2000億1000万 47 | 二千億 => 2000億 48 | 二兆二億 => 2兆2億 49 | 二兆二千億 => 2兆2000億 50 | 二兆二千万 => 2兆2000万 51 | 二兆二百万 => 2兆200万 52 | 一兆三百二十万五千 => 1兆320万5000 53 | 二兆三十 => 2兆30 54 | 二兆百 => 2兆100 55 | 二十兆百 => 20兆100 56 | 一九二点一六八点零点一 => 192.168.0.1 57 | 一二三四五六七八九 => 123456789 -------------------------------------------------------------------------------- /runtime/utils/wetext_flags.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef UTILS_WETEXT_FLAGS_H_ 16 | #define UTILS_WETEXT_FLAGS_H_ 17 | 18 | // Because openfst is a dynamic library compiled with gflags/glog, we must use 19 | // the gflags/glog from openfst to avoid them linked both statically and 20 | // dynamically into the executable. 21 | #include "fst/flags.h" 22 | 23 | #endif // UTILS_WETEXT_FLAGS_H_ 24 | -------------------------------------------------------------------------------- /itn/chinese/data/measure/units_en.tsv: -------------------------------------------------------------------------------- 1 | 原子质量 amu 2 | 巴 bar 3 | 平方厘米 cm² 4 | 立方厘米 cm³ 5 | 厘米 cm 6 | 美担 cwt 7 | 摄氏度 °C 8 | 分贝 db 9 | 立方分米 dm³ 10 | 分米 dm 11 | 英尺 ft 12 | 华氏度 °F 13 | 吉字节 gb 14 | 吉赫兹 ghz 15 | 吉帕斯卡 gpa 16 | 吉瓦时 gwh 17 | 吉瓦 gw 18 | 戈瑞 gy 19 | 小时 h 20 | 公顷 ha 21 | 赫兹 hz 22 | 千比特每秒 kbps 23 | 千比特一秒 kbps 24 | 千卡 kcal 25 | 千克力 kgf 26 | 千克 kg 27 | 公斤 kg 28 | 千赫兹 khz 29 | 平方千米 km² 30 | 公里 km 31 | 公里每小时 km/h 32 | 公里一小时 km/h 33 | 千米 km 34 | 千米每小时 km/h 35 | 千米一小时 km/h 36 | 千帕 kpa 37 | 千瓦时 kwh 38 | 千瓦 kw 39 | 磅 lbs 40 | 平方米 m² 41 | 立方米 m³ 42 | 兆比特每秒 mbps 43 | 兆比特一秒 mbps 44 | 克 g 45 | 毫克 mg 46 | 兆赫兹 mhz 47 | 平方英里 mi² 48 | 分钟 min 49 | 英里 mi 50 | 毫升 ml 51 | 平方毫米 mm² 52 | 毫米 mm 53 | 摩尔 mol 54 | 兆帕 mpa 55 | 英里每小时 mph 56 | 英里一小时 mph 57 | 毫秒 ms 58 | 毫伏 mv 59 | 毫瓦 mw 60 | 千伏 kv 61 | 米 m 62 | 纳克 ng 63 | 纳米 nm 64 | 纳秒 ns 65 | 盎司 oz 66 | 度 º 67 | 帕斯卡 pa 68 | 皮克 pg 69 | 皮秒 ps 70 | 弧度 rad 71 | 转每分 rpm 72 | 平方英尺 sq ft 73 | 平方英里 sq mi 74 | 系沃特 sv 75 | 秒 s 76 | 太字节 tb 77 | 万亿焦耳 tj 78 | 台两 tl 79 | 伏特 v 80 | 码 yd 81 | 微克 
μg 82 | 微米 μm 83 | 微秒 μs 84 | 欧米茄 ω 85 | 度 ° 86 | -------------------------------------------------------------------------------- /tn/chinese/test/time_test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import pytest 16 | 17 | from tn.chinese.rules.time import Time 18 | from tn.chinese.test.utils import parse_test_case 19 | 20 | 21 | class TestTime: 22 | time = Time() 23 | time_cases = parse_test_case("data/time.txt") 24 | 25 | @pytest.mark.parametrize("written, spoken", time_cases) 26 | def test_time(self, written, spoken): 27 | assert self.time.normalize(written) == spoken 28 | -------------------------------------------------------------------------------- /tn/chinese/test/char_test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import pytest 16 | 17 | from tn.chinese.rules.char import Char 18 | from tn.chinese.test.utils import parse_test_case 19 | 20 | 21 | class TestChar: 22 | 23 | char = Char() 24 | char_cases = parse_test_case("data/char.txt") 25 | 26 | @pytest.mark.parametrize("written, spoken", char_cases) 27 | def test_char(self, written, spoken): 28 | assert self.char.normalize(written) == spoken 29 | -------------------------------------------------------------------------------- /tn/chinese/test/date_test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import pytest 16 | 17 | from tn.chinese.rules.date import Date 18 | from tn.chinese.test.utils import parse_test_case 19 | 20 | 21 | class TestDate: 22 | 23 | date = Date() 24 | date_cases = parse_test_case("data/date.txt") 25 | 26 | @pytest.mark.parametrize("written, spoken", date_cases) 27 | def test_date(self, written, spoken): 28 | assert self.date.normalize(written) == spoken 29 | -------------------------------------------------------------------------------- /tn/chinese/test/math_test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import pytest 16 | 17 | from tn.chinese.rules.math import Math 18 | from tn.chinese.test.utils import parse_test_case 19 | 20 | 21 | class TestMath: 22 | 23 | math = Math() 24 | math_cases = parse_test_case("data/math.txt") 25 | 26 | @pytest.mark.parametrize("written, spoken", math_cases) 27 | def test_math(self, written, spoken): 28 | assert self.math.normalize(written) == spoken 29 | -------------------------------------------------------------------------------- /tn/english/test/word_test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import pytest 16 | 17 | from tn.english.rules.word import Word 18 | from tn.english.test.utils import parse_test_case 19 | 20 | 21 | class TestWord: 22 | 23 | word = Word() 24 | word_cases = parse_test_case("data/word.txt") 25 | 26 | @pytest.mark.parametrize("written, spoken", word_cases) 27 | def test_word(self, written, spoken): 28 | assert self.word.normalize(written) == spoken 29 | -------------------------------------------------------------------------------- /tn/japanese/rules/char.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Logan Liu (2319277867@qq.com) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from pynini.lib.pynutil import insert 16 | 17 | from tn.processor import Processor 18 | 19 | 20 | class Char(Processor): 21 | 22 | def __init__(self): 23 | super().__init__(name="char") 24 | self.build_tagger() 25 | self.build_verbalizer() 26 | 27 | def build_tagger(self): 28 | tagger = insert('value: "') + self.CHAR + insert('"') 29 | self.tagger = self.add_tokens(tagger) 30 | -------------------------------------------------------------------------------- /tn/japanese/test/data/cardinal.txt: -------------------------------------------------------------------------------- 1 | 1118 => 千百十八 2 | -1118 => マイナス千百十八 3 | 9.99999 => 九点九九九九九 4 | 20099.1001 => 二万九十九点一〇〇一 5 | 11118 => 一万千百十八 6 | -200.001 => マイナス二百点〇〇一 7 | 100% => 百パーセント 8 | -50.04% => マイナス五十点〇四パーセント 9 | -50.07% => マイナス五十点〇七パーセント 10 | 192.168.0.1 => 一九二点一六八点〇点一 11 | 090-1234-5678 => ゼロ九ゼロの一二三四の五六七八 12 | 090-12345678 => ゼロ九ゼロの一二三四五六七八 13 | +81-090-1234-5678 => プラス八一のゼロ九ゼロの一二三四の五六七八 14 | +81 090-1234-5678 => プラス八一ゼロ九ゼロの一二三四の五六七八 15 | +81 090-123-5678 => プラス八一ゼロ九ゼロの一二三の五六七八 16 | +81 09012345678 => プラス八一ゼロ九ゼロ一二三四五六七八 17 | 02-1234-5678 => ゼロ二の一二三四の五六七八 18 | 1.1234567 => 一点一二三四五六七 19 | 123456789 => 一二三四五六七八九 20 | 0.0005 => 〇点〇〇〇五 21 | No.1005 => No.一〇〇五 22 | 番号1234 => 番号一二三四 23 | 1234号室 => 一二三四号室 24 | 150,000 => 十五万 25 | 10,000 => 一万 26 | 11,000 => 一万千 27 | 1,115,000 => 百十一万五千 28 | 10,000,000 => 一千万 29 | 10,100,000 => 一千十万 30 | 1,000 => 千 31 | 150000 => 十五万 32 | 10000 => 一万 33 | 11000 => 一万千 34 | 1115000 => 百十一万五千 35 | 1000 => 千 36 | 100000 => 十万 37 | 1000000 => 百万 38 | 10100000 => 一千十万 39 | 0時に花火が打ち上げられます => 〇時に花火が打ち上げられます 40 | -80000000600 => マイナス八百億六百 41 | -80010000600 => マイナス八〇〇一〇〇〇〇六〇〇 -------------------------------------------------------------------------------- /itn/japanese/rules/char.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Logan Liu (2319277867@qq.com) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the 
"License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from pynini.lib.pynutil import insert 16 | 17 | from tn.processor import Processor 18 | 19 | 20 | class Char(Processor): 21 | 22 | def __init__(self): 23 | super().__init__(name="char") 24 | self.build_tagger() 25 | self.build_verbalizer() 26 | 27 | def build_tagger(self): 28 | tagger = insert('value: "') + self.CHAR + insert('"') 29 | self.tagger = self.add_tokens(tagger) 30 | -------------------------------------------------------------------------------- /tn/chinese/data/measure/units_en.tsv: -------------------------------------------------------------------------------- 1 | amu 原子质量 2 | bar 巴 3 | cm2 平方厘米 4 | cm² 平方厘米 5 | cm3 立方厘米 6 | cm³ 立方厘米 7 | cm 厘米 8 | cwt 美担 9 | °c 摄氏度 10 | °C 摄氏度 11 | ℃ 摄氏度 12 | db 分贝 13 | dm3 立方分米 14 | dm³ 立方分米 15 | dm 分米 16 | ds 毫秒 17 | ft 英尺 18 | °f 华氏度 19 | °F 华氏度 20 | ℉ 华氏度 21 | gb 吉字节 22 | ghz 吉赫兹 23 | gpa 吉帕斯卡 24 | gwh 吉瓦时 25 | gw 吉瓦 26 | gy 戈瑞 27 | h 小时 28 | ha 公顷 29 | hz 赫兹 30 | kbps 千比特每秒 31 | kcal 千卡 32 | kgf 千克力 33 | kg 千克 34 | khz 千赫兹 35 | km2 平方千米 36 | km² 平方千米 37 | km 公里 38 | kpa 千帕 39 | kwh 千瓦时 40 | kw 千瓦 41 | kW 千瓦 42 | lbs 磅 43 | lb 磅 44 | m2 平方米 45 | m² 平方米 46 | m3 立方米 47 | m³ 立方米 48 | mbps 兆比特每秒 49 | mg 毫克 50 | mhz 兆赫兹 51 | mi2 平方英里 52 | mi² 平方英里 53 | min 分钟 54 | mi 英里 55 | ml 毫升 56 | mm2 平方毫米 57 | mm² 平方毫米 58 | mm 毫米 59 | mol 摩尔 60 | mpa 兆帕 61 | mph 英里每小时 62 | ms 毫秒 63 | mv 毫伏 64 | mw 毫瓦 65 | m 米 66 | ng 纳克 67 | nm 纳米 68 | ns 纳秒 69 | ºc 摄氏度 70 | ºC 摄氏度 71 | oz 盎司 72 | º 度 73 | pa 帕斯卡 74 
| pg 皮克 75 | ps 皮秒 76 | rad 弧度 77 | rpm 转每分 78 | sq ft 平方英尺 79 | sq mi 平方英里 80 | sv 系沃特 81 | s 秒 82 | tb 太字节 83 | tj 万亿焦耳 84 | tl 台两 85 | v 伏特 86 | yd 码 87 | μg 微克 88 | μm 微米 89 | μs 微秒 90 | ω 欧米茄 91 | ° 度 92 | -------------------------------------------------------------------------------- /itn/chinese/rules/char.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from pynini.lib.pynutil import insert 16 | 17 | from tn.processor import Processor 18 | 19 | 20 | class Char(Processor): 21 | 22 | def __init__(self): 23 | super().__init__(name="char") 24 | self.build_tagger() 25 | self.build_verbalizer() 26 | 27 | def build_tagger(self): 28 | tagger = insert('value: "') + self.CHAR + insert('"') 29 | self.tagger = self.add_tokens(tagger) 30 | -------------------------------------------------------------------------------- /tn/chinese/rules/char.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from pynini.lib.pynutil import insert 16 | 17 | from tn.processor import Processor 18 | 19 | 20 | class Char(Processor): 21 | 22 | def __init__(self): 23 | super().__init__(name="char") 24 | self.build_tagger() 25 | self.build_verbalizer() 26 | 27 | def build_tagger(self): 28 | tagger = insert('value: "') + self.CHAR + insert('"') 29 | self.tagger = self.add_tokens(tagger) 30 | -------------------------------------------------------------------------------- /tn/chinese/test/money_test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import pytest 16 | 17 | from tn.chinese.rules.money import Money 18 | from tn.chinese.test.utils import parse_test_case 19 | 20 | 21 | class TestMoney: 22 | 23 | money = Money() 24 | money_cases = parse_test_case("data/money.txt") 25 | 26 | @pytest.mark.parametrize("written, spoken", money_cases) 27 | def test_money(self, written, spoken): 28 | assert self.money.normalize(written) == spoken 29 | -------------------------------------------------------------------------------- /tn/chinese/test/sport_test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import pytest 16 | 17 | from tn.chinese.rules.sport import Sport 18 | from tn.chinese.test.utils import parse_test_case 19 | 20 | 21 | class TestSport: 22 | 23 | sport = Sport() 24 | sport_cases = parse_test_case("data/sport.txt") 25 | 26 | @pytest.mark.parametrize("written, spoken", sport_cases) 27 | def test_sport(self, written, spoken): 28 | assert self.sport.normalize(written) == spoken 29 | -------------------------------------------------------------------------------- /tn/english/test/date_test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import pytest 16 | 17 | from tn.english.rules.date import Date 18 | from tn.english.test.utils import parse_test_case 19 | 20 | 21 | class TestDate: 22 | 23 | date = Date(deterministic=False) 24 | date_cases = parse_test_case("data/date.txt") 25 | 26 | @pytest.mark.parametrize("written, spoken", date_cases) 27 | def test_date(self, written, spoken): 28 | assert self.date.normalize(written) == spoken 29 | -------------------------------------------------------------------------------- /tn/english/test/time_test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import pytest 16 | 17 | from tn.english.rules.time import Time 18 | from tn.english.test.utils import parse_test_case 19 | 20 | 21 | class TestTime: 22 | 23 | time = Time(deterministic=False) 24 | time_cases = parse_test_case("data/time.txt") 25 | 26 | @pytest.mark.parametrize("written, spoken", time_cases) 27 | def test_time(self, written, spoken): 28 | assert self.time.normalize(written) == spoken 29 | -------------------------------------------------------------------------------- /tn/english/test/money_test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import pytest 16 | 17 | from tn.english.rules.money import Money 18 | from tn.english.test.utils import parse_test_case 19 | 20 | 21 | class TestMoney: 22 | 23 | money = Money(deterministic=False) 24 | money_cases = parse_test_case("data/money.txt") 25 | 26 | @pytest.mark.parametrize("written, spoken", money_cases) 27 | def test_money(self, written, spoken): 28 | assert self.money.normalize(written) == spoken 29 | -------------------------------------------------------------------------------- /tn/english/test/range_test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import pytest 16 | 17 | from tn.english.rules.range import Range 18 | from tn.english.test.utils import parse_test_case 19 | 20 | 21 | class TestRange: 22 | 23 | range = Range(deterministic=False) 24 | range_cases = parse_test_case("data/range.txt") 25 | 26 | @pytest.mark.parametrize("written, spoken", range_cases) 27 | def test_range(self, written, spoken): 28 | assert self.range.normalize(written) == spoken 29 | -------------------------------------------------------------------------------- /tn/english/test/roman_test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import pytest 16 | 17 | from tn.english.rules.roman import Roman 18 | from tn.english.test.utils import parse_test_case 19 | 20 | 21 | class TestRoman: 22 | 23 | roman = Roman(deterministic=False) 24 | roman_cases = parse_test_case("data/roman.txt") 25 | 26 | @pytest.mark.parametrize("written, spoken", roman_cases) 27 | def test_roman(self, written, spoken): 28 | assert self.roman.normalize(written) == spoken 29 | -------------------------------------------------------------------------------- /tn/chinese/test/measure_test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import pytest 16 | 17 | from tn.chinese.rules.measure import Measure 18 | from tn.chinese.test.utils import parse_test_case 19 | 20 | 21 | class TestMeasure: 22 | 23 | measure = Measure() 24 | measure_cases = parse_test_case("data/measure.txt") 25 | 26 | @pytest.mark.parametrize("written, spoken", measure_cases) 27 | def test_measure(self, written, spoken): 28 | assert self.measure.normalize(written) == spoken 29 | -------------------------------------------------------------------------------- /itn/chinese/data/default/whitelist.tsv: -------------------------------------------------------------------------------- 1 | 三七二十一 三七二十一 2 | 一共 一共 3 | 一个 一个 4 | 一下 一下 5 | 一些 一些 6 | 一起 一起 7 | 一会 一会 8 | 一路 一路 9 | 二维码 二维码 10 | 慢一点 慢一点 11 | 一般 一般 12 | 统一 统一 13 | 星期一 星期一 14 | 星期二 星期二 15 | 星期三 星期三 16 | 星期四 星期四 17 | 星期五 星期五 18 | 星期六 星期六 19 | 一年一度 一年一度 20 | 一点一滴 一点一滴 21 | 三心二意 三心二意 22 | 阳春三月 阳春三月 23 | 七嘴八舌 七嘴八舌 24 | 四分五裂 四分五裂 25 | 七荤八素 七荤八素 26 | 三纲五常 三纲五常 27 | 三姑六婆 三姑六婆 28 | 四大皆空 四大皆空 29 | 五体投地 五体投地 30 | 六神无主 六神无主 31 | 七窍生烟 七窍生烟 32 | 七擒七纵 七擒七纵 33 | 八仙过海 八仙过海 34 | 十恶不赦 十恶不赦 35 | 一言九鼎 一言九鼎 36 | 一应俱全 一应俱全 37 | 一窍不通 一窍不通 38 | 一盘散沙 一盘散沙 39 | 十全十美 十全十美 40 | 一五一十 一五一十 41 | 让你三分 让你三分 42 | 乱七八糟 乱七八糟 43 | 一日三餐 一日三餐 44 | 十分高兴 十分高兴 45 | 十万八千里 十万八千里 46 | 四川 四川 47 | 三明 三明 48 | 九寨沟 九寨沟 49 | 七里河 七里河 50 | 九江 九江 51 | 六安 六安 52 | 十堰 十堰 53 | 八公山 八公山 54 | 七台河 七台河 55 | 五常 五常 56 | 四平 四平 57 | 四子王旗 四子王旗 58 | 三亚 三亚 59 | 二连浩特 二连浩特 60 | 零陵 零陵 61 | 五台山 五台山 62 | 六盘水 六盘水 63 | 八宿 八宿 64 | 十二五 十二五 65 | 十三五 十三五 66 | 十四五 十四五 67 | 几十万 几十万 68 | 几百万 几百万 69 | 几千万 几千万 70 | 十几万 十几万 71 | 二十几万 二十几万 72 | 三十几万 三十几万 73 | 四十几万 四十几万 74 | 五十几万 五十几万 75 | 六十几万 六十几万 76 | 七十几万 七十几万 77 | 八十几万 八十几万 78 | 九十几万 九十几万 79 | 七乘二十四小时 7x24小时 80 | 七乘二十四个小时 7x24个小时 81 | 四S店 4S店 82 | 四s店 4s店 83 | -------------------------------------------------------------------------------- /runtime/README.md: -------------------------------------------------------------------------------- 1 | ## WeTextProcessing Runtime 2 | 3 | 1. 
How to build 4 | 5 | ``` bash 6 | $ cmake -B build -DCMAKE_BUILD_TYPE=Release 7 | $ cmake --build build 8 | ``` 9 | 10 | On Windows: 11 | ``` bash 12 | $ cmake -DCMAKE_BUILD_TYPE=Release -B build -G "Visual Studio 17 2022" -DBUILD_SHARED_LIBS=0 -DCMAKE_CXX_FLAGS="/ZI" 13 | $ cmake --build build 14 | ``` 15 | 16 | 2. How to use 17 | 18 | ``` bash 19 | # tn usage 20 | $ wget https://github.com/wenet-e2e/WeTextProcessing/releases/download/WeTextProcessing/zh_tn_tagger.fst 21 | $ wget https://github.com/wenet-e2e/WeTextProcessing/releases/download/WeTextProcessing/zh_tn_verbalizer.fst 22 | $ ./build/processor_main --tagger zh_tn_tagger.fst --verbalizer zh_tn_verbalizer.fst --text "2.5平方电线" 23 | 24 | # itn usage 25 | $ wget https://github.com/wenet-e2e/WeTextProcessing/releases/download/WeTextProcessing/zh_itn_tagger.fst 26 | $ wget https://github.com/wenet-e2e/WeTextProcessing/releases/download/WeTextProcessing/zh_itn_verbalizer.fst 27 | $ ./build/processor_main --tagger zh_itn_tagger.fst --verbalizer zh_itn_verbalizer.fst --text "二点五平方电线" 28 | ``` 29 | -------------------------------------------------------------------------------- /tn/chinese/test/fraction_test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import pytest 16 | 17 | from tn.chinese.rules.fraction import Fraction 18 | from tn.chinese.test.utils import parse_test_case 19 | 20 | 21 | class TestFraction: 22 | 23 | fraction = Fraction() 24 | fraction_cases = parse_test_case("data/fraction.txt") 25 | 26 | @pytest.mark.parametrize("written, spoken", fraction_cases) 27 | def test_fraction(self, written, spoken): 28 | assert self.fraction.normalize(written) == spoken 29 | -------------------------------------------------------------------------------- /tn/chinese/test/whitelist_test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import pytest 16 | 17 | from tn.chinese.rules.whitelist import Whitelist 18 | from tn.chinese.test.utils import parse_test_case 19 | 20 | 21 | class TestWhitelist: 22 | whitelist = Whitelist() 23 | whitelist_cases = parse_test_case("data/whitelist.txt") 24 | 25 | @pytest.mark.parametrize("written, spoken", whitelist_cases) 26 | def test_whitelist(self, written, spoken): 27 | assert self.whitelist.normalize(written) == spoken 28 | -------------------------------------------------------------------------------- /tn/english/test/decimal_test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import pytest 16 | 17 | from tn.english.rules.decimal import Decimal 18 | from tn.english.test.utils import parse_test_case 19 | 20 | 21 | class TestDecimal: 22 | 23 | decimal = Decimal(deterministic=False) 24 | decimal_cases = parse_test_case("data/decimal.txt") 25 | 26 | @pytest.mark.parametrize("written, spoken", decimal_cases) 27 | def test_decimal(self, written, spoken): 28 | assert self.decimal.normalize(written) == spoken 29 | -------------------------------------------------------------------------------- /tn/english/test/measure_test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import pytest 16 | 17 | from tn.english.rules.measure import Measure 18 | from tn.english.test.utils import parse_test_case 19 | 20 | 21 | class TestMeasure: 22 | 23 | measure = Measure(deterministic=False) 24 | measure_cases = parse_test_case("data/measure.txt") 25 | 26 | @pytest.mark.parametrize("written, spoken", measure_cases) 27 | def test_measure(self, written, spoken): 28 | assert self.measure.normalize(written) == spoken 29 | -------------------------------------------------------------------------------- /tn/english/test/ordinal_test.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import pytest 16 | 17 | from tn.english.rules.ordinal import Ordinal 18 | from tn.english.test.utils import parse_test_case 19 | 20 | 21 | class TestOrdinal: 22 | 23 | ordinal = Ordinal(deterministic=False) 24 | ordinal_cases = parse_test_case("data/ordinal.txt") 25 | 26 | @pytest.mark.parametrize("written, spoken", ordinal_cases) 27 | def test_ordinal(self, written, spoken): 28 | assert self.ordinal.normalize(written) == spoken 29 | -------------------------------------------------------------------------------- /.github/workflows/unittest.yml: -------------------------------------------------------------------------------- 1 | name: UnitTest 2 | 3 | on: 4 | workflow_dispatch: 5 | pull_request: 6 | paths: 7 | - '**.py' 8 | 9 | jobs: 10 | unit-test: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: [3.9] 15 | steps: 16 | - uses: actions/checkout@v3 17 | - name: Set up Python ${{ matrix.python-version }} 18 | uses: actions/setup-python@v4 19 | with: 20 | python-version: ${{ matrix.python-version }} 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install -r requirements.txt 25 | - name: Lint with flake8 26 | run: | 27 | # stop the build if there are Python syntax errors or undefined names 28 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 29 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 30 | flake8 . 
class TestNormalizer:
    """Data-driven checks for the English ``Normalizer``.

    Cases are ``(written, spoken)`` pairs obtained from
    ``parse_test_case("data/normalizer.txt")``.
    """

    # Created once when the class body executes; constructed with
    # overwrite_cache=True exactly as the test requires.
    normalizer = Normalizer(overwrite_cache=True)
    cases = parse_test_case("data/normalizer.txt")

    @pytest.mark.parametrize(("written", "spoken"), cases)
    def test_normalizer(self, written, spoken):
        """Normalizing ``written`` must give exactly ``spoken``."""
        normalized = self.normalizer.normalize(written)
        assert normalized == spoken
class TestCardinal:
    """Data-driven checks for the English ``Cardinal`` rule.

    Cases are ``(written, spoken)`` pairs obtained from
    ``parse_test_case("data/cardinal.txt")``.
    """

    # Created once when the class body executes and shared by all cases.
    cardinal = Cardinal(deterministic=False)
    cardinal_cases = parse_test_case("data/cardinal.txt")

    @pytest.mark.parametrize(("written", "spoken"), cardinal_cases)
    def test_cardinal(self, written, spoken):
        """Normalizing ``written`` must give exactly ``spoken``."""
        normalized = self.cardinal.normalize(written)
        assert normalized == spoken
class TestFraction:
    """Data-driven checks for the English ``Fraction`` rule.

    Cases are ``(written, spoken)`` pairs obtained from
    ``parse_test_case("data/fraction.txt")``.
    """

    # Created once when the class body executes and shared by all cases.
    fraction = Fraction(deterministic=False)
    fraction_cases = parse_test_case("data/fraction.txt")

    @pytest.mark.parametrize(("written", "spoken"), fraction_cases)
    def test_fraction(self, written, spoken):
        """Normalizing ``written`` must give exactly ``spoken``."""
        normalized = self.fraction.normalize(written)
        assert normalized == spoken
class TestTelephone:
    """Data-driven checks for the English ``Telephone`` rule.

    Cases are ``(written, spoken)`` pairs obtained from
    ``parse_test_case("data/telephone.txt")``.
    """

    # Created once when the class body executes and shared by all cases.
    telephone = Telephone(deterministic=False)
    telephone_cases = parse_test_case("data/telephone.txt")

    @pytest.mark.parametrize(("written", "spoken"), telephone_cases)
    def test_telephone(self, written, spoken):
        """Normalizing ``written`` must give exactly ``spoken``."""
        normalized = self.telephone.normalize(written)
        assert normalized == spoken
class TestWhiteList:
    """Data-driven checks for the English ``WhiteList`` rule.

    Cases are ``(written, spoken)`` pairs obtained from
    ``parse_test_case("data/whitelist.txt")``.
    """

    # Created once when the class body executes and shared by all cases.
    whitelist = WhiteList(deterministic=False)
    whitelist_cases = parse_test_case("data/whitelist.txt")

    @pytest.mark.parametrize(("written", "spoken"), whitelist_cases)
    def test_whitelist(self, written, spoken):
        """Normalizing ``written`` must give exactly ``spoken``."""
        normalized = self.whitelist.normalize(written)
        assert normalized == spoken
class TestElectronic:
    """Data-driven checks for the English ``Electronic`` rule.

    Cases are ``(written, spoken)`` pairs obtained from
    ``parse_test_case("data/electronic.txt")``.
    """

    # Created once when the class body executes and shared by all cases.
    electronic = Electronic(deterministic=False)
    electronic_cases = parse_test_case("data/electronic.txt")

    @pytest.mark.parametrize(("written", "spoken"), electronic_cases)
    def test_electronic(self, written, spoken):
        """Normalizing ``written`` must give exactly ``spoken``."""
        normalized = self.electronic.normalize(written)
        assert normalized == spoken
8 | # The setting is particularly useful for tweaking memory settings. 9 | org.gradle.jvmargs=-Xmx2048m -Dfile.encoding=UTF-8 10 | # When configured, Gradle will run in incubating parallel mode. 11 | # This option should only be used with decoupled projects. More details, visit 12 | # http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects 13 | # org.gradle.parallel=true 14 | # AndroidX package structure to make it clearer which packages are bundled with the 15 | # Android operating system, and which are packaged with your app's APK 16 | # https://developer.android.com/topic/libraries/support-library/androidx-rn 17 | android.useAndroidX=true 18 | # Automatically convert third-party libraries to use AndroidX 19 | android.enableJetifier=true -------------------------------------------------------------------------------- /tn/english/data/whitelist/asr_with_pc.tsv: -------------------------------------------------------------------------------- 1 | Hon. honorable 2 | Mt. Mount 3 | Maj. Major 4 | Rev. Reverend 5 | # hash 6 | Gov. governor 7 | vs. versus 8 | vs versus 9 | dept. department 10 | vol volume 11 | vol. volume 12 | bldg. building 13 | Bldg. Building 14 | apt. apartment 15 | Apt. Apartment 16 | Σ sigma 17 | η eta 18 | κ kappa 19 | ω omega 20 | σ sigma 21 | α alpha 22 | ν nu 23 | δ delta 24 | ι iota 25 | _ underscore 26 | % percent 27 | & ampersand 28 | * asterisk 29 | + plus 30 | / slash 31 | = equal sign 32 | ^ circumflex 33 | { left brace 34 | | vertical bar 35 | } right brace 36 | ~ tilde 37 | ltd limited 38 | int'l international 39 | $ dollar 40 | BMW M b m w 41 | Capt. captain 42 | Co. company 43 | Col. colonel 44 | Dr. doctor 45 | Drs. doctors 46 | e.g. for example 47 | e. g. for example 48 | ES3 e s three 49 | Esq. esquire 50 | F.I f 51 | FNU f n u d s a 52 | Ft. Fort 53 | Gen. general 54 | i.e. that is 55 | Jr. junior 56 | jr. junior 57 | Jr junior 58 | jr junior 59 | Ltd. limited 60 | Lt. lieutenant 61 | Mr. 
class TestPreProcessor:
    """Data-driven checks for the Chinese TN ``PreProcessor`` rule.

    Cases are ``(written, spoken)`` pairs obtained from
    ``parse_test_case("data/preprocessor.txt")``.
    """

    # The test applies the rule object itself (the ``processor`` attribute),
    # not a normalizer wrapper.
    processor = PreProcessor().processor
    processor_cases = parse_test_case("data/preprocessor.txt")

    @pytest.mark.parametrize(("written", "spoken"), processor_cases)
    def test_processor(self, written, spoken):
        """Applying the rule to ``written`` must give exactly ``spoken``."""
        rewritten = (written @ self.processor).string()
        # Echo the actual output so it shows up in pytest's captured stdout.
        print(rewritten)
        assert rewritten == spoken
class TestPostProcessor:
    """Data-driven checks for the Chinese TN ``PostProcessor`` rule.

    Cases are ``(written, spoken)`` pairs obtained from
    ``parse_test_case("data/postprocessor.txt")``.
    """

    # Built with tag_oov=True; the test applies the rule object itself.
    processor = PostProcessor(tag_oov=True).processor
    processor_cases = parse_test_case("data/postprocessor.txt")

    @pytest.mark.parametrize(("written", "spoken"), processor_cases)
    def test_processor(self, written, spoken):
        """Applying the rule to ``written`` must give exactly ``spoken``."""
        rewritten = (written @ self.processor).string()
        # Echo the actual output so it shows up in pytest's captured stdout.
        print(rewritten)
        assert rewritten == spoken
def parse_test_case(file_name):
    """Yield ``(written, spoken)`` pairs parsed from a test-data file.

    Every line must have the form ``written => spoken``; the spoken part may
    be empty. Surrounding whitespace is stripped from both parts.

    Args:
        file_name: Path of the data file, relative to the directory that
            contains this module (an absolute path is used as is).

    Yields:
        One ``(written, spoken)`` tuple per line of the file.

    Raises:
        AssertionError: If a line lacks the ``=>`` delimiter or contains it
            more than once.
    """
    file = os.path.join(os.path.dirname(os.path.abspath(__file__)), file_name)

    delimiter = "=>"
    # Decode explicitly as UTF-8: the default encoding of open() depends on
    # the platform locale, and non-ASCII test data would fail to decode (or
    # decode wrongly) on systems whose locale is not UTF-8.
    with open(file, encoding="utf-8") as fin:
        for line in fin:
            assert delimiter in line
            arr = line.strip().split(delimiter)
            assert 0 < len(arr) <= 2

            written = arr[0].strip()
            spoken = ""
            if len(arr) > 1:
                spoken = arr[1].strip()
            yield (written, spoken)
class PreProcessor(Processor):
    """Preprocessing rule for Japanese ITN.

    Exposes the built rule as ``self.processor``.
    """

    def __init__(self, full_to_half=True):
        """Build the preprocessing rule.

        Args:
            full_to_half: When true, the rule built from
                ``fullwidth_to_halfwidth.tsv`` is composed into the
                processor. Defaults to ``True``, the same default used by
                the ``PreProcessor`` in ``tn/japanese/rules/preprocessor.py``.
        """
        super().__init__(name="preprocessor")
        # Mapping table loaded from the ITN Japanese data directory.
        fullwidth2halfwidth = string_file(get_abs_path("../itn/japanese/data/char/fullwidth_to_halfwidth.tsv"))

        processor = self.build_rule("")
        if full_to_half:
            processor @= self.build_rule(fullwidth2halfwidth)

        self.processor = processor.optimize()
class PreProcessor(Processor):
    """Preprocessing rule for Japanese TN.

    Exposes the built rule as ``self.processor``.
    """

    def __init__(self, full_to_half=True):
        """Build the preprocessing rule.

        Args:
            full_to_half: When true, the rule built from
                ``fullwidth_to_halfwidth.tsv`` is composed into the processor.
        """
        super().__init__(name="preprocessor")
        table_path = get_abs_path("japanese/data/char/fullwidth_to_halfwidth.tsv")
        width_mapping = string_file(table_path)

        rule = self.build_rule("")
        if full_to_half:
            rule @= self.build_rule(width_mapping)

        self.processor = rule.optimize()
class PreProcessor(Processor):
    """Preprocessing rule for Chinese TN.

    Exposes the built rule as ``self.processor``.
    """

    def __init__(self, traditional_to_simple=True):
        """Build the preprocessing rule.

        Args:
            traditional_to_simple: When true, the rule built from
                ``traditional_to_simple.tsv`` is composed into the processor.
        """
        super().__init__(name="preprocessor")
        table_path = get_abs_path("chinese/data/char/traditional_to_simple.tsv")
        char_mapping = string_file(table_path)

        rule = self.build_rule("")
        if traditional_to_simple:
            rule @= self.build_rule(char_mapping)

        self.processor = rule.optimize()
class Whitelist(Processor):
    """Whitelist rule for Japanese ITN, backed by ``whitelist.tsv``."""

    def __init__(self):
        """Register the processor name, then build tagger and verbalizer."""
        super().__init__(name="whitelist")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build ``self.tagger`` from the whitelist mapping file.

        The mapping is wrapped in an inserted ``value: "`` prefix and an
        inserted closing quote before being passed to ``add_tokens``.
        """
        table_path = get_abs_path("../itn/japanese/data/default/whitelist.tsv")
        entries = string_file(table_path)

        quoted_value = insert('value: "') + entries + insert('"')
        self.tagger = self.add_tokens(quoted_value)
| ケ 56 | コ 57 | サ 58 | シ 59 | ス 60 | セ 61 | ソ 62 | タ 63 | チ 64 | ツ 65 | テ 66 | ト 67 | ナ 68 | ニ 69 | ヌ 70 | ネ 71 | ノ 72 | ハ 73 | ヒ 74 | フ 75 | ヘ 76 | ホ 77 | マ 78 | ミ 79 | ム 80 | メ 81 | モ 82 | ヤ 83 | ユ 84 | ヨ 85 | ラ 86 | リ 87 | ル 88 | レ 89 | ロ 90 | ワ 91 | ヲ 92 | ン 93 | が 94 | ぎ 95 | ぐ 96 | げ 97 | ご 98 | ざ 99 | じ 100 | ず 101 | ぜ 102 | ぞ 103 | だ 104 | ぢ 105 | づ 106 | で 107 | ど 108 | ば 109 | び 110 | ぶ 111 | べ 112 | ぼ 113 | ぱ 114 | ぴ 115 | ぷ 116 | ぺ 117 | ぽ 118 | ガ 119 | ギ 120 | グ 121 | ゲ 122 | ゴ 123 | ザ 124 | ジ 125 | ズ 126 | ゼ 127 | ゾ 128 | ダ 129 | ヂ 130 | ヅ 131 | デ 132 | ド 133 | バ 134 | ビ 135 | ブ 136 | ベ 137 | ボ 138 | パ 139 | ピ 140 | プ 141 | ペ 142 | ポ 143 | ャ 144 | ァ 145 | ィ 146 | ュ 147 | ッ 148 | ゥ 149 | ェ 150 | ョ 151 | ォ 152 | ぁ 153 | ぃ 154 | ぅ 155 | ぇ 156 | ぉ 157 | っ 158 | ゃ 159 | ゅ 160 | ょ -------------------------------------------------------------------------------- /itn/chinese/rules/whitelist.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Xingchen Song (sxc19@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
class Whitelist(Processor):
    """Whitelist rule for Chinese ITN, backed by ``whitelist.tsv``."""

    def __init__(self):
        """Register the processor name, then build tagger and verbalizer."""
        super().__init__(name="whitelist")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build ``self.tagger`` from the whitelist mapping file.

        The mapping is wrapped in an inserted ``value: "`` prefix and an
        inserted closing quote before being passed to ``add_tokens``.
        """
        table_path = get_abs_path("../itn/chinese/data/default/whitelist.tsv")
        entries = string_file(table_path)

        quoted_value = insert('value: "') + entries + insert('"')
        self.tagger = self.add_tokens(quoted_value)
class Ordinal(Processor):
    """Ordinal rule for Japanese ITN, built on top of ``Cardinal().number``."""

    def __init__(self):
        """Register the processor name, then build tagger and verbalizer."""
        super().__init__(name="ordinal")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build ``self.tagger`` for the two accepted ordinal spellings.

        An ordinal is either a cardinal number followed by ``番目`` or a
        cardinal number preceded by ``第``.
        """
        number = Cardinal().number
        with_suffix = number + accep("番目")
        with_prefix = accep("第") + number
        either_form = with_suffix | with_prefix

        quoted_value = insert('value: "') + either_form + insert('"')
        self.tagger = self.add_tokens(quoted_value)
68 | 一時三十分三秒 => 1時30分3秒 69 | 八メガ秒 => 八メガ秒 70 | 一マイナス二プラス三十 => 1-2+30 71 | 一月 => 一月 72 | 一日 => 一日 -------------------------------------------------------------------------------- /itn/japanese/test/data/normalizer_disable_standalone_number_enable_0_to_9.txt: -------------------------------------------------------------------------------- 1 | 一 => 一 2 | 四 => 四 3 | 十 => 十 4 | 十四 => 十四 5 | 四十四 => 四十四 6 | 四十 => 四十 7 | 百一 => 百一 8 | 百十二 => 百十二 9 | 四百四 => 四百四 10 | 九千百二十三 => 九千百二十三 11 | 一千二百三十四 => 一千二百三十四 12 | 五千六百七十八 => 五千六百七十八 13 | 二千二十 => 二千二十 14 | 二千二 => 二千二 15 | 二千十 => 二千十 16 | 二千百 => 二千百 17 | 九千 => 九千 18 | 九千二 => 九千二 19 | 十 => 十 20 | 百 => 百 21 | 千 => 千 22 | 万 => 万 23 | 兆 => 兆 24 | 千百 => 千百 25 | 千三百 => 千三百 26 | 千三百十 => 千三百十 27 | 千十 => 千十 28 | 千二十 => 千二十 29 | 千二十一 => 千二十一 30 | 千一 => 千一 31 | 千百十 => 千百十 32 | 千百一 => 千百一 33 | マイナス百十二 => マイナス百十二 34 | プラス百十二 => プラス百十二 35 | 二十万二 => 二十万二 36 | 一万二 => 一万二 37 | 二十万二千百 => 二十万二千百 38 | 四百万 => 四百万 39 | 四百四万 => 四百四万 40 | 五千万 => 五千万 41 | 二万 => 二万 42 | 一億五千万 => 一億五千万 43 | 一億五万 => 一億五万 44 | 一億一百万 => 一億一百万 45 | 一億一千万 => 一億一千万 46 | 二千億一千万 => 二千億一千万 47 | 二千億 => 二千億 48 | 二兆二億 => 二兆二億 49 | 二兆二千億 => 二兆二千億 50 | 二兆二千万 => 二兆二千万 51 | 二兆二百万 => 二兆二百万 52 | 一兆三百二十万五千 => 一兆三百二十万五千 53 | 二兆三十 => 二兆三十 54 | 二兆百 => 二兆百 55 | 二十兆百 => 二十兆百 56 | 一九二点一六八点零点一 => 192.168.0.1 57 | 一二三四五六七八九 => 123456789 58 | マイナス五百六十七 => マイナス五百六十七 59 | 四十四平方メートル => 44m² 60 | 四十四キログラム => 44kg 61 | 四部 => 4部 62 | 四円 => 4円 63 | 四十四部 => 44部 64 | 四十四匹 => 44匹 65 | 四分の三 => 3/4 66 | 四十四分の三 => 3/44 67 | 四十四パーセント => 44% 68 | 一時三十分三秒 => 1時30分3秒 69 | 八メガ秒 => 8ms 70 | 一マイナス二プラス三十 => 1-2+30 71 | 一月 => 1月 72 | 一日 => 1日 -------------------------------------------------------------------------------- /tn/japanese/rules/transliteration.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 Zhendong Peng (pzd17@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in 
class Transliteration(Processor):
    """Transliteration rule for Japanese TN, backed by ``transliteration.tsv``."""

    def __init__(self):
        """Register the processor name, then build tagger and verbalizer."""
        super().__init__(name="transliteration")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build ``self.tagger`` from the transliteration mapping file.

        The mapping is wrapped in an inserted ``value: "`` prefix and an
        inserted closing quote before being passed to ``add_tokens``.
        """
        table_path = get_abs_path("japanese/data/pyopenjtalk/transliteration.tsv")
        entries = string_file(table_path)
        quoted_value = insert('value: "') + entries + insert('"')

        self.tagger = self.add_tokens(quoted_value)
1億5000万 43 | 一億五万 => 1億5万 44 | 一億一百万 => 1億100万 45 | 一億一千万 => 1億1000万 46 | 二千億一千万 => 2000億1000万 47 | 二千億 => 2000億 48 | 二兆二億 => 2兆2億 49 | 二兆二千億 => 2兆2000億 50 | 二兆二千万 => 2兆2000万 51 | 二兆二百万 => 2兆200万 52 | 一兆三百二十万五千 => 1兆320万5000 53 | 二兆三十 => 2兆30 54 | 二兆百 => 2兆100 55 | 二十兆百 => 20兆100 56 | 一九二点一六八点零点一 => 192.168.0.1 57 | 一二三四五六七八九 => 123456789 58 | マイナス五百六十七 => -567 59 | 四十四平方メートル => 44m² 60 | 四十四キログラム => 44kg 61 | 四十四部 => 44部 62 | 四十四匹 => 44匹 63 | 四分の三 => 3/4 64 | 四十四分の三 => 3/44 65 | 四十四パーセント => 44% 66 | 一時三十分三秒 => 1時30分3秒 67 | 八メガ秒 => 八メガ秒 68 | 一マイナス二プラス三十 => 1-2+30 69 | 一月 => 一月 70 | 一日 => 一日 -------------------------------------------------------------------------------- /itn/chinese/rules/postprocessor.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) 2 | # Copyright (c) 2023 Xingchen Song (sxc19@mails.tsinghua.edu.cn) 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
class PostProcessor(Processor):
    """Post-processing pass that can strip blacklisted strings from the output.

    Args:
        remove_interjections: When true, the entries of ``blacklist.tsv`` are
            deleted by a rule composed onto ``self.VSIGMA``; otherwise
            ``self.VSIGMA`` is used unchanged.
    """

    def __init__(self, remove_interjections=True):
        super().__init__(name="postprocessor")
        # The table is loaded unconditionally, exactly as before, so a missing
        # file is reported even when interjection removal is switched off.
        blacklist = string_file(get_abs_path("../itn/chinese/data/default/blacklist.tsv"))

        if remove_interjections:
            self.processor = self.VSIGMA @ self.build_rule(delete(blacklist))
        else:
            self.processor = self.VSIGMA
20 | # https://github.com/wenet-e2e/wenet/issues/882#issuecomment-1101246299 21 | set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) 22 | add_compile_options("$<$:/utf-8>") 23 | endif() 24 | 25 | if(${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") 26 | set(CMAKE_MACOSX_RPATH 1) 27 | endif() 28 | 29 | include(openfst) 30 | include_directories(${PROJECT_SOURCE_DIR}) 31 | 32 | add_subdirectory(utils) 33 | add_subdirectory(processor) 34 | add_subdirectory(bin) 35 | 36 | if(BUILD_TESTING) 37 | include(gtest) 38 | add_subdirectory(test) 39 | endif() 40 | -------------------------------------------------------------------------------- /tn/chinese/test/data/normalizer.txt: -------------------------------------------------------------------------------- 1 | 苹果宣布发布新IPHONE => 苹果宣布发布新IPHONE 2 | 他说:“我们已经吃过了!”。 => 他说:"我们已经吃过了!". 3 | 呃这个呃啊我不知道 => 这个我不知道 4 | 共465篇,约315万字 => 共四百六十五篇,约三百一十五万字 5 | 共计6.42万人 => 共计六点四二万人 6 | 同比升高0.6个百分点 => 同比升高零点六个百分点 7 | 总量的1/5以上 => 总量的五分之一以上 8 | 相当于头发丝的1/16 => 相当于头发丝的十六分之一 9 | 3/2是一个假分数 => 二分之三是一个假分数 10 | 同比增长6.3% => 同比增长百分之六点三 11 | 增幅0.4% => 增幅百分之零点四 12 | 2002/01/28 => 二零零二年一月二十八日 13 | 2002-01-28 => 二零零二年一月二十八日 14 | 2002.01.28 => 二零零二年一月二十八日 15 | 2002/01 => 二零零二年一月 16 | 8月16号12:00之前 => 八月十六号十二点之前 17 | 我是5:02开始的 => 我是五点零二分开始的 18 | 于5:35:36发射 => 于五点三十五分三十六秒发射 19 | 8:00 a.m.准时开会 => 上午八点准时开会 20 | 比分定格在78:96 => 比分定格在七十八比九十六 21 | 计算-2的绝对值是2 => 计算负二的绝对值是二 22 | ±2的平方都是4 => 正负二的平方都是四 23 | 价格是¥13.5 => 价格是十三点五元 24 | 价格是$13.5 => 价格是十三点五美元 25 | 价格是A$13.5 => 价格是十三点五澳元 26 | 价格是HKD13.5 => 价格是十三点五港元 27 | 重达25kg => 重达二十五千克 28 | 最高气温38°C => 最高气温三十八摄氏度 29 | 实际面积120m² => 实际面积一百二十平方米 30 | 渲染速度10ms一帧 => 渲染速度十毫秒一帧 31 | 可以打我手机13501234567 => 可以打我手机幺三五零幺二三四五六七 32 | 可以拨打12306来咨询 => 可以拨打幺二三零六来咨询 33 | 这儿有只鸟儿 => 这有只鸟 34 | 这事儿好办 => 这事好办 35 | 我儿子喜欢这地儿 => 我儿子喜欢这地 36 | O2O => O to O 37 | B2B => B to B 38 | 我们안녕 => 我们 39 | 雪の花 => 雪花 40 | 给12315打个电话 => 给幺二三幺五打个电话 41 | 人均200以内 => 人均两百以内 42 | 当场票数≥100万 => 当场票数大于等于一百万 43 | 独得300w张 => 独得三百万张 44 | 面积是10km² => 面积是十平方千米 45 | 仅仅是2015年 => 仅仅是二零一五年 46 | 包含3000余件 => 
class Whitelist(Processor):
    """Tag phrases that appear in the Japanese whitelist table."""

    def __init__(self):
        super().__init__(name="whitelist")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Wrap each whitelist mapping in a ``value: "..."`` field.

        The result is passed through ``add_tokens`` and stored as
        ``self.tagger``.
        """
        table = string_file(get_abs_path("japanese/data/default/whitelist.tsv"))
        open_field = insert('value: "')
        close_field = insert('"')
        self.tagger = self.add_tokens(open_field + table + close_field)

    def build_verbalizer(self):
        """Delegate to the verbalizer built by the ``Processor`` base class."""
        super().build_verbalizer()
class TestCardinal:
    """Checks the Chinese TN ``Cardinal`` rule against the fixture files."""

    # One shared rule instance for every parametrised case.
    cardinal = Cardinal()
    number_cases = parse_test_case("data/number.txt")
    cardinal_cases = parse_test_case("data/cardinal.txt")

    @pytest.mark.parametrize("written, spoken", number_cases)
    def test_number(self, written, spoken):
        """Compose the input with the bare ``number`` grammar and compare."""
        rewritten = (written @ self.cardinal.number).string()
        assert rewritten == spoken

    @pytest.mark.parametrize("written, spoken", cardinal_cases)
    def test_cardinal(self, written, spoken):
        """Run the full ``normalize`` entry point of the rule and compare."""
        normalized = self.cardinal.normalize(written)
        assert normalized == spoken
class Math(Processor):
    """Tag arithmetic expressions: numbers joined by operators from ``operator.tsv``."""

    def __init__(self):
        super().__init__(name="math")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build ``self.tagger``.

        An expression is one number followed by one or more
        ``operator number`` pairs; a number on its own does not match.
        The whole expression is emitted inside a ``value: "..."`` field.
        """
        symbol = string_file(get_abs_path("../itn/chinese/data/math/operator.tsv"))

        operand = Cardinal().number
        expression = operand + (symbol + operand).plus
        quoted = insert('value: "') + expression + insert('"')
        self.tagger = self.add_tokens(quoted)
class Math(Processor):
    """Tag written math expressions for Japanese text normalization."""

    def __init__(self):
        super().__init__(name="math")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build ``self.tagger``.

        The grammar accepts a number followed by zero or more
        ``operator number`` pairs, deleting at most one space on either side
        of each operator. Because the repetition may be empty, a lone number
        is accepted as well.
        """
        symbol = string_file(get_abs_path("japanese/data/math/operator.tsv"))

        operand = Cardinal().number
        optional_space = delete(" ").ques
        tail = optional_space + symbol + optional_space + operand
        expression = operand + tail.star
        self.tagger = self.add_tokens(insert('value: "') + expression + insert('"'))
#ifndef UTILS_WETEXT_STRING_H_
#define UTILS_WETEXT_STRING_H_

#include <string>
#include <vector>

namespace wetext {
// Defined in the implementation file; presumably the set of characters the
// trim helpers strip -- confirm against the definition.
extern const char* WHITESPACE;

// NOTE(review): presumably the byte length of the UTF-8 sequence introduced
// by lead byte `ch` -- confirm against the implementation.
int UTF8CharLength(char ch);

// Number of UTF-8 characters (not bytes) in `str`; e.g. "你好" has length 2.
int UTF8StringLength(const std::string& str);

// Splits `str` into its UTF-8 characters, one string per character, written
// to `*chars`; e.g. "你好world" -> {"你", "好", "w", "o", "r", "l", "d"}.
void SplitUTF8StringToChars(const std::string& str,
                            std::vector<std::string>* chars);

// NOTE(review): presumably strips leading whitespace only -- confirm.
std::string Ltrim(const std::string& str);

// NOTE(review): presumably strips trailing whitespace only -- confirm.
std::string Rtrim(const std::string& str);

// Strips whitespace (spaces and tabs at least) from both ends of `str`;
// e.g. "\thello " -> "hello".
std::string Trim(const std::string& str);

// Splits `str` on the multi-character delimiter `delim`, writing the pieces
// to `*output`; e.g. ("written => spoken", " => ") -> {"written", "spoken"}.
void Split(const std::string& str, const std::string& delim,
           std::vector<std::string>* output);

}  // namespace wetext

#endif  // UTILS_WETEXT_STRING_H_
class Math(Processor):
    """Tag spoken math expressions for Japanese inverse text normalization."""

    def __init__(self):
        super().__init__(name="math")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build ``self.tagger``.

        An operand is either ``Cardinal.big_integer`` or ``Cardinal.decimal``.
        An expression is one operand followed by one or more
        ``operator operand`` pairs, emitted inside a ``value: "..."`` field.
        """
        operator = string_file(get_abs_path("../itn/japanese/data/math/operator.tsv"))

        # Build the cardinal grammar once and read both attributes from the
        # same instance instead of constructing it a second time.
        cardinal = Cardinal()
        number = cardinal.big_integer | cardinal.decimal
        tagger = number + (operator + number).plus
        tagger = insert('value: "') + tagger + insert('"')
        self.tagger = self.add_tokens(tagger)
キロワット 77 | J·s ジュール秒 78 | A アンペア 79 | V ボルト 80 | Ω オーム 81 | A/m アンペア毎メートル 82 | Wb ウェーバ 83 | mAh ミリアンペアアワー 84 | Pa·s パスカル秒 85 | Bq ベクレル 86 | Gy グレイ 87 | rad ラド 88 | Sv シーベルト 89 | rem レム 90 | kat カタール 91 | Np ネーパ 92 | Hz ヘルツ 93 | dB デシベル 94 | hz ヘルツ 95 | bit ビット 96 | Byte バイト 97 | byte バイト 98 | MB メガバイト 99 | KB キロバイト 100 | GB ギガバイト 101 | TB テラバイト -------------------------------------------------------------------------------- /itn/chinese/data/measure/units_zh.tsv: -------------------------------------------------------------------------------- 1 | 年来 2 | 年前 3 | 年后 4 | 年内 5 | 年之前 6 | 年之后 7 | 人 8 | 篇 9 | 帧 10 | 把 11 | 封 12 | 艘 13 | 套 14 | 段 15 | 匹 16 | 张 17 | 座 18 | 回 19 | 场 20 | 尾 21 | 条 22 | 个 23 | 首 24 | 阙 25 | 阵 26 | 网 27 | 炮 28 | 顶 29 | 丘 30 | 棵 31 | 只 32 | 支 33 | 袭 34 | 辆 35 | 挑 36 | 担 37 | 颗 38 | 壳 39 | 窠 40 | 曲 41 | 墙 42 | 群 43 | 腔 44 | 砣 45 | 座 46 | 客 47 | 贯 48 | 扎 49 | 捆 50 | 刀 51 | 令 52 | 手 53 | 罗 54 | 坡 55 | 山 56 | 岭 57 | 江 58 | 溪 59 | 钟 60 | 队 61 | 单 62 | 双 63 | 对 64 | 口 65 | 头 66 | 脚 67 | 板 68 | 跳 69 | 枝 70 | 件 71 | 贴 72 | 针 73 | 线 74 | 管 75 | 名 76 | 位 77 | 身 78 | 堂 79 | 课 80 | 本 81 | 页 82 | 家 83 | 户 84 | 层 85 | 丝 86 | 毫 87 | 厘 88 | 分 89 | 钱 90 | 斤 91 | 担 92 | 铢 93 | 石 94 | 钧 95 | 锱 96 | 忽 97 | 克 98 | 毫 99 | 厘 100 | 寸 101 | 尺 102 | 丈 103 | 里 104 | 寻 105 | 常 106 | 铺 107 | 程 108 | 米 109 | 撮 110 | 勺 111 | 合 112 | 升 113 | 斗 114 | 石 115 | 盘 116 | 碗 117 | 碟 118 | 叠 119 | 桶 120 | 笼 121 | 盆 122 | 盒 123 | 杯 124 | 钟 125 | 斛 126 | 锅 127 | 簋 128 | 篮 129 | 盘 130 | 桶 131 | 罐 132 | 瓶 133 | 壶 134 | 卮 135 | 盏 136 | 箩 137 | 箱 138 | 煲 139 | 啖 140 | 袋 141 | 钵 142 | 季 143 | 年 144 | 月 145 | 日 146 | 刻 147 | 时 148 | 周 149 | 天 150 | 秒 151 | 旬 152 | 纪 153 | 岁 154 | 世 155 | 更 156 | 夜 157 | 春 158 | 夏 159 | 秋 160 | 冬 161 | 代 162 | 伏 163 | 辈 164 | 丸 165 | 泡 166 | 粒 167 | 颗 168 | 幢 169 | 堆 170 | 条 171 | 根 172 | 支 173 | 道 174 | 面 175 | 片 176 | 张 177 | 颗 178 | 块 179 | 架 180 | 角 181 | 毛 182 | 字 183 | 元 184 | 两 185 | 两米饭 186 | 两酒 187 | 吨 188 | 顿 189 | 牛 190 | 次 191 | 号 192 | 
class TestNormalizer:
    """End-to-end checks of the Chinese TN ``Normalizer`` on the fixture files."""

    normalizer = Normalizer(overwrite_cache=True, tag_oov=True)

    # Materialised as a list: a bare ``chain`` object is a one-shot iterator,
    # so any second pass over it (for example a repeated collection) would
    # silently see zero cases.
    normalizer_cases = list(
        chain(
            parse_test_case("data/cardinal.txt"),
            parse_test_case("data/char.txt"),
            parse_test_case("data/date.txt"),
            parse_test_case("data/fraction.txt"),
            parse_test_case("data/math.txt"),
            parse_test_case("data/money.txt"),
            parse_test_case("data/time.txt"),
            parse_test_case("data/whitelist.txt"),
            parse_test_case("data/normalizer.txt"),
        )
    )

    @pytest.mark.parametrize("written, spoken", normalizer_cases)
    def test_normalizer(self, written, spoken):
        """Normalising the written form must give the expected spoken form."""
        assert self.normalizer.normalize(written) == spoken
class Math(Processor):
    """Tag written math expressions for Chinese text normalization."""

    def __init__(self):
        super().__init__(name="math")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build ``self.tagger``.

        Two shapes are accepted, both emitted inside a ``value: "..."`` field:

        * a number followed by zero or more ``operator number`` pairs, with at
          most one space deleted on either side of each operator;
        * a single entry of ``operator.tsv`` on its own.
        """
        standalone = string_file(get_abs_path("chinese/data/math/operator.tsv"))
        # These four are only read as operators between two numbers; when one
        # appears alone, it is treated as punctuation.
        between_only = cross("~", "到") | cross(":", "比") | cross("<", "小于") | cross(">", "大于")

        operand = Cardinal().number
        optional_space = delete(" ").ques
        tail = optional_space + (standalone | between_only) + optional_space + operand
        expression = operand + tail.star
        expression = expression | standalone
        self.tagger = self.add_tokens(insert('value: "') + expression + insert('"'))
class TestNormalizer:
    """End-to-end checks of the Japanese TN ``Normalizer`` on the fixture files."""

    normalizer = Normalizer(overwrite_cache=True)

    # Materialised as a list: a bare ``chain`` object is a one-shot iterator,
    # so any second pass over it (for example a repeated collection) would
    # silently see zero cases.
    normalizer_cases = list(
        chain(
            parse_test_case("data/cardinal.txt"),
            parse_test_case("data/char.txt"),
            parse_test_case("data/date.txt"),
            parse_test_case("data/fraction.txt"),
            parse_test_case("data/math.txt"),
            parse_test_case("data/measure.txt"),
            parse_test_case("data/money.txt"),
            parse_test_case("data/sport.txt"),
            parse_test_case("data/time.txt"),
            parse_test_case("data/whitelist.txt"),
        )
    )

    # The first column of each case is the normalizer input and the second is
    # the expected output; the names follow the Chinese TN tests.
    @pytest.mark.parametrize("written, spoken", normalizer_cases)
    def test_normalizer(self, written, spoken):
        """Normalising the first column must give the second column."""
        assert self.normalizer.normalize(written) == spoken
class LicensePlate(Processor):
    """Tag Chinese licence plates for inverse text normalization."""

    def __init__(self):
        super().__init__(name="licenseplate")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build ``self.tagger``.

        A plate is a ``province.tsv`` entry, then one ``self.ALPHA`` symbol,
        then either five or six symbols that are each ``self.ALPHA`` or a
        digit from ``zero.tsv`` / ``digit.tsv``. The match is emitted inside
        a ``value: "..."`` field.
        """
        nonzero = string_file(get_abs_path("../itn/chinese/data/number/digit.tsv"))  # 1 ~ 9
        zero = string_file(get_abs_path("../itn/chinese/data/number/zero.tsv"))  # 0
        province = string_file(get_abs_path("../itn/chinese/data/license_plate/province.tsv"))  # 皖

        body_symbol = self.ALPHA | (zero | nonzero)
        head = province + self.ALPHA
        plate = (head + body_symbol**5) | (head + body_symbol**6)
        self.tagger = self.add_tokens(insert('value: "') + plate + insert('"'))
# add_module(<name> <add_library arguments...>)
#
# Forwards every argument to add_library(), then, if that produced a target,
# files it under the special/modules folder, marks it to export all symbols
# on Windows and links it against fst. The target's LIBRARY artefact is
# installed to lib/fst.
function(add_module module_target)
  add_library(${ARGV})

  if(TARGET ${module_target})
    set_target_properties(${module_target}
      PROPERTIES
        WINDOWS_EXPORT_ALL_SYMBOLS true
        FOLDER special/modules
    )
    target_link_libraries(${module_target} fst)
  endif()

  install(TARGETS ${module_target} LIBRARY DESTINATION lib/fst)
endfunction()
run: | 30 | python -m tn --text "2.5平方电线" --overwrite_cache --language "zh" 31 | python -m tn --text "2010-03-21" --overwrite_cache --language "en" 32 | python -m itn --text "二点五平方电线" --overwrite_cache 33 | 34 | - name: Prepare Graph 35 | run: | 36 | mkdir graph 37 | cp tn/*.fst graph 38 | cp itn/*.fst graph 39 | 40 | - name: Upload Graph 41 | uses: actions/upload-artifact@v3 42 | with: 43 | name: release-graph-v${{ github.event.inputs.version}} 44 | path: graph 45 | 46 | - name: Publish on pypi.org 47 | env: 48 | TWINE_USERNAME: __token__ 49 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 50 | run: | 51 | python setup.py sdist bdist_wheel --version=${{ github.event.inputs.version}} 52 | python -m pip install -U twine 53 | python -m twine upload --repository-url https://upload.pypi.org/legacy/ dist/* 54 | -------------------------------------------------------------------------------- /runtime/test/string_test.cc: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | #include "gmock/gmock.h" 16 | 17 | #include "utils/wetext_string.h" 18 | 19 | class StringTest : public testing::Test {}; 20 | 21 | TEST(StringTest, StringLengthTest) { 22 | EXPECT_EQ(wetext::UTF8StringLength("A"), 1); 23 | EXPECT_EQ(wetext::UTF8StringLength("À"), 1); 24 | EXPECT_EQ(wetext::UTF8StringLength("啊"), 1); 25 | EXPECT_EQ(wetext::UTF8StringLength("✐"), 1); 26 | EXPECT_EQ(wetext::UTF8StringLength("你好"), 2); 27 | EXPECT_EQ(wetext::UTF8StringLength("world"), 5); 28 | } 29 | 30 | TEST(StringTest, SplitUTF8StringToCharsTest) { 31 | std::vector chars; 32 | wetext::SplitUTF8StringToChars("你好world", &chars); 33 | ASSERT_THAT(chars, testing::ElementsAre("你", "好", "w", "o", "r", "l", "d")); 34 | } 35 | 36 | TEST(StringTest, TrimTest) { 37 | ASSERT_EQ(wetext::Trim("\thello "), "hello"); 38 | ASSERT_EQ(wetext::Trim(" hello\t"), "hello"); 39 | } 40 | 41 | TEST(StringTest, SplitTest) { 42 | std::vector output; 43 | wetext::Split("written => spoken", " => ", &output); 44 | ASSERT_THAT(output, testing::ElementsAre("written", "spoken")); 45 | } 46 | -------------------------------------------------------------------------------- /tn/japanese/rules/fraction.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024 Logan Liu (2319277867@qq.com) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
class Fraction(Processor):
    """Tag and verbalize written fractions (``a/b``) for Japanese TN."""

    def __init__(self):
        super().__init__(name="fraction")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build ``self.tagger``.

        Matches ``number / number`` with at most one space deleted on either
        side of the slash, and emits ``numerator: "..." denominator: "..."``.
        """
        cardinal_number = Cardinal().number
        optional_space = delete(" ").ques

        slash = optional_space + delete("/") + optional_space
        numerator_field = insert('numerator: "') + cardinal_number
        denominator_field = insert('" denominator: "') + cardinal_number + insert('"')

        fraction = (numerator_field + slash + denominator_field).optimize()
        self.tagger = self.add_tokens(fraction)

    def build_verbalizer(self):
        """Build ``self.verbalizer``: denominator, then "分の", then numerator.

        The denominator field is read first here although the tagger writes
        it second -- assumes the fields are reordered before verbalization;
        TODO confirm against ``Processor``.
        """
        spoken_denominator = delete('denominator: "') + self.SIGMA + delete('" ')
        spoken_numerator = delete('numerator: "') + self.SIGMA + delete('"')
        spoken = spoken_denominator + insert("分の") + spoken_numerator
        self.verbalizer = self.delete_tokens(spoken)
14 | 15 | from pynini.lib.pynutil import delete, insert 16 | 17 | from tn.chinese.rules.cardinal import Cardinal 18 | from tn.processor import Processor 19 | 20 | 21 | class Fraction(Processor): 22 | 23 | def __init__(self): 24 | super().__init__(name="fraction") 25 | self.build_tagger() 26 | self.build_verbalizer() 27 | 28 | def build_tagger(self): 29 | rmspace = delete(" ").ques 30 | number = Cardinal().number 31 | 32 | tagger = ( 33 | insert('numerator: "') 34 | + number 35 | + rmspace 36 | + delete("/") 37 | + rmspace 38 | + insert('" denominator: "') 39 | + number 40 | + insert('"') 41 | ).optimize() 42 | self.tagger = self.add_tokens(tagger) 43 | 44 | def build_verbalizer(self): 45 | denominator = delete('denominator: "') + self.SIGMA + delete('" ') 46 | numerator = delete('numerator: "') + self.SIGMA + delete('"') 47 | verbalizer = denominator + insert("分之") + numerator 48 | self.verbalizer = self.delete_tokens(verbalizer) 49 | -------------------------------------------------------------------------------- /itn/japanese/data/measure/unit_en.tsv: -------------------------------------------------------------------------------- 1 | 華氏 f 2 | 摂氏 c 3 | キロメートル km 4 | 千キロメートル km 5 | メートル m 6 | センチ cm 7 | インチ インチ 8 | リットル L 9 | ジュール J 10 | ワット W 11 | アンペア A 12 | ボルト V 13 | オーム Ω 14 | アンペア毎メートル A/m 15 | ビット bit 16 | バイト Byte 17 | メガバイト MB 18 | キロバイト KB 19 | ギガバイト GB 20 | 度 ℃ 21 | 立方センチメートル cm³ 22 | ドット dpi 23 | ケルビン K 24 | センチメートル cm 25 | ミリメートル mm 26 | ヘクタール ha 27 | マイル mi 28 | 平方メートル m² 29 | 平方キロメートル km² 30 | 足 ft 31 | パーセント % 32 | ヘルツ hz 33 | キロワット kw 34 | 馬力 hp 35 | ミリグラム mg 36 | キログラム kg 37 | キロ kg 38 | ギガヘルツ ghz 39 | キロヘルツ khz 40 | メガヘルツ mhz 41 | ボルト v 42 | メガクーロン mc 43 | ナノメートル nm 44 | 毎分回転数 rpm 45 | ミリアンペア mA 46 | パーセント % 47 | キロワット時 kwh 48 | 立方メートル m³ 49 | 時速マイル mph 50 | テラワット tw 51 | ミリボルト mv 52 | メガワット mw 53 | マイクロメータ μm 54 | テラバイト TB 55 | グラム g 56 | ダルトン da 57 | 雰囲気 atm 58 | オーム ω 59 | デシベル dB 60 | ペタ秒 ps 61 | オンス oz 62 | ヘクトリットル hl 63 | マイクログラム μg 64 | 
ペタグラム pg 65 | ギガバイト gb 66 | キロビット kb 67 | 電子ボルト ev 68 | メガバイト mb 69 | キロバイト kb 70 | キロビット/秒 kbps 71 | 毎秒メガビット mbps 72 | 結石 st 73 | キロリットル kl 74 | テラジュール tj 75 | キロボルト kv 76 | メガボルト mv 77 | キロニュートン kn 78 | メガメーター mm 79 | 天文単位 au 80 | ヤード yd 81 | ラジアン rad 82 | ルーメン lm 83 | ヘクト秒 hs 84 | モル mol 85 | ギガパスカル gpa 86 | ミリリットル ml 87 | ギガワット gw 88 | メガアンペア ma 89 | 結び目 kt 90 | キログラム力 kgf 91 | ナノグラム ng 92 | ナノ秒 ns 93 | メガシーメンス ms 94 | バー bar 95 | ギガリットル gl 96 | マイクロ秒 μs 97 | デシアンペア da 98 | パスカル pa 99 | デシ秒 ds 100 | ミリ秒 ms 101 | デシメートル dm 102 | 立方デシメートル dm³ 103 | 原子質量単位 amu 104 | メガビット mb 105 | メガファラッド mf 106 | ベクレル bq 107 | ペタビット pb 108 | 平方ミリメートル mm² 109 | 平方センチメートル cm² 110 | 平方マイル sq mi 111 | 平方フィート sq ft 112 | キロパスカル kpa 113 | カンデラ cd 114 | テラリットル tl 115 | メガ秒 ms 116 | メガパスカル mpa 117 | ペタメーター pm 118 | ペタバイト pb 119 | ギガワットアワー gwh 120 | キロカロリー kcal 121 | グレー gy 122 | シーベルト sv 123 | ハンドレッド cwt -------------------------------------------------------------------------------- /itn/chinese/data/money/code.tsv: -------------------------------------------------------------------------------- 1 | 澳元 A$ 2 | 阿联酋迪拉姆 AED 3 | 阿富汗 阿富汗尼 AFN 4 | 阿尔巴尼亚列克 ALL 5 | 荷属安的列斯盾 ANG 6 | 阿根廷比索 ARS 7 | 澳元 AUD 8 | 阿鲁巴盾 AWG 9 | 阿塞拜疆马纳特 AZN 10 | 波斯尼亚和黑塞哥维那可兑换马克 BAM 11 | 巴巴多斯元 BBD 12 | 保加利亚列弗 BGN 13 | 百慕大元 BMD 14 | 文莱达鲁萨兰国元 BND 15 | 玻利维亚玻利维亚诺 BOB 16 | 巴西雷亚尔 BRL 17 | 巴哈马元 BSD 18 | 博茨瓦纳普拉 BWP 19 | 白俄罗斯卢布 BYN 20 | 伯利兹元 BZD 21 | 加元 CAD$ 22 | 加元 CAD 23 | 瑞士法郎 CHF 24 | 智利比索 CLP 25 | 人民币 CNY 26 | 哥伦比亚比索 COP 27 | 哥斯达黎加科隆 CRC 28 | 古巴比索 CUP 29 | 捷克克朗 CZK 30 | 丹麦克朗 DKK 31 | 多米尼加共和国比索 DOP 32 | 埃及镑 EGP 33 | 欧元成员国 EUR 34 | 斐济元 FJD 35 | 福克兰群岛(马尔维纳斯)镑 FKP 36 | 英镑 GBP 37 | 根西岛镑 GGP 38 | 加纳塞地 GHS 39 | 直布罗陀镑 GIP 40 | 危地马拉格查尔 GTQ 41 | 圭亚那元 GYD 42 | 港元 HK$ 43 | 港元 HKD 44 | 洪都拉斯伦皮拉 HNL 45 | 克罗地亚库纳 HRK 46 | 匈牙利福林 HUF 47 | 印尼盾 IDR 48 | 以色列谢克尔 ILS 49 | 马恩岛英镑 IMP 50 | 印度卢比 INR 51 | 伊朗里亚尔 IRR 52 | 冰岛克朗 ISK 53 | 日元 J¥ 54 | 泽西镑 JEP 55 | 牙买加元 JMD 56 | 日元 JPY¥ 57 | 日元 JPY 58 | 吉尔吉斯斯坦索姆 KGS 59 | 柬埔寨瑞尔 KHR 60 | 朝鲜园 KPW 61 | 韩元 KRW 62 | 韩元 
KRW 63 | 开曼群岛元 KYD 64 | 哈萨克斯坦腾格 KZT 65 | 老挝基普 LAK 66 | 黎巴嫩镑 LBP 67 | 斯里兰卡卢比 LKR 68 | 利比里亚元 LRD 69 | 马其顿代纳尔 MKD 70 | 摩洛哥迪拉姆 MAD 71 | 蒙古图格里克 MNT 72 | 毛里求斯卢比 MUR 73 | 墨西哥比索 MXN 74 | 马来西亚令吉 MYR 75 | 莫桑比克梅蒂卡尔 MZN 76 | 纳米比亚元 NAD 77 | 尼日利亚奈拉 NGN 78 | 尼加拉瓜科尔多瓦 NIO 79 | 挪威克朗 NOK 80 | 尼泊尔卢比 NPR 81 | 新西兰元 NZD 82 | 阿曼里亚尔 OMR 83 | 巴拿马巴尔博亚 PAB 84 | 秘鲁索尔 PEN 85 | 菲律宾比索 PHP 86 | 巴基斯坦卢比 PKR 87 | 波兰兹罗提 PLN 88 | 巴拉圭瓜拉尼 PYG 89 | 卡塔尔里亚尔 QAR 90 | 罗马尼亚列伊 RON 91 | 塞尔维亚第纳尔 RSD 92 | 俄罗斯卢布 RUB 93 | 沙特阿拉伯里亚尔 SAR 94 | 所罗门群岛元 SBD 95 | 塞舌尔卢比 SCR 96 | 瑞典克朗 SEK 97 | 新加坡元 SGD 98 | 圣赫勒拿镑 SHP 99 | 索马里先令 SOS 100 | 苏里南元 SRD 101 | 萨尔瓦多科隆 SVC 102 | 叙利亚镑 SYP 103 | 泰铢 THB 104 | 土耳其里拉 TRY 105 | 特立尼达和多巴哥元 TTD 106 | 图瓦卢元 TVD 107 | 新台币 TWD 108 | 乌克兰格里夫纳 UAH 109 | 美元 USD 110 | 乌拉圭比索 UYU 111 | 乌兹别克斯坦索姆 UZS 112 | 委内瑞拉玻利瓦尔 VEF 113 | 越南盾 VND 114 | 东加勒比元 XCD 115 | 也门里亚尔 YER 116 | 南非兰特 ZAR 117 | 津巴布韦元 ZWD 118 | -------------------------------------------------------------------------------- /runtime/android/app/src/main/res/drawable-v24/ic_launcher_foreground.xml: -------------------------------------------------------------------------------- 1 | 7 | 8 | 9 | 15 | 18 | 21 | 22 | 23 | 24 | 30 | -------------------------------------------------------------------------------- /runtime/processor/wetext_processor.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef PROCESSOR_WETEXT_PROCESSOR_H_ 16 | #define PROCESSOR_WETEXT_PROCESSOR_H_ 17 | 18 | #include 19 | #include 20 | 21 | #include "fst/fstlib.h" 22 | 23 | #include "processor/wetext_token_parser.h" 24 | 25 | using fst::StdArc; 26 | using fst::StdVectorFst; 27 | using fst::StringCompiler; 28 | using fst::StringPrinter; 29 | 30 | namespace wetext { 31 | class Processor { 32 | public: 33 | Processor(const std::string& tagger_path, const std::string& verbalizer_path); 34 | std::string Tag(const std::string& input); 35 | std::string Verbalize(const std::string& input); 36 | std::string Normalize(const std::string& input); 37 | 38 | private: 39 | std::string ShortestPath(const StdVectorFst& lattice); 40 | std::string Compose(const std::string& input, const StdVectorFst* fst); 41 | 42 | ParseType parse_type_; 43 | std::shared_ptr tagger_ = nullptr; 44 | std::shared_ptr verbalizer_ = nullptr; 45 | std::shared_ptr> compiler_ = nullptr; 46 | std::shared_ptr> printer_ = nullptr; 47 | }; 48 | 49 | } // namespace wetext 50 | 51 | #endif // PROCESSOR_WETEXT_PROCESSOR_H_ 52 | -------------------------------------------------------------------------------- /tn/chinese/rules/whitelist.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn) 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
# See the License for the specific language governing permissions and
# limitations under the License.

from pynini import accep, string_file
from pynini.lib.pynutil import add_weight, delete, insert

from tn.processor import Processor
from tn.utils import get_abs_path


class Whitelist(Processor):
    """Grammar for whitelist entries and the erhua suffix "儿".

    Args:
        remove_erhua: when true the verbalizer deletes a tagged "儿";
            otherwise it keeps the character and removes only the markup.
    """

    def __init__(self, remove_erhua=True):
        super().__init__(name="whitelist")
        self.remove_erhua = remove_erhua
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build ``self.tagger`` from the two whitelist tables plus "儿"."""
        default_entries = string_file(get_abs_path("chinese/data/default/whitelist.tsv"))
        erhua_entries = string_file(get_abs_path("chinese/data/erhua/whitelist.tsv"))
        entries = default_entries | erhua_entries

        # A bare "儿" is tagged as erhua, with an extra weight of 0.1.
        erhua_field = add_weight(insert('erhua: "') + accep("儿"), 0.1)
        value_field = insert('value: "') + entries

        # Both alternatives share the closing quote.
        tagged = (erhua_field | value_field) + insert('"')
        self.tagger = self.add_tokens(tagged)

    def build_verbalizer(self):
        """Extend the inherited verbalizer with the erhua rule."""
        super().build_verbalizer()
        if self.remove_erhua:
            # Drop the whole field, character included.
            erhua_rule = delete('erhua: "儿"')
        else:
            # Drop only the markup and keep the character.
            erhua_rule = delete('erhua: "') + accep("儿") + delete('"')
        erhua_verbalizer = self.delete_tokens(erhua_rule)
        self.verbalizer |= erhua_verbalizer
# See the License for the specific language governing permissions and
# limitations under the License.

from pynini import string_file
from pynini.lib.pynutil import delete, insert

from itn.japanese.rules.cardinal import Cardinal
from tn.processor import Processor
from tn.utils import get_abs_path


class Money(Processor):
    """Inverse-normalization grammar for Japanese money expressions.

    Args:
        enable_0_to_9: when true the amount uses ``Cardinal().number``;
            otherwise ``Cardinal().number_exclude_0_to_9``.
    """

    def __init__(self, enable_0_to_9=True):
        super().__init__(name="money")
        self.enable_0_to_9 = enable_0_to_9
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build ``self.tagger``: an amount followed by a currency word."""
        symbol = string_file(get_abs_path("../itn/japanese/data/money/symbol.tsv"))

        # Build the cardinal grammar once and read both attributes from it,
        # rather than constructing a second Cardinal() for the decimal.
        cardinal = Cardinal()
        number = cardinal.number if self.enable_0_to_9 else cardinal.number_exclude_0_to_9
        decimal = cardinal.decimal
        # Example: 三千三百八十点五八円 => ¥3380.58
        tagger = insert('value: "') + (number | decimal) + insert('"') + insert(' currency: "') + symbol + insert('"')
        self.tagger = self.add_tokens(tagger)

    def build_verbalizer(self):
        """Build ``self.verbalizer``: emit the currency, then the value."""
        # NOTE(review): the tagger emits the value first, while this expects
        # the currency first - presumably the fields are reordered before
        # verbalization; confirm in Processor / the token parser.
        currency = delete('currency: "') + self.SIGMA + delete('"')
        value = delete(' value: "') + self.SIGMA + delete('"')
        verbalizer = currency + value
        self.verbalizer = self.delete_tokens(verbalizer)
# See the License for the specific language governing permissions and
# limitations under the License.

from pynini import string_file
from pynini.lib.pynutil import delete, insert

from tn.japanese.rules.cardinal import Cardinal
from tn.processor import Processor
from tn.utils import get_abs_path


class Money(Processor):
    """Normalization grammar for a currency code or symbol before a number."""

    def __init__(self):
        super().__init__(name="money")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build ``self.tagger``: ``<code|symbol>[ ]<number>`` -> tagged fields."""
        currency_codes = string_file(get_abs_path("japanese/data/money/code.tsv"))
        currency_symbols = string_file(get_abs_path("japanese/data/money/symbol.tsv"))
        amount = Cardinal().number

        # At most one space between the currency and the amount is dropped.
        currency_field = (
            insert('currency: "')
            + (currency_codes | currency_symbols)
            + delete(" ").ques
            + insert('" ')
        )
        value_field = insert('value: "') + amount + insert('"')
        self.tagger = self.add_tokens(currency_field + value_field)

    def build_verbalizer(self):
        """Build ``self.verbalizer``: emit the value, then the currency."""
        # NOTE(review): the tagger emits the currency first, while this
        # expects the value first - presumably the fields are reordered
        # before verbalization; confirm in Processor / the token parser.
        value_text = delete('value: "') + self.SIGMA + delete('" ')
        currency_text = delete('currency: "') + self.SIGMA + delete('"')
        self.verbalizer = self.delete_tokens(value_text + currency_text)
# See the License for the specific language governing permissions and
# limitations under the License.

from pynini import string_file
from pynini.lib.pynutil import delete, insert

from tn.chinese.rules.cardinal import Cardinal
from tn.processor import Processor
from tn.utils import get_abs_path


class Money(Processor):
    """Normalization grammar for a currency code or symbol before a number."""

    def __init__(self):
        super().__init__(name="money")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build ``self.tagger``: ``<code|symbol>[ ]<number>`` -> tagged fields."""
        codes = string_file(get_abs_path("chinese/data/money/code.tsv"))
        symbols = string_file(get_abs_path("chinese/data/money/symbol.tsv"))
        amount = Cardinal().number

        # Either table may supply the currency; at most one space between it
        # and the amount is dropped.
        currency_part = insert('currency: "') + (codes | symbols) + delete(" ").ques + insert('" ')
        value_part = insert('value: "') + amount + insert('"')

        tagged = currency_part + value_part
        self.tagger = self.add_tokens(tagged)

    def build_verbalizer(self):
        """Build ``self.verbalizer``: emit the value, then the currency."""
        # NOTE(review): the tagger emits the currency first, while this
        # expects the value first - presumably the fields are reordered
        # before verbalization; confirm in Processor / the token parser.
        spoken_value = delete('value: "') + self.SIGMA + delete('" ')
        spoken_currency = delete('currency: "') + self.SIGMA + delete('"')
        spoken = spoken_value + spoken_currency
        self.verbalizer = self.delete_tokens(spoken)