├── itn
├── __init__.py
├── chinese
│ ├── __init__.py
│ ├── rules
│ │ ├── __init__.py
│ │ ├── char.py
│ │ ├── whitelist.py
│ │ ├── postprocessor.py
│ │ ├── math.py
│ │ └── license_plate.py
│ ├── data
│ │ ├── number
│ │ │ ├── dot.tsv
│ │ │ ├── zero.tsv
│ │ │ ├── sign.tsv
│ │ │ ├── digit_zh.tsv
│ │ │ ├── digit.tsv
│ │ │ ├── special_dash.tsv
│ │ │ └── special_tilde.tsv
│ │ ├── default
│ │ │ ├── blacklist.tsv
│ │ │ └── whitelist.tsv
│ │ ├── math
│ │ │ └── operator.tsv
│ │ ├── time
│ │ │ ├── noon.tsv
│ │ │ ├── hour.tsv
│ │ │ ├── minute.tsv
│ │ │ └── second.tsv
│ │ ├── date
│ │ │ ├── mm.tsv
│ │ │ └── dd.tsv
│ │ ├── license_plate
│ │ │ └── province.tsv
│ │ ├── money
│ │ │ ├── symbol.tsv
│ │ │ └── code.tsv
│ │ └── measure
│ │ │ ├── units_en.tsv
│ │ │ └── units_zh.tsv
│ └── test
│ │ ├── data
│ │ ├── char.txt
│ │ ├── whitelist.txt
│ │ ├── license_plate.txt
│ │ ├── fraction.txt
│ │ ├── math.txt
│ │ ├── money.txt
│ │ ├── normalizer_disable_standalone_number_enable_0_to_9.txt
│ │ ├── date.txt
│ │ ├── time.txt
│ │ ├── number.txt
│ │ ├── cardinal.txt
│ │ └── normalizer.txt
│ │ ├── utils.py
│ │ └── __init__.py
├── japanese
│ ├── __init__.py
│ ├── rules
│ │ ├── __init__.py
│ │ ├── char.py
│ │ ├── preprocessor.py
│ │ ├── whitelist.py
│ │ ├── ordinal.py
│ │ ├── math.py
│ │ └── money.py
│ ├── test
│ │ ├── __init__.py
│ │ ├── data
│ │ │ ├── char.txt
│ │ │ ├── money.txt
│ │ │ ├── whitelist.txt
│ │ │ ├── math.txt
│ │ │ ├── measure.txt
│ │ │ ├── fraction.txt
│ │ │ ├── time.txt
│ │ │ ├── number.txt
│ │ │ ├── date.txt
│ │ │ ├── cardinal.txt
│ │ │ ├── normalizer_disable_standalone_number_disable_0_to_9.txt
│ │ │ ├── normalizer_disable_standalone_number_enable_0_to_9.txt
│ │ │ └── normalizer_enable_standalone_number_disable_0_to_9.txt
│ │ └── utils.py
│ └── data
│ │ ├── number
│ │ ├── dot.tsv
│ │ ├── hundred.tsv
│ │ ├── zero.tsv
│ │ ├── sign.tsv
│ │ ├── thousands.tsv
│ │ ├── digit.tsv
│ │ ├── ties.tsv
│ │ ├── hundred_digit.tsv
│ │ └── teen.tsv
│ │ ├── default
│ │ ├── blacklist.tsv
│ │ └── whitelist.tsv
│ │ ├── char
│ │ ├── oov_tags.tsv
│ │ ├── punctuations_ja.tsv
│ │ ├── fullwidth_to_halfwidth.tsv
│ │ └── hiragana_and_katakana.tsv
│ │ ├── math
│ │ └── operator.tsv
│ │ ├── money
│ │ └── symbol.tsv
│ │ ├── date
│ │ ├── month.tsv
│ │ ├── week.tsv
│ │ └── day.tsv
│ │ ├── measure
│ │ ├── unit_ja.tsv
│ │ └── unit_en.tsv
│ │ └── time
│ │ ├── hour.tsv
│ │ ├── minute.tsv
│ │ └── second.tsv
└── __main__.py
├── tn
├── __init__.py
├── chinese
│ ├── __init__.py
│ ├── rules
│ │ ├── __init__.py
│ │ ├── char.py
│ │ ├── preprocessor.py
│ │ ├── math.py
│ │ ├── fraction.py
│ │ ├── whitelist.py
│ │ └── money.py
│ ├── data
│ │ ├── number
│ │ │ ├── dot.tsv
│ │ │ ├── zero.tsv
│ │ │ ├── sign.tsv
│ │ │ ├── teen.tsv
│ │ │ └── digit.tsv
│ │ ├── default
│ │ │ ├── blacklist.tsv
│ │ │ └── whitelist.tsv
│ │ ├── char
│ │ │ ├── charset_extension.tsv
│ │ │ ├── punctuations_zh.tsv
│ │ │ └── fullwidth_to_halfwidth.tsv
│ │ ├── math
│ │ │ └── operator.tsv
│ │ ├── time
│ │ │ ├── noon.tsv
│ │ │ ├── hour.tsv
│ │ │ ├── second.tsv
│ │ │ └── minute.tsv
│ │ ├── date
│ │ │ ├── m.tsv
│ │ │ ├── mm.tsv
│ │ │ ├── d.tsv
│ │ │ └── dd.tsv
│ │ ├── money
│ │ │ ├── symbol.tsv
│ │ │ └── code.tsv
│ │ ├── erhua
│ │ │ └── whitelist.tsv
│ │ └── measure
│ │ │ └── units_en.tsv
│ └── test
│ │ ├── data
│ │ ├── char.txt
│ │ ├── preprocessor.txt
│ │ ├── fraction.txt
│ │ ├── sport.txt
│ │ ├── whitelist.txt
│ │ ├── money.txt
│ │ ├── postprocessor.txt
│ │ ├── time.txt
│ │ ├── math.txt
│ │ ├── cardinal.txt
│ │ ├── measure.txt
│ │ ├── date.txt
│ │ ├── number.txt
│ │ └── normalizer.txt
│ │ ├── __init__.py
│ │ ├── time_test.py
│ │ ├── char_test.py
│ │ ├── date_test.py
│ │ ├── math_test.py
│ │ ├── money_test.py
│ │ ├── sport_test.py
│ │ ├── measure_test.py
│ │ ├── fraction_test.py
│ │ ├── whitelist_test.py
│ │ ├── preprocessor_test.py
│ │ ├── postprocessor_test.py
│ │ ├── utils.py
│ │ ├── cardinal_test.py
│ │ └── normalizer_test.py
├── english
│ ├── __init__.py
│ ├── data
│ │ ├── __init__.py
│ │ ├── date
│ │ │ ├── __init__.py
│ │ │ ├── year_suffix.tsv
│ │ │ ├── week.tsv
│ │ │ ├── month_number.tsv
│ │ │ ├── day.tsv
│ │ │ ├── month_abbr.tsv
│ │ │ └── month_name.tsv
│ │ ├── money
│ │ │ ├── __init__.py
│ │ │ ├── per_unit.tsv
│ │ │ ├── currency_minor_singular.tsv
│ │ │ ├── currency_minor_plural.tsv
│ │ │ └── currency_major.tsv
│ │ ├── roman
│ │ │ ├── __init__.py
│ │ │ ├── key_word.tsv
│ │ │ └── README.md
│ │ ├── electronic
│ │ │ ├── __init__.py
│ │ │ ├── domain.tsv
│ │ │ ├── words.tsv
│ │ │ └── symbol.tsv
│ │ ├── ordinal
│ │ │ ├── __init__.py
│ │ │ ├── teen.tsv
│ │ │ └── digit.tsv
│ │ ├── telephone
│ │ │ ├── __init__.py
│ │ │ ├── ip_prompt.tsv
│ │ │ ├── ssn_prompt.tsv
│ │ │ └── telephone_prompt.tsv
│ │ ├── whitelist
│ │ │ ├── __init__.py
│ │ │ ├── alternatives_all_format.tsv
│ │ │ ├── symbol.tsv
│ │ │ ├── lj_speech.tsv
│ │ │ ├── alternatives.tsv
│ │ │ └── asr_with_pc.tsv
│ │ ├── number
│ │ │ ├── hundred.tsv
│ │ │ ├── zero.tsv
│ │ │ ├── digit.tsv
│ │ │ ├── ty.tsv
│ │ │ ├── cardinal_number_name.far
│ │ │ ├── cardinal_number_name_au.far
│ │ │ ├── teen.tsv
│ │ │ ├── quantity_abbr.tsv
│ │ │ ├── fraction.tsv
│ │ │ ├── thousand.tsv
│ │ │ └── __init__.py
│ │ ├── measure
│ │ │ ├── math_operation.tsv
│ │ │ ├── __init__.py
│ │ │ └── unit_alternatives.tsv
│ │ ├── time
│ │ │ ├── suffix.tsv
│ │ │ ├── zone.tsv
│ │ │ └── __init__.py
│ │ └── address
│ │ │ ├── address_word.tsv
│ │ │ ├── __init__.py
│ │ │ └── state.tsv
│ ├── rules
│ │ └── __init__.py
│ └── test
│ │ ├── __init__.py
│ │ ├── data
│ │ ├── roman.txt
│ │ ├── range.txt
│ │ ├── electronic.txt
│ │ ├── word.txt
│ │ ├── decimal.txt
│ │ ├── telephone.txt
│ │ ├── whitelist.txt
│ │ ├── fraction.txt
│ │ ├── ordinal.txt
│ │ ├── time.txt
│ │ ├── money.txt
│ │ ├── measure.txt
│ │ ├── date.txt
│ │ └── cardinal.txt
│ │ ├── utils.py
│ │ ├── word_test.py
│ │ ├── date_test.py
│ │ ├── time_test.py
│ │ ├── money_test.py
│ │ ├── range_test.py
│ │ ├── roman_test.py
│ │ ├── decimal_test.py
│ │ ├── measure_test.py
│ │ ├── ordinal_test.py
│ │ ├── normalizer_test.py
│ │ ├── cardinal_test.py
│ │ ├── fraction_test.py
│ │ ├── telephone_test.py
│ │ ├── whitelist_test.py
│ │ └── electronic_test.py
├── japanese
│ ├── __init__.py
│ ├── test
│ │ ├── __init__.py
│ │ ├── data
│ │ │ ├── char.txt
│ │ │ ├── fraction.txt
│ │ │ ├── whitelist.txt
│ │ │ ├── money.txt
│ │ │ ├── sport.txt
│ │ │ ├── time.txt
│ │ │ ├── measure.txt
│ │ │ ├── math.txt
│ │ │ ├── date.txt
│ │ │ └── cardinal.txt
│ │ ├── utils.py
│ │ └── normalizer_test.py
│ ├── data
│ │ ├── default
│ │ │ ├── blacklist.tsv
│ │ │ └── whitelist.tsv
│ │ ├── number
│ │ │ ├── dot.tsv
│ │ │ ├── zero.tsv
│ │ │ ├── sign.tsv
│ │ │ ├── teen.tsv
│ │ │ ├── digit.tsv
│ │ │ └── en_digit.tsv
│ │ ├── char
│ │ │ ├── oov_tags.tsv
│ │ │ ├── punctuations_ja.tsv
│ │ │ ├── fullwidth_to_halfwidth.tsv
│ │ │ └── hiragana_and_katakana.tsv
│ │ ├── date
│ │ │ ├── date.tsv
│ │ │ ├── dd.tsv
│ │ │ ├── mm.tsv
│ │ │ ├── m.tsv
│ │ │ └── d.tsv
│ │ ├── time
│ │ │ ├── noon.tsv
│ │ │ ├── hour.tsv
│ │ │ ├── minute.tsv
│ │ │ └── second.tsv
│ │ ├── money
│ │ │ ├── symbol.tsv
│ │ │ └── code.tsv
│ │ ├── math
│ │ │ └── operator.tsv
│ │ ├── measure
│ │ │ ├── units_ja.tsv
│ │ │ └── units_en.tsv
│ │ └── sport
│ │ │ └── club.tsv
│ └── rules
│ │ ├── char.py
│ │ ├── preprocessor.py
│ │ ├── transliteration.py
│ │ ├── whitelist.py
│ │ ├── math.py
│ │ ├── fraction.py
│ │ └── money.py
└── __main__.py
├── runtime
├── patch
│ ├── CPPLINT.cfg
│ └── openfst
│ │ └── src
│ │ ├── CMakeLists.txt
│ │ └── extensions
│ │ └── special
│ │ └── CMakeLists.txt
├── android
│ ├── app
│ │ ├── .gitignore
│ │ ├── src
│ │ │ ├── main
│ │ │ │ ├── cpp
│ │ │ │ │ ├── cmake
│ │ │ │ │ ├── patch
│ │ │ │ │ ├── utils
│ │ │ │ │ ├── processor
│ │ │ │ │ └── CMakeLists.txt
│ │ │ │ ├── assets
│ │ │ │ │ └── README.md
│ │ │ │ ├── res
│ │ │ │ │ ├── values
│ │ │ │ │ │ ├── strings.xml
│ │ │ │ │ │ ├── colors.xml
│ │ │ │ │ │ ├── attrs.xml
│ │ │ │ │ │ └── themes.xml
│ │ │ │ │ ├── mipmap-hdpi
│ │ │ │ │ │ ├── ic_launcher.png
│ │ │ │ │ │ └── ic_launcher_round.png
│ │ │ │ │ ├── mipmap-mdpi
│ │ │ │ │ │ ├── ic_launcher.png
│ │ │ │ │ │ └── ic_launcher_round.png
│ │ │ │ │ ├── mipmap-xhdpi
│ │ │ │ │ │ ├── ic_launcher.png
│ │ │ │ │ │ └── ic_launcher_round.png
│ │ │ │ │ ├── mipmap-xxhdpi
│ │ │ │ │ │ ├── ic_launcher.png
│ │ │ │ │ │ └── ic_launcher_round.png
│ │ │ │ │ ├── mipmap-xxxhdpi
│ │ │ │ │ │ ├── ic_launcher.png
│ │ │ │ │ │ └── ic_launcher_round.png
│ │ │ │ │ ├── mipmap-anydpi-v26
│ │ │ │ │ │ ├── ic_launcher.xml
│ │ │ │ │ │ └── ic_launcher_round.xml
│ │ │ │ │ ├── values-night
│ │ │ │ │ │ └── themes.xml
│ │ │ │ │ └── drawable-v24
│ │ │ │ │ │ └── ic_launcher_foreground.xml
│ │ │ │ ├── java
│ │ │ │ │ └── com
│ │ │ │ │ │ └── mobvoi
│ │ │ │ │ │ └── WeTextProcessing
│ │ │ │ │ │ └── WeTextProcessing.java
│ │ │ │ └── AndroidManifest.xml
│ │ │ ├── test
│ │ │ │ └── java
│ │ │ │ │ └── com
│ │ │ │ │ └── mobvoi
│ │ │ │ │ └── WeTextProcessing
│ │ │ │ │ └── ExampleUnitTest.java
│ │ │ └── androidTest
│ │ │ │ └── java
│ │ │ │ └── com
│ │ │ │ └── mobvoi
│ │ │ │ └── WeTextProcessing
│ │ │ │ └── ExampleInstrumentedTest.java
│ │ ├── wenet.keystore
│ │ └── proguard-rules.pro
│ ├── settings.gradle
│ ├── gradle
│ │ └── wrapper
│ │ │ ├── gradle-wrapper.jar
│ │ │ └── gradle-wrapper.properties
│ ├── .gitignore
│ ├── build.gradle
│ └── gradle.properties
├── utils
│ ├── CMakeLists.txt
│ ├── wetext_log.h
│ ├── wetext_flags.h
│ └── wetext_string.h
├── bin
│ └── CMakeLists.txt
├── cmake
│ ├── glog.cmake
│ ├── gflags.cmake
│ └── gtest.cmake
├── test
│ ├── CMakeLists.txt
│ └── string_test.cc
├── processor
│ ├── CMakeLists.txt
│ └── wetext_processor.h
├── README.md
└── CMakeLists.txt
├── CPPLINT.cfg
├── requirements.txt
├── .flake8
├── .pre-commit-config.yaml
├── .gitignore
└── .github
└── workflows
├── unittest.yml
└── wheels.yml
/itn/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tn/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/itn/chinese/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tn/chinese/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tn/english/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tn/japanese/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/itn/japanese/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tn/chinese/rules/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tn/english/data/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tn/english/rules/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tn/english/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tn/japanese/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/itn/chinese/rules/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/itn/japanese/rules/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/itn/japanese/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tn/english/data/date/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tn/english/data/money/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tn/english/data/roman/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/itn/japanese/data/number/dot.tsv:
--------------------------------------------------------------------------------
1 | 点 .
--------------------------------------------------------------------------------
/itn/japanese/data/number/hundred.tsv:
--------------------------------------------------------------------------------
1 | 百
--------------------------------------------------------------------------------
/tn/chinese/data/number/dot.tsv:
--------------------------------------------------------------------------------
1 | . 点
2 |
--------------------------------------------------------------------------------
/tn/english/data/electronic/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tn/english/data/ordinal/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tn/english/data/telephone/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tn/english/data/whitelist/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tn/japanese/data/default/blacklist.tsv:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/itn/japanese/data/default/blacklist.tsv:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tn/english/data/number/hundred.tsv:
--------------------------------------------------------------------------------
1 | hundred
--------------------------------------------------------------------------------
/tn/english/data/number/zero.tsv:
--------------------------------------------------------------------------------
1 | zero 0
2 |
--------------------------------------------------------------------------------
/tn/japanese/data/number/dot.tsv:
--------------------------------------------------------------------------------
1 | . 点
2 |
--------------------------------------------------------------------------------
/itn/chinese/data/number/dot.tsv:
--------------------------------------------------------------------------------
1 | 点 .
2 | 點 .
3 |
--------------------------------------------------------------------------------
/itn/chinese/data/number/zero.tsv:
--------------------------------------------------------------------------------
1 | 零 0
2 | 洞 0
3 |
--------------------------------------------------------------------------------
/itn/japanese/data/char/oov_tags.tsv:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/runtime/patch/CPPLINT.cfg:
--------------------------------------------------------------------------------
1 | exclude_files=.*
2 |
--------------------------------------------------------------------------------
/tn/chinese/data/number/zero.tsv:
--------------------------------------------------------------------------------
1 | 0 零
2 | 0 零
3 |
--------------------------------------------------------------------------------
/tn/english/data/ordinal/teen.tsv:
--------------------------------------------------------------------------------
1 | twelfth twelve
--------------------------------------------------------------------------------
/tn/english/test/data/roman.txt:
--------------------------------------------------------------------------------
1 | IV => four
2 |
--------------------------------------------------------------------------------
/tn/japanese/data/char/oov_tags.tsv:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/tn/japanese/data/number/zero.tsv:
--------------------------------------------------------------------------------
1 | 0 〇
2 | 0 〇
3 |
--------------------------------------------------------------------------------
/CPPLINT.cfg:
--------------------------------------------------------------------------------
1 | root=runtime
2 | filter=-build/c++11
3 |
--------------------------------------------------------------------------------
/itn/chinese/data/default/blacklist.tsv:
--------------------------------------------------------------------------------
1 | 呃
2 | 啊
3 |
--------------------------------------------------------------------------------
/itn/chinese/test/data/char.txt:
--------------------------------------------------------------------------------
1 | 中 => 中
2 | A => A
3 |
--------------------------------------------------------------------------------
/itn/japanese/test/data/char.txt:
--------------------------------------------------------------------------------
1 | 中 => 中
2 | A => A
3 |
--------------------------------------------------------------------------------
/tn/chinese/data/default/blacklist.tsv:
--------------------------------------------------------------------------------
1 | 呃
2 | 啊
3 |
--------------------------------------------------------------------------------
/tn/chinese/test/data/char.txt:
--------------------------------------------------------------------------------
1 | 中 => 中
2 | A => A
3 |
--------------------------------------------------------------------------------
/tn/chinese/test/data/preprocessor.txt:
--------------------------------------------------------------------------------
1 | 寶貝 => 宝贝
2 |
--------------------------------------------------------------------------------
/tn/english/data/money/per_unit.tsv:
--------------------------------------------------------------------------------
1 | /ea each
2 | /dozen
--------------------------------------------------------------------------------
/tn/english/test/utils.py:
--------------------------------------------------------------------------------
1 | ../../chinese/test/utils.py
--------------------------------------------------------------------------------
/tn/japanese/test/data/char.txt:
--------------------------------------------------------------------------------
1 | 中 => 中
2 | A => A
3 |
--------------------------------------------------------------------------------
/tn/japanese/test/utils.py:
--------------------------------------------------------------------------------
1 | ../../chinese/test/utils.py
--------------------------------------------------------------------------------
/itn/chinese/test/utils.py:
--------------------------------------------------------------------------------
1 | ../../../tn/chinese/test/utils.py
--------------------------------------------------------------------------------
/itn/japanese/test/data/money.txt:
--------------------------------------------------------------------------------
1 | 三千三百八十点五八ドル => $3380.58
--------------------------------------------------------------------------------
/runtime/android/app/.gitignore:
--------------------------------------------------------------------------------
1 | /build
2 | /release
3 |
--------------------------------------------------------------------------------
/runtime/android/app/src/main/cpp/cmake:
--------------------------------------------------------------------------------
1 | ../../../../../cmake
--------------------------------------------------------------------------------
/runtime/android/app/src/main/cpp/patch:
--------------------------------------------------------------------------------
1 | ../../../../../patch
--------------------------------------------------------------------------------
/runtime/android/app/src/main/cpp/utils:
--------------------------------------------------------------------------------
1 | ../../../../../utils
--------------------------------------------------------------------------------
/tn/chinese/data/number/sign.tsv:
--------------------------------------------------------------------------------
1 | + 正
2 | ± 正负
3 | - 负
4 |
--------------------------------------------------------------------------------
/tn/english/test/data/range.txt:
--------------------------------------------------------------------------------
1 | 2-3 => two to three
2 |
--------------------------------------------------------------------------------
/itn/japanese/data/number/zero.tsv:
--------------------------------------------------------------------------------
1 | 〇 0
2 | 零 0
3 | ゼロ 0
4 | れい 0
--------------------------------------------------------------------------------
/itn/japanese/test/data/whitelist.txt:
--------------------------------------------------------------------------------
1 | 十三湖 => 十三湖
2 | 一月三舟 => 一月三舟
--------------------------------------------------------------------------------
/itn/japanese/test/utils.py:
--------------------------------------------------------------------------------
1 | ../../../tn/chinese/test/utils.py
--------------------------------------------------------------------------------
/tn/english/data/telephone/ip_prompt.tsv:
--------------------------------------------------------------------------------
1 | IP address is
2 | IP is
--------------------------------------------------------------------------------
/itn/chinese/data/number/sign.tsv:
--------------------------------------------------------------------------------
1 | 正 +
2 | 正负 ±
3 | 负 -
4 | 负的 -
5 |
--------------------------------------------------------------------------------
/itn/chinese/test/data/whitelist.txt:
--------------------------------------------------------------------------------
1 | 三七二十一 => 三七二十一
2 | 一共 => 一共
3 |
--------------------------------------------------------------------------------
/runtime/android/app/src/main/cpp/processor:
--------------------------------------------------------------------------------
1 | ../../../../../processor
--------------------------------------------------------------------------------
/tn/japanese/data/number/sign.tsv:
--------------------------------------------------------------------------------
1 | + プラス
2 | ± プラスマイナス
3 | - マイナス
4 |
--------------------------------------------------------------------------------
/itn/japanese/data/number/sign.tsv:
--------------------------------------------------------------------------------
1 | プラス +
2 | プラスマイナス ±
3 | マイナス -
4 | 负の -
--------------------------------------------------------------------------------
/tn/chinese/data/char/charset_extension.tsv:
--------------------------------------------------------------------------------
1 | 吶
2 | 囧
3 | 屄
4 | 屌
5 | 诶
6 | 飚
7 |
--------------------------------------------------------------------------------
/tn/english/data/money/currency_minor_singular.tsv:
--------------------------------------------------------------------------------
1 | $ cent
2 | € cent
3 | £ penny
--------------------------------------------------------------------------------
/tn/english/test/data/electronic.txt:
--------------------------------------------------------------------------------
1 | cdf1@abc.edu => cdf one at abc dot edu
2 |
--------------------------------------------------------------------------------
/itn/japanese/data/number/thousands.tsv:
--------------------------------------------------------------------------------
1 | 千
2 | 万
3 | 亿
4 | 兆
5 | 京
6 | 垓
7 | 秭
8 | 穰
9 | 沟
--------------------------------------------------------------------------------
/runtime/android/settings.gradle:
--------------------------------------------------------------------------------
1 | include ':app'
2 | rootProject.name = "WeTextProcessing"
--------------------------------------------------------------------------------
/tn/chinese/test/data/fraction.txt:
--------------------------------------------------------------------------------
1 | 1/2 => 二分之一
2 | 3/16 => 十六分之三
3 | 1 / 2 => 二分之一
4 |
--------------------------------------------------------------------------------
/tn/japanese/test/data/fraction.txt:
--------------------------------------------------------------------------------
1 | 1/100 => 百分の一
2 | -1/100 => 百分のマイナス一
3 | 1 / 2 => 二分の一
--------------------------------------------------------------------------------
/itn/chinese/data/math/operator.tsv:
--------------------------------------------------------------------------------
1 | 乘 ×
2 | 减 -
3 | 到 ~
4 | 加 +
5 | 比 :
6 | 等于 =
7 | 除 ÷
8 |
--------------------------------------------------------------------------------
/runtime/android/app/src/main/assets/README.md:
--------------------------------------------------------------------------------
1 | put tagger.fst and verbalizer.fst here.
2 |
--------------------------------------------------------------------------------
/tn/chinese/test/data/sport.txt:
--------------------------------------------------------------------------------
1 | 中国1-2 => 中国一比二
2 | 爆冷0:1 => 爆冷零比一
3 | 拉齐奥 2/2 => 拉齐奥二比二
4 |
--------------------------------------------------------------------------------
/tn/chinese/test/data/whitelist.txt:
--------------------------------------------------------------------------------
1 | 儿 =>
2 | 婴儿 => 婴儿
3 | O2O => O to O
4 | 90后 => 九零后
5 |
--------------------------------------------------------------------------------
/tn/english/data/money/currency_minor_plural.tsv:
--------------------------------------------------------------------------------
1 | $ cents
2 | US$ cents
3 | € cents
4 | £ pence
--------------------------------------------------------------------------------
/tn/english/data/telephone/ssn_prompt.tsv:
--------------------------------------------------------------------------------
1 | ssn is SSN is
2 | ssn is SSN is
3 | SSN is
4 | SSN
--------------------------------------------------------------------------------
/itn/__main__.py:
--------------------------------------------------------------------------------
1 | from itn.main import main
2 |
3 | if __name__ == "__main__":
4 | main()
5 |
--------------------------------------------------------------------------------
/tn/__main__.py:
--------------------------------------------------------------------------------
1 | from tn.main import main
2 |
3 | if __name__ == "__main__":
4 | main()
5 |
--------------------------------------------------------------------------------
/tn/chinese/test/data/money.txt:
--------------------------------------------------------------------------------
1 | ¥1.25 => 一点二五元
2 | CNY1.25 => 一点二五人民币
3 | CNY 1.25 => 一点二五人民币
4 |
--------------------------------------------------------------------------------
/tn/japanese/data/date/date.tsv:
--------------------------------------------------------------------------------
1 | 月 月曜日
2 | 火 火曜日
3 | 水 水曜日
4 | 木 木曜日
5 | 金 金曜日
6 | 土 土曜日
7 | 日 日曜日
--------------------------------------------------------------------------------
/itn/chinese/data/time/noon.tsv:
--------------------------------------------------------------------------------
1 | 上午 a.m.
2 | 早上 a.m.
3 | 早晨 a.m.
4 | 下午 p.m.
5 | 晚上 p.m.
6 | 傍晚 p.m.
7 |
--------------------------------------------------------------------------------
/itn/japanese/data/number/digit.tsv:
--------------------------------------------------------------------------------
1 | 一 1
2 | 二 2
3 | 三 3
4 | 四 4
5 | 五 5
6 | 六 6
7 | 七 7
8 | 八 8
9 | 九 9
--------------------------------------------------------------------------------
/itn/japanese/data/number/ties.tsv:
--------------------------------------------------------------------------------
1 | 二十 2
2 | 三十 3
3 | 四十 4
4 | 五十 5
5 | 六十 6
6 | 七十 7
7 | 八十 8
8 | 九十 9
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | flake8
2 | importlib_resources
3 | pynini==2.1.6
4 | pytest
5 | pre-commit==3.5.0
6 |
--------------------------------------------------------------------------------
/tn/english/data/roman/key_word.tsv:
--------------------------------------------------------------------------------
1 | chapter
2 | class
3 | part
4 | article
5 | section
6 | paragraph
7 |
--------------------------------------------------------------------------------
/itn/chinese/data/number/digit_zh.tsv:
--------------------------------------------------------------------------------
1 | 一
2 | 二
3 | 两
4 | 三
5 | 四
6 | 五
7 | 六
8 | 七
9 | 八
10 | 九
11 |
--------------------------------------------------------------------------------
/itn/japanese/data/math/operator.tsv:
--------------------------------------------------------------------------------
1 | カケル ×
2 | 負 -
3 | マイナス -
4 | プラス +
5 | イコール =
6 | ワル ÷
7 | から ~
8 | 対 :
--------------------------------------------------------------------------------
/tn/english/test/data/word.txt:
--------------------------------------------------------------------------------
1 | smile => smile
2 | 中国 => 中国
3 | 中 => 中
4 | 国 => 国
5 | A => A
6 | a => a
7 |
--------------------------------------------------------------------------------
/itn/chinese/test/data/license_plate.txt:
--------------------------------------------------------------------------------
1 | 鄂a七l六二u => 鄂a7l62u
2 | 皖C九B三四E => 皖C9B34E
3 | 京A零七ZX三F => 京A07ZX3F
4 |
--------------------------------------------------------------------------------
/tn/japanese/data/date/dd.tsv:
--------------------------------------------------------------------------------
1 | 01 一日
2 | 02 二日
3 | 03 三日
4 | 04 四日
5 | 05 五日
6 | 06 六日
7 | 07 七日
8 | 08 八日
9 | 09 九日
--------------------------------------------------------------------------------
/tn/japanese/data/date/mm.tsv:
--------------------------------------------------------------------------------
1 | 01 一月
2 | 02 二月
3 | 03 三月
4 | 04 四月
5 | 05 五月
6 | 06 六月
7 | 07 七月
8 | 08 八月
9 | 09 九月
--------------------------------------------------------------------------------
/tn/english/data/telephone/telephone_prompt.tsv:
--------------------------------------------------------------------------------
1 | call me at
2 | reach at
3 | reached at
4 | my number is
5 | hit me up at
--------------------------------------------------------------------------------
/tn/japanese/test/data/whitelist.txt:
--------------------------------------------------------------------------------
1 | P2P => P to P
2 | B2B => B to B
3 | R-18 => R十八
4 | FOREVER21 => FOREVERトゥエンティーワン
5 |
--------------------------------------------------------------------------------
/itn/japanese/data/money/symbol.tsv:
--------------------------------------------------------------------------------
1 | ドル $
2 | ポンド £
3 | ポンド £
4 | バーツ ฿
5 | ユーロ €
6 | インドルピー ₹
7 | ルーブル ₽
8 | スイスフラン CHF
9 | レアル R$
--------------------------------------------------------------------------------
/tn/english/data/number/digit.tsv:
--------------------------------------------------------------------------------
1 | one 1
2 | two 2
3 | three 3
4 | four 4
5 | five 5
6 | six 6
7 | seven 7
8 | eight 8
9 | nine 9
--------------------------------------------------------------------------------
/tn/english/data/number/ty.tsv:
--------------------------------------------------------------------------------
1 | twenty 2
2 | thirty 3
3 | forty 4
4 | fifty 5
5 | sixty 6
6 | seventy 7
7 | eighty 8
8 | ninety 9
--------------------------------------------------------------------------------
/runtime/android/app/wenet.keystore:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/runtime/android/app/wenet.keystore
--------------------------------------------------------------------------------
/tn/chinese/data/math/operator.tsv:
--------------------------------------------------------------------------------
1 | × 乘
2 | - 减
3 | + 加
4 | = 等于
5 | ÷ 除
6 | ≥ 大于等于
7 | ≤ 小于等于
8 | >= 大于等于
9 | <= 小于等于
10 |
--------------------------------------------------------------------------------
/itn/japanese/data/number/hundred_digit.tsv:
--------------------------------------------------------------------------------
1 | 百一 101
2 | 百二 102
3 | 百三 103
4 | 百四 104
5 | 百五 105
6 | 百六 106
7 | 百七 107
8 | 百八 108
9 | 百九 109
--------------------------------------------------------------------------------
/itn/japanese/data/number/teen.tsv:
--------------------------------------------------------------------------------
1 | 十 10
2 | 十一 11
3 | 十二 12
4 | 十三 13
5 | 十四 14
6 | 十五 15
7 | 十六 16
8 | 十七 17
9 | 十八 18
10 | 十九 19
--------------------------------------------------------------------------------
/runtime/android/app/src/main/res/values/strings.xml:
--------------------------------------------------------------------------------
1 |
2 | WeTextProcessing
3 |
--------------------------------------------------------------------------------
/tn/english/data/measure/math_operation.tsv:
--------------------------------------------------------------------------------
1 | + plus
2 | - minus
3 | / divided
4 | ÷ divided
5 | : divided
6 | × times
7 | * times
8 | · times
--------------------------------------------------------------------------------
/itn/chinese/test/data/fraction.txt:
--------------------------------------------------------------------------------
1 | 二分之一 => 1/2
2 | 十六分之三 => 3/16
3 | 现场有十七分之七的观众投出了赞成票可是最后唱票结果却是负十二分之七 => 现场有7/17的观众投出了赞成票可是最后唱票结果却是-7/12
4 |
--------------------------------------------------------------------------------
/itn/japanese/data/date/month.tsv:
--------------------------------------------------------------------------------
1 | 一 1
2 | 二 2
3 | 三 3
4 | 四 4
5 | 五 5
6 | 六 6
7 | 七 7
8 | 八 8
9 | 九 9
10 | 十 10
11 | 十一 11
12 | 十二 12
--------------------------------------------------------------------------------
/runtime/utils/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_library(wetext_utils STATIC wetext_string.cc)
2 |
3 | target_link_libraries(wetext_utils PUBLIC glog)
4 |
--------------------------------------------------------------------------------
/tn/chinese/test/data/postprocessor.txt:
--------------------------------------------------------------------------------
1 | 好! => 好!
2 | 好啊 => 好
3 | 啊呃呃 =>
4 | 我们안녕 => 我们안녕
5 | 雪の花 => 雪の花
6 |
--------------------------------------------------------------------------------
/itn/chinese/test/data/math.txt:
--------------------------------------------------------------------------------
1 | 一加二 => 1+2
2 | 负一加二 => -1+2
3 | 一加二加三 => 1+2+3
4 | 二等于一加一 => 2=1+1
5 | 二十一到一千零一 => 21~1001
6 | 六百三到六百四 => 630~640
7 |
--------------------------------------------------------------------------------
/runtime/bin/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_executable(processor_main processor_main.cc)
2 | target_link_libraries(processor_main PUBLIC wetext_processor)
3 |
--------------------------------------------------------------------------------
/tn/chinese/data/time/noon.tsv:
--------------------------------------------------------------------------------
1 | a m 上午
2 | a.m. 上午
3 | am 上午
4 | A M 上午
5 | AM 上午
6 | p m 下午
7 | p.m. 下午
8 | pm 下午
9 | P M 下午
10 | PM 下午
11 |
--------------------------------------------------------------------------------
/tn/japanese/data/date/m.tsv:
--------------------------------------------------------------------------------
1 | 1 一月
2 | 2 二月
3 | 3 三月
4 | 4 四月
5 | 5 五月
6 | 6 六月
7 | 7 七月
8 | 8 八月
9 | 9 九月
10 | 10 十月
11 | 11 十一月
12 | 12 十二月
--------------------------------------------------------------------------------
/tn/chinese/data/date/m.tsv:
--------------------------------------------------------------------------------
1 | 1 一月
2 | 2 二月
3 | 3 三月
4 | 4 四月
5 | 5 五月
6 | 6 六月
7 | 7 七月
8 | 8 八月
9 | 9 九月
10 | 10 十月
11 | 11 十一月
12 | 12 十二月
13 |
--------------------------------------------------------------------------------
/tn/chinese/data/money/symbol.tsv:
--------------------------------------------------------------------------------
1 | $ 美元
2 | £ 英镑
3 | £ 英镑
4 | ¥ 元
5 | ¥ 元
6 | ฿ 泰铢
7 | € 欧元
8 | ₹ 印度卢比
9 | ₽ 卢布
10 | CHF 瑞士法郎
11 | R$ 巴西雷亚尔
12 |
--------------------------------------------------------------------------------
/tn/japanese/data/time/noon.tsv:
--------------------------------------------------------------------------------
1 | a m 午前
2 | a.m. 午前
3 | am 午前
4 | A M 午前
5 | AM 午前
6 | p m 午後
7 | p.m. 午後
8 | pm 午後
9 | P M 午後
10 | PM 午後
11 |
--------------------------------------------------------------------------------
/tn/english/data/number/cardinal_number_name.far:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/tn/english/data/number/cardinal_number_name.far
--------------------------------------------------------------------------------
/itn/chinese/data/date/mm.tsv:
--------------------------------------------------------------------------------
1 | 一月 01
2 | 二月 02
3 | 三月 03
4 | 四月 04
5 | 五月 05
6 | 六月 06
7 | 七月 07
8 | 八月 08
9 | 九月 09
10 | 十月 10
11 | 十一月 11
12 | 十二月 12
13 |
--------------------------------------------------------------------------------
/runtime/android/gradle/wrapper/gradle-wrapper.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/runtime/android/gradle/wrapper/gradle-wrapper.jar
--------------------------------------------------------------------------------
/tn/chinese/data/date/mm.tsv:
--------------------------------------------------------------------------------
1 | 01 一月
2 | 02 二月
3 | 03 三月
4 | 04 四月
5 | 05 五月
6 | 06 六月
7 | 07 七月
8 | 08 八月
9 | 09 九月
10 | 10 十月
11 | 11 十一月
12 | 12 十二月
13 |
--------------------------------------------------------------------------------
/tn/english/data/number/cardinal_number_name_au.far:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/tn/english/data/number/cardinal_number_name_au.far
--------------------------------------------------------------------------------
/tn/japanese/data/money/symbol.tsv:
--------------------------------------------------------------------------------
1 | $ ドル
2 | £ ポンド
3 | £ ポンド
4 | ¥ 円
5 | ¥ 円
6 | ฿ バーツ
7 | € ユーロ
8 | ₹ インドルピー
9 | ₽ ルーブル
10 | CHF スイスフラン
11 | R$ レアル
12 |
--------------------------------------------------------------------------------
/tn/japanese/test/data/money.txt:
--------------------------------------------------------------------------------
1 | USD1001 => 千一アメリカドル
2 | HKD1002 => 千二香港ドル
3 | ¥22 => 二十二円
4 | ¥ 22 => 二十二円
5 | $10000 => 一万ドル
6 | CAD => CAD
7 | CAD1001 => 千一カナダドル
--------------------------------------------------------------------------------
/itn/japanese/test/data/math.txt:
--------------------------------------------------------------------------------
1 | 四百四マイナス二 => 404-2
2 | 一マイナス二プラス三十 => 1-2+30
3 | 三対二 => 3:2
4 | 一プラス一イコール二 => 1+1=2
5 | 一カケル二マイナス三プラス四ワル五イコール二 => 1×2-3+4÷5=2
6 | 六から七 => 6~7
--------------------------------------------------------------------------------
/tn/chinese/test/data/time.txt:
--------------------------------------------------------------------------------
1 | 2:02 => 两点零二分
2 | 11:00 => 十一点
3 | 22:58 => 二十二点五十八分
4 | 13:10:36 => 十三点十分三十六秒
5 | 1:02:36am => 上午一点零二分三十六秒
6 | 1:02:36 am => 上午一点零二分三十六秒
7 |
--------------------------------------------------------------------------------
/tn/english/test/data/decimal.txt:
--------------------------------------------------------------------------------
1 | -12.5006 billion => minus twelve point five oh oh six billion
2 | 1 billion => one billion
3 | 1.5 million => one point five million
4 |
--------------------------------------------------------------------------------
/itn/japanese/test/data/measure.txt:
--------------------------------------------------------------------------------
1 | 二千センチメートル每秒 => 2000cm/s
2 | 二万センチメートル每秒 => 20000cm/s
3 | 二万二千センチメートル每秒 => 22000cm/s
4 | 八百メガ秒 => 800ms
5 | 三点五千キロメートル => 3.5km
6 | 百人 => 100人
--------------------------------------------------------------------------------
/tn/english/data/time/suffix.tsv:
--------------------------------------------------------------------------------
1 | p.m. PM
2 | p.m PM
3 | pm PM
4 | P.M. PM
5 | P.M PM
6 | PM PM
7 | a.m. AM
8 | a.m AM
9 | am AM
10 | A.M. AM
11 | A.M AM
12 | AM AM
13 |
--------------------------------------------------------------------------------
/tn/english/test/data/telephone.txt:
--------------------------------------------------------------------------------
1 | +1 123-123-5678-1 => plus one, one two three, one two three, five six seven eight, one
2 | 1-800-GO-U-HAUL => one, eight hundred, GO U HAUL
3 |
--------------------------------------------------------------------------------
/tn/english/test/data/whitelist.txt:
--------------------------------------------------------------------------------
1 | Ph.D. => PHD
2 | Hon. => honorable
3 | Mt. => Mount
4 | Maj. => Major
5 | Rev. => Reverend
6 | Stroudsburg, PA => Stroudsburg, Pennsylvania
7 |
--------------------------------------------------------------------------------
/itn/japanese/data/date/week.tsv:
--------------------------------------------------------------------------------
1 | 月曜日 月
2 | 月曜 月
3 | 火曜日 火
4 | 火曜 火
5 | 水曜日 水
6 | 水曜 水
7 | 木曜日 木
8 | 木曜 木
9 | 金曜日 金
10 | 金曜 金
11 | 土曜日 土
12 | 土曜 土
13 | 日曜日 日
14 | 日曜 日
15 |
--------------------------------------------------------------------------------
/runtime/android/app/src/main/res/mipmap-hdpi/ic_launcher.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/runtime/android/app/src/main/res/mipmap-hdpi/ic_launcher.png
--------------------------------------------------------------------------------
/runtime/android/app/src/main/res/mipmap-mdpi/ic_launcher.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/runtime/android/app/src/main/res/mipmap-mdpi/ic_launcher.png
--------------------------------------------------------------------------------
/tn/chinese/test/data/math.txt:
--------------------------------------------------------------------------------
1 | 1+2 => 一加二
2 | -1+2 => 负一加二
3 | 1+2+3 => 一加二加三
4 | 2 = 1 + 1 => 二等于一加一
5 | 2 ≤ 4 => 二小于等于四
6 | 2 ≥ 1 => 二大于等于一
7 | 2<=4 => 二小于等于四
8 | 2>=1 => 二大于等于一
9 |
--------------------------------------------------------------------------------
/tn/english/data/ordinal/digit.tsv:
--------------------------------------------------------------------------------
1 | first one
2 | second two
3 | third three
4 | fourth four
5 | fifth five
6 | sixth six
7 | seventh seven
8 | eighth eight
9 | ninth nine
10 |
--------------------------------------------------------------------------------
/tn/japanese/data/math/operator.tsv:
--------------------------------------------------------------------------------
1 | × カケル
2 | - マイナス
3 | + プラス
4 | = イコール
5 | > 大なり
6 | < 小なり
7 | ≥ 大なりイコール
8 | ≤ 小なりイコール
9 | >= 大なりイコール
10 | <= 小なりイコール
11 | ÷ ワル
12 | ~ から
--------------------------------------------------------------------------------
/runtime/android/app/src/main/res/mipmap-xhdpi/ic_launcher.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/runtime/android/app/src/main/res/mipmap-xhdpi/ic_launcher.png
--------------------------------------------------------------------------------
/runtime/android/app/src/main/res/mipmap-xxhdpi/ic_launcher.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/runtime/android/app/src/main/res/mipmap-xxhdpi/ic_launcher.png
--------------------------------------------------------------------------------
/runtime/android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/runtime/android/app/src/main/res/mipmap-xxxhdpi/ic_launcher.png
--------------------------------------------------------------------------------
/tn/english/data/number/teen.tsv:
--------------------------------------------------------------------------------
1 | ten 10
2 | eleven 11
3 | twelve 12
4 | thirteen 13
5 | fourteen 14
6 | fifteen 15
7 | sixteen 16
8 | seventeen 17
9 | eighteen 18
10 | nineteen 19
--------------------------------------------------------------------------------
/itn/japanese/test/data/fraction.txt:
--------------------------------------------------------------------------------
1 | 四分の三 => 3/4
2 | 一分の一 => 1/1
3 | 二万分の三 => 3/20000
4 | 二万点三 => 20000.3
5 | ルート三分の一 => 1/√3
6 | 一点六五分の五十 => 50/1.65
7 | 二ルート六分の三 => 3/2√6
8 | 三千分の三 => 3/3000
--------------------------------------------------------------------------------
/itn/japanese/test/data/time.txt:
--------------------------------------------------------------------------------
1 | 一時三十分三秒 => 1時30分3秒
2 | 五時二十分過ぎ => 5時20分過ぎ
3 | 七分 => 7分
4 | 七秒 => 7秒
5 | 七時 => 7時
6 | 八時半頃 => 8時半頃
7 | 十時五分前 => 10時5分前
8 | 正午一分前 => 正午1分前
9 | 正午十分過ぎ => 正午10分過ぎ
--------------------------------------------------------------------------------
/tn/chinese/data/number/teen.tsv:
--------------------------------------------------------------------------------
1 | 1
2 | 2 二
3 | 3 三
4 | 4 四
5 | 5 五
6 | 6 六
7 | 7 七
8 | 8 八
9 | 9 九
10 | 1
11 | 2 二
12 | 3 三
13 | 4 四
14 | 5 五
15 | 6 六
16 | 7 七
17 | 8 八
18 | 9 九
19 |
--------------------------------------------------------------------------------
/tn/chinese/test/data/cardinal.txt:
--------------------------------------------------------------------------------
1 | 110 => 幺幺零
2 | 2% => 百分之二
3 | 127.0.0.1 => 一二七点零点零点一
4 | 010-64035547 => 零一零六四零三五五四七
5 | 尾号1702 => 尾号幺七零二
6 | 尾号是3385 => 尾号是三三八五
7 | 尾号为2349 => 尾号为二三四九
8 |
--------------------------------------------------------------------------------
/tn/english/data/number/quantity_abbr.tsv:
--------------------------------------------------------------------------------
1 | M million
2 | MLN million
3 | m million
4 | mln million
5 | B billion
6 | b billion
7 | BN billion
8 | bn billion
9 | K thousand
10 | k thousand
--------------------------------------------------------------------------------
/tn/japanese/data/number/teen.tsv:
--------------------------------------------------------------------------------
1 | 1
2 | 2 二
3 | 3 三
4 | 4 四
5 | 5 五
6 | 6 六
7 | 7 七
8 | 8 八
9 | 9 九
10 | 1
11 | 2 二
12 | 3 三
13 | 4 四
14 | 5 五
15 | 6 六
16 | 7 七
17 | 8 八
18 | 9 九
19 |
--------------------------------------------------------------------------------
/runtime/android/app/src/main/res/mipmap-hdpi/ic_launcher_round.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/runtime/android/app/src/main/res/mipmap-hdpi/ic_launcher_round.png
--------------------------------------------------------------------------------
/runtime/android/app/src/main/res/mipmap-mdpi/ic_launcher_round.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/runtime/android/app/src/main/res/mipmap-mdpi/ic_launcher_round.png
--------------------------------------------------------------------------------
/runtime/android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/runtime/android/app/src/main/res/mipmap-xhdpi/ic_launcher_round.png
--------------------------------------------------------------------------------
/runtime/android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/runtime/android/app/src/main/res/mipmap-xxhdpi/ic_launcher_round.png
--------------------------------------------------------------------------------
/tn/chinese/data/number/digit.tsv:
--------------------------------------------------------------------------------
1 | 1 一
2 | 2 二
3 | 3 三
4 | 4 四
5 | 5 五
6 | 6 六
7 | 7 七
8 | 8 八
9 | 9 九
10 | 1 一
11 | 2 二
12 | 3 三
13 | 4 四
14 | 5 五
15 | 6 六
16 | 7 七
17 | 8 八
18 | 9 九
19 |
--------------------------------------------------------------------------------
/tn/japanese/data/number/digit.tsv:
--------------------------------------------------------------------------------
1 | 1 一
2 | 2 二
3 | 3 三
4 | 4 四
5 | 5 五
6 | 6 六
7 | 7 七
8 | 8 八
9 | 9 九
10 | 1 一
11 | 2 二
12 | 3 三
13 | 4 四
14 | 5 五
15 | 6 六
16 | 7 七
17 | 8 八
18 | 9 九
19 |
--------------------------------------------------------------------------------
/runtime/android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wenet-e2e/WeTextProcessing/HEAD/runtime/android/app/src/main/res/mipmap-xxxhdpi/ic_launcher_round.png
--------------------------------------------------------------------------------
/tn/chinese/test/data/measure.txt:
--------------------------------------------------------------------------------
1 | 1年后 => 一年后
2 | 2年后 => 两年后
3 | 20年后 => 二十年后
4 | 100余 => 一百余
5 | 10几 => 十几
6 | 2两 => 二两
7 | 1kg => 一千克
8 | 1 kg => 一千克
9 | 10km/h => 每小时十公里
10 | 100兆 => 一百兆
11 |
--------------------------------------------------------------------------------
/tn/japanese/data/number/en_digit.tsv:
--------------------------------------------------------------------------------
1 | 1 いち
2 | 2 に
3 | 3 さん
4 | 4 よん
5 | 5 ご
6 | 6 ろく
7 | 7 なな
8 | 8 はち
9 | 9 きゅう
10 | 1 いち
11 | 2 に
12 | 3 さん
13 | 4 よん
14 | 5 ご
15 | 6 ろく
16 | 7 なな
17 | 8 はち
18 | 9 きゅう
--------------------------------------------------------------------------------
/tn/english/data/time/zone.tsv:
--------------------------------------------------------------------------------
1 | cst CST
2 | c.s.t CST
3 | cet CET
4 | c.e.t CET
5 | pst PST
6 | p.s.t PST
7 | est EST
8 | e.s.t EST
9 | pt PT
10 | p.t PT
11 | et ET
12 | e.t ET
13 | gmt GMT
14 | g.m.t GMT
15 |
--------------------------------------------------------------------------------
/itn/chinese/data/number/digit.tsv:
--------------------------------------------------------------------------------
1 | 一 1
2 | 幺 1
3 | 壹 1
4 | 二 2
5 | 两 2
6 | 贰 2
7 | 三 3
8 | 叁 3
9 | 四 4
10 | 肆 4
11 | 五 5
12 | 伍 5
13 | 六 6
14 | 陆 6
15 | 七 7
16 | 柒 7
17 | 拐 7
18 | 八 8
19 | 捌 8
20 | 九 9
21 | 玖 9
22 |
--------------------------------------------------------------------------------
/tn/chinese/data/default/whitelist.tsv:
--------------------------------------------------------------------------------
1 | B2B B to B
2 | M.V.P M V P
3 | O2O O to O
4 | P2P P to P
5 | BY2 BY TWO
6 | By2 By Two
7 | 10后 一零后
8 | 00后 零零后
9 | 90后 九零后
10 | 80后 八零后
11 | 70后 七零后
12 | 60后 六零后
13 | 50后 五零后
14 |
--------------------------------------------------------------------------------
/tn/japanese/test/data/sport.txt:
--------------------------------------------------------------------------------
1 | 中国韓国3:2 => 中国韓国三対二
2 | 中国韓国3-2 => 中国韓国三対二
3 | 中国韓国3-0 => 中国韓国三対〇
4 | ACミラン3:2 => ACミラン三対二
5 | ACミラン3-2 => ACミラン三対二
6 | 国韓3:2 => 国韓三対二
7 | 2:3 => 二対三
8 | 3:0 => 三対〇
9 | 1:1 => 一対一
10 |
--------------------------------------------------------------------------------
/tn/english/data/electronic/domain.tsv:
--------------------------------------------------------------------------------
1 | .com dot com
2 | .org dot org
3 | .gov dot gov
4 | .uk dot UK
5 | .fr dot FR
6 | .net dot net
7 | .br dot BR
8 | .in dot IN
9 | .ru dot RU
10 | .de dot DE
11 | .it dot IT
12 | .jpg dot jpeg
--------------------------------------------------------------------------------
/tn/english/data/number/fraction.tsv:
--------------------------------------------------------------------------------
1 | ¼ 1/4
2 | ½ 1/2
3 | ¾ 3/4
4 | ⅐ 1/7
5 | ⅑ 1/9
6 | ⅒ 1/10
7 | ⅓ 1/3
8 | ⅔ 2/3
9 | ⅕ 1/5
10 | ⅖ 2/5
11 | ⅗ 3/5
12 | ⅘ 4/5
13 | ⅙ 1/6
14 | ⅚ 5/6
15 | ⅛ 1/8
16 | ⅜ 3/8
17 | ⅝ 5/8
18 | ⅞ 7/8
19 |
--------------------------------------------------------------------------------
/tn/english/data/date/year_suffix.tsv:
--------------------------------------------------------------------------------
1 | A. D AD
2 | A.D AD
3 | a. d AD
4 | a.d AD
5 | a. d. AD
6 | a.d. AD
7 | B. C BC
8 | B.C BC
9 | b. c BC
10 | b.c BC
11 | A. D. AD
12 | A.D. AD
13 | B. C. BC
14 | B.C. BC
15 | b. c. BC
16 | b.c. BC
17 |
--------------------------------------------------------------------------------
/tn/english/data/electronic/words.tsv:
--------------------------------------------------------------------------------
1 | drive
2 | sim
3 | early
4 | access
5 | program
6 | rtx RTX
7 | developer
8 | basepod BASEPOD
9 | cuda CUDA
10 | cv
11 | enterprise
12 | services
13 | nvidia NVIDIA
14 | dgx DGX
15 | pro
16 | help
17 |
--------------------------------------------------------------------------------
/tn/english/test/data/fraction.txt:
--------------------------------------------------------------------------------
1 | 23 4/5 => twenty three and four fifths
2 | 23 4/5th => twenty three and four fifths
3 | 1/3 => one third
4 | 1/2 => one half
5 | 1/4 => one quarter
6 | 2/4 => two quarters
7 | 23/44 => twenty three forty fourths
8 |
--------------------------------------------------------------------------------
/tn/japanese/data/default/whitelist.tsv:
--------------------------------------------------------------------------------
1 | B2B B to B
2 | M.V.P M V P
3 | O2O O to O
4 | P2P P to P
5 | BY2 BY TWO
6 | By2 By Two
7 | R-18 R十八
8 | r-18 R十八
9 | M-1 Mワン
10 | M-1 Mワン
11 | M1 Mワン
12 | M1 Mワン
13 | FOREVER21 FOREVERトゥエンティーワン
14 | @ アットマーク
--------------------------------------------------------------------------------
/itn/chinese/data/license_plate/province.tsv:
--------------------------------------------------------------------------------
1 | 京
2 | 津
3 | 沪
4 | 渝
5 | 冀
6 | 豫
7 | 云
8 | 辽
9 | 黑
10 | 湘
11 | 皖
12 | 鲁
13 | 新
14 | 苏
15 | 浙
16 | 赣
17 | 鄂
18 | 桂
19 | 甘
20 | 晋
21 | 蒙
22 | 陕
23 | 吉
24 | 闽
25 | 贵
26 | 粤
27 | 青
28 | 藏
29 | 川
30 | 宁
31 | 琼
32 |
--------------------------------------------------------------------------------
/tn/japanese/test/data/time.txt:
--------------------------------------------------------------------------------
1 | 3:02 => 三時二分
2 | 3:40am => 午前三時四十分
3 | 3:40a.m. => 午前三時四十分
4 | 3:40AM => 午前三時四十分
5 | 3:40A M => 午前三時四十分
6 | 3:30pm => 午後三時三十分
7 | 3:30 => 三時三十分
8 | 3:30-4:34 => 三時三十分から四時三十四分
9 | 10p.m. => 午後十時
10 | 0:30くらいつく => 〇時三十分くらいつく
--------------------------------------------------------------------------------
/tn/english/data/address/address_word.tsv:
--------------------------------------------------------------------------------
1 | st Street
2 | street Street
3 | expy Expressway
4 | fwy Freeway
5 | hwy Highway
6 | dr Drive
7 | ct Court
8 | ave Avenue
9 | av Avenue
10 | cir Circle
11 | blvd Boulevard
12 | alley Alley
13 | way Way
14 | jct Junction
--------------------------------------------------------------------------------
/itn/japanese/data/default/whitelist.tsv:
--------------------------------------------------------------------------------
1 | 十三湖
2 | 一月三舟
3 | 一日之長
4 | 十八番
5 | 百人一首
6 | 二百十日
7 | 三度笠
8 | 千円札
9 | 二十面相
10 | 七つの海
11 | 四国八十八箇所
12 | 五箇山
13 | 千本鳥居
14 | 五月雨
15 | 六本木ヒルズ
16 | 七つの大罪
17 | 千本格子
18 | 二枚目俳優
19 | 六本木アートナイト
20 | 七人の侍
21 | 五月祭
22 | 七人の姉妹
23 | 十八番目の男
--------------------------------------------------------------------------------
/runtime/android/gradle/wrapper/gradle-wrapper.properties:
--------------------------------------------------------------------------------
1 | distributionBase=GRADLE_USER_HOME
2 | distributionPath=wrapper/dists
3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.2-bin.zip
4 | zipStoreBase=GRADLE_USER_HOME
5 | zipStorePath=wrapper/dists
6 |
--------------------------------------------------------------------------------
/itn/japanese/test/data/number.txt:
--------------------------------------------------------------------------------
1 | 三点一四一五九二六 => 3.1415926
2 | マイナス三点一四一五九二六 => -3.1415926
3 | 一百万千百十一 => 100万1111
4 | 一万千百十一 => 11111
5 | 一万二千三百 => 12300
6 | 二万 => 20000
7 | 三百万 => 300万
8 | 三兆三万 => 3兆3万
9 | 三千万 => 3000万
10 | 一兆三百二十万五千 => 1兆320万5000
11 | 三百二十万五千 => 320万5000
--------------------------------------------------------------------------------
/runtime/cmake/glog.cmake:
--------------------------------------------------------------------------------
1 | FetchContent_Declare(glog
2 | URL https://github.com/google/glog/archive/v0.4.0.zip
3 | URL_HASH MD5=2899b069b8229d49cd65eda5271315ad
4 | )
5 | FetchContent_MakeAvailable(glog)
6 | include_directories(${glog_SOURCE_DIR}/src ${glog_BINARY_DIR})
7 |
--------------------------------------------------------------------------------
/tn/english/data/whitelist/alternatives_all_format.tsv:
--------------------------------------------------------------------------------
1 | st street
2 | st saint
3 | dr doctor
4 | dr drive
5 | mt mount
6 | sr senior
7 | prof professor
8 | mt mountain
9 | sr senior
10 | jr junior
11 | vol volume
12 | rd road
13 | ave avenue
14 | approx approximately
15 |
--------------------------------------------------------------------------------
/tn/english/test/data/ordinal.txt:
--------------------------------------------------------------------------------
1 | 1st => first
2 | 2nd => second
3 | 3rd => third
4 | 5th => fifth
5 | 11th => eleventh
6 | 13th => thirteenth
7 | 20th => twentieth
8 | 21st => twenty first
9 | 30th => thirtieth
10 | 100th => one hundredth
11 | 1000th => one thousandth
12 |
--------------------------------------------------------------------------------
/tn/japanese/test/data/measure.txt:
--------------------------------------------------------------------------------
1 | 2022-2023年 => 二千二十二から二千二十三年
2 | 1-3年 => 一から三年
3 | 22-26年 => 二十二から二十六年
4 | 1~3年 => 一から三年
5 | 1-3平方 => 一から三平方
6 | 0km/h => 〇キロメートル毎時
7 | 10km/h => 十キロメートル毎時
8 | 2m/s => 二メートル毎秒
9 | 100fph/s => 百フィート毎時毎秒
10 | 1-200キロ => 一から二百キロ
11 | 10-11月 => 十から十一月
--------------------------------------------------------------------------------
/itn/chinese/test/data/money.txt:
--------------------------------------------------------------------------------
1 | 一点二五元 => ¥1.25
2 | 一点二五人民币 => CNY1.25
3 | 三十四点五二一元 => ¥34.521
4 | 八九千美元 => $8000~9000
5 | 七八英镑 => £7~8
6 | 十五六卢布 => ₽15-6
7 | 四十五六新台币 => TWD45-6
8 | 七百三四十欧元 => €730-40
9 | 七百三四十马来西亚令吉 => RM730-40
10 | 三千三百八十元五角八分 => ¥3380.58
11 | 二十五元三毛 => ¥25.3
12 |
--------------------------------------------------------------------------------
/runtime/cmake/gflags.cmake:
--------------------------------------------------------------------------------
1 | set(GFLAGS_NAMESPACE "gflags")
2 |
3 | FetchContent_Declare(gflags
4 | URL https://github.com/gflags/gflags/archive/v2.2.2.zip
5 | URL_HASH MD5=ff856ff64757f1381f7da260f79ba79b
6 | )
7 | FetchContent_MakeAvailable(gflags)
8 | include_directories(${gflags_BINARY_DIR}/include)
9 |
--------------------------------------------------------------------------------
/runtime/android/.gitignore:
--------------------------------------------------------------------------------
1 | *.iml
2 | .gradle
3 | /local.properties
4 | /.idea/caches
5 | /.idea/libraries
6 | /.idea/modules.xml
7 | /.idea/workspace.xml
8 | /.idea/navEditor.xml
9 | /.idea/assetWizardSettings.xml
10 | .DS_Store
11 | /build
12 | /captures
13 | .externalNativeBuild
14 | .cxx
15 | local.properties
16 |
--------------------------------------------------------------------------------
/tn/english/test/data/time.txt:
--------------------------------------------------------------------------------
1 | 12:30 a.m. est => twelve thirty AM EST
2 | 2.30 a.m. => two thirty AM
3 | 02.30 a.m. => two thirty AM
4 | 2.00 a.m. => two AM
5 | 2 a.m. => two AM
6 | 02:00 => two o'clock
7 | 02:30 => two thirty
8 | 2:00 => two o'clock
9 | 10:00:05 a.m. => ten hours zero minutes and five seconds AM
10 |
--------------------------------------------------------------------------------
/itn/japanese/test/data/date.txt:
--------------------------------------------------------------------------------
1 | 二千二十四年十月一日 => 2024年10月1日
2 | 二千二十四年十月 => 2024年10月
3 | 五から九日 => 5~9日
4 | 三から四月 => 3~4月
5 | 一月一日 => 1月1日
6 | 二十一世紀 => 21世紀
7 | 七十年代 => 70年代
8 | 七から八年 => 7~8年
9 | 二千九年 => 2009年
10 | 月曜日から金曜日 => 月曜日から金曜日
11 | 二十三年二月二十五日土曜日 => 23年2月25日土曜日
12 | 七月五から九日月曜日から金曜日 => 7月5〜9日月曜日から金曜日
13 | 今年はR六 => 今年は令和6
--------------------------------------------------------------------------------
/tn/english/test/data/money.txt:
--------------------------------------------------------------------------------
1 | $12.05 => twelve point oh five dollars
2 | $12.0500 => twelve point oh five dollars
3 | $1 => one dollar
4 | $1.00 => one dollar
5 | $0.05 => zero point oh five dollars
6 | $1 million => one million dollars
7 | $1.2 million => one point two million dollars
8 | $1.2320 => one point two three two dollars
9 |
--------------------------------------------------------------------------------
/runtime/cmake/gtest.cmake:
--------------------------------------------------------------------------------
1 | FetchContent_Declare(googletest
2 | URL https://github.com/google/googletest/archive/release-1.12.1.zip
3 | URL_HASH MD5=2648d4138129812611cf6b6b4b497a3b
4 | )
5 | if(MSVC)
6 | set(gtest_force_shared_crt ON CACHE BOOL "Always use msvcrt.dll" FORCE)
7 | endif()
8 | FetchContent_MakeAvailable(googletest)
9 |
--------------------------------------------------------------------------------
/tn/chinese/data/money/code.tsv:
--------------------------------------------------------------------------------
1 | A$ 澳元
2 | AED 阿联酋迪拉姆
3 | ARS 阿根廷比索
4 | AUD 澳元
5 | CAD$ 加元
6 | CAD 加元
7 | CHF 瑞士法郎
8 | CNY 人民币
9 | EUR 欧元成员国
10 | GBP 英镑
11 | HK$ 港元
12 | HKD 港元
13 | INR 印度卢比
14 | J¥ 日元
15 | JPY¥ 日元
16 | JPY 日元
17 | KRW 韩元
18 | RUB 俄罗斯卢布
19 | SAR 沙特阿拉伯里亚尔
20 | SEK 瑞典克朗
21 | SGD 新加坡元
22 | THB 泰铢
23 | TRY 土耳其里拉
24 | USD 美元
25 |
--------------------------------------------------------------------------------
/tn/english/data/date/week.tsv:
--------------------------------------------------------------------------------
1 | Mon Monday
2 | Mon. Monday
3 | Tu Tuesday
4 | Tu. Tuesday
5 | Wed Wednesday
6 | Wed. Wednesday
7 | Th Thursday
8 | Th. Thursday
9 | Thur Thursday
10 | Thur. Thursday
11 | Thurs Thursday
12 | Thurs. Thursday
13 | Fri Friday
14 | Fri. Friday
15 | Sat Saturday
16 | Sat. Saturday
17 | Sun Sunday
18 | Sun. Sunday
19 |
--------------------------------------------------------------------------------
/tn/english/data/electronic/symbol.tsv:
--------------------------------------------------------------------------------
1 | . dot
2 | - dash
3 | _ underscore
4 | ! exclamation mark
5 | # number sign
6 | $ dollar sign
7 | % percent
8 | & ampersand
9 | ' quote
10 | * asterisk
11 | + plus
12 | / slash
13 | = equal sign
14 | ? question mark
15 | ^ circumflex
16 | ` right single quote
17 | | vertical bar
18 | ~ tilde
19 | , comma
--------------------------------------------------------------------------------
/itn/japanese/data/date/day.tsv:
--------------------------------------------------------------------------------
1 | 一 1
2 | 二 2
3 | 三 3
4 | 四 4
5 | 五 5
6 | 六 6
7 | 七 7
8 | 八 8
9 | 九 9
10 | 十 10
11 | 十一 11
12 | 十二 12
13 | 十三 13
14 | 十四 14
15 | 十五 15
16 | 十六 16
17 | 十七 17
18 | 十八 18
19 | 十九 19
20 | 二十 20
21 | 二十一 21
22 | 二十二 22
23 | 二十三 23
24 | 二十四 24
25 | 二十五 25
26 | 二十六 26
27 | 二十七 27
28 | 二十八 28
29 | 二十九 29
30 | 三十 30
31 | 三十一 31
--------------------------------------------------------------------------------
/runtime/android/app/src/main/res/mipmap-anydpi-v26/ic_launcher.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
--------------------------------------------------------------------------------
/runtime/android/app/src/main/res/mipmap-anydpi-v26/ic_launcher_round.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
--------------------------------------------------------------------------------
/tn/chinese/data/erhua/whitelist.tsv:
--------------------------------------------------------------------------------
1 | 儿女
2 | 儿子
3 | 儿孙
4 | 女儿
5 | 儿媳
6 | 妻儿
7 | 胎儿
8 | 婴儿
9 | 新生儿
10 | 婴幼儿
11 | 幼儿
12 | 少儿
13 | 小儿
14 | 儿歌
15 | 儿童
16 | 儿科
17 | 托儿所
18 | 孤儿
19 | 儿戏
20 | 儿化
21 | 台儿庄
22 | 鹿儿岛
23 | 正儿八经
24 | 吊儿郎当
25 | 生儿育女
26 | 托儿带女
27 | 养儿防老
28 | 痴儿呆女
29 | 佳儿佳妇
30 | 儿怜兽扰
31 | 儿无常父
32 | 儿不嫌母丑
33 | 儿行千里母担忧
34 | 儿大不由爷
35 | 苏乞儿
36 | 容祖儿
37 |
--------------------------------------------------------------------------------
/tn/japanese/data/measure/units_ja.tsv:
--------------------------------------------------------------------------------
1 | つ
2 | 枚
3 | 部
4 | 台
5 | 杯
6 | 匹
7 | 本
8 | 階
9 | 個
10 | 箇
11 | 个
12 | ヶ
13 | 面
14 | 名
15 | 人
16 | 歳
17 | 才
18 | 冊
19 | 話
20 | 秒
21 | 分
22 | 月
23 | 泊
24 | 時
25 | 時間
26 | 日
27 | ヶ月
28 | 箇月
29 | 年
30 | 日
31 | 週
32 | 倍
33 | 番
34 | 度
35 | 畳
36 | 回
37 | 年前
38 | 年後
39 | 年以内
40 | 平方
41 | 平方メートル
42 | 立方
43 | 立方メートル
44 | キロ
45 | キロメートル
--------------------------------------------------------------------------------
/tn/chinese/test/data/date.txt:
--------------------------------------------------------------------------------
1 | 2008-08-08 => 二零零八年八月八日
2 | 2008/08/08 => 二零零八年八月八日
3 | 2008.08.08 => 二零零八年八月八日
4 | 2008-8-8 => 二零零八年八月八日
5 | 08-08-2008 => 二零零八年八月八日
6 | 2008-08 => 二零零八年八月
7 | 2008/08 => 二零零八年八月
8 | 2008.08 => 二零零八年八月
9 | 08-2008 => 二零零八年八月
10 | 08/2008 => 二零零八年八月
11 | 08.2008 => 二零零八年八月
12 | 08-08 => 八月八日
13 | 08/08 => 八月八日
14 | 08.08 => 八月八日
15 |
--------------------------------------------------------------------------------
/tn/english/data/whitelist/symbol.tsv:
--------------------------------------------------------------------------------
1 | & and
2 | # hash
3 | @ at
4 | § section
5 | ™ trademark
6 | ® registered trademark
7 | © copyright
8 | _ underscore
9 | % percent
10 | * asterisk
11 | + plus
12 | / slash
13 | = equal sign
14 | ^ circumflex
15 | | vertical bar
16 | ~ tilde
17 | $ dollar
18 | £ pound
19 | € euro
20 | ₩ won
21 | ¥ yen
22 | ° degree
23 | º degree
24 |
--------------------------------------------------------------------------------
/tn/english/data/date/month_number.tsv:
--------------------------------------------------------------------------------
1 | 1 january
2 | 2 february
3 | 3 march
4 | 4 april
5 | 5 may
6 | 6 june
7 | 7 july
8 | 8 august
9 | 9 september
10 | 10 october
11 | 11 november
12 | 12 december
13 | 01 january
14 | 02 february
15 | 03 march
16 | 04 april
17 | 05 may
18 | 06 june
19 | 07 july
20 | 08 august
21 | 09 september
22 | 10 october
23 | 11 november
24 | 12 december
--------------------------------------------------------------------------------
/tn/japanese/data/date/d.tsv:
--------------------------------------------------------------------------------
1 | 1 一日
2 | 2 二日
3 | 3 三日
4 | 4 四日
5 | 5 五日
6 | 6 六日
7 | 7 七日
8 | 8 八日
9 | 9 九日
10 | 10 十日
11 | 11 十一日
12 | 12 十二日
13 | 13 十三日
14 | 14 十四日
15 | 15 十五日
16 | 16 十六日
17 | 17 十七日
18 | 18 十八日
19 | 19 十九日
20 | 20 二十日
21 | 21 二十一日
22 | 22 二十二日
23 | 23 二十三日
24 | 24 二十四日
25 | 25 二十五日
26 | 26 二十六日
27 | 27 二十七日
28 | 28 二十八日
29 | 29 二十九日
30 | 30 三十日
31 | 31 三十一日
--------------------------------------------------------------------------------
/tn/japanese/data/money/code.tsv:
--------------------------------------------------------------------------------
1 | A$ 豪ドル
2 | AED UAEディルハム
3 | ARS アルゼンチンペソ
4 | AUD 豪ドル
5 | CAD$ カナダドル
6 | CAD カナダドル
7 | CHF スイスフラン
8 | CNY 人民元
9 | EUR ユーロ
10 | GBP ポンド
11 | HK$ 香港ドル
12 | HKD 香港ドル
13 | INR インドルピー
14 | J¥ 円
15 | JPY¥ 円
16 | JPY 円
17 | KRW ウォン
18 | RUB ロシアルーブル
19 | SAR サウジリヤル
20 | SEK スウェーデンクローナ
21 | SGD シンガポールドル
22 | THB タイバーツ
23 | TRY トルコリラ
24 | USD アメリカドル
25 |
--------------------------------------------------------------------------------
/tn/chinese/data/date/d.tsv:
--------------------------------------------------------------------------------
1 | 1 一日
2 | 2 二日
3 | 3 三日
4 | 4 四日
5 | 5 五日
6 | 6 六日
7 | 7 七日
8 | 8 八日
9 | 9 九日
10 | 10 十日
11 | 11 十一日
12 | 12 十二日
13 | 13 十三日
14 | 14 十四日
15 | 15 十五日
16 | 16 十六日
17 | 17 十七日
18 | 18 十八日
19 | 19 十九日
20 | 20 二十日
21 | 21 二十一日
22 | 22 二十二日
23 | 23 二十三日
24 | 24 二十四日
25 | 25 二十五日
26 | 26 二十六日
27 | 27 二十七日
28 | 28 二十八日
29 | 29 二十九日
30 | 30 三十日
31 | 31 三十一日
32 |
--------------------------------------------------------------------------------
/itn/japanese/data/measure/unit_ja.tsv:
--------------------------------------------------------------------------------
1 | つ
2 | 枚
3 | 部
4 | 台
5 | 杯
6 | 匹
7 | 本
8 | 階
9 | 個
10 | 箇
11 | 円
12 | 个
13 | ヶ
14 | 面
15 | 名
16 | 人
17 | 歳
18 | 才
19 | 冊
20 | 話
21 | 秒
22 | 分
23 | 月
24 | 泊
25 | 時
26 | 時間
27 | 日
28 | ヶ月
29 | 箇月
30 | 年
31 | 日
32 | 週
33 | 倍
34 | 番
35 | 度
36 | 畳
37 | 回
38 | 年前
39 | 年後
40 | 年以内
41 | 平方
42 | 平方メートル
43 | 立方
44 | 立方メートル
45 | キロ
46 | キロメートル
47 | 世紀
48 | 年代
--------------------------------------------------------------------------------
/itn/japanese/data/time/hour.tsv:
--------------------------------------------------------------------------------
1 | 一時 1
2 | 二時 2
3 | 三時 3
4 | 四時 4
5 | 五時 5
6 | 六時 6
7 | 七時 7
8 | 八時 8
9 | 九時 9
10 | 零時 0
11 | 一時 1
12 | 二時 2
13 | 三時 3
14 | 四時 4
15 | 五時 5
16 | 六時 6
17 | 七時 7
18 | 八時 8
19 | 九時 9
20 | 十時 10
21 | 十一時 11
22 | 十二時 12
23 | 十三時 13
24 | 十四時 14
25 | 十五時 15
26 | 十六時 16
27 | 十七時 17
28 | 十八時 18
29 | 十九時 19
30 | 二十時 20
31 | 二十一時 21
32 | 二十二時 22
33 | 二十三時 23
34 | 二十四時 24
--------------------------------------------------------------------------------
/tn/chinese/data/date/dd.tsv:
--------------------------------------------------------------------------------
1 | 01 一日
2 | 02 二日
3 | 03 三日
4 | 04 四日
5 | 05 五日
6 | 06 六日
7 | 07 七日
8 | 08 八日
9 | 09 九日
10 | 10 十日
11 | 11 十一日
12 | 12 十二日
13 | 13 十三日
14 | 14 十四日
15 | 15 十五日
16 | 16 十六日
17 | 17 十七日
18 | 18 十八日
19 | 19 十九日
20 | 20 二十日
21 | 21 二十一日
22 | 22 二十二日
23 | 23 二十三日
24 | 24 二十四日
25 | 25 二十五日
26 | 26 二十六日
27 | 27 二十七日
28 | 28 二十八日
29 | 29 二十九日
30 | 30 三十日
31 | 31 三十一日
32 |
--------------------------------------------------------------------------------
/tn/english/data/number/thousand.tsv:
--------------------------------------------------------------------------------
1 | thousand
2 | million
3 | billion
4 | trillion
5 | quadrillion
6 | quintillion
7 | sextillion
8 | septillion
9 | octillion
10 | nonillion
11 | decillion
12 | undecillion
13 | duodecillion
14 | tredecillion
15 | quattuordecillion
16 | quindecillion
17 | sexdecillion
18 | septendecillion
19 | octodecillion
20 | novemdecillion
21 | vigintillion
22 | centillion
--------------------------------------------------------------------------------
/tn/english/data/whitelist/lj_speech.tsv:
--------------------------------------------------------------------------------
1 | Mr. mister
2 | Mrs. misses
3 | Dr. doctor
4 | Drs. doctors
5 | Co. company
6 | Lt. lieutenant
7 | Sgt. sergeant
8 | St. saint
9 | Jr. junior
10 | Maj. major
11 | Hon. honorable
12 | Gov. governor
13 | Capt. captain
14 | Esq. esquire
15 | Gen. general
16 | Ltd. limited
17 | Rev. reverend
18 | Col. colonel
19 | Mt. mount
20 | Ft. fort
21 | etc. et cetera
22 |
--------------------------------------------------------------------------------
/tn/chinese/data/time/hour.tsv:
--------------------------------------------------------------------------------
1 | 1 一点
2 | 2 两点
3 | 3 三点
4 | 4 四点
5 | 5 五点
6 | 6 六点
7 | 7 七点
8 | 8 八点
9 | 9 九点
10 | 00 零点
11 | 01 一点
12 | 02 两点
13 | 03 三点
14 | 04 四点
15 | 05 五点
16 | 06 六点
17 | 07 七点
18 | 08 八点
19 | 09 九点
20 | 10 十点
21 | 11 十一点
22 | 12 十二点
23 | 13 十三点
24 | 14 十四点
25 | 15 十五点
26 | 16 十六点
27 | 17 十七点
28 | 18 十八点
29 | 19 十九点
30 | 20 二十点
31 | 21 二十一点
32 | 22 二十二点
33 | 23 二十三点
34 | 24 二十四点
35 |
--------------------------------------------------------------------------------
/tn/english/test/data/measure.txt:
--------------------------------------------------------------------------------
1 | -12kg => negative twelve kilograms
2 | 1kg => one kilogram
3 | .5kg => point five kilograms
4 | 3.5 cm² => three point five square centimeters
5 | 2788 San Tomas Expy, Santa Clara, CA 95051 => twenty seven eighty eight San Tomas Expressway, Santa Clara, California nine five oh five one
6 | 2-3 °C => two to three degrees Celsius
7 | 2 * 10 μm => two times ten micrometers
8 |
--------------------------------------------------------------------------------
/itn/chinese/data/time/hour.tsv:
--------------------------------------------------------------------------------
1 | 一点 1
2 | 两点 2
3 | 三点 3
4 | 四点 4
5 | 五点 5
6 | 六点 6
7 | 七点 7
8 | 八点 8
9 | 九点 9
10 | 零点 00
11 | 一点 01
12 | 两点 02
13 | 三点 03
14 | 四点 04
15 | 五点 05
16 | 六点 06
17 | 七点 07
18 | 八点 08
19 | 九点 09
20 | 十点 10
21 | 十一点 11
22 | 十二点 12
23 | 十三点 13
24 | 十四点 14
25 | 十五点 15
26 | 十六点 16
27 | 十七点 17
28 | 十八点 18
29 | 十九点 19
30 | 二十点 20
31 | 二十一点 21
32 | 二十二点 22
33 | 二十三点 23
34 | 二十四点 24
35 |
--------------------------------------------------------------------------------
/tn/japanese/data/time/hour.tsv:
--------------------------------------------------------------------------------
1 | 0 〇時
2 | 1 一時
3 | 2 二時
4 | 3 三時
5 | 4 四時
6 | 5 五時
7 | 6 六時
8 | 7 七時
9 | 8 八時
10 | 9 九時
11 | 00 〇時
12 | 01 一時
13 | 02 二時
14 | 03 三時
15 | 04 四時
16 | 05 五時
17 | 06 六時
18 | 07 七時
19 | 08 八時
20 | 09 九時
21 | 10 十時
22 | 11 十一時
23 | 12 十二時
24 | 13 十三時
25 | 14 十四時
26 | 15 十五時
27 | 16 十六時
28 | 17 十七時
29 | 18 十八時
30 | 19 十九時
31 | 20 二十時
32 | 21 二十一時
33 | 22 二十二時
34 | 23 二十三時
35 | 24 二十四時
36 |
--------------------------------------------------------------------------------
/tn/english/data/date/day.tsv:
--------------------------------------------------------------------------------
1 | one
2 | two
3 | three
4 | four
5 | five
6 | six
7 | seven
8 | eight
9 | nine
10 | ten
11 | eleven
12 | twelve
13 | thirteen
14 | fourteen
15 | fifteen
16 | sixteen
17 | seventeen
18 | eighteen
19 | nineteen
20 | twenty
21 | twenty one
22 | twenty two
23 | twenty three
24 | twenty four
25 | twenty five
26 | twenty six
27 | twenty seven
28 | twenty eight
29 | twenty nine
30 | thirty
31 | thirty one
--------------------------------------------------------------------------------
/runtime/android/app/src/main/java/com/mobvoi/WeTextProcessing/WeTextProcessing.java:
--------------------------------------------------------------------------------
1 | package com.mobvoi.WeTextProcessing;
2 |
3 | public class WeTextProcessing { // Java-side declarations for the native "wetextprocessing" library.
4 | // Load the native library once, when this class is initialized.
5 | static {
6 | System.loadLibrary("wetextprocessing");
7 | }
8 | // The methods below are declared native: their bodies live in the loaded library, not in this file.
9 | public static native void init(String modelDir); // NOTE(review): presumably modelDir holds the TN/ITN resources - confirm.
10 | public static native String normalize(String input); // NOTE(review): presumably text normalization of input - confirm.
11 | public static native String inverse_normalize(String input); // Name is bound to the native side; do not rename here alone.
12 | }
13 |
--------------------------------------------------------------------------------
/itn/chinese/test/data/normalizer_disable_standalone_number_enable_0_to_9.txt:
--------------------------------------------------------------------------------
1 | 二点五平方电线,五,五十五,疑是银河落九天,十二块五 => 2.5平方电线,五,五十五,疑是银河落9天,12块五
2 | 三百九十九三盒 => 三百九十九3盒
3 | 这是八百一千 => 这是八百一千
4 | 这是二十九千 => 这是二十九千
5 | 这是九十九九千 => 这是九十九九千
6 | 这是十二一千 => 这是十二一千
7 | 这是零百 => 这是零百
8 | 这是零千 => 这是零千
9 | 这是一百一个,一千两位,一万三天 => 这是100 1个,1000 2位,10000 3天
10 | 这是九百九周,九千九月,九万九年 => 这是900 9周,9000 9月,90000 9年
11 | 十三五规划期间获得了十几万和几十万甚至二十几万的投资 => 十三五规划期间获得了十几万和几十万甚至二十几万的投资
12 |
--------------------------------------------------------------------------------
/itn/chinese/test/data/date.txt:
--------------------------------------------------------------------------------
1 | 二零零八年八月八日 => 2008/08/08
2 | 二零零八年八月 => 2008/08
3 | 八月八日 => 08/08
4 | 二零零八年 => 2008年
5 | 二千年 => 2000年
6 | 两千零五年八月五号 => 2005年08/05
7 | 公元一九九六年 => 公元1996年
8 | 公元一六三年 => 公元163年
9 | 八五年二月二十七日 => 85/02/27
10 | 这件事发生在二零一九年九月十二日的晚上或者两千零五年八月五号的晚上或者在八五年二月二十七日的晚上或者在公元一九九六年或者在八六年八月十八日或者于一九九五年三月一日或者在公元一六三年或者在零六年一月二号 => 这件事发生在2019/09/12的晚上或者2005年08/05的晚上或者在85/02/27的晚上或者在公元1996年或者在86/08/18或者于1995/03/01或者在公元163年或者在06/01/02
11 |
--------------------------------------------------------------------------------
/itn/chinese/test/data/time.txt:
--------------------------------------------------------------------------------
1 | 两点零二分 => 2:02
2 | 十三点十分三十六秒 => 13:10:36
3 | 上午一点零二分三十六秒 => 1:02:36a.m.
4 | 早上一点零二 => 1:02a.m.
5 | 早上一点零二分 => 1:02a.m.
6 | 晚上一点零二分 => 1:02p.m.
7 | 零点十分 => 00:10
8 | 下午八点零五分 => 8:05p.m.
9 | 八点零五分 => 8:05
10 | 八点三十 => 8:30
11 | 八点半 => 8:30
12 | 时间上是零点十分登机,八点五分下飞机,八点三十去吃早饭.你是说八点零五分下飞机还是八点零五下飞机?我可能下午三点四十,或者上午十点半再或者下午三点四十一分去找你 => 时间上是00:10登机,8.5分下飞机,8:30去吃早饭.你是说8:05下飞机还是8:05下飞机?我可能3:40p.m.,或者10:30a.m.再或者3:41p.m.去找你
13 |
--------------------------------------------------------------------------------
/runtime/android/build.gradle:
--------------------------------------------------------------------------------
1 | buildscript { // repositories and classpath used to resolve the build plugins themselves
2 | repositories {
3 | google()
4 | mavenCentral() // replaces jcenter(), which has been sunset and is deprecated in Gradle
5 | }
6 | dependencies {
7 | classpath 'com.android.tools.build:gradle:4.2.2'
8 | }
9 | }
10 |
11 | allprojects { // repositories used to resolve dependencies of every project in this build
12 | repositories {
13 | google()
14 | mavenCentral() // replaces jcenter(), which has been sunset and is deprecated in Gradle
15 | maven { url 'https://jitpack.io' }
16 | }
17 | }
18 |
19 | task clean(type: Delete) { // removes the root project's build directory
20 | delete rootProject.buildDir
21 | }
--------------------------------------------------------------------------------
/tn/japanese/data/sport/club.tsv:
--------------------------------------------------------------------------------
1 | FCバルセロナ
2 | レアル・マドリード
3 | マンチェスター・ユナイテッド
4 | マンチェスター・シティ
5 | バイエルン・ミュンヘン
6 | リヴァプールFC
7 | ユヴェントスFC
8 | パリ・サンジェルマンFC
9 | ACミラン
10 | インテル・ミラノ
11 | チェルシーFC
12 | アーセナルFC
13 | ボルシア・ドルトムント
14 | トッテナム・ホットスパーFC
15 | ASローマ
16 | アトレティコ・マドリード
17 | SSCナポリ
18 | バレンシアCF
19 | オリンピック・リヨン
20 | オリンピック・マルセイユ
21 | SLベンフィカ
22 | FCポルト
23 | アヤックス・アムステルダム
24 | フェイエノールト
25 | SSラツィオ
26 | ビジャレアルCF
27 | セビージャFC
28 | スパルタク・モスクワ
29 | ゼニト・サンクトペテルブルク
30 | セルティックFC
31 | レンジャーズFC
--------------------------------------------------------------------------------
/tn/english/data/whitelist/alternatives.tsv:
--------------------------------------------------------------------------------
1 | Hon. Honorable
2 | Mr. Mister
3 | Mrs. Misses
4 | Ms. Miss
5 | Mr Mister
6 | Mrs Misses
7 | Ms Miss
8 | &Co. and Co.
9 | &Co. and Company
10 | Mon Monday
11 | Tu Tuesday
12 | Wed Wednesday
13 | Th Thursday
14 | Thur Thursday
15 | Thurs Thursday
16 | Fri Friday
17 | Sat Saturday
18 | Sun Sunday
19 | = equals
20 | # number
21 | No. number
22 | NO. number
23 | NO. number
24 | No. number
25 | VOL. Volume
26 | Vol. Volume
27 | TV Television
28 |
--------------------------------------------------------------------------------
/tn/chinese/test/data/number.txt:
--------------------------------------------------------------------------------
1 | -1 => 负一
2 | 0 => 零
3 | 1 => 一
4 | 2 => 二
5 | 10 => 十
6 | 11 => 十一
7 | 20 => 二十
8 | 100 => 一百
9 | 101 => 一百零一
10 | 111 => 一百一十一
11 | 200 => 两百
12 | 1000 => 一千
13 | 1001 => 一千零一
14 | 1011 => 一千零一十一
15 | 1111 => 一千一百一十一
16 | 2000 => 两千
17 | 10000 => 一万
18 | 10001 => 一万零一
19 | 10011 => 一万零一十一
20 | 10111 => 一万零一百一十一
21 | 11111 => 一万一千一百一十一
22 | 20000 => 两万
23 | 101111 => 十万一千一百一十一
24 | 1001111 => 一百万一千一百一十一
25 | 10001111 => 一千万一千一百一十一
26 | 1.01 => 一点零一
27 | 1.11 => 一点一一
28 |
--------------------------------------------------------------------------------
/itn/japanese/data/char/punctuations_ja.tsv:
--------------------------------------------------------------------------------
1 | !
2 | ?
3 | 。
4 | 。
5 | "
6 | #
7 | $
8 | %
9 | &
10 | '
11 | (
12 | )
13 | *
14 | +
15 | ,
16 | -
17 | /
18 | :
19 | ;
20 | <
21 | =
22 | >
23 | @
24 | [
25 | \
26 | ]
27 | ^
28 | _
29 | `
30 | {
31 | |
32 | }
33 | ~
34 | ⦅
35 | ⦆
36 | 「
37 | 」
38 | 、
39 | 、
40 | 〃
41 | 》
42 | 「
43 | 」
44 | 『
45 | 』
46 | 【
47 | 】
48 | 〔
49 | 〕
50 | 〖
51 | 〗
52 | 〘
53 | 〙
54 | 〚
55 | 〛
56 | 〜
57 | 〝
58 | 〞
59 | 〟
60 | 〰
61 | –
62 | —
63 | ‘
64 | ’
65 | ‛
66 | “
67 | ”
68 | „
69 | ‟
70 | …
71 | ‧
72 | ﹏
--------------------------------------------------------------------------------
/tn/japanese/data/char/punctuations_ja.tsv:
--------------------------------------------------------------------------------
1 | !
2 | ?
3 | 。
4 | 。
5 | "
6 | #
7 | $
8 | %
9 | &
10 | '
11 | (
12 | )
13 | *
14 | +
15 | ,
16 | -
17 | /
18 | :
19 | ;
20 | <
21 | =
22 | >
23 | @
24 | [
25 | \
26 | ]
27 | ^
28 | _
29 | `
30 | {
31 | |
32 | }
33 | ~
34 | ⦅
35 | ⦆
36 | 「
37 | 」
38 | 、
39 | 、
40 | 〃
41 | 》
42 | 「
43 | 」
44 | 『
45 | 』
46 | 【
47 | 】
48 | 〔
49 | 〕
50 | 〖
51 | 〗
52 | 〘
53 | 〙
54 | 〚
55 | 〛
56 | 〜
57 | 〝
58 | 〞
59 | 〟
60 | 〰
61 | –
62 | —
63 | ‘
64 | ’
65 | ‛
66 | “
67 | ”
68 | „
69 | ‟
70 | …
71 | ‧
72 | ﹏
--------------------------------------------------------------------------------
/runtime/android/app/src/test/java/com/mobvoi/WeTextProcessing/ExampleUnitTest.java:
--------------------------------------------------------------------------------
1 | package com.mobvoi.WeTextProcessing;
2 |
3 | import org.junit.Test;
4 |
5 | import static org.junit.Assert.*;
6 |
7 | /**
8 | * Example local unit test, which will execute on the development machine (host).
9 | * It holds one placeholder test that only checks integer addition.
10 | * @see <a href="http://d.android.com/tools/testing">Testing documentation</a>
11 | */
12 | public class ExampleUnitTest {
13 | @Test
14 | public void addition_isCorrect() {
15 | assertEquals(4, 2 + 2); // expected value first, then the actual expression
16 | }
17 | }
--------------------------------------------------------------------------------
/tn/chinese/data/char/punctuations_zh.tsv:
--------------------------------------------------------------------------------
1 | –
2 | —
3 | ‘
4 | ’
5 | ‛
6 | “
7 | ”
8 | „
9 | ‟
10 | …
11 | ‧
12 | 、
13 | 。
14 | 〃
15 | 《
16 | 》
17 | 「
18 | 」
19 | 『
20 | 』
21 | 【
22 | 】
23 | 〔
24 | 〕
25 | 〖
26 | 〗
27 | 〘
28 | 〙
29 | 〚
30 | 〛
31 | 〜
32 | 〝
33 | 〞
34 | 〟
35 | 〰
36 | ﹏
37 | !
38 | "
39 | #
40 | %
41 | &
42 | '
43 | (
44 | )
45 | *
46 | +
47 | ,
48 | -
49 | /
50 | :
51 | ;
52 | <
53 | =
54 | >
55 | ?
56 | @
57 | [
58 | \
59 | ]
60 | ^
61 | _
62 | `
63 | {
64 | |
65 | }
66 | ~
67 | ⦅
68 | ⦆
69 | 。
70 | 「
71 | 」
72 | 、
73 | $
74 |
--------------------------------------------------------------------------------
/tn/english/data/date/month_abbr.tsv:
--------------------------------------------------------------------------------
1 | jan january
2 | Jan january
3 | JAN january
4 | feb february
5 | Feb february
6 | FEB february
7 | mar march
8 | Mar march
9 | MAR march
10 | apr april
11 | Apr april
12 | APR april
13 | jun june
14 | Jun june
15 | JUN june
16 | jul july
17 | Jul july
18 | JUL july
19 | aug august
20 | Aug august
21 | AUG august
22 | sep september
23 | Sep september
24 | SEP september
25 | sept september
26 | Sept september
27 | SEPT september
28 | oct october
29 | Oct october
30 | OCT october
31 | nov november
32 | Nov november
33 | NOV november
34 | dec december
35 | Dec december
36 | DEC december
37 |
--------------------------------------------------------------------------------
/tn/english/test/data/date.txt:
--------------------------------------------------------------------------------
1 | 1219 => twelve nineteen
2 | 2999 => twenty nine ninety nine
3 | '70s => seventies
4 | 2024 B.C => twenty twenty four BC
5 | 1H23 => the first half of twenty three
6 | 3Q22 => the third quarter of twenty two
7 | jan. 5, 2012 => the fifth of january , twenty twelve
8 | jan. 5 => the fifth of january
9 | 5 january 2012 => the fifth of january twenty twelve
10 | 2012-01-05 => the fifth of january twenty twelve
11 | 2012.01.05 => the fifth of january twenty twelve
12 | 2012/01/05 => the fifth of january twenty twelve
13 | 2012 => twenty twelve
14 | 2024-05-06 => the sixth of may twenty twenty four
15 |
--------------------------------------------------------------------------------
/itn/chinese/data/number/special_dash.tsv:
--------------------------------------------------------------------------------
1 | 一二 1-2
2 | 二三 2-3
3 | 三四 3-4
4 | 三五 3-5
5 | 四五 4-5
6 | 五六 5-6
7 | 六七 6-7
8 | 七八 7-8
9 | 八九 8-9
10 | 一二十 10-20
11 | 二三十 20-30
12 | 三四十 30-40
13 | 三五十 30-50
14 | 四五十 40-50
15 | 五六十 50-60
16 | 六七十 60-70
17 | 七八十 70-80
18 | 八九十 80-90
19 | 一二百 100-200
20 | 一两百 100-200
21 | 二三百 200-300
22 | 两三百 200-300
23 | 三四百 300-400
24 | 三五百 300-500
25 | 四五百 400-500
26 | 五六百 500-600
27 | 六七百 600-700
28 | 七八百 700-800
29 | 八九百 800-900
30 | 一二千 1000-2000
31 | 一两千 1000-2000
32 | 二三千 2000-3000
33 | 两三千 2000-3000
34 | 三四千 3000-4000
35 | 三五千 3000-5000
36 | 四五千 4000-5000
37 | 五六千 5000-6000
38 | 六七千 6000-7000
39 | 七八千 7000-8000
40 | 八九千 8000-9000
41 |
--------------------------------------------------------------------------------
/itn/chinese/data/number/special_tilde.tsv:
--------------------------------------------------------------------------------
1 | 一二 1~2
2 | 二三 2~3
3 | 三四 3~4
4 | 三五 3~5
5 | 四五 4~5
6 | 五六 5~6
7 | 六七 6~7
8 | 七八 7~8
9 | 八九 8~9
10 | 一二十 10~20
11 | 二三十 20~30
12 | 三四十 30~40
13 | 三五十 30~50
14 | 四五十 40~50
15 | 五六十 50~60
16 | 六七十 60~70
17 | 七八十 70~80
18 | 八九十 80~90
19 | 一二百 100~200
20 | 一两百 100~200
21 | 二三百 200~300
22 | 两三百 200~300
23 | 三四百 300~400
24 | 三五百 300~500
25 | 四五百 400~500
26 | 五六百 500~600
27 | 六七百 600~700
28 | 七八百 700~800
29 | 八九百 800~900
30 | 一二千 1000~2000
31 | 一两千 1000~2000
32 | 二三千 2000~3000
33 | 两三千 2000~3000
34 | 三四千 3000~4000
35 | 三五千 3000~5000
36 | 四五千 4000~5000
37 | 五六千 5000~6000
38 | 六七千 6000~7000
39 | 七八千 7000~8000
40 | 八九千 8000~9000
41 |
--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | select = B,C,E,F,P,T4,W,B9
3 | max-line-length = 80
4 | # C408 ignored because we like the dict keyword argument syntax
5 | # E501 is not flexible enough, we're using B950 instead
6 | ignore =
7 | E203,E305,E402,E501,E721,E741,F403,F405,F821,F841,F999,W503,W504,C408,E302,W291,E303,
8 | # shebang has extra meaning in fbcode lints, so I think it's not worth trying
9 | # to line this up with executable bit
10 | EXE001, EXE002,
11 | # these ignores are from flake8-bugbear; please fix!
12 | B007,B008,
13 | # these ignores are from flake8-comprehensions; please fix!
14 | C400,C401,C402,C403,C404,C405,C407,C411,C413,C414,C415
15 |
--------------------------------------------------------------------------------
/tn/chinese/test/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/itn/chinese/test/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/tn/english/data/date/month_name.tsv:
--------------------------------------------------------------------------------
1 | january january
2 | february february
3 | march march
4 | april april
5 | may may
6 | june june
7 | july july
8 | august august
9 | september september
10 | october october
11 | november november
12 | december december
13 | January january
14 | JANUARY january
15 | February february
16 | FEBRUARY february
17 | March march
18 | MARCH march
19 | April april
20 | APRIL april
21 | June june
22 | JUNE june
23 | July july
24 | JULY july
25 | August august
26 | AUGUST august
27 | September september
28 | SEPTEMBER september
29 | October october
30 | OCTOBER october
31 | November november
32 | NOVEMBER november
33 | December december
34 | DECEMBER december
35 |
--------------------------------------------------------------------------------
/tn/english/data/time/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/runtime/android/app/src/main/res/values/colors.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 | #FFBB86FC
4 | #FF6200EE
5 | #FF3700B3
6 | #FF03DAC5
7 | #FF018786
8 | #FF000000
9 | #FFFFFFFF
10 |
11 | #f16d7a
12 | #b7d28d
13 | #b8f1ed
14 | #b7d28d
15 | #b8f1ed
16 |
--------------------------------------------------------------------------------
/tn/english/data/measure/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/tn/english/data/number/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/tn/japanese/test/data/math.txt:
--------------------------------------------------------------------------------
1 | 1-2=-1 => 一マイナス二イコールマイナス一
2 | 1+2>2 => 一プラス二大なり二
3 | 4×5=20 => 四カケル五イコール二十
4 | 4x5=20 => 四x五イコール二十
5 | 1~100 => 一から百
6 | 3>=3 => 三大なりイコール三
7 | 3≥2≥1 => 三大なりイコール二大なりイコール一
8 | 3 ≥ 2 => 三大なりイコール二
9 | 2>1 => 二大なり一
10 | 2 > 1 => 二大なり一
11 | 1<2 => 一小なり二
12 | 5×4÷2+3-6 ≥ 7 => 五カケル四ワル二プラス三マイナス六大なりイコール七
13 | 1≥0 => 一大なりイコール〇
14 | 1+3+2+3>3 => 一プラス三プラス二プラス三大なり三
15 | 1-1 => 一マイナス一
16 | 1+1 => 一プラス一
17 | abc/3 => abc/三
18 | 1 + 1 => 一プラス一
19 | ±5 => プラスマイナス五
20 | abc+5 => abcプラス五
21 | 1+1=2 => 一プラス一イコール二
22 | 1+2+3=6 => 一プラス二プラス三イコール六
23 | 911-1234-5678 => 九百十一マイナス千二百三十四マイナス五千六百七十八
24 | 112-1234-5678 => 百十二マイナス千二百三十四マイナス五千六百七十八
25 | 5-15 => 五マイナス十五
26 | 4/900000000を切る => 九億分の四を切る
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | exclude: '.*\.(txt|tsv)$'
2 | repos:
3 | - repo: https://github.com/pre-commit/pre-commit-hooks
4 | rev: v4.5.0
5 | hooks:
6 | - id: trailing-whitespace
7 | - repo: https://github.com/pre-commit/mirrors-yapf
8 | rev: 'v0.32.0'
9 | hooks:
10 | - id: yapf
11 | - repo: https://github.com/pycqa/flake8
12 | rev: '3.8.2'
13 | hooks:
14 | - id: flake8
15 | - repo: https://github.com/pre-commit/mirrors-clang-format
16 | rev: 'v17.0.6'
17 | hooks:
18 | - id: clang-format
19 | exclude: '.*\.(json|java|js|m|mm|proto)$'
20 | - repo: https://github.com/cpplint/cpplint
21 | rev: '1.6.1'
22 | hooks:
23 | - id: cpplint
24 |
--------------------------------------------------------------------------------
/itn/chinese/data/time/minute.tsv:
--------------------------------------------------------------------------------
1 | 半 30
2 | 零一 01
3 | 零二 02
4 | 零三 03
5 | 零四 04
6 | 零五 05
7 | 零六 06
8 | 零七 07
9 | 零八 08
10 | 零九 09
11 | 十 10
12 | 十一 11
13 | 十二 12
14 | 十三 13
15 | 十四 14
16 | 十五 15
17 | 十六 16
18 | 十七 17
19 | 十八 18
20 | 十九 19
21 | 二十 20
22 | 二十一 21
23 | 二十二 22
24 | 二十三 23
25 | 二十四 24
26 | 二十五 25
27 | 二十六 26
28 | 二十七 27
29 | 二十八 28
30 | 二十九 29
31 | 三十 30
32 | 三十一 31
33 | 三十二 32
34 | 三十三 33
35 | 三十四 34
36 | 三十五 35
37 | 三十六 36
38 | 三十七 37
39 | 三十八 38
40 | 三十九 39
41 | 四十 40
42 | 四十一 41
43 | 四十二 42
44 | 四十三 43
45 | 四十四 44
46 | 四十五 45
47 | 四十六 46
48 | 四十七 47
49 | 四十八 48
50 | 四十九 49
51 | 五十 50
52 | 五十一 51
53 | 五十二 52
54 | 五十三 53
55 | 五十四 54
56 | 五十五 55
57 | 五十六 56
58 | 五十七 57
59 | 五十八 58
60 | 五十九 59
61 |
--------------------------------------------------------------------------------
/runtime/test/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | enable_testing()
2 | link_libraries(gtest_main gmock) # applies to every target added after this line in this directory
3 | include(GoogleTest) # provides gtest_discover_tests()
4 |
5 | add_executable(string_test string_test.cc)
6 | target_link_libraries(string_test PUBLIC wetext_utils)
7 | gtest_discover_tests(string_test)
8 |
9 | if(NOT MSVC)
10 | # token_parser_test uses the macro to access the private members
11 | # NOTE(review): presumably that macro is why this target is skipped under MSVC - confirm
12 | add_executable(token_parser_test token_parser_test.cc)
13 | target_link_libraries(token_parser_test PUBLIC wetext_processor)
14 | gtest_discover_tests(token_parser_test)
15 | endif()
16 |
17 | add_executable(processor_test processor_test.cc)
18 | target_link_libraries(processor_test PUBLIC wetext_processor)
19 | gtest_discover_tests(processor_test)
19 |
--------------------------------------------------------------------------------
/tn/english/data/address/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # Visual Studio Code files
7 | .vscode
8 | .vs
9 |
10 | # PyCharm files
11 | .idea
12 |
13 | # Eclipse Project settings
14 | *.*project
15 | .settings
16 |
17 | # Sublime Text settings
18 | *.sublime-workspace
19 | *.sublime-project
20 |
21 | # Editor temporaries
22 | *.swn
23 | *.swo
24 | *.swp
25 | *.swm
26 | *~
27 |
28 | # IPython notebook checkpoints
29 | .ipynb_checkpoints
30 |
31 | # macOS dir files
32 | .DS_Store
33 |
34 | # Clangd files
35 | .cache
36 | compile_commands.json
37 |
38 |
39 | # Setup files
40 | WeTextProcessing.egg-info/
41 | build/
42 | dist/
43 | tn/*.far
44 | itn/*.far
45 |
--------------------------------------------------------------------------------
/itn/chinese/test/data/number.txt:
--------------------------------------------------------------------------------
1 | 负一 => -1
2 | 零 => 0
3 | 一 => 1
4 | 二 => 2
5 | 十 => 10
6 | 十一 => 11
7 | 二十 => 20
8 | 一百 => 100
9 | 一百零一 => 101
10 | 一百一十一 => 111
11 | 两百 => 200
12 | 一千 => 1000
13 | 一千零一 => 1001
14 | 一千零一十一 => 1011
15 | 一千一百一十一 => 1111
16 | 两千 => 2000
17 | 两千零十 => 2010
18 | 两千零一十 => 2010
19 | 两千零十二 => 2012
20 | 两千零一十二 => 2012
21 | 两千零二十 => 2020
22 | 一万 => 10000
23 | 一万零一 => 10001
24 | 一万零一十一 => 10011
25 | 一万零一百一十一 => 10111
26 | 一万一千一百一十一 => 11111
27 | 两万 => 20000
28 | 十万一千一百一十一 => 101111
29 | 一百万一千一百一十一 => 100万1111
30 | 一千万一千一百一十一 => 1000万1111
31 | 一点一一 => 1.11
32 | 三点一四一五九二六 => 3.1415926
33 | 负三点一四一五九二六 => -3.1415926
34 | 一万两千三百 => 12300
35 | 小数三点一四一五九二六和负三点一四一五九二六是不是经常见到 => 小数3.1415926和-3.1415926是不是经常见到
36 |
--------------------------------------------------------------------------------
/runtime/patch/openfst/src/CMakeLists.txt:
--------------------------------------------------------------------------------
1 |
2 | #-DHAVE_CONFIG_H -I./../include -fno-exceptions -funsigned-char -std=c++11 -MT symbol-table.lo -MD -MP -MF .deps/symbol-table.Tpo -c symbol-table.cc -fno-common -DPIC -o .libs/symbol-table.o
3 |
4 | include_directories(./include/)
5 | install(DIRECTORY include/ DESTINATION include/
6 | FILES_MATCHING PATTERN "*.h")
7 |
8 | add_subdirectory(lib)
9 |
10 | if(HAVE_SCRIPT)
11 | add_subdirectory(script)
12 | endif(HAVE_SCRIPT)
13 |
14 | if(HAVE_BIN)
15 | add_subdirectory(bin)
16 | endif(HAVE_BIN)
17 |
18 | add_subdirectory(extensions)
19 |
20 | if(BUILD_TESTING)
21 | enable_testing()
22 | add_subdirectory(test)
23 | endif(BUILD_TESTING)
24 |
--------------------------------------------------------------------------------
/runtime/android/app/src/main/cpp/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.4.1)
2 | set(TARGET wetextprocessing)
3 | project(${TARGET} CXX)
4 | set(CMAKE_CXX_STANDARD 14)
5 | include(ExternalProject)
6 |
7 | set(CMAKE_VERBOSE_MAKEFILE on)
8 | set(build_DIR ${CMAKE_SOURCE_DIR}/../../../build)
9 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
10 | string(REPLACE "-Wl,--exclude-libs,libgcc_real.a" "" CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}")
11 |
12 | include(openfst)
13 |
14 | include_directories(
15 | ${CMAKE_SOURCE_DIR}
16 | )
17 |
18 | add_subdirectory(utils)
19 | add_subdirectory(processor)
20 |
21 | link_libraries(wetext_processor android)
22 | add_library(${TARGET} SHARED wetextprocessing.cc)
23 |
--------------------------------------------------------------------------------
/itn/japanese/data/time/minute.tsv:
--------------------------------------------------------------------------------
1 | 一分 1
2 | 二分 2
3 | 三分 3
4 | 四分 4
5 | 五分 5
6 | 六分 6
7 | 七分 7
8 | 八分 8
9 | 九分 9
10 | 十分 10
11 | 十一分 11
12 | 十二分 12
13 | 十三分 13
14 | 十四分 14
15 | 十五分 15
16 | 十六分 16
17 | 十七分 17
18 | 十八分 18
19 | 十九分 19
20 | 二十分 20
21 | 二十一分 21
22 | 二十二分 22
23 | 二十三分 23
24 | 二十四分 24
25 | 二十五分 25
26 | 二十六分 26
27 | 二十七分 27
28 | 二十八分 28
29 | 二十九分 29
30 | 三十分 30
31 | 三十一分 31
32 | 三十二分 32
33 | 三十三分 33
34 | 三十四分 34
35 | 三十五分 35
36 | 三十六分 36
37 | 三十七分 37
38 | 三十八分 38
39 | 三十九分 39
40 | 四十分 40
41 | 四十一分 41
42 | 四十二分 42
43 | 四十三分 43
44 | 四十四分 44
45 | 四十五分 45
46 | 四十六分 46
47 | 四十七分 47
48 | 四十八分 48
49 | 四十九分 49
50 | 五十分 50
51 | 五十一分 51
52 | 五十二分 52
53 | 五十三分 53
54 | 五十四分 54
55 | 五十五分 55
56 | 五十六分 56
57 | 五十七分 57
58 | 五十八分 58
59 | 五十九分 59
60 | 六十分 60
--------------------------------------------------------------------------------
/itn/japanese/data/time/second.tsv:
--------------------------------------------------------------------------------
1 | 一秒 1
2 | 二秒 2
3 | 三秒 3
4 | 四秒 4
5 | 五秒 5
6 | 六秒 6
7 | 七秒 7
8 | 八秒 8
9 | 九秒 9
10 | 十秒 10
11 | 十一秒 11
12 | 十二秒 12
13 | 十三秒 13
14 | 十四秒 14
15 | 十五秒 15
16 | 十六秒 16
17 | 十七秒 17
18 | 十八秒 18
19 | 十九秒 19
20 | 二十秒 20
21 | 二十一秒 21
22 | 二十二秒 22
23 | 二十三秒 23
24 | 二十四秒 24
25 | 二十五秒 25
26 | 二十六秒 26
27 | 二十七秒 27
28 | 二十八秒 28
29 | 二十九秒 29
30 | 三十秒 30
31 | 三十一秒 31
32 | 三十二秒 32
33 | 三十三秒 33
34 | 三十四秒 34
35 | 三十五秒 35
36 | 三十六秒 36
37 | 三十七秒 37
38 | 三十八秒 38
39 | 三十九秒 39
40 | 四十秒 40
41 | 四十一秒 41
42 | 四十二秒 42
43 | 四十三秒 43
44 | 四十四秒 44
45 | 四十五秒 45
46 | 四十六秒 46
47 | 四十七秒 47
48 | 四十八秒 48
49 | 四十九秒 49
50 | 五十秒 50
51 | 五十一秒 51
52 | 五十二秒 52
53 | 五十三秒 53
54 | 五十四秒 54
55 | 五十五秒 55
56 | 五十六秒 56
57 | 五十七秒 57
58 | 五十八秒 58
59 | 五十九秒 59
60 | 六十秒 60
--------------------------------------------------------------------------------
/itn/chinese/data/date/dd.tsv:
--------------------------------------------------------------------------------
1 | 一日 01
2 | 二日 02
3 | 三日 03
4 | 四日 04
5 | 五日 05
6 | 六日 06
7 | 七日 07
8 | 八日 08
9 | 九日 09
10 | 十日 10
11 | 十一日 11
12 | 十二日 12
13 | 十三日 13
14 | 十四日 14
15 | 十五日 15
16 | 十六日 16
17 | 十七日 17
18 | 十八日 18
19 | 十九日 19
20 | 二十日 20
21 | 二十一日 21
22 | 二十二日 22
23 | 二十三日 23
24 | 二十四日 24
25 | 二十五日 25
26 | 二十六日 26
27 | 二十七日 27
28 | 二十八日 28
29 | 二十九日 29
30 | 三十日 30
31 | 三十一日 31
32 | 一号 01
33 | 二号 02
34 | 三号 03
35 | 四号 04
36 | 五号 05
37 | 六号 06
38 | 七号 07
39 | 八号 08
40 | 九号 09
41 | 十号 10
42 | 十一号 11
43 | 十二号 12
44 | 十三号 13
45 | 十四号 14
46 | 十五号 15
47 | 十六号 16
48 | 十七号 17
49 | 十八号 18
50 | 十九号 19
51 | 二十号 20
52 | 二十一号 21
53 | 二十二号 22
54 | 二十三号 23
55 | 二十四号 24
56 | 二十五号 25
57 | 二十六号 26
58 | 二十七号 27
59 | 二十八号 28
60 | 二十九号 29
61 | 三十号 30
62 | 三十一号 31
63 |
--------------------------------------------------------------------------------
/itn/chinese/data/time/second.tsv:
--------------------------------------------------------------------------------
1 | 00
2 | 一秒 01
3 | 二秒 02
4 | 三秒 03
5 | 四秒 04
6 | 五秒 05
7 | 六秒 06
8 | 七秒 07
9 | 八秒 08
10 | 九秒 09
11 | 十秒 10
12 | 十一秒 11
13 | 十二秒 12
14 | 十三秒 13
15 | 十四秒 14
16 | 十五秒 15
17 | 十六秒 16
18 | 十七秒 17
19 | 十八秒 18
20 | 十九秒 19
21 | 二十秒 20
22 | 二十一秒 21
23 | 二十二秒 22
24 | 二十三秒 23
25 | 二十四秒 24
26 | 二十五秒 25
27 | 二十六秒 26
28 | 二十七秒 27
29 | 二十八秒 28
30 | 二十九秒 29
31 | 三十秒 30
32 | 三十一秒 31
33 | 三十二秒 32
34 | 三十三秒 33
35 | 三十四秒 34
36 | 三十五秒 35
37 | 三十六秒 36
38 | 三十七秒 37
39 | 三十八秒 38
40 | 三十九秒 39
41 | 四十秒 40
42 | 四十一秒 41
43 | 四十二秒 42
44 | 四十三秒 43
45 | 四十四秒 44
46 | 四十五秒 45
47 | 四十六秒 46
48 | 四十七秒 47
49 | 四十八秒 48
50 | 四十九秒 49
51 | 五十秒 50
52 | 五十一秒 51
53 | 五十二秒 52
54 | 五十三秒 53
55 | 五十四秒 54
56 | 五十五秒 55
57 | 五十六秒 56
58 | 五十七秒 57
59 | 五十八秒 58
60 | 五十九秒 59
61 |
--------------------------------------------------------------------------------
/itn/chinese/test/data/cardinal.txt:
--------------------------------------------------------------------------------
1 | 幺幺零 => 110
2 | 幺二七点零点零点幺 => 127.0.0.1
3 | 这是手机一八五四四一三九一二一 => 这是手机18544139121
4 | 三五百 => 300~500
5 | 三五千 => 3000~5000
6 | 三五万 => 3~5万
7 | 三四万 => 3~4万
8 | 五六十 => 50~60
9 | 三四十万 => 30~40万
10 | 三四十亿 => 30~40亿
11 | 十五六 => 15-6
12 | 四十五六 => 45-6
13 | 四十五六万 => 45-6万
14 | 七百三四十 => 730-40
15 | 十七八万 => 17-8万
16 | 六十三四万 => 63-4万
17 | 一万六七 => 16000-7000
18 | 三万四五 => 34000-5000
19 | 我的身份证号是三四零二零三一九三七零幺零幺零五幺七 => 我的身份证号是340203193701010517
20 | 我的身份证号是三四零二零三一九三七零幺零幺零五幺X => 我的身份证号是34020319370101051X
21 | 给一三三四五三一二二二一打电话 => 给13345312221打电话
22 | 给一三三四五三一二二二一拨电话 => 给13345312221拨电话
23 | 一二三四 => 1234
24 | 二二三四 => 2234
25 | 拨打幺二三零六 => 拨打12306
26 | 九幺幺是报警电话 => 911是报警电话
27 | 尾号幺七零二 => 尾号1702
28 | 尾号一二三四 => 尾号1234
29 | 幺八五洞幺拐两零柒幺玖 => 18501720719
30 |
--------------------------------------------------------------------------------
/tn/chinese/data/time/second.tsv:
--------------------------------------------------------------------------------
1 | 00
2 | 01 一秒
3 | 02 二秒
4 | 03 三秒
5 | 04 四秒
6 | 05 五秒
7 | 06 六秒
8 | 07 七秒
9 | 08 八秒
10 | 09 九秒
11 | 10 十秒
12 | 11 十一秒
13 | 12 十二秒
14 | 13 十三秒
15 | 14 十四秒
16 | 15 十五秒
17 | 16 十六秒
18 | 17 十七秒
19 | 18 十八秒
20 | 19 十九秒
21 | 20 二十秒
22 | 21 二十一秒
23 | 22 二十二秒
24 | 23 二十三秒
25 | 24 二十四秒
26 | 25 二十五秒
27 | 26 二十六秒
28 | 27 二十七秒
29 | 28 二十八秒
30 | 29 二十九秒
31 | 30 三十秒
32 | 31 三十一秒
33 | 32 三十二秒
34 | 33 三十三秒
35 | 34 三十四秒
36 | 35 三十五秒
37 | 36 三十六秒
38 | 37 三十七秒
39 | 38 三十八秒
40 | 39 三十九秒
41 | 40 四十秒
42 | 41 四十一秒
43 | 42 四十二秒
44 | 43 四十三秒
45 | 44 四十四秒
46 | 45 四十五秒
47 | 46 四十六秒
48 | 47 四十七秒
49 | 48 四十八秒
50 | 49 四十九秒
51 | 50 五十秒
52 | 51 五十一秒
53 | 52 五十二秒
54 | 53 五十三秒
55 | 54 五十四秒
56 | 55 五十五秒
57 | 56 五十六秒
58 | 57 五十七秒
59 | 58 五十八秒
60 | 59 五十九秒
61 |
--------------------------------------------------------------------------------
/tn/japanese/data/time/minute.tsv:
--------------------------------------------------------------------------------
1 | 00
2 | 01 一分
3 | 02 二分
4 | 03 三分
5 | 04 四分
6 | 05 五分
7 | 06 六分
8 | 07 七分
9 | 08 八分
10 | 09 九分
11 | 10 十分
12 | 11 十一分
13 | 12 十二分
14 | 13 十三分
15 | 14 十四分
16 | 15 十五分
17 | 16 十六分
18 | 17 十七分
19 | 18 十八分
20 | 19 十九分
21 | 20 二十分
22 | 21 二十一分
23 | 22 二十二分
24 | 23 二十三分
25 | 24 二十四分
26 | 25 二十五分
27 | 26 二十六分
28 | 27 二十七分
29 | 28 二十八分
30 | 29 二十九分
31 | 30 三十分
32 | 31 三十一分
33 | 32 三十二分
34 | 33 三十三分
35 | 34 三十四分
36 | 35 三十五分
37 | 36 三十六分
38 | 37 三十七分
39 | 38 三十八分
40 | 39 三十九分
41 | 40 四十分
42 | 41 四十一分
43 | 42 四十二分
44 | 43 四十三分
45 | 44 四十四分
46 | 45 四十五分
47 | 46 四十六分
48 | 47 四十七分
49 | 48 四十八分
50 | 49 四十九分
51 | 50 五十分
52 | 51 五十一分
53 | 52 五十二分
54 | 53 五十三分
55 | 54 五十四分
56 | 55 五十五分
57 | 56 五十六分
58 | 57 五十七分
59 | 58 五十八分
60 | 59 五十九分
61 |
--------------------------------------------------------------------------------
/tn/japanese/data/time/second.tsv:
--------------------------------------------------------------------------------
1 | 00
2 | 01 一秒
3 | 02 二秒
4 | 03 三秒
5 | 04 四秒
6 | 05 五秒
7 | 06 六秒
8 | 07 七秒
9 | 08 八秒
10 | 09 九秒
11 | 10 十秒
12 | 11 十一秒
13 | 12 十二秒
14 | 13 十三秒
15 | 14 十四秒
16 | 15 十五秒
17 | 16 十六秒
18 | 17 十七秒
19 | 18 十八秒
20 | 19 十九秒
21 | 20 二十秒
22 | 21 二十一秒
23 | 22 二十二秒
24 | 23 二十三秒
25 | 24 二十四秒
26 | 25 二十五秒
27 | 26 二十六秒
28 | 27 二十七秒
29 | 28 二十八秒
30 | 29 二十九秒
31 | 30 三十秒
32 | 31 三十一秒
33 | 32 三十二秒
34 | 33 三十三秒
35 | 34 三十四秒
36 | 35 三十五秒
37 | 36 三十六秒
38 | 37 三十七秒
39 | 38 三十八秒
40 | 39 三十九秒
41 | 40 四十秒
42 | 41 四十一秒
43 | 42 四十二秒
44 | 43 四十三秒
45 | 44 四十四秒
46 | 45 四十五秒
47 | 46 四十六秒
48 | 47 四十七秒
49 | 48 四十八秒
50 | 49 四十九秒
51 | 50 五十秒
52 | 51 五十一秒
53 | 52 五十二秒
54 | 53 五十三秒
55 | 54 五十四秒
56 | 55 五十五秒
57 | 56 五十六秒
58 | 57 五十七秒
59 | 58 五十八秒
60 | 59 五十九秒
61 |
--------------------------------------------------------------------------------
/tn/chinese/data/time/minute.tsv:
--------------------------------------------------------------------------------
1 | 00
2 | 01 零一分
3 | 02 零二分
4 | 03 零三分
5 | 04 零四分
6 | 05 零五分
7 | 06 零六分
8 | 07 零七分
9 | 08 零八分
10 | 09 零九分
11 | 10 十分
12 | 11 十一分
13 | 12 十二分
14 | 13 十三分
15 | 14 十四分
16 | 15 十五分
17 | 16 十六分
18 | 17 十七分
19 | 18 十八分
20 | 19 十九分
21 | 20 二十分
22 | 21 二十一分
23 | 22 二十二分
24 | 23 二十三分
25 | 24 二十四分
26 | 25 二十五分
27 | 26 二十六分
28 | 27 二十七分
29 | 28 二十八分
30 | 29 二十九分
31 | 30 三十分
32 | 31 三十一分
33 | 32 三十二分
34 | 33 三十三分
35 | 34 三十四分
36 | 35 三十五分
37 | 36 三十六分
38 | 37 三十七分
39 | 38 三十八分
40 | 39 三十九分
41 | 40 四十分
42 | 41 四十一分
43 | 42 四十二分
44 | 43 四十三分
45 | 44 四十四分
46 | 45 四十五分
47 | 46 四十六分
48 | 47 四十七分
49 | 48 四十八分
50 | 49 四十九分
51 | 50 五十分
52 | 51 五十一分
53 | 52 五十二分
54 | 53 五十三分
55 | 54 五十四分
56 | 55 五十五分
57 | 56 五十六分
58 | 57 五十七分
59 | 58 五十八分
60 | 59 五十九分
61 |
--------------------------------------------------------------------------------
/tn/english/data/measure/unit_alternatives.tsv:
--------------------------------------------------------------------------------
1 | atm atmosphere
2 | bq becquerel
3 | cd candela
4 | da dalton
5 | eb exabyte
6 | f degree Fahrenheit
7 | gb gigabyte
8 | g gram
9 | gl gigaliter
10 | ha hectare
11 | h hour
12 | hl hectoliter
13 | hp horsepower
14 | hp horsepower
15 | kb kilobit
16 | kb kilobyte
17 | ma megaampere
18 | mA megaampere
19 | ma milliampere
20 | mA milliampere
21 | mb megabyte
22 | mc megacoulomb
23 | mf megafarad
24 | m meter
25 | m minute
26 | mm millimeter
27 | mm millimeter
28 | mm millimeter
29 | ms megasecond
30 | ms mega siemens
31 | ms millisecond
32 | mv millivolt
33 | mV millivolt
34 | mw megawatt
35 | mW megawatt
36 | pb petabyte
37 | pg petagram
38 | ps petasecond
39 | s second
40 | tb terabyte
41 | tb terabyte
42 | yb yottabyte
43 | zb zettabyte
44 |
--------------------------------------------------------------------------------
/runtime/processor/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | add_library(wetext_processor STATIC
2 | wetext_processor.cc
3 | wetext_token_parser.cc
4 | )
5 | if(ANDROID)
6 | target_link_libraries(wetext_processor PUBLIC fst wetext_utils)
7 | else()
8 | if(MSVC)
9 | target_link_libraries(wetext_processor PUBLIC fst wetext_utils)
10 | else()
11 | target_link_libraries(wetext_processor PUBLIC dl fst wetext_utils)
12 | endif()
13 | endif()
14 |
15 | # ----------------------------------------------------------------------------
16 | # C API shared library (wetext_processor_c)
17 | # ----------------------------------------------------------------------------
18 | add_library(wetext_processor_c SHARED
19 | wetext_processor_c_api.cc
20 | )
21 |
22 | target_link_libraries(wetext_processor_c PUBLIC wetext_processor)
23 |
--------------------------------------------------------------------------------
/itn/chinese/data/money/symbol.tsv:
--------------------------------------------------------------------------------
1 | 加纳塞地 ¢
2 | 玻利维亚玻利维亚诺 $b
3 | 乌拉圭比索 $U
4 | 美元 $
5 | 英镑 £
6 | 元 ¥
7 | 泰铢 ฿
8 | 柬埔寨瑞尔 ៛
9 | 哥斯达黎加科隆 ₡
10 | 尼日利亚奈拉 ₦
11 | 朝鲜园 ₩
12 | 以色列谢克尔 ₪
13 | 越南东 ₫
14 | 欧元 €
15 | 老挝基普 ₭
16 | 蒙古图格里克 ₮
17 | 古巴比索 ₱
18 | 菲律宾比索 ₱
19 | 乌克兰格里夫纳 ₴
20 | 印度卢比 ₹
21 | 土耳其里拉 ₺
22 | 卢布 ₽
23 | 白俄罗斯卢布 Br
24 | 委内瑞拉玻利瓦尔 Bs
25 | 伯利兹元 BZ$
26 | 巴拿马巴尔博亚 B/.
27 | 尼加拉瓜科尔多瓦 C$
28 | 瑞士法郎 CHF
29 | 匈牙利福林 Ft
30 | 阿鲁巴盾 ƒ
31 | 巴拉圭瓜拉尼 Gs
32 | 牙买加元 J$
33 | 捷克克朗 Kč
34 | 波斯尼亚和黑塞哥维那可兑换马克 KM
35 | 克罗地亚库纳 kn
36 | 丹麦克朗 kr
37 | 罗马尼亚列伊 lei
38 | 阿尔巴尼亚列克 Lek
39 | 洪都拉斯伦皮拉 L
40 | 莫桑比克梅蒂卡尔 MT
41 | 博茨瓦纳普拉 P
42 | 危地马拉格查尔 Q
43 | 巴西雷亚尔 R$
44 | 多米尼加共和国比索 RD$
45 | 马来西亚令吉 RM
46 | 印尼盾 Rp
47 | 巴基斯坦卢比 ₨
48 | 毛里求斯卢比 ₨
49 | 南非兰特 R
50 | 秘鲁索尔 S/.
51 | 索马里先令 S
52 | 特立尼达和多巴哥元 TT$
53 | 津巴布韦元 Z$
54 | 波兰兹罗提 zł
55 | 马其顿代纳尔 ден
56 | 塞尔维亚第纳尔 Дин.
57 | 乌兹别克斯坦索姆 лв
58 | 保加利亚列弗 лв
59 | 吉尔吉斯斯坦索姆 лв
60 | 哈萨克斯坦腾格 лв
61 |
--------------------------------------------------------------------------------
/tn/english/data/money/currency_major.tsv:
--------------------------------------------------------------------------------
1 | $ dollar
2 | $ us dollar
3 | US$ us dollar
4 | ฿ Thai Baht
5 | £ pound
6 | € euro
7 | ₩ won
8 | nzd new zealand dollar
9 | rs rupee
10 | chf swiss franc
11 | dkk danish kroner
12 | fim finnish markka
13 | aed arab emirates dirham
14 | ¥ yen
15 | czk czech koruna
16 | mro mauritanian ouguiya
17 | pkr pakistani rupee
18 | crc costa rican colon
19 | hk$ hong kong dollar
20 | npr nepalese rupee
21 | awg aruban florin
22 | nok norwegian kroner
23 | tzs tanzanian shilling
24 | sek swedish kronor
25 | cyp cypriot pound
26 | r real
27 | sar saudi riyal
28 | cve cape verde escudo
29 | rsd serbian dinar
30 | dm german mark
31 | shp saint helena pounds
32 | php philippine peso
33 | cad canadian dollar
34 | ssp south sudanese pound
35 | scr seychelles rupee
36 | mvr maldivian rufiyaa
37 | DH dirham
38 | Dh dirham
39 | Dhs. dirham
40 |
--------------------------------------------------------------------------------
/runtime/android/app/proguard-rules.pro:
--------------------------------------------------------------------------------
1 | # Add project specific ProGuard rules here.
2 | # You can control the set of applied configuration files using the
3 | # proguardFiles setting in build.gradle.
4 | #
5 | # For more details, see
6 | # http://developer.android.com/guide/developing/tools/proguard.html
7 |
8 | # If your project uses WebView with JS, uncomment the following
9 | # and specify the fully qualified class name to the JavaScript interface
10 | # class:
11 | #-keepclassmembers class fqcn.of.javascript.interface.for.webview {
12 | # public *;
13 | #}
14 |
15 | # Uncomment this to preserve the line number information for
16 | # debugging stack traces.
17 | #-keepattributes SourceFile,LineNumberTable
18 |
19 | # If you keep the line number information, uncomment this to
20 | # hide the original source file name.
21 | #-renamesourcefileattribute SourceFile
--------------------------------------------------------------------------------
/tn/english/data/address/state.tsv:
--------------------------------------------------------------------------------
1 | Alabama AL
2 | Alaska AK
3 | Arizona AZ
4 | Arkansas AR
5 | California CA
6 | Colorado CO
7 | Connecticut CT
8 | Delaware DE
9 | Florida FL
10 | Georgia GA
11 | Hawaii HI
12 | Idaho ID
13 | Illinois IL
14 | Indiana IND
15 | Iowa IA
16 | Kansas KS
17 | Kentucky KY
18 | Louisiana LA
19 | Maine ME
20 | Maryland MD
21 | Massachusetts MA
22 | Michigan MI
23 | Minnesota MN
24 | Mississippi MS
25 | Missouri MO
26 | Montana MT
27 | Nebraska NE
28 | Nevada NV
29 | New Hampshire NH
30 | New Jersey NJ
31 | New Mexico NM
32 | New York NY
33 | North Carolina NC
34 | North Dakota ND
35 | Oregon OR
36 | Pennsylvania PA
37 | Rhode Island RI
38 | South Carolina SC
39 | South Dakota SD
40 | Tennessee TN
41 | Tennessee TENN
42 | Texas TX
43 | Utah UT
44 | Vermont VT
45 | Virginia VA
46 | Washington WA
47 | West Virginia WV
48 | Wisconsin WI
49 | Wyoming WY
--------------------------------------------------------------------------------
/tn/chinese/data/char/fullwidth_to_halfwidth.tsv:
--------------------------------------------------------------------------------
1 | , ,
2 | 。 .
3 | . .
4 | “ "
5 | ” "
6 | ! !
7 | " "
8 | # #
9 | % %
10 | & &
11 | ' '
12 | ( (
13 | ) )
14 | * *
15 | + +
16 | - -
17 | / /
18 | : :
19 | ; ;
20 | < <
21 | = =
22 | > >
23 | ? ?
24 | @ @
25 | \ \
26 | ^ ^
27 | _ _
28 | ` `
29 | { {
30 | | |
31 | } }
32 | ~ ~
33 | $ $
34 | 0 0
35 | 1 1
36 | 2 2
37 | 3 3
38 | 4 4
39 | 5 5
40 | 6 6
41 | 7 7
42 | 8 8
43 | 9 9
44 | a a
45 | A A
46 | b b
47 | B B
48 | c c
49 | C C
50 | d d
51 | D D
52 | e e
53 | E E
54 | f f
55 | F F
56 | g g
57 | G G
58 | h h
59 | H H
60 | i i
61 | I I
62 | j j
63 | J J
64 | k k
65 | K K
66 | l l
67 | L L
68 | m m
69 | M M
70 | n n
71 | N N
72 | o o
73 | O O
74 | p p
75 | P P
76 | q q
77 | Q Q
78 | r r
79 | R R
80 | s s
81 | S S
82 | t t
83 | T T
84 | u u
85 | U U
86 | v v
87 | V V
88 | w w
89 | W W
90 | x x
91 | X X
92 | y y
93 | Y Y
94 | z z
95 | Z Z
96 |
--------------------------------------------------------------------------------
/itn/japanese/data/char/fullwidth_to_halfwidth.tsv:
--------------------------------------------------------------------------------
1 | , ,
2 | 。 .
3 | . .
4 | “ "
5 | ” "
6 | ! !
7 | " "
8 | # #
9 | % %
10 | & &
11 | ' '
12 | ( (
13 | ) )
14 | * *
15 | + +
16 | - -
17 | / /
18 | : :
19 | ; ;
20 | < <
21 | = =
22 | > >
23 | ? ?
24 | @ @
25 | \ \
26 | ^ ^
27 | _ _
28 | ` `
29 | { {
30 | | |
31 | } }
32 | ~ ~
33 | $ $
34 | 0 0
35 | 1 1
36 | 2 2
37 | 3 3
38 | 4 4
39 | 5 5
40 | 6 6
41 | 7 7
42 | 8 8
43 | 9 9
44 | a a
45 | A A
46 | b b
47 | B B
48 | c c
49 | C C
50 | d d
51 | D D
52 | e e
53 | E E
54 | f f
55 | F F
56 | g g
57 | G G
58 | h h
59 | H H
60 | i i
61 | I I
62 | j j
63 | J J
64 | k k
65 | K K
66 | l l
67 | L L
68 | m m
69 | M M
70 | n n
71 | N N
72 | o o
73 | O O
74 | p p
75 | P P
76 | q q
77 | Q Q
78 | r r
79 | R R
80 | s s
81 | S S
82 | t t
83 | T T
84 | u u
85 | U U
86 | v v
87 | V V
88 | w w
89 | W W
90 | x x
91 | X X
92 | y y
93 | Y Y
94 | z z
95 | Z Z
96 |
--------------------------------------------------------------------------------
/runtime/android/app/src/main/res/values/attrs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
--------------------------------------------------------------------------------
/tn/japanese/data/char/fullwidth_to_halfwidth.tsv:
--------------------------------------------------------------------------------
1 | , ,
2 | 。 .
3 | . .
4 | “ "
5 | ” "
6 | ! !
7 | " "
8 | # #
9 | % %
10 | & &
11 | ' '
12 | ( (
13 | ) )
14 | * *
15 | + +
16 | - -
17 | / /
18 | : :
19 | ; ;
20 | < <
21 | = =
22 | > >
23 | ? ?
24 | @ @
25 | \ \
26 | ^ ^
27 | _ _
28 | ` `
29 | { {
30 | | |
31 | } }
32 | ~ ~
33 | $ $
34 | 0 0
35 | 1 1
36 | 2 2
37 | 3 3
38 | 4 4
39 | 5 5
40 | 6 6
41 | 7 7
42 | 8 8
43 | 9 9
44 | a a
45 | A A
46 | b b
47 | B B
48 | c c
49 | C C
50 | d d
51 | D D
52 | e e
53 | E E
54 | f f
55 | F F
56 | g g
57 | G G
58 | h h
59 | H H
60 | i i
61 | I I
62 | j j
63 | J J
64 | k k
65 | K K
66 | l l
67 | L L
68 | m m
69 | M M
70 | n n
71 | N N
72 | o o
73 | O O
74 | p p
75 | P P
76 | q q
77 | Q Q
78 | r r
79 | R R
80 | s s
81 | S S
82 | t t
83 | T T
84 | u u
85 | U U
86 | v v
87 | V V
88 | w w
89 | W W
90 | x x
91 | X X
92 | y y
93 | Y Y
94 | z z
95 | Z Z
96 |
--------------------------------------------------------------------------------
/tn/japanese/test/data/date.txt:
--------------------------------------------------------------------------------
1 | 1998/04/23 => 千九百九十八年四月二十三日
2 | 2023/11/14 => 二千二十三年十一月十四日
3 | 2023/11/01 => 二千二十三年十一月一日
4 | 2008/08 => 二千八年八月
5 | 08/2008 => 二千八年八月
6 | 08/08 => 八月八日
7 | 2008-08-23 => 二千八年八月二十三日
8 | 2008-8-8 => 二千八年八月八日
9 | 2008-08 => 二千八年八月
10 | 08-2008 => 二千八年八月
11 | 08-08 => 八月八日
12 | 2008.08.08 => 二千八年八月八日
13 | 2008.8.8 => 二千八年八月八日
14 | 2008.08 => 二千八年八月
15 | 08.2008 => 二千八年八月
16 | 08.08 => 八月八日
17 | 来月の旅行は12/15から12/20までです => 来月の旅行は十二月十五日から十二月二十日までです
18 | 次の週末は11/5から11/6です => 次の週末は十一月五日から十一月六日です
19 | 2008.08.08-2008.08.10 => 二千八年八月八日から二千八年八月十日
20 | 23/10 => 十分の二十三
21 | 1/3 => 三分の一
22 | 今日は2036-03-01です => 今日は二千三十六年三月一日です
23 | 今日は2036/03/01です => 今日は二千三十六年三月一日です
24 | 来週の月曜日は2036.2.28です => 来週の月曜日は二千三十六年二月二十八日です
25 | 次の週末は4/30から5/1です => 次の週末は四月三十日から五月一日です
26 | 来月の旅行は5-15から5-20です => 来月の旅行は五月十五日から五月二十日です
27 | 次の会議は2022-12-1に開催されます => 次の会議は二千二十二年十二月一日に開催されます
28 | 今日は2022.11.19です => 今日は二千二十二年十一月十九日です
--------------------------------------------------------------------------------
/tn/english/test/data/cardinal.txt:
--------------------------------------------------------------------------------
1 | -23 => negative twenty three
2 | -1 => negative one
3 | 0 => zero
4 | 1 => one
5 | 2 => two
6 | 10 => ten
7 | 11 => eleven
8 | 20 => twenty
9 | 100 => one hundred
10 | 101 => one hundred and one
11 | 111 => one hundred and eleven
12 | 200 => two hundred
13 | 1000 => thousand
14 | 1001 => thousand and one
15 | 1011 => thousand and eleven
16 | 1111 => thousand one hundred and eleven
17 | 2000 => two thousand
18 | 10000 => ten thousand
19 | 10001 => ten thousand and one
20 | 10011 => ten thousand and eleven
21 | 10111 => ten thousand one hundred and eleven
22 | 11111 => eleven thousand one hundred and eleven
23 | 20000 => twenty thousand
24 | 101111 => one hundred one thousand one hundred and eleven
25 | 1001111 => one million one thousand one hundred and eleven
26 | 10001111 => ten million one thousand one hundred and eleven
27 | 4567 => four thousand five hundred and sixty seven
28 |
--------------------------------------------------------------------------------
/runtime/android/app/src/main/res/values/themes.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
16 |
--------------------------------------------------------------------------------
/runtime/android/app/src/main/res/values-night/themes.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
16 |
--------------------------------------------------------------------------------
/runtime/android/app/src/main/AndroidManifest.xml:
--------------------------------------------------------------------------------
1 |
2 |
5 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/runtime/android/app/src/androidTest/java/com/mobvoi/WeTextProcessing/ExampleInstrumentedTest.java:
--------------------------------------------------------------------------------
1 | package com.mobvoi.WeTextProcessing;
2 |
3 | import android.content.Context;
4 |
5 | import androidx.test.platform.app.InstrumentationRegistry;
6 | import androidx.test.ext.junit.runners.AndroidJUnit4;
7 |
8 | import org.junit.Test;
9 | import org.junit.runner.RunWith;
10 |
11 | import static org.junit.Assert.*;
12 |
13 | /**
14 | * Instrumented test, which will execute on an Android device.
15 | *
16 | * @see <a href="http://d.android.com/tools/testing">Testing documentation</a>
17 | */
18 | @RunWith(AndroidJUnit4.class)
19 | public class ExampleInstrumentedTest {
20 | @Test
21 | public void useAppContext() {
22 | // Context of the app under test.
23 | Context appContext = InstrumentationRegistry.getInstrumentation().getTargetContext();
24 | assertEquals("com.mobvoi.WeTextProcessing", appContext.getPackageName());
25 | }
26 | }
--------------------------------------------------------------------------------
/itn/chinese/test/data/normalizer.txt:
--------------------------------------------------------------------------------
1 | 一共有多少人 => 一共有多少人
2 | 呃这个呃啊我不知道 => 这个我不知道
3 | 呃呃啊 =>
4 | 共四百六十五篇,约三百一十五万字 => 共465篇,约315万字
5 | 共计六点四二万人 => 共计6.42万人
6 | 同比升高零点六个百分点 => 同比升高0.6个百分点
7 | 总量的五分之一以上 => 总量的1/5以上
8 | 相当于头发丝的十六分之一 => 相当于头发丝的1/16
9 | 二分之三是一个假分数 => 3/2是一个假分数
10 | 同比增长百分之六点三 => 同比增长6.3%
11 | 增幅百分之零点四 => 增幅0.4%
12 | 二零零二年一月二十八日 => 2002/01/28
13 | 二零零二年一月 => 2002/01
14 | 八月十六号的十二点之前 => 08/16的12点之前
15 | 我是五点零二分开始的 => 我是5:02开始的
16 | 于五点三十五分三十六秒发射 => 于5:35:36发射
17 | 上午八点半准时开会 => 8:30a.m.准时开会
18 | 比分定格在七十八比九十六 => 比分定格在78:96
19 | 计算负二的绝对值是二 => 计算-2的绝对值是2
20 | 正负二的平方都是四 => ±2的平方都是4
21 | 价格是十三点五元 => 价格是¥13.5
22 | 价格是十三点五美元 => 价格是$13.5
23 | 价格是十三点五澳元 => 价格是A$13.5
24 | 价格是十三点五港元 => 价格是HKD13.5
25 | 重达二十五千克 => 重达25kg
26 | 最高气温三十八摄氏度 => 最高气温38°C
27 | 实际面积一百二十平方米 => 实际面积120m²
28 | 渲染速度十毫秒一帧 => 渲染速度10ms1帧
29 | 可以打我手机幺三五零幺二三四五六七 => 可以打我手机13501234567
30 | 可以拨打幺二三零六来咨询 => 可以拨打12306来咨询
31 | 二点五平方电线,五,五十五,疑是银河落九天,十二块五 => 2.5平方电线,5,55,疑是银河落9天,12块5
32 | 三百九十九三盒 => 3993盒
33 | 十三五规划期间获得了十几万和几十万甚至二十几万的投资 => 十三五规划期间获得了十几万和几十万甚至二十几万的投资
34 |
--------------------------------------------------------------------------------
/runtime/utils/wetext_log.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #ifndef UTILS_WETEXT_LOG_H_
16 | #define UTILS_WETEXT_LOG_H_
17 |
18 | // Because openfst is a dynamic library compiled with gflags/glog, we must use
19 | // the gflags/glog from openfst to avoid them linked both statically and
20 | // dynamically into the executable.
21 | #include "fst/log.h"
22 |
23 | #endif // UTILS_WETEXT_LOG_H_
24 |
--------------------------------------------------------------------------------
/itn/japanese/test/data/cardinal.txt:
--------------------------------------------------------------------------------
1 | 一 => 1
2 | 四 => 4
3 | 十 => 10
4 | 十四 => 14
5 | 四十四 => 44
6 | 四十 => 40
7 | 百一 => 101
8 | 百十二 => 112
9 | 四百四 => 404
10 | 九千百二十三 => 9123
11 | 一千二百三十四 => 1234
12 | 五千六百七十八 => 5678
13 | 二千二十 => 2020
14 | 二千二 => 2002
15 | 二千十 => 2010
16 | 二千百 => 2100
17 | 九千 => 9000
18 | 九千二 => 9002
19 | 十 => 10
20 | 百 => 100
21 | 千 => 1000
22 | 万 => 万
23 | 兆 => 兆
24 | 千百 => 1100
25 | 千三百 => 1300
26 | 千三百十 => 1310
27 | 千十 => 1010
28 | 千二十 => 1020
29 | 千二十一 => 1021
30 | 千一 => 1001
31 | 千百十 => 1110
32 | 千百一 => 1101
33 | マイナス百十二 => -112
34 | プラス百十二 => +112
35 | 二十万二 => 200002
36 | 一万二 => 10002
37 | 二十万二千百 => 202100
38 | 四百万 => 400万
39 | 四百四万 => 404万
40 | 五千万 => 5000万
41 | 二万 => 20000
42 | 一億五千万 => 1億5000万
43 | 一億五万 => 1億5万
44 | 一億一百万 => 1億100万
45 | 一億一千万 => 1億1000万
46 | 二千億一千万 => 2000億1000万
47 | 二千億 => 2000億
48 | 二兆二億 => 2兆2億
49 | 二兆二千億 => 2兆2000億
50 | 二兆二千万 => 2兆2000万
51 | 二兆二百万 => 2兆200万
52 | 一兆三百二十万五千 => 1兆320万5000
53 | 二兆三十 => 2兆30
54 | 二兆百 => 2兆100
55 | 二十兆百 => 20兆100
56 | 一九二点一六八点零点一 => 192.168.0.1
57 | 一二三四五六七八九 => 123456789
--------------------------------------------------------------------------------
/runtime/utils/wetext_flags.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #ifndef UTILS_WETEXT_FLAGS_H_
16 | #define UTILS_WETEXT_FLAGS_H_
17 |
18 | // Because openfst is a dynamic library compiled with gflags/glog, we must use
19 | // the gflags/glog from openfst to avoid them linked both statically and
20 | // dynamically into the executable.
21 | #include "fst/flags.h"
22 |
23 | #endif // UTILS_WETEXT_FLAGS_H_
24 |
--------------------------------------------------------------------------------
/itn/chinese/data/measure/units_en.tsv:
--------------------------------------------------------------------------------
1 | 原子质量 amu
2 | 巴 bar
3 | 平方厘米 cm²
4 | 立方厘米 cm³
5 | 厘米 cm
6 | 美担 cwt
7 | 摄氏度 °C
8 | 分贝 db
9 | 立方分米 dm³
10 | 分米 dm
11 | 英尺 ft
12 | 华氏度 °F
13 | 吉字节 gb
14 | 吉赫兹 ghz
15 | 吉帕斯卡 gpa
16 | 吉瓦时 gwh
17 | 吉瓦 gw
18 | 戈瑞 gy
19 | 小时 h
20 | 公顷 ha
21 | 赫兹 hz
22 | 千比特每秒 kbps
23 | 千比特一秒 kbps
24 | 千卡 kcal
25 | 千克力 kgf
26 | 千克 kg
27 | 公斤 kg
28 | 千赫兹 khz
29 | 平方千米 km²
30 | 公里 km
31 | 公里每小时 km/h
32 | 公里一小时 km/h
33 | 千米 km
34 | 千米每小时 km/h
35 | 千米一小时 km/h
36 | 千帕 kpa
37 | 千瓦时 kwh
38 | 千瓦 kw
39 | 磅 lbs
40 | 平方米 m²
41 | 立方米 m³
42 | 兆比特每秒 mbps
43 | 兆比特一秒 mbps
44 | 克 g
45 | 毫克 mg
46 | 兆赫兹 mhz
47 | 平方英里 mi²
48 | 分钟 min
49 | 英里 mi
50 | 毫升 ml
51 | 平方毫米 mm²
52 | 毫米 mm
53 | 摩尔 mol
54 | 兆帕 mpa
55 | 英里每小时 mph
56 | 英里一小时 mph
57 | 毫秒 ms
58 | 毫伏 mv
59 | 毫瓦 mw
60 | 千伏 kv
61 | 米 m
62 | 纳克 ng
63 | 纳米 nm
64 | 纳秒 ns
65 | 盎司 oz
66 | 度 º
67 | 帕斯卡 pa
68 | 皮克 pg
69 | 皮秒 ps
70 | 弧度 rad
71 | 转每分 rpm
72 | 平方英尺 sq ft
73 | 平方英里 sq mi
74 | 系沃特 sv
75 | 秒 s
76 | 太字节 tb
77 | 万亿焦耳 tj
78 | 台两 tl
79 | 伏特 v
80 | 码 yd
81 | 微克 μg
82 | 微米 μm
83 | 微秒 μs
84 | 欧米茄 ω
85 | 度 °
86 |
--------------------------------------------------------------------------------
/tn/chinese/test/time_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.chinese.rules.time import Time
18 | from tn.chinese.test.utils import parse_test_case
19 |
20 |
class TestTime:
    """Data-driven checks for the Chinese TN ``Time`` rule.

    Every case read from ``data/time.txt`` is fed to pytest as a
    ``(written, spoken)`` pair; normalizing the written form must give
    exactly the spoken form.
    """

    # Built once when the class body executes and shared by all cases.
    time = Time()
    # Loaded at class-definition time so the decorator below can use it.
    time_cases = parse_test_case("data/time.txt")

    @pytest.mark.parametrize("written, spoken", time_cases)
    def test_time(self, written, spoken):
        """Check that ``Time.normalize(written)`` equals ``spoken``."""
        normalized = self.time.normalize(written)
        assert normalized == spoken
28 |
--------------------------------------------------------------------------------
/tn/chinese/test/char_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.chinese.rules.char import Char
18 | from tn.chinese.test.utils import parse_test_case
19 |
20 |
class TestChar:
    """Data-driven checks for the Chinese TN ``Char`` rule.

    Every case read from ``data/char.txt`` is fed to pytest as a
    ``(written, spoken)`` pair; normalizing the written form must give
    exactly the spoken form.
    """

    # Built once when the class body executes and shared by all cases.
    char = Char()
    # Loaded at class-definition time so the decorator below can use it.
    char_cases = parse_test_case("data/char.txt")

    @pytest.mark.parametrize("written, spoken", char_cases)
    def test_char(self, written, spoken):
        """Check that ``Char.normalize(written)`` equals ``spoken``."""
        normalized = self.char.normalize(written)
        assert normalized == spoken
29 |
--------------------------------------------------------------------------------
/tn/chinese/test/date_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.chinese.rules.date import Date
18 | from tn.chinese.test.utils import parse_test_case
19 |
20 |
class TestDate:
    """Data-driven checks for the Chinese TN ``Date`` rule.

    Every case read from ``data/date.txt`` is fed to pytest as a
    ``(written, spoken)`` pair; normalizing the written form must give
    exactly the spoken form.
    """

    # Built once when the class body executes and shared by all cases.
    date = Date()
    # Loaded at class-definition time so the decorator below can use it.
    date_cases = parse_test_case("data/date.txt")

    @pytest.mark.parametrize("written, spoken", date_cases)
    def test_date(self, written, spoken):
        """Check that ``Date.normalize(written)`` equals ``spoken``."""
        normalized = self.date.normalize(written)
        assert normalized == spoken
29 |
--------------------------------------------------------------------------------
/tn/chinese/test/math_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.chinese.rules.math import Math
18 | from tn.chinese.test.utils import parse_test_case
19 |
20 |
class TestMath:
    """Data-driven checks for the Chinese TN ``Math`` rule.

    Every case read from ``data/math.txt`` is fed to pytest as a
    ``(written, spoken)`` pair; normalizing the written form must give
    exactly the spoken form.
    """

    # Built once when the class body executes and shared by all cases.
    math = Math()
    # Loaded at class-definition time so the decorator below can use it.
    math_cases = parse_test_case("data/math.txt")

    @pytest.mark.parametrize("written, spoken", math_cases)
    def test_math(self, written, spoken):
        """Check that ``Math.normalize(written)`` equals ``spoken``."""
        normalized = self.math.normalize(written)
        assert normalized == spoken
29 |
--------------------------------------------------------------------------------
/tn/english/test/word_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.english.rules.word import Word
18 | from tn.english.test.utils import parse_test_case
19 |
20 |
class TestWord:
    """Data-driven checks for the English TN ``Word`` rule.

    Every case read from ``data/word.txt`` is fed to pytest as a
    ``(written, spoken)`` pair; normalizing the written form must give
    exactly the spoken form.
    """

    # Built once when the class body executes and shared by all cases.
    word = Word()
    # Loaded at class-definition time so the decorator below can use it.
    word_cases = parse_test_case("data/word.txt")

    # Named ``test_word`` (previously ``test_char``, a copy-paste leftover) so
    # the reported test ID matches the rule that is actually exercised.
    @pytest.mark.parametrize("written, spoken", word_cases)
    def test_word(self, written, spoken):
        """Check that ``Word.normalize(written)`` equals ``spoken``."""
        assert self.word.normalize(written) == spoken
29 |
--------------------------------------------------------------------------------
/tn/japanese/rules/char.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Logan Liu (2319277867@qq.com)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from pynini.lib.pynutil import insert
16 |
17 | from tn.processor import Processor
18 |
19 |
class Char(Processor):
    """Processor named ``char`` whose tagger wraps ``self.CHAR`` in a
    ``value: "..."`` field."""

    def __init__(self):
        """Register the processor as ``char`` and build its grammars."""
        super().__init__(name="char")
        self.build_tagger()
        # build_verbalizer is not defined in this class; presumably it is
        # inherited from Processor.
        self.build_verbalizer()

    def build_tagger(self):
        """Build the tagger: insert ``value: "``, accept ``self.CHAR``,
        then insert the closing ``"``."""
        opening = insert('value: "')
        closing = insert('"')
        # add_tokens is provided by the base class; presumably it wraps the
        # grammar in this processor's token -- confirm against Processor.
        self.tagger = self.add_tokens(opening + self.CHAR + closing)
30 |
--------------------------------------------------------------------------------
/tn/japanese/test/data/cardinal.txt:
--------------------------------------------------------------------------------
1 | 1118 => 千百十八
2 | -1118 => マイナス千百十八
3 | 9.99999 => 九点九九九九九
4 | 20099.1001 => 二万九十九点一〇〇一
5 | 11118 => 一万千百十八
6 | -200.001 => マイナス二百点〇〇一
7 | 100% => 百パーセント
8 | -50.04% => マイナス五十点〇四パーセント
9 | -50.07% => マイナス五十点〇七パーセント
10 | 192.168.0.1 => 一九二点一六八点〇点一
11 | 090-1234-5678 => ゼロ九ゼロの一二三四の五六七八
12 | 090-12345678 => ゼロ九ゼロの一二三四五六七八
13 | +81-090-1234-5678 => プラス八一のゼロ九ゼロの一二三四の五六七八
14 | +81 090-1234-5678 => プラス八一ゼロ九ゼロの一二三四の五六七八
15 | +81 090-123-5678 => プラス八一ゼロ九ゼロの一二三の五六七八
16 | +81 09012345678 => プラス八一ゼロ九ゼロ一二三四五六七八
17 | 02-1234-5678 => ゼロ二の一二三四の五六七八
18 | 1.1234567 => 一点一二三四五六七
19 | 123456789 => 一二三四五六七八九
20 | 0.0005 => 〇点〇〇〇五
21 | No.1005 => No.一〇〇五
22 | 番号1234 => 番号一二三四
23 | 1234号室 => 一二三四号室
24 | 150,000 => 十五万
25 | 10,000 => 一万
26 | 11,000 => 一万千
27 | 1,115,000 => 百十一万五千
28 | 10,000,000 => 一千万
29 | 10,100,000 => 一千十万
30 | 1,000 => 千
31 | 150000 => 十五万
32 | 10000 => 一万
33 | 11000 => 一万千
34 | 1115000 => 百十一万五千
35 | 1000 => 千
36 | 100000 => 十万
37 | 1000000 => 百万
38 | 10100000 => 一千十万
39 | 0時に花火が打ち上げられます => 〇時に花火が打ち上げられます
40 | -80000000600 => マイナス八百億六百
41 | -80010000600 => マイナス八〇〇一〇〇〇〇六〇〇
--------------------------------------------------------------------------------
/itn/japanese/rules/char.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Logan Liu (2319277867@qq.com)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from pynini.lib.pynutil import insert
16 |
17 | from tn.processor import Processor
18 |
19 |
class Char(Processor):
    """Processor named ``char`` whose tagger wraps ``self.CHAR`` in a
    ``value: "..."`` field."""

    def __init__(self):
        """Register the processor as ``char`` and build its grammars."""
        super().__init__(name="char")
        self.build_tagger()
        # build_verbalizer is not defined in this class; presumably it is
        # inherited from Processor.
        self.build_verbalizer()

    def build_tagger(self):
        """Build the tagger: insert ``value: "``, accept ``self.CHAR``,
        then insert the closing ``"``."""
        opening = insert('value: "')
        closing = insert('"')
        # add_tokens is provided by the base class; presumably it wraps the
        # grammar in this processor's token -- confirm against Processor.
        self.tagger = self.add_tokens(opening + self.CHAR + closing)
30 |
--------------------------------------------------------------------------------
/tn/chinese/data/measure/units_en.tsv:
--------------------------------------------------------------------------------
1 | amu 原子质量
2 | bar 巴
3 | cm2 平方厘米
4 | cm² 平方厘米
5 | cm3 立方厘米
6 | cm³ 立方厘米
7 | cm 厘米
8 | cwt 美担
9 | °c 摄氏度
10 | °C 摄氏度
11 | ℃ 摄氏度
12 | db 分贝
13 | dm3 立方分米
14 | dm³ 立方分米
15 | dm 分米
16 | ds 毫秒
17 | ft 英尺
18 | °f 华氏度
19 | °F 华氏度
20 | ℉ 华氏度
21 | gb 吉字节
22 | ghz 吉赫兹
23 | gpa 吉帕斯卡
24 | gwh 吉瓦时
25 | gw 吉瓦
26 | gy 戈瑞
27 | h 小时
28 | ha 公顷
29 | hz 赫兹
30 | kbps 千比特每秒
31 | kcal 千卡
32 | kgf 千克力
33 | kg 千克
34 | khz 千赫兹
35 | km2 平方千米
36 | km² 平方千米
37 | km 公里
38 | kpa 千帕
39 | kwh 千瓦时
40 | kw 千瓦
41 | kW 千瓦
42 | lbs 磅
43 | lb 磅
44 | m2 平方米
45 | m² 平方米
46 | m3 立方米
47 | m³ 立方米
48 | mbps 兆比特每秒
49 | mg 毫克
50 | mhz 兆赫兹
51 | mi2 平方英里
52 | mi² 平方英里
53 | min 分钟
54 | mi 英里
55 | ml 毫升
56 | mm2 平方毫米
57 | mm² 平方毫米
58 | mm 毫米
59 | mol 摩尔
60 | mpa 兆帕
61 | mph 英里每小时
62 | ms 毫秒
63 | mv 毫伏
64 | mw 毫瓦
65 | m 米
66 | ng 纳克
67 | nm 纳米
68 | ns 纳秒
69 | ºc 摄氏度
70 | ºC 摄氏度
71 | oz 盎司
72 | º 度
73 | pa 帕斯卡
74 | pg 皮克
75 | ps 皮秒
76 | rad 弧度
77 | rpm 转每分
78 | sq ft 平方英尺
79 | sq mi 平方英里
80 | sv 系沃特
81 | s 秒
82 | tb 太字节
83 | tj 万亿焦耳
84 | tl 台两
85 | v 伏特
86 | yd 码
87 | μg 微克
88 | μm 微米
89 | μs 微秒
90 | ω 欧米茄
91 | ° 度
92 |
--------------------------------------------------------------------------------
/itn/chinese/rules/char.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from pynini.lib.pynutil import insert
16 |
17 | from tn.processor import Processor
18 |
19 |
class Char(Processor):
    """Processor named ``char`` whose tagger wraps ``self.CHAR`` in a
    ``value: "..."`` field."""

    def __init__(self):
        """Register the processor as ``char`` and build its grammars."""
        super().__init__(name="char")
        self.build_tagger()
        # build_verbalizer is not defined in this class; presumably it is
        # inherited from Processor.
        self.build_verbalizer()

    def build_tagger(self):
        """Build the tagger: insert ``value: "``, accept ``self.CHAR``,
        then insert the closing ``"``."""
        opening = insert('value: "')
        closing = insert('"')
        # add_tokens is provided by the base class; presumably it wraps the
        # grammar in this processor's token -- confirm against Processor.
        self.tagger = self.add_tokens(opening + self.CHAR + closing)
30 |
--------------------------------------------------------------------------------
/tn/chinese/rules/char.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from pynini.lib.pynutil import insert
16 |
17 | from tn.processor import Processor
18 |
19 |
class Char(Processor):
    """Processor named ``char`` whose tagger wraps ``self.CHAR`` in a
    ``value: "..."`` field."""

    def __init__(self):
        """Register the processor as ``char`` and build its grammars."""
        super().__init__(name="char")
        self.build_tagger()
        # build_verbalizer is not defined in this class; presumably it is
        # inherited from Processor.
        self.build_verbalizer()

    def build_tagger(self):
        """Build the tagger: insert ``value: "``, accept ``self.CHAR``,
        then insert the closing ``"``."""
        opening = insert('value: "')
        closing = insert('"')
        # add_tokens is provided by the base class; presumably it wraps the
        # grammar in this processor's token -- confirm against Processor.
        self.tagger = self.add_tokens(opening + self.CHAR + closing)
30 |
--------------------------------------------------------------------------------
/tn/chinese/test/money_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.chinese.rules.money import Money
18 | from tn.chinese.test.utils import parse_test_case
19 |
20 |
class TestMoney:
    """Data-driven checks for the Chinese TN ``Money`` rule.

    Every case read from ``data/money.txt`` is fed to pytest as a
    ``(written, spoken)`` pair; normalizing the written form must give
    exactly the spoken form.
    """

    # Built once when the class body executes and shared by all cases.
    money = Money()
    # Loaded at class-definition time so the decorator below can use it.
    money_cases = parse_test_case("data/money.txt")

    @pytest.mark.parametrize("written, spoken", money_cases)
    def test_money(self, written, spoken):
        """Check that ``Money.normalize(written)`` equals ``spoken``."""
        normalized = self.money.normalize(written)
        assert normalized == spoken
29 |
--------------------------------------------------------------------------------
/tn/chinese/test/sport_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.chinese.rules.sport import Sport
18 | from tn.chinese.test.utils import parse_test_case
19 |
20 |
class TestSport:
    """Data-driven checks for the Chinese TN ``Sport`` rule.

    Every case read from ``data/sport.txt`` is fed to pytest as a
    ``(written, spoken)`` pair; normalizing the written form must give
    exactly the spoken form.
    """

    # Built once when the class body executes and shared by all cases.
    sport = Sport()
    # Loaded at class-definition time so the decorator below can use it.
    sport_cases = parse_test_case("data/sport.txt")

    @pytest.mark.parametrize("written, spoken", sport_cases)
    def test_sport(self, written, spoken):
        """Check that ``Sport.normalize(written)`` equals ``spoken``."""
        normalized = self.sport.normalize(written)
        assert normalized == spoken
29 |
--------------------------------------------------------------------------------
/tn/english/test/date_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.english.rules.date import Date
18 | from tn.english.test.utils import parse_test_case
19 |
20 |
class TestDate:
    """Data-driven checks for the English TN ``Date`` rule.

    Every case read from ``data/date.txt`` is fed to pytest as a
    ``(written, spoken)`` pair; normalizing the written form must give
    exactly the spoken form.
    """

    # Built once (with deterministic=False) when the class body executes and
    # shared by all cases.
    date = Date(deterministic=False)
    # Loaded at class-definition time so the decorator below can use it.
    date_cases = parse_test_case("data/date.txt")

    @pytest.mark.parametrize("written, spoken", date_cases)
    def test_date(self, written, spoken):
        """Check that ``Date.normalize(written)`` equals ``spoken``."""
        normalized = self.date.normalize(written)
        assert normalized == spoken
29 |
--------------------------------------------------------------------------------
/tn/english/test/time_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.english.rules.time import Time
18 | from tn.english.test.utils import parse_test_case
19 |
20 |
# NOTE(review): the class name is not CapWords (``TestTime`` would be the
# conventional spelling); it is left unchanged here so existing test IDs stay
# the same.
class Testtime:
    """Data-driven checks for the English TN ``Time`` rule.

    Every case read from ``data/time.txt`` is fed to pytest as a
    ``(written, spoken)`` pair; normalizing the written form must give
    exactly the spoken form.
    """

    # Built once (with deterministic=False) when the class body executes and
    # shared by all cases.
    time = Time(deterministic=False)
    # Loaded at class-definition time so the decorator below can use it.
    time_cases = parse_test_case("data/time.txt")

    @pytest.mark.parametrize("written, spoken", time_cases)
    def test_time(self, written, spoken):
        """Check that ``Time.normalize(written)`` equals ``spoken``."""
        normalized = self.time.normalize(written)
        assert normalized == spoken
29 |
--------------------------------------------------------------------------------
/tn/english/test/money_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.english.rules.money import Money
18 | from tn.english.test.utils import parse_test_case
19 |
20 |
class TestMoney:
    """Data-driven checks for the English TN ``Money`` rule.

    Every case read from ``data/money.txt`` is fed to pytest as a
    ``(written, spoken)`` pair; normalizing the written form must give
    exactly the spoken form.
    """

    # Built once (with deterministic=False) when the class body executes and
    # shared by all cases.
    money = Money(deterministic=False)
    # Loaded at class-definition time so the decorator below can use it.
    money_cases = parse_test_case("data/money.txt")

    @pytest.mark.parametrize("written, spoken", money_cases)
    def test_money(self, written, spoken):
        """Check that ``Money.normalize(written)`` equals ``spoken``."""
        normalized = self.money.normalize(written)
        assert normalized == spoken
29 |
--------------------------------------------------------------------------------
/tn/english/test/range_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.english.rules.range import Range
18 | from tn.english.test.utils import parse_test_case
19 |
20 |
class TestRange:
    """Data-driven checks for the English TN ``Range`` rule.

    Every case read from ``data/range.txt`` is fed to pytest as a
    ``(written, spoken)`` pair; normalizing the written form must give
    exactly the spoken form.
    """

    # Built once (with deterministic=False) when the class body executes and
    # shared by all cases. The attribute is called ``range``; it is only ever
    # read through ``self`` below.
    range = Range(deterministic=False)
    # Loaded at class-definition time so the decorator below can use it.
    range_cases = parse_test_case("data/range.txt")

    @pytest.mark.parametrize("written, spoken", range_cases)
    def test_range(self, written, spoken):
        """Check that ``Range.normalize(written)`` equals ``spoken``."""
        normalized = self.range.normalize(written)
        assert normalized == spoken
29 |
--------------------------------------------------------------------------------
/tn/english/test/roman_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.english.rules.roman import Roman
18 | from tn.english.test.utils import parse_test_case
19 |
20 |
class TestRoman:
    """Data-driven checks for the English TN ``Roman`` rule.

    Every case read from ``data/roman.txt`` is fed to pytest as a
    ``(written, spoken)`` pair; normalizing the written form must give
    exactly the spoken form.
    """

    # Built once (with deterministic=False) when the class body executes and
    # shared by all cases.
    roman = Roman(deterministic=False)
    # Loaded at class-definition time so the decorator below can use it.
    roman_cases = parse_test_case("data/roman.txt")

    @pytest.mark.parametrize("written, spoken", roman_cases)
    def test_roman(self, written, spoken):
        """Check that ``Roman.normalize(written)`` equals ``spoken``."""
        normalized = self.roman.normalize(written)
        assert normalized == spoken
29 |
--------------------------------------------------------------------------------
/tn/chinese/test/measure_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.chinese.rules.measure import Measure
18 | from tn.chinese.test.utils import parse_test_case
19 |
20 |
# NOTE(review): this class is named ``TestMoney`` although it exercises the
# ``Measure`` rule -- likely a copy-paste leftover. The name is kept here so
# existing test IDs stay the same.
class TestMoney:
    """Data-driven checks for the Chinese TN ``Measure`` rule.

    Every case read from ``data/measure.txt`` is fed to pytest as a
    ``(written, spoken)`` pair; normalizing the written form must give
    exactly the spoken form.
    """

    # Built once when the class body executes and shared by all cases.
    measure = Measure()
    # Loaded at class-definition time so the decorator below can use it.
    measure_cases = parse_test_case("data/measure.txt")

    @pytest.mark.parametrize("written, spoken", measure_cases)
    def test_measure(self, written, spoken):
        """Check that ``Measure.normalize(written)`` equals ``spoken``."""
        normalized = self.measure.normalize(written)
        assert normalized == spoken
29 |
--------------------------------------------------------------------------------
/itn/chinese/data/default/whitelist.tsv:
--------------------------------------------------------------------------------
1 | 三七二十一 三七二十一
2 | 一共 一共
3 | 一个 一个
4 | 一下 一下
5 | 一些 一些
6 | 一起 一起
7 | 一会 一会
8 | 一路 一路
9 | 二维码 二维码
10 | 慢一点 慢一点
11 | 一般 一般
12 | 统一 统一
13 | 星期一 星期一
14 | 星期二 星期二
15 | 星期三 星期三
16 | 星期四 星期四
17 | 星期五 星期五
18 | 星期六 星期六
19 | 一年一度 一年一度
20 | 一点一滴 一点一滴
21 | 三心二意 三心二意
22 | 阳春三月 阳春三月
23 | 七嘴八舌 七嘴八舌
24 | 四分五裂 四分五裂
25 | 七荤八素 七荤八素
26 | 三纲五常 三纲五常
27 | 三姑六婆 三姑六婆
28 | 四大皆空 四大皆空
29 | 五体投地 五体投地
30 | 六神无主 六神无主
31 | 七窍生烟 七窍生烟
32 | 七擒七纵 七擒七纵
33 | 八仙过海 八仙过海
34 | 十恶不赦 十恶不赦
35 | 一言九鼎 一言九鼎
36 | 一应俱全 一应俱全
37 | 一窍不通 一窍不通
38 | 一盘散沙 一盘散沙
39 | 十全十美 十全十美
40 | 一五一十 一五一十
41 | 让你三分 让你三分
42 | 乱七八糟 乱七八糟
43 | 一日三餐 一日三餐
44 | 十分高兴 十分高兴
45 | 十万八千里 十万八千里
46 | 四川 四川
47 | 三明 三明
48 | 九寨沟 九寨沟
49 | 七里河 七里河
50 | 九江 九江
51 | 六安 六安
52 | 十堰 十堰
53 | 八公山 八公山
54 | 七台河 七台河
55 | 五常 五常
56 | 四平 四平
57 | 四子王旗 四子王旗
58 | 三亚 三亚
59 | 二连浩特 二连浩特
60 | 零陵 零陵
61 | 五台山 五台山
62 | 六盘水 六盘水
63 | 八宿 八宿
64 | 十二五 十二五
65 | 十三五 十三五
66 | 十四五 十四五
67 | 几十万 几十万
68 | 几百万 几百万
69 | 几千万 几千万
70 | 十几万 十几万
71 | 二十几万 二十几万
72 | 三十几万 三十几万
73 | 四十几万 四十几万
74 | 五十几万 五十几万
75 | 六十几万 六十几万
76 | 七十几万 七十几万
77 | 八十几万 八十几万
78 | 九十几万 九十几万
79 | 七乘二十四小时 7x24小时
80 | 七乘二十四个小时 7x24个小时
81 | 四S店 4S店
82 | 四s店 4s店
83 |
--------------------------------------------------------------------------------
/runtime/README.md:
--------------------------------------------------------------------------------
1 | ## WeTextProcessing Runtime
2 |
3 | 1. How to build
4 |
5 | ``` bash
6 | $ cmake -B build -DCMAKE_BUILD_TYPE=Release
7 | $ cmake --build build
8 | ```
9 |
10 | On Windows:
11 | ``` bash
12 | $ cmake -DCMAKE_BUILD_TYPE=Release -B build -G "Visual Studio 17 2022" -DBUILD_SHARED_LIBS=0 -DCMAKE_CXX_FLAGS="/ZI"
13 | $ cmake --build build
14 | ```
15 |
16 | 2. How to use
17 |
18 | ``` bash
19 | # tn usage
20 | $ wget https://github.com/wenet-e2e/WeTextProcessing/releases/download/WeTextProcessing/zh_tn_tagger.fst
21 | $ wget https://github.com/wenet-e2e/WeTextProcessing/releases/download/WeTextProcessing/zh_tn_verbalizer.fst
22 | $ ./build/processor_main --tagger zh_tn_tagger.fst --verbalizer zh_tn_verbalizer.fst --text "2.5平方电线"
23 |
24 | # itn usage
25 | $ wget https://github.com/wenet-e2e/WeTextProcessing/releases/download/WeTextProcessing/zh_itn_tagger.fst
26 | $ wget https://github.com/wenet-e2e/WeTextProcessing/releases/download/WeTextProcessing/zh_itn_verbalizer.fst
27 | $ ./build/processor_main --tagger zh_itn_tagger.fst --verbalizer zh_itn_verbalizer.fst --text "二点五平方电线"
28 | ```
29 |
--------------------------------------------------------------------------------
/tn/chinese/test/fraction_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.chinese.rules.fraction import Fraction
18 | from tn.chinese.test.utils import parse_test_case
19 |
20 |
class TestFraction:
    """Data-driven checks for the Chinese TN ``Fraction`` rule.

    Every case read from ``data/fraction.txt`` is fed to pytest as a
    ``(written, spoken)`` pair; normalizing the written form must give
    exactly the spoken form.
    """

    # Built once when the class body executes and shared by all cases.
    fraction = Fraction()
    # Loaded at class-definition time so the decorator below can use it.
    fraction_cases = parse_test_case("data/fraction.txt")

    @pytest.mark.parametrize("written, spoken", fraction_cases)
    def test_fraction(self, written, spoken):
        """Check that ``Fraction.normalize(written)`` equals ``spoken``."""
        normalized = self.fraction.normalize(written)
        assert normalized == spoken
29 |
--------------------------------------------------------------------------------
/tn/chinese/test/whitelist_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.chinese.rules.whitelist import Whitelist
18 | from tn.chinese.test.utils import parse_test_case
19 |
20 |
class TestWhitelist:
    """Data-driven checks for the Chinese TN ``Whitelist`` rule.

    Every case read from ``data/whitelist.txt`` is fed to pytest as a
    ``(written, spoken)`` pair; normalizing the written form must give
    exactly the spoken form.
    """

    # Built once when the class body executes and shared by all cases.
    whitelist = Whitelist()
    # Loaded at class-definition time so the decorator below can use it.
    whitelist_cases = parse_test_case("data/whitelist.txt")

    @pytest.mark.parametrize("written, spoken", whitelist_cases)
    def test_whitelist(self, written, spoken):
        """Check that ``Whitelist.normalize(written)`` equals ``spoken``."""
        normalized = self.whitelist.normalize(written)
        assert normalized == spoken
28 |
--------------------------------------------------------------------------------
/tn/english/test/decimal_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.english.rules.decimal import Decimal
18 | from tn.english.test.utils import parse_test_case
19 |
20 |
class TestDecimal:
    """Data-driven checks for the English TN ``Decimal`` rule.

    Every case read from ``data/decimal.txt`` is fed to pytest as a
    ``(written, spoken)`` pair; normalizing the written form must give
    exactly the spoken form.
    """

    # Built once (with deterministic=False) when the class body executes and
    # shared by all cases.
    decimal = Decimal(deterministic=False)
    # Loaded at class-definition time so the decorator below can use it.
    decimal_cases = parse_test_case("data/decimal.txt")

    @pytest.mark.parametrize("written, spoken", decimal_cases)
    def test_decimal(self, written, spoken):
        """Check that ``Decimal.normalize(written)`` equals ``spoken``."""
        normalized = self.decimal.normalize(written)
        assert normalized == spoken
29 |
--------------------------------------------------------------------------------
/tn/english/test/measure_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.english.rules.measure import Measure
18 | from tn.english.test.utils import parse_test_case
19 |
20 |
class TestMeasure:
    """Data-driven tests for the English ``Measure`` rule."""

    # Rule instance and cases are created once, when the class body runs.
    measure = Measure(deterministic=False)
    measure_cases = parse_test_case("data/measure.txt")

    @pytest.mark.parametrize("written, spoken", measure_cases)
    def test_measure(self, written, spoken):
        """Normalizing ``written`` must produce exactly ``spoken``."""
        normalized = self.measure.normalize(written)
        assert normalized == spoken
29 |
--------------------------------------------------------------------------------
/tn/english/test/ordinal_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.english.rules.ordinal import Ordinal
18 | from tn.english.test.utils import parse_test_case
19 |
20 |
class TestOrdinal:
    """Data-driven tests for the English ``Ordinal`` rule."""

    # Rule instance and cases are created once, when the class body runs.
    ordinal = Ordinal(deterministic=False)
    ordinal_cases = parse_test_case("data/ordinal.txt")

    @pytest.mark.parametrize("written, spoken", ordinal_cases)
    def test_ordinal(self, written, spoken):
        """Normalizing ``written`` must produce exactly ``spoken``."""
        normalized = self.ordinal.normalize(written)
        assert normalized == spoken
29 |
--------------------------------------------------------------------------------
/.github/workflows/unittest.yml:
--------------------------------------------------------------------------------
1 | name: UnitTest
2 |
3 | on:
4 | workflow_dispatch:
5 | pull_request:
6 | paths:
7 | - '**.py'
8 |
9 | jobs:
10 | unit-test:
11 | runs-on: ubuntu-latest
12 | strategy:
13 | matrix:
14 | python-version: [3.9]
15 | steps:
16 | - uses: actions/checkout@v3
17 | - name: Set up Python ${{ matrix.python-version }}
18 | uses: actions/setup-python@v4
19 | with:
20 | python-version: ${{ matrix.python-version }}
21 | - name: Install dependencies
22 | run: |
23 | python -m pip install --upgrade pip
24 | pip install -r requirements.txt
25 | - name: Lint with flake8
26 | run: |
27 | # stop the build if there are Python syntax errors or undefined names
28 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
29 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
30 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
31 | - name: Test with pytest
32 | run: |
33 | pytest
34 |
--------------------------------------------------------------------------------
/tn/english/test/normalizer_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.english.normalizer import Normalizer
18 | from tn.english.test.utils import parse_test_case
19 |
20 |
class TestNormalizer:
    """Data-driven tests for the English ``Normalizer``."""

    # The normalizer is constructed with overwrite_cache=True once, when the
    # class body runs; the cases come from data/normalizer.txt.
    normalizer = Normalizer(overwrite_cache=True)
    cases = parse_test_case("data/normalizer.txt")

    @pytest.mark.parametrize("written, spoken", cases)
    def test_normalizer(self, written, spoken):
        """Normalizing ``written`` must produce exactly ``spoken``."""
        normalized = self.normalizer.normalize(written)
        assert normalized == spoken
29 |
--------------------------------------------------------------------------------
/tn/english/test/cardinal_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.english.rules.cardinal import Cardinal
18 | from tn.english.test.utils import parse_test_case
19 |
20 |
class TestCardinal:
    """Data-driven tests for the English ``Cardinal`` rule."""

    # Rule instance and cases are created once, when the class body runs.
    cardinal = Cardinal(deterministic=False)
    cardinal_cases = parse_test_case("data/cardinal.txt")

    @pytest.mark.parametrize("written, spoken", cardinal_cases)
    def test_cardinal(self, written, spoken):
        """Normalizing ``written`` must produce exactly ``spoken``."""
        normalized = self.cardinal.normalize(written)
        assert normalized == spoken
29 |
--------------------------------------------------------------------------------
/tn/english/test/fraction_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.english.rules.fraction import Fraction
18 | from tn.english.test.utils import parse_test_case
19 |
20 |
class TestFraction:
    """Data-driven tests for the English ``Fraction`` rule."""

    # Rule instance and cases are created once, when the class body runs.
    fraction = Fraction(deterministic=False)
    fraction_cases = parse_test_case("data/fraction.txt")

    @pytest.mark.parametrize("written, spoken", fraction_cases)
    def test_fraction(self, written, spoken):
        """Normalizing ``written`` must produce exactly ``spoken``."""
        normalized = self.fraction.normalize(written)
        assert normalized == spoken
29 |
--------------------------------------------------------------------------------
/tn/english/test/telephone_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.english.rules.telephone import Telephone
18 | from tn.english.test.utils import parse_test_case
19 |
20 |
class TestTelephone:
    """Data-driven tests for the English ``Telephone`` rule."""

    # Rule instance and cases are created once, when the class body runs.
    telephone = Telephone(deterministic=False)
    telephone_cases = parse_test_case("data/telephone.txt")

    @pytest.mark.parametrize("written, spoken", telephone_cases)
    def test_telephone(self, written, spoken):
        """Normalizing ``written`` must produce exactly ``spoken``."""
        normalized = self.telephone.normalize(written)
        assert normalized == spoken
29 |
--------------------------------------------------------------------------------
/tn/english/test/whitelist_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.english.rules.whitelist import WhiteList
18 | from tn.english.test.utils import parse_test_case
19 |
20 |
class TestWhiteList:
    """Data-driven tests for the English ``WhiteList`` rule."""

    # Rule instance and cases are created once, when the class body runs.
    whitelist = WhiteList(deterministic=False)
    whitelist_cases = parse_test_case("data/whitelist.txt")

    @pytest.mark.parametrize("written, spoken", whitelist_cases)
    def test_whitelist(self, written, spoken):
        """Normalizing ``written`` must produce exactly ``spoken``."""
        normalized = self.whitelist.normalize(written)
        assert normalized == spoken
29 |
--------------------------------------------------------------------------------
/tn/english/data/roman/README.md:
--------------------------------------------------------------------------------
1 | `female.tsv` - List of common female names. Copyright (c) January 1991 by Mark Kantrowitz, 4987 names, Version 1.3 (29-MAR-94)
2 | Source: [https://www.cs.cmu.edu/Groups/AI/areas/nlp/corpora/names/female.txt](https://www.cs.cmu.edu/Groups/AI/areas/nlp/corpora/names/female.txt)
3 |
4 | `male.tsv` - List of common male names. Copyright (c) January 1991 by Mark Kantrowitz, 2940 names, Version 1.3 (29-MAR-94)
5 | Source: [https://www.cs.cmu.edu/Groups/AI/areas/nlp/corpora/names/male.txt](https://www.cs.cmu.edu/Groups/AI/areas/nlp/corpora/names/male.txt)
6 |
7 | [Corpora Readme.txt](https://www.cs.cmu.edu/Groups/AI/areas/nlp/corpora/names/readme.txt):
8 |
9 | You may use the lists of names for any purpose, so long as credit is given
10 | in any published work. You may also redistribute the list if you
11 | provide the recipients with a copy of this README file. The lists are
12 | not in the public domain (I retain the copyright on the lists) but are
13 | freely redistributable.
14 |
15 | If you have any additions to the lists of names, I would appreciate
16 | receiving them.
17 |
18 | My email address is mkant+@cs.cmu.edu.
19 |
20 | Mark Kantrowitz
--------------------------------------------------------------------------------
/tn/english/test/electronic_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Xingchen Song (sxc19@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.english.rules.electronic import Electronic
18 | from tn.english.test.utils import parse_test_case
19 |
20 |
class TestElectronic:
    """Data-driven tests for the English ``Electronic`` rule."""

    # Rule instance and cases are created once, when the class body runs.
    electronic = Electronic(deterministic=False)
    electronic_cases = parse_test_case("data/electronic.txt")

    @pytest.mark.parametrize("written, spoken", electronic_cases)
    def test_electronic(self, written, spoken):
        """Normalizing ``written`` must produce exactly ``spoken``."""
        normalized = self.electronic.normalize(written)
        assert normalized == spoken
29 |
--------------------------------------------------------------------------------
/runtime/android/gradle.properties:
--------------------------------------------------------------------------------
1 | # Project-wide Gradle settings.
2 | # IDE (e.g. Android Studio) users:
3 | # Gradle settings configured through the IDE *will override*
4 | # any settings specified in this file.
5 | # For more details on how to configure your build environment visit
6 | # http://www.gradle.org/docs/current/userguide/build_environment.html
7 | # Specifies the JVM arguments used for the daemon process.
8 | # The setting is particularly useful for tweaking memory settings.
9 | org.gradle.jvmargs=-Xmx2048m -Dfile.encoding=UTF-8
10 | # When configured, Gradle will run in incubating parallel mode.
11 | # This option should only be used with decoupled projects. More details, visit
12 | # http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects
13 | # org.gradle.parallel=true
14 | # AndroidX package structure to make it clearer which packages are bundled with the
15 | # Android operating system, and which are packaged with your app's APK
16 | # https://developer.android.com/topic/libraries/support-library/androidx-rn
17 | android.useAndroidX=true
18 | # Automatically convert third-party libraries to use AndroidX
19 | android.enableJetifier=true
--------------------------------------------------------------------------------
/tn/english/data/whitelist/asr_with_pc.tsv:
--------------------------------------------------------------------------------
1 | Hon. honorable
2 | Mt. Mount
3 | Maj. Major
4 | Rev. Reverend
5 | # hash
6 | Gov. governor
7 | vs. versus
8 | vs versus
9 | dept. department
10 | vol volume
11 | vol. volume
12 | bldg. building
13 | Bldg. Building
14 | apt. apartment
15 | Apt. Apartment
16 | Σ sigma
17 | η eta
18 | κ kappa
19 | ω omega
20 | σ sigma
21 | α alpha
22 | ν nu
23 | δ delta
24 | ι iota
25 | _ underscore
26 | % percent
27 | & ampersand
28 | * asterisk
29 | + plus
30 | / slash
31 | = equal sign
32 | ^ circumflex
33 | { left brace
34 | | vertical bar
35 | } right brace
36 | ~ tilde
37 | ltd limited
38 | int'l international
39 | $ dollar
40 | BMW M b m w
41 | Capt. captain
42 | Co. company
43 | Col. colonel
44 | Dr. doctor
45 | Drs. doctors
46 | e.g. for example
47 | e. g. for example
48 | ES3 e s three
49 | Esq. esquire
50 | F.I f
51 | FNU f n u d s a
52 | Ft. Fort
53 | Gen. general
54 | i.e. that is
55 | Jr. junior
56 | jr. junior
57 | Jr junior
58 | jr junior
59 | Ltd. limited
60 | Lt. lieutenant
61 | Mr. mister
62 | Mrs. misses
63 | Ms. miss
64 | Sgt. sergeant
65 | S&P 500 s and p five hundred
66 | Uéda u e acute d a
67 | USMC M u s m c
68 | vs. versus
69 | _vs._ versus
70 | VTE v t eL
71 | XVAS x v a
72 |
--------------------------------------------------------------------------------
/tn/chinese/test/preprocessor_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.chinese.rules.preprocessor import PreProcessor
18 | from tn.chinese.test.utils import parse_test_case
19 |
20 |
class TestPreProcessor:
    """Data-driven tests for the Chinese ``PreProcessor`` rewrite rule."""

    # The rule and the cases are built once, when the class body runs.
    processor = PreProcessor().processor
    processor_cases = parse_test_case("data/preprocessor.txt")

    @pytest.mark.parametrize("written, spoken", processor_cases)
    def test_processor(self, written, spoken):
        """Composing ``written`` with the rule must yield exactly ``spoken``."""
        # Compose once and reuse the result: the former debug print repeated
        # the same composition for every case. On failure pytest reports the
        # value of ``rewritten`` in the assertion message.
        rewritten = (written @ self.processor).string()
        assert rewritten == spoken
30 |
--------------------------------------------------------------------------------
/tn/chinese/test/postprocessor_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.chinese.rules.postprocessor import PostProcessor
18 | from tn.chinese.test.utils import parse_test_case
19 |
20 |
class TestPostProcessor:
    """Data-driven tests for the Chinese ``PostProcessor`` rule (``tag_oov=True``)."""

    # The rule and the cases are built once, when the class body runs.
    processor = PostProcessor(tag_oov=True).processor
    processor_cases = parse_test_case("data/postprocessor.txt")

    @pytest.mark.parametrize("written, spoken", processor_cases)
    def test_processor(self, written, spoken):
        """Composing ``written`` with the rule must yield exactly ``spoken``."""
        # Compose once and reuse the result: the former debug print repeated
        # the same composition for every case. On failure pytest reports the
        # value of ``rewritten`` in the assertion message.
        rewritten = (written @ self.processor).string()
        assert rewritten == spoken
30 |
--------------------------------------------------------------------------------
/tn/chinese/test/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import os
16 |
17 |
def parse_test_case(file_name):
    """Yield ``(written, spoken)`` pairs parsed from a test-case file.

    Args:
        file_name: path of the case file, resolved relative to the directory
            that contains this module.

    Yields:
        Tuples ``(written, spoken)`` with surrounding whitespace stripped.
        ``spoken`` is ``""`` when nothing follows the delimiter.

    Raises:
        AssertionError: if a line does not contain ``=>`` exactly once.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))
    path = os.path.join(base_dir, file_name)

    delimiter = "=>"
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # locale encoding. NOTE(review): assumes the data files are UTF-8.
    with open(path, encoding="utf-8") as fin:
        for line in fin:
            assert delimiter in line, f"missing {delimiter!r} in line: {line!r}"
            arr = line.strip().split(delimiter)
            # The delimiter is present, so there are at least two parts;
            # more than two means it occurred more than once.
            assert 0 < len(arr) <= 2, f"more than one {delimiter!r} in line: {line!r}"

            written, spoken = (part.strip() for part in arr)
            yield (written, spoken)
33 |
--------------------------------------------------------------------------------
/itn/japanese/rules/preprocessor.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Logan Liu (2319277867@qq.com)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from pynini import string_file
16 |
17 | from tn.processor import Processor
18 | from tn.utils import get_abs_path
19 |
20 |
class PreProcessor(Processor):
    """Pre-processing rule for Japanese ITN.

    Optionally applies the mapping stored in ``fullwidth_to_halfwidth.tsv``
    (presumably full-width to half-width characters; the file contents are not
    visible here).
    """

    def __init__(self, full_to_half=True):
        """Build ``self.processor``.

        Args:
            full_to_half: when true, compose the rule built from the
                fullwidth-to-halfwidth mapping onto the base rule. Defaults to
                ``True`` so the signature matches the ``tn.japanese``
                counterpart; existing callers that pass the flag explicitly
                are unaffected.
        """
        super().__init__(name="preprocessor")
        # Named after what it actually holds (the previous name,
        # ``traditional2simple``, was a copy-paste leftover).
        mapping_path = get_abs_path("../itn/japanese/data/char/fullwidth_to_halfwidth.tsv")
        fullwidth2halfwidth = string_file(mapping_path)

        processor = self.build_rule("")
        if full_to_half:
            processor @= self.build_rule(fullwidth2halfwidth)

        self.processor = processor.optimize()
32 |
--------------------------------------------------------------------------------
/tn/japanese/rules/preprocessor.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Logan Liu (2319277867@qq.com)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from pynini import string_file
16 |
17 | from tn.processor import Processor
18 | from tn.utils import get_abs_path
19 |
20 |
class PreProcessor(Processor):
    """Pre-processing rule for Japanese TN.

    Optionally applies the mapping stored in ``fullwidth_to_halfwidth.tsv``
    (presumably full-width to half-width characters; the file contents are not
    visible here).
    """

    def __init__(self, full_to_half=True):
        """Build ``self.processor``; ``full_to_half`` toggles the mapping rule."""
        super().__init__(name="preprocessor")
        mapping_path = get_abs_path("japanese/data/char/fullwidth_to_halfwidth.tsv")
        fullwidth2halfwidth = string_file(mapping_path)

        rule = self.build_rule("")
        if full_to_half:
            rule @= self.build_rule(fullwidth2halfwidth)

        self.processor = rule.optimize()
32 |
--------------------------------------------------------------------------------
/tn/chinese/rules/preprocessor.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from pynini import string_file
16 |
17 | from tn.processor import Processor
18 | from tn.utils import get_abs_path
19 |
20 |
class PreProcessor(Processor):
    """Pre-processing rule for Chinese TN.

    Optionally applies the mapping stored in ``traditional_to_simple.tsv``
    (presumably traditional to simplified characters; the file contents are
    not visible here).
    """

    def __init__(self, traditional_to_simple=True):
        """Build ``self.processor``; ``traditional_to_simple`` toggles the mapping rule."""
        super().__init__(name="preprocessor")
        mapping_path = get_abs_path("chinese/data/char/traditional_to_simple.tsv")
        t2s_mapping = string_file(mapping_path)

        rule = self.build_rule("")
        if traditional_to_simple:
            rule @= self.build_rule(t2s_mapping)

        self.processor = rule.optimize()
32 |
--------------------------------------------------------------------------------
/itn/japanese/data/char/hiragana_and_katakana.tsv:
--------------------------------------------------------------------------------
1 | あ
2 | い
3 | う
4 | え
5 | お
6 | か
7 | き
8 | く
9 | け
10 | こ
11 | さ
12 | し
13 | す
14 | せ
15 | そ
16 | た
17 | ち
18 | つ
19 | て
20 | と
21 | な
22 | に
23 | ぬ
24 | ね
25 | の
26 | は
27 | ひ
28 | ふ
29 | へ
30 | ほ
31 | ま
32 | み
33 | む
34 | め
35 | も
36 | や
37 | ゆ
38 | よ
39 | ら
40 | り
41 | る
42 | れ
43 | ろ
44 | わ
45 | を
46 | ん
47 | ア
48 | イ
49 | ウ
50 | エ
51 | オ
52 | カ
53 | キ
54 | ク
55 | ケ
56 | コ
57 | サ
58 | シ
59 | ス
60 | セ
61 | ソ
62 | タ
63 | チ
64 | ツ
65 | テ
66 | ト
67 | ナ
68 | ニ
69 | ヌ
70 | ネ
71 | ノ
72 | ハ
73 | ヒ
74 | フ
75 | ヘ
76 | ホ
77 | マ
78 | ミ
79 | ム
80 | メ
81 | モ
82 | ヤ
83 | ユ
84 | ヨ
85 | ラ
86 | リ
87 | ル
88 | レ
89 | ロ
90 | ワ
91 | ヲ
92 | ン
93 | が
94 | ぎ
95 | ぐ
96 | げ
97 | ご
98 | ざ
99 | じ
100 | ず
101 | ぜ
102 | ぞ
103 | だ
104 | ぢ
105 | づ
106 | で
107 | ど
108 | ば
109 | び
110 | ぶ
111 | べ
112 | ぼ
113 | ぱ
114 | ぴ
115 | ぷ
116 | ぺ
117 | ぽ
118 | ガ
119 | ギ
120 | グ
121 | ゲ
122 | ゴ
123 | ザ
124 | ジ
125 | ズ
126 | ゼ
127 | ゾ
128 | ダ
129 | ヂ
130 | ヅ
131 | デ
132 | ド
133 | バ
134 | ビ
135 | ブ
136 | ベ
137 | ボ
138 | パ
139 | ピ
140 | プ
141 | ペ
142 | ポ
143 | ャ
144 | ァ
145 | ィ
146 | ュ
147 | ッ
148 | ゥ
149 | ェ
150 | ョ
151 | ォ
152 | ぁ
153 | ぃ
154 | ぅ
155 | ぇ
156 | ぉ
157 | っ
158 | ゃ
159 | ゅ
160 | ょ
--------------------------------------------------------------------------------
/itn/japanese/rules/whitelist.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Logan Liu (2319277867@qq.com)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from pynini import string_file
16 | from pynini.lib.pynutil import insert
17 |
18 | from tn.processor import Processor
19 | from tn.utils import get_abs_path
20 |
21 |
class Whitelist(Processor):
    """Tagger for the Japanese ITN whitelist loaded from ``whitelist.tsv``."""

    def __init__(self):
        super().__init__(name="whitelist")
        self.build_tagger()
        # build_verbalizer is not defined in this class; presumably inherited
        # from Processor.
        self.build_verbalizer()

    def build_tagger(self):
        """Wrap each whitelist mapping in ``value: "..."`` and store it as ``self.tagger``."""
        tsv_path = get_abs_path("../itn/japanese/data/default/whitelist.tsv")
        entries = string_file(tsv_path)

        field_open = insert('value: "')
        field_close = insert('"')
        self.tagger = self.add_tokens(field_open + entries + field_close)
34 |
--------------------------------------------------------------------------------
/tn/japanese/data/char/hiragana_and_katakana.tsv:
--------------------------------------------------------------------------------
1 | あ
2 | い
3 | う
4 | え
5 | お
6 | か
7 | き
8 | く
9 | け
10 | こ
11 | さ
12 | し
13 | す
14 | せ
15 | そ
16 | た
17 | ち
18 | つ
19 | て
20 | と
21 | な
22 | に
23 | ぬ
24 | ね
25 | の
26 | は
27 | ひ
28 | ふ
29 | へ
30 | ほ
31 | ま
32 | み
33 | む
34 | め
35 | も
36 | や
37 | ゆ
38 | よ
39 | ら
40 | り
41 | る
42 | れ
43 | ろ
44 | わ
45 | を
46 | ん
47 | ア
48 | イ
49 | ウ
50 | エ
51 | オ
52 | カ
53 | キ
54 | ク
55 | ケ
56 | コ
57 | サ
58 | シ
59 | ス
60 | セ
61 | ソ
62 | タ
63 | チ
64 | ツ
65 | テ
66 | ト
67 | ナ
68 | ニ
69 | ヌ
70 | ネ
71 | ノ
72 | ハ
73 | ヒ
74 | フ
75 | ヘ
76 | ホ
77 | マ
78 | ミ
79 | ム
80 | メ
81 | モ
82 | ヤ
83 | ユ
84 | ヨ
85 | ラ
86 | リ
87 | ル
88 | レ
89 | ロ
90 | ワ
91 | ヲ
92 | ン
93 | が
94 | ぎ
95 | ぐ
96 | げ
97 | ご
98 | ざ
99 | じ
100 | ず
101 | ぜ
102 | ぞ
103 | だ
104 | ぢ
105 | づ
106 | で
107 | ど
108 | ば
109 | び
110 | ぶ
111 | べ
112 | ぼ
113 | ぱ
114 | ぴ
115 | ぷ
116 | ぺ
117 | ぽ
118 | ガ
119 | ギ
120 | グ
121 | ゲ
122 | ゴ
123 | ザ
124 | ジ
125 | ズ
126 | ゼ
127 | ゾ
128 | ダ
129 | ヂ
130 | ヅ
131 | デ
132 | ド
133 | バ
134 | ビ
135 | ブ
136 | ベ
137 | ボ
138 | パ
139 | ピ
140 | プ
141 | ペ
142 | ポ
143 | ャ
144 | ァ
145 | ィ
146 | ュ
147 | ッ
148 | ゥ
149 | ェ
150 | ョ
151 | ォ
152 | ぁ
153 | ぃ
154 | ぅ
155 | ぇ
156 | ぉ
157 | っ
158 | ゃ
159 | ゅ
160 | ょ
--------------------------------------------------------------------------------
/itn/chinese/rules/whitelist.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Xingchen Song (sxc19@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from pynini import string_file
16 | from pynini.lib.pynutil import insert
17 |
18 | from tn.processor import Processor
19 | from tn.utils import get_abs_path
20 |
21 |
class Whitelist(Processor):
    """Tagger for the Chinese ITN whitelist loaded from ``whitelist.tsv``."""

    def __init__(self):
        super().__init__(name="whitelist")
        self.build_tagger()
        # build_verbalizer is not defined in this class; presumably inherited
        # from Processor.
        self.build_verbalizer()

    def build_tagger(self):
        """Wrap each whitelist mapping in ``value: "..."`` and store it as ``self.tagger``."""
        tsv_path = get_abs_path("../itn/chinese/data/default/whitelist.tsv")
        entries = string_file(tsv_path)

        field_open = insert('value: "')
        field_close = insert('"')
        self.tagger = self.add_tokens(field_open + entries + field_close)
34 |
--------------------------------------------------------------------------------
/itn/japanese/rules/ordinal.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Logan Liu (2319277867@qq.com)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from pynini import accep
16 | from pynini.lib.pynutil import insert
17 |
18 | from itn.japanese.rules.cardinal import Cardinal
19 | from tn.processor import Processor
20 |
21 |
class Ordinal(Processor):
    """Tagger for Japanese ordinals: a cardinal followed by 番目, or 第 followed by a cardinal."""

    def __init__(self):
        super().__init__(name="ordinal")
        self.build_tagger()
        # build_verbalizer is not defined in this class; presumably inherited
        # from Processor.
        self.build_verbalizer()

    def build_tagger(self):
        """Build ``self.tagger`` from the two ordinal patterns wrapped in ``value: "..."``."""
        number = Cardinal().number
        suffixed = number + accep("番目")
        prefixed = accep("第") + number

        wrapped = insert('value: "') + (suffixed | prefixed) + insert('"')
        self.tagger = self.add_tokens(wrapped)
34 |
--------------------------------------------------------------------------------
/itn/japanese/test/data/normalizer_disable_standalone_number_disable_0_to_9.txt:
--------------------------------------------------------------------------------
1 | 一 => 一
2 | 四 => 四
3 | 十 => 十
4 | 十四 => 十四
5 | 四十四 => 四十四
6 | 四十 => 四十
7 | 百一 => 百一
8 | 百十二 => 百十二
9 | 四百四 => 四百四
10 | 九千百二十三 => 九千百二十三
11 | 一千二百三十四 => 一千二百三十四
12 | 五千六百七十八 => 五千六百七十八
13 | 二千二十 => 二千二十
14 | 二千二 => 二千二
15 | 二千十 => 二千十
16 | 二千百 => 二千百
17 | 九千 => 九千
18 | 九千二 => 九千二
19 | 十 => 十
20 | 百 => 百
21 | 千 => 千
22 | 万 => 万
23 | 兆 => 兆
24 | 千百 => 千百
25 | 千三百 => 千三百
26 | 千三百十 => 千三百十
27 | 千十 => 千十
28 | 千二十 => 千二十
29 | 千二十一 => 千二十一
30 | 千一 => 千一
31 | 千百十 => 千百十
32 | 千百一 => 千百一
33 | マイナス百十二 => マイナス百十二
34 | プラス百十二 => プラス百十二
35 | 二十万二 => 二十万二
36 | 一万二 => 一万二
37 | 二十万二千百 => 二十万二千百
38 | 四百万 => 四百万
39 | 四百四万 => 四百四万
40 | 五千万 => 五千万
41 | 二万 => 二万
42 | 一億五千万 => 一億五千万
43 | 一億五万 => 一億五万
44 | 一億一百万 => 一億一百万
45 | 一億一千万 => 一億一千万
46 | 二千億一千万 => 二千億一千万
47 | 二千億 => 二千億
48 | 二兆二億 => 二兆二億
49 | 二兆二千億 => 二兆二千億
50 | 二兆二千万 => 二兆二千万
51 | 二兆二百万 => 二兆二百万
52 | 一兆三百二十万五千 => 一兆三百二十万五千
53 | 二兆三十 => 二兆三十
54 | 二兆百 => 二兆百
55 | 二十兆百 => 二十兆百
56 | 一九二点一六八点零点一 => 192.168.0.1
57 | 一二三四五六七八九 => 123456789
58 | マイナス五百六十七 => マイナス五百六十七
59 | 四十四平方メートル => 44m²
60 | 四十四キログラム => 44kg
61 | 四部 => 四部
62 | 四円 => 四円
63 | 四十四部 => 44部
64 | 四十四匹 => 44匹
65 | 四分の三 => 3/4
66 | 四十四分の三 => 3/44
67 | 四十四パーセント => 44%
68 | 一時三十分三秒 => 1時30分3秒
69 | 八メガ秒 => 八メガ秒
70 | 一マイナス二プラス三十 => 1-2+30
71 | 一月 => 一月
72 | 一日 => 一日
--------------------------------------------------------------------------------
/itn/japanese/test/data/normalizer_disable_standalone_number_enable_0_to_9.txt:
--------------------------------------------------------------------------------
1 | 一 => 一
2 | 四 => 四
3 | 十 => 十
4 | 十四 => 十四
5 | 四十四 => 四十四
6 | 四十 => 四十
7 | 百一 => 百一
8 | 百十二 => 百十二
9 | 四百四 => 四百四
10 | 九千百二十三 => 九千百二十三
11 | 一千二百三十四 => 一千二百三十四
12 | 五千六百七十八 => 五千六百七十八
13 | 二千二十 => 二千二十
14 | 二千二 => 二千二
15 | 二千十 => 二千十
16 | 二千百 => 二千百
17 | 九千 => 九千
18 | 九千二 => 九千二
19 | 十 => 十
20 | 百 => 百
21 | 千 => 千
22 | 万 => 万
23 | 兆 => 兆
24 | 千百 => 千百
25 | 千三百 => 千三百
26 | 千三百十 => 千三百十
27 | 千十 => 千十
28 | 千二十 => 千二十
29 | 千二十一 => 千二十一
30 | 千一 => 千一
31 | 千百十 => 千百十
32 | 千百一 => 千百一
33 | マイナス百十二 => マイナス百十二
34 | プラス百十二 => プラス百十二
35 | 二十万二 => 二十万二
36 | 一万二 => 一万二
37 | 二十万二千百 => 二十万二千百
38 | 四百万 => 四百万
39 | 四百四万 => 四百四万
40 | 五千万 => 五千万
41 | 二万 => 二万
42 | 一億五千万 => 一億五千万
43 | 一億五万 => 一億五万
44 | 一億一百万 => 一億一百万
45 | 一億一千万 => 一億一千万
46 | 二千億一千万 => 二千億一千万
47 | 二千億 => 二千億
48 | 二兆二億 => 二兆二億
49 | 二兆二千億 => 二兆二千億
50 | 二兆二千万 => 二兆二千万
51 | 二兆二百万 => 二兆二百万
52 | 一兆三百二十万五千 => 一兆三百二十万五千
53 | 二兆三十 => 二兆三十
54 | 二兆百 => 二兆百
55 | 二十兆百 => 二十兆百
56 | 一九二点一六八点零点一 => 192.168.0.1
57 | 一二三四五六七八九 => 123456789
58 | マイナス五百六十七 => マイナス五百六十七
59 | 四十四平方メートル => 44m²
60 | 四十四キログラム => 44kg
61 | 四部 => 4部
62 | 四円 => 4円
63 | 四十四部 => 44部
64 | 四十四匹 => 44匹
65 | 四分の三 => 3/4
66 | 四十四分の三 => 3/44
67 | 四十四パーセント => 44%
68 | 一時三十分三秒 => 1時30分3秒
69 | 八メガ秒 => 8ms
70 | 一マイナス二プラス三十 => 1-2+30
71 | 一月 => 1月
72 | 一日 => 1日
--------------------------------------------------------------------------------
/tn/japanese/rules/transliteration.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2025 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from pynini import string_file
16 | from pynini.lib.pynutil import insert
17 |
18 | from tn.processor import Processor
19 | from tn.utils import get_abs_path
20 |
21 |
class Transliteration(Processor):
    """Tagger driven entirely by the transliteration.tsv mapping table."""

    def __init__(self):
        super().__init__(name="transliteration")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Load the TSV mapping and wrap its output in a ``value: "..."`` field."""
        table_path = get_abs_path("japanese/data/pyopenjtalk/transliteration.tsv")
        mapping = string_file(table_path)
        self.tagger = self.add_tokens(insert('value: "') + mapping + insert('"'))
34 |
--------------------------------------------------------------------------------
/itn/japanese/test/data/normalizer_enable_standalone_number_disable_0_to_9.txt:
--------------------------------------------------------------------------------
1 | 一 => 一
2 | 四 => 四
3 | 十 => 10
4 | 十四 => 14
5 | 四十四 => 44
6 | 四十 => 40
7 | 百一 => 101
8 | 百十二 => 112
9 | 四百四 => 404
10 | 九千百二十三 => 9123
11 | 一千二百三十四 => 1234
12 | 五千六百七十八 => 5678
13 | 二千二十 => 2020
14 | 二千二 => 2002
15 | 二千十 => 2010
16 | 二千百 => 2100
17 | 九千 => 9000
18 | 九千二 => 9002
19 | 十 => 10
20 | 百 => 100
21 | 千 => 1000
22 | 万 => 万
23 | 兆 => 兆
24 | 千百 => 1100
25 | 千三百 => 1300
26 | 千三百十 => 1310
27 | 千十 => 1010
28 | 千二十 => 1020
29 | 千二十一 => 1021
30 | 千一 => 1001
31 | 千百十 => 1110
32 | 千百一 => 1101
33 | マイナス百十二 => -112
34 | プラス百十二 => +112
35 | 二十万二 => 200002
36 | 一万二 => 10002
37 | 二十万二千百 => 202100
38 | 四百万 => 400万
39 | 四百四万 => 404万
40 | 五千万 => 5000万
41 | 二万 => 20000
42 | 一億五千万 => 1億5000万
43 | 一億五万 => 1億5万
44 | 一億一百万 => 1億100万
45 | 一億一千万 => 1億1000万
46 | 二千億一千万 => 2000億1000万
47 | 二千億 => 2000億
48 | 二兆二億 => 2兆2億
49 | 二兆二千億 => 2兆2000億
50 | 二兆二千万 => 2兆2000万
51 | 二兆二百万 => 2兆200万
52 | 一兆三百二十万五千 => 1兆320万5000
53 | 二兆三十 => 2兆30
54 | 二兆百 => 2兆100
55 | 二十兆百 => 20兆100
56 | 一九二点一六八点零点一 => 192.168.0.1
57 | 一二三四五六七八九 => 123456789
58 | マイナス五百六十七 => -567
59 | 四十四平方メートル => 44m²
60 | 四十四キログラム => 44kg
61 | 四十四部 => 44部
62 | 四十四匹 => 44匹
63 | 四分の三 => 3/4
64 | 四十四分の三 => 3/44
65 | 四十四パーセント => 44%
66 | 一時三十分三秒 => 1時30分3秒
67 | 八メガ秒 => 八メガ秒
68 | 一マイナス二プラス三十 => 1-2+30
69 | 一月 => 一月
70 | 一日 => 一日
--------------------------------------------------------------------------------
/itn/chinese/rules/postprocessor.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | # Copyright (c) 2023 Xingchen Song (sxc19@mails.tsinghua.edu.cn)
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | from pynini import string_file
17 | from pynini.lib.pynutil import delete
18 |
19 | from tn.processor import Processor
20 | from tn.utils import get_abs_path
21 |
22 |
class PostProcessor(Processor):
    """Post-processing stage that can delete blacklisted strings from the text."""

    def __init__(self, remove_interjections=True):
        """Build ``self.processor``.

        Args:
            remove_interjections: when true, compose a rule that deletes every
                entry of blacklist.tsv; otherwise ``self.VSIGMA`` is used as is.
        """
        super().__init__(name="postprocessor")
        blacklist_path = get_abs_path("../itn/chinese/data/default/blacklist.tsv")
        blacklist = string_file(blacklist_path)

        pipeline = self.VSIGMA
        if remove_interjections:
            removal_rule = self.build_rule(delete(blacklist))
            pipeline = pipeline @ removal_rule
        self.processor = pipeline
33 |
--------------------------------------------------------------------------------
/runtime/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
2 |
3 | project(wetextprocessing VERSION 0.1)
4 | set(CMAKE_CXX_STANDARD 14)
5 |
6 | set(CMAKE_VERBOSE_MAKEFILE OFF)
7 | option(BUILD_TESTING "whether to build unit test" OFF)
8 |
9 | include(FetchContent)
10 | set(FETCHCONTENT_QUIET OFF)
11 | get_filename_component(fc_base "fc_base-${CMAKE_CXX_COMPILER_ID}" REALPATH BASE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
12 | set(FETCHCONTENT_BASE_DIR ${fc_base})
13 | list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
14 |
if(NOT MSVC)
  # Keep the same with openfst, -fPIC or -fpic
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14 -pthread -fPIC")
else()
  # For windows, please use unicode(3 bytes per chinese char) instead of gbk(2 bytes per chinese char).
  # https://github.com/wenet-e2e/wenet/issues/882#issuecomment-1101246299
  set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
  # Pass /utf-8 only when the C++ compiler is MSVC. The condition must be a
  # complete generator expression; an empty "$<$:...>" is a configure error.
  add_compile_options("$<$<CXX_COMPILER_ID:MSVC>:/utf-8>")
endif()
24 |
25 | if(${CMAKE_SYSTEM_NAME} STREQUAL "Darwin")
26 | set(CMAKE_MACOSX_RPATH 1)
27 | endif()
28 |
29 | include(openfst)
30 | include_directories(${PROJECT_SOURCE_DIR})
31 |
32 | add_subdirectory(utils)
33 | add_subdirectory(processor)
34 | add_subdirectory(bin)
35 |
36 | if(BUILD_TESTING)
37 | include(gtest)
38 | add_subdirectory(test)
39 | endif()
40 |
--------------------------------------------------------------------------------
/tn/chinese/test/data/normalizer.txt:
--------------------------------------------------------------------------------
1 | 苹果宣布发布新IPHONE => 苹果宣布发布新IPHONE
2 | 他说:“我们已经吃过了!”。 => 他说:"我们已经吃过了!".
3 | 呃这个呃啊我不知道 => 这个我不知道
4 | 共465篇,约315万字 => 共四百六十五篇,约三百一十五万字
5 | 共计6.42万人 => 共计六点四二万人
6 | 同比升高0.6个百分点 => 同比升高零点六个百分点
7 | 总量的1/5以上 => 总量的五分之一以上
8 | 相当于头发丝的1/16 => 相当于头发丝的十六分之一
9 | 3/2是一个假分数 => 二分之三是一个假分数
10 | 同比增长6.3% => 同比增长百分之六点三
11 | 增幅0.4% => 增幅百分之零点四
12 | 2002/01/28 => 二零零二年一月二十八日
13 | 2002-01-28 => 二零零二年一月二十八日
14 | 2002.01.28 => 二零零二年一月二十八日
15 | 2002/01 => 二零零二年一月
16 | 8月16号12:00之前 => 八月十六号十二点之前
17 | 我是5:02开始的 => 我是五点零二分开始的
18 | 于5:35:36发射 => 于五点三十五分三十六秒发射
19 | 8:00 a.m.准时开会 => 上午八点准时开会
20 | 比分定格在78:96 => 比分定格在七十八比九十六
21 | 计算-2的绝对值是2 => 计算负二的绝对值是二
22 | ±2的平方都是4 => 正负二的平方都是四
23 | 价格是¥13.5 => 价格是十三点五元
24 | 价格是$13.5 => 价格是十三点五美元
25 | 价格是A$13.5 => 价格是十三点五澳元
26 | 价格是HKD13.5 => 价格是十三点五港元
27 | 重达25kg => 重达二十五千克
28 | 最高气温38°C => 最高气温三十八摄氏度
29 | 实际面积120m² => 实际面积一百二十平方米
30 | 渲染速度10ms一帧 => 渲染速度十毫秒一帧
31 | 可以打我手机13501234567 => 可以打我手机幺三五零幺二三四五六七
32 | 可以拨打12306来咨询 => 可以拨打幺二三零六来咨询
33 | 这儿有只鸟儿 => 这有只鸟
34 | 这事儿好办 => 这事好办
35 | 我儿子喜欢这地儿 => 我儿子喜欢这地
36 | O2O => O to O
37 | B2B => B to B
38 | 我们안녕 => 我们안녕
39 | 雪の花 => 雪の花
40 | 给12315打个电话 => 给幺二三幺五打个电话
41 | 人均200以内 => 人均两百以内
42 | 当场票数≥100万 => 当场票数大于等于一百万
43 | 独得300w张 => 独得三百万张
44 | 面积是10km² => 面积是十平方千米
45 | 仅仅是2015年 => 仅仅是二零一五年
46 | 包含3000余件 => 包含三千余件
47 | 查处450余名 => 查处四百五十余名
48 | 查处450余名 => 查处四百五十余名
49 |
--------------------------------------------------------------------------------
/tn/japanese/rules/whitelist.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Logan Liu (2319277867@qq.com)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from pynini import string_file
16 | from pynini.lib.pynutil import insert
17 |
18 | from tn.processor import Processor
19 | from tn.utils import get_abs_path
20 |
21 |
class Whitelist(Processor):
    """Tagger that rewrites entries listed in the default whitelist.tsv."""

    def __init__(self):
        super().__init__(name="whitelist")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Load the whitelist table and emit its output as a ``value`` field."""
        entries = string_file(get_abs_path("japanese/data/default/whitelist.tsv"))
        self.tagger = self.add_tokens(insert('value: "') + entries + insert('"'))

    def build_verbalizer(self):
        """Use the verbalizer provided by ``Processor`` without changes."""
        super().build_verbalizer()
37 |
--------------------------------------------------------------------------------
/tn/chinese/test/cardinal_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import pytest
16 |
17 | from tn.chinese.rules.cardinal import Cardinal
18 | from tn.chinese.test.utils import parse_test_case
19 |
20 |
class TestCardinal:
    """Data-driven tests for the Chinese TN ``Cardinal`` rule."""

    # Shared across all parametrized cases; built once at collection time.
    cardinal = Cardinal()
    number_cases = parse_test_case("data/number.txt")
    cardinal_cases = parse_test_case("data/cardinal.txt")

    @pytest.mark.parametrize("written, spoken", number_cases)
    def test_number(self, written, spoken):
        """Compose the input with the bare ``number`` grammar and compare the result."""
        composed = written @ self.cardinal.number
        actual = composed.string()
        assert actual == spoken

    @pytest.mark.parametrize("written, spoken", cardinal_cases)
    def test_cardinal(self, written, spoken):
        """Run the full rule through ``normalize`` and compare the result."""
        actual = self.cardinal.normalize(written)
        assert actual == spoken
35 |
--------------------------------------------------------------------------------
/itn/chinese/rules/math.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Xingchen Song (sxc19@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from pynini import string_file
16 | from pynini.lib.pynutil import insert
17 |
18 | from itn.chinese.rules.cardinal import Cardinal
19 | from tn.processor import Processor
20 | from tn.utils import get_abs_path
21 |
22 |
class Math(Processor):
    """Tagger for expressions of the form number (operator number)+."""

    def __init__(self):
        super().__init__(name="math")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build ``self.tagger``: a number followed by one or more operator/number pairs."""
        operator_path = get_abs_path("../itn/chinese/data/math/operator.tsv")
        operator = string_file(operator_path)
        number = Cardinal().number

        # `.plus` requires at least one operator, so a lone number is not matched here.
        expression = number + (operator + number).plus
        self.tagger = self.add_tokens(insert('value: "') + expression + insert('"'))
37 |
--------------------------------------------------------------------------------
/tn/japanese/rules/math.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Logan Liu (2319277867@qq.com)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from pynini import string_file
16 | from pynini.lib.pynutil import delete, insert
17 |
18 | from tn.japanese.rules.cardinal import Cardinal
19 | from tn.processor import Processor
20 | from tn.utils import get_abs_path
21 |
22 |
class Math(Processor):
    """Tagger for a number optionally followed by operator/number pairs."""

    def __init__(self):
        super().__init__(name="math")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build ``self.tagger``; single spaces around an operator are deleted."""
        operator = string_file(get_abs_path("japanese/data/math/operator.tsv"))
        number = Cardinal().number

        # `.star` allows zero repetitions, so a bare number is accepted as well.
        operation = delete(" ").ques + operator + delete(" ").ques + number
        expression = number + operation.star
        self.tagger = self.add_tokens(insert('value: "') + expression + insert('"'))
37 |
--------------------------------------------------------------------------------
/runtime/utils/wetext_string.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
#ifndef UTILS_WETEXT_STRING_H_
#define UTILS_WETEXT_STRING_H_

#include <string>
#include <vector>

namespace wetext {
// Characters treated as whitespace; defined in the implementation file.
extern const char* WHITESPACE;

// Length of one UTF-8 character given `ch`.
// NOTE(review): presumably `ch` is the leading byte and the result is in
// bytes - confirm against the implementation.
int UTF8CharLength(char ch);

// Number of UTF-8 characters (not bytes) in `str`.
int UTF8StringLength(const std::string& str);

// Splits `str` into its UTF-8 characters, one string per character, written
// to `*chars`.
void SplitUTF8StringToChars(const std::string& str,
                            std::vector<std::string>* chars);

// Returns a trimmed copy of `str`.
// NOTE(review): presumably strips leading whitespace only - confirm.
std::string Ltrim(const std::string& str);

// Returns a trimmed copy of `str`.
// NOTE(review): presumably strips trailing whitespace only - confirm.
std::string Rtrim(const std::string& str);

// Returns a copy of `str` with leading and trailing whitespace removed.
std::string Trim(const std::string& str);

// Splits `str` on the delimiter string `delim` and writes the pieces to
// `*output`.
void Split(const std::string& str, const std::string& delim,
           std::vector<std::string>* output);

}  // namespace wetext

#endif  // UTILS_WETEXT_STRING_H_
43 |
--------------------------------------------------------------------------------
/itn/japanese/rules/math.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Logan Liu (2319277867@qq.com)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from pynini import string_file
16 | from pynini.lib.pynutil import insert
17 |
18 | from itn.japanese.rules.cardinal import Cardinal
19 | from tn.processor import Processor
20 | from tn.utils import get_abs_path
21 |
22 |
class Math(Processor):
    """Tagger for expressions of the form number (operator number)+.

    A "number" here is either ``Cardinal.big_integer`` or ``Cardinal.decimal``.
    """

    def __init__(self):
        super().__init__(name="math")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build ``self.tagger``: a number followed by one or more operator/number pairs."""
        operator = string_file(get_abs_path("../itn/japanese/data/math/operator.tsv"))

        # Build the Cardinal grammar once and read both attributes from it,
        # instead of constructing a second identical instance for `decimal`.
        cardinal = Cardinal()
        number = cardinal.big_integer | cardinal.decimal

        # `.plus` requires at least one operator, so a lone number is not matched here.
        tagger = number + (operator + number).plus
        tagger = insert('value: "') + tagger + insert('"')
        self.tagger = self.add_tokens(tagger)
39 |
--------------------------------------------------------------------------------
/tn/japanese/data/measure/units_en.tsv:
--------------------------------------------------------------------------------
1 | g グラム
2 | kg キログラム
3 | mg ミリグラム
4 | µg マイクログラム
5 | oz オンス
6 | t トン
7 | lb ポンド
8 | cm センチメートル
9 | m メートル
10 | km キロメートル
11 | dm デシメートル
12 | mm ミリメートル
13 | μm マイクロメートル
14 | nm ナノメートル
15 | ft フィート
16 | h 時
17 | hour 時
18 | min 分
19 | sec 秒
20 | s 秒
21 | ms ミリ秒
22 | °C 摂氏
23 | ℃ 摂氏
24 | °F 華氏
25 | mm² 平方ミリメートル
26 | cm² 平方センチメートル
27 | m² 平方メートル
28 | ㎡ 平方メートル
29 | km² 平方キロメートル
30 | ha ヘクタール
31 | ml ミリリットル
32 | L リットル
33 | mm³ 立方ミリメートル
34 | cm³ 立方センチメートル
35 | gal ガロン
36 | m³ 立方メートル
37 | mol モル
38 | μmol マイクロモル
39 | nmol ナノモル
40 | mmol ミリモル
41 | cd カンデラ
42 | Lm ルーメン
43 | Lux ルクス
44 | lm ルーメン
45 | fpm フィート毎分
46 | fph フィート毎時
47 | fps フィート毎秒
48 | mpm マイル毎分
49 | ips インチ毎秒
50 | ipm インチ毎分
51 | mph マイル毎時
52 | mps マイル毎秒
53 | in インチ
54 | mi マイル
55 | s² 毎秒毎秒
56 | s³ 毎秒毎秒毎秒
57 | kn ノット
58 | ° 度
59 | ' 分
60 | Pa パスカル
61 | N/m² パスカル
62 | pz ピエーズ
63 | N ニュートン
64 | mmHg 水銀柱ミリメートル
65 | hPa ヘクトパスカル
66 | MPa メガパスカル
67 | mbar ミリバール
68 | bar バール
69 | J ジュール
70 | kcal キロカロリー
71 | cal カロリー
72 | KCal キロカロリー
73 | Cal カロリー
74 | W ワット
75 | kWh キロワット時
76 | kW キロワット
77 | J·s ジュール秒
78 | A アンペア
79 | V ボルト
80 | Ω オーム
81 | A/m アンペア毎メートル
82 | Wb ウェーバ
83 | mAh ミリアンペアアワー
84 | Pa·s パスカル秒
85 | Bq ベクレル
86 | Gy グレイ
87 | rad ラド
88 | Sv シーベルト
89 | rem レム
90 | kat カタール
91 | Np ネーパ
92 | Hz ヘルツ
93 | dB デシベル
94 | hz ヘルツ
95 | bit ビット
96 | Byte バイト
97 | byte バイト
98 | MB メガバイト
99 | KB キロバイト
100 | GB ギガバイト
101 | TB テラバイト
--------------------------------------------------------------------------------
/itn/chinese/data/measure/units_zh.tsv:
--------------------------------------------------------------------------------
1 | 年来
2 | 年前
3 | 年后
4 | 年内
5 | 年之前
6 | 年之后
7 | 人
8 | 篇
9 | 帧
10 | 把
11 | 封
12 | 艘
13 | 套
14 | 段
15 | 匹
16 | 张
17 | 座
18 | 回
19 | 场
20 | 尾
21 | 条
22 | 个
23 | 首
24 | 阙
25 | 阵
26 | 网
27 | 炮
28 | 顶
29 | 丘
30 | 棵
31 | 只
32 | 支
33 | 袭
34 | 辆
35 | 挑
36 | 担
37 | 颗
38 | 壳
39 | 窠
40 | 曲
41 | 墙
42 | 群
43 | 腔
44 | 砣
45 | 座
46 | 客
47 | 贯
48 | 扎
49 | 捆
50 | 刀
51 | 令
52 | 手
53 | 罗
54 | 坡
55 | 山
56 | 岭
57 | 江
58 | 溪
59 | 钟
60 | 队
61 | 单
62 | 双
63 | 对
64 | 口
65 | 头
66 | 脚
67 | 板
68 | 跳
69 | 枝
70 | 件
71 | 贴
72 | 针
73 | 线
74 | 管
75 | 名
76 | 位
77 | 身
78 | 堂
79 | 课
80 | 本
81 | 页
82 | 家
83 | 户
84 | 层
85 | 丝
86 | 毫
87 | 厘
88 | 分
89 | 钱
90 | 斤
91 | 担
92 | 铢
93 | 石
94 | 钧
95 | 锱
96 | 忽
97 | 克
98 | 毫
99 | 厘
100 | 寸
101 | 尺
102 | 丈
103 | 里
104 | 寻
105 | 常
106 | 铺
107 | 程
108 | 米
109 | 撮
110 | 勺
111 | 合
112 | 升
113 | 斗
114 | 石
115 | 盘
116 | 碗
117 | 碟
118 | 叠
119 | 桶
120 | 笼
121 | 盆
122 | 盒
123 | 杯
124 | 钟
125 | 斛
126 | 锅
127 | 簋
128 | 篮
129 | 盘
130 | 桶
131 | 罐
132 | 瓶
133 | 壶
134 | 卮
135 | 盏
136 | 箩
137 | 箱
138 | 煲
139 | 啖
140 | 袋
141 | 钵
142 | 季
143 | 年
144 | 月
145 | 日
146 | 刻
147 | 时
148 | 周
149 | 天
150 | 秒
151 | 旬
152 | 纪
153 | 岁
154 | 世
155 | 更
156 | 夜
157 | 春
158 | 夏
159 | 秋
160 | 冬
161 | 代
162 | 伏
163 | 辈
164 | 丸
165 | 泡
166 | 粒
167 | 颗
168 | 幢
169 | 堆
170 | 条
171 | 根
172 | 支
173 | 道
174 | 面
175 | 片
176 | 张
177 | 颗
178 | 块
179 | 架
180 | 角
181 | 毛
182 | 字
183 | 元
184 | 两
185 | 两米饭
186 | 两酒
187 | 吨
188 | 顿
189 | 牛
190 | 次
191 | 号
192 |
--------------------------------------------------------------------------------
/tn/chinese/test/normalizer_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from itertools import chain
16 |
17 | import pytest
18 |
19 | from tn.chinese.normalizer import Normalizer
20 | from tn.chinese.test.utils import parse_test_case
21 |
22 |
class TestNormalizer:
    """End-to-end tests for the Chinese text normalizer over all case files."""

    # Shared across all parametrized cases; built once at collection time.
    normalizer = Normalizer(overwrite_cache=True, tag_oov=True)

    # Every case file is parsed and the results are concatenated in this order.
    normalizer_cases = chain.from_iterable(
        map(
            parse_test_case,
            (
                "data/cardinal.txt",
                "data/char.txt",
                "data/date.txt",
                "data/fraction.txt",
                "data/math.txt",
                "data/money.txt",
                "data/time.txt",
                "data/whitelist.txt",
                "data/normalizer.txt",
            ),
        )
    )

    @pytest.mark.parametrize("written, spoken", normalizer_cases)
    def test_normalizer(self, written, spoken):
        """Normalizing the written form must yield the expected spoken form."""
        actual = self.normalizer.normalize(written)
        assert actual == spoken
42 |
--------------------------------------------------------------------------------
/tn/chinese/rules/math.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from pynini import cross, string_file
16 | from pynini.lib.pynutil import delete, insert
17 |
18 | from tn.chinese.rules.cardinal import Cardinal
19 | from tn.processor import Processor
20 | from tn.utils import get_abs_path
21 |
22 |
class Math(Processor):
    """Tagger for math expressions and for operators that appear on their own."""

    def __init__(self):
        super().__init__(name="math")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build ``self.tagger``.

        Matches either a number followed by zero or more (operator-or-symbol,
        number) pairs, with optional single spaces around the operator removed,
        or a lone entry from operator.tsv.
        """
        operator = string_file(get_abs_path("chinese/data/math/operator.tsv"))
        # When it appears alone, it is treated as punctuation
        symbols = (
            cross("~", "到")
            | cross(":", "比")
            | cross("<", "小于")
            | cross(">", "大于")
        )
        number = Cardinal().number

        connector = delete(" ").ques + (operator | symbols) + delete(" ").ques
        expression = number + (connector + number).star
        # Only the operators from the table are accepted standalone, not `symbols`.
        expression = expression | operator
        self.tagger = self.add_tokens(insert('value: "') + expression + insert('"'))
40 |
--------------------------------------------------------------------------------
/tn/japanese/test/normalizer_test.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Logan Liu (2319277867@qq.com)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from itertools import chain
16 |
17 | import pytest
18 |
19 | from tn.japanese.normalizer import Normalizer
20 | from tn.japanese.test.utils import parse_test_case
21 |
22 |
class TestNormalizer:
    """End-to-end tests for the Japanese text normalizer over all case files."""

    # Shared across all parametrized cases; built once at collection time.
    normalizer = Normalizer(overwrite_cache=True)

    normalizer_cases = chain(
        parse_test_case("data/cardinal.txt"),
        parse_test_case("data/char.txt"),
        parse_test_case("data/date.txt"),
        parse_test_case("data/fraction.txt"),
        parse_test_case("data/math.txt"),
        parse_test_case("data/measure.txt"),
        parse_test_case("data/money.txt"),
        parse_test_case("data/sport.txt"),
        parse_test_case("data/time.txt"),
        parse_test_case("data/whitelist.txt"),
    )

    # The first field of each case is the input to `normalize` (the written
    # form) and the second is the expected output (the spoken form); the
    # parameter names now say so instead of being swapped.
    @pytest.mark.parametrize("written, spoken", normalizer_cases)
    def test_normalizer(self, written, spoken):
        """Normalizing the written form must yield the expected spoken form."""
        assert self.normalizer.normalize(written) == spoken
43 |
--------------------------------------------------------------------------------
/itn/chinese/rules/license_plate.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2023 Xingchen Song (sxc19@mails.tsinghua.edu.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from pynini import string_file
16 | from pynini.lib.pynutil import insert
17 |
18 | from tn.processor import Processor
19 | from tn.utils import get_abs_path
20 |
21 |
class LicensePlate(Processor):
    """Tagger for plates: province, one letter, then 5 or 6 letters/digits."""

    def __init__(self):
        super().__init__(name="licenseplate")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build ``self.tagger`` from the province, digit and zero tables."""
        zero = string_file(get_abs_path("../itn/chinese/data/number/zero.tsv"))  # 0
        digit = string_file(get_abs_path("../itn/chinese/data/number/digit.tsv"))  # 1 ~ 9
        province = string_file(get_abs_path("../itn/chinese/data/license_plate/province.tsv"))  # 皖

        head = province + self.ALPHA
        symbol = self.ALPHA | (zero | digit)
        # Both lengths share the same head; only the tail is 5 or 6 symbols long.
        license_plate = (head + symbol**5) | (head + symbol**6)
        self.tagger = self.add_tokens(insert('value: "') + license_plate + insert('"'))
38 |
--------------------------------------------------------------------------------
/runtime/patch/openfst/src/extensions/special/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | file(GLOB HEADER_FILES ../../include/fst/extensions/special/*.h)
2 | message(STATUS "${HEADER_FILES}")
3 |
4 | if(HAVE_BIN)
5 | add_executable(fstspecial-bin
6 | ../../bin/fstconvert.cc
7 | ../../bin/fstconvert-main.cc
8 | phi-fst.cc
9 | rho-fst.cc
10 | sigma-fst.cc
11 | )
12 |
13 | set_target_properties(fstspecial-bin PROPERTIES
14 | FOLDER special/bin
15 | OUTPUT_NAME fstspecial
16 | )
17 |
18 | target_link_libraries(fstspecial-bin
19 | fstscript
20 | fst
21 | ${CMAKE_DL_LIBS}
22 | )
23 | endif(HAVE_BIN)
24 |
25 |
26 | add_library(fstspecial
27 | phi-fst.cc
28 | rho-fst.cc
29 | sigma-fst.cc
30 | ${HEADER_FILES}
31 | )
32 |
33 | set_target_properties(fstspecial PROPERTIES
34 | SOVERSION "${SOVERSION}"
35 | FOLDER special
36 | )
37 | target_link_libraries(fstspecial
38 | fst
39 | )
40 |
41 | set(FST_SPECIAL_INSTALL_TARGETS fstspecial)
42 | if(HAVE_BIN)
43 | list(APPEND FST_SPECIAL_INSTALL_TARGETS fstspecial-bin)
44 | endif()
45 |
46 | install(TARGETS ${FST_SPECIAL_INSTALL_TARGETS}
47 | LIBRARY DESTINATION lib
48 | RUNTIME DESTINATION bin
49 | ARCHIVE DESTINATION lib
50 | )
51 |
# add_module(<name> <add_library args...>)
# Creates a library by forwarding every argument to add_library(), links it
# against `fst`, files it under the special/modules folder and installs it to
# lib/fst.
function (add_module _name)
  # ARGV holds all arguments, including the target name itself.
  add_library(${ARGV})
  if (TARGET ${_name})
    target_link_libraries(${_name} fst)
    set_target_properties(${_name}
      PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true
      FOLDER special/modules
    )
  endif()

  # Only a LIBRARY destination is given for the installed target.
  install(TARGETS ${_name} LIBRARY DESTINATION lib/fst)
endfunction()
64 |
65 | add_module(phi-fst MODULE phi-fst.cc)
66 | add_module(rho-fst MODULE rho-fst.cc)
67 | add_module(sigma-fst MODULE sigma-fst.cc)
68 |
--------------------------------------------------------------------------------
/.github/workflows/wheels.yml:
--------------------------------------------------------------------------------
# Manually triggered release workflow: builds the FST graphs, uploads them as
# an artifact, then builds and publishes the package to PyPI.
name: Build Wheels

on:
  workflow_dispatch:
    inputs:
      version:
        description: 'Release version'
        required: true

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # Quoted so YAML keeps the version as a string (an unquoted 3.10
        # would be read as the float 3.1).
        python-version: ["3.9"]
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
          pip install wheel

      - name: Build Graph
        run: |
          python -m tn --text "2.5平方电线" --overwrite_cache --language "zh"
          python -m tn --text "2010-03-21" --overwrite_cache --language "en"
          python -m itn --text "二点五平方电线" --overwrite_cache

      - name: Prepare Graph
        run: |
          mkdir graph
          cp tn/*.fst graph
          cp itn/*.fst graph

      - name: Upload Graph
        # v3 of the artifact actions no longer runs on github.com; v4 accepts
        # the same `name` / `path` inputs used here.
        uses: actions/upload-artifact@v4
        with:
          name: release-graph-v${{ github.event.inputs.version}}
          path: graph

      - name: Publish on pypi.org
        env:
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
        run: |
          python setup.py sdist bdist_wheel --version=${{ github.event.inputs.version}}
          python -m pip install -U twine
          python -m twine upload --repository-url https://upload.pypi.org/legacy/ dist/*
54 |
--------------------------------------------------------------------------------
/runtime/test/string_test.cc:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #include "gmock/gmock.h"
16 |
17 | #include "utils/wetext_string.h"
18 |
// Fixture type for the string utility tests.
// NOTE(review): the cases in this file are declared with TEST rather than
// TEST_F, so this fixture looks unused -- confirm before relying on it.
class StringTest : public testing::Test {};
20 |
21 | TEST(StringTest, StringLengthTest) {
22 | EXPECT_EQ(wetext::UTF8StringLength("A"), 1);
23 | EXPECT_EQ(wetext::UTF8StringLength("À"), 1);
24 | EXPECT_EQ(wetext::UTF8StringLength("啊"), 1);
25 | EXPECT_EQ(wetext::UTF8StringLength("✐"), 1);
26 | EXPECT_EQ(wetext::UTF8StringLength("你好"), 2);
27 | EXPECT_EQ(wetext::UTF8StringLength("world"), 5);
28 | }
29 |
30 | TEST(StringTest, SplitUTF8StringToCharsTest) {
31 | std::vector chars;
32 | wetext::SplitUTF8StringToChars("你好world", &chars);
33 | ASSERT_THAT(chars, testing::ElementsAre("你", "好", "w", "o", "r", "l", "d"));
34 | }
35 |
36 | TEST(StringTest, TrimTest) {
37 | ASSERT_EQ(wetext::Trim("\thello "), "hello");
38 | ASSERT_EQ(wetext::Trim(" hello\t"), "hello");
39 | }
40 |
41 | TEST(StringTest, SplitTest) {
42 | std::vector output;
43 | wetext::Split("written => spoken", " => ", &output);
44 | ASSERT_THAT(output, testing::ElementsAre("written", "spoken"));
45 | }
46 |
--------------------------------------------------------------------------------
/tn/japanese/rules/fraction.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Logan Liu (2319277867@qq.com)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from pynini.lib.pynutil import delete, insert
16 |
17 | from tn.japanese.rules.cardinal import Cardinal
18 | from tn.processor import Processor
19 |
20 |
class Fraction(Processor):
    """Grammar for written fractions of the form ``<number>/<number>``.

    The tagger wraps the two ``Cardinal().number`` operands in ``numerator``
    and ``denominator`` fields; the verbalizer joins them with "分の".
    """

    def __init__(self):
        super().__init__(name="fraction")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build the tagger FST and store it on ``self.tagger``."""
        cardinal = Cardinal().number
        optional_space = delete(" ").ques
        # The slash is removed together with at most one space on either side.
        slash = optional_space + delete("/") + optional_space

        numerator_field = insert('numerator: "') + cardinal
        denominator_field = insert('" denominator: "') + cardinal + insert('"')

        fraction = (numerator_field + slash + denominator_field).optimize()
        self.tagger = self.add_tokens(fraction)

    def build_verbalizer(self):
        """Build the verbalizer FST and store it on ``self.verbalizer``."""
        # NOTE(review): the tagger emits numerator before denominator while
        # this grammar reads the denominator first; presumably the fields are
        # reordered before verbalization -- confirm against the token parser.
        denominator_field = delete('denominator: "') + self.SIGMA + delete('" ')
        numerator_field = delete('numerator: "') + self.SIGMA + delete('"')
        spoken = denominator_field + insert("分の") + numerator_field
        self.verbalizer = self.delete_tokens(spoken)
49 |
--------------------------------------------------------------------------------
/tn/chinese/rules/fraction.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from pynini.lib.pynutil import delete, insert
16 |
17 | from tn.chinese.rules.cardinal import Cardinal
18 | from tn.processor import Processor
19 |
20 |
class Fraction(Processor):
    """Grammar for written fractions of the form ``<number>/<number>``.

    The tagger wraps the two ``Cardinal().number`` operands in ``numerator``
    and ``denominator`` fields; the verbalizer joins them with "分之".
    """

    def __init__(self):
        super().__init__(name="fraction")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build the tagger FST and store it on ``self.tagger``."""
        cardinal = Cardinal().number
        optional_space = delete(" ").ques
        # The slash is removed together with at most one space on either side.
        slash = optional_space + delete("/") + optional_space

        numerator_field = insert('numerator: "') + cardinal
        denominator_field = insert('" denominator: "') + cardinal + insert('"')

        fraction = (numerator_field + slash + denominator_field).optimize()
        self.tagger = self.add_tokens(fraction)

    def build_verbalizer(self):
        """Build the verbalizer FST and store it on ``self.verbalizer``."""
        # NOTE(review): the tagger emits numerator before denominator while
        # this grammar reads the denominator first; presumably the fields are
        # reordered before verbalization -- confirm against the token parser.
        denominator_field = delete('denominator: "') + self.SIGMA + delete('" ')
        numerator_field = delete('numerator: "') + self.SIGMA + delete('"')
        spoken = denominator_field + insert("分之") + numerator_field
        self.verbalizer = self.delete_tokens(spoken)
49 |
--------------------------------------------------------------------------------
/itn/japanese/data/measure/unit_en.tsv:
--------------------------------------------------------------------------------
1 | 華氏 f
2 | 摂氏 c
3 | キロメートル km
4 | 千キロメートル km
5 | メートル m
6 | センチ cm
7 | インチ インチ
8 | リットル L
9 | ジュール J
10 | ワット W
11 | アンペア A
12 | ボルト V
13 | オーム Ω
14 | アンペア毎メートル A/m
15 | ビット bit
16 | バイト Byte
17 | メガバイト MB
18 | キロバイト KB
19 | ギガバイト GB
20 | 度 ℃
21 | 立方センチメートル cm³
22 | ドット dpi
23 | ケルビン K
24 | センチメートル cm
25 | ミリメートル mm
26 | ヘクタール ha
27 | マイル mi
28 | 平方メートル m²
29 | 平方キロメートル km²
30 | 足 ft
31 | パーセント %
32 | ヘルツ hz
33 | キロワット kw
34 | 馬力 hp
35 | ミリグラム mg
36 | キログラム kg
37 | キロ kg
38 | ギガヘルツ ghz
39 | キロヘルツ khz
40 | メガヘルツ mhz
41 | ボルト v
42 | メガクーロン mc
43 | ナノメートル nm
44 | 毎分回転数 rpm
45 | ミリアンペア mA
46 | パーセント %
47 | キロワット時 kwh
48 | 立方メートル m³
49 | 時速マイル mph
50 | テラワット tw
51 | ミリボルト mv
52 | メガワット mw
53 | マイクロメータ μm
54 | テラバイト TB
55 | グラム g
56 | ダルトン da
57 | 雰囲気 atm
58 | オーム ω
59 | デシベル dB
60 | ペタ秒 ps
61 | オンス oz
62 | ヘクトリットル hl
63 | マイクログラム μg
64 | ペタグラム pg
65 | ギガバイト gb
66 | キロビット kb
67 | 電子ボルト ev
68 | メガバイト mb
69 | キロバイト kb
70 | キロビット/秒 kbps
71 | 毎秒メガビット mbps
72 | 結石 st
73 | キロリットル kl
74 | テラジュール tj
75 | キロボルト kv
76 | メガボルト mv
77 | キロニュートン kn
78 | メガメーター mm
79 | 天文単位 au
80 | ヤード yd
81 | ラジアン rad
82 | ルーメン lm
83 | ヘクト秒 hs
84 | モル mol
85 | ギガパスカル gpa
86 | ミリリットル ml
87 | ギガワット gw
88 | メガアンペア ma
89 | 結び目 kt
90 | キログラム力 kgf
91 | ナノグラム ng
92 | ナノ秒 ns
93 | メガシーメンス ms
94 | バー bar
95 | ギガリットル gl
96 | マイクロ秒 μs
97 | デシアンペア da
98 | パスカル pa
99 | デシ秒 ds
100 | ミリ秒 ms
101 | デシメートル dm
102 | 立方デシメートル dm³
103 | 原子質量単位 amu
104 | メガビット mb
105 | メガファラッド mf
106 | ベクレル bq
107 | ペタビット pb
108 | 平方ミリメートル mm²
109 | 平方センチメートル cm²
110 | 平方マイル sq mi
111 | 平方フィート sq ft
112 | キロパスカル kpa
113 | カンデラ cd
114 | テラリットル tl
115 | メガ秒 ms
116 | メガパスカル mpa
117 | ペタメーター pm
118 | ペタバイト pb
119 | ギガワットアワー gwh
120 | キロカロリー kcal
121 | グレー gy
122 | シーベルト sv
123 | ハンドレッド cwt
--------------------------------------------------------------------------------
/itn/chinese/data/money/code.tsv:
--------------------------------------------------------------------------------
1 | 澳元 A$
2 | 阿联酋迪拉姆 AED
3 | 阿富汗 阿富汗尼 AFN
4 | 阿尔巴尼亚列克 ALL
5 | 荷属安的列斯盾 ANG
6 | 阿根廷比索 ARS
7 | 澳元 AUD
8 | 阿鲁巴盾 AWG
9 | 阿塞拜疆马纳特 AZN
10 | 波斯尼亚和黑塞哥维那可兑换马克 BAM
11 | 巴巴多斯元 BBD
12 | 保加利亚列弗 BGN
13 | 百慕大元 BMD
14 | 文莱达鲁萨兰国元 BND
15 | 玻利维亚玻利维亚诺 BOB
16 | 巴西雷亚尔 BRL
17 | 巴哈马元 BSD
18 | 博茨瓦纳普拉 BWP
19 | 白俄罗斯卢布 BYN
20 | 伯利兹元 BZD
21 | 加元 CAD$
22 | 加元 CAD
23 | 瑞士法郎 CHF
24 | 智利比索 CLP
25 | 人民币 CNY
26 | 哥伦比亚比索 COP
27 | 哥斯达黎加科隆 CRC
28 | 古巴比索 CUP
29 | 捷克克朗 CZK
30 | 丹麦克朗 DKK
31 | 多米尼加共和国比索 DOP
32 | 埃及镑 EGP
33 | 欧元成员国 EUR
34 | 斐济元 FJD
35 | 福克兰群岛(马尔维纳斯)镑 FKP
36 | 英镑 GBP
37 | 根西岛镑 GGP
38 | 加纳塞地 GHS
39 | 直布罗陀镑 GIP
40 | 危地马拉格查尔 GTQ
41 | 圭亚那元 GYD
42 | 港元 HK$
43 | 港元 HKD
44 | 洪都拉斯伦皮拉 HNL
45 | 克罗地亚库纳 HRK
46 | 匈牙利福林 HUF
47 | 印尼盾 IDR
48 | 以色列谢克尔 ILS
49 | 马恩岛英镑 IMP
50 | 印度卢比 INR
51 | 伊朗里亚尔 IRR
52 | 冰岛克朗 ISK
53 | 日元 J¥
54 | 泽西镑 JEP
55 | 牙买加元 JMD
56 | 日元 JPY¥
57 | 日元 JPY
58 | 吉尔吉斯斯坦索姆 KGS
59 | 柬埔寨瑞尔 KHR
60 | 朝鲜园 KPW
61 | 韩元 KRW
62 | 韩元 KRW
63 | 开曼群岛元 KYD
64 | 哈萨克斯坦腾格 KZT
65 | 老挝基普 LAK
66 | 黎巴嫩镑 LBP
67 | 斯里兰卡卢比 LKR
68 | 利比里亚元 LRD
69 | 马其顿代纳尔 MKD
70 | 摩洛哥迪拉姆 MAD
71 | 蒙古图格里克 MNT
72 | 毛里求斯卢比 MUR
73 | 墨西哥比索 MXN
74 | 马来西亚令吉 MYR
75 | 莫桑比克梅蒂卡尔 MZN
76 | 纳米比亚元 NAD
77 | 尼日利亚奈拉 NGN
78 | 尼加拉瓜科尔多瓦 NIO
79 | 挪威克朗 NOK
80 | 尼泊尔卢比 NPR
81 | 新西兰元 NZD
82 | 阿曼里亚尔 OMR
83 | 巴拿马巴尔博亚 PAB
84 | 秘鲁索尔 PEN
85 | 菲律宾比索 PHP
86 | 巴基斯坦卢比 PKR
87 | 波兰兹罗提 PLN
88 | 巴拉圭瓜拉尼 PYG
89 | 卡塔尔里亚尔 QAR
90 | 罗马尼亚列伊 RON
91 | 塞尔维亚第纳尔 RSD
92 | 俄罗斯卢布 RUB
93 | 沙特阿拉伯里亚尔 SAR
94 | 所罗门群岛元 SBD
95 | 塞舌尔卢比 SCR
96 | 瑞典克朗 SEK
97 | 新加坡元 SGD
98 | 圣赫勒拿镑 SHP
99 | 索马里先令 SOS
100 | 苏里南元 SRD
101 | 萨尔瓦多科隆 SVC
102 | 叙利亚镑 SYP
103 | 泰铢 THB
104 | 土耳其里拉 TRY
105 | 特立尼达和多巴哥元 TTD
106 | 图瓦卢元 TVD
107 | 新台币 TWD
108 | 乌克兰格里夫纳 UAH
109 | 美元 USD
110 | 乌拉圭比索 UYU
111 | 乌兹别克斯坦索姆 UZS
112 | 委内瑞拉玻利瓦尔 VEF
113 | 越南东 VND
114 | 东加勒比元 XCD
115 | 也门里亚尔 YER
116 | 南非兰特 ZAR
117 | 津巴布韦元 ZWD
118 |
--------------------------------------------------------------------------------
/runtime/android/app/src/main/res/drawable-v24/ic_launcher_foreground.xml:
--------------------------------------------------------------------------------
1 |
7 |
8 |
9 |
15 |
18 |
21 |
22 |
23 |
24 |
30 |
--------------------------------------------------------------------------------
/runtime/processor/wetext_processor.h:
--------------------------------------------------------------------------------
1 | // Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #ifndef PROCESSOR_WETEXT_PROCESSOR_H_
16 | #define PROCESSOR_WETEXT_PROCESSOR_H_
17 |
18 | #include
19 | #include
20 |
21 | #include "fst/fstlib.h"
22 |
23 | #include "processor/wetext_token_parser.h"
24 |
25 | using fst::StdArc;
26 | using fst::StdVectorFst;
27 | using fst::StringCompiler;
28 | using fst::StringPrinter;
29 |
30 | namespace wetext {
31 | class Processor {
32 | public:
33 | Processor(const std::string& tagger_path, const std::string& verbalizer_path);
34 | std::string Tag(const std::string& input);
35 | std::string Verbalize(const std::string& input);
36 | std::string Normalize(const std::string& input);
37 |
38 | private:
39 | std::string ShortestPath(const StdVectorFst& lattice);
40 | std::string Compose(const std::string& input, const StdVectorFst* fst);
41 |
42 | ParseType parse_type_;
43 | std::shared_ptr tagger_ = nullptr;
44 | std::shared_ptr verbalizer_ = nullptr;
45 | std::shared_ptr> compiler_ = nullptr;
46 | std::shared_ptr> printer_ = nullptr;
47 | };
48 |
49 | } // namespace wetext
50 |
51 | #endif // PROCESSOR_WETEXT_PROCESSOR_H_
52 |
--------------------------------------------------------------------------------
/tn/chinese/rules/whitelist.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from pynini import accep, string_file
16 | from pynini.lib.pynutil import add_weight, delete, insert
17 |
18 | from tn.processor import Processor
19 | from tn.utils import get_abs_path
20 |
21 |
class Whitelist(Processor):
    """Grammar for whitelist entries and the standalone erhua character "儿".

    Args:
        remove_erhua: When true the verbalizer deletes a tagged "儿";
            otherwise it keeps the character.
    """

    def __init__(self, remove_erhua=True):
        super().__init__(name="whitelist")
        self.remove_erhua = remove_erhua
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build the tagger FST and store it on ``self.tagger``."""
        default_entries = string_file(get_abs_path("chinese/data/default/whitelist.tsv"))
        erhua_entries = string_file(get_abs_path("chinese/data/erhua/whitelist.tsv"))
        value_field = insert('value: "') + (default_entries | erhua_entries)

        # A lone "儿" is tagged as erhua, with a weight of 0.1 on that path.
        erhua_field = add_weight(insert('erhua: "') + accep("儿"), 0.1)

        self.tagger = self.add_tokens((erhua_field | value_field) + insert('"'))

    def build_verbalizer(self):
        """Extend the inherited verbalizer with a rule for the erhua field."""
        super().build_verbalizer()
        if self.remove_erhua:
            # Drop the whole field, character included.
            erhua_rule = delete('erhua: "儿"')
        else:
            # Strip only the field wrapper and keep "儿" in the output.
            erhua_rule = delete('erhua: "') + accep("儿") + delete('"')
        self.verbalizer |= self.delete_tokens(erhua_rule)
46 |
--------------------------------------------------------------------------------
/itn/japanese/rules/money.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Logan Liu (2319277867@qq.com)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from pynini import string_file
16 | from pynini.lib.pynutil import delete, insert
17 |
18 | from itn.japanese.rules.cardinal import Cardinal
19 | from tn.processor import Processor
20 | from tn.utils import get_abs_path
21 |
22 |
class Money(Processor):
    """Inverse-normalization grammar for amounts followed by a currency word.

    Args:
        enable_0_to_9: When true the amount uses ``Cardinal.number``;
            otherwise ``Cardinal.number_exclude_0_to_9``.
    """

    def __init__(self, enable_0_to_9=True):
        super().__init__(name="money")
        self.enable_0_to_9 = enable_0_to_9
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build the tagger FST and store it on ``self.tagger``."""
        symbol = string_file(get_abs_path("../itn/japanese/data/money/symbol.tsv"))

        # Build the cardinal grammar once and reuse it for both the integer
        # and the decimal branch instead of constructing it twice.
        cardinal = Cardinal()
        number = cardinal.number if self.enable_0_to_9 else cardinal.number_exclude_0_to_9
        decimal = cardinal.decimal
        # 三千三百八十点五八円 => ¥3380.58
        tagger = insert('value: "') + (number | decimal) + insert('"') + insert(' currency: "') + symbol + insert('"')
        self.tagger = self.add_tokens(tagger)

    def build_verbalizer(self):
        """Build the verbalizer FST and store it on ``self.verbalizer``."""
        # NOTE(review): the tagger emits value before currency while this
        # grammar reads the currency first; presumably the fields are
        # reordered before verbalization -- confirm against the token parser.
        currency = delete('currency: "') + self.SIGMA + delete('"')
        value = delete(' value: "') + self.SIGMA + delete('"')
        verbalizer = currency + value
        self.verbalizer = self.delete_tokens(verbalizer)
45 |
--------------------------------------------------------------------------------
/tn/japanese/rules/money.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2024 Logan Liu (2319277867@qq.com)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from pynini import string_file
16 | from pynini.lib.pynutil import delete, insert
17 |
18 | from tn.japanese.rules.cardinal import Cardinal
19 | from tn.processor import Processor
20 | from tn.utils import get_abs_path
21 |
22 |
class Money(Processor):
    """Grammar for a currency code or symbol followed by an amount.

    The tagger emits a ``currency`` field and a ``value`` field; the
    verbalizer outputs the value followed by the currency.
    """

    def __init__(self):
        super().__init__(name="money")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build the tagger FST and store it on ``self.tagger``."""
        currency_code = string_file(get_abs_path("japanese/data/money/code.tsv"))
        currency_symbol = string_file(get_abs_path("japanese/data/money/symbol.tsv"))

        # At most one space between the currency and the amount is dropped.
        currency_field = (
            insert('currency: "')
            + (currency_code | currency_symbol)
            + delete(" ").ques
            + insert('" ')
        )
        value_field = insert('value: "') + Cardinal().number + insert('"')

        self.tagger = self.add_tokens(currency_field + value_field)

    def build_verbalizer(self):
        """Build the verbalizer FST and store it on ``self.verbalizer``."""
        # NOTE(review): the tagger emits currency before value while this
        # grammar reads the value first; presumably the fields are reordered
        # before verbalization -- confirm against the token parser.
        value_field = delete('value: "') + self.SIGMA + delete('" ')
        currency_field = delete('currency: "') + self.SIGMA + delete('"')
        self.verbalizer = self.delete_tokens(value_field + currency_field)
51 |
--------------------------------------------------------------------------------
/tn/chinese/rules/money.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022 Zhendong Peng (pzd17@tsinghua.org.cn)
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from pynini import string_file
16 | from pynini.lib.pynutil import delete, insert
17 |
18 | from tn.chinese.rules.cardinal import Cardinal
19 | from tn.processor import Processor
20 | from tn.utils import get_abs_path
21 |
22 |
class Money(Processor):
    """Grammar for a currency code or symbol followed by an amount.

    The tagger emits a ``currency`` field and a ``value`` field; the
    verbalizer outputs the value followed by the currency.
    """

    def __init__(self):
        super().__init__(name="money")
        self.build_tagger()
        self.build_verbalizer()

    def build_tagger(self):
        """Build the tagger FST and store it on ``self.tagger``."""
        currency_code = string_file(get_abs_path("chinese/data/money/code.tsv"))
        currency_symbol = string_file(get_abs_path("chinese/data/money/symbol.tsv"))

        # At most one space between the currency and the amount is dropped.
        currency_field = (
            insert('currency: "')
            + (currency_code | currency_symbol)
            + delete(" ").ques
            + insert('" ')
        )
        value_field = insert('value: "') + Cardinal().number + insert('"')

        self.tagger = self.add_tokens(currency_field + value_field)

    def build_verbalizer(self):
        """Build the verbalizer FST and store it on ``self.verbalizer``."""
        # NOTE(review): the tagger emits currency before value while this
        # grammar reads the value first; presumably the fields are reordered
        # before verbalization -- confirm against the token parser.
        value_field = delete('value: "') + self.SIGMA + delete('" ')
        currency_field = delete('currency: "') + self.SIGMA + delete('"')
        self.verbalizer = self.delete_tokens(value_field + currency_field)
51 |
--------------------------------------------------------------------------------