├── CN ├── pinyin_chart.csv ├── pinyin_to_phone.txt ├── run.sh └── word_to_pinyin.txt ├── EN ├── ARPA2IPA.map ├── cmudict-0.7b.txt └── run.sh ├── README.md ├── run.sh └── utils ├── DaCiDian.py ├── convert_pinyin_chart_to_mapping.py ├── dict_to_phoneset.py └── map_arpa_to_ipa.py /CN/pinyin_chart.csv: -------------------------------------------------------------------------------- 1 | Initial-/-Final,-a,-ai,-ao,-an,-ang,-e,-ei,-en,-eng,-er,-o,-ou,-ong,-i,-i*,-ia,-iao,-ie,-iu,-ian,-iang,-in,-ing,-iong,-u,-ua,-uai,-ui,-uo,-uan,-uang,-un,-ü,-üe,-üan,-ün 2 | ∅-,a [a],ai [ai],ao [au],an [an],ang [aŋ],e [ə],ei [ei],en [ən],eng [əŋ],er [ər],o [ɔ],ou [əu],,,,,,,,,,,,,,,,,,,,,,,, 3 | b-,ba [b a],bai [b ai],bao [b au],ban [b an],bang [b aŋ],,bei [b ei],ben [b ən],beng [b əŋ],,bo [b uɔ],,,bi [b ii],,,biao [b i au],bie [b ie],,bian [b i an],,bin [b in],bing [b iŋ],,bu [b uu],,,,,,,,,,, 4 | c-,ca [ts a],cai [ts ai],cao [ts au],can [ts an],cang [ts aŋ],ce [ts ə],cei [ts ei],cen [ts ən],ceng [ts əŋ],,,cou [ts əu],cong [ts uŋ],,ci [ts iii],,,,,,,,,,cu [ts uu],,,cui [ts u ei],cuo [ts uɔ],cuan [ts u an],,cun [ts u ən],,,, 5 | ch-,cha [ch a],chai [ch ai],chao [ch au],chan [ch an],chang [ch aŋ],che [ch ə],,chen [ch ən],cheng [ch əŋ],,,chou [ch əu],chong [ch uŋ],,chi [ch iii],,,,,,,,,,chu [ch uu],chua [ch u a],chuai [ch u ai],chui [ch u ei],chuo [ch uɔ],chuan [ch u an],chuang [ch u aŋ],chun [ch u ən],,,, 6 | d-,da [d a],dai [d ai],dao [d au],dan [d an],dang [d aŋ],de [d ə],dei [d ei],den [d ən],deng [d əŋ],,,dou [d əu],dong [d uŋ],di [d ii],,dia [d i a],diao [d i au],die [d ie],diu [d i əu],dian [d i an],,,ding [d iŋ],,du [d uu],,,dui [d u ei],duo [d uɔ],duan [d u an],,dun [d u ən],,,, 7 | f-,fa [f a],,,fan [f an],fang [f aŋ],,fei [f ei],fen [f ən],feng [f əŋ],,fo [f uɔ],fou [f əu],,,,,,,,,,,,,fu [f uu],,,,,,,,,,, 8 | g-,ga [g a],gai [g ai],gao [g au],gan [g an],gang [g aŋ],ge [g ə],gei [g ei],gen [g ən],geng [g əŋ],,,gou [g əu],gong [g uŋ],,,,,,,,,,,,gu [g uu],gua [g u a],guai [g u 
ai],gui [g u ei],guo [g uɔ],guan [g u an],guang [g u aŋ],gun [g u ən],,,, 9 | h-,ha [h a],hai [h ai],hao [h au],han [h an],hang [h aŋ],he [h ə],hei [h ei],hen [h ən],heng [h əŋ],,,hou [h əu],hong [h uŋ],,,,,,,,,,,,hu [h uu],hua [h u a],huai [h u ai],hui [h u ei],huo [h uɔ],huan [h u an],huang [h u aŋ],hun [h u ən],,,, 10 | j-,,,,,,,,,,,,,,ji [j ii],,jia [j i a],jiao [j i au],jie [j ie],jiu [j i əu],jian [j i an],jiang [j i aŋ],jin [j in],jing [j iŋ],jiong [j i uŋ],,,,,,,,,ju [j yu],jue [j yue],juan [j yu an],jun [j yu n] 11 | k-,ka [k a],kai [k ai],kao [k au],kan [k an],kang [k aŋ],ke [k ə],kei [k ei],ken [k ən],keng [k əŋ],,,kou [k əu],kong [k uŋ],,,,,,kiu [k i əu],,,,,,ku [k uu],kua [k u a],kuai [k u ai],kui [k u ei],kuo [k uɔ],kuan [k u an],kuang [k u aŋ],kun [k u ən],,,, 12 | l-,la [l a],lai [l ai],lao [l au],lan [l an],lang [l aŋ],le [l ə],lei [l ei],,leng [l əŋ],,,lou [l əu],long [l uŋ],li [l ii],,lia [l i a],liao [l i au],lie [l ie],liu [l i əu],lian [l i an],liang [l i aŋ],lin [l in],ling [l iŋ],,lu [l uu],,,,luo [l uɔ],luan [l u an],,lun [l u ən],lv [l yu],lve [l yue],, 13 | m-,ma [m a],mai [m ai],mao [m au],man [m an],mang [m aŋ],me [m ə],mei [m ei],men [m ən],meng [m əŋ],,mo [m uɔ],mou [m əu],,mi [m ii],,,miao [m i au],mie [m ie],miu [m i əu],mian [m i an],,min [m in],ming [m iŋ],,mu [m uu],,,,,,,,,,, 14 | n-,na [n a],nai [n ai],nao [n au],nan [n an],nang [n aŋ],ne [n ə],nei [n ei],nen [n ən],neng [n əŋ],,,nou [n əu],nong [n uŋ],ni [n ii],,,niao [n i au],nie [n ie],niu [n i əu],nian [n i an],niang [n i aŋ],nin [n in],ning [n iŋ],,nu [n uu],,,,nuo [n uɔ],nuan [n u an],,,nv [n yu],nve [n yue],, 15 | p-,pa [p a],pai [p ai],pao [p au],pan [p an],pang [p aŋ],,pei [p ei],pen [p ən],peng [p əŋ],,po [p uɔ],pou [p əu],,pi [p ii],,,piao [p i au],pie [p ie],,pian [p i an],,pin [p in],ping [p iŋ],,pu [p uu],,,,,,,,,,, 16 | q-,,,,,,,,,,,,,,qi [q ii],,qia [q i a],qiao [q i au],qie [q ie],qiu [q i əu],qian [q i an],qiang [q i aŋ],qin [q in],qing [q iŋ],qiong [q i 
uŋ],,,,,,,,,qu [q yu],que [q yue],quan [q yu an],qun [q yu n] 17 | r-,,,rao [ʒ au],ran [ʒ an],rang [ʒ aŋ],re [ʒ ə],,ren [ʒ ən],reng [ʒ əŋ],,,rou [ʒ əu],rong [ʒ uŋ],,ri [ʒ iii],,,,,,,,,,ru [ʒ uu],,,rui [ʒ u ei],ruo [ʒ uɔ],ruan [ʒ u an],,run [ʒ u ən],,,, 18 | s-,sa [s a],sai [s ai],sao [s au],san [s an],sang [s aŋ],se [s ə],sei [s ei],sen [s ən],seng [s əŋ],,,sou [s əu],song [s uŋ],,si [s iii],,,,,,,,,,su [s uu],,,sui [s u ei],suo [s uɔ],suan [s u an],,sun [s u ən],,,, 19 | sh-,sha [sh a],shai [sh ai],shao [sh au],shan [sh an],shang [sh aŋ],she [sh ə],shei [sh ei],shen [sh ən],sheng [sh əŋ],,,shou [sh əu],,,shi [sh iii],,,,,,,,,,shu [sh uu],shua [sh u a],shuai [sh u ai],shui [sh u ei],shuo [sh uɔ],shuan [sh u an],shuang [sh u aŋ],shun [sh u ən],,,, 20 | t-,ta [t a],tai [t ai],tao [t au],tan [t an],tang [t aŋ],te [t ə],,,teng [t əŋ],,,tou [t əu],tong [t uŋ],ti [t ii],,,tiao [t i au],tie [t ie],,tian [t i an],,,ting [t iŋ],,tu [t uu],,,tui [t u ei],tuo [t uɔ],tuan [t u an],,tun [t u ən],,,, 21 | w-,wa [w a],wai [w ai],,wan [w an],wang [w aŋ],,wei [w ei],wen [w ən],weng [w əŋ],,wo [w uɔ],,,,,,,,,,,,,,wu [w uu],,,,,,,,,,, 22 | x-,,,,,,,,,,,,,,xi [x ii],,xia [x i a],xiao [x i au],xie [x ie],xiu [x i əu],xian [x i an],xiang [x i aŋ],xin [x in],xing [x iŋ],xiong [x i uŋ],,,,,,,,,xu [x yu],xue [x yue],xuan [x yu an],xun [x yu n] 23 | y-,ya [y a],,yao [y au],yan [y an],yang [y aŋ],ye [y ie],,,,,yo [y ɔ],you [y əu],yong [y uŋ],yi [y ii],,,,,,,,yin [y in],ying [y iŋ],,,,,,,,,,yu [yu],yue [yue],yuan [yu an],yun [yu n] 24 | z-,za [z a],zai [z ai],zao [z au],zan [z an],zang [z aŋ],ze [z ə],zei [z ei],zen [z ən],zeng [z əŋ],,,zou [z əu],zong [z uŋ],,zi [z iii],,,,,,,,,,zu [z uu],,,zui [z u ei],zuo [z uɔ],zuan [z u an],,zun [z u ən],,,, 25 | zh-,zha [zh a],zhai [zh ai],zhao [zh au],zhan [zh an],zhang [zh aŋ],zhe [zh ə],zhei [zh ei],zhen [zh ən],zheng [zh əŋ],,,zhou [zh əu],zhong [zh uŋ],,zhi [zh iii],,,,,,,,,,zhu [zh uu],zhua [zh u a],zhuai [zh u ai],zhui [zh u ei],zhuo [zh 
uɔ],zhuan [zh u an],zhuang [zh u aŋ],zhun [zh u ən],,,, 26 | -------------------------------------------------------------------------------- /CN/pinyin_to_phone.txt: -------------------------------------------------------------------------------- 1 | a a 2 | ai ai 3 | an an 4 | ang aŋ 5 | ao au 6 | ba b a 7 | bai b ai 8 | ban b an 9 | bang b aŋ 10 | bao b au 11 | bei b ei 12 | ben b ən 13 | beng b əŋ 14 | bi b ii 15 | bian b i an 16 | biao b i au 17 | bie b ie 18 | bin b in 19 | bing b iŋ 20 | bo b uɔ 21 | bu b uu 22 | ca ts a 23 | cai ts ai 24 | can ts an 25 | cang ts aŋ 26 | cao ts au 27 | ce ts ə 28 | cei ts ei 29 | cen ts ən 30 | ceng ts əŋ 31 | cha ch a 32 | chai ch ai 33 | chan ch an 34 | chang ch aŋ 35 | chao ch au 36 | che ch ə 37 | chen ch ən 38 | cheng ch əŋ 39 | chi ch iii 40 | chong ch uŋ 41 | chou ch əu 42 | chu ch uu 43 | chua ch u a 44 | chuai ch u ai 45 | chuan ch u an 46 | chuang ch u aŋ 47 | chui ch u ei 48 | chun ch u ən 49 | chuo ch uɔ 50 | ci ts iii 51 | cong ts uŋ 52 | cou ts əu 53 | cu ts uu 54 | cuan ts u an 55 | cui ts u ei 56 | cun ts u ən 57 | cuo ts uɔ 58 | da d a 59 | dai d ai 60 | dan d an 61 | dang d aŋ 62 | dao d au 63 | de d ə 64 | dei d ei 65 | den d ən 66 | deng d əŋ 67 | di d ii 68 | dia d i a 69 | dian d i an 70 | diao d i au 71 | die d ie 72 | ding d iŋ 73 | diu d i əu 74 | dong d uŋ 75 | dou d əu 76 | du d uu 77 | duan d u an 78 | dui d u ei 79 | dun d u ən 80 | duo d uɔ 81 | e ə 82 | ei ei 83 | en ən 84 | eng əŋ 85 | er ər 86 | fa f a 87 | fan f an 88 | fang f aŋ 89 | fei f ei 90 | fen f ən 91 | feng f əŋ 92 | fo f uɔ 93 | fou f əu 94 | fu f uu 95 | ga g a 96 | gai g ai 97 | gan g an 98 | gang g aŋ 99 | gao g au 100 | ge g ə 101 | gei g ei 102 | gen g ən 103 | geng g əŋ 104 | gong g uŋ 105 | gou g əu 106 | gu g uu 107 | gua g u a 108 | guai g u ai 109 | guan g u an 110 | guang g u aŋ 111 | gui g u ei 112 | gun g u ən 113 | guo g uɔ 114 | ha h a 115 | hai h ai 116 | han h an 117 | hang h aŋ 118 | hao h au 119 | he h ə 120 | 
hei h ei 121 | hen h ən 122 | heng h əŋ 123 | hong h uŋ 124 | hou h əu 125 | hu h uu 126 | hua h u a 127 | huai h u ai 128 | huan h u an 129 | huang h u aŋ 130 | hui h u ei 131 | hun h u ən 132 | huo h uɔ 133 | ji j ii 134 | jia j i a 135 | jian j i an 136 | jiang j i aŋ 137 | jiao j i au 138 | jie j ie 139 | jin j in 140 | jing j iŋ 141 | jiong j i uŋ 142 | jiu j i əu 143 | ju j yu 144 | juan j yu an 145 | jue j yue 146 | jun j yu n 147 | ka k a 148 | kai k ai 149 | kan k an 150 | kang k aŋ 151 | kao k au 152 | ke k ə 153 | kei k ei 154 | ken k ən 155 | keng k əŋ 156 | kiu k i əu 157 | kong k uŋ 158 | kou k əu 159 | ku k uu 160 | kua k u a 161 | kuai k u ai 162 | kuan k u an 163 | kuang k u aŋ 164 | kui k u ei 165 | kun k u ən 166 | kuo k uɔ 167 | la l a 168 | lai l ai 169 | lan l an 170 | lang l aŋ 171 | lao l au 172 | le l ə 173 | lei l ei 174 | leng l əŋ 175 | li l ii 176 | lia l i a 177 | lian l i an 178 | liang l i aŋ 179 | liao l i au 180 | lie l ie 181 | lin l in 182 | ling l iŋ 183 | liu l i əu 184 | long l uŋ 185 | lou l əu 186 | lu l uu 187 | luan l u an 188 | lun l u ən 189 | luo l uɔ 190 | lv l yu 191 | lve l yue 192 | ma m a 193 | mai m ai 194 | man m an 195 | mang m aŋ 196 | mao m au 197 | me m ə 198 | mei m ei 199 | men m ən 200 | meng m əŋ 201 | mi m ii 202 | mian m i an 203 | miao m i au 204 | mie m ie 205 | min m in 206 | ming m iŋ 207 | miu m i əu 208 | mo m uɔ 209 | mou m əu 210 | mu m uu 211 | na n a 212 | nai n ai 213 | nan n an 214 | nang n aŋ 215 | nao n au 216 | ne n ə 217 | nei n ei 218 | nen n ən 219 | neng n əŋ 220 | ni n ii 221 | nian n i an 222 | niang n i aŋ 223 | niao n i au 224 | nie n ie 225 | nin n in 226 | ning n iŋ 227 | niu n i əu 228 | nong n uŋ 229 | nou n əu 230 | nu n uu 231 | nuan n u an 232 | nuo n uɔ 233 | nv n yu 234 | nve n yue 235 | o ɔ 236 | ou əu 237 | pa p a 238 | pai p ai 239 | pan p an 240 | pang p aŋ 241 | pao p au 242 | pei p ei 243 | pen p ən 244 | peng p əŋ 245 | pi p ii 246 | pian p i an 247 | piao p i au 
248 | pie p ie 249 | pin p in 250 | ping p iŋ 251 | po p uɔ 252 | pou p əu 253 | pu p uu 254 | qi q ii 255 | qia q i a 256 | qian q i an 257 | qiang q i aŋ 258 | qiao q i au 259 | qie q ie 260 | qin q in 261 | qing q iŋ 262 | qiong q i uŋ 263 | qiu q i əu 264 | qu q yu 265 | quan q yu an 266 | que q yue 267 | qun q yu n 268 | ran ʒ an 269 | rang ʒ aŋ 270 | rao ʒ au 271 | re ʒ ə 272 | ren ʒ ən 273 | reng ʒ əŋ 274 | ri ʒ iii 275 | rong ʒ uŋ 276 | rou ʒ əu 277 | ru ʒ uu 278 | ruan ʒ u an 279 | rui ʒ u ei 280 | run ʒ u ən 281 | ruo ʒ uɔ 282 | sa s a 283 | sai s ai 284 | san s an 285 | sang s aŋ 286 | sao s au 287 | se s ə 288 | sei s ei 289 | sen s ən 290 | seng s əŋ 291 | sha sh a 292 | shai sh ai 293 | shan sh an 294 | shang sh aŋ 295 | shao sh au 296 | she sh ə 297 | shei sh ei 298 | shen sh ən 299 | sheng sh əŋ 300 | shi sh iii 301 | shou sh əu 302 | shu sh uu 303 | shua sh u a 304 | shuai sh u ai 305 | shuan sh u an 306 | shuang sh u aŋ 307 | shui sh u ei 308 | shun sh u ən 309 | shuo sh uɔ 310 | si s iii 311 | song s uŋ 312 | sou s əu 313 | su s uu 314 | suan s u an 315 | sui s u ei 316 | sun s u ən 317 | suo s uɔ 318 | ta t a 319 | tai t ai 320 | tan t an 321 | tang t aŋ 322 | tao t au 323 | te t ə 324 | teng t əŋ 325 | ti t ii 326 | tian t i an 327 | tiao t i au 328 | tie t ie 329 | ting t iŋ 330 | tong t uŋ 331 | tou t əu 332 | tu t uu 333 | tuan t u an 334 | tui t u ei 335 | tun t u ən 336 | tuo t uɔ 337 | wa w a 338 | wai w ai 339 | wan w an 340 | wang w aŋ 341 | wei w ei 342 | wen w ən 343 | weng w əŋ 344 | wo w uɔ 345 | wu w uu 346 | xi x ii 347 | xia x i a 348 | xian x i an 349 | xiang x i aŋ 350 | xiao x i au 351 | xie x ie 352 | xin x in 353 | xing x iŋ 354 | xiong x i uŋ 355 | xiu x i əu 356 | xu x yu 357 | xuan x yu an 358 | xue x yue 359 | xun x yu n 360 | ya y a 361 | yan y an 362 | yang y aŋ 363 | yao y au 364 | ye y ie 365 | yi y ii 366 | yin y in 367 | ying y iŋ 368 | yo y ɔ 369 | yong y uŋ 370 | you y əu 371 | yu yu 372 | yuan yu an 373 | yue 
yue 374 | yun yu n 375 | za z a 376 | zai z ai 377 | zan z an 378 | zang z aŋ 379 | zao z au 380 | ze z ə 381 | zei z ei 382 | zen z ən 383 | zeng z əŋ 384 | zha zh a 385 | zhai zh ai 386 | zhan zh an 387 | zhang zh aŋ 388 | zhao zh au 389 | zhe zh ə 390 | zhei zh ei 391 | zhen zh ən 392 | zheng zh əŋ 393 | zhi zh iii 394 | zhong zh uŋ 395 | zhou zh əu 396 | zhu zh uu 397 | zhua zh u a 398 | zhuai zh u ai 399 | zhuan zh u an 400 | zhuang zh u aŋ 401 | zhui zh u ei 402 | zhun zh u ən 403 | zhuo zh uɔ 404 | zi z iii 405 | zong z uŋ 406 | zou z əu 407 | zu z uu 408 | zuan z u an 409 | zui z u ei 410 | zun z u ən 411 | zuo z uɔ 412 | -------------------------------------------------------------------------------- /CN/run.sh: -------------------------------------------------------------------------------- 1 | python ../utils/convert_pinyin_chart_to_mapping.py pinyin_chart.csv pinyin_to_phone.txt 2 | python ../utils/DaCiDian.py word_to_pinyin.txt pinyin_to_phone.txt > CN.txt 3 | -------------------------------------------------------------------------------- /EN/ARPA2IPA.map: -------------------------------------------------------------------------------- 1 | AA0 a 2 | AA1 a 3 | AA2 a 4 | AE0 æ 5 | AE1 æ 6 | AE2 æ 7 | AH0 ə 8 | AH1 ʌ 9 | AH2 ʌ 10 | AO0 ɔ 11 | AO1 ɔ 12 | AO2 ɔ 13 | AW0 au 14 | AW1 au 15 | AW2 au 16 | AY0 ai 17 | AY1 ai 18 | AY2 ai 19 | B b 20 | CH ch 21 | D d 22 | DH ð 23 | EH0 e 24 | EH1 e 25 | EH2 e 26 | ER0 ə r 27 | ER1 ə r 28 | ER2 ə r 29 | EY0 ei 30 | EY1 ei 31 | EY2 ei 32 | F f 33 | G g 34 | HH h 35 | IH0 i 36 | IH1 i 37 | IH2 i 38 | IY0 ii 39 | IY1 ii 40 | IY2 ii 41 | JH zh 42 | K k 43 | L l 44 | M m 45 | N n 46 | NG ŋ 47 | OW0 əu 48 | OW1 əu 49 | OW2 əu 50 | OY0 ɔi 51 | OY1 ɔi 52 | OY2 ɔi 53 | P p 54 | R r 55 | S s 56 | SH sh 57 | T t 58 | TH θ 59 | UH0 u 60 | UH1 u 61 | UH2 u 62 | UW0 uu 63 | UW1 uu 64 | UW2 uu 65 | V v 66 | W w 67 | Y y 68 | Z z 69 | ZH ʒ 70 | -------------------------------------------------------------------------------- 
/EN/cmudict-0.7b.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechio/BigCiDian/5da8d5da138b772e21e99520da0bb27bfb873f1a/EN/cmudict-0.7b.txt -------------------------------------------------------------------------------- /EN/run.sh: -------------------------------------------------------------------------------- 1 | iconv -f ISO_8859-10 -t utf8 cmudict-0.7b.txt > tmp # convert raw format from IOS-8859 to UTF8 2 | #python ../utils/dict_to_phoneset.py tmp ARPA.list 3 | python ../utils/map_arpa_to_ipa.py ARPA2IPA.map tmp EN.txt 4 | rm tmp 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BigCiDian 2 | 3 | ## 1. Goal 4 | This project is an attempt to create a pronunciation lexicon covering both English and Chinese words *in a unified phoneset* for ASR applications. 5 | 6 | P.S. "CiDian" means "lexicon" in Chinese. 7 | 8 | typical use cases in Chinese ASR applications: 9 | ``` 10 | 你手机上都装了什么 APP ? 11 | APPLE 的新 MACBOOK PRO 真漂亮 12 | 上个月 PRADA 出了款新包包 13 | 手机开了 GPRS 导航 14 | 世界杯 H 组小组赛 15 | ``` 16 | 17 | ## 2. Phoneset 18 | The unified phoneset should be a simple and precise phoneset that covers both languages. Note that the mapping listed below are heavily based on IPA. 19 | 20 | ### 2.1 English Phoneset Mapping 21 | English entries are derived from CMUDict 0.7b, hence we need a mapping from ARPA phoneset to target phoneset. 
22 | 23 | |ARPA|IPA|CMUDict example entries| 24 | |-|-|-| 25 | |AA0 |a|icon:AY1 K AA0 N| 26 | |AA1 |a|heart: HH AA1 R T| 27 | |AA2 |a|kmart: K EY1 M AA2 R T| 28 | |AE0 |æ|romance: R OW1 M AE0 N S| 29 | |AE1 |æ|lambda: L AE1 M D AH0| 30 | |AE2 |æ|setback: S EH1 T B AE2 K| 31 | |AH0 |ə|station: S T EY1 SH AH0 N| 32 | |AH1 |ʌ|bug: B AH1 G| 33 | |AH2 |ʌ|haircut: HH EH1 R K AH2 T| 34 | |AO0 |ɔ|hongkong: HH AO1 NG K AO0 NG| 35 | |AO1 |ɔ|law: L AO1| 36 | |AO2 |ɔ|layoff: L EY1 AO2 F| 37 | |AW0 |au|foundation: F AW0 N D EY1 SH AH0 N| 38 | |AW1 |au|founder: F AW1 N D ER0| 39 | |AW2 |au|hometown: HH OW1 M T AW2 N| 40 | |AY0 |ai|hypothese: HH AY0 P AA1 TH AH0 S IY2 Z| 41 | |AY1 |ai|ice: AY1 S| 42 | |AY2 |ai|iceland: AY1 S L AH0 N D| 43 | |B |b|bike: B AY1 K| 44 | |CH |ch|chase: CH EY1 S| 45 | |D |d|desk: D EH1 S K| 46 | |DH |ð|those: DH OW1 Z| 47 | |EH0 |e|princess: P R IH1 N S EH0 S| 48 | |EH1 |e|professor: P R AH0 F EH1 S ER0| 49 | |EH2 |e|progress: P R AA1 G R EH2 S| 50 | |ER0 |ə r|programmer: P R OW1 G R AE2 M ER0| 51 | |ER1 |ə r|purge: P ER1 JH| 52 | |ER2 |ə r|showgirl: SH OW1 G ER2 L| 53 | |EY0 |ei|eighteen: EY0 T IY1 N| 54 | |EY1 |ei|email: IY0 M EY1 L| 55 | |EY2 |ei|thursday: TH ER1 Z D EY2| 56 | |F |f|face: F EY1 S| 57 | |G |g|give: G IH1 V| 58 | |HH |h|hey: HH EY1| 59 | |IH0 |i|facing: F EY1 S IH0 NG | 60 | |IH1 |i|fear: F IH1 R| 61 | |IH2 |i|fellowship: F EH1 L OW0 SH IH2 P| 62 | |IY0 |ii|email: IY0 M EY1 L| 63 | |IY1 |ii|prefix: P R IY1 F IH0 K S| 64 | |IY2 |ii|increase: IH1 N K R IY2 S| 65 | |JH |zh|gesture: JH EH1 S CH ER0| 66 | |K |k|cat: K AE1 T| 67 | |L |l|lack: L AE1 K| 68 | |M |m|may: M EY1| 69 | |N |n|no: N OW1| 70 | |NG |ŋ|thing: TH IH1 NG| 71 | |OW0 |əu|crypto: K R IH1 P T OW0| 72 | |OW1 |əu|token: T OW1 K AH0 N| 73 | |OW2 |əu|earphone: IH1 R F OW2 N| 74 | |OY0 |ɔi|invoice: IH1 N V OY0 S| 75 | |OY1 |ɔi|floyd: F L OY1 D| 76 | |OY2 |ɔi|episode: EH1 P IH0 S OW2 D| 77 | |P |p|pat: P AE1 T| 78 | |R |r|risk: R IH1 S K| 79 | |S |s|sing: S IH1 NG| 80 | |SH 
|sh|shake: SH EY1 K| 81 | |T |t|test: T EH1 S T| 82 | |TH |θ|think: TH IH1 NG K| 83 | |UH0 |u|fulfill: F UH0 L F IH1 L| 84 | |UH1 |u|full: F UH1 L| 85 | |UH2 |u|goodbye: G UH2 D B AY1| 86 | |UW0 |uu|rescue: R EH1 S K Y UW0| 87 | |UW1 |uu|fool: F UW1 L| 88 | |UW2 |uu|restroom: R EH1 S T R UW2 M| 89 | |V |v|very: V EH1 R IY0| 90 | |W |w|west: W EH1 S T| 91 | |Y |y|yes: Y EH1 S| 92 | |Z |z|zero: Z IY1 R OW0| 93 | |ZH |ʒ|illusion: IH2 L UW1 ZH AH0 N| 94 | 95 | *notes: If you find anything that doesn't make sense in the mapping table, please let me know, thanks* 96 | 97 | ### 2.2 Chinese PinYin Mapping 98 | Chinese entries are extracted from [DaCiDian project](https://github.com/aishell-foundation/DaCiDian) 99 | 100 | Here is a PinYin to IPA mapping from educational prospective: https://resources.allsetlearning.com/chinese/pronunciation/Pinyin_chart 101 | 102 | With a few mapping modifications and symbolic adaptations, here is the final [PinYin to target phoneset mapping](/CN/pinyin_chart.csv) 103 | 104 | ### 2.3 tone 105 | There are normally 5 tones in Chinese PinYin system ranging from 0 ~ 4. 106 | However there is no tone definition in English. In BigCiDian, Chinese tonal information is retained and merged with untoned English, so the resulting phoneset may contain 6 tonal variation(1 from English and 5 from Chinese): 107 | 108 | ``` 109 | e.g. for phoneme *ai* 110 | 111 | 1. HI -> h ai 112 | 2. 哎 -> ai_0 113 | 3. 掰 -> b ai_1 114 | 4. 还 -> h ai_2 115 | 5. 凯 -> k ai_3 116 | 6. 
外 -> w ai_4 117 | ``` 118 | 119 | ### 2.4 the unified phoneset 120 | The final unified bi-lingual phoneset details are listed below: 121 | 122 | |phoneme|CN example|EN example| 123 | |-|-|-| 124 | |a|把 b a_3| AACHEN a k ə n| 125 | |æ||CAT k æ t| 126 | |ai|爱 ai_4| KITE k ai t| 127 | |an|安 an_1|| 128 | |aŋ|羊 y aŋ_2|| 129 | |au|老 l au_3| LOUD l au d| 130 | |b|白 b ai_2| BUT b ʌ t| 131 | |ch|陈 ch ən_2| CHEST ch e s t| 132 | |d|大 d a_4| DAY d ei| 133 | |ð||THIS ð i s| 134 | |e||BED b e d| 135 | |ei|累 l ei_4| LAKE l ei k| 136 | |ə|鹅 ə_2| COCA-COLA k əu k ə k əu l a| 137 | |ən|陈 ch ən_2|| 138 | |əŋ|横 h əŋ_2|| 139 | |ər|二 ər_4|| 140 | |əu|欧 əu_1|BOAT b əu t| 141 | |f|房 f aŋ_2|FACE f ei s| 142 | |g|刚 g aŋ_1|GIVE g i v| 143 | |h|海 h ai_3|HUG h ʌ g| 144 | |i|天 t i an_1|HIT h i t| 145 | |ie|别 b ie_2|| 146 | |ii|比 b ii_3|BEAT b ii t| 147 | |iii|吃 ch iii_1|| 148 | |in|音 y in_1|| 149 | |iŋ|听 t iŋ_1|| 150 | |j|九 j i əu_3|| 151 | |k|看 k an_4|CAKE k ei k| 152 | |l|来 l ai_2|LAKE l ei k| 153 | |m|马 m a_3|MAKE m ei k| 154 | |n|那 n a_1|NIKE n ai k ii| 155 | |ŋ||INTERESTING i n t ə r e s t i ŋ| 156 | |ɔ||OFF ɔ f| 157 | |ɔi||JOY zh ɔi| 158 | |p|胖 p aŋ_4|PACE p ei s| 159 | |q|钱 q i an_2|| 160 | |r|让 ʒ aŋ_4|RISK r i s k| 161 | |s|丝 s iii_1|SING s i ŋ| 162 | |sh|上 sh aŋ_4|SHAKE sh ei k| 163 | |t|团 t u an_2|TIME t ai m| 164 | |ts|才 ts ai_2|| 165 | |u||BOOK b u k| 166 | |uŋ|从 ts uŋ_2|| 167 | |uɔ|桌 zh uɔ_1|| 168 | |uu|不 b uu_4|TWO t uu| 169 | |v||VICTORY v i k t ə r ii| 170 | |ʌ||CUT k ʌ t| 171 | |w|王 w aŋ_2|WEST w e s t| 172 | |x|西 x ii_1|| 173 | |y|言 y an_2|YES y e s| 174 | |yu|去 q yu_4|| 175 | |yue|缺 q yue_1|| 176 | |z|赞 z an_4|ZOO z uu| 177 | |zh|中 zh uŋ_1|GESTURE zh e s ch ə r| 178 | |ʒ|让 ʒ aŋ_4|LEISURE l e ʒ ə r| 179 | |θ||THINK θ i ŋ k| 180 | 181 | So overall there are 56 phonemes in the unified phoneset(regardless of tones). 182 | 183 | Theoretically some phonemes can be split with smaller granularity(eg. au->a u, ɔi->ɔ i, an->a n ...), hence making the phoneset even more compact. 
But it is a common practice that larger acoustic modeling units are beneficial for Chinese ASR accuracy, and the existence of decision-tree based state-tying makes the base phoneset size less relevant to the ASR problem. 184 | 185 | I may or may not change the unified phoneset in the future; currently it seems to be sufficient for my purpose. 186 | 187 | ## 3. Usage 188 | `sh run.sh` should give you a ready-to-use bi-lingual ASR lexicon (`lexicon.txt`), and a phoneset list (`phoneset.list`) in the project root directory. 189 | 190 | ## 4. Extend entries 191 | To extend the final lexicon with entries of your own interest (say "IPHONE", "华为P30"), you can either: 192 | * add those entries into the underlying sources (CMUDict and DaCiDian) 193 | 194 | or: 195 | * maintain a separate extension lexicon, and merge it with the main lexicon generated above. 196 | 197 | ## 5. Experiment result 198 | In the [AISHELL-2](https://github.com/kaldi-asr/kaldi/tree/master/egs/aishell2) Mandarin ASR task, the Chinese lexicon (DaCiDian) was replaced with the multilingual CN-EN lexicon (BigCiDian); details are shown below: 199 | 200 | For DaCiDian, system performance: 201 | ``` 202 | ----- test -----: 203 | %WER 44.39 [ 21986 / 49532, 338 ins, 2085 del, 19563 sub ] exp/mono/decode_test/cer_9_0.0 204 | %WER 24.25 [ 12011 / 49532, 393 ins, 792 del, 10826 sub ] exp/tri1/decode_test/cer_12_0.0 205 | %WER 22.13 [ 10963 / 49532, 396 ins, 644 del, 9923 sub ] exp/tri2/decode_test/cer_12_0.0 206 | %WER 19.29 [ 9555 / 49532, 263 ins, 640 del, 8652 sub ] exp/tri3/decode_test/cer_13_0.5 207 | %WER 8.33 [ 4125 / 49532, 84 ins, 192 del, 3849 sub ] exp/chain/tdnn_1a/decode_test/cer_8_0.5 208 | ``` 209 | 210 | For BigCiDian, system performance: 211 | ``` 212 | %WER 43.92 [ 21754 / 49532, 405 ins, 1574 del, 19775 sub ] exp/mono/decode_test/cer_7_0.0 213 | %WER 22.54 [ 11163 / 49532, 406 ins, 652 del, 10105 sub ] exp/tri1/decode_test/cer_11_0.0 214 | %WER 21.09 [ 10445 / 49532, 377 ins, 609 del, 9459 sub ] 
exp/tri2/decode_test/cer_12_0.0 215 | %WER 18.47 [ 9148 / 49532, 265 ins, 621 del, 8262 sub ] exp/tri3/decode_test/cer_13_0.5 216 | %WER 8.22 [ 4072 / 49532, 68 ins, 260 del, 3744 sub ] exp/chain/tdnn_1a/decode_test/cer_9_0.5 217 | ``` 218 | 219 | __Conclusion__ 220 | 221 | * It shows that BigCiDian only gives slightly better results than DaCiDian. 222 | * But more importantly, BigCiDian turns a pure Chinese ASR system to multiligual system, which is pretty much the case in nowadays Chinese ASR applications. 223 | 224 | THE END -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | cd EN/ 2 | sh run.sh 3 | cd - 4 | 5 | cd CN/ 6 | sh run.sh 7 | cd - 8 | 9 | #cat EN/EN.txt CN/CN.txt | sort -k1 -d > lexicon.txt 10 | #cat EN/EN.txt CN/CN.txt | sort -k1 > lexicon.txt 11 | cat EN/EN.txt CN/CN.txt | sort -u > lexicon.txt 12 | rm EN/EN.txt CN/CN.txt 13 | 14 | python utils/dict_to_phoneset.py lexicon.txt phoneset.list 15 | -------------------------------------------------------------------------------- /utils/DaCiDian.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This script processes DaCiDian 4 | # FROM: 5 | # layer-1 mapping: DaCiDian/word_to_pinyin.txt 6 | # layer-2 mapping: DaCiDian/pinyin_to_phone.txt 7 | # TO: 8 | # lexicon.txt 9 | 10 | import sys 11 | 12 | syllable_to_phones={} 13 | 14 | word_to_syllable_file = sys.argv[1] # layer-1 mapping 15 | syllable_to_phone_file = sys.argv[2] # layer-2 mapping 16 | 17 | for l in open(syllable_to_phone_file): # "ZHENG zh eng" 18 | cols = l.strip().split('\t') 19 | assert(len(cols) == 2) 20 | syllable = cols[0] 21 | phones = cols[1].split() 22 | syllable_to_phones[syllable] = phones 23 | 24 | for l in open(word_to_syllable_file): # "15 YI_1 WU_3;YAO_1 WU_3" 25 | cols = l.strip().split('\t') 26 | assert(len(cols) == 2) 27 | word = cols[0] 
28 | prons = cols[1].split(';') 29 | for pron in prons: 30 | phone_seq = [] 31 | for syllable in pron.split(): 32 | base,tone = syllable.split('_') 33 | phones = [phn for phn in syllable_to_phones[base]] 34 | phones[-1] = phones[-1]+'_'+tone 35 | phone_seq.extend(phones) 36 | sys.stdout.write(word + '\t' + ' '.join(phone_seq) + '\n') 37 | -------------------------------------------------------------------------------- /utils/convert_pinyin_chart_to_mapping.py: -------------------------------------------------------------------------------- 1 | # coding=utf8 2 | import codecs, sys 3 | 4 | fi = codecs.open(sys.argv[1], 'r', 'utf8') 5 | fo = codecs.open(sys.argv[2], 'w+', 'utf8') 6 | 7 | m = {} 8 | 9 | for l in fi: 10 | cols = l.strip().split(',') 11 | for col in cols: 12 | if '[' in col: 13 | pinyin = col.split('[')[0].strip() 14 | phones = col.split('[')[1].strip().strip(']') 15 | m[pinyin] = phones 16 | 17 | for k in sorted(m.keys()): 18 | fo.write(u'{}\t{}\n'.format(k, m[k])) 19 | 20 | fi.close() 21 | fo.close() 22 | -------------------------------------------------------------------------------- /utils/dict_to_phoneset.py: -------------------------------------------------------------------------------- 1 | # -*- encoding: utf8 2 | 3 | import sys, os 4 | import codecs 5 | 6 | dict_filename = sys.argv[1] 7 | phoneset_filename = sys.argv[2] 8 | 9 | phoneset = [] 10 | 11 | fi = codecs.open(dict_filename, 'r', 'utf8') 12 | 13 | for l in fi.readlines(): 14 | if ';;;' in l: 15 | continue 16 | cols = l.strip().split() 17 | word = cols[0] 18 | phones = cols[1:] 19 | for p in phones: 20 | if p not in phoneset: 21 | phoneset.append(p) 22 | 23 | fi.close() 24 | 25 | fo = codecs.open(phoneset_filename, 'w+', 'utf8') 26 | for phone in sorted(phoneset): 27 | fo.write(phone + '\n') 28 | fo.close() 29 | -------------------------------------------------------------------------------- /utils/map_arpa_to_ipa.py: 
"""Convert a CMUDict-style ARPAbet lexicon into the unified IPA-based phoneset.

Usage:
    python map_arpa_to_ipa.py <arpa_to_ipa_map> <arpa_lexicon> <ipa_lexicon>

The map file holds one ARPA symbol per line followed by one or more target
phones (e.g. ``ER0 <phone> <phone>``).  The lexicon holds one entry per line:
a word followed by its ARPA phones.  The output holds ``word<TAB>phones``.
"""
import codecs
import os
import re
import sys

# CMUDict marks alternative pronunciations as WORD(1), WORD(2), ...; the
# marker is removed so every variant is emitted under the plain word.
_VARIANT_MARKER = re.compile(r'\([0-9]*\)')


def load_phone_map(map_path):
    """Read the ARPA-to-target mapping file into a dict.

    Each non-blank line is ``ARPA_SYMBOL phone [phone ...]``; the value stored
    for a symbol is its target phones joined by single spaces.
    """
    phone_map = {}
    with codecs.open(map_path, 'r', 'utf8') as map_file:
        for line in map_file:
            cols = line.split()
            if not cols:
                # Tolerate blank lines instead of failing on cols[0].
                continue
            phone_map[cols[0]] = u' '.join(cols[1:])
    return phone_map


def convert_entry(line, phone_map):
    """Convert one lexicon line to ``word<TAB>phones<NEWLINE>``.

    Returns None for lines that produce no output: lines containing the
    ``;;;`` comment marker and blank lines.

    Raises:
        KeyError: if the entry uses a phone that is absent from phone_map.
    """
    if ';;;' in line:
        return None
    cols = line.split()
    if not cols:
        return None
    word = _VARIANT_MARKER.sub('', cols[0])
    arpa_phones = cols[1:]
    unknown = [phn for phn in arpa_phones if phn not in phone_map]
    if unknown:
        # Name the offending entry; a bare KeyError('XX') is hard to trace
        # back into a lexicon with >100k lines.
        raise KeyError('no mapping for phone(s) %s in entry: %s'
                       % (' '.join(unknown), line.strip()))
    ipa_phones = [phone_map[phn] for phn in arpa_phones]
    return word + u'\t' + u' '.join(ipa_phones) + u'\n'


def convert_lexicon(map_path, arpa_path, ipa_path):
    """Translate every entry of arpa_path and write the result to ipa_path."""
    phone_map = load_phone_map(map_path)
    # Context managers guarantee all handles are closed, also on error.
    with codecs.open(arpa_path, 'r', 'utf8') as arpa_file:
        with codecs.open(ipa_path, 'w', 'utf8') as ipa_file:
            for line in arpa_file:
                entry = convert_entry(line, phone_map)
                if entry is not None:
                    ipa_file.write(entry)


def main(argv):
    """Command-line entry point; returns the process exit status."""
    if len(argv) != 4:
        sys.stderr.write(
            'Usage: %s <arpa_to_ipa_map> <arpa_lexicon> <ipa_lexicon>\n'
            % argv[0])
        return 1
    convert_lexicon(argv[1], argv[2], argv[3])
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))