├── trending.csv ├── .gitignore ├── .github └── workflows │ ├── validate.yml │ └── validate-and-sort.yml ├── scripts ├── ignore.csv ├── dedup_across_files.py ├── find_duplicates.py ├── sort_and_dedup.py ├── validate.py ├── jyutping.py └── lint.py ├── .vscode └── tasks.json ├── README-en.md ├── proper_nouns.csv ├── onomatopoeia.csv ├── README.md ├── variant.csv ├── LICENSE └── fixed_expressions.csv /trending.csv: -------------------------------------------------------------------------------- 1 | char,jyutping 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | scripts/__pycache__ 2 | -------------------------------------------------------------------------------- /.github/workflows/validate.yml: -------------------------------------------------------------------------------- 1 | name: Validate 2 | 3 | on: 4 | push: 5 | branches-ignore: 6 | - main # handled separately 7 | pull_request: 8 | 9 | jobs: 10 | compile: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout 14 | uses: actions/checkout@v3 15 | - name: Validate 16 | run: python scripts/validate.py 17 | - name: Find Duplicated Lines 18 | run: python scripts/find_duplicates.py 19 | -------------------------------------------------------------------------------- /scripts/ignore.csv: -------------------------------------------------------------------------------- 1 | char,jyutping 2 | 㕵,wau1 3 | 㗱,zep2 4 | 㗱,zep4 5 | 乓,bam1 6 | 乓,bam2 7 | 乓,bam4 8 | 円,jen1 9 | 卡,kaat1 10 | 呱,gwek4 11 | 咭,kaat1 12 | 唷,jo1 13 | 唷,jo3 14 | 喲,jo1 15 | 喴,wi1 16 | 喵,meu1 17 | 喼,gep1 18 | 嘭,bam4 19 | 嚕,lu1 20 | 嚕,lu3 21 | 嚕,lu4 22 | 壁,bek1 23 | 壁,bek3 24 | 壁,bek4 25 | 夾,gep2 26 | 夾,gep6 27 | 慕,mu1 28 | 掉,deu6 29 | 泵,bam1 30 | 瀎,met6 31 | 畸,ki1 32 | 疊,dep6 33 | 舔,lem2 34 | 芒,mon1 35 | 調,deu6 36 | 鉗,kem2 37 | 鉗,kem4 38 | 𠰲,oet1 39 | 𠵯,gwit1 40 | 𢚖,ti4 41 | 𦧷,lem2 42 | 𩜠,mam1 43 | 𪚩,gwi1 44 | 勾,ngau1 45 | 鈎,ngau1 46 | 銀,ngan2 47 | 揈,fing6 48 | 奀,ngan1 49 | 囈,ngai1 50 | 啱,ngaam1 51 | 額,ngaak2 52 | 鵝,ngo2 53 | 囊,nong2 54 | 耐,noi2 55 | 硬,ngaang2 56 | 𢯎,ngaau1 57 | -------------------------------------------------------------------------------- /scripts/dedup_across_files.py: -------------------------------------------------------------------------------- 1 | from glob import iglob 2 | 3 | seen = set() 4 | seen_add = seen.add 5 | 6 | for filename in iglob('*.csv'): 7 | with open(filename, encoding='utf-8') as f: 8 | header = next(f).rstrip('\n') 9 | if header != 'char,jyutping': 10 | continue 11 | entries = [] 12 | entries_append = entries.append 13 | for line in f: 14 | line = line.rstrip('\n') 15 | if line not in seen: 16 | entries_append(line) 17 | seen_add(line) 18 | 19 | with open(filename, 'w', encoding='utf-8') as f: 20 | print(header, file=f) 21 | for line in entries: 22 | print(line, file=f) 23 | -------------------------------------------------------------------------------- /scripts/find_duplicates.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from glob import iglob 3 | import sys 4 | 5 | line_to_locations = defaultdict(list) 6 | 7 | has_error = False 8 | i = 0 9 | 10 | for filename in iglob('*.csv'): 11 | with open(filename, encoding='utf-8') as f: 12 | next(f) 13 | for line_num, line in enumerate(f, 2): 14 | line = line.rstrip('\n') 15 | location = f'{filename}:{line_num}' 16 | line_to_locations[line].append(location) 17 | 18 | for line, locations in line_to_locations.items(): 19 | if len(locations) > 1: 20 | locations_str = ', '.join(locations) 21 | print(f'[{i:04}] \033[91mERROR: "{line}" is duplicated in [{locations_str}]\033[0m', file=sys.stderr) 22 | has_error = True 23 | i += 1 24 | 25 | if has_error: 26 | sys.exit(1) 27 | -------------------------------------------------------------------------------- /.github/workflows/validate-and-sort.yml: -------------------------------------------------------------------------------- 1 | name: Validate and sort 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | compile: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v3 14 | - name: Validate 15 | run: python scripts/validate.py 16 | - name: Find Duplicated Lines 17 | run: python scripts/find_duplicates.py 18 | - name: Sort 19 | run: python scripts/sort_and_dedup.py 20 | - name: Push back to GitHub when contents changed 21 | run: | 22 | git config user.email "41898282+github-actions[bot]@users.noreply.github.com" 23 | git config user.name "github-actions[bot]" 24 | git add . 25 | if [ -n "$(git status --porcelain)" ]; then 26 | git commit -m "Sort entries" 27 | git push origin main 28 | fi 29 | -------------------------------------------------------------------------------- /scripts/sort_and_dedup.py: -------------------------------------------------------------------------------- 1 | from glob import iglob 2 | from operator import itemgetter, methodcaller 3 | 4 | def compose(f, g): 5 | return lambda arg: f(g(arg)) 6 | 7 | sort_cols = { 8 | 'variant.csv': (1, 3, 0, 2) 9 | } 10 | 11 | for filename in iglob('*.csv'): 12 | with open(filename, encoding='utf-8') as f: 13 | header = next(f).rstrip('\n') 14 | sort_criteria = methodcaller('split', ',') 15 | if filename in sort_cols: 16 | sort_criteria = compose(itemgetter(*sort_cols[filename]), sort_criteria) 17 | entries = sorted(f, key=sort_criteria) 18 | 19 | with open(filename, 'w', encoding='utf-8') as f: 20 | print(header, file=f) 21 | prev_line = None 22 | for line in entries: 23 | line = line.rstrip('\n') 24 | if line != prev_line: 25 | print(line, file=f) 26 | prev_line = line 27 | -------------------------------------------------------------------------------- /.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | { 2 | // See https://go.microsoft.com/fwlink/?LinkId=733558 3 | // for the documentation about the tasks.json format 4 | "version": "2.0.0", 5 | "tasks": [ 6 | { 7 | "label": "Lint", 8 | "type": "shell", 9 | "command": "python", 10 | "args": ["${workspaceFolder}/scripts/lint.py"], 11 | "isBackground": true, 12 | "presentation": { 13 | "clear": true, 14 | "reveal": "never" 15 | }, 16 | "problemMatcher": { 17 | "owner": "python", 18 | "fileLocation": ["relative", "${workspaceFolder}"], 19 | "pattern": { 20 | "regexp": "^(.*?):(\\d+):(\\d+),(\\d+):\\s+(WARNING|ERROR):\\s+(.*)$", 21 | "file": 1, 22 | "line": 2, 23 | "column": 3, 24 | "endColumn": 4, 25 | "severity": 5, 26 | "message": 6 27 | }, 28 | "background": { 29 | "activeOnStart": true, 30 | "beginsPattern": "----- Message Starts -----", 31 | "endsPattern": "----- Message Ends -----" 32 | } 33 | }, 34 | "runOptions": { 35 | "runOn": "folderOpen" 36 | } 37 | } 38 | ] 39 | } 40 | -------------------------------------------------------------------------------- /README-en.md: -------------------------------------------------------------------------------- 1 | [粵語](README.md) 2 | 3 | # rime-cantonese Upstream Word List 4 | 5 | This repo serves as the upstream data storage for [rime-cantonese](https://github.com/rime/rime-cantonese). The rime-cantonese repo regularly pulls data from this repo and compile the lexicon. 6 | 7 | ## Structure 8 | 9 | This repo contains the following files: 10 | 11 | 1. `char.csv`: Characters 12 | 1. `word.csv`: Common words 13 | 1. `phrase_fragment.csv`: Short phrases, input fragments and combos, ngrams 14 | 1. `trending.csv`: Uncategorized newly added words. 15 | 16 | ## Data sources 17 | 18 | Source of single character entries 19 | 20 | - LSHK 電腦用漢字粵語拼音表 https://github.com/lshk-org/jyutping-table 21 | 22 | Consultant resources for single character entries 23 | 24 | - [Unihan 12.0 kCantonese](https://www.unicode.org/charts/unihan.html) 25 | - [粵語審音配詞字庫](https://humanum.arts.cuhk.edu.hk/Lexis/lexi-can/) 26 | - [《廣州話正音字典》](https://github.com/jyutnet/cantonese-books-data/tree/master/2004_%E5%BB%A3%E5%B7%9E%E8%A9%B1%E6%AD%A3%E9%9F%B3%E5%AD%97%E5%85%B8) 27 | 28 | Source of word entries 29 | 30 | - [粵典](https://words.hk/faiman/analysis/wordslist/) 31 | - [冚唪唥粵文](https://hambaanglaang.hk/) 32 | - [《實用廣州話分類詞典》](https://github.com/rime/rime-cantonese/blob/build/lexicons/%E3%80%8A%E5%AF%A6%E7%94%A8%E5%BB%A3%E5%B7%9E%E8%A9%B1%E5%88%86%E9%A1%9E%E8%A9%9E%E5%85%B8%E3%80%8B.tsv) 33 | - A Dictionary of Cantonese Slang 34 | - 《廣州話詞典》 35 | - 《地道廣州話用語》 36 | 37 | ## Credits 38 | 39 | - laubonghaudoi 40 | - Ayaka 41 | - Leimaau 42 | - Chaak 43 | - Bing Cheung 44 | - Cherry 45 | - Lili Ou 46 | - Philip Wong 47 | - Henry Chan 48 | - Alex Man 49 | -------------------------------------------------------------------------------- /proper_nouns.csv: -------------------------------------------------------------------------------- 1 | category,char,jyutping 2 | 交通,九巴,gau2 baa1 3 | 交通,九廣鐵路,gau2 gwong2 tit3 lou6 4 | 交通,城巴,sing4 baa1 5 | 交通,新巴,san1 baa1 6 | 人名,余文樂,jyu4 man4 lok6 7 | 人名,姜濤,goeng1 tou4 8 | 人名,張天賦,zoeng1 tin1 fu3 9 | 人名,張敬軒,zoeng1 ging3 hin1 10 | 人名,李兆基,lei5 siu6 gei1 11 | 人名,李卓仁,lei5 coek3 jan4 12 | 人名,李嘉誠,lei5 gaa1 sing4 13 | 人名,李家超,lei5 gaa1 ciu1 14 | 人名,李幸倪,lei5 hang6 ngai4 15 | 人名,林家謙,lam4 gaa1 him1 16 | 人名,林智樂,lam4 zi3 lok6 17 | 人名,林鄭,lam4 zeng6 18 | 人名,林鄭月娥,lam4 zeng6 jyut6 ngo4 19 | 人名,梁振英,loeng4 zan3 jing1 20 | 人名,炎明熹,jim4 ming4 hei1 21 | 人名,范徐麗泰,faan6 ceoi4 lai6 taai3 22 | 人名,邱彥筒,jau1 jin6 tung4 23 | 人名,陳健安,can4 gin6 on1 24 | 人名,陳卓賢,can4 coek3 jin4 25 | 人名,陳柏宇,can4 paak3 jyu5 26 | 人名,陳蕾,can4 leoi4 27 | 人名,馮允謙,fung4 wan5 him1 28 | 人名,魏浚笙,ngai6 zeon3 sang1 29 | 人名,黃偉文,wong4 wai5 man4 30 | 其他,中大,zung1 daai6 31 | 其他,城大,sing4 daai6 32 | 地名,上海,soeng6 hoi2 33 | 地名,中國,zung1 gwok3 34 | 地名,北京,bak1 ging1 35 | 地名,大埔,daai6 bou3 36 | 地名,大埔仔,daai6 bou3 zai2 37 | 地名,大埔墟,daai6 bou3 heoi1 38 | 地名,大陸,daai6 luk6 39 | 地名,屯元天,tyun4 jyun4 tin1 40 | 地名,屯門,tyun4 mun4 41 | 地名,掃桿埔,sou3 gon2 bou2 42 | 地名,沙田,saa1 tin4 43 | 地名,臺灣,toi4 waan1 44 | 地名,香港,hoeng1 gong2 45 | 路名,北大嶼山公路,bak1 daai6 jyu4 saan1 gung1 lou6 46 | 路名,城門隧道,sing4 mun4 seoi6 dou6 47 | 路名,大老山隧道,daai6 lou5 saan1 seoi6 dou6 48 | 路名,將藍隧道,zoeng1 laam4 seoi6 dou6 49 | 路名,屯公,tyun4 gung1 50 | 路名,屯赤,tyun4 cek3 51 | 路名,德士古道,dak1 si6 gu2 dou6 52 | 路名,東廊,dung1 long2 53 | 路名,東隧,dung1 seoi6 54 | 路名,欖隧,laam6 seoi6 55 | 路名,汀九橋,ding1 gau2 kiu4 56 | 路名,獅隧,si1 seoi6 57 | 路名,紅隧,hung4 seoi6 58 | 路名,荔橋,lai6 kiu4 59 | 路名,西隧,sai1 seoi6 60 | 路名,青山公路,cing1 saan1 gung1 lou6 61 | 路名,青馬大橋,cing1 maa5 daai6 kiu4 62 | 路名,香港仔隧道,hoeng1 gong2 zai2 seoi6 dou6 63 | 路名,鵝頸橋,ngo4 geng2 kiu4 64 | 路名,龍翔道,lung4 coeng4 dou6 65 | 飲食,譚仔,taam5 zai2 66 | 飲食,譚仔三哥,taam5 zai2 saam1 go1 67 | -------------------------------------------------------------------------------- /onomatopoeia.csv: -------------------------------------------------------------------------------- 1 | type,jyutping,char 2 | A1111,bi1 li1 baa1 laa1,嗶哩叭啦 3 | A1111,bi1 li1 baa1 laa1,嗶哩吧啦 4 | A1111,ding1 ling1 daang1 laang1,傾呤哐啷 5 | A1111,gi1 li1 gu1 lu1,嘰哩咕嚕 6 | A1111,kik1 lik1 kaak1 laak1,虢礫緙嘞 7 | A1111,kik1 lik1 kaak1 laak1,闃礫緙嘞 8 | A1111,kik1 lik1 kaak1 laak1,𠽤叻𡃈嘞 9 | A1111,kik1 lik1 kaak1 laak1,𠽤嚦𡃈嘞 10 | A1111,king1 ling1 kaang1 laang1,傾呤哐啷 11 | A1111,king1 ling1 kong1 long1,傾鈴哐啷 12 | A1111,pik1 lik1 paak1 laak1,霹靂啪嘞 13 | A1111,ping1 ling1 paang1 laang1,砰鈴嘭唥 14 | A4144,bi4 li1 bek4 lek4,啤哩壁叻 15 | A4144,bing4 ling1 baang4 laang4,乒鈴嘭唥 16 | A4144,bing4 ling1 baang4 laang4,乒鈴𠾴唥 17 | A4144,bing4 ling1 baang4 laang4,砰呤嘭唥 18 | A4144,bing4 ling1 baang4 laang4,砰呤𠾴唥 19 | A4144,bing4 ling1 baang4 laang4,𠹶呤嘭唥 20 | A4144,bing4 ling1 baang4 laang4,𠹶呤𠾴唥 21 | A4144,fi4 li1 fe4 le4,fi li fe le 22 | A4144,fi4 li1 fe4 le4,飛哩啡呢 23 | A4144,gi4 li1 gat6 gat6,嘰哩咕嚕 24 | A4144,gi4 li1 gu4 lu4,嘰哩咕嚕 25 | A4144,ping4 ling1 paang4 laang4,砰呤嘭唥 26 | A4144,ping4 ling1 paang4 laang4,砰呤𠾴唥 27 | A4144,ping4 ling1 paang4 laang4,𠹶呤嘭唥 28 | A4144,ping4 ling1 paang4 laang4,𠹶呤𠾴唥 29 | A4144,si4 li1 soe4 loe4,si li sir lur 30 | A4144,si4 li1 soe4 loe4,噝哩𡄽𠼱 31 | A4163,ding4 ling1 dang6 lang3,椗呤鄧㨢 32 | B1111,mi1 mi1 mo1 mo1,咪咪嚒嚒 33 | B1111,mi1 mi1 mo1 mo1,咪咪嚤嚤 34 | B1111,mi1 mi1 mo1 mo1,咪咪摩摩 35 | B1111,mi1 mi1 mo1 mo1,咪咪摸摸 36 | B1111,zi1 zi1 zaa1 zaa1,吱吱喳喳 37 | B4144,hi4 hi1 hoe4 hoe4,hee hee hur hur 38 | B4144,ji4 ji1 aai4 aai4,咦咿哎哎 39 | B4144,ji4 ji1 ang4 ang4,咦咦ang ang 40 | B4144,ji4 ji1 ang4 ang4,咦咿哽哽 41 | B4144,ji4 ji1 ngo4 ngo4,咦咿哦哦 42 | B4144,ji4 ji1 o4 o4,咦咿哦哦 43 | B4144,zi4 zi1 zam4 zam4,吱吱斟斟 44 | B4166,gi4 gi1 gat6 gat6,嘰嘰咭咭 45 | B4166,gi4 gi1 gat6 gat6,嘰嘰屹屹 46 | B4166,gi4 gi1 gat6 gat6,嘰嘰訖訖 47 | B4166,gi4 gi1 gat6 gat6,嘰嘰趷趷 48 | B4166,gi4 gi1 gat6 gat6,齮齮齕齕 49 | B4444,bing4 bing4 baang4 baang4,砰砰嘭嘭 50 | B4444,bing4 bing4 baang4 baang4,𠹶𠹶嘭嘭 51 | B4444,ping4 ping4 paang4 paang4,砰砰嘭嘭 52 | B4444,ping4 ping4 paang4 paang4,𠹶𠹶嘭嘭 53 | B4444,zi4 zi4 zam4 zam4,吱吱浸浸 54 | -------------------------------------------------------------------------------- /scripts/validate.py: -------------------------------------------------------------------------------- 1 | import jyutping 2 | import sys 3 | from glob import iglob 4 | 5 | non_han = {*',:'} 6 | multisyllable_allowlist = {*'兡瓸䇉竡尣兛瓩竏𥪕兝瓰竕嗧浬兞瓱竓呎吋啢𠺖兣糎甅竰卅𠯢兙瓧䇆竍卌'} 7 | 8 | with open('scripts/ignore.csv', encoding='utf-8') as f: 9 | next(f) 10 | ignoreroman_list = {tuple(line.rstrip('\n').split(',')) for line in f} 11 | 12 | def is_han(char): 13 | return char == '\u3007' or \ 14 | '\u3400' <= char <= '\u4dbf' or \ 15 | '\u4e00' <= char <= '\u9fff' or \ 16 | '\uf900' <= char <= '\ufaff' or \ 17 | '\U00020000' <= char <= '\U0002a6df' or \ 18 | '\U0002a700' <= char <= '\U0002ebef' or \ 19 | '\U0002f800' <= char <= '\U0002fa1f' or \ 20 | '\U00030000' <= char <= '\U000323af' 21 | 22 | has_error = False 23 | i = 0 24 | 25 | for filename in iglob('*.csv'): 26 | with open(filename, encoding='utf-8') as f: 27 | if not next(f).startswith('char,jyutping'): 28 | continue 29 | 30 | for line_num, line in enumerate(f, 2): 31 | word, romans, *_ = line.rstrip('\n').split(',') 32 | 33 | word_ = [char for char in word if char not in non_han] 34 | romans_ = romans.split(' ') 35 | 36 | if '' in romans_: 37 | print(f'[{i:04}] \033[91m ERROR: [{filename}:{line_num}] Leading, trailing or continuous spaces are not allowed: {word}, "{romans}"\033[0m', file=sys.stderr) 38 | has_error = True 39 | i += 1 40 | 41 | romans_ = [roman for roman in romans_ if roman] 42 | 43 | if len(word_) != len(romans_) and not any(char in multisyllable_allowlist for char in word_): 44 | print(f'[{i:04}] \033[91m ERROR: [{filename}:{line_num}] Length do not match: {word}, "{romans}"\033[0m', file=sys.stderr) 45 | has_error = True 46 | i += 1 47 | 48 | if not all(is_han(char) for char in word_): 49 | print(f'[{i:04}] \033[91m ERROR: [{filename}:{line_num}] Word contains invalid char: {word}, "{romans}"\033[0m', file=sys.stderr) 50 | has_error = True 51 | i += 1 52 | 53 | for char, roman in zip(word_, romans_): 54 | if (char, roman) in ignoreroman_list: 55 | continue 56 | status = jyutping.validate(roman, jyutping.TestSet.LOOSE) 57 | if status == jyutping.ValidationStatus.UNCOMMON: 58 | print(f'[{i:04}] WARNING: [{filename}:{line_num}] Uncommon jyutping "{roman}": {word}, "{romans}"', file=sys.stderr) 59 | i += 1 60 | elif status == jyutping.ValidationStatus.INVALID: 61 | print(f'[{i:04}] \033[91m ERROR: [{filename}:{line_num}] Invalid jyutping "{roman}": {word}, "{romans}"\033[0m', file=sys.stderr) 62 | has_error = True 63 | i += 1 64 | 65 | if has_error: 66 | sys.exit(1) 67 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [English](README-en.md) 2 | 3 | # rime-cantonese 上游詞表 4 | 5 | [![License: CC BY 4.0](https://img.shields.io/badge/License-CC_BY_4.0-red.svg)](https://creativecommons.org/licenses/by/4.0/) 6 | 7 | 本倉庫係 [rime-cantonese](https://github.com/rime/rime-cantonese) 嘅上游詞表。rime-cantonese 作為下游輸入法碼表會通過 CI 自動從本倉庫揸取更新構建新碼表。 8 | 9 | ## 結構 10 | 11 | 呢個上游詞表會將所有詞條分成以下幾類,每類對應一個文件: 12 | 13 | 1. `char.csv`:單字音 14 | 1. `variant.csv`:異體字分類 15 | 1. `word.csv`:常用詞 16 | 1. `fixed_expressions.csv`:成語、諺語、歇後語、文言短句 17 | 1. `phrase_fragment.csv`:短句、文字碎片、常見輸入組合、ngram 18 | 1. `trending.csv`未分類嘅流行詞 19 | 20 | ### 單字收錄 `char.csv` 格式説明 21 | 22 | | char | jyutping | pron_rank | tone_var | literary_vernacular | comment | 23 | | ------------ | ------------------------ | ------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------ | ------------------------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------- | 24 | | 漢字 Unicode | 粵拼:漢字對應嘅粵拼發音 | 發音常見度: