├── .github └── workflows │ ├── daily-build.yaml │ └── test.yaml ├── .gitignore ├── .prettierrc ├── LICENSE ├── package-lock.json ├── package.json ├── readme.md └── src ├── constants.js ├── convertToFrequencyDictionary.js ├── convertToHonziDictionary.js ├── convertToTermDictionary.js ├── downloadLatest.js ├── test ├── parseCantoneseReadings.test.js ├── parseEntry.test.js └── testdata.csv ├── types.d.ts └── util ├── addYomitanImages.js ├── addYomitanTags.js ├── csv ├── csvHandler.js └── parseCsvEntriesToJson.js ├── entryParse ├── findImages.js ├── parseEntryToJson.js └── parseLabels.js ├── getVersion.js ├── imageHandler ├── compressImages.js ├── downloadImages.js └── getImageFileName.js ├── readAndParseCSVs.js ├── textHandling ├── parseCantoneseReadings.js └── textUtils.js └── yomitan ├── convertEntryToDetailedDefinition.js ├── convertEntryToSynAntsSC.js ├── convertEntryToYomitanTerms.js ├── convertHeadwordsToSC.js ├── convertSenseToSC.js ├── createEntryAttribution.js ├── createEntryImageSC.js └── parseTextToSC.js /.github/workflows/daily-build.yaml: -------------------------------------------------------------------------------- 1 | name: Build and Release Dictionaries Daily 2 | 3 | on: 4 | schedule: 5 | - cron: '0 0 * * *' 6 | workflow_dispatch: 7 | 8 | jobs: 9 | build-release: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | 15 | - name: Install Dependencies 16 | run: npm ci 17 | 18 | - name: Download Latest CSVs 19 | run: npm run download 20 | 21 | - name: Get Current Date 22 | id: date 23 | run: echo "::set-output name=date::$(date +'%Y-%m-%d')" 24 | 25 | - name: Build Dictionaries 26 | run: | 27 | npm run buildTermDict ${{ steps.date.outputs.date }} 28 | npm run buildHonziDict ${{ steps.date.outputs.date }} 29 | 30 | - name: Create and Publish Release 31 | uses: softprops/action-gh-release@v2 32 | with: 33 | files: dist/* 34 | tag_name: ${{ steps.date.outputs.date }} 35 | name: ${{ steps.date.outputs.date }} 36 | token: ${{ secrets.GITHUB_TOKEN }} 37 | body: | 38 | This is an automated release of the latest Words.hk for Yomitan. 39 | For more information, please see the [README](https://github.com/MarvNC/wordshk-yomitan). 40 | Download the latest release below: the file name should look like `Words.hk.YYYY-MM-DD.zip`. 41 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: Node.js CI 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | test: 7 | runs-on: ubuntu-latest 8 | 9 | steps: 10 | - uses: actions/checkout@v4 11 | - uses: actions/setup-node@v4 12 | with: 13 | node-version: 20 14 | - run: npm ci 15 | - run: npm test 16 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.txt 2 | csvs 3 | /images 4 | /compressedImages 5 | /freqjsons 6 | 7 | # Created by https://www.toptal.com/developers/gitignore/api/node 8 | # Edit at https://www.toptal.com/developers/gitignore?templates=node 9 | 10 | ### Node ### 11 | # Logs 12 | logs 13 | *.log 14 | npm-debug.log* 15 | yarn-debug.log* 16 | yarn-error.log* 17 | lerna-debug.log* 18 | .pnpm-debug.log* 19 | 20 | # Diagnostic reports (https://nodejs.org/api/report.html) 21 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 22 | 23 | # Runtime data 24 | pids 25 | *.pid 26 | *.seed 27 | *.pid.lock 28 | 29 | # Directory for instrumented libs generated by jscoverage/JSCover 30 | lib-cov 31 | 32 | # Coverage directory used by tools like istanbul 33 | coverage 34 | *.lcov 35 | 36 | # nyc test coverage 37 | .nyc_output 38 | 39 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 40 | .grunt 41 | 42 | # Bower dependency directory (https://bower.io/) 43 | bower_components 44 | 45 | # node-waf configuration 46 | .lock-wscript 47 | 48 | # Compiled binary addons (https://nodejs.org/api/addons.html) 49 | build/Release 50 | 51 | # Dependency directories 52 | node_modules/ 53 | jspm_packages/ 54 | 55 | # Snowpack dependency directory (https://snowpack.dev/) 56 | web_modules/ 57 | 58 | # TypeScript cache 59 | *.tsbuildinfo 60 | 61 | # Optional npm cache directory 62 | .npm 63 | 64 | # Optional eslint cache 65 | .eslintcache 66 | 67 | # Optional stylelint cache 68 | .stylelintcache 69 | 70 | # Microbundle cache 71 | .rpt2_cache/ 72 | .rts2_cache_cjs/ 73 | .rts2_cache_es/ 74 | .rts2_cache_umd/ 75 | 76 | # Optional REPL history 77 | .node_repl_history 78 | 79 | # Output of 'npm pack' 80 | *.tgz 81 | 82 | # Yarn Integrity file 83 | .yarn-integrity 84 | 85 | # dotenv environment variable files 86 | .env 87 | .env.development.local 88 | .env.test.local 89 | .env.production.local 90 | .env.local 91 | 92 | # parcel-bundler cache (https://parceljs.org/) 93 | .cache 94 | .parcel-cache 95 | 96 | # Next.js build output 97 | .next 98 | out 99 | 100 | # Nuxt.js build / generate output 101 | .nuxt 102 | dist 103 | 104 | # Gatsby files 105 | .cache/ 106 | # Comment in the public line in if your project uses Gatsby and not Next.js 107 | # https://nextjs.org/blog/next-9-1#public-directory-support 108 | # public 109 | 110 | # vuepress build output 111 | .vuepress/dist 112 | 113 | # vuepress v2.x temp and cache directory 114 | .temp 115 | 116 | # Docusaurus cache and generated files 117 | .docusaurus 118 | 119 | # Serverless directories 120 | .serverless/ 121 | 122 | # FuseBox cache 123 | .fusebox/ 124 | 125 | # DynamoDB Local files 126 | .dynamodb/ 127 | 128 | # TernJS port file 129 | .tern-port 130 | 131 | # Stores VSCode versions used for testing VSCode extensions 132 | .vscode-test 133 | 134 | # yarn v2 135 | .yarn/cache 136 | .yarn/unplugged 137 | .yarn/build-state.yml 138 | .yarn/install-state.gz 139 | .pnp.* 140 | 141 | ### Node Patch ### 142 | # Serverless Webpack directories 143 | .webpack/ 144 | 145 | # Optional stylelint cache 146 | 147 | # SvelteKit build / generate output 148 | .svelte-kit 149 | 150 | # End of https://www.toptal.com/developers/gitignore/api/node -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "tabWidth": 2, 3 | "useTabs": false, 4 | "singleQuote": true, 5 | "proseWrap": "always" 6 | } -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 marv 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "scripts": { 3 | "download": "node src/downloadLatest.js", 4 | "buildTermDict": "node src/convertToTermDictionary.js", 5 | "buildFreq": "node src/convertToFrequencyDictionary.js", 6 | "buildHonziDict": "node src/convertToHonziDictionary.js", 7 | "test": "ava" 8 | }, 9 | "dependencies": { 10 | "@gerhobbelt/xregexp": "^4.4.0-32", 11 | "axios": "^1.6.7", 12 | "csv-parser": "^3.0.0", 13 | "is-cjk-hanzi": "^1.0.0", 14 | "jsdom": "^23.0.1", 15 | "sharp": "^0.33.2", 16 | "yomichan-dict-builder": "^2.9.0", 17 | "zlib": "^1.0.5" 18 | }, 19 | "type": "module", 20 | "devDependencies": { 21 | "ava": "^6.0.1" 22 | }, 23 | "version": "1.0.0" 24 | } 25 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | # Words.hk for Yomitan 2 | 3 | [![](https://img.shields.io/github/v/tag/marvnc/wordshk-yomitan?style=for-the-badge&label=Last%20Release)](https://github.com/MarvNC/wordshk-yomitan/releases/latest) 4 | 5 | A conversion of the [words.hk](https://words.hk) dictionary for 6 | [Yomitan](https://github.com/themoeway/yomitan) (formerly Yomichan). The 7 | words.hk dictionary data is fetched from 8 | [words.hk](https://words.hk/faiman/analysis/), built, then released 9 | automatically every day. 10 | 11 | Built using 12 | [yomichan-dict-builder](https://github.com/MarvNC/yomichan-dict-builder). For 13 | more Yomitan dictionaries and tools, see 14 | [Yomichan Dictionaries](https://github.com/MarvNC/yomichan-dictionaries). 15 | 16 | ## Download 17 | 18 | - [Words.hk for Yomitan](https://github.com/MarvNC/wordshk-yomitan/releases/latest) 19 | - [Words.hk 漢字 for Yomitan](https://github.com/MarvNC/wordshk-yomitan/releases/latest) 20 | - [Words.hk Frequency](https://github.com/MarvNC/wordshk-yomitan/releases/download/2024-09-17/YUE.Freq.Words.hk.Frequency.zip) 21 | 22 | ## Screenshots 23 | 24 | | ![chrome_廣東話_-_廣東話解釋__粵典_-_Google_Chrome_2024-02-03_22-57-37](https://github.com/MarvNC/wordshk-yomitan/assets/17340496/83eacfc1-6e31-453c-91c2-a8dac3be0bc4) | ![chrome_老虎_-_廣東話解釋__粵典_-_Google_Chrome_2024-02-03_22-58-13](https://github.com/MarvNC/wordshk-yomitan/assets/17340496/e882daa8-6fc4-491d-930e-ca9a0a081193) | 25 | | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | 26 | | ![chrome_全脂_-_廣東話解釋__粵典_-_Google_Chrome_2024-02-03_22-58-35](https://github.com/MarvNC/wordshk-yomitan/assets/17340496/51fa78ba-c882-4f8c-b159-57a86f08e74b) | ![chrome_講_-_廣東話解釋__粵典_-_Google_Chrome_2024-02-03_22-58-48](https://github.com/MarvNC/wordshk-yomitan/assets/17340496/233798e0-2363-48c4-9c11-6665e6262ef2) | 27 | | ![chrome_Yomitan_Settings_-_Google_Chrome_2024-02-10_20-54-43](https://github.com/MarvNC/wordshk-yomitan/assets/17340496/57190a49-baaa-4313-87c7-9e8252daf2ae) | ![chrome_Yomitan_Settings_-_Google_Chrome_2024-02-10_20-53-17](https://github.com/MarvNC/wordshk-yomitan/assets/17340496/4f6b9654-eb5d-4187-8d8d-56f4a10dfcf6) | 28 | 29 | ## Usage 30 | 31 | Simply download the dictionary and import it into Yomitan. For more detailed 32 | instructions, please see the 33 | [Yomitan documentation](https://github.com/themoeway/yomitan). 34 | 35 | ## Attribution/License 36 | 37 | The code in this repository is licensed under the MIT license. 38 | 39 | This Yomitan dictionary is built off the free data provided by words.hk and is 40 | licensed under the Non-Commercial Open Data License 1.0 that 41 | [words.hk](https://words.hk/base/hoifong/) is. 42 | 43 | I took a lot of inspiration (copied) design ideas and styling from 44 | [Stephenmk's Jitendex](https://github.com/stephenmk/Jitendex) in designing this 45 | dictionary. 46 | -------------------------------------------------------------------------------- /src/constants.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @type {Record} 3 | */ 4 | const LANGUAGES_DATA = { 5 | yue: { 6 | name: '廣東話', 7 | shortName: '粵', 8 | langCode: 'yue', 9 | }, 10 | eng: { 11 | name: '英文', 12 | shortName: '英', 13 | langCode: 'en', 14 | }, 15 | zho: { 16 | name: '中文', 17 | shortName: '中', 18 | langCode: 'zh-Hant', 19 | }, 20 | jpn: { 21 | name: '日文', 22 | shortName: '日', 23 | langCode: 'ja', 24 | }, 25 | kor: { 26 | name: '韓文', 27 | shortName: '韓', 28 | langCode: 'ko', 29 | }, 30 | vie: { 31 | name: '越南文', 32 | shortName: '越', 33 | langCode: 'vi', 34 | }, 35 | lzh: { 36 | name: '文言文', 37 | shortName: '文', 38 | langCode: 'zh-Hant', 39 | }, 40 | por: { 41 | name: '葡萄牙文', 42 | shortName: '葡', 43 | langCode: 'pt', 44 | }, 45 | deu: { 46 | name: '德文', 47 | shortName: '德', 48 | langCode: 'de', 49 | }, 50 | fra: { 51 | name: '法文', 52 | shortName: '法', 53 | langCode: 'fr', 54 | }, 55 | mnc: { 56 | name: '滿文', 57 | shortName: '滿', 58 | langCode: 'mnc', 59 | }, 60 | lat: { 61 | name: '拉丁文', 62 | shortName: '拉', 63 | langCode: 'la', 64 | }, 65 | tib: { 66 | name: '藏文', 67 | shortName: '藏', 68 | langCode: 'bo', 69 | }, 70 | 量詞: { 71 | name: '量詞', 72 | shortName: '量詞', 73 | langCode: '', 74 | }, 75 | }; 76 | 77 | const IMAGE_FOLDER = 'images'; 78 | const COMPRESSED_IMAGES_FOLDER = './compressedImages'; 79 | const IMAGE_RESIZE_WIDTH = 400; 80 | 81 | export { 82 | LANGUAGES_DATA, 83 | IMAGE_FOLDER, 84 | COMPRESSED_IMAGES_FOLDER, 85 | IMAGE_RESIZE_WIDTH, 86 | }; 87 | export const dataFolder = './csvs'; 88 | export const exportDirectory = './dist'; 89 | 90 | export const TERM_INDEX_FILE = 'term_index.json'; 91 | export const HONZI_INDEX_FILE = 'honzi_index.json'; -------------------------------------------------------------------------------- /src/convertToFrequencyDictionary.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Requires the jsons downloaded from https://words.hk/faiman/analysis/ 3 | * to be in the freqjsons directory 4 | */ 5 | import fs from 'fs'; 6 | import path from 'path'; 7 | import { Dictionary, DictionaryIndex } from 'yomichan-dict-builder'; 8 | const freqJsonsDir = 'freqjsons'; 9 | const charCountJson = 'charcount.json'; 10 | const existingWordCountJson = 'existingwordcount.json'; 11 | 12 | (async () => { 13 | const freqJsons = fs.readdirSync(freqJsonsDir); 14 | const charCountData = JSON.parse( 15 | fs.readFileSync(path.join(freqJsonsDir, charCountJson)).toString() 16 | ); 17 | const existingWordCountData = JSON.parse( 18 | fs.readFileSync(path.join(freqJsonsDir, existingWordCountJson)).toString() 19 | ); 20 | console.log(`Read ${freqJsons.length} files from ${freqJsonsDir}`); 21 | console.log( 22 | `Read ${Object.keys(charCountData).length} characters from ${charCountJson}` 23 | ); 24 | console.log( 25 | `Read ${ 26 | Object.keys(existingWordCountData).length 27 | } words from ${existingWordCountJson}` 28 | ); 29 | 30 | const dictionary = new Dictionary({ 31 | fileName: 'Words.hk Frequency.zip', 32 | }); 33 | const dictionaryIndex = new DictionaryIndex() 34 | .setAuthor('Marv') 35 | .setAttribution( 36 | `Words.hk & contributers (https://words.hk) 37 | See license at https://words.hk/base/hoifong/` 38 | ) 39 | .setUrl('https://github.com/MarvNC/wordshk-yomitan') 40 | .setDescription( 41 | `Converted from the free Words.hk dictionary found at https://words.hk/. 42 | Converted using https://github.com/MarvNC/yomichan-dict-builder` 43 | ) 44 | .setTitle(`Words.hk Frequency`) 45 | .setRevision(`1.0`); 46 | await dictionary.setIndex(dictionaryIndex.build()); 47 | 48 | // Add characters to kanji meta 49 | const sortedCharCountData = Object.entries(charCountData).sort( 50 | ([, a], [, b]) => b - a 51 | ); 52 | for (let i = 0; i < sortedCharCountData.length; i++) { 53 | const [char, occurrences] = sortedCharCountData[i]; 54 | await dictionary.addKanjiMeta([ 55 | char, 56 | 'freq', 57 | { 58 | displayValue: `${i + 1} (${occurrences})`, 59 | value: i + 1, 60 | }, 61 | ]); 62 | } 63 | 64 | // Add words to dictionary 65 | const sortedExistingWordCountData = Object.entries( 66 | existingWordCountData 67 | ).sort(([, a], [, b]) => b - a); 68 | for (let i = 0; i < sortedExistingWordCountData.length; i++) { 69 | const [word, occurrences] = sortedExistingWordCountData[i]; 70 | await dictionary.addTermMeta([ 71 | word, 72 | 'freq', 73 | { 74 | displayValue: `${i + 1} (${occurrences})`, 75 | value: i + 1, 76 | }, 77 | ]); 78 | } 79 | 80 | await dictionary.export('dist'); 81 | console.log(`Exported dictionary to dist.`); 82 | })(); 83 | -------------------------------------------------------------------------------- /src/convertToHonziDictionary.js: -------------------------------------------------------------------------------- 1 | import fs from 'fs/promises'; 2 | import { Dictionary, DictionaryIndex, KanjiEntry } from 'yomichan-dict-builder'; 3 | import { getVersion } from './util/getVersion.js'; 4 | import { dataFolder, exportDirectory, HONZI_INDEX_FILE } from './constants.js'; 5 | import { readAndParseCSVs } from './util/readAndParseCSVs.js'; 6 | import { isSingleCJKHanzi } from 'is-cjk-hanzi'; 7 | 8 | (async () => { 9 | const tagName = process.argv[2] ?? 'latest'; 10 | 11 | const { dictionaryEntries, dateString } = await readAndParseCSVs(dataFolder); 12 | 13 | /** @type {`${string}.zip`} */ 14 | const honziDictionaryFilename = `Words.hk.Honzi.${dateString}.zip`; 15 | const dictionary = new Dictionary({ 16 | fileName: honziDictionaryFilename, 17 | }); 18 | 19 | const dictionaryIndex = new DictionaryIndex() 20 | .setAuthor('Marv') 21 | .setAttribution( 22 | `Words.hk & contributers (https://words.hk) 23 | See license at https://words.hk/base/hoifong/` 24 | ) 25 | .setUrl('https://github.com/MarvNC/wordshk-yomitan') 26 | .setDescription( 27 | `Converted from the free Words.hk dictionary found at https://words.hk/. 28 | Converted using https://github.com/MarvNC/yomichan-dict-builder` 29 | ) 30 | .setTitle(`Words.hk 粵典 漢字 [${dateString}]`) 31 | .setRevision(dateString) 32 | .setIsUpdatable(true) 33 | .setIndexUrl( 34 | `https://github.com/MarvNC/wordshk-yomitan/releases/latest/download/${HONZI_INDEX_FILE}` 35 | ) 36 | .setDownloadUrl( 37 | `https://github.com/MarvNC/wordshk-yomitan/releases/download/${tagName}/${honziDictionaryFilename}` 38 | ); 39 | await dictionary.setIndex(dictionaryIndex.build()); 40 | 41 | // save index file to exportDirectory 42 | await dictionaryIndex.export(exportDirectory, HONZI_INDEX_FILE); 43 | 44 | for (const entry of dictionaryEntries) { 45 | addHonziEntry(dictionary, entry); 46 | } 47 | console.log(`Finished adding entries to dictionary.`); 48 | 49 | const stats = await dictionary.export(exportDirectory); 50 | console.log(`Exported honzi dictionary to ${exportDirectory}.`); 51 | console.log(`Added ${stats.kanjiCount} honzi entries.`); 52 | })(); 53 | 54 | /** 55 | * 56 | * @param {Dictionary} dictionary 57 | * @param {DictionaryEntry} entry 58 | */ 59 | function addHonziEntry(dictionary, entry) { 60 | for (const headword of entry.headwords) { 61 | if (!isSingleCJKHanzi(headword.text)) { 62 | continue; 63 | } 64 | const kanjiEntry = new KanjiEntry(headword.text).setKunyomi( 65 | headword.readings.join(' ') 66 | ); 67 | for (const sense of entry.senses) { 68 | for (const explanationText of Object.values(sense.explanation)) { 69 | kanjiEntry.addMeanings(explanationText); 70 | } 71 | } 72 | dictionary.addKanji(kanjiEntry.build()); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/convertToTermDictionary.js: -------------------------------------------------------------------------------- 1 | import fs from 'fs/promises'; 2 | import { Dictionary, DictionaryIndex } from 'yomichan-dict-builder'; 3 | 4 | import { convertEntryToYomitanTerms } from './util/yomitan/convertEntryToYomitanTerms.js'; 5 | import { findLabelValues } from './util/entryParse/parseLabels.js'; 6 | import { addYomitanTags } from './util/addYomitanTags.js'; 7 | import { getAllImageURLs } from './util/entryParse/findImages.js'; 8 | import { downloadImages } from './util/imageHandler/downloadImages.js'; 9 | import { addYomitanImages } from './util/addYomitanImages.js'; 10 | import { 11 | IMAGE_FOLDER, 12 | COMPRESSED_IMAGES_FOLDER, 13 | IMAGE_RESIZE_WIDTH, 14 | TERM_INDEX_FILE, 15 | } from './constants.js'; 16 | import { compressImages } from './util/imageHandler/compressImages.js'; 17 | import { dataFolder, exportDirectory } from './constants.js'; 18 | import { getVersion } from './util/getVersion.js'; 19 | import { readAndParseCSVs } from './util/readAndParseCSVs.js'; 20 | 21 | (async () => { 22 | const tagName = process.argv[2] ?? 'latest'; 23 | 24 | const { dictionaryEntries, dateString } = await readAndParseCSVs(dataFolder); 25 | 26 | const uniqueLabels = findLabelValues(dictionaryEntries); 27 | 28 | const imageURLs = getAllImageURLs(dictionaryEntries); 29 | 30 | await downloadImages(imageURLs); 31 | 32 | const compressImagesPromise = compressImages( 33 | IMAGE_FOLDER, 34 | COMPRESSED_IMAGES_FOLDER, 35 | IMAGE_RESIZE_WIDTH 36 | ); 37 | 38 | /** @type {`${string}.zip`} */ 39 | const termDictionaryFileName = `Words.hk.${dateString}.zip`; 40 | const dictionary = new Dictionary({ 41 | fileName: termDictionaryFileName, 42 | }); 43 | 44 | const dictionaryIndex = new DictionaryIndex() 45 | .setAuthor('Marv') 46 | .setAttribution( 47 | `Words.hk & contributers (https://words.hk) 48 | See license at https://words.hk/base/hoifong/` 49 | ) 50 | .setUrl('https://github.com/MarvNC/wordshk-yomitan') 51 | .setDescription( 52 | `Converted from the free Words.hk dictionary found at https://words.hk/. 53 | This export contains ${dictionaryEntries.length} entries. 54 | Converted using https://github.com/MarvNC/yomichan-dict-builder` 55 | ) 56 | .setTitle(`Words.hk 粵典 [${dateString}]`) 57 | .setRevision(dateString) 58 | .setIsUpdatable(true) 59 | .setIndexUrl( 60 | `https://github.com/MarvNC/wordshk-yomitan/releases/latest/download/${TERM_INDEX_FILE}` 61 | ) 62 | .setDownloadUrl( 63 | `https://github.com/MarvNC/wordshk-yomitan/releases/${tagName}/download/${termDictionaryFileName}` 64 | ); 65 | await dictionary.setIndex(dictionaryIndex.build()); 66 | 67 | // save index file to exportDirectory 68 | await dictionaryIndex.export(exportDirectory, TERM_INDEX_FILE); 69 | 70 | for (const entry of dictionaryEntries) { 71 | const terms = convertEntryToYomitanTerms(entry); 72 | for (const term of terms) { 73 | await dictionary.addTerm(term); 74 | } 75 | } 76 | console.log(`Finished adding entries to dictionary.`); 77 | 78 | await addYomitanTags(dictionary, uniqueLabels); 79 | 80 | console.log(`Adding images to dictionary.`); 81 | // Wait for images to be compressed before adding 82 | await compressImagesPromise; 83 | await addYomitanImages(dictionary, COMPRESSED_IMAGES_FOLDER); 84 | 85 | await dictionary.export(exportDirectory); 86 | console.log(`Exported dictionary to ${exportDirectory}.`); 87 | })(); 88 | -------------------------------------------------------------------------------- /src/downloadLatest.js: -------------------------------------------------------------------------------- 1 | import { JSDOM } from 'jsdom'; 2 | import fs from 'fs'; 3 | import path from 'path'; 4 | import zlib from 'zlib'; 5 | import axios from 'axios'; 6 | 7 | const domain = 'https://words.hk'; 8 | const requestURL = `${domain}/faiman/request_data/`; 9 | const csvDir = 'csvs'; 10 | 11 | (async function downloadLatest() { 12 | const dom = await JSDOM.fromURL(requestURL); 13 | const { document } = dom.window; 14 | const csrfTokenInput = document.querySelector( 15 | 'input[name=csrfmiddlewaretoken]' 16 | ); 17 | if (!csrfTokenInput) { 18 | throw new Error('No csrf token found'); 19 | } 20 | const csrfToken = /** @type{HTMLInputElement} */ (csrfTokenInput).value; 21 | const myHeaders = new Headers(); 22 | myHeaders.append('Cookie', `csrftoken=${csrfToken}`); 23 | myHeaders.append('Origin', domain); 24 | myHeaders.append('Referer', requestURL); 25 | const urlencoded = new URLSearchParams(); 26 | urlencoded.append('csrfmiddlewaretoken', csrfToken); 27 | 28 | /** 29 | * @type {RequestInit} 30 | */ 31 | const requestOptions = { 32 | method: 'POST', 33 | headers: myHeaders, 34 | body: urlencoded, 35 | redirect: 'follow', 36 | }; 37 | 38 | const response = await fetch(requestURL, requestOptions); 39 | const text = await response.text(); 40 | if (!response.ok) { 41 | throw new Error(`Response: ${response.status} ${response.statusText}`); 42 | } 43 | console.log('Request success, getting csv links...'); 44 | const csvLinks = await getCSVLinks(new JSDOM(text)); 45 | 46 | await downloadCSVs(csvLinks); 47 | console.log('Download complete.'); 48 | })(); 49 | 50 | /** 51 | * 52 | * @param {JSDOM} dom 53 | * @returns {Promise} The URLs of the CSVs 54 | */ 55 | async function getCSVLinks(dom) { 56 | const { document } = dom.window; 57 | 58 | const csvLinkAnchors = /** @type {HTMLAnchorElement[]} */ ([ 59 | ...document.querySelectorAll("a[href$='.csv.gz']"), 60 | ]); 61 | if (csvLinkAnchors.length !== 2) { 62 | throw new Error('Expected 2 csv links'); 63 | } 64 | 65 | console.log('Found two csv links.'); 66 | 67 | const csvLinks = csvLinkAnchors.map((a) => `${domain}${a.href}`); 68 | 69 | return csvLinks; 70 | } 71 | 72 | /** 73 | * Download the CSVs from the given URLs 74 | * @param {string[]} csvLinks 75 | */ 76 | async function downloadCSVs(csvLinks) { 77 | // Create the directory if it doesn't exist 78 | if (!fs.existsSync(csvDir)) { 79 | fs.mkdirSync(csvDir); 80 | } 81 | 82 | // Delete contents of the directory 83 | fs.readdirSync(csvDir).forEach((file) => { 84 | fs.unlinkSync(path.join(csvDir, file)); 85 | }); 86 | 87 | // Process each URL 88 | for (const url of csvLinks) { 89 | // Extract filename from URL 90 | const filename = path.basename(url); 91 | 92 | const fullPath = path.join(csvDir, filename); 93 | 94 | console.log(`Downloading ${filename} from ${url}...`); 95 | 96 | // Download the file from the URL to csvs directory 97 | const response = await axios.get(url, { 98 | responseType: 'arraybuffer', 99 | }); 100 | const buffer = Buffer.from(response.data); 101 | 102 | fs.writeFileSync(fullPath, buffer); 103 | 104 | // Unzip the downloaded file 105 | console.log(`Unzipping ${filename}...`); 106 | const gzip = zlib.createGunzip(); 107 | const source = fs.createReadStream(fullPath); 108 | const destination = fs.createWriteStream( 109 | path.join(csvDir, filename.replace('.gz', '')) 110 | ); 111 | source 112 | .pipe(gzip) 113 | .pipe(destination) 114 | .on('finish', function () { 115 | // Delete the .gz file 116 | fs.unlinkSync(fullPath); 117 | }); 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /src/test/parseCantoneseReadings.test.js: -------------------------------------------------------------------------------- 1 | import test from 'ava'; 2 | 3 | import { parseCantoneseReadings } from '../util/textHandling/parseCantoneseReadings.js'; 4 | 5 | /** 6 | * @typedef {Object} TestCase 7 | * @property {string} text 8 | * @property {string} reading 9 | * @property {TextReadingPair[]} [expected] 10 | * @property {boolean} [shouldThrow] 11 | */ 12 | 13 | /** 14 | * @type {TestCase[]} 15 | */ 16 | const testCases = [ 17 | { 18 | text: '福州', 19 | reading: 'fuk1 zau1', 20 | expected: [ 21 | { text: '福', reading: 'fuk1' }, 22 | { text: '州', reading: 'zau1' }, 23 | ], 24 | }, 25 | { 26 | text: 'bu你阿麼', 27 | reading: 'bu4 ni5 aa3 mo1', 28 | expected: [ 29 | { text: 'bu', reading: 'bu4' }, 30 | { text: '你', reading: 'ni5' }, 31 | { text: '阿', reading: 'aa3' }, 32 | { text: '麼', reading: 'mo1' }, 33 | ], 34 | }, 35 | { 36 | text: '你get唔get到我講咩?', 37 | reading: 'nei5 get1 m4 get1 dou2 ngo5 gong2 me1?', 38 | expected: [ 39 | { text: '你', reading: 'nei5' }, 40 | { text: 'get', reading: 'get1' }, 41 | { text: '唔', reading: 'm4' }, 42 | { text: 'get', reading: 'get1' }, 43 | { text: '到', reading: 'dou2' }, 44 | { text: '我', reading: 'ngo5' }, 45 | { text: '講', reading: 'gong2' }, 46 | { text: '咩', reading: 'me1' }, 47 | { text: '?', reading: '' }, 48 | ], 49 | }, 50 | { 51 | text: '專業運動員成日斷韌帶。', 52 | reading: 'zyun1 jip6 wan6 dung6 jyun4 seng4 jat6 tyun5 jan6 daai2.', 53 | expected: [ 54 | { text: '專', reading: 'zyun1' }, 55 | { text: '業', reading: 'jip6' }, 56 | { text: '運', reading: 'wan6' }, 57 | { text: '動', reading: 'dung6' }, 58 | { text: '員', reading: 'jyun4' }, 59 | { text: '成', reading: 'seng4' }, 60 | { text: '日', reading: 'jat6' }, 61 | { text: '斷', reading: 'tyun5' }, 62 | { text: '韌', reading: 'jan6' }, 63 | { text: '帶', reading: 'daai2' }, 64 | { text: '。', reading: '' }, 65 | ], 66 | }, 67 | { 68 | text: '佢考咗車牌六年,終於成功嘞。', 69 | reading: 'keoi5 haau2 zo2 ce1 paai4 luk6 nin4 zung1 jyu1 sing4 gung1 laak3', 70 | expected: [ 71 | { text: '佢', reading: 'keoi5' }, 72 | { text: '考', reading: 'haau2' }, 73 | { text: '咗', reading: 'zo2' }, 74 | { text: '車', reading: 'ce1' }, 75 | { text: '牌', reading: 'paai4' }, 76 | { text: '六', reading: 'luk6' }, 77 | { text: '年', reading: 'nin4' }, 78 | { text: ',', reading: '' }, 79 | { text: '終', reading: 'zung1' }, 80 | { text: '於', reading: 'jyu1' }, 81 | { text: '成', reading: 'sing4' }, 82 | { text: '功', reading: 'gung1' }, 83 | { text: '嘞', reading: 'laak3' }, 84 | { text: '。', reading: '' }, 85 | ], 86 | }, 87 | { 88 | text: '嗰個男仔喺我手臂上搣咗一下。', 89 | reading: 'go2 go3 naam4 zai2 hai2 ngo5 sau2 bei3 soeng6 mit1 zo2 jat1 haa5', 90 | expected: [ 91 | { text: '嗰', reading: 'go2' }, 92 | { text: '個', reading: 'go3' }, 93 | { text: '男', reading: 'naam4' }, 94 | { text: '仔', reading: 'zai2' }, 95 | { text: '喺', reading: 'hai2' }, 96 | { text: '我', reading: 'ngo5' }, 97 | { text: '手', reading: 'sau2' }, 98 | { text: '臂', reading: 'bei3' }, 99 | { text: '上', reading: 'soeng6' }, 100 | { text: '搣', reading: 'mit1' }, 101 | { text: '咗', reading: 'zo2' }, 102 | { text: '一', reading: 'jat1' }, 103 | { text: '下', reading: 'haa5' }, 104 | { text: '。', reading: '' }, 105 | ], 106 | }, 107 | { 108 | text: '「乜乜M」嗰啲巴士,一定經地鐵站㗎。', 109 | reading: 110 | 'mat1 mat1 em1 go2 di1 baa1 si2, jat1 ding6 ging1 dei6 tit3 zaam6 gaa3.', 111 | expected: [ 112 | { text: '「', reading: '' }, 113 | { text: '乜', reading: 'mat1' }, 114 | { text: '乜', reading: 'mat1' }, 115 | { text: 'M', reading: 'em1' }, 116 | { text: '」', reading: '' }, 117 | { text: '嗰', reading: 'go2' }, 118 | { text: '啲', reading: 'di1' }, 119 | { text: '巴', reading: 'baa1' }, 120 | { text: '士', reading: 'si2' }, 121 | { text: ',', reading: '' }, 122 | { text: '一', reading: 'jat1' }, 123 | { text: '定', reading: 'ding6' }, 124 | { text: '經', reading: 'ging1' }, 125 | { text: '地', reading: 'dei6' }, 126 | { text: '鐵', reading: 'tit3' }, 127 | { text: '站', reading: 'zaam6' }, 128 | { text: '㗎', reading: 'gaa3' }, 129 | { text: '。', reading: '' }, 130 | ], 131 | }, 132 | { 133 | text: '𨂾過條溪', 134 | reading: 'laam3 gwo3 tiu4 kai1', 135 | expected: [ 136 | { text: '𨂾', reading: 'laam3' }, 137 | { text: '過', reading: 'gwo3' }, 138 | { text: '條', reading: 'tiu4' }, 139 | { text: '溪', reading: 'kai1' }, 140 | ], 141 | }, 142 | { 143 | text: '呢個商場係好多居民返屋企嘅必經之路,有好有唔好囉。', 144 | reading: 145 | 'nei1 go3 soeng1 coeng4 hai6 hou2 do1 geoi1 man4 faan1 uk1 kei2 ge3 bit1ging1 zi1 lou6, jau5 hou2 jau5 m4 hou2 lo1.', 146 | expected: [ 147 | { text: '呢', reading: 'nei1' }, 148 | { text: '個', reading: 'go3' }, 149 | { text: '商', reading: 'soeng1' }, 150 | { text: '場', reading: 'coeng4' }, 151 | { text: '係', reading: 'hai6' }, 152 | { text: '好', reading: 'hou2' }, 153 | { text: '多', reading: 'do1' }, 154 | { text: '居', reading: 'geoi1' }, 155 | { text: '民', reading: 'man4' }, 156 | { text: '返', reading: 'faan1' }, 157 | { text: '屋', reading: 'uk1' }, 158 | { text: '企', reading: 'kei2' }, 159 | { text: '嘅', reading: 'ge3' }, 160 | { text: '必', reading: 'bit1' }, 161 | { text: '經', reading: 'ging1' }, 162 | { text: '之', reading: 'zi1' }, 163 | { text: '路', reading: 'lou6' }, 164 | { text: ',', reading: '' }, 165 | { text: '有', reading: 'jau5' }, 166 | { text: '好', reading: 'hou2' }, 167 | { text: '有', reading: 'jau5' }, 168 | { text: '唔', reading: 'm4' }, 169 | { text: '好', reading: 'hou2' }, 170 | { text: '囉', reading: 'lo1' }, 171 | { text: '。', reading: '' }, 172 | ], 173 | }, 174 | { 175 | text: '今晚演出嘅粵劇劇目係《白兔會》。', 176 | reading: 177 | 'gam1 maan5 jin2 ceot1 ge3 jyut6 kek6 kek6 muk6 hai6 baak6 tou3 wui6.', 178 | expected: [ 179 | { text: '今', reading: 'gam1' }, 180 | { text: '晚', reading: 'maan5' }, 181 | { text: '演', reading: 'jin2' }, 182 | { text: '出', reading: 'ceot1' }, 183 | { text: '嘅', reading: 'ge3' }, 184 | { text: '粵', reading: 'jyut6' }, 185 | { text: '劇', reading: 'kek6' }, 186 | { text: '劇', reading: 'kek6' }, 187 | { text: '目', reading: 'muk6' }, 188 | { text: '係', reading: 'hai6' }, 189 | { text: '《', reading: '' }, 190 | { text: '白', reading: 'baak6' }, 191 | { text: '兔', reading: 'tou3' }, 192 | { text: '會', reading: 'wui6' }, 193 | { text: '》', reading: '' }, 194 | { text: '。', reading: '' }, 195 | ], 196 | }, 197 | { 198 | text: 'Panda好pandai踢呢', 199 | reading: 'pan3 daa1 hou2 baan3 naai1 tek3 le3', 200 | shouldThrow: true, 201 | }, 202 | ]; 203 | 204 | for (const { text, reading, expected, shouldThrow } of testCases) { 205 | test(`${ 206 | shouldThrow ? ' (should throw)' : '' 207 | }parseCantoneseReadings: ${text} ${reading}`, (t) => { 208 | if (shouldThrow) { 209 | t.throws(() => parseCantoneseReadings(text, reading)); 210 | return; 211 | } else { 212 | const result = parseCantoneseReadings(text, reading); 213 | t.deepEqual(result, expected); 214 | } 215 | }); 216 | } 217 | -------------------------------------------------------------------------------- /src/test/parseEntry.test.js: -------------------------------------------------------------------------------- 1 | import test from 'ava'; 2 | import path from 'path'; 3 | 4 | import { parseCSVEntries } from '../util/csv/parseCsvEntriesToJson.js'; 5 | 6 | const testCsvFile = 'src/test/testdata.csv'; 7 | 8 | const expectedEntries = [ 9 | { 10 | id: 101613, 11 | headwords: [ 12 | { 13 | text: '大電', 14 | readings: ['daai6 din6'], 15 | }, 16 | ], 17 | tags: [ 18 | { 19 | name: 'pos', 20 | value: '名詞', 21 | }, 22 | ], 23 | senses: [ 24 | { 25 | explanation: { 26 | yue: ['D電池(量詞:粒)'], 27 | eng: ['D cells battery'], 28 | }, 29 | egs: [], 30 | }, 31 | ], 32 | }, 33 | { 34 | id: 92456, 35 | headwords: [ 36 | { 37 | text: '發電廠', 38 | readings: ['faat3 din6 cong2'], 39 | }, 40 | ], 41 | tags: [ 42 | { 43 | name: 'pos', 44 | value: '名詞', 45 | }, 46 | ], 47 | senses: [ 48 | { 49 | explanation: { 50 | yue: ['產生#電力 嘅大型#建築物(量詞:間/座)'], 51 | eng: ['power plant'], 52 | }, 53 | egs: [], 54 | }, 55 | ], 56 | }, 57 | { 58 | id: 82131, 59 | headwords: [ 60 | { 61 | text: '排污', 62 | readings: ['paai4 wu1'], 63 | }, 64 | ], 65 | tags: [ 66 | { 67 | name: 'pos', 68 | value: '動詞', 69 | }, 70 | ], 71 | senses: [ 72 | { 73 | explanation: { 74 | yue: ['排走#污水'], 75 | eng: ['to drain away sewage'], 76 | }, 77 | egs: [ 78 | { 79 | yue: ['排污費 (paai4 wu1)'], 80 | eng: ['sewerage charge'], 81 | }, 82 | { 83 | yue: ['排污系統 (paai4 wu1 hai6 tung2)'], 84 | eng: ['sewage system'], 85 | }, 86 | { 87 | yue: ['排污設施 (paai4 wu1 cit3 si1)'], 88 | eng: ['sewage works'], 89 | }, 90 | { 91 | yue: ['公共排污服務 (gung1 gung6 paai4 wu1 fuk6 mou6)'], 92 | eng: ['public sewage services'], 93 | }, 94 | { 95 | yue: [ 96 | '排污設備改善計劃 (paai4 wu1 cit3 bei6 goi2 sin6 gai3 waak6)', 97 | ], 98 | eng: ['sewerage improvement programme'], 99 | }, 100 | { 101 | yue: [ 102 | '呢啲市區河道嘅設計以防洪及有效排污為主。 (ni1 di1 si5 keoi1 ho4 dou6 ge3 cit3 gai3 ji5 fong4 hung4 kap6 jau5 haau6 paai4 wu1 wai4 zyu2.)', 103 | ], 104 | eng: [ 105 | 'These urban channels were designed for flood prevention and effective drainage.', 106 | ], 107 | }, 108 | ], 109 | }, 110 | ], 111 | }, 112 | { 113 | id: 72252, 114 | headwords: [ 115 | { 116 | text: '揀選', 117 | readings: ['gaan2 syun2'], 118 | }, 119 | ], 120 | tags: [ 121 | { 122 | name: 'pos', 123 | value: '動詞', 124 | }, 125 | { 126 | name: 'sim', 127 | value: '挑選', 128 | }, 129 | { 130 | name: 'sim', 131 | value: '揀', 132 | }, 133 | { 134 | name: 'sim', 135 | value: '選', 136 | }, 137 | { 138 | name: 'sim', 139 | value: '選擇', 140 | }, 141 | ], 142 | senses: [ 143 | { 144 | explanation: { 145 | yue: ['根據你嘅取向,喺兩樣嘢或以上當中,抽取一樣'], 146 | eng: ['to select; to choose'], 147 | }, 148 | egs: [ 149 | { 150 | yue: [ 151 | '一個蠢,一個鈍,噉樣邊叫有得揀選? (jat1 go3 ceon2, jat1 go3 deon6, gam2 joeng2 bin1 giu3 jau5 dak1 gaan2 syun2?)', 152 | ], 153 | eng: [ 154 | 'This candidate is stupid and that is dumb. How can I choose among them?', 155 | ], 156 | }, 157 | ], 158 | }, 159 | ], 160 | }, 161 | { 162 | id: 66987, 163 | headwords: [ 164 | { 165 | text: '背景', 166 | readings: ['bui3 ging2'], 167 | }, 168 | ], 169 | tags: [ 170 | { 171 | name: 'pos', 172 | value: '名詞', 173 | }, 174 | ], 175 | senses: [ 176 | { 177 | explanation: { 178 | yue: ['喺舞台或者現實襯托主體嘅景物、佈景、環境'], 179 | eng: ['background; setting'], 180 | }, 181 | egs: [ 182 | { 183 | yue: [ 184 | '呢張相嘅背景係一啲椰樹。 (ni1 zoeng1 soeng2 ge3 bui3 ging2 hai6 jat1 di1 je4 syu6.)', 185 | ], 186 | eng: ['The coconut trees form a background to this picture.'], 187 | }, 188 | { 189 | yue: [ 190 | '段片嘅背景音樂叫咩名? (dyun6 pin2 ge3 bui3 ging2 jam1 ngok6 giu3 me1 meng2?)', 191 | ], 192 | eng: ['What is the title of the background music in the video?'], 193 | }, 194 | ], 195 | }, 196 | { 197 | explanation: { 198 | yue: [ 199 | '人嘅來歷或經歷,例如家庭、教育、工作等等,亦可以指佢哋所倚靠嘅人物或者勢力', 200 | ], 201 | eng: [ 202 | 'the "background" of a person, especially their educational background, occupation, social/family connections, etc.', 203 | ], 204 | }, 205 | egs: [ 206 | { 207 | yue: [ 208 | '不如揾人查下佢個背景,我覺得佢好有可疑。 (bat1 jyu4 wan2 jan4 caa4 haa5 keoi5 go3 bui3 ging2, ngo5 gok3 dak1 keoi5 hou2 jau5 ho2 ji4.)', 209 | ], 210 | eng: [ 211 | 'Shall we find someone to look into his background? I think he is so suspicious.', 212 | ], 213 | }, 214 | ], 215 | }, 216 | ], 217 | }, 218 | { 219 | id: 90185, 220 | headwords: [ 221 | { 222 | text: '天干地支', 223 | readings: ['tin1 gon1 dei6 zi1'], 224 | }, 225 | ], 226 | tags: [ 227 | { 228 | name: 'pos', 229 | value: '名詞', 230 | }, 231 | { 232 | name: 'sim', 233 | value: '干支', 234 | }, 235 | ], 236 | senses: [ 237 | { 238 | explanation: { 239 | yue: [ 240 | '「#天干」同「#地支」嘅合稱。十天干分別係「#甲#乙#丙#丁#戊#己#庚#辛#壬#癸」。 十二地支係:「#子#丑#寅#卯#辰#巳#午#未#申#酉#戌#亥」。 天干同地支組合就成為以「#甲子」為首嘅六十干支循環。\n\n干支循環通常用嚟計年份。天干亦可以獨立用嚟順序將物件命名,第一個叫「甲」、第二個叫「乙」,如此類推。用法類似西方嘅「A, B, C」 或 「α, β, γ」。中國傳統紀時間嘅方法係將一日分成十二個時辰,每一個時辰由一個地支表示,「子時」係半夜 (11pm 至 1am),如此類推。', 241 | ], 242 | eng: [ 243 | 'Literally "Heavenly Stems and Earthly Branches". It is a traditional Chinese system of counting. Heavenly Stems and Earthly Branches are collectively known as "Stem-Branch".\n\nThe 10 Heavenly Stems are 甲(gaap3) 乙(jyut6) 丙(bing2) 丁(ding1) 戊(mou6) 己(gei2) 庚(gang1) 辛(san1) 壬(jam4) 癸(gwai3).\n\nThe 12 Earthly Branches are 子(zi2) 丑(cau2) 寅(jan4) 卯(maau5) 辰(san4) 巳(zi6) 午(ng5) 未(mei6) 申(san1) 酉(jau5) 戌(seot1) 亥(hoi6). Each Heavenly Stem is paired with an Earthly Branch to form the "stem-branch" sexagenary (i.e. 60 element) cycle that starts with 甲子 (gaap3 zi2)\n\nThe sexagenary cycle is often used for counting years in the Chinese calendar. Heavenly Stems are also used independently to name things in a particular order -- the first is labeled "gaap3", the second "jyut6", the third "bing2", and so on. It is similar to how "A, B, C" and "α, β, γ" are used in western cultures. Earthly Branches are also traditionally used to denote time. One day is divided into twelve slots called Chinese-hours (#時辰), starting from 子時 (zi2 si4), which is 11pm to 1am.', 244 | ], 245 | }, 246 | egs: [ 247 | { 248 | yue: ['乙等 / 乙級 (jyut6 dang2 / jyut6 kap1)'], 249 | eng: ['B grade'], 250 | }, 251 | { 252 | yue: ['甲級戰犯 (gaap3 kap1 zin3 faan2)'], 253 | eng: ['Class A war criminal'], 254 | }, 255 | { 256 | yue: ['戊戌變法 (mou6 seot1 bin3 faat3)'], 257 | eng: [ 258 | "The Hundred Days' Reform of the Qing Dynasty (it is called 戊戌變法 because it occurred in the 戊戌 year)", 259 | ], 260 | }, 261 | { 262 | yue: ['辛亥革命 (san1 hoi6 gaap3 ming6)'], 263 | eng: ['The Xinhai Revolution (Pinyin romanization)'], 264 | }, 265 | { 266 | yue: ['子時 (zi2 si4)'], 267 | eng: ['midnight'], 268 | }, 269 | { 270 | yue: ['午時 (ng5 si4)'], 271 | eng: ['noon'], 272 | }, 273 | ], 274 | }, 275 | ], 276 | }, 277 | { 278 | id: 97033, 279 | headwords: [ 280 | { 281 | text: '着', 282 | readings: ['zoek6'], 283 | }, 284 | { 285 | text: '著', 286 | readings: ['zoek6'], 287 | }, 288 | ], 289 | tags: [ 290 | { 291 | name: 'pos', 292 | value: '詞綴', 293 | }, 294 | { 295 | name: 'label', 296 | value: '書面語', 297 | }, 298 | ], 299 | senses: [ 300 | { 301 | explanation: { 302 | yue: ['表示動作、狀態進行緊、持續緊,類似「#住」、「#下」'], 303 | eng: [ 304 | "to express that an action is in process and a state is prolonged; similar to '#住' zyu6 or '#下' haa5", 305 | ], 306 | }, 307 | egs: [ 308 | { 309 | zho: ['痛並快樂着 (tung3 bing6 faai3 lok6 zoek6)'], 310 | yue: ['痛住開心 (tung3 zyu6 hoi1 sam1)'], 311 | eng: ['feeling painful and happy'], 312 | }, 313 | { 314 | zho: [ 315 | '走着走着就到了課室。 (zau2 zoek6 zau2 zoek6 zau6 dou3 liu5 fo3 sat1.)', 316 | ], 317 | yue: [ 318 | '行下行下就到咗班房。 (haang4 haa5 haang4 haa5 zau6 dou3 zo2 baan1 fong2.)', 319 | ], 320 | eng: ['Walking, (we) have arrived at the classroom.'], 321 | }, 322 | { 323 | zho: ['他們正説着話呢。 (taa1 mun4 zing3 syut3 zoek6 waa6 ne1.)'], 324 | yue: ['佢哋講緊嘢啊。 (keoi5 dei6 gong2 gan2 je5 aa3.)'], 325 | eng: ['They are talking.'], 326 | }, 327 | { 328 | zho: ['等着瞧。 (dang2 zoek6 ciu4.)'], 329 | yue: ['睇下點。 (tai2 haa5 dim2.)'], 330 | eng: ["(Let's) wait and see."], 331 | }, 332 | ], 333 | }, 334 | { 335 | explanation: { 336 | yue: ['動詞後綴,表示動作達到目的、有結果;類似「#到」(dou2)'], 337 | eng: [ 338 | 'verbal suffix to mean that the aim of an action has been achieved or its result has come out; similar to #到 dou2', 339 | ], 340 | }, 341 | egs: [ 342 | { 343 | zho: ['你的錶我沒見着。 (nei5 dik1 biu1 ngo5 mut6 gin3 zoek6.)'], 344 | yue: ['你隻錶我見唔到。 (nei5 zek3 biu1 ngo5 gin3 m4 dou2.)'], 345 | eng: ['I have not found your watch.'], 346 | }, 347 | ], 348 | }, 349 | { 350 | explanation: { 351 | yue: ['喺句尾出現,表示祈使'], 352 | eng: ['used at the end of a sentence to form an imperative'], 353 | }, 354 | egs: [ 355 | { 356 | zho: ['聽着。 (ting3 zoek6.)'], 357 | yue: ['聽住。 (teng1 zyu6.)'], 358 | eng: ['Listen.'], 359 | }, 360 | { 361 | zho: [ 362 | '你可好生給我應付着。 (nei5 ho2 hou2 sang1 kap1 ngo5 jing3 fu6 zoek6.)', 363 | ], 364 | yue: [ 365 | '你好好哋同我應付下。 (nei5 hou2 hou2 dei2 tung4 ngo5 jing3 fu6 haa5.)', 366 | '你小心啲同我應付下。 (nei5 siu2 sam1 di1 tung4 ngo5 jing3 fu6 haa5.)', 367 | ], 368 | eng: ['Handle this well (for me).'], 369 | }, 370 | ], 371 | }, 372 | ], 373 | }, 374 | { 375 | id: 93305, 376 | headwords: [ 377 | { 378 | text: '揸正嚟做', 379 | readings: ['zaa1 zeng3 lai4 zou6', 'zaa1 zeng3 lei4 zou6'], 380 | }, 381 | ], 382 | tags: [ 383 | { 384 | name: 'pos', 385 | value: '動詞', 386 | }, 387 | { 388 | name: 'sim', 389 | value: '揸正', 390 | }, 391 | ], 392 | senses: [ 393 | { 394 | explanation: { 395 | yue: ['嚴格依照規矩,不留餘地,冇人情講'], 396 | eng: [ 397 | 'to follow the rules strictly; to "go by the book"; to leave no room for discretion', 398 | ], 399 | }, 400 | egs: [ 401 | { 402 | yue: [ 403 | '唔好怪我揸正嚟做。 (m4 hou2 gwaai3 ngo5 zaa1 zeng3 lei4 zou6.)', 404 | ], 405 | eng: ["Don't blame me for following the rules too strictly."], 406 | }, 407 | ], 408 | }, 409 | ], 410 | }, 411 | { 412 | id: 96792, 413 | headwords: [ 414 | { 415 | text: '牛河博士', 416 | readings: ['ngau4 ho2 bok3 si6'], 417 | }, 418 | ], 419 | tags: [ 420 | { 421 | name: 'pos', 422 | value: '名詞', 423 | }, 424 | { 425 | name: 'label', 426 | value: '專名', 427 | }, 428 | { 429 | name: 'label', 430 | value: '潮語', 431 | }, 432 | { 433 | name: 'ref', 434 | value: 'https://evchk.fandom.com/zh/wiki/曹宏威', 435 | }, 436 | ], 437 | senses: [ 438 | { 439 | explanation: { 440 | yue: [ 441 | '香港#學者 曹宏威喺#網民 之間嘅叫法,佢因為#乾炒牛河 而一舉成名', 442 | ], 443 | eng: ['Wung-wai Tso, literally "Doctor Beef Chow-fun"'], 444 | }, 445 | egs: [], 446 | }, 447 | ], 448 | }, 449 | ]; 450 | 451 | /** 452 | * @type {DictionaryEntry[]} 453 | */ 454 | let entries; 455 | 456 | test.before(async () => { 457 | entries = await parseCSVEntries(testCsvFile); 458 | }); 459 | 460 | test('CSV successfully parsed', (t) => { 461 | t.not(entries, undefined); 462 | }); 463 | 464 | for (const expectedEntry of expectedEntries) { 465 | const id = expectedEntry.id; 466 | test(`Entry ${id}`, (t) => { 467 | const entry = entries.find((entry) => entry.id === Number(id)); 468 | t.deepEqual(entry, expectedEntry); 469 | }); 470 | } 471 | -------------------------------------------------------------------------------- /src/test/testdata.csv: -------------------------------------------------------------------------------- 1 | ,,"" 2 | "" 3 | 101613,大電:daai6 din6,"(pos:名詞) 4 | yue:D電池(量詞:粒) 5 | eng:D cells battery",,OK,未公開 6 | 92456,發電廠:faat3 din6 cong2,"(pos:名詞) 7 | yue:產生#電力 嘅大型#建築物(量詞:間/座) 8 | eng:power plant",,OK,已公開 9 | 82131,排污:paai4 wu1,"(pos:動詞) 10 | 11 | yue:排走#污水 12 | eng:to drain away sewage 13 | 14 | yue:排污費 (paai4 wu1) 15 | eng:sewerage charge 16 | 17 | yue:排污系統 (paai4 wu1 hai6 tung2) 18 | eng:sewage system 19 | 20 | yue:排污設施 (paai4 wu1 cit3 si1) 21 | eng:sewage works 22 | 23 | yue:公共排污服務 (gung1 gung6 paai4 wu1 fuk6 mou6) 24 | eng:public sewage services 25 | 26 | yue:排污設備改善計劃 (paai4 wu1 cit3 bei6 goi2 sin6 gai3 waak6) 27 | eng:sewerage improvement programme 28 | 29 | yue:呢啲市區河道嘅設計以防洪及有效排污為主。 (ni1 di1 si5 keoi1 ho4 dou6 ge3 cit3 gai3 ji5 fong4 hung4 kap6 jau5 haau6 paai4 wu1 wai4 zyu2.) 30 | eng:These urban channels were designed for flood prevention and effective drainage.",排汙,OK,已公開 31 | 72252,揀選:gaan2 syun2,"(pos:動詞)(sim:挑選)(sim:揀)(sim:選)(sim:選擇) 32 | 33 | yue:根據你嘅取向,喺兩樣嘢或以上當中,抽取一樣 34 | eng:to select; to choose 35 | 36 | yue:一個蠢,一個鈍,噉樣邊叫有得揀選? (jat1 go3 ceon2, jat1 go3 deon6, gam2 joeng2 bin1 giu3 jau5 dak1 gaan2 syun2?) 37 | eng:This candidate is stupid and that is dumb. How can I choose among them?",㨂選,OK,已公開 38 | 66987,背景:bui3 ging2,"(pos:名詞) 39 | 40 | yue:喺舞台或者現實襯托主體嘅景物、佈景、環境 41 | eng:background; setting 42 | 43 | yue:呢張相嘅背景係一啲椰樹。 (ni1 zoeng1 soeng2 ge3 bui3 ging2 hai6 jat1 di1 je4 syu6.) 44 | eng:The coconut trees form a background to this picture. 45 | 46 | yue:段片嘅背景音樂叫咩名? (dyun6 pin2 ge3 bui3 ging2 jam1 ngok6 giu3 me1 meng2?) 47 | eng:What is the title of the background music in the video? 48 | ---- 49 | 50 | yue:人嘅來歷或經歷,例如家庭、教育、工作等等,亦可以指佢哋所倚靠嘅人物或者勢力 51 | eng:the ""background"" of a person, especially their educational background, occupation, social/family connections, etc. 52 | 53 | yue:不如揾人查下佢個背景,我覺得佢好有可疑。 (bat1 jyu4 wan2 jan4 caa4 haa5 keoi5 go3 bui3 ging2, ngo5 gok3 dak1 keoi5 hou2 jau5 ho2 ji4.) 54 | eng:Shall we find someone to look into his background? I think he is so suspicious.",,OK,已公開 55 | 90185,天干地支:tin1 gon1 dei6 zi1,"(pos:名詞)(sim:干支) 56 | 57 | yue:「#天干」同「#地支」嘅合稱。十天干分別係「#甲#乙#丙#丁#戊#己#庚#辛#壬#癸」。 十二地支係:「#子#丑#寅#卯#辰#巳#午#未#申#酉#戌#亥」。 天干同地支組合就成為以「#甲子」為首嘅六十干支循環。 58 | 59 | 干支循環通常用嚟計年份。天干亦可以獨立用嚟順序將物件命名,第一個叫「甲」、第二個叫「乙」,如此類推。用法類似西方嘅「A, B, C」 或 「α, β, γ」。中國傳統紀時間嘅方法係將一日分成十二個時辰,每一個時辰由一個地支表示,「子時」係半夜 (11pm 至 1am),如此類推。 60 | eng:Literally ""Heavenly Stems and Earthly Branches"". It is a traditional Chinese system of counting. Heavenly Stems and Earthly Branches are collectively known as ""Stem-Branch"". 61 | 62 | The 10 Heavenly Stems are 甲(gaap3) 乙(jyut6) 丙(bing2) 丁(ding1) 戊(mou6) 己(gei2) 庚(gang1) 辛(san1) 壬(jam4) 癸(gwai3). 63 | 64 | The 12 Earthly Branches are 子(zi2) 丑(cau2) 寅(jan4) 卯(maau5) 辰(san4) 巳(zi6) 午(ng5) 未(mei6) 申(san1) 酉(jau5) 戌(seot1) 亥(hoi6). Each Heavenly Stem is paired with an Earthly Branch to form the ""stem-branch"" sexagenary (i.e. 60 element) cycle that starts with 甲子 (gaap3 zi2) 65 | 66 | The sexagenary cycle is often used for counting years in the Chinese calendar. Heavenly Stems are also used independently to name things in a particular order -- the first is labeled ""gaap3"", the second ""jyut6"", the third ""bing2"", and so on. It is similar to how ""A, B, C"" and ""α, β, γ"" are used in western cultures. Earthly Branches are also traditionally used to denote time. One day is divided into twelve slots called Chinese-hours (#時辰), starting from 子時 (zi2 si4), which is 11pm to 1am. 67 | 68 | yue:乙等 / 乙級 (jyut6 dang2 / jyut6 kap1) 69 | eng:B grade 70 | 71 | yue:甲級戰犯 (gaap3 kap1 zin3 faan2) 72 | eng:Class A war criminal 73 | 74 | yue:戊戌變法 (mou6 seot1 bin3 faat3) 75 | eng:The Hundred Days' Reform of the Qing Dynasty (it is called 戊戌變法 because it occurred in the 戊戌 year) 76 | 77 | yue:辛亥革命 (san1 hoi6 gaap3 ming6) 78 | eng:The Xinhai Revolution (Pinyin romanization) 79 | 80 | yue:子時 (zi2 si4) 81 | eng:midnight 82 | 83 | yue:午時 (ng5 si4) 84 | eng:noon",,OK,已公開 85 | 97033,"着:zoek6,著:zoek6","(pos:詞綴)(label:書面語) 86 | 87 | yue:表示動作、狀態進行緊、持續緊,類似「#住」、「#下」 88 | eng:to express that an action is in process and a state is prolonged; similar to '#住' zyu6 or '#下' haa5 89 | 90 | zho:痛並快樂着 (tung3 bing6 faai3 lok6 zoek6) 91 | yue:痛住開心 (tung3 zyu6 hoi1 sam1) 92 | eng:feeling painful and happy 93 | 94 | zho:走着走着就到了課室。 (zau2 zoek6 zau2 zoek6 zau6 dou3 liu5 fo3 sat1.) 95 | yue:行下行下就到咗班房。 (haang4 haa5 haang4 haa5 zau6 dou3 zo2 baan1 fong2.) 96 | eng:Walking, (we) have arrived at the classroom. 97 | 98 | zho:他們正説着話呢。 (taa1 mun4 zing3 syut3 zoek6 waa6 ne1.) 99 | yue:佢哋講緊嘢啊。 (keoi5 dei6 gong2 gan2 je5 aa3.) 100 | eng:They are talking. 101 | 102 | zho:等着瞧。 (dang2 zoek6 ciu4.) 103 | yue:睇下點。 (tai2 haa5 dim2.) 104 | eng:(Let's) wait and see. 105 | ---- 106 | 107 | yue:動詞後綴,表示動作達到目的、有結果;類似「#到」(dou2) 108 | eng:verbal suffix to mean that the aim of an action has been achieved or its result has come out; similar to #到 dou2 109 | 110 | zho:你的錶我沒見着。 (nei5 dik1 biu1 ngo5 mut6 gin3 zoek6.) 111 | yue:你隻錶我見唔到。 (nei5 zek3 biu1 ngo5 gin3 m4 dou2.) 112 | eng:I have not found your watch. 113 | ---- 114 | 115 | yue:喺句尾出現,表示祈使 116 | eng:used at the end of a sentence to form an imperative 117 | 118 | zho:聽着。 (ting3 zoek6.) 119 | yue:聽住。 (teng1 zyu6.) 120 | eng:Listen. 121 | 122 | zho:你可好生給我應付着。 (nei5 ho2 hou2 sang1 kap1 ngo5 jing3 fu6 zoek6.) 123 | yue:你好好哋同我應付下。 (nei5 hou2 hou2 dei2 tung4 ngo5 jing3 fu6 haa5.) 124 | yue:你小心啲同我應付下。 (nei5 siu2 sam1 di1 tung4 ngo5 jing3 fu6 haa5.) 125 | eng:Handle this well (for me).",,OK,已公開 126 | 93305,揸正嚟做:zaa1 zeng3 lai4 zou6:zaa1 zeng3 lei4 zou6,"(pos:動詞)(sim:揸正) 127 | 128 | yue:嚴格依照規矩,不留餘地,冇人情講 129 | eng:to follow the rules strictly; to ""go by the book""; to leave no room for discretion 130 | 131 | yue:唔好怪我揸正嚟做。 (m4 hou2 gwaai3 ngo5 zaa1 zeng3 lei4 zou6.) 132 | eng:Don't blame me for following the rules too strictly.",,OK,已公開 133 | 96792,牛河博士:ngau4 ho2 bok3 si6,"(pos:名詞)(label:專名)(label:潮語)(ref:https://evchk.fandom.com/zh/wiki/曹宏威) 134 | yue:香港#學者 曹宏威喺#網民 之間嘅叫法,佢因為#乾炒牛河 而一舉成名 135 | eng:Wung-wai Tso, literally ""Doctor Beef Chow-fun""",,OK,未公開 -------------------------------------------------------------------------------- /src/types.d.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Represents a CSV record. 3 | */ 4 | type CsvRecord = { 5 | id: string; // The unique identifier for the record. 6 | headword: string; // The main word or expression in the entry. 7 | entry: string; // The full text of the dictionary entry. 8 | variants: string; // The different forms or spellings of the headword. 9 | warning: string; // Any warnings related to the entry. 10 | public: string; // Whether the entry is public or not. 11 | }; 12 | 13 | type LanguageArray = [ 14 | 'yue', 15 | 'eng', 16 | 'zho', 17 | 'jpn', 18 | 'kor', 19 | 'vie', 20 | 'lzh', 21 | 'por', 22 | 'deu', 23 | 'fra', 24 | 'mnc', 25 | 'lat', 26 | 'tib', 27 | '量詞' 28 | ]; 29 | 30 | type Language = LanguageArray[number]; 31 | 32 | type TextReadingPair = { 33 | text: string; 34 | reading: string; 35 | }; 36 | 37 | type Headword = { 38 | text: string; 39 | readings: string[]; 40 | }; 41 | 42 | type Tag = { 43 | name: string; 44 | value: string; 45 | }; 46 | 47 | type Sense = { 48 | explanation: LanguageData; 49 | egs: LanguageData[]; 50 | }; 51 | 52 | type LanguageData = { 53 | [key in Language]?: string[]; 54 | }; 55 | 56 | type DictionaryEntry = { 57 | id: number; 58 | headwords: Headword[]; 59 | tags: Tag[]; 60 | senses: Sense[]; 61 | }; 62 | -------------------------------------------------------------------------------- /src/util/addYomitanImages.js: -------------------------------------------------------------------------------- 1 | import { Dictionary } from 'yomichan-dict-builder'; 2 | import fs from 'fs'; 3 | 4 | /** 5 | * 6 | * @param {Dictionary} dictionary 7 | */ 8 | async function addYomitanImages(dictionary, imageFolder) { 9 | const imageFiles = fs.readdirSync(imageFolder); 10 | for (const imageFile of imageFiles) { 11 | const filePath = `${imageFolder}/${imageFile}`; 12 | await dictionary.addFile(filePath, `images/${imageFile}`); 13 | } 14 | } 15 | 16 | export { addYomitanImages }; -------------------------------------------------------------------------------- /src/util/addYomitanTags.js: -------------------------------------------------------------------------------- 1 | /** 2 | * @type {Record} 3 | */ 4 | const tagValueToNote = { 5 | // Parts of speech 6 | 名詞: 'noun', 7 | 動詞: 'verb', 8 | 語句: 'phrase', 9 | 形容詞: 'adjective', 10 | 量詞: 'classifier', 11 | 感嘆詞: 'interjection', 12 | 代詞: 'pronoun', 13 | 助詞: 'particle', 14 | 語素: 'morpheme', 15 | 區別詞: 'distinguishing word', 16 | 副詞: 'adverb', 17 | 擬聲詞: 'onomatopoeia', 18 | 連詞: 'conjunction', 19 | 詞綴: 'affix', 20 | 介詞: 'preposition', 21 | 數詞: 'numeral', 22 | 方位詞: 'locative', 23 | 術語: 'term', 24 | // Labels 25 | 馬來西亞: 'Malaysia', 26 | 粗俗: 'vulgar', 27 | 香港: 'Hong Kong', 28 | 專名: 'proper noun', 29 | 俚語: 'slang', 30 | 潮語: 'trendy expression', 31 | 外來語: 'loanword', 32 | 書面語: 'written language', 33 | 舊式: 'old-fashioned', 34 | 大陸: 'Mainland China', 35 | 文言: 'classical Chinese', 36 | gpt: 'GPT', 37 | 台灣: 'Taiwan', 38 | 爭議: 'controversial', 39 | 黃賭毒: 'vice', 40 | 日本: 'Japan', 41 | 口語: 'colloquial', 42 | 錯字: 'misspelling', 43 | 玩嘢: 'playful', 44 | 民間傳説: 'folklore', 45 | 澳門: 'Macau', 46 | }; 47 | 48 | const categoryToYomitanLabelCategoryMap = { 49 | pos: 'partOfSpeech', 50 | }; 51 | 52 | const categoryToSortingOrder = { 53 | pos: 1, 54 | }; 55 | 56 | /** 57 | * Given a set of unique labels, adds the appropriate tags to the Yomitan dictionary. 58 | * @param {Dictionary} dictionary 59 | * @param {Record>} uniqueLabels 60 | */ 61 | async function addYomitanTags(dictionary, uniqueLabels) { 62 | let tagsAdded = 0; 63 | const noNoteAvailable = new Set(); 64 | for (const [labelName, labelValues] of Object.entries(uniqueLabels)) { 65 | for (const value of labelValues) { 66 | await dictionary.addTag({ 67 | name: value, 68 | category: 69 | categoryToYomitanLabelCategoryMap[labelName] ?? 70 | labelName, 71 | notes: `${value} | ${tagValueToNote[value]}` ?? value, 72 | sortingOrder: categoryToSortingOrder[labelName] ?? 0, 73 | }); 74 | if (!tagValueToNote[value]) { 75 | noNoteAvailable.add(value); 76 | } 77 | tagsAdded++; 78 | } 79 | } 80 | console.log(`Added ${tagsAdded} tags to dictionary.`); 81 | if (noNoteAvailable.size) { 82 | console.warn(`No note available for: ${[...noNoteAvailable].join(', ')}`); 83 | } 84 | } 85 | 86 | export { addYomitanTags }; 87 | -------------------------------------------------------------------------------- /src/util/csv/csvHandler.js: -------------------------------------------------------------------------------- 1 | import fs from 'fs'; 2 | import csv from 'csv-parser'; 3 | const csvHeaders = ['id', 'headword', 'entry', 'variants', 'warning', 'public']; 4 | 5 | /** 6 | * Reads the all- file and returns the parsed entries 7 | * @param {string} allCsvPath 8 | * @returns {Promise} 9 | */ 10 | async function readCSVAsync(allCsvPath) { 11 | return new Promise((resolve, reject) => { 12 | const results = []; 13 | fs.createReadStream(allCsvPath) 14 | .pipe( 15 | csv({ 16 | headers: csvHeaders, 17 | strict: true, 18 | skipLines: 2, 19 | quote: '"', 20 | }) 21 | ) 22 | .on('data', (data) => { 23 | results.push(data); 24 | }) 25 | .on('end', () => { 26 | resolve(results); 27 | }) 28 | .on('error', (error) => { 29 | reject(error); 30 | }); 31 | }); 32 | } 33 | 34 | /** 35 | * Reads the contents of the data folder and returns the name of the all- file and the date of the data 36 | * @param {string} dataFolder 37 | */ 38 | async function getCSVInfo(dataFolder) { 39 | // Get contents of data folder 40 | const files = await fs.promises.readdir(dataFolder); 41 | // Filter out non-csv files 42 | const csvFiles = files.filter((file) => file.endsWith('.csv')); 43 | const allCsv = files.find((file) => file.startsWith('all-')); 44 | if (!allCsv) { 45 | throw new Error('No all- file found'); 46 | } 47 | 48 | const dateEpoch = allCsv.split('-')[1].split('.')[0]; 49 | const date = new Date(Number(dateEpoch) * 1000); 50 | const dateString = date.toISOString().split('T')[0]; 51 | console.log(`Date of data: ${dateString}`); 52 | 53 | return { 54 | allCsv, 55 | dateString, 56 | }; 57 | } 58 | 59 | export { readCSVAsync, getCSVInfo }; 60 | -------------------------------------------------------------------------------- /src/util/csv/parseCsvEntriesToJson.js: -------------------------------------------------------------------------------- 1 | import { readCSVAsync } from './csvHandler.js'; 2 | import { parseEntry } from '../entryParse/parseEntryToJson.js'; 3 | 4 | async function parseCSVEntries(allCsvPath) { 5 | const data = await readCSVAsync(allCsvPath); 6 | console.log(`Read ${data.length} entries from ${allCsvPath}`); 7 | /** 8 | * @type {DictionaryEntry[]} 9 | */ 10 | const dictionaryEntries = []; 11 | let unpublishedCount = 0; 12 | let noDataCount = 0; 13 | let unreviewedCount = 0; 14 | for (const entry of data) { 15 | if (entry.entry === '未有內容 NO DATA') { 16 | noDataCount++; 17 | continue; 18 | } 19 | if ( 20 | entry.warning.includes( 21 | '未經覆核,可能有錯漏 UNREVIEWED ENTRY - MAY CONTAIN ERRORS OR OMISSIONS' 22 | ) 23 | ) { 24 | unreviewedCount++; 25 | } 26 | if (entry.public !== '已公開') { 27 | unpublishedCount++; 28 | } 29 | try { 30 | const parsedEntry = parseEntry(entry); 31 | dictionaryEntries.push(parsedEntry); 32 | } catch (error) { 33 | console.log(`Error parsing entry ${entry.id}: ${error.message}`); 34 | } 35 | } 36 | console.log(`Parsed ${dictionaryEntries.length} entries`); 37 | console.log(`Skipped ${noDataCount} no data entries`); 38 | return dictionaryEntries; 39 | } 40 | 41 | export { parseCSVEntries }; 42 | -------------------------------------------------------------------------------- /src/util/entryParse/findImages.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Given a list of dictionary entries, find all unique image URLs. 3 | * @param {DictionaryEntry[]} dictionaryEntries 4 | */ 5 | function getAllImageURLs(dictionaryEntries) { 6 | let imageURLs = new Set(); 7 | for (const entry of dictionaryEntries) { 8 | for (const tag of entry.tags) { 9 | if (tag.name === 'img') { 10 | const imgURL = tag.value; 11 | // Check if valid URL 12 | try { 13 | new URL(imgURL); 14 | imageURLs.add(tag.value); 15 | } catch (error) { 16 | console.error(`Invalid URL: ${imgURL}`); 17 | continue; 18 | } 19 | } 20 | } 21 | } 22 | return imageURLs; 23 | } 24 | 25 | export { getAllImageURLs }; 26 | -------------------------------------------------------------------------------- /src/util/entryParse/parseEntryToJson.js: -------------------------------------------------------------------------------- 1 | import { LANGUAGES_DATA } from '../../constants.js'; 2 | 3 | /** 4 | * 5 | * @param {CsvRecord} entry 6 | * @returns {DictionaryEntry} 7 | */ 8 | function parseEntry(entry) { 9 | const id = parseInt(entry.id); 10 | if (isNaN(id)) { 11 | throw new Error(`Invalid id: ${entry.id}`); 12 | } 13 | 14 | const headwords = parseHeadwords(entry.headword); 15 | 16 | const entryLines = entry.entry.split('\n'); 17 | const tags = parseTags(entryLines); 18 | 19 | const explanationsText = entryLines.join('\n'); 20 | const explanationsTexts = explanationsText 21 | .split(/^\-\-\-\-$/gm) 22 | .map((text) => { 23 | return text; 24 | }); 25 | 26 | /** 27 | * @type {Sense[]} 28 | */ 29 | const senses = []; 30 | for (const text of explanationsTexts) { 31 | senses.push(parseSense(text)); 32 | } 33 | 34 | return { 35 | id, 36 | headwords, 37 | tags, 38 | senses, 39 | }; 40 | } 41 | 42 | /** 43 | * Parses a headword string in the format "text:reading,text:reading" 44 | * @param {string} headwordString 45 | * @returns {Headword[]} 46 | */ 47 | function parseHeadwords(headwordString) { 48 | return headwordString.split(',').map((headword) => { 49 | const [text, ...readings] = headword.split(':'); 50 | if (!text || !readings) { 51 | throw new Error(`Invalid headword: ${headword}`); 52 | } 53 | return { 54 | text, 55 | readings, 56 | }; 57 | }); 58 | } 59 | 60 | /** 61 | * 62 | * @param {string[]} entryLines 63 | */ 64 | function parseTags(entryLines) { 65 | if (!entryLines[0].startsWith('(pos:')) { 66 | throw new Error(`Entry does not start with (pos:): ${entryLines[0]}`); 67 | } 68 | // tags in format (pos:名詞)(label:書面語) 69 | const firstLine = entryLines.shift(); 70 | if (!firstLine) { 71 | throw new Error(`Entry is empty: ${entryLines.toString()}`); 72 | } 73 | const tags = firstLine.split(')(').map((tag) => { 74 | tag = tag.replace(/[()]/g, ''); 75 | let colonIndex = tag.indexOf(':'); 76 | const name = tag.slice(0, colonIndex).trim(); 77 | const value = tag.slice(colonIndex + 1).trim(); 78 | return { 79 | name, 80 | value, 81 | }; 82 | }); 83 | if (tags.length === 0) { 84 | throw new Error(`No tags found: ${firstLine}`); 85 | } 86 | return tags; 87 | } 88 | 89 | /** 90 | * Accepts a sense entry string and returns the parsed sense 91 | * @param {string} entryText 92 | * @returns {Sense} 93 | */ 94 | function parseSense(entryText) { 95 | // Remove first line explanations 96 | entryText = entryText.replace('\n', ''); 97 | const [explanationText, ...examplesTexts] = entryText.split(/^$/gm); 98 | 99 | /** 100 | * @type {LanguageData} 101 | */ 102 | const explanation = parseLanguageData(explanationText); 103 | 104 | /** 105 | * @type {LanguageData[]} 106 | */ 107 | const egs = []; 108 | for (const exampleText of examplesTexts) { 109 | egs.push(parseLanguageData(exampleText)); 110 | } 111 | 112 | return { explanation, egs }; 113 | } 114 | 115 | /** 116 | * Parses a language data multiline string in the format "lang:text\nlang:text" 117 | * Some texts are multiline 118 | * @param {string} text 119 | * @returns {LanguageData} 120 | */ 121 | function parseLanguageData(text) { 122 | /** 123 | * @type {LanguageData} 124 | */ 125 | const languageData = {}; 126 | const lines = text.split('\n'); 127 | 128 | let currentLang = ''; 129 | let currentLangData = ''; 130 | 131 | /** 132 | * Adds the currently stored language data to the languageData object 133 | */ 134 | function addCurrentLangData() { 135 | if (!currentLang) { 136 | return; 137 | } 138 | if (!currentLangData) { 139 | return; 140 | } 141 | if (!languageData[currentLang]) { 142 | languageData[currentLang] = []; 143 | } 144 | languageData[currentLang].push(currentLangData.trim()); 145 | currentLang = ''; 146 | currentLangData = ''; 147 | } 148 | 149 | for (const line of lines) { 150 | // Check if first few characters are a language followed by : 151 | const matchedLang = line.split(':')[0]; 152 | if ( 153 | // !(matchedLang.length >= 2 && matchedLang.length <= 4) || 154 | !line.includes(':') 155 | ) { 156 | // If no language is found, this is a continuation of the previous line 157 | currentLangData += '\n' + line.trim(); 158 | continue; 159 | } 160 | // Check if the language is a possible language 161 | if (!LANGUAGES_DATA[matchedLang]) { 162 | throw new Error(`Invalid language: ${matchedLang}`); 163 | } 164 | // Else a language is found 165 | addCurrentLangData(); 166 | currentLang = matchedLang; 167 | currentLangData = line.replace(`${currentLang}:`, '').trim(); 168 | } 169 | addCurrentLangData(); 170 | return languageData; 171 | } 172 | 173 | export { parseEntry }; 174 | -------------------------------------------------------------------------------- /src/util/entryParse/parseLabels.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Given a list of dictionary entries, find all unique labels. 3 | * @param {DictionaryEntry[]} dictionaryEntries 4 | * @returns {Record>} 5 | */ 6 | function findLabelValues(dictionaryEntries) { 7 | const tagCategories = { 8 | label: new Set(), 9 | pos: new Set(), 10 | }; 11 | for (const entry of dictionaryEntries) { 12 | for (const tag of entry.tags) { 13 | if (tagCategories[tag.name]) { 14 | tagCategories[tag.name].add(tag.value); 15 | } 16 | } 17 | } 18 | return tagCategories; 19 | } 20 | 21 | export { findLabelValues }; 22 | -------------------------------------------------------------------------------- /src/util/getVersion.js: -------------------------------------------------------------------------------- 1 | import path from 'path'; 2 | import fs from 'fs'; 3 | 4 | /** 5 | * Get the version from the package.json file. 6 | * @returns {string} The version. 7 | */ 8 | export function getVersion() { 9 | const packageJsonPath = path.join(process.cwd(), 'package.json'); 10 | const packageJson = JSON.parse(fs.readFileSync(packageJsonPath, 'utf-8')); 11 | return packageJson.version; 12 | } 13 | -------------------------------------------------------------------------------- /src/util/imageHandler/compressImages.js: -------------------------------------------------------------------------------- 1 | import sharp from 'sharp'; 2 | import fs from 'fs'; 3 | 4 | /** 5 | * Compresses and resizes all jpg and png images in the image folder 6 | * @param {string} imageFolder 7 | * @param {string} outputFolder 8 | * @param {number} resizeWidth 9 | * @returns {Promise} 10 | */ 11 | function compressImages(imageFolder, outputFolder, resizeWidth) { 12 | // Create directory 13 | if (!fs.existsSync(outputFolder)) { 14 | fs.mkdirSync(outputFolder); 15 | } 16 | const imageFiles = fs.readdirSync(imageFolder); 17 | const promises = []; 18 | for (const imageFile of imageFiles) { 19 | const filePath = `${imageFolder}/${imageFile}`; 20 | const outputPath = `${outputFolder}/${imageFile}`; 21 | promises.push(compressImage(filePath, outputPath, resizeWidth)); 22 | } 23 | return Promise.all(promises); 24 | } 25 | 26 | /** 27 | * Compresses and resizes the image at the given path. 28 | * @param {string} imagePath 29 | * @param {string} outputPath 30 | * @param {number} resizeWidth 31 | * @returns 32 | */ 33 | async function compressImage(imagePath, outputPath, resizeWidth) { 34 | try { 35 | const image = sharp(imagePath); 36 | const metadata = await image.metadata(); 37 | // Check if image is jpg or png 38 | if (metadata.format !== 'jpeg' && metadata.format !== 'png') { 39 | throw new Error(`Invalid image format: ${metadata.format}`); 40 | return; 41 | } 42 | // Resize image 43 | if (resizeWidth && metadata.width && metadata.width > resizeWidth) { 44 | image.resize(resizeWidth); 45 | } 46 | // Compress image 47 | if (metadata.format === 'jpeg') { 48 | await image.jpeg({ quality: 85 }).toFile(outputPath); 49 | } else { 50 | await image.toFile(outputPath); 51 | } 52 | } catch (e) { 53 | // Copy file if error 54 | fs.copyFileSync(imagePath, outputPath); 55 | } 56 | } 57 | 58 | export { compressImages }; 59 | -------------------------------------------------------------------------------- /src/util/imageHandler/downloadImages.js: -------------------------------------------------------------------------------- 1 | import fs from 'fs'; 2 | import path from 'path'; 3 | import axios from 'axios'; 4 | import sharp from 'sharp'; 5 | 6 | import { getImageFileName } from './getImageFileName.js'; 7 | import { IMAGE_FOLDER } from '../../constants.js'; 8 | 9 | const DELAY_MS = 1000; 10 | 11 | /** 12 | * Downloads all the images in the given set. 13 | * @param {Set} imageURLs - The set of image URLs to download. 14 | */ 15 | async function downloadImages(imageURLs) { 16 | // Create directory 17 | if (!fs.existsSync(IMAGE_FOLDER)) { 18 | fs.mkdirSync(IMAGE_FOLDER); 19 | } 20 | let successful = 0; 21 | let failed = 0; 22 | const imageURLsArray = Array.from(imageURLs); 23 | for (let i = 0; i < imageURLsArray.length; i++) { 24 | const imageURL = imageURLsArray[i]; 25 | try { 26 | console.log(`${i}/${imageURLsArray.length}: Downloading ${imageURL}`); 27 | const fileName = getImageFileName(imageURL); 28 | const wasDownloadedOnline = await downloadImage( 29 | imageURL, 30 | IMAGE_FOLDER, 31 | fileName 32 | ); 33 | // Delay if downloaded online 34 | if (wasDownloadedOnline) { 35 | await new Promise((resolve) => setTimeout(resolve, DELAY_MS)); 36 | } 37 | successful++; 38 | const filePath = path.join(IMAGE_FOLDER, getImageFileName(imageURL)); 39 | // Check if the image is valid, delete if not 40 | try { 41 | await sharp(filePath).metadata(); 42 | } catch (error) { 43 | console.log(`Deleting invalid image ${filePath}`); 44 | fs.unlinkSync(filePath); 45 | } 46 | } catch (error) { 47 | console.log(`Error when downloading ${imageURL}`); 48 | failed++; 49 | } 50 | } 51 | console.log(`Successfully downloaded ${successful} images.`); 52 | console.log(`Failed to download ${failed} images.`); 53 | } 54 | 55 | /** 56 | * Downloads the image at the given URL and saves it to the given path. 57 | * @param {string} imageURL 58 | * @param {string} savePath 59 | * @param {string} fileName 60 | * @returns {Promise} - Returns true if the image was downloaded online. 61 | */ 62 | async function downloadImage(imageURL, savePath, fileName) { 63 | // Check if path valid 64 | if (!fs.existsSync(savePath)) { 65 | throw new Error(`Invalid path: ${savePath}`); 66 | } 67 | // Check if valid URL 68 | try { 69 | new URL(imageURL); 70 | } catch (error) { 71 | throw new Error(`Invalid URL: ${imageURL}`); 72 | } 73 | 74 | const filePath = path.join(savePath, fileName); 75 | 76 | // Check if file already exists 77 | if (fs.existsSync(filePath)) { 78 | return false; 79 | } 80 | // Download image 81 | const response = await axios.get(imageURL, { 82 | responseType: 'arraybuffer', 83 | }); 84 | const buffer = Buffer.from(response.data, 'binary'); 85 | 86 | // Save image 87 | fs.writeFileSync(filePath, buffer); 88 | return true; 89 | } 90 | 91 | export { downloadImages }; 92 | -------------------------------------------------------------------------------- /src/util/imageHandler/getImageFileName.js: -------------------------------------------------------------------------------- 1 | import { createHash } from 'crypto'; 2 | 3 | /** 4 | * Hashes the image URL to get the image file name, preserving the file extension. 5 | * @param {string} imageURL 6 | */ 7 | function getImageFileName(imageURL) { 8 | const hash = createHash('sha256'); 9 | hash.update(imageURL); 10 | const hashed = hash.digest('hex'); 11 | const extension = imageURL.split('.').pop()?.toLocaleLowerCase() || ''; 12 | const allowedExtensions = ['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp']; 13 | if (!allowedExtensions.includes(extension)) { 14 | throw new Error(`Invalid extension: ${extension}`); 15 | } 16 | return `${hashed}.${extension}`; 17 | } 18 | 19 | export { getImageFileName }; -------------------------------------------------------------------------------- /src/util/readAndParseCSVs.js: -------------------------------------------------------------------------------- 1 | import path from 'path'; 2 | import { getCSVInfo } from './csv/csvHandler.js'; 3 | import { parseCSVEntries } from './csv/parseCsvEntriesToJson.js'; 4 | 5 | /** 6 | * @param {string} dataFolder 7 | */ 8 | export async function readAndParseCSVs(dataFolder) { 9 | const { allCsv, dateString } = await getCSVInfo(dataFolder); 10 | const dictionaryEntries = await parseCSVEntries( 11 | path.join(dataFolder, allCsv) 12 | ); 13 | console.log(`Found ${dictionaryEntries.length} entries.`); 14 | 15 | return { dictionaryEntries, dateString }; 16 | } 17 | -------------------------------------------------------------------------------- /src/util/textHandling/parseCantoneseReadings.js: -------------------------------------------------------------------------------- 1 | import { 2 | punctuations, 3 | isHanzi, 4 | isJyuutping, 5 | isPunctuation, 6 | } from './textUtils.js'; 7 | 8 | /** 9 | * Parses a text string into an array matching each character to the readings 10 | * @example text: "你get唔get到我講咩?" 11 | * reading: "nei5 get1 m4 get1 dou2 ngo5 gong2 me1?" 12 | * => 13 | * [{text: "你", reading: "nei5"}, {text: "get", reading: "get1"}, ...] 14 | * @param {string} rawText 15 | * @param {string} readings 16 | * @returns {TextReadingPair[]} 17 | */ 18 | function parseCantoneseReadings(rawText, readings) { 19 | /** 20 | * @type {TextReadingPair[]} 21 | */ 22 | const resultArray = []; 23 | 24 | const textArray = splitString(rawText); 25 | const readingsArray = splitString(readings); 26 | 27 | let readingIndex = 0; 28 | let textIndex = 0; 29 | for (let i = 0; i < Math.max(textArray.length, readingsArray.length); i++) { 30 | const text = textArray[textIndex]; 31 | const reading = readingsArray[readingIndex]; 32 | const isTextHanzi = isHanzi(text); 33 | const isTextAlphanumeric = isJyuutping(text); 34 | const isTextPunctuation = isPunctuation(text); 35 | const isReadingJyutping = isJyuutping(reading); 36 | const isReadingPunctuation = isPunctuation(reading); 37 | // Ideal case 38 | if ( 39 | !!text && 40 | !!reading && 41 | ((isTextHanzi && isReadingJyutping) || 42 | // Case where for example text is 'bu' and reading is 'bu4' 43 | (isTextAlphanumeric && isReadingJyutping)) 44 | ) { 45 | resultArray.push({ text, reading }); 46 | textIndex++; 47 | readingIndex++; 48 | } else if ( 49 | !!text && 50 | ((isTextPunctuation && isReadingJyutping) || 51 | (!!text && reading === undefined) || 52 | (!isTextAlphanumeric && !isTextHanzi && isReadingJyutping)) 53 | ) { 54 | // Send empty string to reading 55 | resultArray.push({ text, reading: '' }); 56 | textIndex++; 57 | } else if ( 58 | !!text && 59 | !!reading && 60 | ((isTextPunctuation && isReadingPunctuation) || 61 | // Where both are special characters 62 | (!isTextAlphanumeric && !isTextHanzi && !isReadingJyutping)) 63 | ) { 64 | // Don't add the punctuation but consume it 65 | resultArray.push({ text, reading: '' }); 66 | textIndex++; 67 | readingIndex++; 68 | } else { 69 | throw new Error( 70 | `Unexpected text "${text}" and reading "${reading}" at index ${i} in ${rawText}: ${readings}` 71 | ); 72 | } 73 | } 74 | // Check if remaining readings exist 75 | if (readingIndex < readingsArray.length) { 76 | throw new Error( 77 | `Unexpected reading "${readingsArray[readingIndex]}" at index ${readingIndex} in ${rawText}: ${readings}` 78 | ); 79 | } 80 | return resultArray; 81 | } 82 | 83 | /** 84 | * 85 | * @param {string} input 86 | * @returns {string[]} 87 | */ 88 | function splitString(input) { 89 | const resultArray = []; 90 | let current = ''; 91 | for (const char of input) { 92 | if (/[a-zA-Z0-9]/.test(char)) { 93 | // Check if alphabetical or numeric 94 | const isAlphabetical = /[a-zA-Z]/.test(char); 95 | if (current.length > 0) { 96 | // Check if previous character was alphabetical or numeric 97 | const isPreviousAlphabetical = /[a-zA-Z]/.test( 98 | current[current.length - 1] 99 | ); 100 | if (isAlphabetical && !isPreviousAlphabetical) { 101 | // Probably a case where the reading was typo'd like bit1ging1 102 | resultArray.push(current); 103 | current = ''; 104 | } 105 | } 106 | current += char; 107 | } else if (punctuations[char]) { 108 | if (current) { 109 | resultArray.push(current); 110 | current = ''; 111 | } 112 | resultArray.push(char); 113 | } else { 114 | if (current) { 115 | resultArray.push(current); 116 | current = ''; 117 | } 118 | resultArray.push(char); 119 | } 120 | } 121 | // Push the last current 122 | if (current) { 123 | resultArray.push(current); 124 | } 125 | 126 | // Remove empty strings 127 | const resultArrayFiltered = resultArray 128 | .map((item) => item.trim()) 129 | .filter((item) => item); 130 | return resultArrayFiltered; 131 | } 132 | 133 | export { parseCantoneseReadings }; 134 | -------------------------------------------------------------------------------- /src/util/textHandling/textUtils.js: -------------------------------------------------------------------------------- 1 | import XRegExp from '@gerhobbelt/xregexp'; 2 | 3 | const punctuations = [ 4 | ',', 5 | ',', 6 | '。', 7 | '.', 8 | '?', 9 | '?', 10 | '!', 11 | '!', 12 | ';', 13 | ';', 14 | ':', 15 | ':', 16 | '、', 17 | ',', 18 | ',', 19 | '⋯', 20 | ]; 21 | 22 | /** 23 | * Returns true if the text is a Chinese character. 24 | * @param {string} text 25 | * @returns {boolean} 26 | */ 27 | function isHanzi(text) { 28 | XRegExp.install('astral'); 29 | return XRegExp( 30 | '\\p{InCJK_Unified_Ideographs}|\\p{InCJK_Unified_Ideographs_Extension_A}|\\p{InCJK_Unified_Ideographs_Extension_B}|\\p{InCJK_Unified_Ideographs_Extension_C}|\\p{InCJK_Unified_Ideographs_Extension_D}|\\p{InCJK_Unified_Ideographs_Extension_E}' 31 | ).test(text); 32 | } 33 | 34 | /** 35 | * Returns true if the text is a Jyutping reading. 36 | * @param {string} text 37 | * @returns {boolean} 38 | */ 39 | function isJyuutping(text) { 40 | return /[a-zA-Z0-9]/.test(text); 41 | } 42 | 43 | /** 44 | * Returns true if the text is a punctuation. 45 | * @param {string} text 46 | * @returns {boolean} 47 | */ 48 | function isPunctuation(text) { 49 | return punctuations.includes(text); 50 | } 51 | 52 | function isStringSentence(text) { 53 | // Check if text ends with a punctuation 54 | const lastChar = text[text.length - 1]; 55 | return isPunctuation(lastChar); 56 | } 57 | 58 | export { punctuations, isHanzi, isJyuutping, isPunctuation, isStringSentence }; 59 | -------------------------------------------------------------------------------- /src/util/yomitan/convertEntryToDetailedDefinition.js: -------------------------------------------------------------------------------- 1 | import { convertHeadwordsToSC } from './convertHeadwordsToSC.js'; 2 | import { convertSenseToLiSC } from './convertSenseToSC.js'; 3 | import { createEntryAttribution } from './createEntryAttribution.js'; 4 | import { createEntryImageSC } from './createEntryImageSC.js'; 5 | import { convertEntryToSynAntsSC } from './convertEntryToSynAntsSC.js'; 6 | 7 | /** 8 | * Converts a dictionary entry to a detailed definition. 9 | * @param {DictionaryEntry} entry 10 | * @returns {import('yomichan-dict-builder/dist/types/yomitan/termbank').DetailedDefinition} 11 | */ 12 | function convertEntryToDetailedDefinition(entry) { 13 | /** 14 | * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]} 15 | */ 16 | const SCArray = []; 17 | // Headword 18 | SCArray.push(convertHeadwordsToSC(entry.headwords)); 19 | 20 | // Senses with explanation/examples 21 | SCArray.push({ 22 | tag: 'div', 23 | data: { 24 | wordshk: 'definition', 25 | }, 26 | lang: 'yue', 27 | content: { 28 | tag: 'ul', 29 | data: { 30 | wordshk: 'sense-list', 31 | }, 32 | content: entry.senses.map(convertSenseToLiSC), 33 | }, 34 | }); 35 | 36 | // Synonyms/antonyms 37 | const synAntsSC = convertEntryToSynAntsSC(entry); 38 | SCArray.push(...synAntsSC); 39 | 40 | // Image 41 | let imageURLs = []; 42 | if (entry.tags.some((tag) => tag.name === 'img')) { 43 | const { SCs, validImageURLs } = createEntryImageSC(entry); 44 | if (SCs.length > 0) { 45 | SCArray.push(SCs); 46 | } 47 | imageURLs.push(...validImageURLs); 48 | } 49 | 50 | // Attribution 51 | SCArray.push(createEntryAttribution(entry, imageURLs)); 52 | return { 53 | type: 'structured-content', 54 | content: SCArray, 55 | }; 56 | } 57 | 58 | export { convertEntryToDetailedDefinition }; 59 | -------------------------------------------------------------------------------- /src/util/yomitan/convertEntryToSynAntsSC.js: -------------------------------------------------------------------------------- 1 | // const synonymEmoji = ; 2 | // const antonymEmoji = '🚫'; 3 | 4 | const types = { 5 | sim: { 6 | emoji: '🔗', 7 | text: '近義', 8 | }, 9 | ant: { 10 | emoji: '🚫', 11 | text: '反義', 12 | }, 13 | }; 14 | 15 | /** 16 | * Converts an entry to a ul list of the element's synonyms and antonyms. 17 | * @param {DictionaryEntry} entry 18 | * @returns {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]} 19 | */ 20 | function convertEntryToSynAntsSC(entry) { 21 | let exists = false; 22 | /** 23 | * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]} 24 | */ 25 | const SCArray = []; 26 | /** 27 | * @type {('sim'|'ant')[]} 28 | */ 29 | const tagTypes = ['sim', 'ant']; 30 | for (const type of tagTypes) { 31 | const { SC, exists: typeExists } = convertEntryToSCType(entry, type); 32 | if (typeExists) { 33 | SCArray.push(SC); 34 | } 35 | } 36 | 37 | return SCArray; 38 | } 39 | 40 | /** 41 | * 42 | * @param {DictionaryEntry} entry 43 | * @param {'sim'|'ant'} type 44 | * @returns {{ SC: import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent, exists: boolean}} 45 | */ 46 | function convertEntryToSCType(entry, type) { 47 | const typeTags = entry.tags.filter((tag) => tag.name === type); 48 | if (typeTags.length === 0) { 49 | return { 50 | SC: [], 51 | exists: false, 52 | }; 53 | } 54 | // let tagString = typeTags.map((tag) => tag.value).join('・'); 55 | return { 56 | SC: { 57 | tag: 'ul', 58 | data: { 59 | wordshk: `${type}-list`, 60 | }, 61 | content: [ 62 | { 63 | tag: 'li', 64 | style: { 65 | listStyleType: `"${types[type].emoji}"`, 66 | fontWeight: 'bold', 67 | }, 68 | data: { 69 | wordshk: `${type}-header`, 70 | }, 71 | content: types[type].text, 72 | }, 73 | { 74 | tag: 'ul', 75 | /** 76 | * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]} 77 | */ 78 | content: typeTags.map((tag) => ({ 79 | tag: 'li', 80 | data: { 81 | wordshk: `${type}-entry`, 82 | }, 83 | content: tag.value, 84 | lang: 'yue', 85 | style: { 86 | fontSize: '1.2em', 87 | }, 88 | })), 89 | }, 90 | ], 91 | }, 92 | exists: true, 93 | }; 94 | } 95 | 96 | export { convertEntryToSynAntsSC }; 97 | -------------------------------------------------------------------------------- /src/util/yomitan/convertEntryToYomitanTerms.js: -------------------------------------------------------------------------------- 1 | import { TermEntry } from 'yomichan-dict-builder'; 2 | import { convertEntryToDetailedDefinition } from './convertEntryToDetailedDefinition.js'; 3 | 4 | /** 5 | * 6 | * @param {DictionaryEntry} dictionaryEntry 7 | * @returns {import('yomichan-dict-builder/dist/types/yomitan/termbank').TermInformation[]} 8 | */ 9 | function convertEntryToYomitanTerms(dictionaryEntry) { 10 | /** 11 | * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').TermInformation[]} 12 | */ 13 | const yomitanTerms = []; 14 | 15 | const detailedDefinition = convertEntryToDetailedDefinition(dictionaryEntry); 16 | for (const headword of dictionaryEntry.headwords) { 17 | for (const reading of headword.readings) { 18 | const yomitanTermEntry = new TermEntry(headword.text) 19 | .setReading(reading) 20 | .addDetailedDefinition(detailedDefinition); 21 | addTagsToTermEntry(dictionaryEntry, yomitanTermEntry); 22 | yomitanTerms.push(yomitanTermEntry.build()); 23 | } 24 | } 25 | 26 | return yomitanTerms; 27 | } 28 | 29 | /** 30 | * @param {DictionaryEntry} dictionaryEntry 31 | * @param {TermEntry} termEntry 32 | */ 33 | function addTagsToTermEntry(dictionaryEntry, termEntry) { 34 | const termTags = []; 35 | const entryTags = []; 36 | const tagTypesToAdd = ['pos', 'label']; 37 | for (const tag of dictionaryEntry.tags) { 38 | if (tagTypesToAdd.includes(tag.name)) { 39 | entryTags.push(tag.value); 40 | } 41 | } 42 | termEntry.setTermTags(termTags.join(' ')); 43 | termEntry.setDefinitionTags(entryTags.join(' ')); 44 | } 45 | 46 | export { convertEntryToYomitanTerms }; 47 | -------------------------------------------------------------------------------- /src/util/yomitan/convertHeadwordsToSC.js: -------------------------------------------------------------------------------- 1 | import { convertReadingToRubySC } from './parseTextToSC.js'; 2 | 3 | /** 4 | * Converts headword(s) to structured content. 5 | * @param {Headword[]} headwords 6 | */ 7 | function convertHeadwordsToSC(headwords) { 8 | const headwordsSCList = headwordsToSC(headwords); 9 | const separator = '・'; 10 | /** 11 | * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]} 12 | */ 13 | const headwordsSCListWithSeparator = []; 14 | for (let i = 0; i < headwordsSCList.length; i++) { 15 | headwordsSCListWithSeparator.push(headwordsSCList[i]); 16 | if (i !== headwordsSCList.length - 1) { 17 | headwordsSCListWithSeparator.push(separator); 18 | } 19 | } 20 | /** 21 | * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent} 22 | */ 23 | const sc = { 24 | tag: 'div', 25 | data: { 26 | wordshk: 'headword', 27 | }, 28 | style: { 29 | fontSize: '1.2em', 30 | }, 31 | lang: 'yue', 32 | content: ['【', ...headwordsSCListWithSeparator, '】'], 33 | }; 34 | return sc; 35 | } 36 | 37 | /** 38 | * Converts a headword to structured content. 39 | * @param {Headword[]} headwords 40 | * @returns {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]} 41 | */ 42 | function headwordsToSC(headwords) { 43 | /** 44 | * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]} 45 | */ 46 | const headwordsSCList = []; 47 | for (const headword of headwords) { 48 | headwordsSCList.push( 49 | ...headword.readings.map((reading) => 50 | convertReadingToRubySC(headword.text, reading) 51 | ) 52 | ); 53 | } 54 | return headwordsSCList; 55 | } 56 | 57 | export { convertHeadwordsToSC }; 58 | -------------------------------------------------------------------------------- /src/util/yomitan/convertSenseToSC.js: -------------------------------------------------------------------------------- 1 | import { LANGUAGES_DATA } from '../../constants.js'; 2 | import { isStringSentence } from '../textHandling/textUtils.js'; 3 | import { convertTextToSC } from './parseTextToSC.js'; 4 | 5 | const examplePhraseText = '配詞 / 用法'; 6 | const exampleSentenceText = '例句'; 7 | const examplePhraseEmoji = '💬'; 8 | const exampleSentenceEmoji = '📝'; 9 | 10 | /** 11 | * Converts a sense to structured content as a li. 12 | * @param {Sense} sense 13 | * @returns {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent} 14 | */ 15 | function convertSenseToLiSC(sense) { 16 | /** 17 | * @type {LanguageData[]} 18 | */ 19 | const phrases = []; 20 | /** 21 | * @type {LanguageData[]} 22 | */ 23 | const sentences = []; 24 | for (const eg of sense.egs) { 25 | // Check if any of the language datas are a sentence 26 | const isEgSentence = Object.values(eg).some((languageData) => { 27 | return languageData.some((languageText) => { 28 | return isStringSentence(languageText); 29 | }); 30 | }); 31 | if (isEgSentence) { 32 | sentences.push(eg); 33 | } else { 34 | phrases.push(eg); 35 | } 36 | } 37 | 38 | /** 39 | * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]} 40 | */ 41 | const exampleNodes = []; 42 | if (phrases.length > 0) { 43 | exampleNodes.push( 44 | convertExampleToSC( 45 | phrases, 46 | 'phrase', 47 | examplePhraseText, 48 | examplePhraseEmoji 49 | ) 50 | ); 51 | } 52 | if (sentences.length > 0) { 53 | exampleNodes.push( 54 | convertExampleToSC( 55 | sentences, 56 | 'sentence', 57 | exampleSentenceText, 58 | exampleSentenceEmoji 59 | ) 60 | ); 61 | } 62 | 63 | return { 64 | tag: 'li', 65 | data: { 66 | wordshk: 'sense', 67 | }, 68 | content: [ 69 | { 70 | tag: 'div', 71 | data: { 72 | wordshk: 'explanation', 73 | }, 74 | content: convertLanguageDataToLiSC(sense.explanation, true), 75 | }, 76 | { 77 | tag: 'div', 78 | data: { 79 | wordshk: 'examples', 80 | }, 81 | content: exampleNodes, 82 | }, 83 | ], 84 | }; 85 | } 86 | 87 | /** 88 | * Converts an example list to a ul structured content object with the appropriate emoji. 89 | * @param {LanguageData[]} languageDatas 90 | * @param {'phrase' | 'sentence'} exampleType 91 | * @param {string} exampleText 92 | * @param {string} exampleEmoji 93 | * @returns {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent} 94 | */ 95 | function convertExampleToSC( 96 | languageDatas, 97 | exampleType, 98 | exampleText, 99 | exampleEmoji 100 | ) { 101 | return { 102 | tag: 'ul', 103 | data: { 104 | wordshk: exampleType, 105 | }, 106 | content: [ 107 | { 108 | tag: 'li', 109 | style: { 110 | listStyleType: `"${exampleEmoji}"`, 111 | fontWeight: 'bold', 112 | }, 113 | data: { 114 | wordshk: 'example-type-header', 115 | }, 116 | content: exampleText, 117 | }, 118 | { 119 | tag: 'ul', 120 | data: { 121 | wordshk: `${exampleType}-list`, 122 | }, 123 | content: [ 124 | ...languageDatas.map((languageData) => { 125 | return convertLanguageDataToLiSC(languageData, false); 126 | }), 127 | ], 128 | }, 129 | ], 130 | }; 131 | } 132 | 133 | /** 134 | * Converts one single languageData to structured content representing a definition/example/sentence as an unordered list. 135 | * @param {LanguageData} languageData 136 | * @param {boolean} isExplanation whether the languageData is an explanation 137 | or an example 138 | * @returns {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent} 139 | */ 140 | function convertLanguageDataToLiSC(languageData, isExplanation) { 141 | /** 142 | * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]} 143 | */ 144 | const languageDivArray = []; 145 | 146 | for (const language of Object.keys(languageData)) { 147 | languageDivArray.push( 148 | ...convertLanguageEntryToListItems( 149 | // @ts-ignore 150 | language, 151 | languageData[language], 152 | isExplanation 153 | ) 154 | ); 155 | } 156 | 157 | /** 158 | * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent} 159 | */ 160 | const sc = { 161 | tag: 'li', 162 | style: { 163 | marginBottom: isExplanation ? '0.3em' : '0.5em', 164 | listStyleType: isExplanation ? 'none' : 'circle', 165 | }, 166 | data: { 167 | wordshk: isExplanation ? 'explanation' : 'example', 168 | }, 169 | content: languageDivArray, 170 | }; 171 | 172 | return sc; 173 | } 174 | 175 | /** 176 | * Converts a single language entry consisting of multiple language contents to a list of lis 177 | * @param {Language} language 178 | * @param {string[]} languageTexts 179 | * @param {boolean} isExplanation whether the languageData is an explanation 180 | * @returns {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]} 181 | */ 182 | function convertLanguageEntryToListItems( 183 | language, 184 | languageTexts, 185 | isExplanation 186 | ) { 187 | /** 188 | * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]} 189 | */ 190 | const languageLiScArray = []; 191 | const languageInfo = LANGUAGES_DATA[language]; 192 | for (const languageText of languageTexts) { 193 | /** 194 | * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]} 195 | */ 196 | const liChildren = [convertTextToSC(languageText, language)]; 197 | 198 | // Only push lang tag if non yue/eng language 199 | const noLanguageTagNecessaryLanguages = ['yue', 'eng']; 200 | if (!noLanguageTagNecessaryLanguages.includes(language)) { 201 | liChildren.unshift({ 202 | tag: 'span', 203 | data: { 204 | wordshk: 'langSignifier', 205 | }, 206 | style: { 207 | color: '#888', 208 | fontSize: '0.8em', 209 | }, 210 | content: `${languageInfo.name}› `, 211 | }); 212 | } 213 | 214 | /** 215 | * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent} 216 | */ 217 | const singleLanguageLi = { 218 | tag: 'li', 219 | lang: languageInfo.langCode, 220 | content: liChildren, 221 | style: { 222 | listStyleType: 'none', 223 | }, 224 | data: { 225 | wordshk: languageInfo.langCode, 226 | }, 227 | }; 228 | 229 | // Change text size for selected languages 230 | const cjkLangs = ['yue', 'zho', 'jpn', 'kor', 'lzh']; 231 | const isCJK = cjkLangs.includes(language); 232 | // @ts-ignore 233 | singleLanguageLi.style.fontSize = isCJK 234 | ? '1.2em' 235 | : isExplanation 236 | ? '1em' 237 | : '0.75em'; 238 | 239 | languageLiScArray.push(singleLanguageLi); 240 | } 241 | 242 | return languageLiScArray; 243 | } 244 | 245 | export { convertSenseToLiSC }; 246 | -------------------------------------------------------------------------------- /src/util/yomitan/createEntryAttribution.js: -------------------------------------------------------------------------------- 1 | /** 2 | * 3 | * @param {DictionaryEntry} entry 4 | * @param {string[]} imageURLs 5 | * @returns {import("yomichan-dict-builder/dist/types/yomitan/termbank").StructuredContent} 6 | */ 7 | function createEntryAttribution(entry, imageURLs) { 8 | /** 9 | * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]} 10 | */ 11 | const contentAttributionSCArray = [ 12 | { 13 | tag: 'a', 14 | href: `https://words.hk/zidin/v/${entry.id}`, 15 | content: '粵典 words.hk', 16 | }, 17 | ]; 18 | if (entry.tags.length > 0) { 19 | // Find reference tag if exists 20 | const referenceTag = entry.tags.find((tag) => tag.name === 'ref'); 21 | if (referenceTag) { 22 | let urlDomain = ''; 23 | try { 24 | const url = new URL(referenceTag.value); 25 | urlDomain = url.hostname; 26 | } catch (error) { 27 | console.error(`Invalid URL: ${referenceTag.value}`); 28 | } 29 | 30 | contentAttributionSCArray.unshift( 31 | { 32 | tag: 'a', 33 | href: referenceTag.value, 34 | content: `參考: ${urlDomain}`, 35 | }, 36 | { 37 | tag: 'span', 38 | content: ' | ', 39 | } 40 | ); 41 | } 42 | } 43 | 44 | // Add image attributions 45 | if (imageURLs.length > 0) { 46 | for (const imageURL of imageURLs) { 47 | try { 48 | const url = new URL(imageURL); 49 | const urlDomain = url.hostname; 50 | contentAttributionSCArray.unshift( 51 | { 52 | tag: 'a', 53 | href: imageURL, 54 | content: `圖片: ${urlDomain}`, 55 | }, 56 | { 57 | tag: 'span', 58 | content: ' | ', 59 | } 60 | ); 61 | } catch (error) {} 62 | } 63 | } 64 | 65 | return { 66 | tag: 'div', 67 | data: { 68 | wordshk: 'attribution', 69 | }, 70 | lang: 'yue', 71 | style: { 72 | fontSize: '0.7em', 73 | textAlign: 'right', 74 | // The examples/definitions above have marginBottom set 75 | marginTop: '-0.4em', 76 | }, 77 | content: contentAttributionSCArray, 78 | }; 79 | } 80 | 81 | export { createEntryAttribution }; 82 | -------------------------------------------------------------------------------- /src/util/yomitan/createEntryImageSC.js: -------------------------------------------------------------------------------- 1 | import fs from 'fs'; 2 | 3 | import { getImageFileName } from '../imageHandler/getImageFileName.js'; 4 | import { IMAGE_FOLDER, IMAGE_RESIZE_WIDTH } from '../../constants.js'; 5 | 6 | /** 7 | * @param {DictionaryEntry} entry 8 | * @returns {{SCs: import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[], validImageURLs: string[]}} 9 | */ 10 | function createEntryImageSC(entry) { 11 | // Check if entry has images 12 | const imageTags = entry.tags.filter((tag) => tag.name === 'img'); 13 | if (imageTags.length === 0) { 14 | throw new Error(`Entry ${entry.headwords[0].text} has no images.`); 15 | } 16 | 17 | /** 18 | * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]} 19 | */ 20 | const SCs = []; 21 | const validImageURLs = []; 22 | for (const tag of imageTags) { 23 | try { 24 | const fileName = getImageFileName(tag.value); 25 | // Check if file exists 26 | const filePath = `${IMAGE_FOLDER}/${fileName}`; 27 | if (!fs.existsSync(filePath)) { 28 | throw new Error(`File does not exist: ${filePath}`); 29 | } 30 | /** 31 | * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent} 32 | */ 33 | const imageNode = { 34 | tag: 'img', 35 | data: { 36 | wordshk: 'image', 37 | }, 38 | path: filePath, 39 | collapsed: false, 40 | collapsible: false, 41 | }; 42 | if (fileName.endsWith('.svg')) { 43 | imageNode.width = IMAGE_RESIZE_WIDTH; 44 | } 45 | SCs.push(imageNode); 46 | validImageURLs.push(tag.value); 47 | } catch (error) {} 48 | } 49 | return { SCs, validImageURLs }; 50 | } 51 | 52 | export { createEntryImageSC }; 53 | -------------------------------------------------------------------------------- /src/util/yomitan/parseTextToSC.js: -------------------------------------------------------------------------------- 1 | import { parseCantoneseReadings } from '../textHandling/parseCantoneseReadings.js'; 2 | 3 | /** 4 | * Parses a text string into a structured content object. 5 | * @param {string} rawText 6 | * @param {string} languageCode 7 | * @returns {import("yomichan-dict-builder/dist/types/yomitan/termbank").StructuredContent} 8 | */ 9 | function convertTextToSC(rawText, languageCode) { 10 | const rubyTextLangs = ['yue', 'zho', 'lzh']; 11 | if (!rubyTextLangs.includes(languageCode)) { 12 | return rawText; 13 | } 14 | const cleanedText = cleanRawText(rawText); 15 | // Parse brackets for possible reading 16 | const bracketRegex = /(.+)\(([^\(\)]+)\)$/; 17 | const [_, phrase, reading] = cleanedText.match(bracketRegex) || []; 18 | 19 | if (!phrase || !reading) { 20 | return cleanedText; 21 | } 22 | 23 | // If reading doesn't have alphanumeric characters, it's not a jyut reading 24 | const hasEnglishChars = /[a-zA-Z0-9]/.test(reading); 25 | if (!hasEnglishChars) { 26 | return cleanedText; 27 | } 28 | 29 | try { 30 | const readings = parseCantoneseReadings(phrase, reading); 31 | return readings.map(({ text, reading }) => 32 | convertReadingToRubySC(text, reading) 33 | ); 34 | } catch (error) { 35 | return cleanedText; 36 | } 37 | } 38 | 39 | /** 40 | * Strips out # and spaces from raw text 41 | * @param {string} rawText 42 | */ 43 | function cleanRawText(rawText) { 44 | return rawText.replace(/#| /g, ''); 45 | } 46 | 47 | /** 48 | * Parses a text string into a structured content object with ruby text for readings 49 | * @param {string} text 50 | * @param {string} reading 51 | * @returns {import("yomichan-dict-builder/dist/types/yomitan/termbank").StructuredContent} 52 | */ 53 | function convertReadingToRubySC(text, reading) { 54 | // Check that both text and reading are type string, if not then cast to string 55 | if (typeof text !== 'string') { 56 | text = String(text); 57 | } 58 | if (typeof reading !== 'string') { 59 | reading = String(reading); 60 | } 61 | return { 62 | tag: 'ruby', 63 | content: [ 64 | text, 65 | { 66 | tag: 'rt', 67 | content: reading, 68 | }, 69 | ], 70 | }; 71 | } 72 | 73 | export { convertReadingToRubySC, convertTextToSC }; 74 | --------------------------------------------------------------------------------