├── .github
    └── workflows
    │   ├── daily-build.yaml
    │   └── test.yaml
├── .gitignore
├── .prettierrc
├── LICENSE
├── package-lock.json
├── package.json
├── readme.md
└── src
    ├── constants.js
    ├── convertToFrequencyDictionary.js
    ├── convertToHonziDictionary.js
    ├── convertToTermDictionary.js
    ├── downloadLatest.js
    ├── test
        ├── parseCantoneseReadings.test.js
        ├── parseEntry.test.js
        └── testdata.csv
    ├── types.d.ts
    └── util
        ├── addYomitanImages.js
        ├── addYomitanTags.js
        ├── csv
            ├── csvHandler.js
            └── parseCsvEntriesToJson.js
        ├── entryParse
            ├── findImages.js
            ├── parseEntryToJson.js
            └── parseLabels.js
        ├── getVersion.js
        ├── imageHandler
            ├── compressImages.js
            ├── downloadImages.js
            └── getImageFileName.js
        ├── readAndParseCSVs.js
        ├── textHandling
            ├── parseCantoneseReadings.js
            └── textUtils.js
        └── yomitan
            ├── convertEntryToDetailedDefinition.js
            ├── convertEntryToSynAntsSC.js
            ├── convertEntryToYomitanTerms.js
            ├── convertHeadwordsToSC.js
            ├── convertSenseToSC.js
            ├── createEntryAttribution.js
            ├── createEntryImageSC.js
            └── parseTextToSC.js


/.github/workflows/daily-build.yaml:
--------------------------------------------------------------------------------
 1 | # Scheduled workflow: downloads the latest words.hk CSVs, builds the term and
 2 | # honzi dictionaries, and publishes them as a GitHub release tagged with the
 3 | # current date.
 4 | name: Build and Release Dictionaries Daily
 5 | 
 6 | on:
 7 |   schedule:
 8 |     # Every day at 00:00 UTC
 9 |     - cron: '0 0 * * *'
10 |   workflow_dispatch:
11 | 
12 | jobs:
13 |   build-release:
14 |     runs-on: ubuntu-latest
15 | 
16 |     steps:
17 |       - uses: actions/checkout@v4
18 | 
19 |       # Pin Node the same way test.yaml does instead of relying on the runner image
20 |       - uses: actions/setup-node@v4
21 |         with:
22 |           node-version: 20
23 | 
24 |       - name: Install Dependencies
25 |         run: npm ci
26 | 
27 |       - name: Download Latest CSVs
28 |         run: npm run download
29 | 
30 |       # The date doubles as the release tag and is passed to the build scripts
31 |       - name: Get Current Date
32 |         id: date
33 |         # `::set-output` is deprecated; step outputs are written to $GITHUB_OUTPUT
34 |         run: echo "date=$(date +'%Y-%m-%d')" >> "$GITHUB_OUTPUT"
35 | 
36 |       - name: Build Dictionaries
37 |         run: |
38 |           npm run buildTermDict ${{ steps.date.outputs.date }}
39 |           npm run buildHonziDict ${{ steps.date.outputs.date }}
40 | 
41 |       - name: Create and Publish Release
42 |         uses: softprops/action-gh-release@v2
43 |         with:
44 |           files: dist/*
45 |           tag_name: ${{ steps.date.outputs.date }}
46 |           name: ${{ steps.date.outputs.date }}
47 |           token: ${{ secrets.GITHUB_TOKEN }}
48 |           body: |
49 |             This is an automated release of the latest Words.hk for Yomitan.
50 |             For more information, please see the [README](https://github.com/MarvNC/wordshk-yomitan).
51 |             Download the latest release below: the file name should look like `Words.hk.YYYY-MM-DD.zip`.
52 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yaml:
--------------------------------------------------------------------------------
 1 | name: Node.js CI
 2 | 
 3 | on: [push, pull_request] # run on every push and on pull requests
 4 | 
 5 | jobs:
 6 |   test:
 7 |     runs-on: ubuntu-latest
 8 | 
 9 |     steps:
10 |       - uses: actions/checkout@v4
11 |       - uses: actions/setup-node@v4
12 |         with:
13 |           node-version: 20
14 |       - run: npm ci
15 |       - run: npm test # the `test` script in package.json runs ava
16 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | *.txt
  2 | csvs
  3 | /images
  4 | /compressedImages
  5 | /freqjsons
  6 | 
  7 | # Created by https://www.toptal.com/developers/gitignore/api/node
  8 | # Edit at https://www.toptal.com/developers/gitignore?templates=node
  9 | 
 10 | ### Node ###
 11 | # Logs
 12 | logs
 13 | *.log
 14 | npm-debug.log*
 15 | yarn-debug.log*
 16 | yarn-error.log*
 17 | lerna-debug.log*
 18 | .pnpm-debug.log*
 19 | 
 20 | # Diagnostic reports (https://nodejs.org/api/report.html)
 21 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
 22 | 
 23 | # Runtime data
 24 | pids
 25 | *.pid
 26 | *.seed
 27 | *.pid.lock
 28 | 
 29 | # Directory for instrumented libs generated by jscoverage/JSCover
 30 | lib-cov
 31 | 
 32 | # Coverage directory used by tools like istanbul
 33 | coverage
 34 | *.lcov
 35 | 
 36 | # nyc test coverage
 37 | .nyc_output
 38 | 
 39 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
 40 | .grunt
 41 | 
 42 | # Bower dependency directory (https://bower.io/)
 43 | bower_components
 44 | 
 45 | # node-waf configuration
 46 | .lock-wscript
 47 | 
 48 | # Compiled binary addons (https://nodejs.org/api/addons.html)
 49 | build/Release
 50 | 
 51 | # Dependency directories
 52 | node_modules/
 53 | jspm_packages/
 54 | 
 55 | # Snowpack dependency directory (https://snowpack.dev/)
 56 | web_modules/
 57 | 
 58 | # TypeScript cache
 59 | *.tsbuildinfo
 60 | 
 61 | # Optional npm cache directory
 62 | .npm
 63 | 
 64 | # Optional eslint cache
 65 | .eslintcache
 66 | 
 67 | # Optional stylelint cache
 68 | .stylelintcache
 69 | 
 70 | # Microbundle cache
 71 | .rpt2_cache/
 72 | .rts2_cache_cjs/
 73 | .rts2_cache_es/
 74 | .rts2_cache_umd/
 75 | 
 76 | # Optional REPL history
 77 | .node_repl_history
 78 | 
 79 | # Output of 'npm pack'
 80 | *.tgz
 81 | 
 82 | # Yarn Integrity file
 83 | .yarn-integrity
 84 | 
 85 | # dotenv environment variable files
 86 | .env
 87 | .env.development.local
 88 | .env.test.local
 89 | .env.production.local
 90 | .env.local
 91 | 
 92 | # parcel-bundler cache (https://parceljs.org/)
 93 | .cache
 94 | .parcel-cache
 95 | 
 96 | # Next.js build output
 97 | .next
 98 | out
 99 | 
100 | # Nuxt.js build / generate output
101 | .nuxt
102 | dist
103 | 
104 | # Gatsby files
105 | .cache/
106 | # Comment in the public line in if your project uses Gatsby and not Next.js
107 | # https://nextjs.org/blog/next-9-1#public-directory-support
108 | # public
109 | 
110 | # vuepress build output
111 | .vuepress/dist
112 | 
113 | # vuepress v2.x temp and cache directory
114 | .temp
115 | 
116 | # Docusaurus cache and generated files
117 | .docusaurus
118 | 
119 | # Serverless directories
120 | .serverless/
121 | 
122 | # FuseBox cache
123 | .fusebox/
124 | 
125 | # DynamoDB Local files
126 | .dynamodb/
127 | 
128 | # TernJS port file
129 | .tern-port
130 | 
131 | # Stores VSCode versions used for testing VSCode extensions
132 | .vscode-test
133 | 
134 | # yarn v2
135 | .yarn/cache
136 | .yarn/unplugged
137 | .yarn/build-state.yml
138 | .yarn/install-state.gz
139 | .pnp.*
140 | 
141 | ### Node Patch ###
142 | # Serverless Webpack directories
143 | .webpack/
144 | 
145 | # Optional stylelint cache
146 | 
147 | # SvelteKit build / generate output
148 | .svelte-kit
149 | 
150 | # End of https://www.toptal.com/developers/gitignore/api/node


--------------------------------------------------------------------------------
/.prettierrc:
--------------------------------------------------------------------------------
1 | {
2 |   "tabWidth": 2,
3 |   "useTabs": false,
4 |   "singleQuote": true,
5 |   "proseWrap": "always"
6 | }


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 marv
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "scripts": {
 3 |     "download": "node src/downloadLatest.js",
 4 |     "buildTermDict": "node src/convertToTermDictionary.js",
 5 |     "buildFreq": "node src/convertToFrequencyDictionary.js",
 6 |     "buildHonziDict": "node src/convertToHonziDictionary.js",
 7 |     "test": "ava"
 8 |   },
 9 |   "dependencies": {
10 |     "@gerhobbelt/xregexp": "^4.4.0-32",
11 |     "axios": "^1.6.7",
12 |     "csv-parser": "^3.0.0",
13 |     "is-cjk-hanzi": "^1.0.0",
14 |     "jsdom": "^23.0.1",
15 |     "sharp": "^0.33.2",
16 |     "yomichan-dict-builder": "^2.9.0",
17 |     "zlib": "^1.0.5"
18 |   },
19 |   "type": "module",
20 |   "devDependencies": {
21 |     "ava": "^6.0.1"
22 |   },
23 |   "version": "1.0.0"
24 | }
25 | 


--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
 1 | # Words.hk for Yomitan
 2 | 
 3 | [![](https://img.shields.io/github/v/tag/marvnc/wordshk-yomitan?style=for-the-badge&label=Last%20Release)](https://github.com/MarvNC/wordshk-yomitan/releases/latest)
 4 | 
 5 | A conversion of the [words.hk](https://words.hk) dictionary for
 6 | [Yomitan](https://github.com/themoeway/yomitan) (formerly Yomichan). The
 7 | words.hk dictionary data is fetched from
 8 | [words.hk](https://words.hk/faiman/analysis/), built, then released
 9 | automatically every day.
10 | 
11 | Built using
12 | [yomichan-dict-builder](https://github.com/MarvNC/yomichan-dict-builder). For
13 | more Yomitan dictionaries and tools, see
14 | [Yomichan Dictionaries](https://github.com/MarvNC/yomichan-dictionaries).
15 | 
16 | ## Download
17 | 
18 | - [Words.hk for Yomitan](https://github.com/MarvNC/wordshk-yomitan/releases/latest)
19 | - [Words.hk 漢字 for Yomitan](https://github.com/MarvNC/wordshk-yomitan/releases/latest)
20 | - [Words.hk Frequency](https://github.com/MarvNC/wordshk-yomitan/releases/download/2024-09-17/YUE.Freq.Words.hk.Frequency.zip)
21 | 
22 | ## Screenshots
23 | 
24 | | ![chrome_廣東話_-_廣東話解釋__粵典_-_Google_Chrome_2024-02-03_22-57-37](https://github.com/MarvNC/wordshk-yomitan/assets/17340496/83eacfc1-6e31-453c-91c2-a8dac3be0bc4) | ![chrome_老虎_-_廣東話解釋__粵典_-_Google_Chrome_2024-02-03_22-58-13](https://github.com/MarvNC/wordshk-yomitan/assets/17340496/e882daa8-6fc4-491d-930e-ca9a0a081193) |
25 | | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
26 | | ![chrome_全脂_-_廣東話解釋__粵典_-_Google_Chrome_2024-02-03_22-58-35](https://github.com/MarvNC/wordshk-yomitan/assets/17340496/51fa78ba-c882-4f8c-b159-57a86f08e74b)   | ![chrome_講_-_廣東話解釋__粵典_-_Google_Chrome_2024-02-03_22-58-48](https://github.com/MarvNC/wordshk-yomitan/assets/17340496/233798e0-2363-48c4-9c11-6665e6262ef2)   |
27 | | ![chrome_Yomitan_Settings_-_Google_Chrome_2024-02-10_20-54-43](https://github.com/MarvNC/wordshk-yomitan/assets/17340496/57190a49-baaa-4313-87c7-9e8252daf2ae)          | ![chrome_Yomitan_Settings_-_Google_Chrome_2024-02-10_20-53-17](https://github.com/MarvNC/wordshk-yomitan/assets/17340496/4f6b9654-eb5d-4187-8d8d-56f4a10dfcf6)        |
28 | 
29 | ## Usage
30 | 
31 | Simply download the dictionary and import it into Yomitan. For more detailed
32 | instructions, please see the
33 | [Yomitan documentation](https://github.com/themoeway/yomitan).
34 | 
35 | ## Attribution/License
36 | 
37 | The code in this repository is licensed under the MIT license.
38 | 
39 | This Yomitan dictionary is built from the free data provided by words.hk and is
40 | licensed under the same Non-Commercial Open Data License 1.0 as
41 | [words.hk](https://words.hk/base/hoifong/).
42 | 
43 | I took a lot of inspiration from (and copied design ideas and styling from)
44 | [Stephenmk's Jitendex](https://github.com/stephenmk/Jitendex) in designing this
45 | dictionary.
46 | 


--------------------------------------------------------------------------------
/src/constants.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @type {Record<Language, { name: string, shortName: string, langCode: string }>}
 3 |  */
 4 | const LANGUAGES_DATA = {
 5 |   yue: {
 6 |     name: '廣東話',
 7 |     shortName: '粵',
 8 |     langCode: 'yue',
 9 |   },
10 |   eng: {
11 |     name: '英文',
12 |     shortName: '英',
13 |     langCode: 'en',
14 |   },
15 |   zho: {
16 |     name: '中文',
17 |     shortName: '中',
18 |     langCode: 'zh-Hant',
19 |   },
20 |   jpn: {
21 |     name: '日文',
22 |     shortName: '日',
23 |     langCode: 'ja',
24 |   },
25 |   kor: {
26 |     name: '韓文',
27 |     shortName: '韓',
28 |     langCode: 'ko',
29 |   },
30 |   vie: {
31 |     name: '越南文',
32 |     shortName: '越',
33 |     langCode: 'vi',
34 |   },
35 |   lzh: {
36 |     name: '文言文',
37 |     shortName: '文',
38 |     langCode: 'zh-Hant',
39 |   },
40 |   por: {
41 |     name: '葡萄牙文',
42 |     shortName: '葡',
43 |     langCode: 'pt',
44 |   },
45 |   deu: {
46 |     name: '德文',
47 |     shortName: '德',
48 |     langCode: 'de',
49 |   },
50 |   fra: {
51 |     name: '法文',
52 |     shortName: '法',
53 |     langCode: 'fr',
54 |   },
55 |   mnc: {
56 |     name: '滿文',
57 |     shortName: '滿',
58 |     langCode: 'mnc',
59 |   },
60 |   lat: {
61 |     name: '拉丁文',
62 |     shortName: '拉',
63 |     langCode: 'la',
64 |   },
65 |   tib: {
66 |     name: '藏文',
67 |     shortName: '藏',
68 |     langCode: 'bo',
69 |   },
70 |   量詞: {
71 |     name: '量詞',
72 |     shortName: '量詞',
73 |     langCode: '',
74 |   },
75 | };
76 | 
77 | const IMAGE_FOLDER = 'images';
78 | const COMPRESSED_IMAGES_FOLDER = './compressedImages';
79 | const IMAGE_RESIZE_WIDTH = 400;
80 | 
81 | export {
82 |   LANGUAGES_DATA,
83 |   IMAGE_FOLDER,
84 |   COMPRESSED_IMAGES_FOLDER,
85 |   IMAGE_RESIZE_WIDTH,
86 | };
87 | export const dataFolder = './csvs';
88 | export const exportDirectory = './dist';
89 | 
90 | export const TERM_INDEX_FILE = 'term_index.json';
91 | export const HONZI_INDEX_FILE = 'honzi_index.json';


--------------------------------------------------------------------------------
/src/convertToFrequencyDictionary.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Requires the jsons downloaded from https://words.hk/faiman/analysis/
 3 |  * to be in the freqjsons directory
 4 |  */
 5 | import fs from 'fs';
 6 | import path from 'path';
 7 | import { Dictionary, DictionaryIndex } from 'yomichan-dict-builder';
 8 | const freqJsonsDir = 'freqjsons';
 9 | const charCountJson = 'charcount.json';
10 | const existingWordCountJson = 'existingwordcount.json';
11 | 
12 | (async () => {
13 |   const freqJsons = fs.readdirSync(freqJsonsDir);
14 |   const charCountData = JSON.parse(
15 |     fs.readFileSync(path.join(freqJsonsDir, charCountJson)).toString()
16 |   );
17 |   const existingWordCountData = JSON.parse(
18 |     fs.readFileSync(path.join(freqJsonsDir, existingWordCountJson)).toString()
19 |   );
20 |   console.log(`Read ${freqJsons.length} files from ${freqJsonsDir}`);
21 |   console.log(
22 |     `Read ${Object.keys(charCountData).length} characters from ${charCountJson}`
23 |   );
24 |   console.log(
25 |     `Read ${
26 |       Object.keys(existingWordCountData).length
27 |     } words from ${existingWordCountJson}`
28 |   );
29 | 
30 |   const dictionary = new Dictionary({
31 |     fileName: 'Words.hk Frequency.zip',
32 |   });
33 |   const dictionaryIndex = new DictionaryIndex()
34 |     .setAuthor('Marv')
35 |     .setAttribution(
36 |       `Words.hk & contributers (https://words.hk)
37 |   See license at https://words.hk/base/hoifong/`
38 |     )
39 |     .setUrl('https://github.com/MarvNC/wordshk-yomitan')
40 |     .setDescription(
41 |       `Converted from the free Words.hk dictionary found at https://words.hk/.
42 |     Converted using https://github.com/MarvNC/yomichan-dict-builder`
43 |     )
44 |     .setTitle(`Words.hk Frequency`)
45 |     .setRevision(`1.0`);
46 |   await dictionary.setIndex(dictionaryIndex.build());
47 | 
48 |   // Add characters to kanji meta
49 |   const sortedCharCountData = Object.entries(charCountData).sort(
50 |     ([, a], [, b]) => b - a
51 |   );
52 |   for (let i = 0; i < sortedCharCountData.length; i++) {
53 |     const [char, occurrences] = sortedCharCountData[i];
54 |     await dictionary.addKanjiMeta([
55 |       char,
56 |       'freq',
57 |       {
58 |         displayValue: `${i + 1} (${occurrences})`,
59 |         value: i + 1,
60 |       },
61 |     ]);
62 |   }
63 | 
64 |   // Add words to dictionary
65 |   const sortedExistingWordCountData = Object.entries(
66 |     existingWordCountData
67 |   ).sort(([, a], [, b]) => b - a);
68 |   for (let i = 0; i < sortedExistingWordCountData.length; i++) {
69 |     const [word, occurrences] = sortedExistingWordCountData[i];
70 |     await dictionary.addTermMeta([
71 |       word,
72 |       'freq',
73 |       {
74 |         displayValue: `${i + 1} (${occurrences})`,
75 |         value: i + 1,
76 |       },
77 |     ]);
78 |   }
79 | 
80 |   await dictionary.export('dist');
81 |   console.log(`Exported dictionary to dist.`);
82 | })();
83 | 


--------------------------------------------------------------------------------
/src/convertToHonziDictionary.js:
--------------------------------------------------------------------------------
 1 | import fs from 'fs/promises';
 2 | import { Dictionary, DictionaryIndex, KanjiEntry } from 'yomichan-dict-builder';
 3 | import { getVersion } from './util/getVersion.js';
 4 | import { dataFolder, exportDirectory, HONZI_INDEX_FILE } from './constants.js';
 5 | import { readAndParseCSVs } from './util/readAndParseCSVs.js';
 6 | import { isSingleCJKHanzi } from 'is-cjk-hanzi';
 7 | 
 8 | (async () => {
 9 |   const tagName = process.argv[2] ?? 'latest';
10 | 
11 |   const { dictionaryEntries, dateString } = await readAndParseCSVs(dataFolder);
12 | 
13 |   /** @type {`${string}.zip`} */
14 |   const honziDictionaryFilename = `Words.hk.Honzi.${dateString}.zip`;
15 |   const dictionary = new Dictionary({
16 |     fileName: honziDictionaryFilename,
17 |   });
18 | 
19 |   const dictionaryIndex = new DictionaryIndex()
20 |     .setAuthor('Marv')
21 |     .setAttribution(
22 |       `Words.hk & contributers (https://words.hk)
23 |     See license at https://words.hk/base/hoifong/`
24 |     )
25 |     .setUrl('https://github.com/MarvNC/wordshk-yomitan')
26 |     .setDescription(
27 |       `Converted from the free Words.hk dictionary found at https://words.hk/.
28 |       Converted using https://github.com/MarvNC/yomichan-dict-builder`
29 |     )
30 |     .setTitle(`Words.hk 粵典 漢字 [${dateString}]`)
31 |     .setRevision(dateString)
32 |     .setIsUpdatable(true)
33 |     .setIndexUrl(
34 |       `https://github.com/MarvNC/wordshk-yomitan/releases/latest/download/${HONZI_INDEX_FILE}`
35 |     )
36 |     .setDownloadUrl(
37 |       `https://github.com/MarvNC/wordshk-yomitan/releases/download/${tagName}/${honziDictionaryFilename}`
38 |     );
39 |   await dictionary.setIndex(dictionaryIndex.build());
40 | 
41 |   // save index file to exportDirectory
42 |   await dictionaryIndex.export(exportDirectory, HONZI_INDEX_FILE);
43 | 
44 |   for (const entry of dictionaryEntries) {
45 |     addHonziEntry(dictionary, entry);
46 |   }
47 |   console.log(`Finished adding entries to dictionary.`);
48 | 
49 |   const stats = await dictionary.export(exportDirectory);
50 |   console.log(`Exported honzi dictionary to ${exportDirectory}.`);
51 |   console.log(`Added ${stats.kanjiCount} honzi entries.`);
52 | })();
53 | 
54 | /**
55 |  *
56 |  * @param {Dictionary} dictionary
57 |  * @param {DictionaryEntry} entry
58 |  */
59 | function addHonziEntry(dictionary, entry) {
60 |   for (const headword of entry.headwords) {
61 |     if (!isSingleCJKHanzi(headword.text)) {
62 |       continue;
63 |     }
64 |     const kanjiEntry = new KanjiEntry(headword.text).setKunyomi(
65 |       headword.readings.join(' ')
66 |     );
67 |     for (const sense of entry.senses) {
68 |       for (const explanationText of Object.values(sense.explanation)) {
69 |         kanjiEntry.addMeanings(explanationText);
70 |       }
71 |     }
72 |     dictionary.addKanji(kanjiEntry.build());
73 |   }
74 | }
75 | 


--------------------------------------------------------------------------------
/src/convertToTermDictionary.js:
--------------------------------------------------------------------------------
 1 | import fs from 'fs/promises';
 2 | import { Dictionary, DictionaryIndex } from 'yomichan-dict-builder';
 3 | 
 4 | import { convertEntryToYomitanTerms } from './util/yomitan/convertEntryToYomitanTerms.js';
 5 | import { findLabelValues } from './util/entryParse/parseLabels.js';
 6 | import { addYomitanTags } from './util/addYomitanTags.js';
 7 | import { getAllImageURLs } from './util/entryParse/findImages.js';
 8 | import { downloadImages } from './util/imageHandler/downloadImages.js';
 9 | import { addYomitanImages } from './util/addYomitanImages.js';
10 | import {
11 |   IMAGE_FOLDER,
12 |   COMPRESSED_IMAGES_FOLDER,
13 |   IMAGE_RESIZE_WIDTH,
14 |   TERM_INDEX_FILE,
15 | } from './constants.js';
16 | import { compressImages } from './util/imageHandler/compressImages.js';
17 | import { dataFolder, exportDirectory } from './constants.js';
18 | import { getVersion } from './util/getVersion.js';
19 | import { readAndParseCSVs } from './util/readAndParseCSVs.js';
20 | 
21 | (async () => {
22 |   const tagName = process.argv[2] ?? 'latest';
23 | 
24 |   const { dictionaryEntries, dateString } = await readAndParseCSVs(dataFolder);
25 | 
26 |   const uniqueLabels = findLabelValues(dictionaryEntries);
27 | 
28 |   const imageURLs = getAllImageURLs(dictionaryEntries);
29 | 
30 |   await downloadImages(imageURLs);
31 | 
32 |   const compressImagesPromise = compressImages(
33 |     IMAGE_FOLDER,
34 |     COMPRESSED_IMAGES_FOLDER,
35 |     IMAGE_RESIZE_WIDTH
36 |   );
37 | 
38 |   /** @type {`${string}.zip`} */
39 |   const termDictionaryFileName = `Words.hk.${dateString}.zip`;
40 |   const dictionary = new Dictionary({
41 |     fileName: termDictionaryFileName,
42 |   });
43 | 
44 |   const dictionaryIndex = new DictionaryIndex()
45 |     .setAuthor('Marv')
46 |     .setAttribution(
47 |       `Words.hk & contributers (https://words.hk)
48 |     See license at https://words.hk/base/hoifong/`
49 |     )
50 |     .setUrl('https://github.com/MarvNC/wordshk-yomitan')
51 |     .setDescription(
52 |       `Converted from the free Words.hk dictionary found at https://words.hk/.
53 |       This export contains ${dictionaryEntries.length} entries.
54 |       Converted using https://github.com/MarvNC/yomichan-dict-builder`
55 |     )
56 |     .setTitle(`Words.hk 粵典 [${dateString}]`)
57 |     .setRevision(dateString)
58 |     .setIsUpdatable(true)
59 |     .setIndexUrl(
60 |       `https://github.com/MarvNC/wordshk-yomitan/releases/latest/download/${TERM_INDEX_FILE}`
61 |     )
62 |     .setDownloadUrl(
63 |       `https://github.com/MarvNC/wordshk-yomitan/releases/${tagName}/download/${termDictionaryFileName}`
64 |     );
65 |   await dictionary.setIndex(dictionaryIndex.build());
66 | 
67 |   // save index file to exportDirectory
68 |   await dictionaryIndex.export(exportDirectory, TERM_INDEX_FILE);
69 | 
70 |   for (const entry of dictionaryEntries) {
71 |     const terms = convertEntryToYomitanTerms(entry);
72 |     for (const term of terms) {
73 |       await dictionary.addTerm(term);
74 |     }
75 |   }
76 |   console.log(`Finished adding entries to dictionary.`);
77 | 
78 |   await addYomitanTags(dictionary, uniqueLabels);
79 | 
80 |   console.log(`Adding images to dictionary.`);
81 |   // Wait for images to be compressed before adding
82 |   await compressImagesPromise;
83 |   await addYomitanImages(dictionary, COMPRESSED_IMAGES_FOLDER);
84 | 
85 |   await dictionary.export(exportDirectory);
86 |   console.log(`Exported dictionary to ${exportDirectory}.`);
87 | })();
88 | 


--------------------------------------------------------------------------------
/src/downloadLatest.js:
--------------------------------------------------------------------------------
  1 | import { JSDOM } from 'jsdom';
  2 | import fs from 'fs';
  3 | import path from 'path';
  4 | import zlib from 'zlib';
  5 | import axios from 'axios';
  6 | 
  7 | const domain = 'https://words.hk';
  8 | const requestURL = `${domain}/faiman/request_data/`;
  9 | const csvDir = 'csvs';
 10 | 
 11 | (async function downloadLatest() {
 12 |   const dom = await JSDOM.fromURL(requestURL);
 13 |   const { document } = dom.window;
 14 |   const csrfTokenInput = document.querySelector(
 15 |     'input[name=csrfmiddlewaretoken]'
 16 |   );
 17 |   if (!csrfTokenInput) {
 18 |     throw new Error('No csrf token found');
 19 |   }
 20 |   const csrfToken = /** @type{HTMLInputElement} */ (csrfTokenInput).value;
 21 |   const myHeaders = new Headers();
 22 |   myHeaders.append('Cookie', `csrftoken=${csrfToken}`);
 23 |   myHeaders.append('Origin', domain);
 24 |   myHeaders.append('Referer', requestURL);
 25 |   const urlencoded = new URLSearchParams();
 26 |   urlencoded.append('csrfmiddlewaretoken', csrfToken);
 27 | 
 28 |   /**
 29 |    * @type {RequestInit}
 30 |    */
 31 |   const requestOptions = {
 32 |     method: 'POST',
 33 |     headers: myHeaders,
 34 |     body: urlencoded,
 35 |     redirect: 'follow',
 36 |   };
 37 | 
 38 |   const response = await fetch(requestURL, requestOptions);
 39 |   const text = await response.text();
 40 |   if (!response.ok) {
 41 |     throw new Error(`Response: ${response.status} ${response.statusText}`);
 42 |   }
 43 |   console.log('Request success, getting csv links...');
 44 |   const csvLinks = await getCSVLinks(new JSDOM(text));
 45 | 
 46 |   await downloadCSVs(csvLinks);
 47 |   console.log('Download complete.');
 48 | })();
 49 | 
 50 | /**
 51 |  *
 52 |  * @param {JSDOM} dom
 53 |  * @returns {Promise<string[]>} The URLs of the CSVs
 54 |  */
 55 | async function getCSVLinks(dom) {
 56 |   const { document } = dom.window;
 57 | 
 58 |   const csvLinkAnchors = /** @type {HTMLAnchorElement[]} */ ([
 59 |     ...document.querySelectorAll("a[href$='.csv.gz']"),
 60 |   ]);
 61 |   if (csvLinkAnchors.length !== 2) {
 62 |     throw new Error('Expected 2 csv links');
 63 |   }
 64 | 
 65 |   console.log('Found two csv links.');
 66 | 
 67 |   const csvLinks = csvLinkAnchors.map((a) => `${domain}${a.href}`);
 68 | 
 69 |   return csvLinks;
 70 | }
 71 | 
 72 | /**
 73 |  * Download the CSVs from the given URLs
 74 |  * @param {string[]} csvLinks
 75 |  */
 76 | async function downloadCSVs(csvLinks) {
 77 |   // Create the directory if it doesn't exist
 78 |   if (!fs.existsSync(csvDir)) {
 79 |     fs.mkdirSync(csvDir);
 80 |   }
 81 | 
 82 |   // Delete contents of the directory
 83 |   fs.readdirSync(csvDir).forEach((file) => {
 84 |     fs.unlinkSync(path.join(csvDir, file));
 85 |   });
 86 | 
 87 |   // Process each URL
 88 |   for (const url of csvLinks) {
 89 |     // Extract filename from URL
 90 |     const filename = path.basename(url);
 91 | 
 92 |     const fullPath = path.join(csvDir, filename);
 93 | 
 94 |     console.log(`Downloading ${filename} from ${url}...`);
 95 | 
 96 |     // Download the file from the URL to csvs directory
 97 |     const response = await axios.get(url, {
 98 |       responseType: 'arraybuffer',
 99 |     });
100 |     const buffer = Buffer.from(response.data);
101 | 
102 |     fs.writeFileSync(fullPath, buffer);
103 | 
104 |     // Unzip the downloaded file
105 |     console.log(`Unzipping ${filename}...`);
106 |     const gzip = zlib.createGunzip();
107 |     const source = fs.createReadStream(fullPath);
108 |     const destination = fs.createWriteStream(
109 |       path.join(csvDir, filename.replace('.gz', ''))
110 |     );
111 |     source
112 |       .pipe(gzip)
113 |       .pipe(destination)
114 |       .on('finish', function () {
115 |         // Delete the .gz file
116 |         fs.unlinkSync(fullPath);
117 |       });
118 |   }
119 | }
120 | 


--------------------------------------------------------------------------------
/src/test/parseCantoneseReadings.test.js:
--------------------------------------------------------------------------------
  1 | import test from 'ava';
  2 | 
  3 | import { parseCantoneseReadings } from '../util/textHandling/parseCantoneseReadings.js';
  4 | 
  5 | /**
  6 |  * @typedef {Object} TestCase
  7 |  * @property {string} text
  8 |  * @property {string} reading
  9 |  * @property {TextReadingPair[]} [expected]
 10 |  * @property {boolean} [shouldThrow]
 11 |  */
 12 | 
 13 | /**
 14 |  * @type {TestCase[]}
 15 |  */
 16 | const testCases = [
 17 |   {
 18 |     text: '福州',
 19 |     reading: 'fuk1 zau1',
 20 |     expected: [
 21 |       { text: '福', reading: 'fuk1' },
 22 |       { text: '州', reading: 'zau1' },
 23 |     ],
 24 |   },
 25 |   {
 26 |     text: 'bu你阿麼',
 27 |     reading: 'bu4 ni5 aa3 mo1',
 28 |     expected: [
 29 |       { text: 'bu', reading: 'bu4' },
 30 |       { text: '你', reading: 'ni5' },
 31 |       { text: '阿', reading: 'aa3' },
 32 |       { text: '麼', reading: 'mo1' },
 33 |     ],
 34 |   },
 35 |   {
 36 |     text: '你get唔get到我講咩？',
 37 |     reading: 'nei5 get1 m4 get1 dou2 ngo5 gong2 me1?',
 38 |     expected: [
 39 |       { text: '你', reading: 'nei5' },
 40 |       { text: 'get', reading: 'get1' },
 41 |       { text: '唔', reading: 'm4' },
 42 |       { text: 'get', reading: 'get1' },
 43 |       { text: '到', reading: 'dou2' },
 44 |       { text: '我', reading: 'ngo5' },
 45 |       { text: '講', reading: 'gong2' },
 46 |       { text: '咩', reading: 'me1' },
 47 |       { text: '？', reading: '' },
 48 |     ],
 49 |   },
 50 |   {
 51 |     text: '專業運動員成日斷韌帶。',
 52 |     reading: 'zyun1 jip6 wan6 dung6 jyun4 seng4 jat6 tyun5 jan6 daai2.',
 53 |     expected: [
 54 |       { text: '專', reading: 'zyun1' },
 55 |       { text: '業', reading: 'jip6' },
 56 |       { text: '運', reading: 'wan6' },
 57 |       { text: '動', reading: 'dung6' },
 58 |       { text: '員', reading: 'jyun4' },
 59 |       { text: '成', reading: 'seng4' },
 60 |       { text: '日', reading: 'jat6' },
 61 |       { text: '斷', reading: 'tyun5' },
 62 |       { text: '韌', reading: 'jan6' },
 63 |       { text: '帶', reading: 'daai2' },
 64 |       { text: '。', reading: '' },
 65 |     ],
 66 |   },
 67 |   {
 68 |     text: '佢考咗車牌六年，終於成功嘞。',
 69 |     reading: 'keoi5 haau2 zo2 ce1 paai4 luk6 nin4 zung1 jyu1 sing4 gung1 laak3',
 70 |     expected: [
 71 |       { text: '佢', reading: 'keoi5' },
 72 |       { text: '考', reading: 'haau2' },
 73 |       { text: '咗', reading: 'zo2' },
 74 |       { text: '車', reading: 'ce1' },
 75 |       { text: '牌', reading: 'paai4' },
 76 |       { text: '六', reading: 'luk6' },
 77 |       { text: '年', reading: 'nin4' },
 78 |       { text: '，', reading: '' },
 79 |       { text: '終', reading: 'zung1' },
 80 |       { text: '於', reading: 'jyu1' },
 81 |       { text: '成', reading: 'sing4' },
 82 |       { text: '功', reading: 'gung1' },
 83 |       { text: '嘞', reading: 'laak3' },
 84 |       { text: '。', reading: '' },
 85 |     ],
 86 |   },
 87 |   {
 88 |     text: '嗰個男仔喺我手臂上搣咗一下。',
 89 |     reading: 'go2 go3 naam4 zai2 hai2 ngo5 sau2 bei3 soeng6 mit1 zo2 jat1 haa5',
 90 |     expected: [
 91 |       { text: '嗰', reading: 'go2' },
 92 |       { text: '個', reading: 'go3' },
 93 |       { text: '男', reading: 'naam4' },
 94 |       { text: '仔', reading: 'zai2' },
 95 |       { text: '喺', reading: 'hai2' },
 96 |       { text: '我', reading: 'ngo5' },
 97 |       { text: '手', reading: 'sau2' },
 98 |       { text: '臂', reading: 'bei3' },
 99 |       { text: '上', reading: 'soeng6' },
100 |       { text: '搣', reading: 'mit1' },
101 |       { text: '咗', reading: 'zo2' },
102 |       { text: '一', reading: 'jat1' },
103 |       { text: '下', reading: 'haa5' },
104 |       { text: '。', reading: '' },
105 |     ],
106 |   },
107 |   {
108 |     text: '「乜乜M」嗰啲巴士，一定經地鐵站㗎。',
109 |     reading:
110 |       'mat1 mat1 em1 go2 di1 baa1 si2, jat1 ding6 ging1 dei6 tit3 zaam6 gaa3.',
111 |     expected: [
112 |       { text: '「', reading: '' },
113 |       { text: '乜', reading: 'mat1' },
114 |       { text: '乜', reading: 'mat1' },
115 |       { text: 'M', reading: 'em1' },
116 |       { text: '」', reading: '' },
117 |       { text: '嗰', reading: 'go2' },
118 |       { text: '啲', reading: 'di1' },
119 |       { text: '巴', reading: 'baa1' },
120 |       { text: '士', reading: 'si2' },
121 |       { text: '，', reading: '' },
122 |       { text: '一', reading: 'jat1' },
123 |       { text: '定', reading: 'ding6' },
124 |       { text: '經', reading: 'ging1' },
125 |       { text: '地', reading: 'dei6' },
126 |       { text: '鐵', reading: 'tit3' },
127 |       { text: '站', reading: 'zaam6' },
128 |       { text: '㗎', reading: 'gaa3' },
129 |       { text: '。', reading: '' },
130 |     ],
131 |   },
132 |   {
133 |     text: '𨂾過條溪',
134 |     reading: 'laam3 gwo3 tiu4 kai1',
135 |     expected: [
136 |       { text: '𨂾', reading: 'laam3' },
137 |       { text: '過', reading: 'gwo3' },
138 |       { text: '條', reading: 'tiu4' },
139 |       { text: '溪', reading: 'kai1' },
140 |     ],
141 |   },
142 |   {
143 |     text: '呢個商場係好多居民返屋企嘅必經之路，有好有唔好囉。',
144 |     reading:
145 |       'nei1 go3 soeng1 coeng4 hai6 hou2 do1 geoi1 man4 faan1 uk1 kei2 ge3 bit1ging1 zi1 lou6, jau5 hou2 jau5 m4 hou2 lo1.',
146 |     expected: [
147 |       { text: '呢', reading: 'nei1' },
148 |       { text: '個', reading: 'go3' },
149 |       { text: '商', reading: 'soeng1' },
150 |       { text: '場', reading: 'coeng4' },
151 |       { text: '係', reading: 'hai6' },
152 |       { text: '好', reading: 'hou2' },
153 |       { text: '多', reading: 'do1' },
154 |       { text: '居', reading: 'geoi1' },
155 |       { text: '民', reading: 'man4' },
156 |       { text: '返', reading: 'faan1' },
157 |       { text: '屋', reading: 'uk1' },
158 |       { text: '企', reading: 'kei2' },
159 |       { text: '嘅', reading: 'ge3' },
160 |       { text: '必', reading: 'bit1' },
161 |       { text: '經', reading: 'ging1' },
162 |       { text: '之', reading: 'zi1' },
163 |       { text: '路', reading: 'lou6' },
164 |       { text: '，', reading: '' },
165 |       { text: '有', reading: 'jau5' },
166 |       { text: '好', reading: 'hou2' },
167 |       { text: '有', reading: 'jau5' },
168 |       { text: '唔', reading: 'm4' },
169 |       { text: '好', reading: 'hou2' },
170 |       { text: '囉', reading: 'lo1' },
171 |       { text: '。', reading: '' },
172 |     ],
173 |   },
174 |   {
175 |     text: '今晚演出嘅粵劇劇目係《白兔會》。',
176 |     reading:
177 |       'gam1 maan5 jin2 ceot1 ge3 jyut6 kek6 kek6 muk6 hai6 baak6 tou3 wui6.',
178 |     expected: [
179 |       { text: '今', reading: 'gam1' },
180 |       { text: '晚', reading: 'maan5' },
181 |       { text: '演', reading: 'jin2' },
182 |       { text: '出', reading: 'ceot1' },
183 |       { text: '嘅', reading: 'ge3' },
184 |       { text: '粵', reading: 'jyut6' },
185 |       { text: '劇', reading: 'kek6' },
186 |       { text: '劇', reading: 'kek6' },
187 |       { text: '目', reading: 'muk6' },
188 |       { text: '係', reading: 'hai6' },
189 |       { text: '《', reading: '' },
190 |       { text: '白', reading: 'baak6' },
191 |       { text: '兔', reading: 'tou3' },
192 |       { text: '會', reading: 'wui6' },
193 |       { text: '》', reading: '' },
194 |       { text: '。', reading: '' },
195 |     ],
196 |   },
197 |   {
198 |     text: 'Panda好pandai踢呢',
199 |     reading: 'pan3 daa1 hou2 baan3 naai1 tek3 le3',
200 |     shouldThrow: true,
201 |   },
202 | ];
203 | 
// Register one AVA test per fixture in `testCases`.
// Fixtures flagged with `shouldThrow` only assert that parsing throws; all
// others compare the parsed text/reading pairs against `expected`.
for (const { text, reading, expected, shouldThrow } of testCases) {
  // The "(should throw)" marker is appended as a suffix so the title no longer
  // starts with a stray space or runs straight into the function name.
  const title = `parseCantoneseReadings: ${text} ${reading}${
    shouldThrow ? ' (should throw)' : ''
  }`;
  test(title, (t) => {
    if (shouldThrow) {
      t.throws(() => parseCantoneseReadings(text, reading));
      return;
    }
    t.deepEqual(parseCantoneseReadings(text, reading), expected);
  });
}
217 | 


--------------------------------------------------------------------------------
/src/test/parseEntry.test.js:
--------------------------------------------------------------------------------
  1 | import test from 'ava';
  2 | import path from 'path';
  3 | 
  4 | import { parseCSVEntries } from '../util/csv/parseCsvEntriesToJson.js';
  5 | 
// CSV fixture handed to parseCSVEntries in the `before` hook below.
// NOTE(review): the path is relative, so it presumably assumes the tests are
// started from the repository root — confirm against the test runner config.
const testCsvFile = 'src/test/testdata.csv';
  7 | 
  8 | const expectedEntries = [
  9 |   {
 10 |     id: 101613,
 11 |     headwords: [
 12 |       {
 13 |         text: '大電',
 14 |         readings: ['daai6 din6'],
 15 |       },
 16 |     ],
 17 |     tags: [
 18 |       {
 19 |         name: 'pos',
 20 |         value: '名詞',
 21 |       },
 22 |     ],
 23 |     senses: [
 24 |       {
 25 |         explanation: {
 26 |           yue: ['D電池（量詞：粒）'],
 27 |           eng: ['D cells battery'],
 28 |         },
 29 |         egs: [],
 30 |       },
 31 |     ],
 32 |   },
 33 |   {
 34 |     id: 92456,
 35 |     headwords: [
 36 |       {
 37 |         text: '發電廠',
 38 |         readings: ['faat3 din6 cong2'],
 39 |       },
 40 |     ],
 41 |     tags: [
 42 |       {
 43 |         name: 'pos',
 44 |         value: '名詞',
 45 |       },
 46 |     ],
 47 |     senses: [
 48 |       {
 49 |         explanation: {
 50 |           yue: ['產生#電力 嘅大型#建築物（量詞：間／座）'],
 51 |           eng: ['power plant'],
 52 |         },
 53 |         egs: [],
 54 |       },
 55 |     ],
 56 |   },
 57 |   {
 58 |     id: 82131,
 59 |     headwords: [
 60 |       {
 61 |         text: '排污',
 62 |         readings: ['paai4 wu1'],
 63 |       },
 64 |     ],
 65 |     tags: [
 66 |       {
 67 |         name: 'pos',
 68 |         value: '動詞',
 69 |       },
 70 |     ],
 71 |     senses: [
 72 |       {
 73 |         explanation: {
 74 |           yue: ['排走#污水'],
 75 |           eng: ['to drain away sewage'],
 76 |         },
 77 |         egs: [
 78 |           {
 79 |             yue: ['排污費 (paai4 wu1)'],
 80 |             eng: ['sewerage charge'],
 81 |           },
 82 |           {
 83 |             yue: ['排污系統 (paai4 wu1 hai6 tung2)'],
 84 |             eng: ['sewage system'],
 85 |           },
 86 |           {
 87 |             yue: ['排污設施 (paai4 wu1 cit3 si1)'],
 88 |             eng: ['sewage works'],
 89 |           },
 90 |           {
 91 |             yue: ['公共排污服務 (gung1 gung6 paai4 wu1 fuk6 mou6)'],
 92 |             eng: ['public sewage services'],
 93 |           },
 94 |           {
 95 |             yue: [
 96 |               '排污設備改善計劃 (paai4 wu1 cit3 bei6 goi2 sin6 gai3 waak6)',
 97 |             ],
 98 |             eng: ['sewerage improvement programme'],
 99 |           },
100 |           {
101 |             yue: [
102 |               '呢啲市區河道嘅設計以防洪及有效排污為主。 (ni1 di1 si5 keoi1 ho4 dou6 ge3 cit3 gai3 ji5 fong4 hung4 kap6 jau5 haau6 paai4 wu1 wai4 zyu2.)',
103 |             ],
104 |             eng: [
105 |               'These urban channels were designed for flood prevention and effective drainage.',
106 |             ],
107 |           },
108 |         ],
109 |       },
110 |     ],
111 |   },
112 |   {
113 |     id: 72252,
114 |     headwords: [
115 |       {
116 |         text: '揀選',
117 |         readings: ['gaan2 syun2'],
118 |       },
119 |     ],
120 |     tags: [
121 |       {
122 |         name: 'pos',
123 |         value: '動詞',
124 |       },
125 |       {
126 |         name: 'sim',
127 |         value: '挑選',
128 |       },
129 |       {
130 |         name: 'sim',
131 |         value: '揀',
132 |       },
133 |       {
134 |         name: 'sim',
135 |         value: '選',
136 |       },
137 |       {
138 |         name: 'sim',
139 |         value: '選擇',
140 |       },
141 |     ],
142 |     senses: [
143 |       {
144 |         explanation: {
145 |           yue: ['根據你嘅取向，喺兩樣嘢或以上當中，抽取一樣'],
146 |           eng: ['to select; to choose'],
147 |         },
148 |         egs: [
149 |           {
150 |             yue: [
151 |               '一個蠢，一個鈍，噉樣邊叫有得揀選？ (jat1 go3 ceon2, jat1 go3 deon6, gam2 joeng2 bin1 giu3 jau5 dak1 gaan2 syun2?)',
152 |             ],
153 |             eng: [
154 |               'This candidate is stupid and that is dumb. How can I choose among them?',
155 |             ],
156 |           },
157 |         ],
158 |       },
159 |     ],
160 |   },
161 |   {
162 |     id: 66987,
163 |     headwords: [
164 |       {
165 |         text: '背景',
166 |         readings: ['bui3 ging2'],
167 |       },
168 |     ],
169 |     tags: [
170 |       {
171 |         name: 'pos',
172 |         value: '名詞',
173 |       },
174 |     ],
175 |     senses: [
176 |       {
177 |         explanation: {
178 |           yue: ['喺舞台或者現實襯托主體嘅景物、佈景、環境'],
179 |           eng: ['background; setting'],
180 |         },
181 |         egs: [
182 |           {
183 |             yue: [
184 |               '呢張相嘅背景係一啲椰樹。 (ni1 zoeng1 soeng2 ge3 bui3 ging2 hai6 jat1 di1 je4 syu6.)',
185 |             ],
186 |             eng: ['The coconut trees form a background to this picture.'],
187 |           },
188 |           {
189 |             yue: [
190 |               '段片嘅背景音樂叫咩名？ (dyun6 pin2 ge3 bui3 ging2 jam1 ngok6 giu3 me1 meng2?)',
191 |             ],
192 |             eng: ['What is the title of the background music in the video?'],
193 |           },
194 |         ],
195 |       },
196 |       {
197 |         explanation: {
198 |           yue: [
199 |             '人嘅來歷或經歷，例如家庭、教育、工作等等，亦可以指佢哋所倚靠嘅人物或者勢力',
200 |           ],
201 |           eng: [
202 |             'the "background" of a person, especially their educational background, occupation, social/family connections, etc.',
203 |           ],
204 |         },
205 |         egs: [
206 |           {
207 |             yue: [
208 |               '不如揾人查下佢個背景，我覺得佢好有可疑。 (bat1 jyu4 wan2 jan4 caa4 haa5 keoi5 go3 bui3 ging2, ngo5 gok3 dak1 keoi5 hou2 jau5 ho2 ji4.)',
209 |             ],
210 |             eng: [
211 |               'Shall we find someone to look into his background? I think he is so suspicious.',
212 |             ],
213 |           },
214 |         ],
215 |       },
216 |     ],
217 |   },
218 |   {
219 |     id: 90185,
220 |     headwords: [
221 |       {
222 |         text: '天干地支',
223 |         readings: ['tin1 gon1 dei6 zi1'],
224 |       },
225 |     ],
226 |     tags: [
227 |       {
228 |         name: 'pos',
229 |         value: '名詞',
230 |       },
231 |       {
232 |         name: 'sim',
233 |         value: '干支',
234 |       },
235 |     ],
236 |     senses: [
237 |       {
238 |         explanation: {
239 |           yue: [
240 |             '「#天干」同「#地支」嘅合稱。十天干分別係「#甲#乙#丙#丁#戊#己#庚#辛#壬#癸」。 十二地支係：「#子#丑#寅#卯#辰#巳#午#未#申#酉#戌#亥」。 天干同地支組合就成為以「#甲子」為首嘅六十干支循環。\n\n干支循環通常用嚟計年份。天干亦可以獨立用嚟順序將物件命名，第一個叫「甲」、第二個叫「乙」，如此類推。用法類似西方嘅「A, B, C」 或 「α, β, γ」。中國傳統紀時間嘅方法係將一日分成十二個時辰，每一個時辰由一個地支表示，「子時」係半夜 (11pm 至 1am)，如此類推。',
241 |           ],
242 |           eng: [
243 |             'Literally "Heavenly Stems and Earthly Branches". It is a traditional Chinese system of counting. Heavenly Stems and Earthly Branches are collectively known as "Stem-Branch".\n\nThe 10 Heavenly Stems are 甲(gaap3) 乙(jyut6) 丙(bing2) 丁(ding1) 戊(mou6) 己(gei2) 庚(gang1) 辛(san1) 壬(jam4) 癸(gwai3).\n\nThe 12 Earthly Branches are 子(zi2) 丑(cau2) 寅(jan4) 卯(maau5) 辰(san4) 巳(zi6) 午(ng5) 未(mei6) 申(san1) 酉(jau5) 戌(seot1) 亥(hoi6). Each Heavenly Stem is paired with an Earthly Branch to form the "stem-branch" sexagenary (i.e. 60 element) cycle that starts with 甲子 (gaap3 zi2)\n\nThe sexagenary cycle is often used for counting years in the Chinese calendar. Heavenly Stems are also used independently to name things in a particular order -- the first is labeled "gaap3", the second "jyut6", the third "bing2", and so on. It is similar to how "A, B, C" and "α, β, γ" are used in western cultures. Earthly Branches are also traditionally used to denote time. One day is divided into twelve slots called Chinese-hours (#時辰), starting from 子時 (zi2 si4), which is 11pm to 1am.',
244 |           ],
245 |         },
246 |         egs: [
247 |           {
248 |             yue: ['乙等 / 乙級 (jyut6 dang2 / jyut6 kap1)'],
249 |             eng: ['B grade'],
250 |           },
251 |           {
252 |             yue: ['甲級戰犯 (gaap3 kap1 zin3 faan2)'],
253 |             eng: ['Class A war criminal'],
254 |           },
255 |           {
256 |             yue: ['戊戌變法 (mou6 seot1 bin3 faat3)'],
257 |             eng: [
258 |               "The Hundred Days' Reform of the Qing Dynasty (it is called 戊戌變法 because it occurred in the 戊戌 year)",
259 |             ],
260 |           },
261 |           {
262 |             yue: ['辛亥革命 (san1 hoi6 gaap3 ming6)'],
263 |             eng: ['The Xinhai Revolution (Pinyin romanization)'],
264 |           },
265 |           {
266 |             yue: ['子時 (zi2 si4)'],
267 |             eng: ['midnight'],
268 |           },
269 |           {
270 |             yue: ['午時 (ng5 si4)'],
271 |             eng: ['noon'],
272 |           },
273 |         ],
274 |       },
275 |     ],
276 |   },
277 |   {
278 |     id: 97033,
279 |     headwords: [
280 |       {
281 |         text: '着',
282 |         readings: ['zoek6'],
283 |       },
284 |       {
285 |         text: '著',
286 |         readings: ['zoek6'],
287 |       },
288 |     ],
289 |     tags: [
290 |       {
291 |         name: 'pos',
292 |         value: '詞綴',
293 |       },
294 |       {
295 |         name: 'label',
296 |         value: '書面語',
297 |       },
298 |     ],
299 |     senses: [
300 |       {
301 |         explanation: {
302 |           yue: ['表示動作、狀態進行緊、持續緊，類似「#住」、「#下」'],
303 |           eng: [
304 |             "to express that an action is in process and a state is prolonged; similar to '#住' zyu6 or '#下' haa5",
305 |           ],
306 |         },
307 |         egs: [
308 |           {
309 |             zho: ['痛並快樂着 (tung3 bing6 faai3 lok6 zoek6)'],
310 |             yue: ['痛住開心 (tung3 zyu6 hoi1 sam1)'],
311 |             eng: ['feeling painful and happy'],
312 |           },
313 |           {
314 |             zho: [
315 |               '走着走着就到了課室。 (zau2 zoek6 zau2 zoek6 zau6 dou3 liu5 fo3 sat1.)',
316 |             ],
317 |             yue: [
318 |               '行下行下就到咗班房。 (haang4 haa5 haang4 haa5 zau6 dou3 zo2 baan1 fong2.)',
319 |             ],
320 |             eng: ['Walking, (we) have arrived at the classroom.'],
321 |           },
322 |           {
323 |             zho: ['他們正説着話呢。 (taa1 mun4 zing3 syut3 zoek6 waa6 ne1.)'],
324 |             yue: ['佢哋講緊嘢啊。 (keoi5 dei6 gong2 gan2 je5 aa3.)'],
325 |             eng: ['They are talking.'],
326 |           },
327 |           {
328 |             zho: ['等着瞧。 (dang2 zoek6 ciu4.)'],
329 |             yue: ['睇下點。 (tai2 haa5 dim2.)'],
330 |             eng: ["(Let's) wait and see."],
331 |           },
332 |         ],
333 |       },
334 |       {
335 |         explanation: {
336 |           yue: ['動詞後綴，表示動作達到目的、有結果；類似「#到」（dou2）'],
337 |           eng: [
338 |             'verbal suffix to mean that the aim of an action has been achieved or its result has come out; similar to #到 dou2',
339 |           ],
340 |         },
341 |         egs: [
342 |           {
343 |             zho: ['你的錶我沒見着。 (nei5 dik1 biu1 ngo5 mut6 gin3 zoek6.)'],
344 |             yue: ['你隻錶我見唔到。 (nei5 zek3 biu1 ngo5 gin3 m4 dou2.)'],
345 |             eng: ['I have not found your watch.'],
346 |           },
347 |         ],
348 |       },
349 |       {
350 |         explanation: {
351 |           yue: ['喺句尾出現，表示祈使'],
352 |           eng: ['used at the end of a sentence to form an imperative'],
353 |         },
354 |         egs: [
355 |           {
356 |             zho: ['聽着。 (ting3 zoek6.)'],
357 |             yue: ['聽住。 (teng1 zyu6.)'],
358 |             eng: ['Listen.'],
359 |           },
360 |           {
361 |             zho: [
362 |               '你可好生給我應付着。 (nei5 ho2 hou2 sang1 kap1 ngo5 jing3 fu6 zoek6.)',
363 |             ],
364 |             yue: [
365 |               '你好好哋同我應付下。 (nei5 hou2 hou2 dei2 tung4 ngo5 jing3 fu6 haa5.)',
366 |               '你小心啲同我應付下。 (nei5 siu2 sam1 di1 tung4 ngo5 jing3 fu6 haa5.)',
367 |             ],
368 |             eng: ['Handle this well (for me).'],
369 |           },
370 |         ],
371 |       },
372 |     ],
373 |   },
374 |   {
375 |     id: 93305,
376 |     headwords: [
377 |       {
378 |         text: '揸正嚟做',
379 |         readings: ['zaa1 zeng3 lai4 zou6', 'zaa1 zeng3 lei4 zou6'],
380 |       },
381 |     ],
382 |     tags: [
383 |       {
384 |         name: 'pos',
385 |         value: '動詞',
386 |       },
387 |       {
388 |         name: 'sim',
389 |         value: '揸正',
390 |       },
391 |     ],
392 |     senses: [
393 |       {
394 |         explanation: {
395 |           yue: ['嚴格依照規矩，不留餘地，冇人情講'],
396 |           eng: [
397 |             'to follow the rules strictly; to "go by the book"; to leave no room for discretion',
398 |           ],
399 |         },
400 |         egs: [
401 |           {
402 |             yue: [
403 |               '唔好怪我揸正嚟做。 (m4 hou2 gwaai3 ngo5 zaa1 zeng3 lei4 zou6.)',
404 |             ],
405 |             eng: ["Don't blame me for following the rules too strictly."],
406 |           },
407 |         ],
408 |       },
409 |     ],
410 |   },
411 |   {
412 |     id: 96792,
413 |     headwords: [
414 |       {
415 |         text: '牛河博士',
416 |         readings: ['ngau4 ho2 bok3 si6'],
417 |       },
418 |     ],
419 |     tags: [
420 |       {
421 |         name: 'pos',
422 |         value: '名詞',
423 |       },
424 |       {
425 |         name: 'label',
426 |         value: '專名',
427 |       },
428 |       {
429 |         name: 'label',
430 |         value: '潮語',
431 |       },
432 |       {
433 |         name: 'ref',
434 |         value: 'https://evchk.fandom.com/zh/wiki/曹宏威',
435 |       },
436 |     ],
437 |     senses: [
438 |       {
439 |         explanation: {
440 |           yue: [
441 |             '香港#學者 曹宏威喺#網民 之間嘅叫法，佢因為#乾炒牛河 而一舉成名',
442 |           ],
443 |           eng: ['Wung-wai Tso, literally "Doctor Beef Chow-fun"'],
444 |         },
445 |         egs: [],
446 |       },
447 |     ],
448 |   },
449 | ];
450 | 
/**
 * Entries parsed from the CSV fixture; assigned once by the `before` hook.
 * @type {DictionaryEntry[]}
 */
let entries;

// Parse the fixture a single time before any test runs.
test.before(async () => {
  entries = await parseCSVEntries(testCsvFile);
});

test('CSV successfully parsed', (t) => {
  t.not(entries, undefined);
});

// One test per expected entry: look the parsed entry up by id and compare it
// structurally against the expectation.
expectedEntries.forEach((expectedEntry) => {
  const { id } = expectedEntry;
  test(`Entry ${id}`, (t) => {
    const wantedId = Number(id);
    const parsedEntry = entries.find((candidate) => candidate.id === wantedId);
    t.deepEqual(parsedEntry, expectedEntry);
  });
});
471 | 


--------------------------------------------------------------------------------
/src/test/testdata.csv:
--------------------------------------------------------------------------------
  1 | ,,""
  2 | ""
  3 | 101613,大電:daai6 din6,"(pos:名詞)
  4 | yue:D電池（量詞：粒）
  5 | eng:D cells battery",,OK,未公開
  6 | 92456,發電廠:faat3 din6 cong2,"(pos:名詞)
  7 | yue:產生#電力 嘅大型#建築物（量詞：間／座）
  8 | eng:power plant",,OK,已公開
  9 | 82131,排污:paai4 wu1,"(pos:動詞)
 10 | <explanation>
 11 | yue:排走#污水
 12 | eng:to drain away sewage
 13 | <eg>
 14 | yue:排污費 (paai4 wu1)
 15 | eng:sewerage charge
 16 | <eg>
 17 | yue:排污系統 (paai4 wu1 hai6 tung2)
 18 | eng:sewage system
 19 | <eg>
 20 | yue:排污設施 (paai4 wu1 cit3 si1)
 21 | eng:sewage works
 22 | <eg>
 23 | yue:公共排污服務 (gung1 gung6 paai4 wu1 fuk6 mou6)
 24 | eng:public sewage services
 25 | <eg>
 26 | yue:排污設備改善計劃 (paai4 wu1 cit3 bei6 goi2 sin6 gai3 waak6)
 27 | eng:sewerage improvement programme
 28 | <eg>
 29 | yue:呢啲市區河道嘅設計以防洪及有效排污為主。 (ni1 di1 si5 keoi1 ho4 dou6 ge3 cit3 gai3 ji5 fong4 hung4 kap6 jau5 haau6 paai4 wu1 wai4 zyu2.)
 30 | eng:These urban channels were designed for flood prevention and effective drainage.",排汙,OK,已公開
 31 | 72252,揀選:gaan2 syun2,"(pos:動詞)(sim:挑選)(sim:揀)(sim:選)(sim:選擇)
 32 | <explanation>
 33 | yue:根據你嘅取向，喺兩樣嘢或以上當中，抽取一樣
 34 | eng:to select; to choose
 35 | <eg>
 36 | yue:一個蠢，一個鈍，噉樣邊叫有得揀選？ (jat1 go3 ceon2, jat1 go3 deon6, gam2 joeng2 bin1 giu3 jau5 dak1 gaan2 syun2?)
 37 | eng:This candidate is stupid and that is dumb. How can I choose among them?",㨂選,OK,已公開
 38 | 66987,背景:bui3 ging2,"(pos:名詞)
 39 | <explanation>
 40 | yue:喺舞台或者現實襯托主體嘅景物、佈景、環境
 41 | eng:background; setting
 42 | <eg>
 43 | yue:呢張相嘅背景係一啲椰樹。 (ni1 zoeng1 soeng2 ge3 bui3 ging2 hai6 jat1 di1 je4 syu6.)
 44 | eng:The coconut trees form a background to this picture.
 45 | <eg>
 46 | yue:段片嘅背景音樂叫咩名？ (dyun6 pin2 ge3 bui3 ging2 jam1 ngok6 giu3 me1 meng2?)
 47 | eng:What is the title of the background music in the video?
 48 | ----
 49 | <explanation>
 50 | yue:人嘅來歷或經歷，例如家庭、教育、工作等等，亦可以指佢哋所倚靠嘅人物或者勢力
 51 | eng:the ""background"" of a person, especially their educational background, occupation, social/family connections, etc.
 52 | <eg>
 53 | yue:不如揾人查下佢個背景，我覺得佢好有可疑。 (bat1 jyu4 wan2 jan4 caa4 haa5 keoi5 go3 bui3 ging2, ngo5 gok3 dak1 keoi5 hou2 jau5 ho2 ji4.)
 54 | eng:Shall we find someone to look into his background? I think he is so suspicious.",,OK,已公開
 55 | 90185,天干地支:tin1 gon1 dei6 zi1,"(pos:名詞)(sim:干支)
 56 | <explanation>
 57 | yue:「#天干」同「#地支」嘅合稱。十天干分別係「#甲#乙#丙#丁#戊#己#庚#辛#壬#癸」。 十二地支係：「#子#丑#寅#卯#辰#巳#午#未#申#酉#戌#亥」。 天干同地支組合就成為以「#甲子」為首嘅六十干支循環。
 58 | 
 59 | 干支循環通常用嚟計年份。天干亦可以獨立用嚟順序將物件命名，第一個叫「甲」、第二個叫「乙」，如此類推。用法類似西方嘅「A, B, C」 或 「α, β, γ」。中國傳統紀時間嘅方法係將一日分成十二個時辰，每一個時辰由一個地支表示，「子時」係半夜 (11pm 至 1am)，如此類推。
 60 | eng:Literally ""Heavenly Stems and Earthly Branches"". It is a traditional Chinese system of counting. Heavenly Stems and Earthly Branches are collectively known as ""Stem-Branch"".
 61 | 
 62 | The 10 Heavenly Stems are 甲(gaap3) 乙(jyut6) 丙(bing2) 丁(ding1) 戊(mou6) 己(gei2) 庚(gang1) 辛(san1) 壬(jam4) 癸(gwai3).
 63 | 
 64 | The 12 Earthly Branches are 子(zi2) 丑(cau2) 寅(jan4) 卯(maau5) 辰(san4) 巳(zi6) 午(ng5) 未(mei6) 申(san1) 酉(jau5) 戌(seot1) 亥(hoi6). Each Heavenly Stem is paired with an Earthly Branch to form the ""stem-branch"" sexagenary (i.e. 60 element) cycle that starts with 甲子 (gaap3 zi2)
 65 | 
 66 | The sexagenary cycle is often used for counting years in the Chinese calendar. Heavenly Stems are also used independently to name things in a particular order -- the first is labeled ""gaap3"", the second ""jyut6"", the third ""bing2"", and so on. It is similar to how ""A, B, C"" and ""α, β, γ"" are used in western cultures. Earthly Branches are also traditionally used to denote time. One day is divided into twelve slots called Chinese-hours (#時辰), starting from 子時 (zi2 si4), which is 11pm to 1am.
 67 | <eg>
 68 | yue:乙等 / 乙級 (jyut6 dang2 / jyut6 kap1)
 69 | eng:B grade
 70 | <eg>
 71 | yue:甲級戰犯 (gaap3 kap1 zin3 faan2)
 72 | eng:Class A war criminal
 73 | <eg>
 74 | yue:戊戌變法 (mou6 seot1 bin3 faat3)
 75 | eng:The Hundred Days' Reform of the Qing Dynasty (it is called 戊戌變法 because it occurred in the 戊戌 year)
 76 | <eg>
 77 | yue:辛亥革命 (san1 hoi6 gaap3 ming6)
 78 | eng:The Xinhai Revolution (Pinyin romanization)
 79 | <eg>
 80 | yue:子時 (zi2 si4)
 81 | eng:midnight
 82 | <eg>
 83 | yue:午時 (ng5 si4)
 84 | eng:noon",,OK,已公開
 85 | 97033,"着:zoek6,著:zoek6","(pos:詞綴)(label:書面語)
 86 | <explanation>
 87 | yue:表示動作、狀態進行緊、持續緊，類似「#住」、「#下」
 88 | eng:to express that an action is in process and a state is prolonged; similar to '#住' zyu6 or '#下' haa5
 89 | <eg>
 90 | zho:痛並快樂着 (tung3 bing6 faai3 lok6 zoek6)
 91 | yue:痛住開心 (tung3 zyu6 hoi1 sam1)
 92 | eng:feeling painful and happy
 93 | <eg>
 94 | zho:走着走着就到了課室。 (zau2 zoek6 zau2 zoek6 zau6 dou3 liu5 fo3 sat1.)
 95 | yue:行下行下就到咗班房。 (haang4 haa5 haang4 haa5 zau6 dou3 zo2 baan1 fong2.)
 96 | eng:Walking, (we) have arrived at the classroom.
 97 | <eg>
 98 | zho:他們正説着話呢。 (taa1 mun4 zing3 syut3 zoek6 waa6 ne1.)
 99 | yue:佢哋講緊嘢啊。 (keoi5 dei6 gong2 gan2 je5 aa3.)
100 | eng:They are talking.
101 | <eg>
102 | zho:等着瞧。 (dang2 zoek6 ciu4.)
103 | yue:睇下點。 (tai2 haa5 dim2.)
104 | eng:(Let's) wait and see.
105 | ----
106 | <explanation>
107 | yue:動詞後綴，表示動作達到目的、有結果；類似「#到」（dou2）
108 | eng:verbal suffix to mean that the aim of an action has been achieved or its result has come out; similar to #到 dou2
109 | <eg>
110 | zho:你的錶我沒見着。 (nei5 dik1 biu1 ngo5 mut6 gin3 zoek6.)
111 | yue:你隻錶我見唔到。 (nei5 zek3 biu1 ngo5 gin3 m4 dou2.)
112 | eng:I have not found your watch.
113 | ----
114 | <explanation>
115 | yue:喺句尾出現，表示祈使
116 | eng:used at the end of a sentence to form an imperative
117 | <eg>
118 | zho:聽着。 (ting3 zoek6.)
119 | yue:聽住。 (teng1 zyu6.)
120 | eng:Listen.
121 | <eg>
122 | zho:你可好生給我應付着。 (nei5 ho2 hou2 sang1 kap1 ngo5 jing3 fu6 zoek6.)
123 | yue:你好好哋同我應付下。 (nei5 hou2 hou2 dei2 tung4 ngo5 jing3 fu6 haa5.)
124 | yue:你小心啲同我應付下。 (nei5 siu2 sam1 di1 tung4 ngo5 jing3 fu6 haa5.)
125 | eng:Handle this well (for me).",,OK,已公開
126 | 93305,揸正嚟做:zaa1 zeng3 lai4 zou6:zaa1 zeng3 lei4 zou6,"(pos:動詞)(sim:揸正)
127 | <explanation>
128 | yue:嚴格依照規矩，不留餘地，冇人情講
129 | eng:to follow the rules strictly; to ""go by the book""; to leave no room for discretion
130 | <eg>
131 | yue:唔好怪我揸正嚟做。 (m4 hou2 gwaai3 ngo5 zaa1 zeng3 lei4 zou6.)
132 | eng:Don't blame me for following the rules too strictly.",,OK,已公開
133 | 96792,牛河博士:ngau4 ho2 bok3 si6,"(pos:名詞)(label:專名)(label:潮語)(ref:https://evchk.fandom.com/zh/wiki/曹宏威)
134 | yue:香港#學者 曹宏威喺#網民 之間嘅叫法，佢因為#乾炒牛河 而一舉成名
135 | eng:Wung-wai Tso, literally ""Doctor Beef Chow-fun""",,OK,未公開


--------------------------------------------------------------------------------
/src/types.d.ts:
--------------------------------------------------------------------------------
/**
 * Represents a CSV record.
 * All fields are typed as strings.
 */
type CsvRecord = {
  /** The unique identifier for the record. */
  id: string;
  /** The main word or expression in the entry. */
  headword: string;
  /** The full text of the dictionary entry. */
  entry: string;
  /** The different forms or spellings of the headword. */
  variants: string;
  /** Any warnings related to the entry. */
  warning: string;
  /** Whether the entry is public or not. */
  public: string;
};

/**
 * Tuple listing every key that may appear in a {@link LanguageData} object.
 * NOTE(review): the entries look like ISO 639-3 style language codes, except
 * '量詞', which is presumably used for classifier (measure word) lines —
 * confirm against the entry parser.
 */
type LanguageArray = [
  'yue',
  'eng',
  'zho',
  'jpn',
  'kor',
  'vie',
  'lzh',
  'por',
  'deu',
  'fra',
  'mnc',
  'lat',
  'tib',
  '量詞'
];

/** Union of all members of {@link LanguageArray}. */
type Language = LanguageArray[number];

/** A piece of text together with its reading. */
type TextReadingPair = {
  text: string;
  reading: string;
};

/** A headword and the list of readings recorded for it. */
type Headword = {
  text: string;
  readings: string[];
};

/** A name/value tag attached to an entry, e.g. name `pos` with a part-of-speech value. */
type Tag = {
  name: string;
  value: string;
};

/** One sense of an entry: its explanation plus zero or more examples (`egs`). */
type Sense = {
  explanation: LanguageData;
  egs: LanguageData[];
};

/** Lines of text keyed by language; every language key is optional. */
type LanguageData = {
  [key in Language]?: string[];
};

/** A fully parsed dictionary entry. */
type DictionaryEntry = {
  id: number;
  headwords: Headword[];
  tags: Tag[];
  senses: Sense[];
};
62 | 


--------------------------------------------------------------------------------
/src/util/addYomitanImages.js:
--------------------------------------------------------------------------------
 1 | import { Dictionary } from 'yomichan-dict-builder';
 2 | import fs from 'fs';
 3 | 
 4 | /**
 5 |  *
 6 |  * @param {Dictionary} dictionary
 7 |  */
 8 | async function addYomitanImages(dictionary, imageFolder) {
 9 |   const imageFiles = fs.readdirSync(imageFolder);
10 |   for (const imageFile of imageFiles) {
11 |     const filePath = `${imageFolder}/${imageFile}`;
12 |     await dictionary.addFile(filePath, `images/${imageFile}`);
13 |   }
14 | }
15 | 
16 | export { addYomitanImages };


--------------------------------------------------------------------------------
/src/util/addYomitanTags.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * @type {Record<string, string>}
 3 |  */
 4 | const tagValueToNote = {
 5 |   // Parts of speech
 6 |   名詞: 'noun',
 7 |   動詞: 'verb',
 8 |   語句: 'phrase',
 9 |   形容詞: 'adjective',
10 |   量詞: 'classifier',
11 |   感嘆詞: 'interjection',
12 |   代詞: 'pronoun',
13 |   助詞: 'particle',
14 |   語素: 'morpheme',
15 |   區別詞: 'distinguishing word',
16 |   副詞: 'adverb',
17 |   擬聲詞: 'onomatopoeia',
18 |   連詞: 'conjunction',
19 |   詞綴: 'affix',
20 |   介詞: 'preposition',
21 |   數詞: 'numeral',
22 |   方位詞: 'locative',
23 |   術語: 'term',
24 |   // Labels
25 |   馬來西亞: 'Malaysia',
26 |   粗俗: 'vulgar',
27 |   香港: 'Hong Kong',
28 |   專名: 'proper noun',
29 |   俚語: 'slang',
30 |   潮語: 'trendy expression',
31 |   外來語: 'loanword',
32 |   書面語: 'written language',
33 |   舊式: 'old-fashioned',
34 |   大陸: 'Mainland China',
35 |   文言: 'classical Chinese',
36 |   gpt: 'GPT',
37 |   台灣: 'Taiwan',
38 |   爭議: 'controversial',
39 |   黃賭毒: 'vice',
40 |   日本: 'Japan',
41 |   口語: 'colloquial',
42 |   錯字: 'misspelling',
43 |   玩嘢: 'playful',
44 |   民間傳説: 'folklore',
45 |   澳門: 'Macau',
46 | };
47 | 
48 | const categoryToYomitanLabelCategoryMap = {
49 |   pos: 'partOfSpeech',
50 | };
51 | 
52 | const categoryToSortingOrder = {
53 |   pos: 1,
54 | };
55 | 
56 | /**
57 |  * Given a set of unique labels, adds the appropriate tags to the Yomitan dictionary.
58 |  * @param {Dictionary} dictionary
59 |  * @param {Record<string, Set<string>>} uniqueLabels
60 |  */
61 | async function addYomitanTags(dictionary, uniqueLabels) {
62 |   let tagsAdded = 0;
63 |   const noNoteAvailable = new Set();
64 |   for (const [labelName, labelValues] of Object.entries(uniqueLabels)) {
65 |     for (const value of labelValues) {
66 |       await dictionary.addTag({
67 |         name: value,
68 |         category:
69 |           categoryToYomitanLabelCategoryMap[labelName] ??
70 |           labelName,
71 |         notes: `${value} | ${tagValueToNote[value]}` ?? value,
72 |         sortingOrder: categoryToSortingOrder[labelName] ?? 0,
73 |       });
74 |       if (!tagValueToNote[value]) {
75 |         noNoteAvailable.add(value);
76 |       }
77 |       tagsAdded++;
78 |     }
79 |   }
80 |   console.log(`Added ${tagsAdded} tags to dictionary.`);
81 |   if (noNoteAvailable.size) {
82 |     console.warn(`No note available for: ${[...noNoteAvailable].join(', ')}`);
83 |   }
84 | }
85 | 
86 | export { addYomitanTags };
87 | 


--------------------------------------------------------------------------------
/src/util/csv/csvHandler.js:
--------------------------------------------------------------------------------
 1 | import fs from 'fs';
 2 | import csv from 'csv-parser';
 3 | const csvHeaders = ['id', 'headword', 'entry', 'variants', 'warning', 'public'];
 4 | 
 5 | /**
 6 |  * Reads the all- file and returns the parsed entries
 7 |  * @param {string} allCsvPath
 8 |  * @returns {Promise<CsvRecord[]>}
 9 |  */
10 | async function readCSVAsync(allCsvPath) {
11 |   return new Promise((resolve, reject) => {
12 |     const results = [];
13 |     fs.createReadStream(allCsvPath)
14 |       .pipe(
15 |         csv({
16 |           headers: csvHeaders,
17 |           strict: true,
18 |           skipLines: 2,
19 |           quote: '"',
20 |         })
21 |       )
22 |       .on('data', (data) => {
23 |         results.push(data);
24 |       })
25 |       .on('end', () => {
26 |         resolve(results);
27 |       })
28 |       .on('error', (error) => {
29 |         reject(error);
30 |       });
31 |   });
32 | }
33 | 
/**
 * Reads the contents of the data folder and returns the name of the all- file and the date of the data
 *
 * The date is taken from the file name, which is expected to look like
 * "all-<unix epoch seconds>.csv".
 * @param {string} dataFolder
 * @returns {Promise<{ allCsv: string, dateString: string }>}
 * @throws {Error} If no "all-*.csv" file exists or its name carries no valid timestamp.
 */
async function getCSVInfo(dataFolder) {
  // Get contents of data folder
  const files = await fs.promises.readdir(dataFolder);
  // Only CSV files are candidates, so e.g. "all-notes.txt" is never picked up
  const csvFiles = files.filter((file) => file.endsWith('.csv'));
  const allCsv = csvFiles.find((file) => file.startsWith('all-'));
  if (!allCsv) {
    throw new Error('No all- file found');
  }

  const dateEpoch = allCsv.split('-')[1].split('.')[0];
  const date = new Date(Number(dateEpoch) * 1000);
  // toISOString() throws an opaque RangeError on an invalid date; fail with context instead
  if (Number.isNaN(date.getTime())) {
    throw new Error(`Invalid date in file name: ${allCsv}`);
  }
  const dateString = date.toISOString().split('T')[0];
  console.log(`Date of data: ${dateString}`);

  return {
    allCsv,
    dateString,
  };
}
58 | 
59 | export { readCSVAsync, getCSVInfo };
60 | 


--------------------------------------------------------------------------------
/src/util/csv/parseCsvEntriesToJson.js:
--------------------------------------------------------------------------------
 1 | import { readCSVAsync } from './csvHandler.js';
 2 | import { parseEntry } from '../entryParse/parseEntryToJson.js';
 3 | 
 4 | async function parseCSVEntries(allCsvPath) {
 5 |   const data = await readCSVAsync(allCsvPath);
 6 |   console.log(`Read ${data.length} entries from ${allCsvPath}`);
 7 |   /**
 8 |    * @type {DictionaryEntry[]}
 9 |    */
10 |   const dictionaryEntries = [];
11 |   let unpublishedCount = 0;
12 |   let noDataCount = 0;
13 |   let unreviewedCount = 0;
14 |   for (const entry of data) {
15 |     if (entry.entry === '未有內容 NO DATA') {
16 |       noDataCount++;
17 |       continue;
18 |     }
19 |     if (
20 |       entry.warning.includes(
21 |         '未經覆核，可能有錯漏 UNREVIEWED ENTRY - MAY CONTAIN ERRORS OR OMISSIONS'
22 |       )
23 |     ) {
24 |       unreviewedCount++;
25 |     }
26 |     if (entry.public !== '已公開') {
27 |       unpublishedCount++;
28 |     }
29 |     try {
30 |       const parsedEntry = parseEntry(entry);
31 |       dictionaryEntries.push(parsedEntry);
32 |     } catch (error) {
33 |       console.log(`Error parsing entry ${entry.id}: ${error.message}`);
34 |     }
35 |   }
36 |   console.log(`Parsed ${dictionaryEntries.length} entries`);
37 |   console.log(`Skipped ${noDataCount} no data entries`);
38 |   return dictionaryEntries;
39 | }
40 | 
41 | export { parseCSVEntries };
42 | 


--------------------------------------------------------------------------------
/src/util/entryParse/findImages.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Given a list of dictionary entries, find all unique image URLs.
 3 |  * @param {DictionaryEntry[]} dictionaryEntries
 4 |  */
 5 | function getAllImageURLs(dictionaryEntries) {
 6 |   let imageURLs = new Set();
 7 |   for (const entry of dictionaryEntries) {
 8 |     for (const tag of entry.tags) {
 9 |       if (tag.name === 'img') {
10 |         const imgURL = tag.value;
11 |         // Check if valid URL
12 |         try {
13 |           new URL(imgURL);
14 |           imageURLs.add(tag.value);
15 |         } catch (error) {
16 |           console.error(`Invalid URL: ${imgURL}`);
17 |           continue;
18 |         }
19 |       }
20 |     }
21 |   }
22 |   return imageURLs;
23 | }
24 | 
25 | export { getAllImageURLs };
26 | 


--------------------------------------------------------------------------------
/src/util/entryParse/parseEntryToJson.js:
--------------------------------------------------------------------------------
  1 | import { LANGUAGES_DATA } from '../../constants.js';
  2 | 
  3 | /**
  4 |  *
  5 |  * @param {CsvRecord} entry
  6 |  * @returns {DictionaryEntry}
  7 |  */
  8 | function parseEntry(entry) {
  9 |   const id = parseInt(entry.id);
 10 |   if (isNaN(id)) {
 11 |     throw new Error(`Invalid id: ${entry.id}`);
 12 |   }
 13 | 
 14 |   const headwords = parseHeadwords(entry.headword);
 15 | 
 16 |   const entryLines = entry.entry.split('\n');
 17 |   const tags = parseTags(entryLines);
 18 | 
 19 |   const explanationsText = entryLines.join('\n');
 20 |   const explanationsTexts = explanationsText
 21 |     .split(/^\-\-\-\-$/gm)
 22 |     .map((text) => {
 23 |       return text;
 24 |     });
 25 | 
 26 |   /**
 27 |    * @type {Sense[]}
 28 |    */
 29 |   const senses = [];
 30 |   for (const text of explanationsTexts) {
 31 |     senses.push(parseSense(text));
 32 |   }
 33 | 
 34 |   return {
 35 |     id,
 36 |     headwords,
 37 |     tags,
 38 |     senses,
 39 |   };
 40 | }
 41 | 
 42 | /**
 43 |  * Parses a headword string in the format "text:reading,text:reading"
 44 |  * @param {string} headwordString
 45 |  * @returns {Headword[]}
 46 |  */
 47 | function parseHeadwords(headwordString) {
 48 |   return headwordString.split(',').map((headword) => {
 49 |     const [text, ...readings] = headword.split(':');
 50 |     if (!text || !readings) {
 51 |       throw new Error(`Invalid headword: ${headword}`);
 52 |     }
 53 |     return {
 54 |       text,
 55 |       readings,
 56 |     };
 57 |   });
 58 | }
 59 | 
 60 | /**
 61 |  *
 62 |  * @param {string[]} entryLines
 63 |  */
 64 | function parseTags(entryLines) {
 65 |   if (!entryLines[0].startsWith('(pos:')) {
 66 |     throw new Error(`Entry does not start with (pos:): ${entryLines[0]}`);
 67 |   }
 68 |   // tags in format (pos:名詞)(label:書面語)
 69 |   const firstLine = entryLines.shift();
 70 |   if (!firstLine) {
 71 |     throw new Error(`Entry is empty: ${entryLines.toString()}`);
 72 |   }
 73 |   const tags = firstLine.split(')(').map((tag) => {
 74 |     tag = tag.replace(/[()]/g, '');
 75 |     let colonIndex = tag.indexOf(':');
 76 |     const name = tag.slice(0, colonIndex).trim();
 77 |     const value = tag.slice(colonIndex + 1).trim();
 78 |     return {
 79 |       name,
 80 |       value,
 81 |     };
 82 |   });
 83 |   if (tags.length === 0) {
 84 |     throw new Error(`No tags found: ${firstLine}`);
 85 |   }
 86 |   return tags;
 87 | }
 88 | 
 89 | /**
 90 |  * Accepts a sense entry string and returns the parsed sense
 91 |  * @param {string} entryText
 92 |  * @returns {Sense}
 93 |  */
 94 | function parseSense(entryText) {
 95 |   // Remove first line explanations
 96 |   entryText = entryText.replace('<explanation>\n', '');
 97 |   const [explanationText, ...examplesTexts] = entryText.split(/^<eg>$/gm);
 98 | 
 99 |   /**
100 |    * @type {LanguageData}
101 |    */
102 |   const explanation = parseLanguageData(explanationText);
103 | 
104 |   /**
105 |    * @type {LanguageData[]}
106 |    */
107 |   const egs = [];
108 |   for (const exampleText of examplesTexts) {
109 |     egs.push(parseLanguageData(exampleText));
110 |   }
111 | 
112 |   return { explanation, egs };
113 | }
114 | 
/**
 * Parses a multiline block of "lang:text" lines into texts grouped by language.
 *
 * A line without any colon continues the text of the line before it. A line
 * with a colon must start with a key of LANGUAGES_DATA.
 * @param {string} text
 * @returns {LanguageData}
 * @throws {Error} If a line containing a colon does not start with a known language.
 */
function parseLanguageData(text) {
  /**
   * @type {LanguageData}
   */
  const languageData = {};

  let activeLang = '';
  let pendingText = '';

  /**
   * Stores the pending text under the active language, then resets both.
   * Does nothing while either of them is empty.
   */
  const flushPending = () => {
    if (!activeLang || !pendingText) {
      return;
    }
    if (!languageData[activeLang]) {
      languageData[activeLang] = [];
    }
    languageData[activeLang].push(pendingText.trim());
    activeLang = '';
    pendingText = '';
  };

  text.split('\n').forEach((line) => {
    if (!line.includes(':')) {
      // No language prefix: this line continues the previous text
      pendingText = `${pendingText}\n${line.trim()}`;
      return;
    }
    const [langPrefix] = line.split(':');
    if (!LANGUAGES_DATA[langPrefix]) {
      throw new Error(`Invalid language: ${langPrefix}`);
    }
    // A new language starts here: store what was collected so far
    flushPending();
    activeLang = langPrefix;
    pendingText = line.replace(`${activeLang}:`, '').trim();
  });
  flushPending();
  return languageData;
}
172 | 
173 | export { parseEntry };
174 | 


--------------------------------------------------------------------------------
/src/util/entryParse/parseLabels.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  * Given a list of dictionary entries, find all unique labels.
 3 |  * @param {DictionaryEntry[]} dictionaryEntries
 4 |  * @returns {Record<string, Set<string>>}
 5 |  */
 6 | function findLabelValues(dictionaryEntries) {
 7 |   const tagCategories = {
 8 |     label: new Set(),
 9 |     pos: new Set(),
10 |   };
11 |   for (const entry of dictionaryEntries) {
12 |     for (const tag of entry.tags) {
13 |       if (tagCategories[tag.name]) {
14 |         tagCategories[tag.name].add(tag.value);
15 |       }
16 |     }
17 |   }
18 |   return tagCategories;
19 | }
20 | 
21 | export { findLabelValues };
22 | 


--------------------------------------------------------------------------------
/src/util/getVersion.js:
--------------------------------------------------------------------------------
 1 | import path from 'path';
 2 | import fs from 'fs';
 3 | 
 4 | /**
 5 |  * Get the version from the package.json file.
 6 |  * @returns {string} The version.
 7 |  */
 8 | export function getVersion() {
 9 |   const packageJsonPath = path.join(process.cwd(), 'package.json');
10 |   const packageJson = JSON.parse(fs.readFileSync(packageJsonPath, 'utf-8'));
11 |   return packageJson.version;
12 | }
13 | 


--------------------------------------------------------------------------------
/src/util/imageHandler/compressImages.js:
--------------------------------------------------------------------------------
 1 | import sharp from 'sharp';
 2 | import fs from 'fs';
 3 | 
 4 | /**
 5 |  * Compresses and resizes all jpg and png images in the image folder
 6 |  * @param {string} imageFolder
 7 |  * @param {string} outputFolder
 8 |  * @param {number} resizeWidth
 9 |  * @returns {Promise<void[]>}
10 |  */
11 | function compressImages(imageFolder, outputFolder, resizeWidth) {
12 |   // Create directory
13 |   if (!fs.existsSync(outputFolder)) {
14 |     fs.mkdirSync(outputFolder);
15 |   }
16 |   const imageFiles = fs.readdirSync(imageFolder);
17 |   const promises = [];
18 |   for (const imageFile of imageFiles) {
19 |     const filePath = `${imageFolder}/${imageFile}`;
20 |     const outputPath = `${outputFolder}/${imageFile}`;
21 |     promises.push(compressImage(filePath, outputPath, resizeWidth));
22 |   }
23 |   return Promise.all(promises);
24 | }
25 | 
/**
 * Compresses and resizes the image at the given path.
 *
 * JPEG and PNG images are scaled down to `resizeWidth` when they are wider than
 * that; JPEGs are additionally re-encoded at quality 85. Any other format, and
 * any image that fails to process, is copied to `outputPath` unchanged.
 * @param {string} imagePath
 * @param {string} outputPath
 * @param {number} resizeWidth
 * @returns {Promise<void>}
 */
async function compressImage(imagePath, outputPath, resizeWidth) {
  try {
    const image = sharp(imagePath);
    const metadata = await image.metadata();
    const isJpeg = metadata.format === 'jpeg';
    if (isJpeg || metadata.format === 'png') {
      // Resize image
      if (resizeWidth && metadata.width && metadata.width > resizeWidth) {
        image.resize(resizeWidth);
      }
      // Compress image
      if (isJpeg) {
        await image.jpeg({ quality: 85 }).toFile(outputPath);
      } else {
        await image.toFile(outputPath);
      }
      return;
    }
    // Other formats fall through to the plain copy below
  } catch (error) {
    // Best effort: keep the original file, but say why it was not compressed
    console.warn(
      `Could not compress ${imagePath}, copying it unchanged: ${
        error?.message ?? error
      }`
    );
  }
  fs.copyFileSync(imagePath, outputPath);
}
57 | 
58 | export { compressImages };
59 | 


--------------------------------------------------------------------------------
/src/util/imageHandler/downloadImages.js:
--------------------------------------------------------------------------------
 1 | import fs from 'fs';
 2 | import path from 'path';
 3 | import axios from 'axios';
 4 | import sharp from 'sharp';
 5 | 
 6 | import { getImageFileName } from './getImageFileName.js';
 7 | import { IMAGE_FOLDER } from '../../constants.js';
 8 | 
 9 | const DELAY_MS = 1000;
10 | 
11 | /**
12 |  * Downloads all the images in the given set.
13 |  * @param {Set} imageURLs - The set of image URLs to download.
14 |  */
15 | async function downloadImages(imageURLs) {
16 |   // Create directory
17 |   if (!fs.existsSync(IMAGE_FOLDER)) {
18 |     fs.mkdirSync(IMAGE_FOLDER);
19 |   }
20 |   let successful = 0;
21 |   let failed = 0;
22 |   const imageURLsArray = Array.from(imageURLs);
23 |   for (let i = 0; i < imageURLsArray.length; i++) {
24 |     const imageURL = imageURLsArray[i];
25 |     try {
26 |       console.log(`${i}/${imageURLsArray.length}: Downloading ${imageURL}`);
27 |       const fileName = getImageFileName(imageURL);
28 |       const wasDownloadedOnline = await downloadImage(
29 |         imageURL,
30 |         IMAGE_FOLDER,
31 |         fileName
32 |       );
33 |       // Delay if downloaded online
34 |       if (wasDownloadedOnline) {
35 |         await new Promise((resolve) => setTimeout(resolve, DELAY_MS));
36 |       }
37 |       successful++;
38 |       const filePath = path.join(IMAGE_FOLDER, getImageFileName(imageURL));
39 |       // Check if the image is valid, delete if not
40 |       try {
41 |         await sharp(filePath).metadata();
42 |       } catch (error) {
43 |         console.log(`Deleting invalid image ${filePath}`);
44 |         fs.unlinkSync(filePath);
45 |       }
46 |     } catch (error) {
47 |       console.log(`Error when downloading ${imageURL}`);
48 |       failed++;
49 |     }
50 |   }
51 |   console.log(`Successfully downloaded ${successful} images.`);
52 |   console.log(`Failed to download ${failed} images.`);
53 | }
54 | 
/**
 * Fetches one image and stores it as `fileName` inside `savePath`, unless a
 * file of that name is already there.
 * @param {string} imageURL
 * @param {string} savePath
 * @param {string} fileName
 * @returns {Promise<boolean>} - true if the image was fetched over the network,
 * false if the file already existed.
 * @throws {Error} If `savePath` does not exist or `imageURL` is not a valid URL.
 */
async function downloadImage(imageURL, savePath, fileName) {
  const targetFolderExists = fs.existsSync(savePath);
  if (!targetFolderExists) {
    throw new Error(`Invalid path: ${savePath}`);
  }

  const assertParsableURL = (candidate) => {
    try {
      new URL(candidate);
    } catch (error) {
      throw new Error(`Invalid URL: ${candidate}`);
    }
  };
  assertParsableURL(imageURL);

  const destination = path.join(savePath, fileName);
  const isAlreadyStored = fs.existsSync(destination);
  if (isAlreadyStored) {
    return false;
  }

  const requestOptions = { responseType: 'arraybuffer' };
  const { data } = await axios.get(imageURL, requestOptions);
  fs.writeFileSync(destination, Buffer.from(data, 'binary'));
  return true;
}
90 | 
91 | export { downloadImages };
92 | 


--------------------------------------------------------------------------------
/src/util/imageHandler/getImageFileName.js:
--------------------------------------------------------------------------------
 1 | import { createHash } from 'crypto';
 2 | 
 3 | /**
 4 |  * Hashes the image URL to get the image file name, preserving the file extension.
 5 |  * @param {string} imageURL
 6 |  */
 7 | function getImageFileName(imageURL) {
 8 |   const hash = createHash('sha256');
 9 |   hash.update(imageURL);
10 |   const hashed = hash.digest('hex');
11 |   const extension = imageURL.split('.').pop()?.toLocaleLowerCase() || '';
12 |   const allowedExtensions = ['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp'];
13 |   if (!allowedExtensions.includes(extension)) {
14 |     throw new Error(`Invalid extension: ${extension}`);
15 |   }
16 |   return `${hashed}.${extension}`;
17 | }
18 | 
19 | export { getImageFileName };


--------------------------------------------------------------------------------
/src/util/readAndParseCSVs.js:
--------------------------------------------------------------------------------
 1 | import path from 'path';
 2 | import { getCSVInfo } from './csv/csvHandler.js';
 3 | import { parseCSVEntries } from './csv/parseCsvEntriesToJson.js';
 4 | 
 5 | /**
 6 |  * @param {string} dataFolder
 7 |  */
 8 | export async function readAndParseCSVs(dataFolder) {
 9 |   const { allCsv, dateString } = await getCSVInfo(dataFolder);
10 |   const dictionaryEntries = await parseCSVEntries(
11 |     path.join(dataFolder, allCsv)
12 |   );
13 |   console.log(`Found ${dictionaryEntries.length} entries.`);
14 | 
15 |   return { dictionaryEntries, dateString };
16 | }
17 | 


--------------------------------------------------------------------------------
/src/util/textHandling/parseCantoneseReadings.js:
--------------------------------------------------------------------------------
  1 | import {
  2 |   punctuations,
  3 |   isHanzi,
  4 |   isJyuutping,
  5 |   isPunctuation,
  6 | } from './textUtils.js';
  7 | 
/**
 * Parses a text string into an array matching each character to the readings
 *
 * Both inputs are tokenized with splitString and then walked in parallel with
 * two independent cursors. Text tokens that have no reading of their own are
 * emitted with an empty reading.
 * @example text: "你get唔get到我講咩？"
 * reading: "nei5 get1 m4 get1 dou2 ngo5 gong2 me1?"
 * =>
 * [{text: "你", reading: "nei5"}, {text: "get", reading: "get1"}, ...]
 * @param {string} rawText
 * @param {string} readings
 * @returns {TextReadingPair[]}
 * @throws {Error} If a text/reading token pair matches none of the handled
 * cases, or if reading tokens are left over after the loop.
 */
function parseCantoneseReadings(rawText, readings) {
  /**
   * @type {TextReadingPair[]}
   */
  const resultArray = [];

  const textArray = splitString(rawText);
  const readingsArray = splitString(readings);

  // Cursors into readingsArray and textArray; they advance independently
  let readingIndex = 0;
  let textIndex = 0;
  for (let i = 0; i < Math.max(textArray.length, readingsArray.length); i++) {
    const text = textArray[textIndex];
    const reading = readingsArray[readingIndex];
    const isTextHanzi = isHanzi(text);
    // isJyuutping only checks for an ASCII letter or digit, so it is also true
    // for Latin text tokens such as "get"
    const isTextAlphanumeric = isJyuutping(text);
    const isTextPunctuation = isPunctuation(text);
    // NOTE: once the readings run out, `reading` is undefined; RegExp test()
    // stringifies its argument, so this flag is true in that case as well
    const isReadingJyutping = isJyuutping(reading);
    const isReadingPunctuation = isPunctuation(reading);
    // Ideal case
    if (
      !!text &&
      !!reading &&
      ((isTextHanzi && isReadingJyutping) ||
        // Case where for example text is 'bu' and reading is 'bu4'
        (isTextAlphanumeric && isReadingJyutping))
    ) {
      resultArray.push({ text, reading });
      textIndex++;
      readingIndex++;
    } else if (
      !!text &&
      ((isTextPunctuation && isReadingJyutping) ||
        (!!text && reading === undefined) ||
        (!isTextAlphanumeric && !isTextHanzi && isReadingJyutping))
    ) {
      // The text token has no counterpart in the readings: emit it with an
      // empty reading and leave the reading cursor where it is
      resultArray.push({ text, reading: '' });
      textIndex++;
    } else if (
      !!text &&
      !!reading &&
      ((isTextPunctuation && isReadingPunctuation) ||
        // Where both are special characters
        (!isTextAlphanumeric && !isTextHanzi && !isReadingJyutping))
    ) {
      // Keep the text token with an empty reading and consume the matching
      // reading token as well
      resultArray.push({ text, reading: '' });
      textIndex++;
      readingIndex++;
    } else {
      throw new Error(
        `Unexpected text "${text}" and reading "${reading}" at index ${i} in ${rawText}: ${readings}`
      );
    }
  }
  // Check if remaining readings exist
  if (readingIndex < readingsArray.length) {
    throw new Error(
      `Unexpected reading "${readingsArray[readingIndex]}" at index ${readingIndex} in ${rawText}: ${readings}`
    );
  }
  return resultArray;
}
 82 | 
 83 | /**
 84 |  *
 85 |  * @param {string} input
 86 |  * @returns {string[]}
 87 |  */
 88 | function splitString(input) {
 89 |   const resultArray = [];
 90 |   let current = '';
 91 |   for (const char of input) {
 92 |     if (/[a-zA-Z0-9]/.test(char)) {
 93 |       // Check if alphabetical or numeric
 94 |       const isAlphabetical = /[a-zA-Z]/.test(char);
 95 |       if (current.length > 0) {
 96 |         // Check if previous character was alphabetical or numeric
 97 |         const isPreviousAlphabetical = /[a-zA-Z]/.test(
 98 |           current[current.length - 1]
 99 |         );
100 |         if (isAlphabetical && !isPreviousAlphabetical) {
101 |           // Probably a case where the reading was typo'd like bit1ging1
102 |           resultArray.push(current);
103 |           current = '';
104 |         }
105 |       }
106 |       current += char;
107 |     } else if (punctuations[char]) {
108 |       if (current) {
109 |         resultArray.push(current);
110 |         current = '';
111 |       }
112 |       resultArray.push(char);
113 |     } else {
114 |       if (current) {
115 |         resultArray.push(current);
116 |         current = '';
117 |       }
118 |       resultArray.push(char);
119 |     }
120 |   }
121 |   // Push the last current
122 |   if (current) {
123 |     resultArray.push(current);
124 |   }
125 | 
126 |   // Remove empty strings
127 |   const resultArrayFiltered = resultArray
128 |     .map((item) => item.trim())
129 |     .filter((item) => item);
130 |   return resultArrayFiltered;
131 | }
132 | 
133 | export { parseCantoneseReadings };
134 | 


--------------------------------------------------------------------------------
/src/util/textHandling/textUtils.js:
--------------------------------------------------------------------------------
 1 | import XRegExp from '@gerhobbelt/xregexp';
 2 | 
// Characters that isPunctuation treats as punctuation (full-width and ASCII forms).
// NOTE(review): ',' and '，' are each listed twice. The duplicates do not change
// the result of `includes` — confirm whether a different character was intended.
const punctuations = [
  '，',
  ',',
  '。',
  '.',
  '？',
  '?',
  '！',
  '!',
  '；',
  ';',
  '：',
  ':',
  '、',
  ',',
  '，',
  '⋯',
];
21 | 
/**
 * Compiled lazily on the first isHanzi call and reused afterwards.
 * @type {RegExp | null}
 */
let hanziPattern = null;

/**
 * Returns true if the text contains a Chinese character, i.e. a character from
 * the CJK Unified Ideographs block or its extensions A to E.
 * @param {string} text
 * @returns {boolean}
 */
function isHanzi(text) {
  if (hanziPattern === null) {
    // Enabling astral mode and compiling the pattern is only needed once,
    // not on every call
    XRegExp.install('astral');
    hanziPattern = XRegExp(
      '\\p{InCJK_Unified_Ideographs}|\\p{InCJK_Unified_Ideographs_Extension_A}|\\p{InCJK_Unified_Ideographs_Extension_B}|\\p{InCJK_Unified_Ideographs_Extension_C}|\\p{InCJK_Unified_Ideographs_Extension_D}|\\p{InCJK_Unified_Ideographs_Extension_E}'
    );
  }
  return hanziPattern.test(text);
}
33 | 
/**
 * Reports whether the text could be a Jyutping reading. The check is loose:
 * it passes as soon as the text contains any ASCII letter or digit.
 * @param {string} text
 * @returns {boolean}
 */
function isJyuutping(text) {
  const asciiAlphanumeric = /[a-zA-Z0-9]/;
  return asciiAlphanumeric.test(text);
}
42 | 
/**
 * Reports whether the text is exactly one of the entries of `punctuations`.
 * @param {string} text
 * @returns {boolean}
 */
function isPunctuation(text) {
  return punctuations.some((mark) => mark === text);
}
51 | 
/**
 * Reports whether the text looks like a sentence, i.e. whether its final
 * character is a punctuation mark.
 * @param {string} text
 * @returns {boolean}
 */
function isStringSentence(text) {
  const { length } = text;
  const finalChar = text[length - 1];
  return isPunctuation(finalChar);
}
57 | 
58 | export { punctuations, isHanzi, isJyuutping, isPunctuation, isStringSentence };
59 | 


--------------------------------------------------------------------------------
/src/util/yomitan/convertEntryToDetailedDefinition.js:
--------------------------------------------------------------------------------
 1 | import { convertHeadwordsToSC } from './convertHeadwordsToSC.js';
 2 | import { convertSenseToLiSC } from './convertSenseToSC.js';
 3 | import { createEntryAttribution } from './createEntryAttribution.js';
 4 | import { createEntryImageSC } from './createEntryImageSC.js';
 5 | import { convertEntryToSynAntsSC } from './convertEntryToSynAntsSC.js';
 6 | 
 7 | /**
 8 |  * Converts a dictionary entry to a detailed definition.
 9 |  * @param {DictionaryEntry} entry
10 |  * @returns {import('yomichan-dict-builder/dist/types/yomitan/termbank').DetailedDefinition}
11 |  */
12 | function convertEntryToDetailedDefinition(entry) {
13 |   /**
14 |    * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]}
15 |    */
16 |   const SCArray = [];
17 |   // Headword
18 |   SCArray.push(convertHeadwordsToSC(entry.headwords));
19 | 
20 |   // Senses with explanation/examples
21 |   SCArray.push({
22 |     tag: 'div',
23 |     data: {
24 |       wordshk: 'definition',
25 |     },
26 |     lang: 'yue',
27 |     content: {
28 |       tag: 'ul',
29 |       data: {
30 |         wordshk: 'sense-list',
31 |       },
32 |       content: entry.senses.map(convertSenseToLiSC),
33 |     },
34 |   });
35 |   
36 |   // Synonyms/antonyms
37 |   const synAntsSC = convertEntryToSynAntsSC(entry);
38 |   SCArray.push(...synAntsSC);
39 | 
40 |   // Image
41 |   let imageURLs = [];
42 |   if (entry.tags.some((tag) => tag.name === 'img')) {
43 |     const { SCs, validImageURLs } = createEntryImageSC(entry);
44 |     if (SCs.length > 0) {
45 |       SCArray.push(SCs);
46 |     }
47 |     imageURLs.push(...validImageURLs);
48 |   }
49 |   
50 |   // Attribution
51 |   SCArray.push(createEntryAttribution(entry, imageURLs));
52 |   return {
53 |     type: 'structured-content',
54 |     content: SCArray,
55 |   };
56 | }
57 | 
58 | export { convertEntryToDetailedDefinition };
59 | 


--------------------------------------------------------------------------------
/src/util/yomitan/convertEntryToSynAntsSC.js:
--------------------------------------------------------------------------------
/**
 * Display metadata for the relation tags handled in this module, keyed by tag
 * name: the emoji used as the list marker of the section header and the text
 * shown as that header.
 */
const types = {
  sim: {
    emoji: '🔗',
    text: '近義',
  },
  ant: {
    emoji: '🚫',
    text: '反義',
  },
};
14 | 
/**
 * Converts an entry to ul lists of the entry's synonyms and antonyms.
 *
 * The result holds one list per relation type ('sim' first, then 'ant') and
 * leaves out a type for which the entry has no tags.
 * @param {DictionaryEntry} entry
 * @returns {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]}
 */
function convertEntryToSynAntsSC(entry) {
  /**
   * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]}
   */
  const SCArray = [];
  /**
   * @type {('sim'|'ant')[]}
   */
  const tagTypes = ['sim', 'ant'];
  for (const type of tagTypes) {
    const { SC, exists } = convertEntryToSCType(entry, type);
    if (exists) {
      SCArray.push(SC);
    }
  }

  return SCArray;
}
39 | 
/**
 * Builds the list for one relation type: a bold header item followed by a
 * nested list with one item per tag of that type.
 * @param {DictionaryEntry} entry
 * @param {'sim'|'ant'} type
 * @returns {{ SC: import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent, exists: boolean}}
 * `exists` is false (and `SC` an empty array) when the entry has no tag of this type.
 */
function convertEntryToSCType(entry, type) {
  const matchingTags = entry.tags.filter(({ name }) => name === type);
  if (matchingTags.length === 0) {
    return {
      SC: [],
      exists: false,
    };
  }

  const { emoji, text } = types[type];

  // Header item; the emoji replaces the usual list bullet
  const headerItem = {
    tag: 'li',
    style: {
      listStyleType: `"${emoji}"`,
      fontWeight: 'bold',
    },
    data: {
      wordshk: `${type}-header`,
    },
    content: text,
  };

  // One item per related word
  const relatedWords = {
    tag: 'ul',
    /**
     * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]}
     */
    content: matchingTags.map(({ value }) => ({
      tag: 'li',
      data: {
        wordshk: `${type}-entry`,
      },
      content: value,
      lang: 'yue',
      style: {
        fontSize: '1.2em',
      },
    })),
  };

  return {
    SC: {
      tag: 'ul',
      data: {
        wordshk: `${type}-list`,
      },
      content: [headerItem, relatedWords],
    },
    exists: true,
  };
}
95 | 
96 | export { convertEntryToSynAntsSC };
97 | 


--------------------------------------------------------------------------------
/src/util/yomitan/convertEntryToYomitanTerms.js:
--------------------------------------------------------------------------------
 1 | import { TermEntry } from 'yomichan-dict-builder';
 2 | import { convertEntryToDetailedDefinition } from './convertEntryToDetailedDefinition.js';
 3 | 
 4 | /**
 5 |  *
 6 |  * @param {DictionaryEntry} dictionaryEntry
 7 |  * @returns {import('yomichan-dict-builder/dist/types/yomitan/termbank').TermInformation[]}
 8 |  */
 9 | function convertEntryToYomitanTerms(dictionaryEntry) {
10 |   /**
11 |    * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').TermInformation[]}
12 |    */
13 |   const yomitanTerms = [];
14 | 
15 |   const detailedDefinition = convertEntryToDetailedDefinition(dictionaryEntry);
16 |   for (const headword of dictionaryEntry.headwords) {
17 |     for (const reading of headword.readings) {
18 |       const yomitanTermEntry = new TermEntry(headword.text)
19 |         .setReading(reading)
20 |         .addDetailedDefinition(detailedDefinition);
21 |       addTagsToTermEntry(dictionaryEntry, yomitanTermEntry);
22 |       yomitanTerms.push(yomitanTermEntry.build());
23 |     }
24 |   }
25 | 
26 |   return yomitanTerms;
27 | }
28 | 
/**
 * Sets the tags of a term entry: the values of the entry's `pos` and `label`
 * tags become its definition tags (space separated); the term tags are always
 * set to an empty string.
 * @param {DictionaryEntry} dictionaryEntry
 * @param {TermEntry} termEntry
 */
function addTagsToTermEntry(dictionaryEntry, termEntry) {
  const tagTypesToAdd = ['pos', 'label'];
  const definitionTags = dictionaryEntry.tags
    .filter(({ name }) => tagTypesToAdd.includes(name))
    .map(({ value }) => value);

  // No term-level tags are collected, so this always sets ''
  const termTags = [];
  termEntry.setTermTags(termTags.join(' '));
  termEntry.setDefinitionTags(definitionTags.join(' '));
}
45 | 
46 | export { convertEntryToYomitanTerms };
47 | 


--------------------------------------------------------------------------------
/src/util/yomitan/convertHeadwordsToSC.js:
--------------------------------------------------------------------------------
 1 | import { convertReadingToRubySC } from './parseTextToSC.js';
 2 | 
 3 | /**
 4 |  * Converts headword(s) to structured content.
 5 |  * @param {Headword[]} headwords
 6 |  */
 7 | function convertHeadwordsToSC(headwords) {
 8 |   const headwordsSCList = headwordsToSC(headwords);
 9 |   const separator = '・';
10 |   /**
11 |    * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]}
12 |    */
13 |   const headwordsSCListWithSeparator = [];
14 |   for (let i = 0; i < headwordsSCList.length; i++) {
15 |     headwordsSCListWithSeparator.push(headwordsSCList[i]);
16 |     if (i !== headwordsSCList.length - 1) {
17 |       headwordsSCListWithSeparator.push(separator);
18 |     }
19 |   }
20 |   /**
21 |    * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent}
22 |    */
23 |   const sc = {
24 |     tag: 'div',
25 |     data: {
26 |       wordshk: 'headword',
27 |     },
28 |     style: {
29 |       fontSize: '1.2em',
30 |     },
31 |     lang: 'yue',
32 |     content: ['【', ...headwordsSCListWithSeparator, '】'],
33 |   };
34 |   return sc;
35 | }
36 | 
/**
 * Converts a list of headwords to a flat list of ruby structured content
 * nodes, one node per (headword text, reading) pair, in input order.
 * @param {Headword[]} headwords
 * @returns {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]}
 */
function headwordsToSC(headwords) {
  return headwords.flatMap(({ text, readings }) =>
    readings.map((reading) => convertReadingToRubySC(text, reading))
  );
}
56 | 
57 | export { convertHeadwordsToSC };
58 | 


--------------------------------------------------------------------------------
/src/util/yomitan/convertSenseToSC.js:
--------------------------------------------------------------------------------
  1 | import { LANGUAGES_DATA } from '../../constants.js';
  2 | import { isStringSentence } from '../textHandling/textUtils.js';
  3 | import { convertTextToSC } from './parseTextToSC.js';
  4 | 
  5 | const examplePhraseText = '配詞 / 用法';
  6 | const exampleSentenceText = '例句';
  7 | const examplePhraseEmoji = '💬';
  8 | const exampleSentenceEmoji = '📝';
  9 | 
 10 | /**
 11 |  * Converts a sense to structured content as a li.
 12 |  * @param {Sense} sense
 13 |  * @returns {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent}
 14 |  */
 15 | function convertSenseToLiSC(sense) {
 16 |   /**
 17 |    * @type {LanguageData[]}
 18 |    */
 19 |   const phrases = [];
 20 |   /**
 21 |    * @type {LanguageData[]}
 22 |    */
 23 |   const sentences = [];
 24 |   for (const eg of sense.egs) {
 25 |     // Check if any of the language datas are a sentence
 26 |     const isEgSentence = Object.values(eg).some((languageData) => {
 27 |       return languageData.some((languageText) => {
 28 |         return isStringSentence(languageText);
 29 |       });
 30 |     });
 31 |     if (isEgSentence) {
 32 |       sentences.push(eg);
 33 |     } else {
 34 |       phrases.push(eg);
 35 |     }
 36 |   }
 37 | 
 38 |   /**
 39 |    * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]}
 40 |    */
 41 |   const exampleNodes = [];
 42 |   if (phrases.length > 0) {
 43 |     exampleNodes.push(
 44 |       convertExampleToSC(
 45 |         phrases,
 46 |         'phrase',
 47 |         examplePhraseText,
 48 |         examplePhraseEmoji
 49 |       )
 50 |     );
 51 |   }
 52 |   if (sentences.length > 0) {
 53 |     exampleNodes.push(
 54 |       convertExampleToSC(
 55 |         sentences,
 56 |         'sentence',
 57 |         exampleSentenceText,
 58 |         exampleSentenceEmoji
 59 |       )
 60 |     );
 61 |   }
 62 | 
 63 |   return {
 64 |     tag: 'li',
 65 |     data: {
 66 |       wordshk: 'sense',
 67 |     },
 68 |     content: [
 69 |       {
 70 |         tag: 'div',
 71 |         data: {
 72 |           wordshk: 'explanation',
 73 |         },
 74 |         content: convertLanguageDataToLiSC(sense.explanation, true),
 75 |       },
 76 |       {
 77 |         tag: 'div',
 78 |         data: {
 79 |           wordshk: 'examples',
 80 |         },
 81 |         content: exampleNodes,
 82 |       },
 83 |     ],
 84 |   };
 85 | }
 86 | 
 87 | /**
 88 |  * Converts an example list to a ul structured content object with the appropriate emoji.
 89 |  * @param {LanguageData[]} languageDatas
 90 |  * @param {'phrase' | 'sentence'} exampleType
 91 |  * @param {string} exampleText
 92 |  * @param {string} exampleEmoji
 93 |  * @returns {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent}
 94 |  */
 95 | function convertExampleToSC(
 96 |   languageDatas,
 97 |   exampleType,
 98 |   exampleText,
 99 |   exampleEmoji
100 | ) {
101 |   return {
102 |     tag: 'ul',
103 |     data: {
104 |       wordshk: exampleType,
105 |     },
106 |     content: [
107 |       {
108 |         tag: 'li',
109 |         style: {
110 |           listStyleType: `"${exampleEmoji}"`,
111 |           fontWeight: 'bold',
112 |         },
113 |         data: {
114 |           wordshk: 'example-type-header',
115 |         },
116 |         content: exampleText,
117 |       },
118 |       {
119 |         tag: 'ul',
120 |         data: {
121 |           wordshk: `${exampleType}-list`,
122 |         },
123 |         content: [
124 |           ...languageDatas.map((languageData) => {
125 |             return convertLanguageDataToLiSC(languageData, false);
126 |           }),
127 |         ],
128 |       },
129 |     ],
130 |   };
131 | }
132 | 
/**
 * Converts a single languageData (one explanation or one example) to a
 * structured content `li` containing the list items of every language in it.
 * @param {LanguageData} languageData
 * @param {boolean} isExplanation whether the languageData is an explanation
 * (true) or an example (false); selects the margin, list marker and
 * `wordshk` data attribute
 * @returns {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent}
 */
function convertLanguageDataToLiSC(languageData, isExplanation) {
  /**
   * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]}
   */
  const languageItems = Object.entries(languageData).flatMap(
    ([language, languageTexts]) =>
      convertLanguageEntryToListItems(
        // @ts-ignore
        language,
        languageTexts,
        isExplanation
      )
  );

  const marginBottom = isExplanation ? '0.3em' : '0.5em';
  const listStyleType = isExplanation ? 'none' : 'circle';
  const role = isExplanation ? 'explanation' : 'example';

  return {
    tag: 'li',
    style: {
      marginBottom,
      listStyleType,
    },
    data: {
      wordshk: role,
    },
    content: languageItems,
  };
}
174 | 
/**
 * Converts the texts of one language to a list of `li` structured content
 * nodes, one per text.
 *
 * Languages other than `yue` and `eng` get a small grey language-name span
 * in front of the text. The font size is '1.2em' for the CJK language codes
 * listed below, otherwise '1em' for explanations and '0.75em' for examples.
 * @param {Language} language key into LANGUAGES_DATA
 * @param {string[]} languageTexts
 * @param {boolean} isExplanation whether the languageData is an explanation
 * @returns {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]}
 */
function convertLanguageEntryToListItems(
  language,
  languageTexts,
  isExplanation
) {
  const languageInfo = LANGUAGES_DATA[language];

  // Only non yue/eng languages are prefixed with a language signifier
  const languagesWithoutSignifier = ['yue', 'eng'];
  const needsSignifier = !languagesWithoutSignifier.includes(language);

  // Text size depends on the language and on explanation vs. example
  const cjkLanguages = ['yue', 'zho', 'jpn', 'kor', 'lzh'];
  let fontSize;
  if (cjkLanguages.includes(language)) {
    fontSize = '1.2em';
  } else if (isExplanation) {
    fontSize = '1em';
  } else {
    fontSize = '0.75em';
  }

  return languageTexts.map((languageText) => {
    const textNode = convertTextToSC(languageText, language);

    /**
     * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]}
     */
    const itemChildren = needsSignifier
      ? [
          {
            tag: 'span',
            data: {
              wordshk: 'langSignifier',
            },
            style: {
              color: '#888',
              fontSize: '0.8em',
            },
            content: `${languageInfo.name}› `,
          },
          textNode,
        ]
      : [textNode];

    return {
      tag: 'li',
      lang: languageInfo.langCode,
      content: itemChildren,
      style: {
        listStyleType: 'none',
        fontSize,
      },
      data: {
        wordshk: languageInfo.langCode,
      },
    };
  });
}
244 | 
245 | export { convertSenseToLiSC };
246 | 


--------------------------------------------------------------------------------
/src/util/yomitan/createEntryAttribution.js:
--------------------------------------------------------------------------------
 1 | /**
 2 |  *
 3 |  * @param {DictionaryEntry} entry
 4 |  * @param {string[]} imageURLs
 5 |  * @returns {import("yomichan-dict-builder/dist/types/yomitan/termbank").StructuredContent}
 6 |  */
 7 | function createEntryAttribution(entry, imageURLs) {
 8 |   /**
 9 |    * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]}
10 |    */
11 |   const contentAttributionSCArray = [
12 |     {
13 |       tag: 'a',
14 |       href: `https://words.hk/zidin/v/${entry.id}`,
15 |       content: '粵典 words.hk',
16 |     },
17 |   ];
18 |   if (entry.tags.length > 0) {
19 |     // Find reference tag if exists
20 |     const referenceTag = entry.tags.find((tag) => tag.name === 'ref');
21 |     if (referenceTag) {
22 |       let urlDomain = '';
23 |       try {
24 |         const url = new URL(referenceTag.value);
25 |         urlDomain = url.hostname;
26 |       } catch (error) {
27 |         console.error(`Invalid URL: ${referenceTag.value}`);
28 |       }
29 | 
30 |       contentAttributionSCArray.unshift(
31 |         {
32 |           tag: 'a',
33 |           href: referenceTag.value,
34 |           content: `參考: ${urlDomain}`,
35 |         },
36 |         {
37 |           tag: 'span',
38 |           content: ' | ',
39 |         }
40 |       );
41 |     }
42 |   }
43 | 
44 |   // Add image attributions
45 |   if (imageURLs.length > 0) {
46 |     for (const imageURL of imageURLs) {
47 |       try {
48 |         const url = new URL(imageURL);
49 |         const urlDomain = url.hostname;
50 |         contentAttributionSCArray.unshift(
51 |           {
52 |             tag: 'a',
53 |             href: imageURL,
54 |             content: `圖片: ${urlDomain}`,
55 |           },
56 |           {
57 |             tag: 'span',
58 |             content: ' | ',
59 |           }
60 |         );
61 |       } catch (error) {}
62 |     }
63 |   }
64 | 
65 |   return {
66 |     tag: 'div',
67 |     data: {
68 |       wordshk: 'attribution',
69 |     },
70 |     lang: 'yue',
71 |     style: {
72 |       fontSize: '0.7em',
73 |       textAlign: 'right',
74 |       // The examples/definitions above have marginBottom set
75 |       marginTop: '-0.4em',
76 |     },
77 |     content: contentAttributionSCArray,
78 |   };
79 | }
80 | 
81 | export { createEntryAttribution };
82 | 


--------------------------------------------------------------------------------
/src/util/yomitan/createEntryImageSC.js:
--------------------------------------------------------------------------------
 1 | import fs from 'fs';
 2 | 
 3 | import { getImageFileName } from '../imageHandler/getImageFileName.js';
 4 | import { IMAGE_FOLDER, IMAGE_RESIZE_WIDTH } from '../../constants.js';
 5 | 
 6 | /**
 7 |  * @param {DictionaryEntry} entry
 8 |  * @returns {{SCs: import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[], validImageURLs: string[]}}
 9 |  */
10 | function createEntryImageSC(entry) {
11 |   // Check if entry has images
12 |   const imageTags = entry.tags.filter((tag) => tag.name === 'img');
13 |   if (imageTags.length === 0) {
14 |     throw new Error(`Entry ${entry.headwords[0].text} has no images.`);
15 |   }
16 | 
17 |   /**
18 |    * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent[]}
19 |    */
20 |   const SCs = [];
21 |   const validImageURLs = [];
22 |   for (const tag of imageTags) {
23 |     try {
24 |       const fileName = getImageFileName(tag.value);
25 |       // Check if file exists
26 |       const filePath = `${IMAGE_FOLDER}/${fileName}`;
27 |       if (!fs.existsSync(filePath)) {
28 |         throw new Error(`File does not exist: ${filePath}`);
29 |       }
30 |       /**
31 |        * @type {import('yomichan-dict-builder/dist/types/yomitan/termbank').StructuredContent}
32 |        */
33 |       const imageNode = {
34 |         tag: 'img',
35 |         data: {
36 |           wordshk: 'image',
37 |         },
38 |         path: filePath,
39 |         collapsed: false,
40 |         collapsible: false,
41 |       };
42 |       if (fileName.endsWith('.svg')) {
43 |         imageNode.width = IMAGE_RESIZE_WIDTH;
44 |       }
45 |       SCs.push(imageNode);
46 |       validImageURLs.push(tag.value);
47 |     } catch (error) {}
48 |   }
49 |   return { SCs, validImageURLs };
50 | }
51 | 
52 | export { createEntryImageSC };
53 | 


--------------------------------------------------------------------------------
/src/util/yomitan/parseTextToSC.js:
--------------------------------------------------------------------------------
 1 | import { parseCantoneseReadings } from '../textHandling/parseCantoneseReadings.js';
 2 | 
 3 | /**
 4 |  * Parses a text string into a structured content object.
 5 |  * @param {string} rawText
 6 |  * @param {string} languageCode
 7 |  * @returns {import("yomichan-dict-builder/dist/types/yomitan/termbank").StructuredContent}
 8 |  */
 9 | function convertTextToSC(rawText, languageCode) {
10 |   const rubyTextLangs = ['yue', 'zho', 'lzh'];
11 |   if (!rubyTextLangs.includes(languageCode)) {
12 |     return rawText;
13 |   }
14 |   const cleanedText = cleanRawText(rawText);
15 |   // Parse brackets for possible reading
16 |   const bracketRegex = /(.+)\(([^\(\)]+)\)$/;
17 |   const [_, phrase, reading] = cleanedText.match(bracketRegex) || [];
18 | 
19 |   if (!phrase || !reading) {
20 |     return cleanedText;
21 |   }
22 | 
23 |   // If reading doesn't have alphanumeric characters, it's not a jyut reading
24 |   const hasEnglishChars = /[a-zA-Z0-9]/.test(reading);
25 |   if (!hasEnglishChars) {
26 |     return cleanedText;
27 |   }
28 | 
29 |   try {
30 |     const readings = parseCantoneseReadings(phrase, reading);
31 |     return readings.map(({ text, reading }) =>
32 |       convertReadingToRubySC(text, reading)
33 |     );
34 |   } catch (error) {
35 |     return cleanedText;
36 |   }
37 | }
38 | 
/**
 * Removes every `#` character and every space character from raw text.
 * @param {string} rawText
 * @returns {string} the text without `#` and spaces
 */
function cleanRawText(rawText) {
  const hashOrSpacePattern = /#| /g;
  return rawText.replace(hashOrSpacePattern, '');
}
46 | 
/**
 * Creates a `ruby` structured content node showing a reading above a text.
 * Arguments that are not strings are converted with `String()` first.
 * @param {string} text base text
 * @param {string} reading reading placed in the `rt` child
 * @returns {import("yomichan-dict-builder/dist/types/yomitan/termbank").StructuredContent}
 */
function convertReadingToRubySC(text, reading) {
  const baseText = typeof text === 'string' ? text : String(text);
  const rubyText = typeof reading === 'string' ? reading : String(reading);

  const readingNode = {
    tag: 'rt',
    content: rubyText,
  };
  return {
    tag: 'ruby',
    content: [baseText, readingNode],
  };
}
72 | 
73 | export { convertReadingToRubySC, convertTextToSC };
74 | 


--------------------------------------------------------------------------------