├── .gitmodules ├── LICENSE ├── README-en.md ├── README.md ├── docs ├── Increasing_the_Efficiency_of_Text_Input_in_the_8pen_Method.pdf ├── design │ ├── cross-mode.md │ ├── cross-mode │ │ ├── 00-latin-init.png │ │ ├── 10-8pen-input.png │ │ ├── 10-8vim-active-upper-case.png │ │ ├── 10-8vim-input-demo.mov │ │ ├── 20-latin-input-active-zone.png │ │ ├── 20-latin-input-another.png │ │ ├── 20-latin-input-expand-zone.png │ │ ├── 20-latin-input-lower-b.png │ │ ├── 20-latin-input-steps.png │ │ ├── 20-latin-input-upper-b.png │ │ ├── 30-latin-trace-shape-horizontal.png │ │ ├── 30-latin-trace-shape-vertical.png │ │ ├── 40-full-screen-input.png │ │ ├── 40-input-trace-to-vector-horizontal.png │ │ ├── 40-input-trace-to-vector-vertical.png │ │ ├── 50-latin-blind-hit.png │ │ ├── 60-select-latin-completion.png │ │ ├── 60-switch-latin-to-pinyin.png │ │ ├── 70-pinyin-input-step-1.png │ │ ├── 70-pinyin-input-step-2.png │ │ ├── 75-pinyin-input-active-zone-level-2.png │ │ ├── 75-pinyin-input-active-zone.png │ │ ├── 75-pinyin-input-end.png │ │ ├── 75-pinyin-input-expand-zone-level-1.png │ │ ├── 75-pinyin-input-expand-zone-level-2.png │ │ ├── 75-pinyin-input-expand-zone-level-3.png │ │ └── Kuzi_IME_Cross_Mode_Prototype.drawio │ └── x-mode │ │ ├── 10-latin-input-steps.png │ │ └── Kuaizi_IME_X_Mode_Prototype.drawio ├── donate │ ├── alipay.jpg │ ├── index.md │ └── wechat.png ├── image │ ├── kuaizi_ime_candidates_and_emojis.png │ ├── kuaizi_ime_editor_editing.png │ ├── kuaizi_ime_has_many_emojis.png │ ├── kuaizi_ime_latin_case_input.png │ ├── kuaizi_ime_main.png │ ├── kuaizi_ime_math_expr_input.png │ ├── kuaizi_ime_pair_symbols.png │ ├── kuaizi_ime_pinyin_slipping_input.png │ └── kuaizi_ime_x_pad_input.png └── video │ ├── kuaizi_ime_auto_completion.mov │ ├── kuaizi_ime_editor_editing.mov │ ├── kuaizi_ime_emoji_keyword_matching.mov │ ├── kuaizi_ime_math_expr_input.mov │ ├── kuaizi_ime_pair_symbol_input.mov │ ├── kuaizi_ime_pinyin_comitting_options.mov │ ├── kuaizi_ime_pinyin_slipping_input.mov │ 
└── kuaizi_ime_x_pad_input.mov ├── logo.svg ├── thirdparty ├── hanzi-level-1.txt ├── hanzi-level-2.txt ├── hanzi-level-3.txt ├── hanzi-traditional-to-simple.txt ├── hanzi-weight.ciyu.txt ├── hanzi-weight.txt ├── 古代汉语语料库字频表.xls ├── 现代汉语常用字表.xls ├── 现代汉语语料库分词类词频表.xls ├── 现代汉语语料库字频表.xls ├── 现代汉语语料库词频表.xls ├── 现代汉语通用字表.xls └── 通用规范汉字表.xls └── tools ├── analyze ├── LICENSE.txt ├── README.md ├── char-links.html ├── char-links.js ├── char-tree.html ├── char-tree.js ├── files │ ├── char-links.json │ ├── char-tree.json │ └── pinyin.txt ├── img │ ├── pinyin-char-links.png │ ├── pinyin-char-tree.png │ └── pinyin-key-layout.png ├── playground.css ├── playground.html ├── playground.js ├── runtime.js ├── simulate.css ├── simulate.html ├── simulate.js └── tree.css └── pinyin-dict ├── .gitignore ├── README.md ├── data ├── emojis.json ├── pinyin-dict-data-phrase.zip ├── pinyin-dict-data-word.zip ├── pinyin-dict-db.zip └── sample.json ├── docs └── img │ └── donate-cngwzj.png ├── package.json ├── src ├── app │ ├── shell.mjs │ └── sqlite.mjs ├── generate │ ├── emoji │ │ ├── emoji.mjs │ │ └── index.mjs │ ├── phrase │ │ ├── index.mjs │ │ └── phrase.mjs │ ├── raw │ │ ├── index.mjs │ │ └── raw.mjs │ ├── sqlite │ │ ├── ime │ │ │ ├── ime.mjs │ │ │ └── index.mjs │ │ ├── phrase │ │ │ └── hmm │ │ │ │ ├── index.mjs │ │ │ │ ├── sqlite.mjs │ │ │ │ ├── trans │ │ │ │ ├── index.mjs │ │ │ │ └── trans.mjs │ │ │ │ ├── trans_kewen │ │ │ │ ├── index.mjs │ │ │ │ └── trans.mjs │ │ │ │ └── utils.mjs │ │ └── word │ │ │ ├── diff.mjs │ │ │ ├── index.mjs │ │ │ ├── patch.mjs │ │ │ └── sqlite.mjs │ └── test.mjs └── utils │ ├── sqlite.mjs │ ├── utils.mjs │ └── zdic.mjs └── yarn.lock /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "android"] 2 | path = android 3 | url = git@github.com:crazydan-studio/kuaizi-ime-android.git 4 | [submodule "thirdparty/OpenCC"] 5 | path = thirdparty/OpenCC 6 | url = git@github.com:crazydan-studio/OpenCC.git 7 | [submodule 
"thirdparty/pinyin-data"] 8 | path = thirdparty/pinyin-data 9 | url = git@github.com:crazydan-studio/pinyin-data.git 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 
35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README-en.md: -------------------------------------------------------------------------------- 1 | 筷字输入法 (Kuaizi IME) 2 | ======================================= 3 | 4 | [中文版](./README.md) | English 5 | 6 | 7 | 8 | > To download the latest version for Android, 9 | > please go to https://github.com/crazydan-studio/kuaizi-ime-android/releases/latest 10 | 11 | [Get it on F-Droid](https://f-droid.org/packages/org.crazydan.studio.app.ime.kuaizi) 14 | 15 | > **Warning**:筷字输入法(Kuaizi IME) is currently exclusively released 16 | > on [F-Droid](https://f-droid.org) and has not been published on other application platforms. 17 | > Please be cautious and aware of related risks. If you need to download and use it, 18 | > please use the link above. 19 | 20 | > If you find any errors or have improvement suggestions, 21 | > please visit [Issues](https://github.com/crazydan-studio/kuaizi-ime/issues) 22 | 23 | **筷字输入法** (Kuaizi IME) is a system based on the combinatorial features of Chinese Pinyin. 
24 | It is an efficient and fast input method editor, suitable for electronic touchscreen devices. 25 | 26 | **筷字输入法** (Kuaizi IME) abandons the traditional method of tapping on virtual keyboard keys one by one; 27 | instead, it offers a swipe input method for Pinyin, combined with the combinatorial features of Pinyin letters. 28 | The key layout is designed to make Pinyin input more accurate and convenient. 29 | 30 | **筷字输入法** (Kuaizi IME) provides editing support such as cursor movement 31 | and text selection for the target editor, thereby eliminating the frustration 32 | of being unable to accurately position inputs and text due to the small screen size of mobile devices. 33 | This further enhances text input and editing efficiency on small-screen devices. 34 | 35 | User data generated by **筷字输入法** (Kuaizi IME) is stored locally. 36 | It does not connect to the internet and will not collect or analyze user data or behavior habits. 37 | Furthermore, it does not support or provide predictive input or fuzzy matching mechanisms. 38 | To some extent, this input method editor aims to enhance the users' ability to recognize 39 | and memorize Chinese characters and Pinyin, ensuring that the reliance on digital tools 40 | does not lead to the gradual forgetting and eventual abandonment of 41 | 「汉字」 (Hànzì, Chinese characters), a treasure of Chinese civilization. 42 | 43 | ## About the icon 44 | 45 | The icon of **筷字输入法** (Kuaizi IME) is composed of the Chinese character 「字」 (zì, character) 46 | and the Chinese-style utensil 「筷子」 (Kuàizi, chopsticks). 47 | This combination closely links Chinese civilization with the unique culinary culture of its people, 48 | showcasing the long history of Chinese civilization. 49 | It also helps to spread traditional Chinese culture to the world, 50 | deepening global friends' understanding and appreciation of Chinese culture. 51 | At the same time, it allows the Chinese people to have a stronger confidence in their national culture. 
52 | 53 | ## How to clone the repository 54 | 55 | ```bash 56 | git clone git@github.com:crazydan-studio/kuaizi-ime.git 57 | 58 | cd kuaizi-ime 59 | git submodule update --init android 60 | ``` 61 | 62 | > For detailed build instructions, please refer to the README documentation of each module. 63 | 64 | ## Project structure 65 | 66 | - [android/](https://github.com/crazydan-studio/kuaizi-ime-android): 67 | The complete project code for the 筷字输入法 (Kuaizi IME) Android client. 68 | Note: Use the command `git submodule update --init android` to initialize this git submodule. 69 | - [docs/](./docs/): The documentation directory of this project. 70 | - [Cross-shaped input interaction design](./docs/design/cross-mode.md): 71 | Optimization and improvement plan for cross-shaped input. 72 | - [tools/analyze/](./tools/analyze/): 73 | Pinyin key layout online analysis tool, used to analyze the letter combination structure of Pinyin 74 | and validate the layout scheme of the regular hexagonal keys. 75 | - [tools/pinyin-dict/](./tools/pinyin-dict/): 76 | Pinyin dictionary data collection and correction program, 77 | which collects Chinese character data and generates an SQLite database. 78 | The character/dictionary database for the 筷字输入法 (Kuaizi IME) client is also generated by this tool. 79 | - [thirdparty/](./thirdparty/): 80 | Third-party character and word data used in this project. **Note:** Use the command 81 | `git submodule update --init thirdparty/OpenCC thirdparty/pinyin-data` 82 | to initialize the git submodules within this directory. 83 | 84 | ## License 85 | 86 | [Apache 2.0](./LICENSE) 87 | 88 | ## Donations 89 | 90 | **Note**: Please add the remark `筷字输入法` when donating. 
91 | For a complete list of donations, please check the [Donation list](./docs/donate/index.md). 92 | 93 | | 支付宝 (Alipay) | 微信支付 (WeChat Pay) | 94 | | -- | -- | 95 | | | | 96 | 97 | ## Feature Highlights 98 | 99 | - Input Pinyin via a swiping motion, with a maximum of only two swipes required for a single input. 100 | - Built-in 101 | [8VIM](https://github.com/8VIM/8VIM)/[8pen](./docs/Increasing_the_Efficiency_of_Text_Input_in_the_8pen_Method.pdf) 102 | input mode, allowing continuous input of Pinyin, English, and numbers by drawing circles, 103 | providing a smooth input experience. 104 | - All candidate characters are displayed with their complete Pinyin, 105 | making it easy to identify the accurate pronunciation of each character. 106 | - Input data is stored locally; no collection or analysis of user data. 107 | - Utilizes Hidden Markov Models (HMM) and the Viterbi algorithm for Pinyin input prediction. 108 | - Supports only precise Pinyin input matching; does not support Pinyin predictive input or fuzzy matching mechanisms. 109 | - Provides editing functions such as cursor movement, text selection, copy, paste, and cut. 110 | Additionally, supports undoing inputs submitted to the target editor for easy corrections. 111 | - Supports left-handed and right-handed mode switching to accommodate different user habits. 112 | - Supports basic mathematical operations (addition, subtraction, multiplication, and division), 113 | allowing calculations while typing. 114 | - Supports input of punctuation, emojis, and other text symbols, offering rich forms of expression. 115 | - Supports direct input of paired punctuation marks (e.g., brackets, quotes) 116 | and automatically wraps selected content within them. 117 | - Automatically adds spaces between Latin characters and Chinese characters, 118 | as well as between operands and operators, to standardize input formatting. 
119 | 120 | https://github.com/user-attachments/assets/c5a3c769-0e6d-42e1-bc2f-babe85607bfb 121 | 122 | > Note: The audio and subtitles were provided by [GitPodcast](https://www.gitpodcast.com/), 123 | > the final videos were generated and produced using [Veed](https://www.veed.io) 124 | 125 | 126 | 127 | ## Feature demonstration 128 | 129 | > The latest version's key layout may differ from the demonstration videos. 130 | > Please refer to the latest version. 131 | 132 | ### Pinyin Swipe Input 133 | 134 | https://github.com/crazydan-studio/kuaizi-ime/assets/1321315/eb2f74f9-f64e-4d02-ad80-98e3ecb9d61b 135 | 136 | ### Arithmetic Input 137 | 138 | https://github.com/crazydan-studio/kuaizi-ime/assets/1321315/f461b564-0ac4-4257-82ad-11afcd3e1d6c 139 | 140 | ### Content Editing 141 | 142 | https://github.com/crazydan-studio/kuaizi-ime/assets/1321315/992a0a5e-7e1e-4b93-a1ac-c893d0e3ff2e 143 | 144 | ### Emoji Matching 145 | 146 | https://github.com/crazydan-studio/kuaizi-ime/assets/1321315/7fff6ddf-9e10-408f-b160-3b3b8e2ab215 147 | 148 | ### Paired Symbol Input 149 | 150 | https://github.com/crazydan-studio/kuaizi-ime/assets/1321315/97b0f99b-92e1-4b28-a5b6-d45150c4bada 151 | 152 | ### Pinyin Input Submission Options 153 | 154 | https://github.com/crazydan-studio/kuaizi-ime/assets/1321315/274d41dd-e858-4b71-a041-31df3dd24f7d 155 | 156 | ### 8VIM/8pen-style Input 157 | 158 | https://github.com/crazydan-studio/kuaizi-ime/assets/1321315/534fa61e-34dc-4e81-a7d1-5eb7cc3b291f 159 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 筷字(Kuaizi)输入法 2 | ======================================= 3 | 4 | 中文版 | [English](./README-en.md) 5 | 6 | 7 | 8 | > 下载 Android 版最新版本,请前往 https://github.com/crazydan-studio/kuaizi-ime-android/releases/latest 9 | 10 | [Get it on F-Droid](https://f-droid.org/packages/org.crazydan.studio.app.ime.kuaizi) 13 | 14 | > 
**Warning**:筷字输入法目前仅发布在 [F-Droid](https://f-droid.org) 上,并未发布在其他应用平台, 15 | > 请注意识别并警惕相关风险,若需下载使用,请直接点击以上链接。 16 | 17 | > 若有缺陷反馈和改进意见,请移步至 [Issues](https://github.com/crazydan-studio/kuaizi-ime/issues) 18 | > 页面。 19 | 20 | **筷字输入法** 是一款根据汉语拼音的组合特征而专门设计的、 21 | 高效且快速的、适用于电子触屏设备的拼音输入法。 22 | 23 | **筷字输入法** 摒弃传统的逐个点击虚拟键盘按键的录入方式, 24 | 改为以滑屏方式录入拼音,并结合拼音字母的有限组合特征,精心设计按键布局, 25 | 让拼音输入更加准确且便捷。 26 | 27 | **筷字输入法** 同时提供对目标编辑器的光标移动和文本选择等编辑支持, 28 | 从而彻底摆脱因移动设备屏幕太小而无法准确定位输入和文本位置的烦恼, 29 | 进一步提高在小屏设备上的文本输入和编辑效率。 30 | 31 | **筷字输入法** 的用户数据仅留存在本地, 32 | 其不连接互联网,不会收集和分析用户数据和行为习惯。 33 | 其也不支持和提供联想输入和模糊匹配机制,在某种程度上, 34 | 该输入法是希望加强用户对汉字和拼音的识别和记忆能力, 35 | 不要因为对数字工具的依赖而逐渐遗忘并最终丢弃「汉字」这一中华文明的瑰宝。 36 | 37 | ## 关于图标 38 | 39 | **筷字输入法** 的图标由汉字「字」与中国特色餐具「筷子」组合而成, 40 | 该组合将中华文明与其人民独有的饮食文化紧密相连,既展现了中华文明悠久的历史, 41 | 也有助于向世界传播中华的传统文化,加深世界友人对中华文化的认识和了解, 42 | 同时,也让中华儿女能够对其民族文化拥有更加坚定的自信。 43 | 44 | ## 仓库克隆 45 | 46 | ```bash 47 | git clone git@github.com:crazydan-studio/kuaizi-ime.git 48 | 49 | cd kuaizi-ime 50 | git submodule update --init android 51 | ``` 52 | 53 | > 具体的构建说明,请详见各模块的 README 文档。 54 | 55 | ## 工程结构 56 | 57 | - [android/](https://github.com/crazydan-studio/kuaizi-ime-android): 58 | 筷字输入法 Android 客户端的完整工程代码。注:使用命令 `git submodule update --init android` 59 | 初始化该 git 子模块 60 | - 61 | - [docs/](./docs/): 本项目的文档目录 62 | - [十字型输入的交互设计](./docs/design/cross-mode.md): 针对 X 型输入的优化改进方案 63 | - [tools/analyze/](./tools/analyze/): 64 | 拼音按键布局在线分析工具,用于分析拼音的字母组合结构,并验证正六边形按键的布局方案 65 | - [tools/pinyin-dict/](./tools/pinyin-dict/): 66 | 拼音字典数据采集和校正程序,采集汉字数据并生成 SQLite 数据库。筷字输入法客户端的字/词典数据库也由该工具生成 67 | - [thirdparty/](./thirdparty/): 68 | 本项目所使用的第三方字词数据。注:使用命令 69 | `git submodule update --init thirdparty/OpenCC thirdparty/pinyin-data` 70 | 初始化该目录内的 git 子模块 71 | 72 | ## License 73 | 74 | [Apache 2.0](./LICENSE) 75 | 76 | ## 友情赞助 77 | 78 | **注**:赞助时请添加备注信息 `筷字输入法`。 79 | 80 | 详细的赞助清单请查看[《友情赞助清单》](./docs/donate/index.md)。 81 | 82 | | 支付宝 | 微信支付 | 83 | | -- | -- | 84 | | | | 85 | 86 | ## 功能特性 87 | 88 | - 以连续滑屏方式录入拼音,且单次录入最多仅需滑屏两次 
89 | - 内置 [8VIM](https://github.com/8VIM/8VIM)/[8pen](./docs/Increasing_the_Efficiency_of_Text_Input_in_the_8pen_Method.pdf) 90 | 输入模式,以画圈方式进行拼音、英文和数字的连续输入,从而提供顺滑的输入体验 91 | - 所有候选字均附带显示完整的拼音,可清晰识别各个字的准确读音 92 | - 输入数据本地存储,不收集、不分析用户数据 93 | - 采用隐马尔科夫模型(Hidden Markov Models)和维特比(Viterbi)算法实现拼音输入预测 94 | - 仅支持精确的拼音输入匹配,**不支持**拼音联想输入和模糊匹配机制 95 | - 提供光标移动、文本选择、复制、粘贴、剪切等编辑功能, 96 | 同时,支持撤回已提交至目标编辑器的输入,以便于对输入进行修正 97 | - 支持左右手模式切换,以适应不同的用户使用习惯 98 | - 支持简单的数学四则运算,可以边输入边计算 99 | - 支持录入标点、表情等文本符号,提供丰富的内容表达形式 100 | - 支持直接输入括号、引号等配对的标点符号,并自动将选中内容包裹在配对符号中 101 | - 在拉丁字符与汉字、操作数与运算符之间自动添加空格,以规范输入格式 102 | 103 | https://github.com/user-attachments/assets/c5a3c769-0e6d-42e1-bc2f-babe85607bfb 104 | 105 | > Note: 以上音频和字幕由 [GitPodcast](https://www.gitpodcast.com/) 106 | > 生成,最终的视频则是通过 [Veed](https://www.veed.io) 制作而成。 107 | 108 | 109 | 110 | ## 功能演示 111 | 112 | > 最新版本的按键布局可能会与演示视频有差异,请以最新版本的为准。 113 | 114 | ### 拼音滑屏输入 115 | 116 | https://github.com/crazydan-studio/kuaizi-ime/assets/1321315/eb2f74f9-f64e-4d02-ad80-98e3ecb9d61b 117 | 118 | ### 算术输入 119 | 120 | https://github.com/crazydan-studio/kuaizi-ime/assets/1321315/f461b564-0ac4-4257-82ad-11afcd3e1d6c 121 | 122 | ### 内容编辑 123 | 124 | https://github.com/crazydan-studio/kuaizi-ime/assets/1321315/992a0a5e-7e1e-4b93-a1ac-c893d0e3ff2e 125 | 126 | ### 表情符号匹配 127 | 128 | https://github.com/crazydan-studio/kuaizi-ime/assets/1321315/7fff6ddf-9e10-408f-b160-3b3b8e2ab215 129 | 130 | ### 配对符号输入 131 | 132 | https://github.com/crazydan-studio/kuaizi-ime/assets/1321315/97b0f99b-92e1-4b28-a5b6-d45150c4bada 133 | 134 | ### 拼音输入提交选项 135 | 136 | https://github.com/crazydan-studio/kuaizi-ime/assets/1321315/274d41dd-e858-4b71-a041-31df3dd24f7d 137 | 138 | ### 类 8VIM/8pen 型输入 139 | 140 | https://github.com/crazydan-studio/kuaizi-ime/assets/1321315/534fa61e-34dc-4e81-a7d1-5eb7cc3b291f 141 | -------------------------------------------------------------------------------- /docs/Increasing_the_Efficiency_of_Text_Input_in_the_8pen_Method.pdf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/Increasing_the_Efficiency_of_Text_Input_in_the_8pen_Method.pdf -------------------------------------------------------------------------------- /docs/design/cross-mode/00-latin-init.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/00-latin-init.png -------------------------------------------------------------------------------- /docs/design/cross-mode/10-8pen-input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/10-8pen-input.png -------------------------------------------------------------------------------- /docs/design/cross-mode/10-8vim-active-upper-case.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/10-8vim-active-upper-case.png -------------------------------------------------------------------------------- /docs/design/cross-mode/10-8vim-input-demo.mov: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/10-8vim-input-demo.mov -------------------------------------------------------------------------------- /docs/design/cross-mode/20-latin-input-active-zone.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/20-latin-input-active-zone.png -------------------------------------------------------------------------------- /docs/design/cross-mode/20-latin-input-another.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/20-latin-input-another.png -------------------------------------------------------------------------------- /docs/design/cross-mode/20-latin-input-expand-zone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/20-latin-input-expand-zone.png -------------------------------------------------------------------------------- /docs/design/cross-mode/20-latin-input-lower-b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/20-latin-input-lower-b.png -------------------------------------------------------------------------------- /docs/design/cross-mode/20-latin-input-steps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/20-latin-input-steps.png -------------------------------------------------------------------------------- /docs/design/cross-mode/20-latin-input-upper-b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/20-latin-input-upper-b.png 
-------------------------------------------------------------------------------- /docs/design/cross-mode/30-latin-trace-shape-horizontal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/30-latin-trace-shape-horizontal.png -------------------------------------------------------------------------------- /docs/design/cross-mode/30-latin-trace-shape-vertical.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/30-latin-trace-shape-vertical.png -------------------------------------------------------------------------------- /docs/design/cross-mode/40-full-screen-input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/40-full-screen-input.png -------------------------------------------------------------------------------- /docs/design/cross-mode/40-input-trace-to-vector-horizontal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/40-input-trace-to-vector-horizontal.png -------------------------------------------------------------------------------- /docs/design/cross-mode/40-input-trace-to-vector-vertical.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/40-input-trace-to-vector-vertical.png -------------------------------------------------------------------------------- 
/docs/design/cross-mode/50-latin-blind-hit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/50-latin-blind-hit.png -------------------------------------------------------------------------------- /docs/design/cross-mode/60-select-latin-completion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/60-select-latin-completion.png -------------------------------------------------------------------------------- /docs/design/cross-mode/60-switch-latin-to-pinyin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/60-switch-latin-to-pinyin.png -------------------------------------------------------------------------------- /docs/design/cross-mode/70-pinyin-input-step-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/70-pinyin-input-step-1.png -------------------------------------------------------------------------------- /docs/design/cross-mode/70-pinyin-input-step-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/70-pinyin-input-step-2.png -------------------------------------------------------------------------------- /docs/design/cross-mode/75-pinyin-input-active-zone-level-2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/75-pinyin-input-active-zone-level-2.png -------------------------------------------------------------------------------- /docs/design/cross-mode/75-pinyin-input-active-zone.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/75-pinyin-input-active-zone.png -------------------------------------------------------------------------------- /docs/design/cross-mode/75-pinyin-input-end.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/75-pinyin-input-end.png -------------------------------------------------------------------------------- /docs/design/cross-mode/75-pinyin-input-expand-zone-level-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/75-pinyin-input-expand-zone-level-1.png -------------------------------------------------------------------------------- /docs/design/cross-mode/75-pinyin-input-expand-zone-level-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/75-pinyin-input-expand-zone-level-2.png -------------------------------------------------------------------------------- /docs/design/cross-mode/75-pinyin-input-expand-zone-level-3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/cross-mode/75-pinyin-input-expand-zone-level-3.png -------------------------------------------------------------------------------- /docs/design/x-mode/10-latin-input-steps.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/design/x-mode/10-latin-input-steps.png -------------------------------------------------------------------------------- /docs/donate/alipay.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/donate/alipay.jpg -------------------------------------------------------------------------------- /docs/donate/index.md: -------------------------------------------------------------------------------- 1 | 友情赞助清单 2 | ================================ 3 | 4 | | 赞助人 | 赞助日期 | 赞助途径 | 赞助金额 | 备注 | 5 | | -- | -- | -- | -- | -- | 6 | | \*头 | 2025-01-22 19:17:23 | 微信支付 (收款单号: \*577504) | 5 RMB | | 7 | | \*翔 | 2025-01-24 21:30:35 | 支付宝 (收款单号: \*782440) | 10 RMB | | 8 | | S\*t | 2025-01-29 16:36:04 | 微信支付 (收款单号: \*139532) | 20 RMB | | 9 | | \*生 | 2025-02-03 18:51:25 | 微信支付 (收款单号: \*380626) | 0.01 RMB | 囊中羞涩,但还是支持一下 😄 | 10 | | J\*3 | 2025-02-05 12:54:04 | 微信支付 (收款单号: \*395589) | 6.66 RMB | | 11 | | \*夜 | 2025-02-06 16:46:39 | 微信支付 (收款单号: \*953513) | 5 RMB | 希望筷子输入法增加剪切板功能 | 12 | | \*巴 | 2025-02-07 10:49:53 | 微信支付 (收款单号: \*959595) | 8.8 RMB | 要是有剪贴板就好了 | 13 | | \*\*漪 | 2025-02-09 01:40:09 | 支付宝 (收款单号: \*579449) | 11.45 RMB | 很特别的小玩意,支持一下 | 14 | | h\*p | 2025-02-14 12:27:48 | 微信支付 (收款单号: \*570557) | 10 RMB | 很有意思的输入法,支持一下 | 15 | -------------------------------------------------------------------------------- /docs/donate/wechat.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/donate/wechat.png -------------------------------------------------------------------------------- /docs/image/kuaizi_ime_candidates_and_emojis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/image/kuaizi_ime_candidates_and_emojis.png -------------------------------------------------------------------------------- /docs/image/kuaizi_ime_editor_editing.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/image/kuaizi_ime_editor_editing.png -------------------------------------------------------------------------------- /docs/image/kuaizi_ime_has_many_emojis.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/image/kuaizi_ime_has_many_emojis.png -------------------------------------------------------------------------------- /docs/image/kuaizi_ime_latin_case_input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/image/kuaizi_ime_latin_case_input.png -------------------------------------------------------------------------------- /docs/image/kuaizi_ime_main.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/image/kuaizi_ime_main.png 
-------------------------------------------------------------------------------- /docs/image/kuaizi_ime_math_expr_input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/image/kuaizi_ime_math_expr_input.png -------------------------------------------------------------------------------- /docs/image/kuaizi_ime_pair_symbols.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/image/kuaizi_ime_pair_symbols.png -------------------------------------------------------------------------------- /docs/image/kuaizi_ime_pinyin_slipping_input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/image/kuaizi_ime_pinyin_slipping_input.png -------------------------------------------------------------------------------- /docs/image/kuaizi_ime_x_pad_input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/image/kuaizi_ime_x_pad_input.png -------------------------------------------------------------------------------- /docs/video/kuaizi_ime_auto_completion.mov: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/video/kuaizi_ime_auto_completion.mov -------------------------------------------------------------------------------- /docs/video/kuaizi_ime_editor_editing.mov: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/video/kuaizi_ime_editor_editing.mov -------------------------------------------------------------------------------- /docs/video/kuaizi_ime_emoji_keyword_matching.mov: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/video/kuaizi_ime_emoji_keyword_matching.mov -------------------------------------------------------------------------------- /docs/video/kuaizi_ime_math_expr_input.mov: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/video/kuaizi_ime_math_expr_input.mov -------------------------------------------------------------------------------- /docs/video/kuaizi_ime_pair_symbol_input.mov: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/video/kuaizi_ime_pair_symbol_input.mov -------------------------------------------------------------------------------- /docs/video/kuaizi_ime_pinyin_comitting_options.mov: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/video/kuaizi_ime_pinyin_comitting_options.mov -------------------------------------------------------------------------------- /docs/video/kuaizi_ime_pinyin_slipping_input.mov: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/video/kuaizi_ime_pinyin_slipping_input.mov 
-------------------------------------------------------------------------------- /docs/video/kuaizi_ime_x_pad_input.mov: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/docs/video/kuaizi_ime_x_pad_input.mov -------------------------------------------------------------------------------- /logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 16 | 35 | 37 | 39 | 40 | 42 | 46 | 50 | 51 | 55 | 58 | 64 | 68 | 72 | 76 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /thirdparty/hanzi-level-3.txt: -------------------------------------------------------------------------------- 1 | # http://corpus.zhonghuayuwen.org/resources.aspx 2 | 亍 3 | 尢 4 | 彳 5 | 卬 6 | 殳 7 | 𠙶 8 | 毌 9 | 邘 10 | 戋 11 | 圢 12 | 氕 13 | 伋 14 | 仝 15 | 冮 16 | 氿 17 | 汈 18 | 氾 19 | 忉 20 | 宄 21 | 讱 22 | 扞 23 | 圲 24 | 圫 25 | 芏 26 | 芃 27 | 朳 28 | 朸 29 | 𨙸 30 | 邨 31 | 吒 32 | 吖 33 | 屼 34 | 屾 35 | 辿 36 | 钆 37 | 仳 38 | 伣 39 | 伈 40 | 癿 41 | 甪 42 | 邠 43 | 犴 44 | 冱 45 | 邡 46 | 闫 47 | 汋 48 | 䜣 49 | 讻 50 | 孖 51 | 纩 52 | 玒 53 | 玓 54 | 玘 55 | 玚 56 | 刬 57 | 坜 58 | 坉 59 | 扽 60 | 坋 61 | 扺 62 | 㧑 63 | 毐 64 | 芰 65 | 芣 66 | 苊 67 | 苉 68 | 芘 69 | 芴 70 | 芠 71 | 芤 72 | 杕 73 | 杙 74 | 杄 75 | 杧 76 | 杩 77 | 尪 78 | 尨 79 | 轪 80 | 坒 81 | 芈 82 | 旴 83 | 旵 84 | 呙 85 | 㕮 86 | 岍 87 | 岠 88 | 岜 89 | 呇 90 | 冏 91 | 觃 92 | 岙 93 | 伾 94 | 㑇 95 | 伭 96 | 佖 97 | 伲 98 | 佁 99 | 飏 100 | 狃 101 | 闶 102 | 汧 103 | 汫 104 | 𣲘 105 | 𣲗 106 | 沄 107 | 沘 108 | 汭 109 | 㳇 110 | 沇 111 | 忮 112 | 忳 113 | 忺 114 | 祃 115 | 诇 116 | 邲 117 | 诎 118 | 诐 119 | 屃 120 | 岊 121 | 阽 122 | 䢺 123 | 阼 124 | 妧 125 | 妘 126 | 𨚕 127 | 纮 128 | 驲 129 | 纻 130 | 纼 131 | 玤 132 | 玞 133 | 玱 134 | 玟 135 | 邽 136 | 邿 137 | 坥 138 | 坰 139 | 坬 140 | 坽 141 | 弆 142 | 耵 143 | 䢼 144 | 𦭜 145 | 茋 146 | 苧 147 | 苾 148 | 苠 149 | 枅 150 | 㭎 151 | 枘 152 | 枍 153 | 矼 154 | 矻 155 
| 匼 156 | 旿 157 | 昇 158 | 昄 159 | 昒 160 | 昈 161 | 咉 162 | 咇 163 | 咍 164 | 岵 165 | 岽 166 | 岨 167 | 岞 168 | 峂 169 | 㟃 170 | 囷 171 | 钐 172 | 钔 173 | 钖 174 | 牥 175 | 佴 176 | 垈 177 | 侁 178 | 侹 179 | 佸 180 | 佺 181 | 隹 182 | 㑊 183 | 侂 184 | 佽 185 | 侘 186 | 郈 187 | 舠 188 | 郐 189 | 郃 190 | 攽 191 | 肭 192 | 肸 193 | 肷 194 | 狉 195 | 狝 196 | 饳 197 | 忞 198 | 於 199 | 炌 200 | 炆 201 | 泙 202 | 沺 203 | 泂 204 | 泜 205 | 泃 206 | 泇 207 | 怊 208 | 峃 209 | 穸 210 | 祋 211 | 祊 212 | 鸤 213 | 弢 214 | 弨 215 | 陑 216 | 陎 217 | 卺 218 | 乸 219 | 妭 220 | 姈 221 | 迳 222 | 叕 223 | 驵 224 | 䌹 225 | 驺 226 | 绋 227 | 绐 228 | 砉 229 | 耔 230 | 㛃 231 | 玶 232 | 珇 233 | 珅 234 | 珋 235 | 玹 236 | 珌 237 | 玿 238 | 韨 239 | 垚 240 | 垯 241 | 垙 242 | 垲 243 | 埏 244 | 垍 245 | 耇 246 | 垎 247 | 垴 248 | 垟 249 | 垞 250 | 挓 251 | 垵 252 | 垏 253 | 拶 254 | 荖 255 | 荁 256 | 荙 257 | 荛 258 | 茈 259 | 茽 260 | 荄 261 | 茺 262 | 荓 263 | 茳 264 | 𦰡 265 | 茛 266 | 荭 267 | 㭕 268 | 柷 269 | 柃 270 | 柊 271 | 枹 272 | 栐 273 | 柖 274 | 郚 275 | 剅 276 | 䴓 277 | 迺 278 | 厖 279 | 砆 280 | 砑 281 | 砄 282 | 耏 283 | 奓 284 | 䶮 285 | 轵 286 | 轷 287 | 轹 288 | 轺 289 | 昺 290 | 昽 291 | 盷 292 | 咡 293 | 咺 294 | 昳 295 | 昣 296 | 哒 297 | 昤 298 | 昫 299 | 昡 300 | 咥 301 | 昪 302 | 虷 303 | 虸 304 | 哃 305 | 峘 306 | 耑 307 | 峛 308 | 峗 309 | 峧 310 | 帡 311 | 钘 312 | 钜 313 | 钪 314 | 钬 315 | 钭 316 | 矧 317 | 秬 318 | 俫 319 | 舁 320 | 俜 321 | 俙 322 | 俍 323 | 垕 324 | 衎 325 | 舣 326 | 弇 327 | 侴 328 | 鸧 329 | 䏡 330 | 胠 331 | 𦙶 332 | 胈 333 | 胩 334 | 胣 335 | 朏 336 | 飐 337 | 訄 338 | 饻 339 | 庤 340 | 疢 341 | 炣 342 | 炟 343 | 㶲 344 | 洭 345 | 洘 346 | 洓 347 | 洿 348 | 㳚 349 | 泚 350 | 浈 351 | 浉 352 | 洸 353 | 洑 354 | 洢 355 | 洈 356 | 洚 357 | 洺 358 | 洨 359 | 浐 360 | 㳘 361 | 洴 362 | 洣 363 | 恔 364 | 宬 365 | 窀 366 | 扂 367 | 袆 368 | 祏 369 | 祐 370 | 祕 371 | 叚 372 | 陧 373 | 陞 374 | 娀 375 | 姞 376 | 姱 377 | 姤 378 | 姶 379 | 姽 380 | 枲 381 | 绖 382 | 骃 383 | 彖 384 | 骉 385 | 恝 386 | 珪 387 | 珛 388 | 珹 389 | 琊 390 | 玼 391 | 珖 392 | 珽 393 | 珦 394 | 珫 395 | 珒 396 | 珢 397 | 珕 398 | 珝 399 | 埗 400 | 垾 401 | 垺 402 | 埆 403 | 垿 404 | 埌 405 
| 埇 406 | 莰 407 | 茝 408 | 鄀 409 | 莶 410 | 莝 411 | 䓖 412 | 莙 413 | 栻 414 | 桠 415 | 桄 416 | 梠 417 | 栴 418 | 梴 419 | 栒 420 | 酎 421 | 酏 422 | 砵 423 | 砠 424 | 砫 425 | 砬 426 | 硁 427 | 恧 428 | 翃 429 | 郪 430 | 𨐈 431 | 辀 432 | 辁 433 | 剕 434 | 赀 435 | 哢 436 | 晅 437 | 晊 438 | 唝 439 | 哳 440 | 哱 441 | 冔 442 | 晔 443 | 晐 444 | 晖 445 | 畖 446 | 蚄 447 | 蚆 448 | 帱 449 | 崁 450 | 峿 451 | 崄 452 | 帨 453 | 崀 454 | 赆 455 | 钷 456 | 眚 457 | 甡 458 | 笫 459 | 倻 460 | 倴 461 | 脩 462 | 倮 463 | 倕 464 | 倞 465 | 倓 466 | 倧 467 | 衃 468 | 虒 469 | 舭 470 | 舯 471 | 舥 472 | 瓞 473 | 鬯 474 | 鸰 475 | 脎 476 | 朓 477 | 胲 478 | 虓 479 | 鱽 480 | 狴 481 | 峱 482 | 狻 483 | 眢 484 | 勍 485 | 痄 486 | 疰 487 | 痃 488 | 竘 489 | 羖 490 | 羓 491 | 桊 492 | 敉 493 | 烠 494 | 烔 495 | 烶 496 | 烻 497 | 涍 498 | 浡 499 | 浭 500 | 浬 501 | 涄 502 | 涢 503 | 涐 504 | 浰 505 | 浟 506 | 浛 507 | 浼 508 | 浲 509 | 涘 510 | 悈 511 | 悃 512 | 悢 513 | 宧 514 | 窅 515 | 窊 516 | 窎 517 | 扅 518 | 扆 519 | 袪 520 | 袗 521 | 袯 522 | 祧 523 | 隺 524 | 堲 525 | 疍 526 | 𨺙 527 | 陴 528 | 烝 529 | 砮 530 | 㛚 531 | 哿 532 | 翀 533 | 翂 534 | 剟 535 | 绤 536 | 骍 537 | 䂮 538 | 琎 539 | 珸 540 | 珵 541 | 琄 542 | 琈 543 | 琀 544 | 珺 545 | 掭 546 | 堎 547 | 堐 548 | 埼 549 | 掎 550 | 埫 551 | 堌 552 | 晢 553 | 掞 554 | 埪 555 | 壸 556 | 㙍 557 | 聍 558 | 菝 559 | 萚 560 | 菥 561 | 莿 562 | 䓫 563 | 勚 564 | 䓬 565 | 萆 566 | 菂 567 | 菍 568 | 菼 569 | 萣 570 | 䓨 571 | 菉 572 | 䓛 573 | 梼 574 | 梽 575 | 桲 576 | 梾 577 | 桯 578 | 梣 579 | 梌 580 | 桹 581 | 敔 582 | 厣 583 | 硔 584 | 硙 585 | 硚 586 | 硊 587 | 硍 588 | 勔 589 | 䴕 590 | 龁 591 | 逴 592 | 唪 593 | 啫 594 | 翈 595 | 㫰 596 | 晙 597 | 畤 598 | 趼 599 | 跂 600 | 蛃 601 | 蚲 602 | 蚺 603 | 啴 604 | 䎃 605 | 崧 606 | 崟 607 | 崞 608 | 崒 609 | 崌 610 | 崡 611 | 铏 612 | 铕 613 | 铖 614 | 铘 615 | 铚 616 | 铞 617 | 铥 618 | 铴 619 | 牻 620 | 牿 621 | 稆 622 | 笱 623 | 笯 624 | 偰 625 | 偡 626 | 鸺 627 | 偭 628 | 偲 629 | 偁 630 | 㿠 631 | 鄅 632 | 偓 633 | 徛 634 | 衒 635 | 舳 636 | 舲 637 | 鸼 638 | 悆 639 | 鄃 640 | 瓻 641 | 䝙 642 | 脶 643 | 脞 644 | 脟 645 | 䏲 646 | 鱾 647 | 猇 648 | 猊 649 | 猄 650 | 觖 651 | 𠅤 652 | 庱 653 | 庼 654 | 庳 655 
| 痓 656 | 䴔 657 | 竫 658 | 堃 659 | 阌 660 | 羝 661 | 羕 662 | 焆 663 | 烺 664 | 焌 665 | 淏 666 | 淟 667 | 淜 668 | 淴 669 | 淯 670 | 湴 671 | 涴 672 | 㥄 673 | 惛 674 | 惔 675 | 悰 676 | 惙 677 | 寁 678 | 逭 679 | 袼 680 | 裈 681 | 祲 682 | 谞 683 | 艴 684 | 弸 685 | 弶 686 | 隃 687 | 婞 688 | 娵 689 | 婼 690 | 媖 691 | 婳 692 | 婍 693 | 婌 694 | 婫 695 | 婤 696 | 婘 697 | 婠 698 | 绹 699 | 骕 700 | 絜 701 | 珷 702 | 琲 703 | 琡 704 | 琟 705 | 琔 706 | 琭 707 | 堾 708 | 堼 709 | 揕 710 | 㙘 711 | 堧 712 | 喆 713 | 堨 714 | 塅 715 | 堠 716 | 絷 717 | 𡎚 718 | 葜 719 | 惎 720 | 萳 721 | 葙 722 | 靬 723 | 葴 724 | 蒇 725 | 蒈 726 | 鄚 727 | 蒉 728 | 蓇 729 | 萩 730 | 蒐 731 | 葰 732 | 葎 733 | 鄑 734 | 蒎 735 | 葖 736 | 蒄 737 | 萹 738 | 棤 739 | 棽 740 | 棫 741 | 椓 742 | 椑 743 | 鹀 744 | 椆 745 | 棓 746 | 棬 747 | 棪 748 | 椀 749 | 楗 750 | 甦 751 | 酦 752 | 觌 753 | 奡 754 | 皕 755 | 硪 756 | 欹 757 | 詟 758 | 辌 759 | 棐 760 | 龂 761 | 黹 762 | 牚 763 | 睎 764 | 晫 765 | 晪 766 | 晱 767 | 𧿹 768 | 蛑 769 | 畯 770 | 斝 771 | 喤 772 | 崶 773 | 嵁 774 | 崾 775 | 嵅 776 | 崿 777 | 嵚 778 | 翙 779 | 圌 780 | 圐 781 | 赑 782 | 淼 783 | 赒 784 | 铹 785 | 铽 786 | 𨱇 787 | 锊 788 | 锍 789 | 锎 790 | 锓 791 | 犇 792 | 颋 793 | 稌 794 | 筀 795 | 筘 796 | 筜 797 | 筥 798 | 筅 799 | 傃 800 | 傉 801 | 翛 802 | 傒 803 | 傕 804 | 舾 805 | 畬 806 | 脿 807 | 腘 808 | 䐃 809 | 腙 810 | 腒 811 | 鲃 812 | 猰 813 | 猯 814 | 㺄 815 | 馉 816 | 凓 817 | 鄗 818 | 廋 819 | 廆 820 | 鄌 821 | 粢 822 | 遆 823 | 旐 824 | 焞 825 | 欻 826 | 𣸣 827 | 溚 828 | 溁 829 | 湝 830 | 渰 831 | 湓 832 | 㴔 833 | 渟 834 | 溠 835 | 渼 836 | 溇 837 | 湣 838 | 湑 839 | 溞 840 | 愐 841 | 愃 842 | 敩 843 | 甯 844 | 棨 845 | 扊 846 | 裣 847 | 祼 848 | 婻 849 | 媆 850 | 媞 851 | 㛹 852 | 媓 853 | 媂 854 | 媄 855 | 毵 856 | 矞 857 | 缊 858 | 缐 859 | 骙 860 | 瑃 861 | 瑓 862 | 瑅 863 | 瑆 864 | 䴖 865 | 瑖 866 | 瑝 867 | 瑔 868 | 瑀 869 | 𤧛 870 | 瑳 871 | 瑂 872 | 嶅 873 | 瑑 874 | 遘 875 | 髢 876 | 塥 877 | 堽 878 | 赪 879 | 摛 880 | 塝 881 | 搒 882 | 搌 883 | 蒱 884 | 蒨 885 | 蓏 886 | 蔀 887 | 蓢 888 | 蓂 889 | 蒻 890 | 蓣 891 | 椹 892 | 楪 893 | 榃 894 | 榅 895 | 楒 896 | 楞 897 | 楩 898 | 榇 899 | 椸 900 | 楙 901 | 歅 902 | 碃 903 | 碏 904 | 碈 905 
| 䃅 906 | 硿 907 | 鄠 908 | 辒 909 | 龆 910 | 觜 911 | 䣘 912 | 暕 913 | 鹍 914 | 㬊 915 | 暅 916 | 跱 917 | 蜐 918 | 蜎 919 | 嵲 920 | 赗 921 | 骱 922 | 锖 923 | 锘 924 | 锳 925 | 锧 926 | 锪 927 | 锫 928 | 锬 929 | 稑 930 | 稙 931 | 䅟 932 | 筻 933 | 筼 934 | 筶 935 | 筦 936 | 筤 937 | 傺 938 | 鹎 939 | 僇 940 | 艅 941 | 艉 942 | 谼 943 | 貆 944 | 腽 945 | 腨 946 | 腯 947 | 鲉 948 | 鲊 949 | 鲌 950 | 䲟 951 | 鲏 952 | 雊 953 | 猺 954 | 飔 955 | 觟 956 | 𦝼 957 | 馌 958 | 裛 959 | 廒 960 | 瘀 961 | 瘅 962 | 鄘 963 | 鹒 964 | 鄜 965 | 麀 966 | 鄣 967 | 阘 968 | 煁 969 | 煃 970 | 煴 971 | 煋 972 | 煟 973 | 煓 974 | 滠 975 | 溍 976 | 溹 977 | 滆 978 | 滉 979 | 溦 980 | 溵 981 | 漷 982 | 滧 983 | 滘 984 | 滍 985 | 愭 986 | 慥 987 | 慆 988 | 塱 989 | 裼 990 | 禋 991 | 禔 992 | 禘 993 | 禒 994 | 谫 995 | 鹔 996 | 愍 997 | 嫄 998 | 媱 999 | 戤 1000 | 勠 1001 | 戣 1002 | 缞 1003 | 耤 1004 | 瑧 1005 | 瑨 1006 | 瑱 1007 | 瑷 1008 | 瑢 1009 | 斠 1010 | 摏 1011 | 墕 1012 | 墈 1013 | 墐 1014 | 墘 1015 | 摴 1016 | 銎 1017 | 𡐓 1018 | 墚 1019 | 撖 1020 | 靽 1021 | 鞁 1022 | 蔌 1023 | 蔈 1024 | 蓰 1025 | 蔹 1026 | 蔊 1027 | 嘏 1028 | 榰 1029 | 榑 1030 | 槚 1031 | 𣗋 1032 | 槜 1033 | 榍 1034 | 疐 1035 | 酺 1036 | 酾 1037 | 酲 1038 | 酴 1039 | 碶 1040 | 䃎 1041 | 碨 1042 | 𥔲 1043 | 碹 1044 | 碥 1045 | 劂 1046 | 䴗 1047 | 夥 1048 | 瞍 1049 | 鹖 1050 | 㬎 1051 | 跽 1052 | 蜾 1053 | 幖 1054 | 嶍 1055 | 圙 1056 | 𨱏 1057 | 锺 1058 | 锼 1059 | 锽 1060 | 锾 1061 | 锿 1062 | 镃 1063 | 镄 1064 | 镅 1065 | 馝 1066 | 鹙 1067 | 箨 1068 | 箖 1069 | 劄 1070 | 僬 1071 | 僦 1072 | 僔 1073 | 僎 1074 | 槃 1075 | 㙦 1076 | 鲒 1077 | 鲕 1078 | 鲖 1079 | 鲗 1080 | 鲘 1081 | 鲙 1082 | 𩽾 1083 | 夐 1084 | 獍 1085 | 飗 1086 | 凘 1087 | 廑 1088 | 廙 1089 | 瘗 1090 | 瘥 1091 | 瘕 1092 | 鲝 1093 | 鄫 1094 | 熇 1095 | 漹 1096 | 漖 1097 | 潆 1098 | 漤 1099 | 潩 1100 | 漼 1101 | 漴 1102 | 㽏 1103 | 漈 1104 | 漋 1105 | 漻 1106 | 慬 1107 | 窬 1108 | 窭 1109 | 㮾 1110 | 褕 1111 | 禛 1112 | 禚 1113 | 隩 1114 | 嫕 1115 | 嫭 1116 | 嫜 1117 | 嫪 1118 | 㻬 1119 | 麹 1120 | 璆 1121 | 漦 1122 | 叇 1123 | 墣 1124 | 墦 1125 | 墡 1126 | 劐 1127 | 薁 1128 | 蕰 1129 | 蔃 1130 | 鼒 1131 | 槱 1132 | 鹝 1133 | 磏 1134 | 磉 1135 | 殣 1136 | 慭 1137 | 霅 
1138 | 暵 1139 | 暲 1140 | 暶 1141 | 踦 1142 | 踣 1143 | 䗖 1144 | 蝘 1145 | 蝲 1146 | 蝤 1147 | 噇 1148 | 噂 1149 | 噀 1150 | 罶 1151 | 嶲 1152 | 嶓 1153 | 㠇 1154 | 嶟 1155 | 嶒 1156 | 镆 1157 | 镈 1158 | 镋 1159 | 镎 1160 | 镕 1161 | 稹 1162 | 儇 1163 | 皞 1164 | 皛 1165 | 䴘 1166 | 艎 1167 | 艏 1168 | 鹟 1169 | 𩾃 1170 | 鲦 1171 | 鲪 1172 | 鲬 1173 | 橥 1174 | 觭 1175 | 鹠 1176 | 鹡 1177 | 糇 1178 | 糈 1179 | 翦 1180 | 鹢 1181 | 鹣 1182 | 熛 1183 | 潖 1184 | 潵 1185 | 㵐 1186 | 澂 1187 | 澛 1188 | 瑬 1189 | 潽 1190 | 潾 1191 | 潏 1192 | 憭 1193 | 憕 1194 | 戭 1195 | 褯 1196 | 禤 1197 | 嫽 1198 | 遹 1199 | 璥 1200 | 璲 1201 | 璒 1202 | 憙 1203 | 擐 1204 | 鄹 1205 | 薳 1206 | 鞔 1207 | 黇 1208 | 蕗 1209 | 薢 1210 | 蕹 1211 | 橞 1212 | 橑 1213 | 橦 1214 | 醑 1215 | 觱 1216 | 磡 1217 | 𥕢 1218 | 磜 1219 | 豮 1220 | 鹾 1221 | 虤 1222 | 暿 1223 | 曌 1224 | 曈 1225 | 㬚 1226 | 蹅 1227 | 踶 1228 | 䗛 1229 | 螗 1230 | 疁 1231 | 㠓 1232 | 幪 1233 | 嶦 1234 | 𨱑 1235 | 馞 1236 | 穄 1237 | 篚 1238 | 篯 1239 | 簉 1240 | 鼽 1241 | 衠 1242 | 盦 1243 | 螣 1244 | 縢 1245 | 鲭 1246 | 鲯 1247 | 鲰 1248 | 鲺 1249 | 鲹 1250 | 亸 1251 | 癀 1252 | 瘭 1253 | 羱 1254 | 糒 1255 | 燋 1256 | 熻 1257 | 燊 1258 | 燚 1259 | 燏 1260 | 濩 1261 | 濋 1262 | 澪 1263 | 澽 1264 | 澴 1265 | 澭 1266 | 澼 1267 | 憷 1268 | 憺 1269 | 懔 1270 | 黉 1271 | 嬛 1272 | 鹨 1273 | 翯 1274 | 璱 1275 | 𤩽 1276 | 璬 1277 | 璮 1278 | 髽 1279 | 擿 1280 | 薿 1281 | 薸 1282 | 檑 1283 | 櫆 1284 | 檞 1285 | 醨 1286 | 繄 1287 | 磹 1288 | 磻 1289 | 瞫 1290 | 瞵 1291 | 蹐 1292 | 蟏 1293 | 㘎 1294 | 镤 1295 | 镥 1296 | 镨 1297 | 𨱔 1298 | 矰 1299 | 穙 1300 | 穜 1301 | 穟 1302 | 簕 1303 | 簃 1304 | 簏 1305 | 儦 1306 | 魋 1307 | 斶 1308 | 艚 1309 | 谿 1310 | 䲠 1311 | 鲾 1312 | 鲿 1313 | 鳁 1314 | 鳂 1315 | 鳈 1316 | 鳉 1317 | 獯 1318 | 䗪 1319 | 馘 1320 | 襕 1321 | 襚 1322 | 螱 1323 | 甓 1324 | 嬬 1325 | 嬥 1326 | 𦈡 1327 | 瓀 1328 | 釐 1329 | 鬶 1330 | 爇 1331 | 鞳 1332 | 鞮 1333 | 藟 1334 | 藦 1335 | 藨 1336 | 鹲 1337 | 檫 1338 | 黡 1339 | 礞 1340 | 礌 1341 | 𥖨 1342 | 蹢 1343 | 蹜 1344 | 蟫 1345 | 䗴 1346 | 嚚 1347 | 髃 1348 | 镮 1349 | 镱 1350 | 酂 1351 | 馧 1352 | 簠 1353 | 簝 1354 | 簰 1355 | 鼫 1356 | 鼩 1357 | 皦 1358 | 臑 1359 | 䲢 
1360 | 鳑 1361 | 鳒 1362 | 鹱 1363 | 鹯 1364 | 癗 1365 | 𦒍 1366 | 旞 1367 | 翷 1368 | 冁 1369 | 䎖 1370 | 瀔 1371 | 瀍 1372 | 瀌 1373 | 襜 1374 | 䴙 1375 | 嚭 1376 | 㰀 1377 | 鬷 1378 | 醭 1379 | 蹯 1380 | 蠋 1381 | 翾 1382 | 鳘 1383 | 儳 1384 | 儴 1385 | 鼗 1386 | 𩾌 1387 | 鳚 1388 | 鳛 1389 | 麑 1390 | 麖 1391 | 蠃 1392 | 彟 1393 | 嬿 1394 | 鬒 1395 | 蘘 1396 | 欂 1397 | 醵 1398 | 颥 1399 | 甗 1400 | 𨟠 1401 | 巇 1402 | 酅 1403 | 髎 1404 | 犨 1405 | 𨭉 1406 | 㸌 1407 | 爔 1408 | 瀱 1409 | 瀹 1410 | 瀼 1411 | 瀵 1412 | 襫 1413 | 孅 1414 | 骦 1415 | 耰 1416 | 𤫉 1417 | 瓖 1418 | 鬘 1419 | 趯 1420 | 罍 1421 | 鼱 1422 | 鳠 1423 | 鳡 1424 | 鳣 1425 | 爟 1426 | 爚 1427 | 灈 1428 | 韂 1429 | 糵 1430 | 蘼 1431 | 礵 1432 | 鹴 1433 | 躔 1434 | 皭 1435 | 龢 1436 | 鳤 1437 | 亹 1438 | 籥 1439 | 鼷 1440 | 玃 1441 | 醾 1442 | 齇 1443 | 觿 1444 | 蠼 -------------------------------------------------------------------------------- /thirdparty/古代汉语语料库字频表.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/thirdparty/古代汉语语料库字频表.xls -------------------------------------------------------------------------------- /thirdparty/现代汉语常用字表.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/thirdparty/现代汉语常用字表.xls -------------------------------------------------------------------------------- /thirdparty/现代汉语语料库分词类词频表.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/thirdparty/现代汉语语料库分词类词频表.xls -------------------------------------------------------------------------------- /thirdparty/现代汉语语料库字频表.xls: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/thirdparty/现代汉语语料库字频表.xls -------------------------------------------------------------------------------- /thirdparty/现代汉语语料库词频表.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/thirdparty/现代汉语语料库词频表.xls -------------------------------------------------------------------------------- /thirdparty/现代汉语通用字表.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/thirdparty/现代汉语通用字表.xls -------------------------------------------------------------------------------- /thirdparty/通用规范汉字表.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/thirdparty/通用规范汉字表.xls -------------------------------------------------------------------------------- /tools/analyze/LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2019–2020 Observable, Inc. 2 | 3 | Permission to use, copy, modify, and/or distribute this software for any 4 | purpose with or without fee is hereby granted, provided that the above 5 | copyright notice and this permission notice appear in all copies. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES 8 | WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF 9 | MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR 10 | ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 11 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN 12 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF 13 | OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 14 | -------------------------------------------------------------------------------- /tools/analyze/README.md: -------------------------------------------------------------------------------- 1 | 拼音按键布局在线分析工具 2 | ============================== 3 | 4 | > 本代码改造自 5 | > [Force-Directed Tree](https://observablehq.com/@d3/force-directed-tree@183) 6 | > 和 [Collapsible tree](https://observablehq.com/@d3/collapsible-tree)。 7 | 8 | 在当前目录中执行命令以启动静态页面服务: 9 | 10 | ```sh 11 | npx http-server 12 | ``` 13 | 14 | > 需先安装 [NodeJS](https://nodejs.org/)。 15 | 16 | ## 汉语拼音字母后继树 17 | 18 | 访问地址 http://127.0.0.1:8080/char-tree.html 19 | 以查看拼音的后继字母的树形结构,从而规划出适合滑屏输入的拼音字母的按键布局。 20 | 21 | > 代码为 [char-tree.js](./char-tree.js)。 22 | 23 | ![](./img/pinyin-char-tree.png) 24 | 25 | ## 汉语拼音字母组合树 26 | 27 | 访问地址 http://127.0.0.1:8080/char-links.html 28 | 以查看拼音字母的组合关系。 29 | 30 | > 代码为 [char-links.js](./char-links.js)。 31 | 32 | ![](./img/pinyin-char-links.png) 33 | 34 | ## 汉语拼音划词模拟 35 | 36 | 访问地址 http://127.0.0.1:8080/simulate.html 37 | 以查看规划的按键布局是否符合要求。 38 | 39 | > 代码为 [simulate.js](./simulate.js)。 40 | 41 | ![](./img/pinyin-key-layout.png) 42 | 43 | ## 外部资料 44 | 45 | ### 拼音字母统计频率 46 | 47 | > - 数据来自于[@軒轅羽](https://www.zhihu.com/question/23111438/answer/559582999) 48 | > - 脚本:`echo $data | sort -r -n -k 2` 49 | 50 | | 声母 | 频率 | 51 | | -- | -- | 52 | | d | 10.29% | 53 | | y | 9.69% | 54 | | sh | 8.04% | 55 | | j | 6.86% | 56 | | zh | 6.52% | 57 | | x | 5.86% | 58 | | g | 5.64% | 59 | | l | 4.61% | 60 | | b | 4.49% | 61 | | h | 4.10% | 62 | | z | 3.65% | 63 | | w | 3.26% | 64 | | q | 3.23% | 65 | | ch | 3.02% | 66 | | f | 2.97% | 67 | | m | 2.90% | 68 | | t | 2.79% | 69 
| | r | 2.76% | 70 | | n | 2.18% | 71 | | k | 1.85% | 72 | | s | 1.47% | 73 | | p | 1.40% | 74 | | c | 1.27% | 75 | 76 | | 单字母韵母 | 频率 | 77 | | -- | -- | 78 | | i | 15.81% | 79 | | e | 10.48% | 80 | | u | 9.08% | 81 | | a | 2.95% | 82 | | o | 0.51% | 83 | | ü | 0.40% | 84 | 85 | | 多字母韵母 | 频率 | 86 | | -- | -- | 87 | | ao | 4.04% | 88 | | ai | 3.91% | 89 | | an | 3.78% | 90 | | ang | 3.45% | 91 | | en | 4.02% | 92 | | eng | 3.26% | 93 | | ei | 2.83% | 94 | | er | 0.57% | 95 | | ian | 3.68% | 96 | | ing | 3.68% | 97 | | in | 2.07% | 98 | | iang | 1.57% | 99 | | iao | 1.83% | 100 | | ie | 1.24% | 101 | | ia | 1.18% | 102 | | iu | 1.09% | 103 | | iong | 0.02% | 104 | | ong | 3.83% | 105 | | ou | 3.35% | 106 | | uo | 3.41% | 107 | | uan | 2.73% | 108 | | ui | 2.14% | 109 | | ue | 1.29% | 110 | | un | 0.96% | 111 | | uang | 0.41% | 112 | | ua | 0.35% | 113 | | uai | 0.10% | 114 | | üe | 0.03% | 115 | 116 | ### 英文字母统计频率 117 | 118 | > - 数据来自于[英语单词中首字母的频率](https://zh.wikipedia.org/zh-cn/%E5%AD%97%E6%AF%8D%E9%A2%91%E7%8E%87#.E8.8B.B1.E8.AF.AD.E5.8D.95.E8.AF.8D.E4.B8.AD.E9.A6.96.E5.AD.97.E6.AF.8D.E7.9A.84.E9.A2.91.E7.8E.87) 119 | > - 脚本:`echo $data | sort -r -n -k 2` 120 | 121 | | 字母 | 频率 | 122 | | -- | -- | 123 | | t | 16.671% | 124 | | a | 11.602% | 125 | | s | 7.755% | 126 | | h | 7.232% | 127 | | w | 6.753% | 128 | | i | 6.286% | 129 | | o | 6.264% | 130 | | b | 4.702% | 131 | | m | 4.374% | 132 | | f | 3.779% | 133 | | c | 3.511% | 134 | | l | 2.705% | 135 | | d | 2.670% | 136 | | p | 2.545% | 137 | | n | 2.365% | 138 | | e | 2.007% | 139 | | g | 1.950% | 140 | | r | 1.653% | 141 | | y | 1.620% | 142 | | u | 1.487% | 143 | | v | 0.649% | 144 | | j | 0.597% | 145 | | k | 0.590% | 146 | | q | 0.173% | 147 | | x | 0.037% | 148 | | z | 0.034% | 149 | -------------------------------------------------------------------------------- /tools/analyze/char-links.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 汉语拼音字母组合树 4 | 5 | 6 | 13 
| 14 |
--------------------------------------------------------------------------------
/tools/analyze/char-links.js:
--------------------------------------------------------------------------------
// https://observablehq.com/@d3/force-directed-tree@183
// NOTE(review): the URL above is presumably the Observable notebook this module
// was adapted from — confirm.

/**
 * Header cell: renders the notebook title and the two letter groups as Markdown.
 *
 * @param {Function} md - Tagged-template function that renders Markdown.
 * @returns {*} Whatever `md` returns for the template below.
 */
function _1(md) {
  return md`
## 汉语拼音字母组合树

- 韵母字母(6 个): a i e u ü o
- 声母字母(20 个): n g; z c s h; r x y w; b p m f d t l k j q
`;
}

// The six letters listed as final ("yunmu") letters in the header cell above.
// Any node id that is not in this table is styled as an initial letter.
const yunmuTable = ['a', 'i', 'e', 'u', 'ü', 'o'];

/**
 * Chart cell: builds a force-directed graph of letter-to-letter links.
 *
 * @param {object} d3 - The d3 library.
 * @param {Array<{source: string, target: string}>} data - Link records; the
 *   attached JSON file holds entries such as {"source":"a","target":"i"}.
 * @param {number} width - View box width.
 * @param {number} height - View box height.
 * @param {Function} drag - Factory from `_drag`; called with the simulation.
 * @param {Promise} invalidation - When it resolves, the simulation is stopped.
 * @returns {SVGSVGElement} The SVG node of the chart.
 */
function _chart(d3, data, width, height, drag, invalidation) {
  // Leftovers from the tree-based version of this chart:
  // const root = d3.hierarchy(data);
  // const links = root.links();
  // const nodes = root.descendants();

  // Each record is wrapped in a new object that inherits from it, so properties
  // assigned to the wrapper later (e.g. fx/fy in the drag handlers) do not
  // overwrite the loaded data.
  const links = data.map((d) => Object.create(d));
  // One node per distinct letter that appears as a source or a target.
  const nodes = Array.from(
    new Set(data.flatMap((l) => [l.source, l.target])),
    (id) => ({ id, data: { name: id } })
  ).map((d) => Object.create(d));

  const simulation = d3
    .forceSimulation(nodes)
    .force(
      'link',
      d3
        .forceLink(links)
        .id((d) => d.id)
        .distance(300)
        .strength(0.5)
    )
    .force('charge', d3.forceManyBody().strength(-380))
    .force('x', d3.forceX())
    .force('y', d3.forceY());

  // The view box is centred on the origin; dragging the background pans it.
  const svg = d3
    .create('svg')
    .attr('viewBox', [-width / 2, -height / 2, width, height]);
  svg.call(dragGraph(d3, svg));

  // Link colour:
  //  - neither end is a final letter            -> 'rgb(199, 53, 0)'
  //  - source is j/q/x and target is i/u        -> 'rgb(0, 138, 0)'
  //  - anything else                            -> '#999'
  // It reads `l.source.id`, i.e. it assumes the link force has already replaced
  // the source/target ids with node objects — TODO confirm against d3-force docs.
  const linkStroke = (l) =>
    !yunmuTable.includes(l.source.id) && !yunmuTable.includes(l.target.id)
      ? 'rgb(199, 53, 0)'
      : ['j', 'q', 'x'].includes(l.source.id) &&
        ['i', 'u'].includes(l.target.id)
      ? 'rgb(0, 138, 0)'
      : '#999';
  const linkHighlightStroke = 'rgb(184, 84, 80)';
  const link = svg
    .append('g')
    .selectAll('line')
    .data(links)
    .join('line')
    .attr('stroke', linkStroke)
    .attr('stroke-width', 1.5)
    .attr('stroke-opacity', 0.6);

  // Final letters and all other letters get different fill colours.
  const nodeFill = (d) =>
    yunmuTable.includes(d.data.name)
      ? 'rgb(225, 213, 231)'
      : 'rgb(255, 242, 204)';
  const nodeStroke = 'rgb(108, 142, 191)';
  const nodeHighlightFill = 'rgb(248, 206, 204)';
  const nodeHighlightStroke = 'rgb(184, 84, 80)';
  const node = svg
    .append('g')
    .attr('fill', '#fff')
    .attr('stroke-linecap', 'round')
    .attr('stroke-linejoin', 'round')
    .attr('style', 'cursor: pointer')
    .selectAll('g')
    .data(nodes)
    .join('g')
    .call(drag(simulation));
  node
    .append('circle')
    .attr('stroke-width', 3)
    .attr('fill', nodeFill)
    .attr('stroke', nodeStroke)
    .attr('r', 18)
    .on('mouseenter', (evt, d) => {
      // Collect the links that start at the hovered node, plus both of their
      // end points. Incoming links are deliberately excluded (the target check
      // below is commented out).
      const matchedLinks = [];
      const matchedNodeIds = {};
      link.each((lnk) => {
        if (
          lnk.source.id === d.id
          //
          // || lnk.target.id === d.id
        ) {
          matchedLinks.push(lnk);
          matchedNodeIds[lnk.source.id] = matchedNodeIds[lnk.target.id] = true;
        }
      });

      // Hide every link, then show and recolour only the matched ones.
      link
        .attr('display', 'none')
        .filter((l) => matchedLinks.includes(l))
        .attr('display', 'block')
        .attr('stroke', linkHighlightStroke);

      // Reset every circle's stroke, then highlight the matched circles.
      node
        .selectAll('circle')
        .attr('stroke', nodeStroke)
        .filter((n) => matchedNodeIds[n.id])
        .attr('fill', nodeHighlightFill)
        .attr('stroke', nodeHighlightStroke);
    })
    .on('mouseleave', (evt) => {
      // Restore the default appearance of all links and circles.
      link.attr('display', 'block').attr('stroke', linkStroke);
      node
        .selectAll('circle')
        .attr('fill', nodeFill)
        .attr('stroke', nodeStroke);
    });
  // Letter label; pointer events are disabled on it so the circle underneath
  // receives the hover events.
  node
    .append('text')
    .attr('x', -5)
    .attr('y', '0.31em')
    .text((d) => d.data.name)
    .attr('fill', '#000')
    .attr('stroke', '#000')
    .attr('style', 'pointer-events: none');

  // On every simulation tick, move the lines and node groups to the current
  // node coordinates.
  simulation.on('tick', () => {
    link
      .attr('x1', (d) => d.source.x)
      .attr('y1', (d) => d.source.y)
      .attr('x2', (d) => d.target.x)
      .attr('y2', (d) => d.target.y);

    node.attr('transform', (d) => `translate(${d.x},${d.y})`);
  });

  invalidation.then(() => simulation.stop());

  return svg.node();
}

/**
 * Data cell: loads the link list through the attachment named
 * 'pinyin-tree.json' (mapped to ./files/char-links.json in `define`).
 */
function _data(FileAttachment) {
  return FileAttachment('pinyin-tree.json').json();
}

/** Height cell: fixed chart height. */
function _height() {
  return 800;
}

/**
 * Drag cell: returns a factory that, given a simulation, builds the d3 drag
 * behaviour applied to the node groups.
 */
function _drag(d3) {
  return (simulation) => {
    // Pin the node at its current position and raise the alpha target.
    function dragstarted(event, d) {
      if (!event.active) simulation.alphaTarget(0.3).restart();
      d.fx = d.x;
      d.fy = d.y;
    }

    // Keep the pinned position under the pointer.
    function dragged(event, d) {
      d.fx = event.x;
      d.fy = event.y;
    }

    // Unpin the node and reset the alpha target.
    function dragended(event, d) {
      if (!event.active) simulation.alphaTarget(0);
      d.fx = null;
      d.fy = null;
    }

    return d3
      .drag()
      .on('start', dragstarted)
      .on('drag', dragged)
      .on('end', dragended);
  };
}

/**
 * Builds the drag behaviour used on the SVG itself: dragging shifts the view
 * box origin by the pointer delta, and the cursor is 'move' while dragging.
 *
 * @param {object} d3 - The d3 library.
 * @param {object} svg - d3 selection of the chart's SVG element.
 * @returns {object} The configured d3 drag behaviour.
 */
function dragGraph(d3, svg) {
  function dragstarted(event) {
    svg.node().style.cursor = 'move';
  }

  function dragged(event) {
    const viewBox = svg.node().viewBox.baseVal;
    // Only the origin changes; width and height are carried over unchanged.
    const newViewBox = {
      x: viewBox.x - event.dx,
      y: viewBox.y - event.dy,
      width: viewBox.width,
      height: viewBox.height
    };

    updateSvgViewBox(svg, newViewBox);
  }

  function dragended(event) {
    svg.node().style.cursor = '';
  }

  return d3
    .drag()
    .on('start', dragstarted)
    .on('drag', dragged)
    .on('end', dragended);
}

/** Writes `viewBox` ({x, y, width, height}) to the SVG's viewBox attribute. */
function updateSvgViewBox(svg, viewBox) {
  svg.attr('viewBox', [viewBox.x, viewBox.y, viewBox.width, viewBox.height]);
}

/** d3 cell: loads d3 through the supplied `require` function. */
function _d3(require) {
  return require('d3@6');
}

/**
 * Module definition: registers the file attachment and wires the cells above
 * into a module created from `runtime`.
 *
 * The cells also name 'md', 'width', 'invalidation' and 'require' as inputs;
 * none of them is defined in this module, so they are presumably supplied by
 * the runtime's standard library — TODO confirm.
 *
 * @param {object} runtime - Runtime providing `module()` and `fileAttachments()`.
 * @param {Function} observer - Called with an optional cell name; its result is
 *   passed to `main.variable(...)`.
 * @returns {object} The populated module.
 */
export function define(runtime, observer) {
  const main = runtime.module();
  function toString() {
    return this.url;
  }
  // The attachment name used by `_data` differs from the file on disk.
  const fileAttachments = new Map([
    [
      'pinyin-tree.json',
      {
        url: new URL('./files/char-links.json', import.meta.url),
        mimeType: 'application/json',
        toString
      }
    ]
  ]);
  main.builtin(
    'FileAttachment',
    runtime.fileAttachments((name) => fileAttachments.get(name))
  );
  main.variable(observer()).define(['md'], _1);
  main
    .variable(observer('chart'))
    .define(
      'chart',
      ['d3', 'data', 'width', 'height', 'drag', 'invalidation'],
      _chart
    );
  main.variable(observer('data')).define('data', ['FileAttachment'], _data);
  main.variable(observer('height')).define('height', _height);
  main.variable(observer('drag')).define('drag', ['d3'], _drag);
  main.variable(observer('d3')).define('d3', ['require'], _d3);
  return main;
}

--------------------------------------------------------------------------------
/tools/analyze/char-tree.html:
--------------------------------------------------------------------------------
1 | 2 | 3 | 汉语拼音字母后继树 4 | 5 | 6 | 13 | 14 |
--------------------------------------------------------------------------------
/tools/analyze/char-tree.js:
--------------------------------------------------------------------------------
/**
 * Header cell: renders the notebook title as Markdown.
 *
 * @param {Function} md - Tagged-template function that renders Markdown.
 * @returns {*} Whatever `md` returns for the template below.
 */
function _1(md) {
  return md`
## 汉语拼音字母后继树
`;
}

/**
 * Chart cell: builds a collapsible, horizontally laid-out tree.
 *
 * Node data fields read by this function: `name`, `level`, `pinyin` and the
 * `children` consumed by `d3.hierarchy`.
 *
 * @param {object} d3 - The d3 library.
 * @param {object} data - Root of the hierarchy passed to `d3.hierarchy`.
 * @returns {SVGSVGElement} The SVG node of the chart.
 */
function _chart(d3, data) {
  const width = window.innerWidth;

  // Fill colour of named internal nodes, keyed by `data.level`.
  const levelNodeFill = {
    0: 'rgb(157, 23, 77)',
    1: 'rgb(21, 94, 117)',
    2: 'rgb(91, 33, 182)'
  };
  // Internal nodes (expanded or collapsed): coloured by level when they have a
  // name, '#555' when they do not. Leaves always use the last colour.
  const nodeFill = (d) =>
    d._children || d.children
      ? d.data.name
        ? levelNodeFill[d.data.level]
        : '#555'
      : 'rgb(91, 33, 182)';

  const marginTop = 40;
  const marginRight = 10;
  const marginBottom = 40;
  const marginLeft = 40;

  // Rows are separated by dx pixels, columns by dy pixels. These names can be
  // counter-intuitive (dx is a height, and dy a width) because the layout's x
  // axis is drawn vertically and its y axis horizontally (see `diagonal` and
  // the translate() calls below). The width of a column is based on the
  // tree's height.
  const root = d3.hierarchy(data);
  const dx = 35;
  const dy = (width - marginRight - marginLeft) / (1 + root.height);

  // Define the tree layout and the shape for links.
  const tree = d3.tree().nodeSize([dx, dy]);
  const diagonal = d3
    .linkHorizontal()
    .x((d) => d.y)
    .y((d) => d.x);

  // Create the SVG container, a layer for the links and a layer for the nodes.
  // The initial height is a placeholder; update() sets the real one.
  const svg = d3
    .create('svg')
    .attr('width', width)
    .attr('height', dx)
    .attr('viewBox', [-marginLeft, -marginTop, width, dx])
    .attr(
      'style',
      'max-width: 100%; height: auto; font: 10px sans-serif; user-select: none;'
    );

  const gLink = svg
    .append('g')
    .attr('fill', 'none')
    .attr('stroke', '#555')
    .attr('stroke-opacity', 0.4)
    .attr('stroke-width', 1.5);

  const gNode = svg.append('g').attr('pointer-events', 'all');

  // Re-lays out the tree and animates nodes/links. `source` is the node whose
  // stored position (x0/y0) entering elements start from and whose new
  // position (x/y) exiting elements move to. `event` may be null.
  function update(event, source) {
    const duration = event?.altKey ? 2500 : 250; // hold the alt key to slow down the transition
    const nodes = root.descendants().reverse();
    const links = root.links();

    // Compute the new tree layout.
    tree(root);

    // Find the nodes with the smallest and largest x to size the SVG.
    let left = root;
    let right = root;
    root.eachBefore((node) => {
      if (node.x < left.x) left = node;
      if (node.x > right.x) right = node;
    });

    const height = right.x - left.x + marginTop + marginBottom;

    const transition = svg
      .transition()
      .duration(duration)
      .attr('height', height)
      .attr('viewBox', [-marginLeft, left.x - marginTop, width, height])
      .tween(
        'resize',
        window.ResizeObserver ? null : () => () => svg.dispatch('toggle')
      );

    // Update the nodes…
    const node = gNode.selectAll('g').data(nodes, (d) => d.id);

    // Enter any new nodes at the parent's previous position.
    const nodeEnter = node
      .enter()
      .append('g')
      .attr('cursor', (d) => (d._children || d.children ? 'pointer' : ''))
      .attr('transform', (d) => `translate(${source.y0},${source.x0})`)
      .attr('fill-opacity', 0)
      .attr('stroke-opacity', 0)
      .on('click', (event, d) => {
        if (d.data.name) {
          // Named node: toggle its own children.
          d.children = d.children ? null : d._children;
          update(event, d);
        } else {
          // Nameless node: toggle the children of each of its children.
          (d.children || d._children).forEach((child) => {
            child.children = child.children ? null : child._children;
            update(event, child);
          });
        }
      });

    // Leaves get a larger circle; nodes with a truthy `data.pinyin` get a
    // coloured outline.
    nodeEnter
      .append('circle')
      .attr('r', (d) => (d.children || d._children ? 12 : 16))
      .attr('fill', nodeFill)
      .attr('stroke', (d) => (d.data.pinyin ? 'rgb(236, 72, 153)' : ''))
      .attr('stroke-width', 4);

    // Label offset: leaves whose name is longer than two characters are
    // shifted further left.
    nodeEnter
      .append('text')
      .text((d) => d.data.name)
      .attr('style', 'font-size: 16px')
      .attr('dy', '0.31em')
      .attr('x', (d) =>
        d.children || d._children ? -6 : d.data.name.length > 2 ? -12 : -6
      )
      .attr('fill', 'rgb(209, 213, 219)')
      .attr('stroke', 'rgb(209, 213, 219)');

    // Transition nodes to their new position.
    node
      .merge(nodeEnter)
      .transition(transition)
      .attr('transform', (d) => `translate(${d.y},${d.x})`)
      .attr('fill-opacity', 1)
      .attr('stroke-opacity', 1);

    // Transition exiting nodes to the parent's new position.
    node
      .exit()
      .transition(transition)
      .remove()
      .attr('transform', (d) => `translate(${source.y},${source.x})`)
      .attr('fill-opacity', 0)
      .attr('stroke-opacity', 0);

    // Update the links…
    const link = gLink.selectAll('path').data(links, (d) => d.target.id);

    // Enter any new links at the parent's previous position.
    const linkEnter = link
      .enter()
      .append('path')
      .attr('d', (d) => {
        const o = { x: source.x0, y: source.y0 };
        return diagonal({ source: o, target: o });
      });

    // Transition links to their new position.
    link.merge(linkEnter).transition(transition).attr('d', diagonal);

    // Transition exiting links to the parent's new position.
    link
      .exit()
      .transition(transition)
      .remove()
      .attr('d', (d) => {
        const o = { x: source.x, y: source.y };
        return diagonal({ source: o, target: o });
      });

    // Stash the old positions for transition.
    root.eachBefore((d) => {
      d.x0 = d.x;
      d.y0 = d.y;
    });
  }

  // Do the first update to the initial configuration of the tree. Every node
  // gets a numeric id and a `_children` backup of its children. All nodes
  // start expanded, because the line that would collapse some of them is
  // commented out below.
  root.x0 = dy / 2;
  root.y0 = 0;
  root.descendants().forEach((d, i) => {
    d.id = i;
    d._children = d.children;
    // Note: setting d.children = null means "do not expand this node".
    // if (d.depth && d.data.name.length !== 7) d.children = null;
  });

  update(null, root);

  return svg.node();
}

/**
 * Data cell: loads the hierarchy through the attachment named 'data.json'
 * (mapped to ./files/char-tree.json in `define`).
 */
function _data(FileAttachment) {
  return FileAttachment('data.json').json();
}

/**
 * Module definition: registers the file attachment and wires the cells above
 * into a module created from `runtime`.
 *
 * 'md' is named as an input but is not defined in this module, so it is
 * presumably supplied by the runtime's standard library — TODO confirm.
 *
 * @param {object} runtime - Runtime providing `module()` and `fileAttachments()`.
 * @param {Function} observer - Called with an optional cell name; its result is
 *   passed to `main.variable(...)`.
 * @returns {object} The populated module.
 */
export function define(runtime, observer) {
  const main = runtime.module();
  function toString() {
    return this.url;
  }
  const fileAttachments = new Map([
    [
      'data.json',
      {
        url: new URL('./files/char-tree.json', import.meta.url),
        mimeType: 'application/json',
        toString
      }
    ]
  ]);
  main.builtin(
    'FileAttachment',
    runtime.fileAttachments((name) => fileAttachments.get(name))
  );
  main.variable(observer()).define(['md'], _1);
  main.variable(observer('chart')).define('chart', ['d3', 'data'], _chart);
  main.variable(observer('data')).define('data', ['FileAttachment'], _data);
  return main;
}

--------------------------------------------------------------------------------
/tools/analyze/files/char-links.json:
--------------------------------------------------------------------------------
1 | [{"source":"a","target":"i"},{"source":"a","target":"n"},{"source":"a","target":"o"},{"source":"n","target":"g"},{"source":"n","target":"a"},{"source":"n","target":"e"},{"source":"n","target":"i"},{"source":"n","target":"o"},{"source":"n","target":"u"},{"source":"n","target":"ü"},{"source":"b","target":"a"},{"source":"b","target":"e"},{"source":"b","target":"i"},{"source":"b","target":"o"},{"source":"b","target":"u"},{"source":"e","target":"i"},{"source":"e","target":"n"},{"source":"e","target":"r"},{"source":"i","target":"a"},{"source":"i","target":"e"},{"source":"i","target":"n"},{"source":"i","target":"u"},{"source":"i","target":"o"},{"source":"c","target":"a"},{"source":"c","target":"e"},{"source":"c","target":"h"},{"source":"c","target":"i"},{"source":"c","target":"o"},{"source":"c","target":"u"},{"source":"h","target":"a"},{"source":"h","target":"e"},{"source":"h","target":"i"},{"source":"h","target":"o"},{"source":"h","target":"u"},{"source":"h","target":"m"},{"source":"h","target":"n"},{"source":"o","target":"n"},{"source":"o","target":"u"},{"source":"u","target":"a"},{"source":"u","target":"i"},{"source":"u","target":"n"},{"source":"u","target":"o"},{"source":"u","target":"e"},{"source":"d","target":"a"},{"source":"d","target":"e"},{"source":"d","target":"i"},{"source":"d","target":"o"},{"source":"d","target":"u"},{"source":"f","target":"a"},{"source":"f","target":"e"},{"source":"f","target":"i"},{"source":"f","target":"o"},{"source":"f","target":"u"},{"source":"g","target":"a"},{"source":"g","target":"e"},{"source":"g","target":"o"},{"source":"g","target":"u"},{"source":"j","target":"i"},{"source":"j","target":"u"},{"source":"k","target":"a"},{"sourc
e":"k","target":"e"},{"source":"k","target":"o"},{"source":"k","target":"u"},{"source":"l","target":"a"},{"source":"l","target":"e"},{"source":"l","target":"i"},{"source":"l","target":"o"},{"source":"l","target":"u"},{"source":"l","target":"ü"},{"source":"ü","target":"e"},{"source":"m","target":"a"},{"source":"m","target":"e"},{"source":"m","target":"i"},{"source":"m","target":"o"},{"source":"m","target":"u"},{"source":"p","target":"a"},{"source":"p","target":"e"},{"source":"p","target":"i"},{"source":"p","target":"o"},{"source":"p","target":"u"},{"source":"q","target":"i"},{"source":"q","target":"u"},{"source":"r","target":"a"},{"source":"r","target":"e"},{"source":"r","target":"i"},{"source":"r","target":"o"},{"source":"r","target":"u"},{"source":"s","target":"a"},{"source":"s","target":"e"},{"source":"s","target":"h"},{"source":"s","target":"i"},{"source":"s","target":"o"},{"source":"s","target":"u"},{"source":"t","target":"a"},{"source":"t","target":"e"},{"source":"t","target":"i"},{"source":"t","target":"o"},{"source":"t","target":"u"},{"source":"w","target":"a"},{"source":"w","target":"e"},{"source":"w","target":"o"},{"source":"w","target":"u"},{"source":"x","target":"i"},{"source":"x","target":"u"},{"source":"y","target":"a"},{"source":"y","target":"e"},{"source":"y","target":"i"},{"source":"y","target":"o"},{"source":"y","target":"u"},{"source":"z","target":"a"},{"source":"z","target":"e"},{"source":"z","target":"h"},{"source":"z","target":"i"},{"source":"z","target":"o"},{"source":"z","target":"u"}] 2 | -------------------------------------------------------------------------------- /tools/analyze/files/pinyin.txt: -------------------------------------------------------------------------------- 1 | a 2 | ai 3 | an 4 | ang 5 | ao 6 | ba 7 | bai 8 | ban 9 | bang 10 | bao 11 | bei 12 | ben 13 | beng 14 | bi 15 | bian 16 | biao 17 | bie 18 | bin 19 | bing 20 | bo 21 | bu 22 | ca 23 | cai 24 | can 25 | cang 26 | cao 27 | ce 28 | cen 29 | ceng 30 | cha 31 | chai 
32 | chan 33 | chang 34 | chao 35 | che 36 | chen 37 | cheng 38 | chi 39 | chong 40 | chou 41 | chu 42 | chua 43 | chuai 44 | chuan 45 | chuang 46 | chui 47 | chun 48 | chuo 49 | ci 50 | cong 51 | cou 52 | cu 53 | cuan 54 | cui 55 | cun 56 | cuo 57 | da 58 | dai 59 | dan 60 | dang 61 | dao 62 | de 63 | dei 64 | den 65 | deng 66 | di 67 | dia 68 | dian 69 | diao 70 | die 71 | ding 72 | diu 73 | dong 74 | dou 75 | du 76 | duan 77 | dui 78 | dun 79 | duo 80 | e 81 | ei 82 | en 83 | eng 84 | er 85 | fa 86 | fan 87 | fang 88 | fei 89 | fen 90 | feng 91 | fiao 92 | fo 93 | fou 94 | fu 95 | ga 96 | gai 97 | gan 98 | gang 99 | gao 100 | ge 101 | gei 102 | gen 103 | geng 104 | gong 105 | gou 106 | gu 107 | gua 108 | guai 109 | guan 110 | guang 111 | gui 112 | gun 113 | guo 114 | ha 115 | hai 116 | han 117 | hang 118 | hao 119 | he 120 | hei 121 | hen 122 | heng 123 | hm 124 | hng 125 | hong 126 | hou 127 | hu 128 | hua 129 | huai 130 | huan 131 | huang 132 | hui 133 | hun 134 | huo 135 | ji 136 | jia 137 | jian 138 | jiang 139 | jiao 140 | jie 141 | jin 142 | jing 143 | jiong 144 | jiu 145 | ju 146 | juan 147 | jue 148 | jun 149 | ka 150 | kai 151 | kan 152 | kang 153 | kao 154 | ke 155 | kei 156 | ken 157 | keng 158 | kong 159 | kou 160 | ku 161 | kua 162 | kuai 163 | kuan 164 | kuang 165 | kui 166 | kun 167 | kuo 168 | la 169 | lai 170 | lan 171 | lang 172 | lao 173 | le 174 | lei 175 | leng 176 | li 177 | lia 178 | lian 179 | liang 180 | liao 181 | lie 182 | lin 183 | ling 184 | liu 185 | lo 186 | long 187 | lou 188 | lu 189 | luan 190 | lun 191 | luo 192 | lü 193 | lüe 194 | m 195 | ma 196 | mai 197 | man 198 | mang 199 | mao 200 | me 201 | mei 202 | men 203 | meng 204 | mi 205 | mian 206 | miao 207 | mie 208 | min 209 | ming 210 | miu 211 | mo 212 | mou 213 | mu 214 | n 215 | na 216 | nai 217 | nan 218 | nang 219 | nao 220 | ne 221 | nei 222 | nen 223 | neng 224 | ng 225 | ni 226 | nian 227 | niang 228 | niao 229 | nie 230 | nin 231 | ning 232 | niu 233 | nong 234 | 
nou 235 | nu 236 | nuan 237 | nun 238 | nuo 239 | nü 240 | nüe 241 | o 242 | ou 243 | pa 244 | pai 245 | pan 246 | pang 247 | pao 248 | pei 249 | pen 250 | peng 251 | pi 252 | pian 253 | piao 254 | pie 255 | pin 256 | ping 257 | po 258 | pou 259 | pu 260 | qi 261 | qia 262 | qian 263 | qiang 264 | qiao 265 | qie 266 | qin 267 | qing 268 | qiong 269 | qiu 270 | qu 271 | quan 272 | que 273 | qun 274 | ran 275 | rang 276 | rao 277 | re 278 | ren 279 | reng 280 | ri 281 | rong 282 | rou 283 | ru 284 | rua 285 | ruan 286 | rui 287 | run 288 | ruo 289 | sa 290 | sai 291 | san 292 | sang 293 | sao 294 | se 295 | sen 296 | seng 297 | sha 298 | shai 299 | shan 300 | shang 301 | shao 302 | she 303 | shei 304 | shen 305 | sheng 306 | shi 307 | shou 308 | shu 309 | shua 310 | shuai 311 | shuan 312 | shuang 313 | shui 314 | shun 315 | shuo 316 | si 317 | song 318 | sou 319 | su 320 | suan 321 | sui 322 | sun 323 | suo 324 | ta 325 | tai 326 | tan 327 | tang 328 | tao 329 | te 330 | teng 331 | ti 332 | tian 333 | tiao 334 | tie 335 | ting 336 | tong 337 | tou 338 | tu 339 | tuan 340 | tui 341 | tun 342 | tuo 343 | wa 344 | wai 345 | wan 346 | wang 347 | wei 348 | wen 349 | weng 350 | wo 351 | wu 352 | xi 353 | xia 354 | xian 355 | xiang 356 | xiao 357 | xie 358 | xin 359 | xing 360 | xiong 361 | xiu 362 | xu 363 | xuan 364 | xue 365 | xun 366 | ya 367 | yan 368 | yang 369 | yao 370 | ye 371 | yi 372 | yin 373 | ying 374 | yo 375 | yong 376 | you 377 | yu 378 | yuan 379 | yue 380 | yun 381 | za 382 | zai 383 | zan 384 | zang 385 | zao 386 | ze 387 | zei 388 | zen 389 | zeng 390 | zha 391 | zhai 392 | zhan 393 | zhang 394 | zhao 395 | zhe 396 | zhei 397 | zhen 398 | zheng 399 | zhi 400 | zhong 401 | zhou 402 | zhu 403 | zhua 404 | zhuai 405 | zhuan 406 | zhuang 407 | zhui 408 | zhun 409 | zhuo 410 | zi 411 | zong 412 | zou 413 | zu 414 | zuan 415 | zui 416 | zun 417 | zuo 418 | -------------------------------------------------------------------------------- 
/tools/analyze/img/pinyin-char-links.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/tools/analyze/img/pinyin-char-links.png -------------------------------------------------------------------------------- /tools/analyze/img/pinyin-char-tree.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/tools/analyze/img/pinyin-char-tree.png -------------------------------------------------------------------------------- /tools/analyze/img/pinyin-key-layout.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/tools/analyze/img/pinyin-key-layout.png -------------------------------------------------------------------------------- /tools/analyze/playground.css: -------------------------------------------------------------------------------- 1 | html, 2 | body { 3 | width: 100%; 4 | height: 100%; 5 | } 6 | 7 | #playground { 8 | width: 100%; 9 | height: 100%; 10 | display: flex; 11 | flex-direction: column; 12 | } 13 | 14 | .input-result { 15 | margin: 1rem 0; 16 | height: 2rem; 17 | font-size: 2rem; 18 | line-height: 1; 19 | text-align: center; 20 | } 21 | 22 | .graph { 23 | flex: 1; 24 | } 25 | 26 | .trigger { 27 | fill: #f7f5f7; 28 | stroke: #1f1e1e; 29 | stroke-width: 1px; 30 | } 31 | 32 | .trigger:hover { 33 | fill: #959696; 34 | } 35 | 36 | .sector:hover { 37 | fill: #bc1dca; 38 | } 39 | 40 | .label { 41 | fill: #000000; 42 | pointer-events: none; 43 | font-size: 2rem; 44 | } 45 | -------------------------------------------------------------------------------- /tools/analyze/playground.html: -------------------------------------------------------------------------------- 
1 | 2 | 3 | 4 | 5 | 6 | 交互设计演示 7 | 8 | 9 |
10 |
11 | 12 | 13 | 14 |
15 | 16 | 17 | -------------------------------------------------------------------------------- /tools/analyze/playground.js: -------------------------------------------------------------------------------- 1 | const graph = document.querySelector('.graph'); 2 | const board = document.querySelector('.board'); 3 | const center = { x: graph.clientWidth / 2, y: graph.clientHeight / 2 }; 4 | 5 | // 添加一个Math.radians方法,用于将度转换为弧度 6 | Math.radians = function (degrees) { 7 | return (degrees * Math.PI) / 180; 8 | }; 9 | 10 | const pinyinNextStrChars = { 11 | zh: 'zha,zhai,zhan,zhao,zhang,zhe,zhei,zhen,zheng,zhi,zhou,zhong,zhu,zhua,zhui,zhun,zhuo,zhuai,zhuan,zhuang', 12 | ch: 'cha,chai,chan,chao,chang,che,chen,cheng,chi,chou,chong,chu,chua,chui,chun,chuo,chuai,chuan,chuang', 13 | sh: 'sha,shai,shan,shao,shang,she,shei,shen,sheng,shi,shou,shu,shua,shui,shun,shuo,shuai,shuan,shuang', 14 | n: 'n,na,nai,nan,nao,nang,ne,nei,nen,neng,ng,ni,nie,nin,niu,nian,niao,ning,niang,nou,nong,nu,nun,nuo,nuan,nü,nüe', 15 | l: 'la,lai,lan,lao,lang,le,lei,leng,li,lia,lie,lin,liu,lian,liao,ling,liang,lo,lou,long,lu,lun,luo,luan,lü,lüe', 16 | d: 'da,dai,dan,dao,dang,de,dei,den,deng,di,dia,die,diu,dian,diao,ding,dou,dong,du,dui,dun,duo,duan', 17 | h: 'ha,hai,han,hao,hang,he,hei,hen,heng,hm,hng,hou,hong,hu,hua,hui,hun,huo,huai,huan,huang', 18 | m: 'm,ma,mai,man,mao,mang,me,mei,men,meng,mi,mie,min,miu,mian,miao,ming,mo,mou,mu', 19 | g: 'ga,gai,gan,gao,gang,ge,gei,gen,geng,gou,gong,gu,gua,gui,gun,guo,guai,guan,guang', 20 | k: 'ka,kai,kan,kao,kang,ke,kei,ken,keng,kou,kong,ku,kua,kui,kun,kuo,kuai,kuan,kuang', 21 | t: 'ta,tai,tan,tao,tang,te,teng,ti,tie,tian,tiao,ting,tou,tong,tu,tui,tun,tuo,tuan', 22 | p: 'pa,pai,pan,pao,pang,pei,pen,peng,pi,pie,pin,pian,piao,ping,po,pou,pu', 23 | z: 'za,zai,zan,zao,zang,ze,zei,zen,zeng,zi,zou,zong,zu,zui,zun,zuo,zuan', 24 | b: 'ba,bai,ban,bao,bang,bei,ben,beng,bi,bie,bin,bian,biao,bing,bo,bu', 25 | c: 'ca,cai,can,cao,cang,ce,cen,ceng,ci,cou,cong,cu,cui,cun,cuo,cuan', 26 
| s: 'sa,sai,san,sao,sang,se,sen,seng,si,sou,song,su,sui,sun,suo,suan', 27 | r: 'ran,rao,rang,re,ren,reng,ri,rou,rong,ru,rua,rui,run,ruo,ruan', 28 | y: 'ya,yan,yao,yang,ye,yi,yin,ying,yo,you,yong,yu,yue,yun,yuan', 29 | j: 'ji,jia,jie,jin,jiu,jian,jiao,jing,jiang,jiong,ju,jue,jun,juan', 30 | q: 'qi,qia,qie,qin,qiu,qian,qiao,qing,qiang,qiong,qu,que,qun,quan', 31 | x: 'xi,xia,xie,xin,xiu,xian,xiao,xing,xiang,xiong,xu,xue,xun,xuan', 32 | f: 'fa,fan,fang,fei,fen,feng,fiao,fo,fou,fu', 33 | w: 'wa,wai,wan,wang,wei,wen,weng,wo,wu', 34 | a: 'a,ai,an,ang,ao', 35 | e: 'e,ei,en,eng,er', 36 | o: 'o,ou' 37 | }; 38 | // {zh: {a: {'': true, i: true, n: true, o: true, ng: true}}} 39 | const pinyinNextChars = {}; 40 | const pinyinAll = {}; 41 | const pinyinStartChars = Object.keys(pinyinNextStrChars); 42 | pinyinStartChars.forEach((ch) => { 43 | const nextChars = pinyinNextStrChars[ch].split(/,/).map((c) => { 44 | pinyinAll[c] = true; 45 | 46 | return c.substr(ch.length); 47 | }); 48 | 49 | const nextMap = (pinyinNextChars[ch] = {}); 50 | nextChars.forEach((c) => { 51 | const first = c.substring(0, 1); 52 | const left = c.substr(1); 53 | 54 | nextMap[first] ||= {}; 55 | nextMap[first][left] = true; 56 | }); 57 | }); 58 | 59 | graph.onmouseup = function () { 60 | nextState('end'); 61 | }; 62 | 63 | document.addEventListener('touchstart', function (e) { 64 | if (e.target.getAttribute('type') === 'trigger') { 65 | nextState('start'); 66 | } 67 | }); 68 | 69 | document.addEventListener('touchmove', function (e) { 70 | const touch = e.touches[0]; 71 | const target = document.elementFromPoint(touch.clientX, touch.clientY); 72 | if (!target) { 73 | return; 74 | } 75 | 76 | if (target.getAttribute('type') === 'trigger') { 77 | nextState('next'); 78 | } else if (target.getAttribute('type') === 'sector') { 79 | nextState('choose', target.getAttribute('id')); 80 | } 81 | }); 82 | 83 | document.addEventListener('touchend', function (e) { 84 | nextState('end'); 85 | }); 86 | 87 | function 
drawTrigger(graph, center, radius) { 88 | const circle = document.createElementNS( 89 | 'http://www.w3.org/2000/svg', 90 | 'circle' 91 | ); 92 | circle.setAttribute('type', 'trigger'); 93 | circle.setAttribute('class', 'trigger'); 94 | circle.setAttribute('r', radius); 95 | circle.setAttribute('cx', center.x); 96 | circle.setAttribute('cy', center.y); 97 | 98 | circle.onmousedown = function () { 99 | nextState('start'); 100 | }; 101 | circle.onmouseover = function () { 102 | nextState('next'); 103 | }; 104 | 105 | graph.appendChild(circle); 106 | } 107 | 108 | const sectorChars = {}; 109 | const numSectors = 12; 110 | function drawPinyinSectors(graph, center, innerRadius, outerRadius) { 111 | const anglePerSector = 360 / numSectors; 112 | const startAngle = 135; 113 | const charSize = Math.round(pinyinStartChars.length / numSectors); 114 | 115 | for (let i = 0; i < numSectors; i++) { 116 | const sectorId = `sector_${i}`; 117 | const sectorTextId = `${sectorId}_text`; 118 | const sectorStartAngle = startAngle + i * anglePerSector; 119 | const sectorEndAngle = sectorStartAngle + anglePerSector; 120 | 121 | sectorChars[sectorId] = pinyinStartChars.slice( 122 | i * charSize, 123 | i == numSectors - 1 ? 
pinyinStartChars.length : (i + 1) * charSize 124 | ); 125 | 126 | const sectorInner = { 127 | radius: innerRadius, 128 | start: { 129 | x: center.x + innerRadius * Math.cos(Math.radians(sectorStartAngle)), 130 | y: center.y - innerRadius * Math.sin(Math.radians(sectorStartAngle)) 131 | }, 132 | end: { 133 | x: center.x + innerRadius * Math.cos(Math.radians(sectorEndAngle)), 134 | y: center.y - innerRadius * Math.sin(Math.radians(sectorEndAngle)) 135 | } 136 | }; 137 | const sectorOuter = { 138 | radius: outerRadius, 139 | start: { 140 | x: center.x + outerRadius * Math.cos(Math.radians(sectorStartAngle)), 141 | y: center.y - outerRadius * Math.sin(Math.radians(sectorStartAngle)) 142 | }, 143 | end: { 144 | x: center.x + outerRadius * Math.cos(Math.radians(sectorEndAngle)), 145 | y: center.y - outerRadius * Math.sin(Math.radians(sectorEndAngle)) 146 | } 147 | }; 148 | 149 | const path = document.createElementNS('http://www.w3.org/2000/svg', 'path'); 150 | path.setAttribute('id', sectorId); 151 | path.setAttribute('type', 'sector'); 152 | path.setAttribute('class', 'sector'); 153 | // M100,100 表示移动到中心点 (100,100)。 154 | // L100,20 绘制一条从中心点到圆边缘的线(半径线)。 155 | // A80,80 0 0,1 183.643,70.588 绘制一个椭圆弧,其中80,80是椭圆的x轴半径和y轴半径(因为是一个圆,所以两个半径相等),0 0,1表示弧的旋转角度、大弧标志和顺时针标志,183.643,70.588是弧的终点坐标。 156 | // L100,100 绘制另一条半径线回到中心点。 157 | // Z 表示闭合路径,形成一个扇形。 158 | path.setAttribute( 159 | 'd', 160 | `M${sectorInner.start.x},${sectorInner.start.y} 161 | L${sectorOuter.start.x},${sectorOuter.start.y} 162 | A${sectorOuter.radius},${sectorOuter.radius} 0 0,0 ${sectorOuter.end.x},${sectorOuter.end.y} 163 | L${sectorInner.end.x},${sectorInner.end.y} 164 | A${sectorInner.radius},${sectorInner.radius} 0 0,1 ${sectorInner.start.x},${sectorInner.start.y} 165 | Z` 166 | ); 167 | path.setAttribute('fill', `hsl(${i * (360 / numSectors)}, 100%, 50%)`); 168 | 169 | graph.appendChild(path); 170 | 171 | const text = document.createElementNS('http://www.w3.org/2000/svg', 'text'); 172 | const textPos = { 
/**
 * Deep-merges the given objects, left to right, into a brand-new object.
 *
 * - Two arrays under the same key are concatenated element by element
 *   (nested arrays are kept as elements, not flattened).
 * - Two objects under the same key are merged recursively.
 * - Anything else: the later value replaces the earlier one.
 *
 * Sources that are not objects (null, undefined, primitives) are ignored.
 *
 * @param {...object} objects - Objects to merge; none of them is modified.
 * @returns {object} The merged result.
 */
function mergeDeep(...objects) {
  const isObject = (value) => value !== null && typeof value === 'object';

  return objects.reduce((merged, source) => {
    if (!isObject(source)) {
      return merged;
    }

    Object.keys(source).forEach((key) => {
      const current = merged[key];
      const incoming = source[key];

      if (Array.isArray(current) && Array.isArray(incoming)) {
        // Plain concat: spreading `incoming` into concat() would flatten
        // array elements one level.
        merged[key] = current.concat(incoming);
      } else if (isObject(current) && isObject(incoming)) {
        merged[key] = mergeDeep(current, incoming);
      } else {
        merged[key] = incoming;
      }
    });

    return merged;
  }, {});
}

/**
 * State of the sector input session.
 *
 * - started:    whether an input gesture is in progress
 * - paths:      the character groups confirmed so far (one array per step)
 * - candidates: pinyin spellings recognized so far (used as a set)
 * - pending:    the character group of the sector currently hovered
 */
const state = {
  started: false,
  paths: [],
  candidates: {},
  pending: null
};

/**
 * Advances the input state machine.
 *
 * @param {'start'|'end'|'choose'|'next'} event - What happened.
 * @param {string} [sectorId] - Id of the hovered sector ('choose' only).
 */
function nextState(event, sectorId) {
  if (event === 'end') {
    state.started = false;
    updateSectorChars(pinyinStartChars);
    return;
  }

  if (event === 'start') {
    state.started = true;
    state.paths = [];
    state.candidates = {};
    state.pending = null;
    return;
  }

  if (!state.started) {
    return;
  }

  if (event === 'choose') {
    const $label = document.querySelector(`#${sectorId}_text`);
    const chars = ($label ? $label.textContent : '')
      .trim()
      .split(/\s*,\s*/)
      .filter((ch) => ch !== '');

    // An empty sector must not become a pending step: splitting '' yields
    // [''], which would otherwise be pushed onto the path.
    state.pending = chars.length > 0 ? chars : null;
    return;
  }

  if (event !== 'next' || !state.pending) {
    return;
  }

  console.log('pending: ', event, state.pending);

  // The sector holds exactly one already-recognized pinyin: commit it and
  // restart the session.
  const firstPending = state.pending[0];
  if (state.pending.length === 1 && state.candidates[firstPending]) {
    document.querySelector('.input-result').textContent = firstPending;

    updateSectorChars(pinyinStartChars);

    nextState('start');
    return;
  }

  state.paths.push(state.pending);
  state.pending = null;

  // Walk the successor tree along the confirmed path.
  let nextChars = Object.assign({}, pinyinNextChars);
  state.paths.forEach((path) => {
    // Drop successors that are not part of this step of the input path.
    Object.keys(nextChars).forEach((n) => {
      if (!path.includes(n)) {
        delete nextChars[n];
      }
    });

    // Promote the children of the remaining successors one level up.
    let promoted = {};
    Object.keys(nextChars).forEach((n) => {
      const next = nextChars[n];
      if (next !== true) {
        promoted = mergeDeep(promoted, next);
      }
    });

    nextChars = promoted;
  });

  const pinyins = showCandidates();

  let chars = Object.keys(nextChars).concat(pinyins);
  // The combination has no successors left: start over.
  if (chars.length === 0) {
    chars = pinyinStartChars;
    state.paths = [];
  }

  updateSectorChars(chars);
}

/**
 * Distributes `chars` over the sector labels in order.
 *
 * Every sector gets `round(chars.length / numSectors)` characters (at least
 * one); the last sector takes whatever remains.
 *
 * @param {string[]} chars - Characters (or pinyin candidates) to display.
 */
function updateSectorChars(chars) {
  const perSector =
    chars.length > numSectors ? Math.round(chars.length / numSectors) : 1;

  for (let i = 0; i < numSectors; i++) {
    const isLast = i === numSectors - 1;
    const sectorChars = chars.slice(
      i * perSector,
      isLast ? chars.length : (i + 1) * perSector
    );

    const $label = document.querySelector(`#sector_${i}_text`);
    if ($label) {
      $label.textContent = sectorChars.join(', ');
    }
  }
}

/**
 * Builds every character combination along `state.paths`, records the ones
 * found in `pinyinAll` as candidates, and shows all candidates.
 *
 * Characters that are themselves already-recognized candidates are skipped
 * when building combinations.
 *
 * @returns {string[]} All pinyin candidates recognized so far.
 */
function showCandidates() {
  let results = { '': true };

  state.paths.forEach((path) => {
    const prefixes = Object.keys(results);
    results = {};

    path.forEach((ch) => {
      if (state.candidates[ch]) {
        return;
      }

      prefixes.forEach((prefix) => {
        results[prefix + ch] = true;
      });
    });
  });

  Object.keys(results).forEach((p) => {
    if (pinyinAll[p]) {
      state.candidates[p] = true;
    }
  });
  console.log('candidates: ', state.candidates);

  const pinyins = Object.keys(state.candidates);
  document.querySelector('.input-result').textContent = pinyins.join(', ');

  return pinyins;
}
.data .result { 49 | overflow-y: auto; 50 | width: 100%; 51 | font-size: 1.5em; 52 | font-weight: bold; 53 | height: 18em; 54 | } 55 | 56 | .data .result .item { 57 | cursor: pointer; 58 | padding: 0 .5em; 59 | } 60 | 61 | .data .result .item:hover { 62 | background-color: #ccc; 63 | } 64 | 65 | .demo { 66 | margin: 2em; 67 | margin-top: 5em; 68 | position: relative; 69 | } 70 | 71 | .keyboard { 72 | margin: 0; 73 | padding: 0; 74 | list-style: none; 75 | width: 560px; 76 | display: flex; 77 | flex-wrap: wrap; 78 | padding-bottom: 20px; 79 | } 80 | 81 | .key {} 82 | 83 | .hex { 84 | overflow: hidden; 85 | display: flex; 86 | justify-content: center; 87 | align-items: center; 88 | } 89 | 90 | .hex-inner { 91 | text-align: center; 92 | clip-path: polygon(0% 25%, 0% 75%, 50% 100%, 100% 75%, 100% 25%, 50% 0%); 93 | display: flex; 94 | justify-content: center; 95 | align-items: center; 96 | flex-direction: column; 97 | } 98 | 99 | .hex-inner .index {} 100 | 101 | .hex-inner .char { 102 | font-size: 1.5em; 103 | line-height: 1.5em; 104 | } 105 | 106 | .key .hex-inner { 107 | background-color: rgb(57, 57, 57); 108 | } 109 | 110 | .key.disabled .hex-inner { 111 | background-color: rgb(57, 57, 57); 112 | } 113 | 114 | .key.highlight .hex-inner { 115 | opacity: .4; 116 | } 117 | 118 | .key.hidden .hex-inner { 119 | opacity: 0 !important; 120 | } 121 | 122 | .pinyin-stroke-layer { 123 | position: absolute; 124 | top: 0; 125 | bottom: 0; 126 | left: 0; 127 | right: 0; 128 | z-index: 1; 129 | pointer-events: none; 130 | } 131 | 132 | .pinyin-stroke-layer .text { 133 | position: absolute; 134 | top: -1.5em; 135 | bottom: 0; 136 | left: 0; 137 | right: 0; 138 | font-size: 4em; 139 | display: flex; 140 | justify-content: center; 141 | align-items: flex-start; 142 | color: rgb(136, 6, 134); 143 | opacity: .6; 144 | } 145 | 146 | .hidden:not(.key) { 147 | display: none !important; 148 | } 149 | 150 | .key[name="删除"] .hex-inner { 151 | background-color: rgb(153, 27, 27); 152 | } 153 
| 154 | .key[name="空格"] .hex-inner, 155 | .key[name="<定位>"] .hex-inner { 156 | color: rgb(57, 57, 57); 157 | background-color: rgb(236, 236, 236); 158 | } 159 | 160 | .key[name="换行"] .hex-inner { 161 | background-color: rgb(2, 170, 245); 162 | } 163 | 164 | .key[name="数字"] .hex-inner, 165 | .key[name="标点"] .hex-inner {} 166 | 167 | /* .key[name="n"] .hex-inner, 168 | .key[name="r"] .hex-inner, */ 169 | .key[name="ü"] .hex-inner, 170 | .key[name="i"] .hex-inner, 171 | .key[name="u"] .hex-inner, 172 | .key[name="a"] .hex-inner, 173 | .key[name="e"] .hex-inner, 174 | .key[name="o"] .hex-inner { 175 | background-color: rgb(157, 23, 77); 176 | } 177 | 178 | .key[name="t"] .hex-inner, 179 | .key[name="p"] .hex-inner, 180 | .key[name="q"] .hex-inner, 181 | .key[name="s"] .hex-inner { 182 | /* background-color: rgb(91, 33, 182); */ 183 | } 184 | 185 | .key[name="f"] .hex-inner, 186 | .key[name="g"] .hex-inner, 187 | .key[name="c"] .hex-inner, 188 | .key[name="b"] .hex-inner, 189 | .key[name="d"] .hex-inner { 190 | /* background-color: rgb(21, 94, 117); */ 191 | } 192 | 193 | .key[name="zh"] .hex-inner, 194 | .key[name="ch"] .hex-inner, 195 | .key[name="sh"] .hex-inner { 196 | background-color: rgb(154, 52, 18); 197 | } 198 | 199 | .key[name="w"] .hex-inner, 200 | .key[name="x"] .hex-inner, 201 | .key[name="y"] .hex-inner, 202 | .key[name="z"] .hex-inner { 203 | /* background-color: rgb(134, 25, 143); */ 204 | } 205 | 206 | .key[name="h"] .hex-inner, 207 | .key[name="m"] .hex-inner, 208 | .key[name="l"] .hex-inner, 209 | .key[name="j"] .hex-inner, 210 | .key[name="k"] .hex-inner { 211 | /* background-color: rgb(55, 48, 163); */ 212 | } 213 | 214 | .key[name=";"] .hex-inner, 215 | .key[name=":"] .hex-inner, 216 | .key[name="!"] .hex-inner, 217 | .key[name="?"] .hex-inner, 218 | .key[name=";"] .hex-inner, 219 | .key[name=","] .hex-inner, 220 | .key[name="。"] .hex-inner { 221 | /* background-color: rgb(2, 170, 245); */ 222 | } 223 | 
-------------------------------------------------------------------------------- /tools/analyze/simulate.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 汉语拼音划词模拟 4 | 5 | 6 |
汉语拼音划词模拟
7 |
8 |
9 | 13 |
14 | 15 | 16 | 17 | 18 |
19 |
20 |
21 |
22 | 38 | 39 |
40 |
// https://www.cnblogs.com/zczhangcui/p/10300090.html

const $keyboard = document.body.querySelector('.keyboard');
const $pinyinStrokeLayer = document.body.querySelector('.pinyin-stroke-layer');
const $simulateResult = document.body.querySelector('.data .result');

// Handle of the pending animation step and the delay between steps (ms).
const timer = {
  id: 0,
  duration: 500
};
const cos_30 = Math.cos(Math.PI / 6);
const sin_30 = Math.sin(Math.PI / 6);
const hexRows = 6;
const hexColumns = 8;
const hexRadius = 50;
const hexWidth = 2 * (hexRadius * cos_30);
const hexHeight = 2 * hexRadius;
// Spacing between hexagons is achieved by nesting each one inside a
// slightly larger outer hexagon.
const hexSpacing = 5;
const hexOuterRadius = hexRadius + hexSpacing / (2 * cos_30);
const hexOuterWidth = 2 * (hexOuterRadius * cos_30);
const hexOuterHeight = 2 * hexOuterRadius;
const hexOuterMarginTop = -hexOuterRadius * sin_30;
const hexRowsWidth = (hexColumns + 0.5) * hexOuterWidth;
const keys = [
  // row 0
  ['翻转', '😂', '!', 'ü', 'i', 'u', 'o', 'j'],
  // row 1
  ['算术', '?', 'd', 'm', 'x', 'q', 'a', '删除'],
  // row 2
  ['拉丁', '😄', ';', 'b', 'l', 'y', 'p', 'e'],
  // row 3
  ['表情', ':', 's', 't', '<定位>', 'r', 'h', '换行'],
  // row 4
  ['标点', '😉', '。', 'c', 'z', 'f', 'n', 'k'],
  // row 5
  ['撤回', ',', 'sh', 'ch', 'zh', 'g', 'w', '空格']
];

initKeyboard();

/**
 * Creates the <li> element of one keyboard cell.
 *
 * NOTE(review): the markup is rebuilt from the selectors used in this file
 * and its stylesheet (.hex > .hex-inner > .index + .char) — confirm the
 * element types against the upstream source.
 *
 * @param {string} keyChar - Label of the key; '' produces a hidden cell.
 * @param {number} row - Row index of the cell.
 * @param {number} column - Column index of the cell.
 * @returns {HTMLLIElement}
 */
function createKeyElement(keyChar, row, column) {
  const $key = document.createElement('li');
  $key.className = keyChar ? 'key' : 'key hidden';
  $key.id = getKeyElementId(keyChar || row + '-' + column);
  $key.setAttribute('name', keyChar || '');

  const $hex = document.createElement('div');
  $hex.className = 'hex';

  const $inner = document.createElement('div');
  $inner.className = 'hex-inner';

  const $index = document.createElement('span');
  $index.className = 'index';
  $index.textContent = `${row},${column}`;

  const $char = document.createElement('span');
  $char.className = 'char';
  $char.textContent = keyChar;

  $inner.append($index, $char);
  $hex.appendChild($inner);
  $key.appendChild($hex);

  return $key;
}

/**
 * Builds the hexagonal keyboard, injects the size-dependent styles and wires
 * up the control buttons and the step-duration input.
 */
function initKeyboard() {
  for (let i = 0; i < hexRows; i++) {
    for (let j = 0; j < hexColumns; j++) {
      const keyChar = (keys[i] || [])[j] || '';

      $keyboard.appendChild(createKeyElement(keyChar, i, j));
    }
  }

  $keyboard.style.width = `${hexRowsWidth}px`;

  const $hexComputedStyle = document.createElement('style');
  $hexComputedStyle.textContent = `
    .demo { padding-top: ${-hexOuterMarginTop}px; }
    .hex {
      width: ${hexOuterWidth}px;
      height: ${hexOuterHeight}px;
      margin-top: ${hexOuterMarginTop}px;
    }
    .hex-inner { width: ${hexWidth}px; height: ${hexHeight}px; }
    .key:nth-child(${hexColumns * 2}n+${hexColumns + 1}) {
      margin-left: ${0.5 * hexOuterWidth}px;
    }
  `;
  document.head.appendChild($hexComputedStyle);

  const $btnClear = document.body.querySelector('.data .btn [name="clear"]');
  const $btnStop = document.body.querySelector('.data .btn [name="stop"]');
  const $btnSimulate = document.body.querySelector(
    '.data .btn [name="simulate"]'
  );
  const $inputPinyin = document.body.querySelector('.data [name="pinyin"]');
  const $inputDuration = document.body.querySelector('.data [name="duration"]');

  const enableControls = () => {
    $btnClear.disabled = false;
    $btnSimulate.disabled = false;
  };

  $inputDuration.value = timer.duration + '';
  $inputDuration.onchange = function () {
    const duration = Number.parseInt(this.value, 10);

    // Reject non-numeric or negative input instead of handing NaN to
    // setTimeout; restore the last valid value in the field.
    if (Number.isFinite(duration) && duration >= 0) {
      timer.duration = duration;
    } else {
      this.value = timer.duration + '';
    }
  };
  $btnStop.onclick = function () {
    if (timer.id > 0) {
      clearTimeout(timer.id);
      timer.id = 0;
    }
    enableControls();
  };
  $btnClear.onclick = function () {
    $inputPinyin.value = '';
    $simulateResult.innerHTML = '';

    oneByOne(hiddenStrokeLayer(), unhighlightAllKeyElements())();
  };
  $btnSimulate.onclick = function () {
    const text = $inputPinyin.value.trim();
    if (!text) {
      return;
    }

    // Cancel any simulation still running before starting a new one.
    $btnStop.onclick();
    $btnSimulate.disabled = true;
    $btnClear.disabled = true;
    $simulateResult.innerHTML = '';

    strokePinyin(text.split(/\s+/), enableControls);
  };
}

/** @returns {string} The DOM id of the key labelled `char`. */
function getKeyElementId(char) {
  return `key-${char}`;
}

/** @returns {HTMLElement|null} The key element labelled `k`, if any. */
function getKeyElement(k) {
  const id = getKeyElementId(k);
  return document.getElementById(id);
}

/** @returns {Function} A step that resets the state classes of every key. */
function unhighlightAllKeyElements() {
  return unhighlightElement(...$keyboard.querySelectorAll('.key'));
}

/**
 * @returns {Function} A step that clears the 'descendant' and 'hidden'
 *   classes of every key while keeping 'highlight'.
 */
function unhighlightAllDescendantKeyElements() {
  return () => {
    $keyboard
      .querySelectorAll('.key')
      .forEach(($el) => $el.classList.remove('descendant', 'hidden'));
  };
}

/**
 * @param {string[]} keys - Labels of the keys reachable from the current one.
 * @returns {Function} A step that marks those keys as 'descendant' and hides
 *   every key that is neither highlighted nor a descendant.
 */
function highlightDescendantKeyElements(keys) {
  return () => {
    keys
      .map(getKeyElement)
      .forEach(($el) => $el && $el.classList.add('descendant'));
    $keyboard
      .querySelectorAll('.key:not(.highlight,.descendant)')
      .forEach(($el) => $el.classList.add('hidden'));
  };
}

/** @returns {Function} A step that adds 'highlight' to the given elements. */
function highlightElement(...$els) {
  return () => {
    $els.forEach(($el) => $el && $el.classList.add('highlight'));
  };
}

/**
 * @returns {Function} A step that removes 'highlight', 'descendant' and
 *   'hidden' from the given elements.
 */
function unhighlightElement(...$els) {
  return () => {
    $els.forEach(
      ($el) => $el && $el.classList.remove('highlight', 'descendant', 'hidden')
    );
  };
}

/**
 * Splits a pinyin into the sequence of keys to stroke, one per character.
 *
 * @param {string} pinyin
 * @returns {string[]} Empty for a missing/empty pinyin.
 */
function getPinyinKeys(pinyin) {
  if (!pinyin) {
    return [];
  }

  const keys = [];
  for (let i = 0; i < pinyin.length; i++) {
    keys.push(pinyin.charAt(i));
  }
  return keys;
}

/** Erases the drawn stroke. */
function clearStrokePath() {
  const $strokePath = document.getElementById('stroke-path');
  $strokePath.setAttribute('d', '');
}

/**
 * @returns {Function} A step that extends the stroke to (x, y); the first
 *   point starts the path, later ones are connected with line segments.
 */
function drawStrokePath(x, y) {
  return () => {
    const $strokePath = document.getElementById('stroke-path');
    const d = $strokePath.getAttribute('d');

    $strokePath.setAttribute('d', d ? `${d} L${x} ${y}` : `M${x} ${y}`);
  };
}

/**
 * @returns {Function} A step that shows the stroke layer, cleared, with
 *   `pinyin` as its caption.
 */
function showStrokeLayer(pinyin) {
  return () => {
    clearStrokePath();
    // NOTE(review): `pinyin` is user-typed text inserted as HTML; the
    // template may have carried markup that is missing from this copy.
    $pinyinStrokeLayer.querySelector(
      '.text'
    ).innerHTML = `划词: ${pinyin}`;

    $pinyinStrokeLayer.classList.remove('hidden');
  };
}

/** @returns {Function} A step that hides and clears the stroke layer. */
function hiddenStrokeLayer() {
  return () => {
    $pinyinStrokeLayer.classList.add('hidden');

    clearStrokePath();
    $pinyinStrokeLayer.querySelector('.text').innerHTML = '';
  };
}

/**
 * Animates the stroke of every pinyin in the list, one after another.
 *
 * @param {string[]} pinyinList - Pinyin spellings to simulate.
 * @param {Function} [gotoNext] - Called once after the last step.
 */
function strokePinyin(pinyinList, gotoNext) {
  const pinyinKeyTree = createPinyinKeyTree(pinyinList);

  let steps = [];
  for (let i = 0; i < pinyinList.length; i++) {
    const pinyin = pinyinList[i];

    steps = steps.concat(creatStorkePinyinSteps(pinyin, pinyinKeyTree));
  }

  if (gotoNext) {
    steps.push(gotoNext);
  }

  stepRun(...steps);
}

/**
 * Builds the animation steps for stroking one pinyin.
 *
 * @param {string} pinyin - The pinyin to stroke.
 * @param {object} pinyinKeyTree - Prefix tree from createPinyinKeyTree, used
 *   to find which keys can follow each stroked key.
 * @returns {Function[]} The steps in execution order; empty when the pinyin
 *   is blank (returning nothing here would put `undefined` into the step
 *   list of the caller).
 */
function creatStorkePinyinSteps(pinyin, pinyinKeyTree) {
  const keys = getPinyinKeys(pinyin.trim());
  if (keys.length === 0) {
    return [];
  }

  const keyboardRect = $keyboard.getBoundingClientRect();

  let keyTree = pinyinKeyTree;
  const highlights = [];
  const keyIndexes = [];
  for (let i = 0; i < keys.length; i++) {
    const key = keys[i];

    keyTree = (keyTree && keyTree[key]) || {};

    const $key = getKeyElement(key);
    if (!$key) {
      // The character has no key on this board: skip it instead of
      // crashing on a null element.
      console.warn(`No key found for "${key}" in "${pinyin}"`);
      continue;
    }

    const keyIndex = $key.querySelector('.index').innerText;
    const keyRect = $key.querySelector('.hex').getBoundingClientRect();
    // Center of the key, relative to the keyboard's top-left corner.
    const x = keyRect.x + keyRect.width / 2 - keyboardRect.x;
    const y = keyRect.y + keyRect.height / 2 - keyboardRect.y;

    const descendantKeys = Object.keys(keyTree);

    highlights.push(
      oneByOne(
        highlightElement($key),
        unhighlightAllDescendantKeyElements(),
        highlightDescendantKeyElements(descendantKeys),
        drawStrokePath(x, y)
      )
    );
    keyIndexes.push(keyIndex);
  }

  const createResult = () => {
    const $result = document.createElement('div');
    $result.className = 'item';
    // NOTE(review): as in showStrokeLayer, user text is inserted as HTML and
    // the original template may have contained markup.
    $result.innerHTML = `${pinyin}: ${keyIndexes.join(' -> ')}`;
    $result.onclick = function () {
      // Ignore clicks while an animation is still running.
      if (timer.id > 0) {
        return;
      }

      oneByOne(unhighlightAllKeyElements(), showStrokeLayer(pinyin))();
      stepRun(...highlights);
    };

    $simulateResult.prepend($result);
  };

  return [
    oneByOne(unhighlightAllKeyElements(), showStrokeLayer(pinyin)),
    createResult,
    ...highlights
  ];
}

/**
 * Runs the steps one per `timer.duration` milliseconds. `timer.id` holds the
 * pending timeout and is reset to 0 once all steps have run.
 */
function stepRun(...steps) {
  if (!steps || steps.length === 0) {
    timer.id = 0;
    return;
  }

  timer.id = setTimeout(() => {
    const [first, ...left] = steps;

    first();
    stepRun(...left);
  }, timer.duration);
}

/** @returns {Function} A function that calls all `fns` in order. */
function oneByOne(...fns) {
  return () => {
    fns.forEach((fn) => fn());
  };
}

/**
 * Builds a prefix tree of the keys of all given pinyin: each node maps a key
 * to the node of the keys that may follow it.
 *
 * @param {string[]} pinyinList
 * @returns {object}
 */
function createPinyinKeyTree(pinyinList) {
  const tree = {};

  for (let i = 0; i < pinyinList.length; i++) {
    const pinyin = pinyinList[i];
    const keys = getPinyinKeys(pinyin);

    let subTree = tree;
    for (let j = 0; j < keys.length; j++) {
      const key = keys[j];

      subTree = subTree[key] || (subTree[key] = {});
    }
  }

  return tree;
}
:root{--syntax_normal:#1b1e23;--syntax_comment:#a9b0bc;--syntax_number:#20a5ba;--syntax_keyword:#c30771;--syntax_atom:#10a778;--syntax_string:#008ec4;--syntax_error:#ffbedc;--syntax_unknown_variable:#838383;--syntax_known_variable:#005f87;--syntax_matchbracket:#20bbfc;--syntax_key:#6636b4;--mono_fonts:82%/1.5 Menlo,Consolas,monospace}.observablehq--collapsed,.observablehq--expanded,.observablehq--function,.observablehq--gray,.observablehq--import,.observablehq--string:after,.observablehq--string:before{color:var(--syntax_normal)}.observablehq--collapsed,.observablehq--inspect a{cursor:pointer}.observablehq--field{text-indent:-1em;margin-left:1em}.observablehq--empty{color:var(--syntax_comment)}.observablehq--blue,.observablehq--keyword{color:#3182bd}.observablehq--forbidden,.observablehq--pink{color:#e377c2}.observablehq--orange{color:#e6550d}.observablehq--boolean,.observablehq--null,.observablehq--undefined{color:var(--syntax_atom)}.observablehq--bigint,.observablehq--date,.observablehq--green,.observablehq--number,.observablehq--regexp,.observablehq--symbol{color:var(--syntax_number)}.observablehq--index,.observablehq--key{color:var(--syntax_key)}.observablehq--prototype-key{color:#aaa}.observablehq--empty{font-style:oblique}.observablehq--purple,.observablehq--string{color:var(--syntax_string)}.observablehq--error,.observablehq--red{color:#e7040f}.observablehq--inspect{font:var(--mono_fonts);overflow-x:auto;display:block;white-space:pre}.observablehq--error .observablehq--inspect{word-break:break-all;white-space:pre-wrap} -------------------------------------------------------------------------------- /tools/pinyin-dict/.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | *.log 3 | data/pinyin-dict.*.txt 4 | data/pinyin-dict-*.*.txt 5 | data/pinyin-*.sqlite 6 | data/*_params/ 7 | result.txt 8 | -------------------------------------------------------------------------------- 
/tools/pinyin-dict/data/pinyin-dict-data-phrase.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/tools/pinyin-dict/data/pinyin-dict-data-phrase.zip -------------------------------------------------------------------------------- /tools/pinyin-dict/data/pinyin-dict-data-word.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/tools/pinyin-dict/data/pinyin-dict-data-word.zip -------------------------------------------------------------------------------- /tools/pinyin-dict/data/pinyin-dict-db.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/tools/pinyin-dict/data/pinyin-dict-db.zip -------------------------------------------------------------------------------- /tools/pinyin-dict/docs/img/donate-cngwzj.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crazydan-studio/kuaizi-ime/3eaa40866034455c8938a645c9c1e08047f1a253/tools/pinyin-dict/docs/img/donate-cngwzj.png -------------------------------------------------------------------------------- /tools/pinyin-dict/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "pinyin-dict", 3 | "version": "0.1.0", 4 | "description": "Pinyin dictionary maker", 5 | "author": "flytreeleft@crazydan.org", 6 | "license": "Apache-2.0", 7 | "imports": { 8 | "#*": "./src/*" 9 | }, 10 | "scripts": { 11 | "app:shell": "node src/app/shell.mjs", 12 | "app:shell:debug": "node --inspect-brk src/app/shell.mjs", 13 | "generate:test": "node src/generate/test.mjs", 14 | "generate:raw": "node 
src/generate/raw/index.mjs", 15 | "generate:raw:debug": "node --inspect-brk src/generate/raw/index.mjs", 16 | "generate:emoji": "node src/generate/emoji/index.mjs", 17 | "generate:phrase": "node src/generate/phrase/index.mjs", 18 | "generate:phrase:debug": "node --inspect-brk src/generate/phrase/index.mjs", 19 | "generate:sqlite:ime": "node src/generate/sqlite/ime/index.mjs", 20 | "generate:sqlite:ime:debug": "node --inspect-brk src/generate/sqlite/ime/index.mjs", 21 | "generate:sqlite:word": "node src/generate/sqlite/word/index.mjs", 22 | "generate:sqlite:word:debug": "node --inspect-brk src/generate/sqlite/word/index.mjs", 23 | "generate:sqlite:word:diff": "node src/generate/sqlite/word/diff.mjs", 24 | "generate:sqlite:phrase:hmm": "node src/generate/sqlite/phrase/hmm/index.mjs", 25 | "generate:sqlite:phrase:hmm:debug": "node --inspect-brk src/generate/sqlite/phrase/hmm/index.mjs", 26 | "generate:sqlite:phrase:hmm:trans": "node src/generate/sqlite/phrase/hmm/trans/index.mjs", 27 | "generate:sqlite:phrase:hmm:trans:debug": "node --inspect-brk src/generate/sqlite/phrase/hmm/trans/index.mjs", 28 | "generate:sqlite:phrase:hmm:trans_kewen": "node src/generate/sqlite/phrase/hmm/trans_kewen/index.mjs" 29 | }, 30 | "dependencies": { 31 | "@inquirer/prompts": "^3.3.0", 32 | "@pinyin-pro/data": "^1.2.0", 33 | "fontkit": "^2.0.2", 34 | "get-system-fonts": "^2.0.2", 35 | "got": "^13.0.0", 36 | "grapheme-splitter": "^1.0.4", 37 | "jsdom": "^22.1.0", 38 | "pinyin-pro": "^3.25.0", 39 | "sqlite": "^5.0.1", 40 | "sqlite3": "^5.1.6" 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /tools/pinyin-dict/src/app/shell.mjs: -------------------------------------------------------------------------------- 1 | /* SQLite 词典库 */ 2 | import { fromRootPath, existFile } from '#utils/utils.mjs'; 3 | import { input, select } from '@inquirer/prompts'; 4 | 5 | import * as sqlite from './sqlite.mjs'; 6 | 7 | // 用户字典库 8 | const userDictSQLiteFile = 
// User dictionary database
const userDictSQLiteFile = fromRootPath('data', 'pinyin-user-dict.sqlite');

console.log();
console.log('初始化用户字典 ...');
const needToInitUserDict = !existFile(userDictSQLiteFile);
const userDictDB = await sqlite.open(userDictSQLiteFile);

try {
  // The word and phrase dictionaries are connected via ATTACH DATABASE, so
  // tables whose names are unique across the databases can be used directly,
  // without qualifying them with the connection name.
  // Note: performance of this approach is not great.
  // Attaching happens inside the try so the connection is closed even when
  // it fails.
  await sqlite.attach(userDictDB, {
    // Application word dictionary
    word: fromRootPath('data', 'pinyin-word-dict.sqlite'),
    // Application phrase dictionary
    phrase: fromRootPath('data', 'pinyin-phrase-dict.sqlite')
  });

  if (needToInitUserDict) {
    await sqlite.init(userDictDB);
  }
  console.log();

  while ((await start(userDictDB)) !== false) {}
} finally {
  await sqlite.close(userDictDB);
}

console.log();

/**
 * Runs one round of the interactive shell: reads a pinyin sequence, lets the
 * user pick the best predicted phrase, optionally correct single characters,
 * and saves the confirmed phrase to the user dictionary.
 *
 * @param {object} userDictDB - Open user dictionary database handle.
 * @returns {Promise<boolean>} `false` when the user asked to exit, `true`
 *   when another round should follow.
 */
async function start(userDictDB) {
  // https://github.com/SBoudrias/Inquirer.js
  const pinyin = (
    await input({
      message: '请输入拼音,拼音之间以空格分隔(输入 exit 退出):'
    })
  ).trim();

  if (!pinyin) {
    return true;
  } else if (pinyin === 'exit') {
    return false;
  }

  const chars = pinyin.replaceAll(/v/g, 'ü').split(/\s+/g);
  const words = await sqlite.predict(userDictDB, chars);

  // A selection prompt cannot be shown without choices.
  if (!words || words.length === 0) {
    console.log(' 未找到匹配的结果');
    console.log();
    return true;
  }

  const selectedPhrase = await select({
    message: '请选择最佳的匹配结果:',
    choices: words.map((w, i) => ({
      // Text shown in the list
      name: w[1].map(({ value }) => value).join(''),
      // Value returned by the prompt
      value: w[1],
      // Hint shown while the choice is focused
      description: `${i + 1}: (${w[0]}) ${w[1]
        .map(({ value, spell }) => `${value} - ${spell}`)
        .join(', ')}`
    }))
  });

  while (true) {
    const selectedWord = await select({
      message: '请选择待修改的字:',
      choices: [
        { name: '[结束修改]', value: { index: -1, word: { id: 0 } } }
      ].concat(
        selectedPhrase.map((w, i) => ({
          // Loose comparison kept on purpose: the type of `id` is decided
          // by the sqlite module, which is not visible here.
          name: w.id == 0 ? `${w.value}` : `${w.value} - ${w.spell}`,
          value: { index: i, word: w }
        }))
      )
    });

    if (selectedWord.index < 0) {
      break;
    }

    const selectedCandidate = await select({
      message: `请修改选中的字 [${selectedWord.index + 1}: ${
        selectedWord.word.value
      }]:`,
      choices: selectedWord.word.get_candidates().map((w) => ({
        name: `${w.value} - ${w.spell}`,
        value: w,
        description: ` ${w.value}: ${w.id}, ${w.spell}`
      }))
    });

    selectedPhrase[selectedWord.index] = selectedCandidate;
  }

  // Awaited so that a failure surfaces here and the database is not closed
  // while the save is still in flight.
  await sqlite.saveUsedPhrase(userDictDB, selectedPhrase);

  console.log(
    ' 最终确认结果为: ',
    chars.join(' '),
    '->',
    selectedPhrase
      .map(({ id, value, spell }) => `${id}:${value}:${spell}:`)
      .join(',')
  );
  console.log();

  return true;
}
const baseUrl = 'https://emojixd.com';
const gotOptions = { timeout: { connect: 50000 } };

/**
 * Fetches every emoji group listed on the site's home page together with the
 * emojis of each group. Groups are fetched one after another.
 *
 * @returns {Promise<Array<{url: string, name: {zh: string, en: string}, emojis: object[]}>>}
 *   Empty when the home page could not be parsed.
 */
export async function fetchEmojis() {
  const html = await got(baseUrl, gotOptions).text();
  const $doc = new JSDOM(html)?.window?.document;
  if (!$doc) {
    return [];
  }

  const readText = ($el, selector) =>
    ($el.querySelector(selector)?.textContent ?? '').trim();

  const groups = [];
  $doc.querySelectorAll('.emoji-item > a').forEach(($el) => {
    groups.push({
      url: baseUrl + $el.getAttribute('href'),
      name: { zh: readText($el, '.h3'), en: readText($el, '.h5') },
      emojis: []
    });
  });

  for (const group of groups) {
    group.emojis = await fetchGroupEmojis(group.name.zh, group.url);
  }

  return groups;
}

/**
 * Fetches the details of every emoji linked from a group page, in batches of
 * 50 parallel requests with a pause after each batch.
 *
 * @param {string} groupName - Group name, used in progress messages only.
 * @param {string} groupUrl - URL of the group page.
 * @returns {Promise<object[]>} One record per emoji (see fetchEmoji).
 */
async function fetchGroupEmojis(groupName, groupUrl) {
  const html = await got(groupUrl, gotOptions).text();
  const $doc = new JSDOM(html)?.window?.document;
  if (!$doc) {
    return [];
  }

  const emojiUrls = [];
  $doc.querySelectorAll('a.emoji-item').forEach(($el) => {
    emojiUrls.push(baseUrl + $el.getAttribute('href'));
  });

  const batchSize = 50;
  const emojis = [];
  for (let i = 0; i < emojiUrls.length; i += batchSize) {
    const urls = emojiUrls.slice(i, i + batchSize);
    const data = await Promise.all(urls.map(fetchEmoji));

    // Report the range actually fetched; the last batch may be shorter.
    console.log(
      `已抓取到 ${groupName} 第 ${i + 1} 到 ${i + urls.length} 之间的数据.`
    );

    emojis.push(...data);

    await sleep(1500);
  }

  return emojis;
}

/**
 * Fetches one emoji detail page and extracts its data.
 *
 * @param {string} emojiUrl - URL of the emoji detail page.
 * @returns {Promise<{value: string, name: {zh: string, en: string},
 *   unicode: string, unicode_version: string, url: string,
 *   keywords: string[]}>} Fields that are not found on the page stay empty.
 */
async function fetchEmoji(emojiUrl) {
  const emoji = {
    value: '',
    name: { zh: '', en: '' },
    unicode: '',
    unicode_version: '',
    url: emojiUrl,
    keywords: []
  };

  const html = await got(emojiUrl, gotOptions).text();
  const $doc = new JSDOM(html)?.window?.document;
  if (!$doc) {
    return emoji;
  }

  emoji.value = ($doc.querySelector('.center .emoji')?.textContent ?? '').trim();

  $doc.querySelectorAll('dl > dt').forEach(($el) => {
    const title = $el.textContent.trim();
    // nextSibling may be a whitespace text node, which has no
    // querySelectorAll; the value lives in the next *element*.
    const $next = $el.nextElementSibling;
    if (!$next) {
      return;
    }
    const value = $next.textContent.trim();

    switch (title) {
      case 'Emoji名称':
        emoji.name.zh = value;
        // Flag names look like "旗: <country>"; keep the part after the
        // prefix as a keyword.
        if (value.includes('旗:')) {
          emoji.keywords.push(value.replaceAll(/^旗:\s*/g, ''));
        }
        break;
      case '英文名称':
        emoji.name.en = value;
        break;
      case 'unicode编码':
        emoji.unicode = value;
        break;
      case 'unicode版本':
        emoji.unicode_version = value;
        break;
      case '关键词':
        $next.querySelectorAll('a').forEach(($a) => {
          emoji.keywords.push($a.textContent.trim());
        });
        break;
    }
  });

  return emoji;
}
// Fetches Chinese texts annotated with pinyin from www.cngwzj.com
const gushiBaseUrl = 'https://www.cngwzj.com/tangshi300/78.html';
const guciBaseUrl = 'https://www.cngwzj.com/tangshi300/2137.html';
const gotOptions = { timeout: { connect: 50000 } };

/**
 * Fetches and saves all school-text articles found by the site search.
 *
 * @param {string} file - Output file; one JSON array per line.
 * @param {boolean} dump - Forwarded to fetchAndSaveArticles.
 */
export async function fetchAndSaveAllKeWen(file, dump) {
  const pageUrls = [
    // Search keyword: 课文
    'https://do.cngwzj.com/search/?zz=&keys=%BF%CE%CE%C4&px=&acc=&newpage=',
    // Search keyword: 语文课文
    'https://do.cngwzj.com/search/?zz=&keys=%D3%EF%CE%C4%BF%CE%CE%C4&px=&acc=&newpage=',
    // Search keyword: 年级
    'https://do.cngwzj.com/search/?zz=&keys=%C4%EA%BC%B6&px=&acc=&newpage=',
    // Search keyword: 成语故事
    'https://do.cngwzj.com/search/?zz=&keys=%B3%C9%D3%EF%B9%CA%CA%C2&px=&acc=&newpage=',
    // Search keyword: 读读写写
    'https://do.cngwzj.com/search/?zz=&keys=%B6%C1%B6%C1%D0%B4%D0%B4&px=&acc=&newpage='
  ];

  // A Set keeps first-seen order and removes the duplicates that the
  // different searches return.
  const uniqueUrls = new Set();
  for (const pageUrl of pageUrls) {
    (await fetchKeWenUrls(pageUrl)).forEach((url) => uniqueUrls.add(url));
  }
  const urls = [...uniqueUrls];

  console.log(` - 总计 ${urls.length} 篇课文`);
  await fetchAndSaveArticles(file, urls, dump);
}

/** Fetches and saves all classical poems (古诗). */
export async function fetchAndSaveAllGushi(file, dump) {
  const urls = await fetchGushiciUrls(gushiBaseUrl);

  console.log(` - 总计 ${urls.length} 篇古诗`);
  await fetchAndSaveArticles(file, urls, dump);
}

/** Fetches and saves all classical lyrics (古词). */
export async function fetchAndSaveAllGuci(file, dump) {
  const urls = await fetchGushiciUrls(guciBaseUrl);

  console.log(` - 总计 ${urls.length} 篇古词`);
  await fetchAndSaveArticles(file, urls, dump);
}

/**
 * Reads the number of the last result page from the pagination links.
 *
 * @returns {number} NaN when the page has no usable pagination.
 */
function parseLastPageNum($doc) {
  const $pageLinks = $doc.querySelectorAll('.pages a');
  const $last = $pageLinks[$pageLinks.length - 1];
  const href = $last ? $last.getAttribute('href') : null;

  if (!href) {
    return Number.NaN;
  }
  return Number.parseInt(href.replaceAll(/.*newpage=/g, ''), 10);
}

/**
 * Collects the article URLs of a paginated search, from `page` to the last
 * page.
 *
 * @param {string} url - Search URL ending in `newpage=`; the page number is
 *   appended to it.
 * @param {number} [page=1] - First page to fetch.
 * @returns {Promise<string[]>} Links to pinyin articles, in page order.
 */
async function fetchKeWenUrls(url, page = 1) {
  const links = [];

  for (let current = page; ; current += 1) {
    const { pageLinks, lastPageNum } = await fetchAndParsePage(
      url + current,
      { pageLinks: [], lastPageNum: Number.NaN },
      ($doc) => ({
        pageLinks: parseArticleLinks($doc),
        lastPageNum: parseLastPageNum($doc)
      })
    );

    links.push(...pageLinks);

    // Written as a negation so that an unknown (NaN) last page stops the
    // loop instead of paging forever.
    if (!(current < lastPageNum)) {
      break;
    }
  }

  return links;
}

/** Collects the poem/lyric article URLs linked from an index page. */
async function fetchGushiciUrls(baseUrl) {
  return fetchAndParsePage(baseUrl, [], parseArticleLinks);
}

/**
 * Fetches the full text of one pinyin-annotated article.
 *
 * @param {string} url - Article URL.
 * @returns {Promise<{title: object[], subtitle: object[], pargraphs: object[][]}|{}>}
 *   Each part is whatever parsePargraph returns for its element; `{}` when
 *   the page could not be parsed.
 */
async function fetchArticle(url) {
  return fetchAndParsePage(url, {}, ($doc) => {
    const [$title, $subtitle] = $doc.querySelectorAll(
      '#gsbox .g_box .text-c li'
    );
    // Note: the paragraph order is shuffled by the server; how to restore
    // it is not known yet.
    const $pargraphs = $doc.querySelectorAll('#gsbox .g_box #showgushi li');

    // A page without a title/subtitle yields an empty part instead of
    // handing `undefined` to the parser.
    const title = $title ? parsePargraph($title) : [];
    const subtitle = $subtitle ? parsePargraph($subtitle) : [];
    const pargraphs = [];
    $pargraphs.forEach(($el) => {
      pargraphs.push(parsePargraph($el));
    });

    console.log(
      ' - 已拉取到文章:《' + title.map((w) => w.zi).join('') + '》'
    );

    return { title, subtitle, pargraphs };
  });
}

/**
 * Fetches the articles in batches of 10 parallel requests and appends each
 * batch to `file` as one JSON line, pausing between batches.
 *
 * @param {string} file - Output file; truncated before the first batch.
 * @param {string[]} urls - Article URLs.
 * @param {boolean} dump - When true, every article is passed to dumpArticle.
 */
async function fetchAndSaveArticles(file, urls, dump) {
  const batchSize = 10;

  for (let i = 0; i < urls.length; i += batchSize) {
    const batchUrls = urls.slice(i, i + batchSize);
    const list = await Promise.all(batchUrls.map(fetchArticle));

    if (dump) {
      list.forEach(dumpArticle);
    }

    // Note: the file is cleared before the first line is written.
    appendLineToFile(file, JSON.stringify(list), i === 0);

    await sleep(1000);
  }
}

/**
 * Downloads a page and hands its document to `parse`.
 *
 * @param {string} url - Page URL.
 * @param {*} defaultVal - Returned when no document is available.
 * @param {Function} parse - Sync or async `($doc) => result`.
 */
async function fetchAndParsePage(url, defaultVal, parse) {
  const html = await got(url, gotOptions).text();
  const $doc = new JSDOM(html)?.window?.document;

  if (!$doc) {
    return defaultVal;
  }

  return await parse($doc);
}

/**
 * Extracts the links of pinyin-annotated articles from a listing page.
 *
 * @returns {string[]} The `href` values containing `/pygushi/`.
 */
function parseArticleLinks($doc) {
  const links = [];

  $doc.querySelectorAll('.tj_listbox .tj_title a').forEach(($el) => {
    const href = $el.getAttribute('href');

    // Keep only the pinyin edition of an article; anchors without an href
    // are ignored.
    if (href && href.includes('/pygushi/')) {
      links.push(href);
    }
  });

  return links;
}
cūn

160 | const $spans = $el.querySelectorAll('span'); 161 | 162 | const pargraph = []; 163 | $spans.forEach(($span) => { 164 | const splits = $span.innerHTML.split('
'); 165 | const py = cleanPinyin(splits[0]); 166 | const zi = cleanZi(splits[1]); 167 | 168 | if (zi) { 169 | pargraph.push(py ? { zi, py } : { zi }); 170 | } 171 | }); 172 | 173 | return pargraph; 174 | } 175 | 176 | function dumpArticle(article) { 177 | const dump = (words) => { 178 | console.error(words.map((w) => (w.py ? w.py : ' ')).join(' ')); 179 | console.error(words.map((w) => w.zi).join('')); 180 | }; 181 | 182 | console.error('============================='); 183 | dump(article.title); 184 | dump(article.subtitle); 185 | article.pargraphs.forEach(dump); 186 | console.error('============================='); 187 | } 188 | 189 | function cleanPinyin(py) { 190 | py = py.replaceAll(' ', '').trim(); 191 | py = correctPinyin(py); 192 | 193 | if (py == 'g') { 194 | py = 'ǹg'; 195 | } 196 | 197 | if (py && !/^[a-zü]+$/g.test(extractPinyinChars(py))) { 198 | console.error(' 无效拼音:' + py); 199 | return ''; 200 | } 201 | return py; 202 | } 203 | 204 | function cleanZi(zi) { 205 | return zi 206 | .replaceAll(' ', '') 207 | .replaceAll('', '') 208 | .replaceAll('', '') 209 | .replaceAll('', '') 210 | .replaceAll('', '') 211 | .replaceAll(/[。,、+]/g, '') 212 | .trim(); 213 | } 214 | -------------------------------------------------------------------------------- /tools/pinyin-dict/src/generate/raw/index.mjs: -------------------------------------------------------------------------------- 1 | import { fromRootPath } from '#utils/utils.mjs'; 2 | import { 3 | readZDicWordsFromPinyinData, 4 | readTraditionalWordsFromOpenCC, 5 | patchAndSaveZDicWordsToFile, 6 | saveWordMetasToFile, 7 | calculateWordWeightByGlyph, 8 | plusWordUsageWeight, 9 | plusPhraseUsageWeight, 10 | readWordUsage, 11 | readPhraseUsage 12 | } from './raw.mjs'; 13 | 14 | // 采用 汉典网(http://zdic.net/) 的数据 15 | // https://github.com/mozillazg/pinyin-data/blob/master/zdic.txt 16 | const pinyinDataFile = fromRootPath('../..', 'thirdparty/pinyin-data/zdic.txt'); 17 | // 繁->简 转换数据,用于确定繁体字 18 | // 
https://github.com/BYVoid/OpenCC/blob/master/data/dictionary/TSCharacters.txt 19 | const tradToSimpleDataFile = fromRootPath( 20 | '../..', 21 | 'thirdparty/OpenCC/data/dictionary/TSCharacters.txt' 22 | ); 23 | // 字/词的使用权重 24 | const wordUsageDataFile = fromRootPath('../..', 'thirdparty/hanzi-weight.txt'); 25 | const phraseUsageDataFile = fromRootPath('../..', 'thirdparty/hanzi-weight.ciyu.txt'); 26 | 27 | // 包含完整拼音和字信息的文本文件 28 | const dictDataRawFile = fromRootPath('data', 'pinyin-dict.raw.txt'); 29 | const dictDataValidFile = fromRootPath('data', 'pinyin-dict.valid.txt'); 30 | 31 | console.log(); 32 | console.log('读取 OpenCC 数据 ...'); 33 | const traditionalWords = await readTraditionalWordsFromOpenCC( 34 | tradToSimpleDataFile 35 | ); 36 | console.log('已读取 OpenCC 数据:'); 37 | console.log('- 繁体字数:' + Object.keys(traditionalWords).length); 38 | console.log(); 39 | 40 | console.log(); 41 | console.log('读取 pinyin-data 数据 ...'); 42 | const zdicWords = await readZDicWordsFromPinyinData(pinyinDataFile); 43 | zdicWords.forEach((word) => { 44 | word.traditional = !!traditionalWords[word.value]; 45 | }); 46 | console.log('已读取 pinyin-data 数据:'); 47 | console.log('- 总字数:' + zdicWords.length); 48 | console.log('- 繁体字数:' + zdicWords.filter((w) => w.traditional).length); 49 | console.log('- 简体字数:' + zdicWords.filter((w) => !w.traditional).length); 50 | console.log(); 51 | 52 | console.log(); 53 | console.log('读取 zdic.net 数据 ...'); 54 | const wordMetas = await patchAndSaveZDicWordsToFile(dictDataRawFile, zdicWords); 55 | const wordMetasWithPinyin = wordMetas.filter((w) => w.pinyins.length > 0); 56 | const wordMetasWithoutPinyin = wordMetas.filter((w) => w.pinyins.length === 0); 57 | const wordMetasWithGlyph = wordMetas.filter((w) => w.glyph_font_exists); 58 | const wordMetasWithoutGlyph = wordMetas.filter((w) => !w.glyph_font_exists); 59 | const wordMetasWithStrokeOrder = wordMetas.filter((w) => !!w.stroke_order); 60 | const wordMetasWithoutStrokeOrder = wordMetas.filter((w) => 
!w.stroke_order); 61 | console.log('已读取 zdic.net 数据:'); 62 | console.log('- 总字数:' + wordMetas.length); 63 | console.log('- 繁体字数:' + wordMetas.filter((w) => w.traditional).length); 64 | console.log('- 简体字数:' + wordMetas.filter((w) => !w.traditional).length); 65 | console.log('- 有拼音字数:' + wordMetasWithPinyin.length); 66 | console.log('- 无拼音字数:' + wordMetasWithoutPinyin.length); 67 | console.log('- 有字形字数:' + wordMetasWithGlyph.length); 68 | console.log('- 无字形字数:' + wordMetasWithoutGlyph.length); 69 | console.log('- 有笔顺字数:' + wordMetasWithStrokeOrder.length); 70 | console.log('- 无笔顺字数:' + wordMetasWithoutStrokeOrder.length); 71 | console.log('- 短语数:' + wordMetas.reduce((r, w) => r + w.phrases.length, 0)); 72 | console.log(); 73 | console.log( 74 | '- 无拼音字列表:' + 75 | wordMetasWithoutPinyin.map((meta) => meta.value).join(', ') 76 | ); 77 | console.log( 78 | '- 无拼音无笔顺无字形字列表:' + 79 | wordMetasWithoutPinyin 80 | .filter((w) => !w.stroke_order && !w.glyph_font_exists) 81 | .map((meta) => meta.value) 82 | .join(', ') 83 | ); 84 | console.log( 85 | '- 无拼音有笔顺无字形字列表:' + 86 | wordMetasWithoutPinyin 87 | .filter((w) => w.stroke_order && !w.glyph_font_exists) 88 | .map((meta) => meta.value) 89 | .join(', ') 90 | ); 91 | console.log( 92 | '- 无拼音无笔顺有字形字列表:' + 93 | wordMetasWithoutPinyin 94 | .filter((w) => !w.stroke_order && w.glyph_font_exists) 95 | .map((meta) => meta.value) 96 | .join(', ') 97 | ); 98 | 99 | console.log( 100 | '- 有字形无笔顺字列表:' + 101 | wordMetasWithGlyph 102 | .filter((w) => !w.stroke_order) 103 | .map((meta) => meta.value) 104 | .join(', ') 105 | ); 106 | console.log( 107 | '- 有字形无拼音字列表:' + 108 | wordMetasWithGlyph 109 | .filter((w) => w.pinyins.length === 0) 110 | .map((meta) => meta.value) 111 | .join(', ') 112 | ); 113 | console.log( 114 | '- 有字形无拼音无笔顺字列表:' + 115 | wordMetasWithGlyph 116 | .filter((w) => w.pinyins.length === 0 && !w.stroke_order) 117 | .map((meta) => meta.value) 118 | .join(', ') 119 | ); 120 | // console.log( 121 | // '- 有字形字列表:' + 
wordMetasWithGlyph.map((meta) => meta.value).join(', ') 122 | // ); 123 | 124 | // console.log( 125 | // '- 无字形有拼音字列表:' + 126 | // wordMetasWithoutGlyph 127 | // .filter((w) => w.pinyins.length !== 0) 128 | // .map((meta) => `${meta.value}(${meta.unicode})`) 129 | // .join(', ') 130 | // ); 131 | // console.log( 132 | // '- 无字形有笔顺字列表:' + 133 | // wordMetasWithoutGlyph 134 | // .filter((w) => w.stroke_order) 135 | // .map((meta) => `${meta.value}(${meta.unicode})`) 136 | // .join(', ') 137 | // ); 138 | // console.log( 139 | // '- 无字形有拼音有笔顺字列表:' + 140 | // wordMetasWithoutGlyph 141 | // .filter((w) => w.pinyins.length !== 0 && w.stroke_order) 142 | // .map((meta) => `${meta.value}(${meta.unicode})`) 143 | // .join(', ') 144 | // ); 145 | // console.log( 146 | // '- 无字形字列表:' + 147 | // wordMetasWithoutGlyph.map((meta) => meta.value).join(', ') 148 | // ); 149 | console.log(); 150 | 151 | console.log(); 152 | console.log('按字形计算字的权重 ...'); 153 | calculateWordWeightByGlyph(wordMetasWithGlyph); 154 | console.log(); 155 | 156 | console.log(); 157 | console.log('为字/词增加使用权重 ...'); 158 | const wordUsages = await readWordUsage(wordUsageDataFile); 159 | const phraseUsages = await readPhraseUsage(phraseUsageDataFile); 160 | plusWordUsageWeight(wordMetasWithGlyph, wordUsages); 161 | plusPhraseUsageWeight(wordMetasWithGlyph, phraseUsages); 162 | console.log(); 163 | 164 | console.log(); 165 | console.log('保存有字形的字数据 ...'); 166 | saveWordMetasToFile(dictDataValidFile, wordMetasWithGlyph); 167 | console.log('有字形的字数据已保存至:' + dictDataValidFile); 168 | console.log(); 169 | -------------------------------------------------------------------------------- /tools/pinyin-dict/src/generate/sqlite/ime/ime.mjs: -------------------------------------------------------------------------------- 1 | import { asyncForEach } from '#utils/utils.mjs'; 2 | import { saveToDB, removeFromDB, execSQL } from '#utils/sqlite.mjs'; 3 | 4 | export { openDB as open, closeDB as close } from '#utils/sqlite.mjs'; 5 
| 6 | // 查看表上的索引: PRAGMA index_list('MyTable'); 7 | // 查看索引的列: PRAGMA index_info('MyIndex'); 8 | 9 | // 除主键外,唯一性约束、外键约束、索引等均在 IME 客户端初始化时设置, 10 | // 从而降低 App 的打包大小,其相关的数据准确性由原始字典库保证。 11 | // Note: 在 IME 客户端,对于只读不写的表,其外键约束也可以去掉,但需添加索引 12 | 13 | /* 14 | -- 查询繁/简体 15 | select 16 | w_.id_, 17 | w_.word_, 18 | w_.spell_, 19 | w_.traditional_, 20 | w_.variant_ 21 | from 22 | pinyin_word w_ 23 | where 24 | w_.variant_ is not null 25 | order by 26 | w_.spell_ 27 | ; 28 | */ 29 | 30 | /** 同步字信息 */ 31 | export async function syncWords(imeDB, rawDB) { 32 | await execSQL( 33 | imeDB, 34 | ` 35 | -- 拼音字母组合 36 | create table 37 | if not exists meta_pinyin_chars ( 38 | id_ integer not null primary key, 39 | value_ text not null 40 | -- , unique (value_) 41 | ); 42 | 43 | -- 在一张表中记录拼音字的全部信息,从而降低数据库文件大小,同时消除表连接以提升查询性能 44 | create table 45 | if not exists pinyin_word ( 46 | id_ integer not null primary key, 47 | -- 字 48 | word_ text not null, 49 | word_id_ integer not null, 50 | -- 拼音 51 | spell_ text not null, 52 | spell_id_ integer not null, 53 | spell_chars_id_ integer not null, 54 | -- 字使用权重 55 | used_weight_ integer default 0, 56 | -- 按拼音分组计算的字形权重 57 | glyph_weight_ integer default 0, 58 | -- 部首 59 | radical_ text default null, 60 | radical_stroke_count_ integer default 0, 61 | -- 当前拼音字的繁/简字及其 id(对应 pinyin_word 表的 id_) 62 | variant_ text default null, 63 | variant_id_ integer default null, 64 | traditional_ integer default 0 65 | ); 66 | ` 67 | ); 68 | await syncTableData(imeDB, rawDB, ['meta_pinyin_chars']); 69 | 70 | // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 71 | const dataInRawDB = {}; 72 | ( 73 | await rawDB.all( 74 | ` 75 | select 76 | lnk_.id_ as id_, 77 | word_.value_ as word_, 78 | word_.id_ as word_id_, 79 | spell_.value_ as spell_, 80 | spell_.id_ as spell_id_, 81 | spell_.chars_id_ as spell_chars_id_, 82 | word_.used_weight_ as used_weight_, 83 | lnk_.glyph_weight_ as glyph_weight_, 84 | word_.traditional_ as traditional_, 85 | radical_.value_ as radical_, 86 | 
radical_.stroke_count_ as radical_stroke_count_, 87 | -- 提供字段占位,以确保做更新检查时,数据中已包含全部的字段 88 | null as variant_, 89 | null as variant_id_ 90 | from 91 | meta_word_with_pinyin lnk_ 92 | inner join meta_word word_ on word_.id_ = lnk_.word_id_ 93 | inner join meta_pinyin spell_ on spell_.id_ = lnk_.spell_id_ 94 | inner join meta_word_radical radical_ on radical_.id_ = word_.radical_id_ 95 | ` 96 | ) 97 | ).forEach((row) => { 98 | dataInRawDB[row.id_] = row; 99 | }); 100 | // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 101 | 102 | // <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< 103 | // 绑定各个拼音字的繁/简体,且繁/简体的读音需相同 104 | const pinyinWords = {}; 105 | Object.keys(dataInRawDB).forEach((id) => { 106 | const raw = dataInRawDB[id]; 107 | 108 | pinyinWords[raw.word_id_] ||= []; 109 | pinyinWords[raw.word_id_].push(raw); 110 | }); 111 | 112 | const words = {}; 113 | (await rawDB.all(`select * from meta_word`)).forEach((row) => { 114 | words[row.id_] = row; 115 | }); 116 | const getWord = (id) => words[id].value_; 117 | 118 | const variantWords = {}; 119 | ( 120 | await rawDB.all( 121 | ` 122 | select 123 | source_id_, target_id_, 0 as traditional_ 124 | from link_word_with_simple_word 125 | union 126 | select 127 | source_id_, target_id_, 1 as traditional_ 128 | from link_word_with_traditional_word 129 | ` 130 | ) 131 | ).forEach((row) => { 132 | if ((pinyinWords[row.source_id_] || []).length == 0) { 133 | console.log( 134 | `字 ${getWord(row.source_id_)}:${row.source_id_} 没有拼音信息` 135 | ); 136 | return; 137 | } 138 | if ((pinyinWords[row.target_id_] || []).length == 0) { 139 | console.log( 140 | `字 ${getWord(row.target_id_)}:${row.target_id_} 没有拼音信息` 141 | ); 142 | return; 143 | } 144 | 145 | const variant = variantWords[row.source_id_]; 146 | if (variant) { 147 | console.log( 148 | `字 ${getWord(row.source_id_)}:${ 149 | row.source_id_ 150 | } 存在多个繁/简体:${getWord(row.target_id_)}:${row.target_id_}:${ 151 | row.traditional_ 152 | }, ${getWord(variant.target_id_)}:${variant.target_id_}:${ 153 | 
variant.traditional_ 154 | }` 155 | ); 156 | } else if (row.source_id_ == row.target_id_) { 157 | console.log( 158 | `繁/简字同体:${getWord(row.source_id_)}:${ 159 | row.source_id_ 160 | } <=> ${getWord(row.target_id_)}:${row.target_id_}` 161 | ); 162 | } else { 163 | variantWords[row.source_id_] = row; 164 | } 165 | }); 166 | 167 | Object.keys(variantWords).forEach((source_id_) => { 168 | const { target_id_ } = variantWords[source_id_]; 169 | 170 | const sources = pinyinWords[source_id_]; 171 | const targets = pinyinWords[target_id_]; 172 | 173 | sources.forEach((source) => { 174 | const exist = targets.filter( 175 | (target) => target.spell_id_ == source.spell_id_ 176 | )[0]; 177 | 178 | if (!exist) { 179 | console.log( 180 | `拼音字 ${source.id_}:${source.word_}:${ 181 | source.spell_ 182 | } 没有相同读音的繁/简体:${targets[0].word_}:${targets 183 | .map((t) => t.spell_) 184 | .join(',')}` 185 | ); 186 | return; 187 | } 188 | 189 | // 绑定繁/简体的拼音字 190 | source.variant_ = exist.word_; 191 | source.variant_id_ = exist.id_; 192 | }); 193 | }); 194 | // >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> 195 | 196 | const dataInImeDB = {}; 197 | const missingDataInImeDB = []; 198 | (await imeDB.all(`select * from pinyin_word`)).forEach((row) => { 199 | const id = row.id_; 200 | const raw = dataInRawDB[id]; 201 | 202 | if (raw) { 203 | // 待更新 204 | dataInImeDB[id] = { 205 | ...raw, 206 | __exist__: row 207 | }; 208 | 209 | // Note: 在原始数据中仅保留待新增的 210 | delete dataInRawDB[id]; 211 | } else { 212 | // 待删除 213 | missingDataInImeDB.push(id); 214 | } 215 | }); 216 | 217 | // 添加新数据 218 | await saveToDB(imeDB, 'pinyin_word', dataInRawDB); 219 | // 更新已存在数据 220 | await saveToDB(imeDB, 'pinyin_word', dataInImeDB); 221 | // 删除多余数据 222 | await removeFromDB(imeDB, 'pinyin_word', missingDataInImeDB); 223 | } 224 | 225 | /** 同步表情符号信息 */ 226 | export async function syncEmojis(imeDB, rawDB) { 227 | await execSQL( 228 | imeDB, 229 | ` 230 | create table 231 | if not exists meta_emoji_group ( 232 | id_ integer not null 
primary key, 233 | value_ text not null 234 | -- , unique (value_) 235 | ); 236 | 237 | create table 238 | if not exists meta_emoji ( 239 | id_ integer not null primary key, 240 | value_ text not null, 241 | group_id_ interget not null, 242 | -- 表情关键字中的字 id(meta_word 中的 id)数组列表:二维 json 数组形式 243 | keyword_ids_list_ text not null 244 | -- , unique (value_), 245 | -- foreign key (group_id_) references meta_emoji_group (id_) 246 | ); 247 | ` 248 | ); 249 | 250 | await syncTableData(imeDB, rawDB, ['meta_emoji_group', 'meta_emoji']); 251 | } 252 | 253 | async function syncTableData(imeDB, rawDB, tables) { 254 | await asyncForEach(tables, async (tableInfo) => { 255 | const table = typeof tableInfo === 'string' ? tableInfo : tableInfo.name; 256 | const columnsInImeDB = []; 257 | const primaryKeysInImeDB = []; 258 | 259 | (await imeDB.all(`select name,pk from pragma_table_info('${table}');`)).map( 260 | (row) => { 261 | columnsInImeDB.push(row.name); 262 | if (row.pk > 0) { 263 | primaryKeysInImeDB.push(row.name); 264 | } 265 | } 266 | ); 267 | 268 | const getId = (row) => { 269 | return primaryKeysInImeDB.map((key) => row[key]).join(':'); 270 | }; 271 | 272 | const dataInRawDB = {}; 273 | ( 274 | await rawDB.all( 275 | typeof tableInfo === 'string' 276 | ? 
`select * from ${table}` 277 | : tableInfo.select 278 | ) 279 | ).forEach((row) => { 280 | const id = getId(row); 281 | 282 | const data = (dataInRawDB[id] = {}); 283 | columnsInImeDB.forEach((column) => { 284 | data[column] = row[column]; 285 | }); 286 | }); 287 | 288 | const dataInImeDB = {}; 289 | const missingDataInImeDB = []; 290 | (await imeDB.all(`select * from ${table}`)).forEach((row) => { 291 | const id = getId(row); 292 | const exist = dataInRawDB[id]; 293 | 294 | if (exist) { 295 | // 待更新 296 | dataInImeDB[id] = { 297 | ...row, 298 | __exist__: exist 299 | }; 300 | 301 | // Note: 在原始数据中仅保留待新增的 302 | delete dataInRawDB[id]; 303 | } else { 304 | // 待删除 305 | missingDataInImeDB.push(primaryKeysInImeDB.length === 0 ? id : row); 306 | } 307 | }); 308 | 309 | // 添加新数据 310 | await saveToDB(imeDB, table, dataInRawDB, false, primaryKeysInImeDB); 311 | // 更新已存在数据 312 | await saveToDB(imeDB, table, dataInImeDB, false, primaryKeysInImeDB); 313 | 314 | // 删除多余数据 315 | await removeFromDB(imeDB, table, missingDataInImeDB, primaryKeysInImeDB); 316 | }); 317 | } 318 | -------------------------------------------------------------------------------- /tools/pinyin-dict/src/generate/sqlite/ime/index.mjs: -------------------------------------------------------------------------------- 1 | /* 供输入法使用的 SQLite 字典库 */ 2 | import { 3 | fromRootPath, 4 | fileSHA256, 5 | appendLineToFile, 6 | copyFile 7 | } from '#utils/utils.mjs'; 8 | import * as ime from './ime.mjs'; 9 | 10 | // SQLite 字典库 11 | const wordDictSQLiteFile = fromRootPath('data', 'pinyin-word-dict.sqlite'); 12 | // SQLite 词典库 13 | const phraseDictSQLiteFile = fromRootPath('data', 'pinyin-phrase-dict.sqlite'); 14 | 15 | // 适用于 IME 输入法的 SQLite 字典库 16 | const wordDictImeSQLiteFile = fromRootPath( 17 | '../..', 18 | 'android/app/src/main/res/raw/pinyin_word_dict.db' 19 | ); 20 | const wordDictImeSQLiteHashFile = fromRootPath( 21 | '../..', 22 | 'android/app/src/main/res/raw/pinyin_word_dict_db_hash' 23 | ); 24 | // 输入法的 
SQLite 词典库 25 | const phraseDictImeSQLiteFile = fromRootPath( 26 | '../..', 27 | 'android/app/src/main/res/raw/pinyin_phrase_dict.db' 28 | ); 29 | const phraseDictImeSQLiteHashFile = fromRootPath( 30 | '../..', 31 | 'android/app/src/main/res/raw/pinyin_phrase_dict_db_hash' 32 | ); 33 | 34 | // ===================================================== 35 | console.log(); 36 | console.log('同步汉字数据到输入法的 SQLite 字典库 ...'); 37 | const wordDictDB = await ime.open(wordDictSQLiteFile, true); 38 | const imeWordDictDB = await ime.open(wordDictImeSQLiteFile); 39 | 40 | try { 41 | await ime.syncWords(imeWordDictDB, wordDictDB); 42 | console.log('- 已同步字信息'); 43 | 44 | await ime.syncEmojis(imeWordDictDB, wordDictDB); 45 | console.log('- 已同步表情符号数据'); 46 | } catch (e) { 47 | throw e; 48 | } finally { 49 | await ime.close(wordDictDB); 50 | await ime.close(imeWordDictDB); 51 | } 52 | 53 | const imeWordDictDBFileHash = fileSHA256(wordDictImeSQLiteFile); 54 | appendLineToFile(wordDictImeSQLiteHashFile, imeWordDictDBFileHash, true); 55 | console.log('- 已记录数据库 Hash 值:' + imeWordDictDBFileHash); 56 | 57 | // ===================================================== 58 | console.log(); 59 | console.log('同步词组数据到输入法的 SQLite 词典库 ...'); 60 | 61 | copyFile(phraseDictSQLiteFile, phraseDictImeSQLiteFile, true); 62 | 63 | const imePhraseDictDBFileHash = fileSHA256(phraseDictImeSQLiteFile); 64 | appendLineToFile(phraseDictImeSQLiteHashFile, imePhraseDictDBFileHash, true); 65 | console.log('- 已记录数据库 Hash 值:' + imePhraseDictDBFileHash); 66 | 67 | // ===================================================== 68 | console.log(); 69 | -------------------------------------------------------------------------------- /tools/pinyin-dict/src/generate/sqlite/phrase/hmm/index.mjs: -------------------------------------------------------------------------------- 1 | /* SQLite 词典库 */ 2 | import { fromRootPath, readJSONFromFile } from '#utils/utils.mjs'; 3 | import * as sqlite from './sqlite.mjs'; 4 | 5 | // SQLite 字典库 6 | 
const wordDictSQLiteFile = fromRootPath('data', 'pinyin-word-dict.sqlite'); 7 | // HMM 参数目录 8 | const transParamsDir = fromRootPath('data/hmm_params/kewen'); 9 | // SQLite 词典库 10 | const phraseDictSQLiteFile = fromRootPath('data', 'pinyin-phrase-dict.sqlite'); 11 | 12 | console.log(); 13 | console.log('创建 SQLite 词典库(累积更新) ...'); 14 | let wordDictDB = await sqlite.open(wordDictSQLiteFile, true); 15 | let phraseDictDB = await sqlite.open(phraseDictSQLiteFile); 16 | 17 | try { 18 | await sqlite.updateData(phraseDictDB, wordDictDB, { 19 | word_prob: readJSONFromFile(transParamsDir + '/word_prob.json'), 20 | // 汉字间转移概率矩阵:当前字与前一个字的关联概率 21 | trans_prob: readJSONFromFile(transParamsDir + '/trans_prob.json') 22 | }); 23 | console.log('- 已创建词典库'); 24 | } catch (e) { 25 | throw e; 26 | } finally { 27 | await sqlite.close(wordDictDB); 28 | await sqlite.close(phraseDictDB); 29 | } 30 | 31 | console.log(); 32 | -------------------------------------------------------------------------------- /tools/pinyin-dict/src/generate/sqlite/phrase/hmm/sqlite.mjs: -------------------------------------------------------------------------------- 1 | import { asyncForEach } from '#utils/utils.mjs'; 2 | import { saveToDB, removeFromDB, execSQL } from '#utils/sqlite.mjs'; 3 | 4 | export { 5 | openDB as open, 6 | closeDB as close, 7 | attachDB as attach, 8 | execSQL as exec 9 | } from '#utils/sqlite.mjs'; 10 | 11 | // 查看表上的索引:PRAGMA index_list('MyTable'); 12 | // 查看索引的列:PRAGMA index_info('MyIndex'); 13 | // 《基于HMM的拼音输入法》:https://zhuanlan.zhihu.com/p/508599305 14 | // 《自制输入法:拼音输入法与 HMM》: https://elliot00.com/posts/input-method-hmm 15 | 16 | /** 初始化词典库的表结构 */ 17 | async function init(db) { 18 | await execSQL( 19 | db, 20 | ` 21 | -- Note:采用联合主键,以降低数据库文件大小 22 | 23 | -- 汉字出现次数 24 | create table 25 | if not exists phrase_word ( 26 | -- 具体读音的字 id 27 | -- Note:其为字典库中 字及其拼音表(link_word_with_pinyin)中的 id 28 | word_id_ integer not null, 29 | 30 | -- 短语中的字权重:出现次数 31 | weight_ integer not null, 32 | 33 | 
primary key (word_id_) 34 | ); 35 | 36 | -- 汉字间转移概率矩阵:当前字与前一个字的关联次数(概率在应用侧计算) 37 | create table 38 | if not exists phrase_trans_prob ( 39 | -- 当前拼音字 id: EOS 用 -1 代替(句尾字) 40 | -- Note:其为字典库中 字及其拼音表(link_word_with_pinyin)中的 id 41 | word_id_ integer not null, 42 | 43 | -- 前序拼音字 id: BOS 用 -1 代替(句首字),__total__ 用 -2 代替 44 | -- Note:其为字典库中 字及其拼音表(link_word_with_pinyin)中的 id 45 | prev_word_id_ integer not null, 46 | 47 | -- 字出现的次数 48 | -- Note:当 word_id_ == -1 且 prev_word_id_ == -2 时, 49 | -- 其代表训练数据的句子总数,用于计算 句首字出现频率; 50 | -- 51 | -- 当 word_id_ == -1 且 prev_word_id_ != -1 时, 52 | -- 其代表末尾字出现次数; 53 | -- 54 | -- 当 word_id_ != -1 且 prev_word_id_ == -1 时, 55 | -- 其代表句首字出现次数; 56 | -- 57 | -- 当 word_id_ != -1 且 prev_word_id_ == -2 时, 58 | -- 其代表当前拼音字的转移总数; 59 | -- 60 | -- 当 word_id_ != -1 且 prev_word_id_ != -1 时, 61 | -- 其代表前序拼音字的出现次数; 62 | value_ integer not null, 63 | 64 | primary key (word_id_, prev_word_id_) 65 | ); 66 | ` 67 | ); 68 | } 69 | 70 | /** 根据 HMM 参数创建词典库 */ 71 | export async function updateData(phraseDictDB, wordDictDB, hmmParams) { 72 | await init(phraseDictDB); 73 | 74 | // ======================================================= 75 | const word_dict = { 76 | pinyin_chars: {}, 77 | pinyin_word: { BOS: -1, EOS: -1, __total__: -2 } 78 | }; 79 | await asyncForEach( 80 | [ 81 | { 82 | table: 'meta_pinyin', 83 | prop: 'pinyin_chars', 84 | fields: 'chars_id_ as id_, value_' 85 | }, 86 | { 87 | table: 'pinyin_word', 88 | prop: 'pinyin_word', 89 | fields: "id_, (word_ || ':' || spell_) as value_" 90 | } 91 | ], 92 | async ({ table, fields, prop }) => { 93 | (await wordDictDB.all(`select ${fields} from ${table}`)).forEach( 94 | (row) => { 95 | const { id_, value_ } = row; 96 | 97 | word_dict[prop][value_] = id_; 98 | } 99 | ); 100 | } 101 | ); 102 | 103 | // ======================================================= 104 | const pred_dict = { 105 | word_chars: {}, 106 | trans_prob: {} 107 | }; 108 | // {'': {'': 123, ...}, ...} 109 | 
Object.keys(hmmParams.trans_prob).forEach((word_code) => { 110 | const probs = hmmParams.trans_prob[word_code]; 111 | const word_id = word_dict.pinyin_word[word_code]; 112 | 113 | if (!word_id) { 114 | console.log('汉字间转移概率矩阵中的当前字不存在:', word_code); 115 | return; 116 | } 117 | 118 | Object.keys(probs).forEach((prev_word_code) => { 119 | const prob_value = probs[prev_word_code]; 120 | const prev_word_id = word_dict.pinyin_word[prev_word_code]; 121 | 122 | const prob_code = `${word_id}:${prev_word_id}`; 123 | 124 | if (!prev_word_id) { 125 | console.log( 126 | '汉字间转移概率矩阵中的前序字不存在:', 127 | prob_code, 128 | prev_word_code, 129 | prob_value 130 | ); 131 | return; 132 | } 133 | 134 | pred_dict.trans_prob[prob_code] = { 135 | word_id_: word_id, 136 | prev_word_id_: prev_word_id, 137 | value_: prob_value 138 | }; 139 | }); 140 | }); 141 | 142 | // 收集字权重信息 143 | // {'': 12, ...} 144 | Object.keys(hmmParams.word_prob).forEach((word_code) => { 145 | const weight = hmmParams.word_prob[word_code]; 146 | const word_id = word_dict.pinyin_word[word_code]; 147 | 148 | pred_dict.word_chars[word_id] = { 149 | word_id_: word_id, 150 | weight_: weight 151 | }; 152 | }); 153 | 154 | // ======================================================= 155 | await asyncForEach( 156 | [ 157 | { 158 | table: 'phrase_word', 159 | prop: 'word_chars', 160 | primaryKeys: ['word_id_'], 161 | update: (data, row) => { 162 | data.weight_ += row.weight_; 163 | } 164 | }, 165 | { 166 | table: 'phrase_trans_prob', 167 | prop: 'trans_prob', 168 | primaryKeys: ['word_id_', 'prev_word_id_'], 169 | update: (data, row) => { 170 | data.value_ += row.value_; 171 | } 172 | } 173 | ], 174 | async ({ table, prop, primaryKeys, update }) => { 175 | const data = pred_dict[prop]; 176 | 177 | (await phraseDictDB.all(`select * from ${table}`)).forEach((row) => { 178 | const code = primaryKeys.map((k) => row[k]).join(':'); 179 | 180 | if (data[code]) { 181 | data[code].__exist__ = row; 182 | update(data[code], row); 183 | } 184 | 
}); 185 | 186 | await saveToDB(phraseDictDB, table, data, true, primaryKeys); 187 | } 188 | ); 189 | } 190 | -------------------------------------------------------------------------------- /tools/pinyin-dict/src/generate/sqlite/phrase/hmm/trans/index.mjs: -------------------------------------------------------------------------------- 1 | /* 生成 HMM 计算参数 */ 2 | import { 3 | fromRootPath, 4 | appendLineToFile, 5 | getAllFiles, 6 | readJSONFromFile, 7 | readFile 8 | } from '#utils/utils.mjs'; 9 | import { openDB, closeDB } from '#utils/sqlite.mjs'; 10 | import { readWordsFromDB } from '../utils.mjs'; 11 | import * as trans from './trans.mjs'; 12 | 13 | // 样本文件。可试用样本如下: 14 | // - [已分词] https://raw.githubusercontent.com/InsaneLife/ChineseNLPCorpus/master/NER/MSRA/train1.txt 15 | // - [已分词] https://raw.githubusercontent.com/InsaneLife/ChineseNLPCorpus/master/NER/renMinRiBao/renmin.txt 16 | let phraseSampleFiles = []; 17 | let appendExistData = false; 18 | 19 | const args = process.argv.slice(2); 20 | for (let i = 0; i < args.length; i++) { 21 | const arg = args[i]; 22 | if (arg == '-f') { 23 | phraseSampleFiles.push(args[++i]); 24 | } else if (arg == '-a') { 25 | appendExistData = true; 26 | } 27 | } 28 | 29 | if (!phraseSampleFiles) { 30 | console.log( 31 | 'Usage: npm run generate:sqlite:phrase:hmm:trans -- [-a] -f /file1 -f file2 ...' 32 | ); 33 | console.log(); 34 | 35 | process.exit(1); 36 | } 37 | 38 | // SQLite 字典库 39 | const wordDictSQLiteFile = fromRootPath('data', 'pinyin-word-dict.sqlite'); 40 | // HMM 参数目录 41 | const transParamsDir = fromRootPath('data', 'hmm_params'); 42 | 43 | console.log(); 44 | console.log(`创建计算参数${appendExistData ? '(累积更新)' : ''} ...`); 45 | let wordDictDB = await openDB(wordDictSQLiteFile, true); 46 | 47 | let words; 48 | try { 49 | words = await readWordsFromDB(wordDictDB); 50 | } catch (e) { 51 | throw e; 52 | } finally { 53 | await closeDB(wordDictDB); 54 | } 55 | 56 | let transParams = appendExistData 57 | ? 
{ 58 | word_prob: readJSONFromFile(transParamsDir + `/word_prob.json`), 59 | trans_prob: readJSONFromFile(transParamsDir + `/trans_prob.json`) 60 | } 61 | : null; 62 | 63 | getAllFiles(phraseSampleFiles).forEach((file) => { 64 | console.log(` - 分析文件: ${file} ...`); 65 | 66 | const sampleText = readFile(file); 67 | 68 | transParams = trans.countParams(sampleText, words, transParams); 69 | }); 70 | 71 | Object.keys(transParams).forEach((name) => { 72 | appendLineToFile( 73 | transParamsDir + `/${name}.json`, 74 | JSON.stringify(transParams[name]), 75 | true 76 | ); 77 | }); 78 | 79 | console.log(); 80 | console.log('Done'); 81 | console.log(); 82 | -------------------------------------------------------------------------------- /tools/pinyin-dict/src/generate/sqlite/phrase/hmm/trans/trans.mjs: -------------------------------------------------------------------------------- 1 | import { getPinyinTone, getPinyin } from '#utils/utils.mjs'; 2 | import { countTrans, countWords } from '../utils.mjs'; 3 | 4 | /** 5 | * HMM 参数计算 6 | * trans_prob - 汉字间转移概率 7 | */ 8 | export function countParams(sampleText, words, existParams) { 9 | const clauses = extractClauses(sampleText, words); 10 | 11 | existParams = existParams || { word_prob: {}, trans_prob: {} }; 12 | 13 | return { 14 | // 字的出现次数 15 | word_prob: countWords(clauses, existParams.word_prob), 16 | // 当前字为 EOS 且其前序为 BOS 的转移次数即为 训练的句子总数, 17 | // 而各个包含 BOS 前序的字即为句首字,且其出现次数即为 BOS 的值 18 | trans_prob: countTrans(clauses, existParams.trans_prob) 19 | }; 20 | } 21 | 22 | // extractClauses('迈向/v 充满/v 希望/n 的/u 新/a 世纪/n', { 23 | // 力: true, 24 | // 争: true, 25 | // 达: true, 26 | // 到: true, 27 | // 万: true, 28 | // 标: true, 29 | // 准: true, 30 | // 箱: true 31 | // }); 32 | /** 拆分样本数据,按汉字短句返回 */ 33 | export function extractClauses(sampleText, words) { 34 | const clauses = getClauses(sampleText, words); 35 | 36 | const result = []; 37 | clauses.forEach((clause) => { 38 | const phrase = clause.join(''); 39 | const pinyins = 
/**
 * Splits the sample text into clauses and converts every clause into a list
 * of `字:pinyin` codes.
 *
 * Counting is done directly on `字:pinyin` pairs, so no separate
 * pinyin-to-character emission probability has to be computed afterwards.
 * Clauses for which `correctPinyin` yields at least one `null` entry are
 * logged and left out of the result.
 *
 * @param sampleText word-segmented sample text, e.g. `迈向/v 充满/v 希望/n`
 * @param words dictionary object, passed on to `getClauses` and `correctPinyin`
 * @return e.g. `[['迈:mài', ...], [...], ...]`
 */
export function extractClauses(sampleText, words) {
  const result = [];

  for (const clause of getClauses(sampleText, words)) {
    const phrase = clause.join('');
    // NOTE(review): `getPinyin` is handed to `map` directly, so it also
    // receives the index and the array - confirm it ignores extra arguments
    const pinyins = clause.map(getPinyin).reduce((ret, p) => ret.concat(p), []);

    const pinyinWords = correctPinyin(phrase, pinyins, words);
    // Skip phrases that contain an invalid pinyin
    if (pinyinWords.includes(null)) {
      console.log(
        ` - 忽略包含无效拼音的短语 '${phrase}': `,
        pinyinWords
          .map((p, i) => (p ? null : `${phrase.charAt(i)}:${pinyins[i]}`))
          .filter((w) => !!w)
          .join()
      );
      continue;
    }

    result.push(pinyinWords);
  }

  return result;
}

/**
 * Groups consecutive valid phrases of the sample text into clauses.
 *
 * The text is split on part-of-speech tags (`/v `, `/n `, ...). Every phrase
 * that is not made up of dictionary characters only (punctuation, latin
 * text, ...) ends the current clause.
 *
 * @return e.g. `[['迈向', '充满', '的'], [...], ...]`
 */
export function getClauses(sampleText, words) {
  const clauses = [];

  let clause = [];
  const flush = () => {
    if (clause.length > 0) {
      clauses.push(clause);
    }
    clause = [];
  };

  for (const phrase of sampleText.split(/\/[a-z]+\s+/g)) {
    // The split leaves an empty token behind a trailing tag; it carries no
    // characters and must neither join nor terminate a clause
    if (phrase === '') {
      continue;
    }

    if (isValidPhrase(phrase, words)) {
      clause.push(phrase);
    } else {
      flush();
    }
  }
  // The last clause has no terminating phrase, so it is flushed explicitly;
  // otherwise a text ending with a valid phrase would lose that clause
  flush();

  return clauses;
}

/** Checks that every character of the phrase is a known, non-excluded dictionary character */
function isValidPhrase(phrase, words) {
  const excludes = ['丨', '丶', '氵'];

  for (let i = 0; i < phrase.length; i++) {
    const word = phrase.charAt(i);

    if (!words[word] || excludes.includes(word)) {
      return false;
    }
  }
  return true;
}
/**
 * Readings that are unambiguous for a given `字:pinyin` pair as produced by
 * the pinyin conversion: the key is the produced pair, the value is the
 * reading that is used instead.
 */
const UNIQUE_PINYINS = {
  '上:shang': 'shàng',
  '生:sheng': 'shēng',
  '娘:niang': 'niáng',
  '袱:fu': 'fú',
  '伍:wu': 'wǔ',
  '事:shi': 'shì',
  '情:qing': 'qíng',
  '下:xia': 'xià',
  '同:tong': 'tóng',
  '个:ge': 'gè',
  '喇:lā': 'lǎ',
  '姑:gu': 'gū',
  '闹:nao': 'nào',
  '实:shi': 'shí',
  '人:ren': 'rén',
  '萄:tao': 'táo',
  '究:jiu': 'jiū',
  '太:tai': 'tài',
  '芦:lu': 'lú',
  '嚣:áo': 'xiāo',
  '篷:peng': 'péng',
  '哈:ha': 'hā',
  '亮:liang': 'liàng',
  '栏:lan': 'lán',
  '悉:xi': 'xī',
  '桃:tao': 'táo',
  '气:qi': 'qì',
  '户:hu': 'hù',
  '脯:pú': 'fǔ',
  '敞:chang': 'chǎng',
  '复:fu': 'fù',
  '务:wu': 'wù',
  '歌:ge': 'gē',
  '欢:huan': 'huān',
  '氛:fen': 'fēn',
  '宝:bao': 'bǎo',
  '蟆:ma': 'má',
  '利:li': 'lì',
  '成:cheng': 'chéng',
  '婆:po': 'pó',
  '甲:jia': 'jiǎ',
  '腐:fu': 'fǔ',
  '摸:mo': 'mō',
  '郗:chī': 'xī',
  '女:nü': 'nǚ',
  '才:cai': 'cái',
  '蛐:qu': 'qū',
  '呼:hu': 'hū',
  '话:hua': 'huà',
  '嫂:sao': 'sǎo',
  '辑:ji': 'jí',
  '算:suan': 'suàn',
  '烦:fan': 'fán',
  '屉:ti': 'tì',
  '方:fang': 'fāng',
  '叔:shu': 'shū',
  '应:ying': 'yìng',
  '戚:qi': 'qī',
  '麻:ma': 'má',
  '拉:la': 'lā',
  '司:si': 'sī',
  '瑰:gui': 'guī',
  '牌:pai': 'pái',
  '疾:ji': 'jí',
  '误:wu': 'wù',
  '叭:ba': 'bā',
  '付:fu': 'fù',
  '蠡:lí': 'lǐ',
  '镗:táng': 'tāng',
  '毛:mao': 'máo',
  '荡:dang': 'dàng',
  '拾:shi': 'shí',
  '系:xi': 'xì',
  '妇:fu': 'fù',
  '仗:zhang': 'zhàng',
  '面:mian': 'miàn',
  '甥:sheng': 'shēng',
  '快:kuai': 'kuài',
  '婿:xu': 'xù',
  '计:ji': 'jì',
  '明:ming': 'míng',
  '琶:pa': 'pá',
  '遛:liú': 'liù',
  '兄:xiong': 'xiōng',
  '搁:ge': 'gē',
  '们:men': 'mén',
  '友:you': 'yǒu',
  '难:nan': 'nán',
  '分:fen': 'fēn',
  '识:shi': 'shí',
  '食:shi': 'shí',
  '得:de': 'dé',
  '星:xing': 'xīng',
  '笼:long': 'lóng',
  '爷:ye': 'yé',
  '奶:nai': 'nǎi',
  '爸:ba': 'bà',
  '妈:ma': 'mā',
  '儿:er': 'ér',
  '哥:ge': 'gē',
  '服:fu': 'fú',
  '睛:jing': 'jīng',
  '弟:di': 'dì',
  '妹:mei': 'mèi',
  '候:hou': 'hòu',
  '腾:teng': 'téng',
  '璃:li': 'lí',
  '息:xi': 'xī',
  '傅:fu': 'fù',
  '娃:wa': 'wá',
  '卖:mai': 'mài',
  '屈:qu': 'qū',
  '思:si': 'sī',
  '活:huo': 'huó',
  '量:liang': 'liáng',
  '伯:bo': 'bó',
  '丧:sang': 'sàng',
  '嗦:suo': 'suō',
  '当:dang': 'dāng',
  '咕:gu': 'gū',
  '巴:ba': 'bā',
  '粑:ba': 'bā',
  '矩:ju': 'jǔ',
  '发:fa': 'fà',
  '合:he': 'hé',
  '帚:zhou': 'zhǒu',
  '蛋:dan': 'dàn',
  '枉:wang': 'wǎng',
  '泡:pao': 'pào',
  '酬:chou': 'chóu',
  '股:gu': 'gǔ',
  '剔:ti': 'tī',
  '西:xi': 'xī',
  '糊:hu': 'hú',
  '元:yuan': 'yuán',
  '杠:gang': 'gàng',
  '乎:hu': 'hū',
  '猬:wei': 'wèi',
  '指:zhi': 'zhǐ',
  '撒:sa': 'sā',
  '瞧:qiao': 'qiáo',
  '磨:mo': 'mó',
  '坊:fang': 'fáng',
  '叨:dao': 'dāo',
  '蹭:ceng': 'cèng',
  '姐:jie': 'jiě',
  '狸:li': 'lí',
  '楼:lou': 'lóu',
  '膊:bo': 'bó',
  '堂:tang': 'táng',
  '涂:tu': 'tú',
  '负:fu': 'fù',
  '灵:ling': 'líng',
  '菇:gu': 'gū',
  '舅:jiu': 'jiù',
  '饼:bing': 'bǐng',
  '罕:han': 'hǎn',
  '药:yao': 'yào',
  '筝:zheng': 'zhēng',
  '框:kuang': 'kuàng',
  '转:zhuan': 'zhuàn',
  '壳:ke': 'ké',
  '忽:hu': 'hū',
  '荒:huang': 'huāng',
  '莉:li': 'lì',
  '悠:you': 'yōu',
  '士:shi': 'shì',
  '嚷:rang': 'rāng',
  '笆:ba': 'bā',
  '窿:long': 'lóng',
  '缝:feng': 'féng',
  '口:kou': 'kǒu',
  '末:mo': 'mò',
  '里:li': 'lǐ',
  '叽:ji': 'jī',
  '心:xin': 'xīn',
  '宗:zong': 'zōng',
  '姥:lao': 'lǎo',
  '喝:he': 'hē',
  '伙:huo': 'huǒ',
  '囊:nang': 'nāng',
  '物:wu': 'wù',
  '嗽:sou': 'sòu',
  '咙:long': 'lóng'
};

/** Characters with several readings for which a toneless reading is accepted */
const NEUTRAL_TONE_WORDS = [
  '的',
  '不',
  '一',
  '着',
  '么',
  '了',
  '子',
  '啊',
  '呢',
  '吧',
  '宜',
  '吗',
  '家',
  '头',
  '呀',
  '卜',
  '和',
  '嘛',
  '地',
  '匙',
  '啦',
  '裳',
  '瘩',
  '喽'
];

/**
 * Corrects the reading of every character of a clause and pairs it with the
 * character.
 *
 * Context rules (tone sandhi of 不/一, fixed words, surnames) are applied
 * first; `UNIQUE_PINYINS` is only consulted when none of them matches. An
 * entry is `null` when the resulting reading is not listed for the character
 * in `words`, or when a character with several readings ends up toneless
 * without being in `NEUTRAL_TONE_WORDS`.
 *
 * @param clause the clause text; character `i` belongs to `pinyins[i]`
 * @param pinyins the readings produced for the clause
 * @param words dictionary in the form `{'字': ['zì'], ...}`
 * @return e.g. `['迈:mài', ...]`
 */
function correctPinyin(clause, pinyins, words) {
  return pinyins.map((pinyin, index) => {
    let word = clause.charAt(index);
    const wordCode = `${word}:${pinyin}`;
    const prevWord = clause.charAt(index - 1);
    const postWord = clause.charAt(index + 1);
    const postPinyin = pinyins[index + 1] || '';

    // 不 takes the 2nd tone before a 4th-tone character:
    // 不要, 不错, 不是, 不再, 不认识
    if (word === '不' && pinyin === 'bu') {
      // Loose comparisons are kept on purpose: the type returned by
      // `getPinyinTone` is not visible here - presumably 0 means "no tone"
      const tone = getPinyinTone(postPinyin);
      if (tone == 4) {
        pinyin = 'bú';
      } else if (tone != 0 || postWord === '得') {
        pinyin = 'bù';
      }
    }
    // 一 takes the 2nd tone before a 4th tone: 一样, 一下子, 一座, 一位, 一次, 一块儿
    // and the 4th tone before a 1st/2nd/3rd tone:
    // 大吃一惊, 一般, 一年, 一门, 一口, 一起, 一种
    else if (word === '一' && pinyin === 'yi') {
      const tone = getPinyinTone(postPinyin);
      if (tone == 4) {
        pinyin = 'yí';
      } else if (tone != 0) {
        pinyin = 'yì';
      }
    } else if (word === '同' && prevWord === '胡') {
      pinyin = 'tòng';
    } else if (word === '蕃' && postWord === '茄') {
      // 蕃茄 is counted as 番茄
      word = '番';
      pinyin = 'fān';
    } else if (word === '蕃' && prevWord === '吐') {
      pinyin = 'bō';
    } else if (word === '朵' && prevWord === '耳') {
      // 耳朵: returned as-is, without the dictionary checks below
      pinyin = 'duo';
      return `${word}:${pinyin}`;
    } else if (word === '脯' && (prevWord === '胸' || postWord === '子')) {
      pinyin = 'pú';
    } else if (
      word === '夫' &&
      ['丈', '工', '功', '姐', '大', '妹'].includes(prevWord)
    ) {
      pinyin = 'fū';
    } else if (word === '喇' && ['喇', '哗', '呼', '喀'].includes(prevWord)) {
      pinyin = 'lā';
    }
    // Surname reading: https://baike.baidu.com/item/%E5%96%87%E5%A7%93/9730899
    else if (
      word === '喇' &&
      (['哈', '半'].includes(prevWord) || ['进', '敏', '秉'].includes(postWord))
    ) {
      pinyin = 'lá';
    } else if (word === '大' && prevWord === '士' && postWord === '夫') {
      pinyin = 'dà';
    } else if (word === '大' && postWord === '夫') {
      pinyin = 'dài';
    } else if (word === '个' && prevWord === '自' && postWord === '儿') {
      pinyin = 'gě';
    } else if (UNIQUE_PINYINS[wordCode]) {
      pinyin = UNIQUE_PINYINS[wordCode];
    }

    // A substituted character (蕃 -> 番) is not guaranteed to be in the
    // dictionary, so a missing entry is treated like an unknown reading
    // instead of raising a TypeError
    const spells = words[word];
    if (!spells || !spells.includes(pinyin)) {
      return null;
    }

    if (
      spells.length > 1 &&
      getPinyinTone(pinyin) == 0 &&
      !NEUTRAL_TONE_WORDS.includes(word)
    ) {
      return null;
    }

    return `${word}:${pinyin}`;
  });
}
/* Generate the HMM computation parameters from textbook-lesson data */
import {
  fromRootPath,
  appendLineToFile,
  readLineFromFile,
  getAllFiles,
  readJSONFromFile,
  asyncForEach
} from '#utils/utils.mjs';
import { openDB, closeDB } from '#utils/sqlite.mjs';
import { readWordsFromDB } from '../utils.mjs';
import * as trans from './trans.mjs';

// Lesson-text training data, collected from the `-f` arguments
const phraseSampleFiles = [];
// `-a`: accumulate on top of the previously written parameters
let appendExistData = false;

const args = process.argv.slice(2);
for (let i = 0; i < args.length; i++) {
  const arg = args[i];
  if (arg === '-f') {
    // A trailing `-f` has no value; do not push `undefined` as a file
    const file = args[++i];
    if (file) {
      phraseSampleFiles.push(file);
    }
  } else if (arg === '-a') {
    appendExistData = true;
  }
}

// An empty array is truthy, so emptiness has to be tested through `length`
if (phraseSampleFiles.length === 0) {
  console.log(
    'Usage: npm run generate:sqlite:phrase:hmm:trans_kewen -- [-a] -f /file1 -f file2 ...'
  );
  console.log();

  process.exit(1);
}

// SQLite dictionary database
const wordDictSQLiteFile = fromRootPath('data', 'pinyin-word-dict.sqlite');
// Directory of the HMM parameters
const transParamsDir = fromRootPath('data', 'hmm_params/kewen');

console.log();
console.log(`创建计算参数${appendExistData ? '(累积更新)' : ''} ...`);
const wordDictDB = await openDB(wordDictSQLiteFile, true);

let words;
try {
  words = await readWordsFromDB(wordDictDB);
} finally {
  // Errors propagate unchanged; the database is closed either way
  await closeDB(wordDictDB);
}

let transParams = appendExistData
  ? {
      word_prob: readJSONFromFile(transParamsDir + `/word_prob.json`),
      trans_prob: readJSONFromFile(transParamsDir + `/trans_prob.json`)
    }
  : null;

await asyncForEach(getAllFiles(phraseSampleFiles), async (file) => {
  console.log(` - 分析文件: ${file} ...`);

  // Every non-blank line is parsed as one JSON value and handed to
  // `trans.countParams`
  await readLineFromFile(file, (line) => {
    if (!line || !line.trim()) {
      return;
    }

    const json = JSON.parse(line);
    transParams = trans.countParams(json, words, transParams);
  });
});

// `transParams` is still `null` when nothing was processed and `-a` was not
// given; there is nothing to write in that case
Object.keys(transParams ?? {}).forEach((name) => {
  appendLineToFile(
    transParamsDir + `/${name}.json`,
    JSON.stringify(transParams[name]),
    true
  );
});

console.log();
console.log('Done');
console.log();
/**
 * Counts the HMM parameters for a batch of articles.
 *
 * @param articles articles as accepted by `readClausesFromArticles`
 * @param words dictionary object, passed on to `readClausesFromArticles`
 * @param existParams result of an earlier call to accumulate on, if any
 * @return `{ word_prob, trans_prob, symbols }`
 */
export function countParams(articles, words, existParams) {
  const previous = existParams || { word_prob: {}, trans_prob: {} };

  // Symbols that were met and how often; filled while the clauses are read
  const symbols = previous.symbols || {};
  const clauses = readClausesFromArticles(articles, words, symbols);

  // Number of occurrences of each character
  const word_prob = countWords(clauses, previous.word_prob);
  // The count of `EOS` preceded by `BOS`... see `countTrans`: transitions
  // are counted per clause with `BOS`/`EOS` as boundary markers
  const trans_prob = countTrans(clauses, previous.trans_prob);

  return { word_prob, trans_prob, symbols };
}
[{title: [...], subtitle: [...], pargraphs: [[...], ...]}, {...}, ...]
/**
 * Collects the clauses of all articles that are not filtered out by title.
 *
 * Title, subtitle and every paragraph of an accepted article are read with
 * `readClausesFromPargraph`.
 *
 * @param articles
 *   `[{title: [...], subtitle: [...], pargraphs: [[...], ...]}, {...}, ...]`
 * @param words dictionary object, passed on to `readClausesFromPargraph`
 * @param symbols symbol counter, passed on to `readClausesFromPargraph`
 */
function readClausesFromArticles(articles, words, symbols) {
  // An article is skipped when its title contains one of these ...
  const ignoredInTitle = [
    '生字表',
    '写字表',
    '识字表',
    '练习版',
    '唐诗',
    '诗词',
    '诗歌',
    '词四首',
    '课文版',
    '世说新语',
    '短诗',
    '庄子',
    '老子',
    '离骚',
    '一年级',
    '二年级'
  ];
  // ... or its subtitle contains one of these
  const ignoredInSubtitle = ['一年级', '二年级'];

  const textOf = (items) => items.map((w) => w.zi).join('');

  let clauses = [];
  articles.forEach(({ title, subtitle, pargraphs }) => {
    const titleText = textOf(title);
    const subtitleText = textOf(subtitle);

    const ignored =
      ignoredInTitle.some((keyword) => titleText.includes(keyword)) ||
      ignoredInSubtitle.some((keyword) => subtitleText.includes(keyword));
    if (ignored) {
      console.log(` - 忽略文章: ${titleText}`);
      return;
    }
    console.log(` - 分析文章: ${titleText}`);

    for (const part of [title, subtitle].concat(pargraphs)) {
      clauses = clauses.concat(readClausesFromPargraph(part, words, symbols));
    }
  });

  return clauses;
}
[{zi: '字', py: 'zì'}, {zi: ','}, {...}, ...]
/**
 * Splits one paragraph into clauses of `字:pinyin` codes.
 *
 * Items for which `getCorrectPinyin` yields a reading are characters: they
 * join the current clause when the reading is listed for them in `words`,
 * and are reported on stderr otherwise. Items without a reading are counted
 * in `symbols`, and those accepted by `isClauseEnd` close the current clause.
 *
 * @param pargraph `[{zi: '字', py: 'zì'}, {zi: ','}, {...}, ...]`
 * @param words dictionary in the form `{'字': ['zì'], ...}`
 * @param symbols counter object, updated in place
 */
function readClausesFromPargraph(pargraph, words, symbols) {
  const clauses = [];

  let pending = [];
  // Stores the pending clause, unless it is empty, and starts a new one
  const flush = () => {
    if (pending.length > 0) {
      clauses.push(pending);
    }
    pending = [];
  };

  for (let i = 0; i < pargraph.length; i++) {
    const curr = pargraph[i];
    const prev = pargraph[i - 1] || {};
    const zi = getCorrectWord(curr);
    const py = getCorrectPinyin(curr, prev);

    if (!py) {
      symbols[zi] = (symbols[zi] || 0) + 1;

      // End of a clause
      if (isClauseEnd(zi)) {
        flush();
      }
      continue;
    }

    if (/\w+/.test(zi)) {
      console.error(` - 非汉字:${curr.zi}:${curr.py}`);
      continue;
    }

    const spells = words[zi] || [];
    if (spells.includes(py)) {
      pending.push(`${zi}:${py}`);
    } else {
      console.error(` - 不存在拼音字: ${curr.zi}:${curr.py}`);
    }
  }
  flush();

  return clauses;
}

/** Tells whether the symbol closes a clause */
function isClauseEnd(zi) {
  const terminators = new Set([',', '。', ';', ':', '?', '!', '∶', '…']);

  return terminators.has(zi);
}

/**
 * Replaces a character that the source text spells differently from the
 * dictionary, based on the reading it is annotated with.
 *
 * @return the replacement character, or `zi` itself when no rule applies
 */
function getCorrectWord({ zi, py }) {
  const corrections = new Map([
    ['轮', { py: 'lūn', zi: '抡' }],
    // https://www.cngwzj.com/pygushi/SongDai/72474/
    ['纤', { py: 'lián', zi: '廉' }],
    // https://www.cngwzj.com/pygushi/SongDai/61484/
    ['沉', { py: 'shěn', zi: '沈' }],
    // https://www.cngwzj.com/pygushi/SongDai/57152/
    // https://baike.baidu.com/item/%E5%BA%86%E5%AE%AB%E6%98%A5%C2%B7%E5%8F%8C%E6%A1%A8%E8%8E%BC%E6%B3%A2/9918314
    ['挡', { py: 'dāng', zi: '珰' }]
  ]);

  const rule = corrections.get(zi);
  if (rule && py == rule.py) {
    return rule.zi;
  }
  return zi;
}
/** Reduplicated kinship words: the second character is read without tone */
const REDUPLICATED_WORDS = new Set([
  '爸',
  '妈',
  '哥',
  '弟',
  '姐',
  '妹',
  '爷',
  '奶',
  '婶',
  '叔'
]);

/** Characters after which 儿 is read as `ér` */
const ER_PREVIOUS_WORDS = new Set(['墩', '褂', '势', '猴', '点', '劲']);

/**
 * Fixed readings. A bare character key applies whatever reading the text is
 * annotated with; a `字:pinyin` key applies to that annotated reading only.
 */
const PINYIN_REPLACEMENTS = new Map(
  Object.entries({
    其: 'qí',
    实: 'shí',
    他: 'tā',
    朴: 'pǔ',
    笼: 'lóng',
    牛: 'niú',
    妞: 'niū',
    剔: 'tī',
    菇: 'gū',
    活: 'huó',
    笛: 'dí',
    杵: 'chǔ',
    釭: 'gāng',
    墩: 'dūn',
    褂: 'guà',
    势: 'shì',
    猴: 'hóu',
    点: 'diǎn',
    //
    '景:ijǐng': 'jǐng',
    '温:yùn': 'wēn',
    '篷:peng': 'péng',
    '蓬:peng': 'péng',
    '晨:chen': 'chén',
    '袋:dai': 'dài',
    '来:lai': 'lái',
    '枉:wang': 'wǎng',
    '蟆:ma': 'má',
    '铛:dang': 'dāng',
    '闷:men': 'mèn',
    '粱:liang': 'liáng',
    '里:li': 'lǐ',
    '角:gǔ': 'jiǎo',
    '那:nàr': 'nà',
    '时:shi': 'shí',
    '焚:fèn': 'fén',
    '亮:liang': 'liàng',
    '道:dao': 'dào',
    '家:gū': 'jiā',
    '司:si': 'sī',
    '上:shang': 'shàng',
    '是:shi': 'shì',
    '不:bu': 'bù',
    '芦:lu': 'lú',
    '莫:mo': 'mò',
    '夫:fu': 'fū',
    '么:mò': 'me',
    '少:shāo': 'shǎo',
    '搁:ge': 'gē',
    '地:di': 'dì',
    '呵:ā': 'a',
    '劲:jìnr': 'jìn',
    '碌:lū': 'lù',
    '碌:lu': 'lù'
  })
);

/**
 * Corrects the reading a character is annotated with.
 *
 * @param curr the current item `{zi, py}`
 * @param prev the preceding item (`{}` for the first item of a paragraph)
 * @return the corrected reading, or `py` unchanged when no rule applies
 */
function getCorrectPinyin({ zi, py }, prev) {
  // 看看: the repeated character is read `kàn`
  if (zi === '看') {
    return prev.zi === zi ? 'kàn' : py;
  }
  if (REDUPLICATED_WORDS.has(zi)) {
    // presumably `extractPinyinChars` strips the tone mark - confirm in
    // `#utils/utils.mjs`
    return prev.zi === zi ? extractPinyinChars(py) : py;
  }
  if (zi === '儿') {
    return ER_PREVIOUS_WORDS.has(prev.zi) ? 'ér' : py;
  }

  // Bare keys precede the `字:pinyin` keys in the table, hence are looked up
  // first; no character is listed in both forms
  return (
    PINYIN_REPLACEMENTS.get(zi) ??
    PINYIN_REPLACEMENTS.get(`${zi}:${py}`) ??
    py
  );
}
{'字': ['zì'], ...}
/**
 * Reads the characters and their readings from the dictionary database.
 *
 * @param wordDictDB database handle exposing an async `all(sql)` method
 * @returns an object in the form `{'字': ['zì'], ...}`
 */
export async function readWordsFromDB(wordDictDB) {
  const rows = await wordDictDB.all(`select word_, spell_ from pinyin_word`);

  const words = {};
  for (const { word_, spell_ } of rows) {
    // One row per (character, reading) pair
    (words[word_] ||= []).push(spell_);
  }

  return words;
}
[['字:zì', ...], [...], ...]
/**
 * Counts the transitions between characters (states) within every clause.
 *
 * Each clause is framed by the markers `BOS` and `EOS`. The result maps a
 * state to the counts of the states preceding it, plus `__total__`, the sum
 * of these counts.
 *
 * @param clauses in the form `[['字:zì', ...], [...], ...]`
 * @param existTransProb counts of an earlier call to accumulate on, if any;
 *   updated in place and returned
 */
export function countTrans(clauses, existTransProb) {
  const transProb = existTransProb || {};

  for (const clause of clauses) {
    const states = ['BOS', ...clause, 'EOS'];

    for (let i = 1; i < states.length; i++) {
      const prev = states[i - 1];
      const curr = states[i];

      const prob = (transProb[curr] ||= {});
      prob[prev] = (prob[prev] || 0) + 1;
      // Per the original note, the transition probability is later derived
      // as log(count of the preceding state / total)
      prob.__total__ = (prob.__total__ || 0) + 1;
    }
  }

  return transProb;
}
[['字:zì', ...], [...], ...]
/**
 * Counts how often every character occurs in the clauses.
 *
 * @param clauses in the form `[['字:zì', ...], [...], ...]`
 * @param existWordProp counts of an earlier call to accumulate on, if any;
 *   updated in place and returned
 */
export function countWords(clauses, existWordProp) {
  const wordProb = existWordProp || {};

  for (const word of clauses.flat()) {
    wordProb[word] = (wordProb[word] || 0) + 1;
  }

  return wordProb;
}
/**
 * Reports, per metadata table, the values that were added, removed or whose
 * id changed between the old and the new database.
 */
async function diffMetaData(oldDb, newDb) {
  const tables = [
    'meta_pinyin',
    'meta_pinyin_chars',
    'meta_zhuyin',
    'meta_zhuyin_chars',
    'meta_word'
  ];

  // Loads a table as `{ [value_]: { id_ } }`
  const loadByValue = async (db, table) => {
    const data = {};
    for (const row of await db.all(`select * from ${table}`)) {
      data[row.value_] = { id_: row.id_ };
    }
    return data;
  };

  await asyncForEach(tables, async (table) => {
    const oldData = await loadByValue(oldDb, table);
    const newData = await loadByValue(newDb, table);

    for (const value of Object.keys(newData)) {
      const oldEntry = oldData[value];
      if (!oldEntry) {
        console.log(`- ${table} => 元数据 ${value} 为新增`);
        continue;
      }

      const oldId = oldEntry.id_;
      const newId = newData[value].id_;
      if (oldId != newId) {
        console.log(
          `- ${table} => 元数据 ${value} 的 id 不同: ${oldId} -> ${newId}`
        );
      }
    }

    for (const value of Object.keys(oldData)) {
      if (!newData[value]) {
        console.log(`- ${table} => 元数据 ${value} 已被删除`);
      }
    }
  });
}

/**
 * Reports the rows of the character tables that were added, removed or whose
 * id combination changed between the old and the new database. Rows are
 * matched by `id_`.
 */
async function diffWordData(oldDb, newDb) {
  // Loads a table as `{ [id_]: row }`
  const loadById = async (db, table) => {
    const data = {};
    for (const row of await db.all(`select * from ${table}`)) {
      data[row.id_] = row;
    }
    return data;
  };

  // Identity of a row: character id, reading id and reading-letters id
  const genCode = (row) =>
    `${row.word_id_ || row.source_id_}:${row.spell_id_ || row.target_id_}:${
      row.spell_chars_id_ || row.target_chars_id_
    }`;

  await asyncForEach(['pinyin_word' /*, 'zhuyin_word'*/], async (table) => {
    const oldData = await loadById(oldDb, table);
    const newData = await loadById(newDb, table);

    for (const id of Object.keys(newData)) {
      const oldRow = oldData[id];
      const newRow = newData[id];

      if (!oldRow) {
        console.log(
          `- ${table} => 字数据 ${id}:${newRow.word_}:${newRow.spell_} 为新增`
        );
        continue;
      }

      const oldCode = genCode(oldRow);
      const newCode = genCode(newRow);
      if (oldCode != newCode) {
        console.log(
          `- ${table} => 字数据 ${id}:${newRow.word_}:${newRow.spell_} 的组合不同: ${oldCode} -> ${newCode}`
        );
      }
    }

    for (const id of Object.keys(oldData)) {
      if (!newData[id]) {
        const oldRow = oldData[id];
        console.log(
          `- ${table} => 字数据 ${id}:${oldRow.word_}:${oldRow.spell_} 已被删除`
        );
      }
    }
  });
}
-------------------------------------------------------------------------------- /tools/pinyin-dict/src/generate/sqlite/word/index.mjs: -------------------------------------------------------------------------------- 1 | /* SQLite 字典库 */ 2 | import { fromRootPath, readLineFromFile } from '#utils/utils.mjs'; 3 | import { patch } from './patch.mjs'; 4 | import * as sqlite from './sqlite.mjs'; 5 | 6 | // 收集数据 7 | const wordDataValidFile = fromRootPath('data', 'pinyin-dict.valid.txt'); 8 | const emojiDataFile = fromRootPath('data', 'emojis.json'); 9 | // 分析数据 10 | const pinyinCharsFile = fromRootPath('..', 'analyze/files/pinyin.txt'); 11 | const pinyinCharLinksFile = fromRootPath('..', 'analyze/files/char-links.json'); 12 | const pinyinCharTreeFile = fromRootPath('..', 'analyze/files/char-tree.json'); 13 | 14 | // SQLite 字典库 15 | const wordDictDataSQLiteFile = fromRootPath('data', 'pinyin-word-dict.sqlite'); 16 | 17 | console.log(); 18 | console.log('读取已收集的有效字信息 ...'); 19 | const wordMetas = []; 20 | await readLineFromFile(wordDataValidFile, (line) => { 21 | if (!line || !line.trim()) { 22 | return; 23 | } 24 | 25 | const metas = JSON.parse(line); 26 | metas.forEach((meta) => { 27 | wordMetas.push(meta); 28 | 29 | patch(meta); 30 | }); 31 | }); 32 | console.log('- 有效字信息总数:' + wordMetas.length); 33 | console.log(); 34 | 35 | console.log(); 36 | console.log('写入字信息到 SQLite ...'); 37 | let db1 = await sqlite.open(wordDictDataSQLiteFile); 38 | 39 | try { 40 | await sqlite.saveSpells(db1, wordMetas); 41 | console.log('- 已保存字读音信息'); 42 | 43 | await sqlite.saveWords(db1, wordMetas); 44 | console.log('- 已保存字信息'); 45 | 46 | await sqlite.savePhrases(db1, wordMetas); 47 | console.log('- 已保存词组信息'); 48 | } catch (e) { 49 | throw e; 50 | } finally { 51 | await sqlite.close(db1); 52 | } 53 | 54 | console.log(); 55 | 56 | console.log(); 57 | console.log('读取已收集的表情符号 ...'); 58 | const groupEmojiMetas = {}; 59 | await readLineFromFile(emojiDataFile, (line) => { 60 | if (!line || 
!line.trim()) { 61 | return; 62 | } 63 | 64 | const groups = JSON.parse(line); 65 | groups.forEach((group) => { 66 | let groupName = group.name.zh; 67 | switch (groupName) { 68 | case '表情与情感': 69 | groupName = '表情'; 70 | break; 71 | case '人物与身体': 72 | groupName = '人物'; 73 | break; 74 | case '动物与自然': 75 | groupName = '动植物'; 76 | break; 77 | case '食物与饮料': 78 | groupName = '饮食'; 79 | break; 80 | case '旅行与地理': 81 | groupName = '旅行'; 82 | break; 83 | case '符号标志': 84 | groupName = '符号'; 85 | break; 86 | } 87 | 88 | groupEmojiMetas[groupName] = group.emojis; 89 | }); 90 | }); 91 | console.log( 92 | '- 表情符号总数:' + 93 | Object.values(groupEmojiMetas).reduce( 94 | (acc, emojis) => acc + emojis.length, 95 | 0 96 | ) 97 | ); 98 | console.log(); 99 | 100 | console.log(); 101 | console.log('写入表情符号到 SQLite ...'); 102 | let db2 = await sqlite.open(wordDictDataSQLiteFile); 103 | try { 104 | await sqlite.saveEmojis(db2, groupEmojiMetas); 105 | console.log('- 已保存表情符号数据'); 106 | } catch (e) { 107 | throw e; 108 | } finally { 109 | await sqlite.close(db2); 110 | } 111 | console.log(); 112 | 113 | console.log(); 114 | console.log('通过 SQLite 生成分析数据 ...'); 115 | let db3 = await sqlite.open(wordDictDataSQLiteFile); 116 | try { 117 | await sqlite.generatePinyinChars(db3, pinyinCharsFile); 118 | console.log('- 已保存拼音字母组合数据'); 119 | 120 | await sqlite.generatePinyinCharLinks(db3, pinyinCharLinksFile); 121 | console.log('- 已保存拼音字母关联数据'); 122 | 123 | await sqlite.generatePinyinCharTree(db3, pinyinCharTreeFile); 124 | console.log('- 已保存拼音字母后继数据'); 125 | } catch (e) { 126 | throw e; 127 | } finally { 128 | await sqlite.close(db3); 129 | } 130 | console.log(); 131 | -------------------------------------------------------------------------------- /tools/pinyin-dict/src/generate/sqlite/word/patch.mjs: -------------------------------------------------------------------------------- 1 | import { extractPinyinChars } from '#utils/utils.mjs'; 2 | 3 | /** 修正输入数据 */ 4 | export function patch(meta) { 5 | 
const deleted = [ 6 | '虾:hā' // -> 虾:há 7 | ]; 8 | 9 | const added = [ 10 | // “一”和“不”变调有规律:https://www.chinanews.com.cn/hwjy/news/2010/04-15/2228742.shtml 11 | '不:bú', 12 | '一:yì', 13 | '一:yí', 14 | '子:zi', 15 | // 便宜:pián yi 16 | '宜:yi', 17 | '噷:hm', 18 | '吒:zhà', 19 | '虎:hu', 20 | '枸:gōu', 21 | '焘:tāo', 22 | '喇:lā', 23 | '喇:lá', 24 | '蕃:bō', 25 | '蕃:fān', 26 | '脯:pú', 27 | '蕻:hóng', 28 | '朵:duo', 29 | '鏜:táng', 30 | '咔:kā', 31 | '蹬:dèng', 32 | '爸:ba', 33 | '叔:shu', 34 | '喝:he', 35 | // 《定风波·自春来》 - 无那。恨薄情一去,音书无个 36 | // https://www.cngwzj.com/pygushi/SongDai/48900/ 37 | '那:nuó', 38 | // 《桂枝香·金陵怀古》 - 谩嗟荣辱 39 | // https://www.cngwzj.com/pygushi/SongDai/49417/ 40 | '谩:màn', 41 | // 《贺新郎·春情》 - 殢酒厌厌病 42 | // https://www.cngwzj.com/pygushi/SongDai/61645/ 43 | '厌:yǎn', 44 | // 《贺新郎·春情》 - 断鸿难倩 45 | // https://www.cngwzj.com/pygushi/SongDai/61645/ 46 | '倩:qìng', 47 | // 《八声甘州·记玉关踏雪事清游》 - 长河饮马 48 | // https://www.cngwzj.com/pygushi/SongDai/61043/ 49 | '饮:yìn', 50 | // 王维《青溪》 - 趣途无百里 51 | // https://www.cngwzj.com/pygushi/TangDai/10982/ 52 | '趣:qū', 53 | // 李白《关山月》 - 戍客望边色 54 | // https://www.cngwzj.com/pygushi/TangDai/12860/ 55 | '色:yì', 56 | // 《听董大弹胡笳声兼寄语弄房给事》 - 四郊秋叶惊摵摵 57 | // https://www.cngwzj.com/pygushi/TangDai/11474/ 58 | '摵:shè', 59 | // 白居易《琵琶行》 - 自言本是京城女,家在虾蟆陵下住 60 | // https://www.cngwzj.com/pygushi/TangDai/25273/ 61 | '虾:há', 62 | // 李白《将进酒》 63 | // https://www.cngwzj.com/pygushi/TangDai/12843/ 64 | '将:qiāng', 65 | // 《行经华阴》- 借问路傍名利客 66 | // https://www.cngwzj.com/pygushi/TangDai/11353/ 67 | '傍:páng', 68 | // 王维《鹿柴》 69 | // https://www.cngwzj.com/pygushi/TangDai/11206/ 70 | '柴:zhài', 71 | // 礼记《虽有嘉肴》- 学学半 72 | // https://www.cngwzj.com/pygushi/LiangHan/76970/ 73 | '学:xiào', 74 | // 屈原《离骚》- 肇锡余以嘉名 75 | // https://www.cngwzj.com/pygushi/XianQin/87343/ 76 | '锡:cì', 77 | // - 来吾道夫先路 78 | '道:dǎo', 79 | // 论语《不义而富且贵,于我如浮云》- 久要不忘平生之言 80 | // https://www.cngwzj.com/pygushi/XianQin/88550/ 81 | '要:yuē', 82 | // 论语《己所不欲,勿施于人》- 举皋陶 83 | // 
import { hasGlyphFontForCodePoint } from '#utils/utils.mjs';
import { fetchWordMetas } from '#utils/zdic.mjs';

// Ad-hoc probe: for each code point, print the character and whether
// hasGlyphFontForCodePoint() reports a glyph for it.
const unicodes = [
  'U+20C43' /* 𠱃 */,
  'U+20C53' /* 𠱓 */,
  'U+20C65' /* 𠱥 */,
  'U+20C8D' /* 𠲍 */,
  'U+20C96' /* 𠲖 */,
  'U+20C9C' /* 𠲜 */,
  'U+20CB5' /* 𠲵 */,
  'U+20CD0' /* 𠳐 */,
  'U+20CED' /* 𠳭 */
];

for (const unicode of unicodes) {
  const codePoint = Number.parseInt(unicode.replace(/^U\+/, ''), 16);
  // Every value above is larger than 0xFFFF. String.fromCharCode() keeps only
  // the low 16 bits and would print an unrelated character, so the full code
  // point has to go through String.fromCodePoint().
  const char = String.fromCodePoint(codePoint);
  const exist = hasGlyphFontForCodePoint(unicode);

  console.log(`${unicode} - ${char}: ${exist}`);
}

// const words = ['㑵', '𥁞', '尽', '国', '𣴘'];
// const wordMetas = await fetchWordMetas(words);
// console.log(JSON.stringify(wordMetas));
61 | table, 62 | dataMap, 63 | disableSorting, 64 | primaryKeys 65 | ) { 66 | const dataArray = mapToArray(dataMap, disableSorting); 67 | if (dataArray.length === 0) { 68 | return; 69 | } 70 | 71 | primaryKeys = primaryKeys || ['id_']; 72 | const hasOnlyIdKey = primaryKeys.length == 1 && primaryKeys[0] == 'id_'; 73 | 74 | const columnsWithPrimaryKey = Object.keys(dataArray[0]).filter( 75 | (k) => !k.startsWith('__') 76 | ); 77 | const columns = columnsWithPrimaryKey.filter((k) => !primaryKeys.includes(k)); 78 | 79 | const insertWithIdSql = `insert into ${table} (${columnsWithPrimaryKey.join( 80 | ', ' 81 | )}) values (${columnsWithPrimaryKey.map(() => '?').join(', ')}) 82 | `; 83 | const insertWithIdStatement = await db.prepare(insertWithIdSql); 84 | const insertStatement = hasOnlyIdKey 85 | ? await db.prepare( 86 | `insert into ${table} (${columns.join(', ')}) values (${columns 87 | .map(() => '?') 88 | .join(', ')}) 89 | ` 90 | ) 91 | : await db.prepare(insertWithIdSql); 92 | const updateStatement = 93 | columns.length > 0 94 | ? await db.prepare( 95 | `update ${table} set ${columns 96 | .map((c) => c + ' = ?') 97 | .join(', ')} where ${primaryKeys 98 | .map((key) => key + ' = ?') 99 | .join(' and ')} 100 | ` 101 | ) 102 | : // 所有的列都为主键,则不需要更新 103 | null; 104 | 105 | const getId = (d) => primaryKeys.map((k) => d[k]).join(''); 106 | await asyncForEach(dataArray, async (data) => { 107 | if (getId(data)) { 108 | const needToUpdate = 109 | data.__exist__ && 110 | columns.reduce((r, c) => r || data[c] !== data.__exist__[c], false); 111 | 112 | if (needToUpdate) { 113 | await updateStatement.run( 114 | ...columns.concat(primaryKeys).map((c) => data[c]) 115 | ); 116 | } 117 | // 新增包含 id 的数据 118 | else if (!data.__exist__) { 119 | await insertWithIdStatement.run( 120 | ...columnsWithPrimaryKey.map((c) => data[c]) 121 | ); 122 | } 123 | } else { 124 | const params = (hasOnlyIdKey ? 
/**
 * Delete rows from a table by primary key.
 *
 * @param db - database handle providing `prepare(sql)`
 * @param {string} table - table name (inserted into the SQL text as is)
 * @param {Array} data - one entry per row to delete: either an object holding
 *   the primary key columns, or a bare value used as the single key
 * @param {string[]} [primaryKeys] - primary key column names, `['id_']` when omitted
 */
export async function removeFromDB(db, table, data, primaryKeys) {
  if (data.length === 0) {
    return;
  }

  const keys = primaryKeys || ['id_'];
  const condition = keys.map((key) => `${key} = ?`).join(' and ');
  const deleteStatement = await db.prepare(
    `delete from ${table} where ${condition}`
  );

  try {
    for (const d of data) {
      // `typeof null` is also 'object', so null is treated as a bare key value
      const params =
        typeof d === 'object' && d !== null ? keys.map((c) => d[c]) : [d];

      await deleteStatement.run(...params);
    }
  } finally {
    // Release the statement even when one of the deletions fails
    await deleteStatement.finalize();
  }
}

/**
 * Check whether a table exists.
 *
 * @param db - database handle providing `get(sql, ...params)`
 * @param {string} table - table name to look up
 * @returns {Promise<boolean>} true when exactly one table of that name exists
 */
export async function hasTable(db, table) {
  // The name is bound as a parameter so quotes inside it cannot alter the query
  const result = await db.get(
    "select count(*) as total from sqlite_master where type = 'table' and name = ?",
    table
  );

  return result.total === 1;
}

/**
 * Run several SQL statements one after another.
 *
 * The text is split on every `;`, so a statement must not contain a semicolon
 * inside a string literal. Blank fragments (e.g. after a trailing `;`) are skipped.
 *
 * @param db - database handle providing `exec(sql)`
 * @param {string} sqls - statements separated by `;`
 */
export async function execSQL(db, sqls) {
  for (const sql of sqls.split(';')) {
    if (sql.trim() === '') {
      continue;
    }

    await db.exec(sql);
  }
}
specials = charSpecials[ch]; 194 | if (specials) { 195 | for (let k = 0; k < specials.length; k++) { 196 | const special = specials[k]; 197 | 198 | charWeights[special] = weight + (k + 1); 199 | } 200 | } 201 | } 202 | const getCharCode = (ch) => { 203 | let sum = 0; 204 | for (let i = 0; i < ch.length; i++) { 205 | sum += ch.charCodeAt(i); 206 | } 207 | return sum; 208 | }; 209 | 210 | // Note: 主要排序带音调的拼音(注音规则暂时不清楚,故不处理),其余的按字符顺序排序 211 | const keys = Object.keys(obj).sort((a, b) => { 212 | const a_without_special = extractPinyinChars(a).replaceAll(/[ˊˇˋˉ]$/g, ''); 213 | const b_without_special = extractPinyinChars(b).replaceAll(/[ˊˇˋˉ]$/g, ''); 214 | 215 | if (a_without_special === b_without_special) { 216 | const a_weight = splitChars(a) 217 | .map((ch) => charWeights[ch] || getCharCode(ch)) 218 | .reduce((acc, w) => acc + w, 0); 219 | const b_weight = splitChars(b) 220 | .map((ch) => charWeights[ch] || getCharCode(ch)) 221 | .reduce((acc, w) => acc + w, 0); 222 | 223 | return a_weight - b_weight; 224 | } 225 | 226 | return a_without_special > b_without_special 227 | ? 1 228 | : a_without_special < b_without_special 229 | ? 
/**
 * Call an async callback for every element, strictly one after another:
 * the next call only starts once the previous one has settled.
 */
export async function asyncForEach(array, cb) {
  for (const item of array) {
    await cb(item);
  }
}

/** Build a path from the given segments, starting two directories above this module. */
export function fromRootPath(...paths) {
  const segments = [__dirname, '..', '..', ...paths];

  return path.join(...segments);
}

/** Return a promise that resolves (with no value) after `ms` milliseconds. */
export function sleep(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}

/** Return the SHA-256 digest of a file's content as a hex string. */
export function fileSHA256(filepath) {
  // https://gist.github.com/GuillermoPena/9233069#gistcomment-3149231-permalink
  const content = fs.readFileSync(filepath);

  return crypto.createHash('sha256').update(content).digest('hex');
}

/** Tell whether something exists at the given path. */
export function existFile(filepath) {
  const found = fs.existsSync(filepath);

  return found;
}
/**
 * Append one line (terminated by `\n`) to a file.
 *
 * Missing parent directories are created, at any depth, and the file itself is
 * created when it does not exist yet.
 *
 * @param {string} filepath - file to append to
 * @param {string} line - text of the line, without the trailing newline
 * @param {boolean} [doEmpty] - when truthy, the file is emptied before the line is written
 */
export function appendLineToFile(filepath, line, doEmpty) {
  // `recursive` creates every missing ancestor and does nothing when the
  // directory is already there
  fs.mkdirSync(path.dirname(filepath), { recursive: true });

  if (doEmpty) {
    fs.writeFileSync(filepath, '');
  }

  // Appending by path creates the file on demand and closes it again,
  // so no descriptor has to be managed here
  fs.appendFileSync(filepath, `${line}\n`, 'utf8');
}
graphemeSplitter.splitGraphemes(str); 207 | } 208 | 209 | /** @return ['nǐ', 'hǎo', 'ma'] */ 210 | export function getPinyin(str) { 211 | // https://pinyin-pro.cn/use/pinyin.html 212 | return parsePinyin(str, { 213 | // 输出为数组 214 | type: 'array', 215 | // 作为音调符号带在拼音字母上 216 | toneType: 'symbol', 217 | // 识别字符串开头的姓氏 218 | surname: 'head', 219 | // 是否对一和不应用智能变调 220 | // 不(bù)在去声字前面读阳平声,如“~会”“~是”,这属于变调读音 221 | // http://www.moe.gov.cn/jyb_hygq/hygq_zczx/moe_1346/moe_1364/tnull_42118.html 222 | // “一”和“不”变调有规律:https://www.chinanews.com.cn/hwjy/news/2010/04-15/2228742.shtml 223 | toneSandhi: true 224 | }); 225 | } 226 | 227 | /** 修正拼音 */ 228 | export function correctPinyin(str) { 229 | return str 230 | .replaceAll('ā', 'ā') 231 | .replaceAll('ă', 'ǎ') 232 | .replaceAll('à', 'à') 233 | .replaceAll('ɑ', 'a') 234 | .replaceAll('ō', 'ō') 235 | .replaceAll('ŏ', 'ǒ') 236 | .replaceAll('ī', 'ī') 237 | .replaceAll('ĭ', 'ǐ') 238 | .replaceAll('ŭ', 'ǔ') 239 | .replaceAll('ɡ', 'g') 240 | .replaceAll('ē', 'ē') 241 | .replaceAll(/[·]/g, ''); 242 | } 243 | 244 | /** 修正注音 */ 245 | export function correctZhuyin(str) { 246 | return str.replaceAll('π', 'ㄫ').replaceAll('˙', ''); 247 | } 248 | 249 | /** 拼音去掉声调后的字母组合 */ 250 | export function extractPinyinChars(pinyin) { 251 | if ('m̀' === pinyin || 'ḿ' === pinyin || 'm̄' === pinyin) { 252 | return 'm'; 253 | } else if ( 254 | 'ê̄' === pinyin || 255 | 'ế' === pinyin || 256 | 'ê̌' === pinyin || 257 | 'ề' === pinyin 258 | ) { 259 | return 'e'; 260 | } 261 | 262 | const chars = []; 263 | 264 | const splits = splitChars(pinyin); 265 | for (let i = 0; i < splits.length; i++) { 266 | const ch = splits[i]; 267 | switch (ch) { 268 | case 'ā': 269 | case 'á': 270 | case 'ǎ': 271 | case 'à': 272 | chars.push('a'); 273 | break; 274 | case 'ō': 275 | case 'ó': 276 | case 'ǒ': 277 | case 'ò': 278 | chars.push('o'); 279 | break; 280 | case 'ē': 281 | case 'é': 282 | case 'ě': 283 | case 'è': 284 | case 'ê': 285 | chars.push('e'); 286 | break; 287 | case 
// Tone-marked pinyin letters and their tone number (0 = no tone).
// Entries are brought to NFC and ordered longest first: the letters written
// with a combining mark (ê̄, ê̌, m̄, m̀) contain a shorter entry as a prefix
// (ê̄ starts with ê), so they have to be tried before that shorter entry.
// The sort is stable, entries of equal length keep the order written here.
const PINYIN_TONE_ENTRIES = [
  ['ā', 1],
  ['á', 2],
  ['ǎ', 3],
  ['à', 4],
  //
  ['ō', 1],
  ['ó', 2],
  ['ǒ', 3],
  ['ò', 4],
  //
  ['ē', 1],
  ['é', 2],
  ['ě', 3],
  ['è', 4],
  ['ê', 0],
  ['\u00ea\u0304', 1], // ê̄
  ['ế', 2],
  ['\u00ea\u030c', 3], // ê̌
  ['ề', 4],
  //
  ['ī', 1],
  ['í', 2],
  ['ǐ', 3],
  ['ì', 4],
  //
  ['ū', 1],
  ['ú', 2],
  ['ǔ', 3],
  ['ù', 4],
  //
  ['ǖ', 1],
  ['ǘ', 2],
  ['ǚ', 3],
  ['ǜ', 4],
  //
  ['ń', 2],
  ['ň', 3],
  ['ǹ', 4],
  //
  ['m\u0304', 1], // m̄
  ['ḿ', 2],
  ['m\u0300', 4] // m̀
]
  .map(([mark, tone]) => [mark.normalize('NFC'), tone])
  .sort(([a], [b]) => b.length - a.length);

/**
 * Get the tone number of a pinyin syllable.
 *
 * @param {string} pinyin - pinyin with tone marks, e.g. `hǎo`
 * @returns {number} 1 to 4 for a marked tone, 0 when no tone mark is found
 */
export function getPinyinTone(pinyin) {
  // NFC so that a letter typed as base letter + combining mark matches too
  const normalized = pinyin.normalize('NFC');

  for (const [mark, tone] of PINYIN_TONE_ENTRIES) {
    if (normalized.includes(mark)) {
      return tone;
    }
  }

  return 0;
}

/** Return the zhuyin with its tone marks removed. */
export function extractZhuyinChars(zhuyin) {
  return zhuyin.replaceAll(/[ˊˇˋˉ˙]/g, '');
}

/**
 * Similarity of two stroke sequences, based on the Levenshtein distance:
 * - [Sort an array by the "Levenshtein Distance" with best performance in Javascript](https://stackoverflow.com/a/11958496)
 * - [Levenshtein Distance](https://blog.csdn.net/asty9000/article/details/81384650)
 * - [Damerau–Levenshtein distance](https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance)
 *
 * Only insertions, deletions and substitutions are counted; transpositions
 * (the Damerau extension) are deliberately not.
 *
 * @param {string} s - first stroke sequence
 * @param {string} t - second stroke sequence
 * @returns {number} `1 - distance / max(length)`: 1 for identical sequences,
 *   0 for completely different ones, and 0 when either sequence is empty
 */
export function calculateStrokeSimilarity(s, t) {
  const n = s.length;
  const m = t.length;

  if (n === 0 || m === 0) {
    return 0;
  }

  // d[i][j]: distance between the first i characters of s and the first j of t
  const d = [];
  for (let i = 0; i <= n; i++) {
    d[i] = [i];
  }
  for (let j = 0; j <= m; j++) {
    d[0][j] = j;
  }

  for (let i = 1; i <= n; i++) {
    const s_i = s.charAt(i - 1);

    for (let j = 1; j <= m; j++) {
      const cost = s_i === t.charAt(j - 1) ? 0 : 1;

      d[i][j] = Math.min(
        d[i - 1][j] + 1, // deletion
        d[i][j - 1] + 1, // insertion
        d[i - 1][j - 1] + cost // substitution
      );
    }
  }

  return 1 - d[n][m] / Math.max(n, m);
}
got(srcUrl, gotOptions).text(); 23 | const $dom = new JSDOM(html); 24 | const $doc = (($dom || {}).window || {}).document; 25 | if (!$doc) { 26 | return { value: word }; 27 | } 28 | 29 | const wordMeta = { 30 | value: word, 31 | unicode: '', 32 | src_url: srcUrl, 33 | glyph_svg_url: '', 34 | glyph_gif_url: '', 35 | glyph_struct: '', 36 | glyph_font_exists: true, 37 | // 注音与拼音的区别和历史: https://sspai.com/post/75248 38 | pinyins: [], 39 | zhuyins: [], 40 | radical: '', 41 | stroke_order: '', 42 | total_stroke_count: 0, 43 | radical_stroke_count: 0, 44 | traditional: false, 45 | simple_words: [], 46 | variant_words: [], 47 | traditional_words: [], 48 | wubi_codes: [], 49 | cangjie_codes: [], 50 | zhengma_codes: [], 51 | sijiao_codes: [], 52 | phrases: [] 53 | }; 54 | 55 | // 字形图片和笔顺动画 56 | const $img = $doc.querySelector('.ziif .zipic img'); 57 | if ($img) { 58 | const src = $img.getAttribute('src'); 59 | const gif = $img.getAttribute('data-gif'); 60 | 61 | src && (wordMeta.glyph_svg_url = 'https:' + src); 62 | gif && (wordMeta.glyph_gif_url = 'https:' + gif); 63 | } 64 | 65 | // 拼音 66 | const $pinyin = $doc.querySelectorAll('.ziif .dsk .z_py .z_d'); 67 | $pinyin.forEach(($el) => { 68 | const value = naiveHTMLNodeInnerText($el).trim(); 69 | const $audio = $el.querySelector('a[data-src-mp3]'); 70 | const audio = ($audio && $audio.getAttribute('data-src-mp3')) || ''; 71 | 72 | wordMeta.pinyins.push({ 73 | value, 74 | audio_url: audio ? 'https:' + audio : '' 75 | }); 76 | }); 77 | 78 | // 注音,与拼音按顺序对应 79 | const $zhuyin = $doc.querySelectorAll('.ziif .dsk .z_zy .z_d'); 80 | $zhuyin.forEach(($el) => { 81 | const value = naiveHTMLNodeInnerText($el).trim(); 82 | const $audio = $el.querySelector('a[data-src-mp3]'); 83 | const audio = ($audio && $audio.getAttribute('data-src-mp3')) || ''; 84 | 85 | wordMeta.zhuyins.push({ 86 | value, 87 | audio_url: audio ? 
'https:' + audio : '' 88 | }); 89 | }); 90 | 91 | // 总笔画数 92 | const $totalStrokeCount = $doc.querySelector('.ziif .dsk .z_bs2 .z_ts3'); 93 | $totalStrokeCount && 94 | (wordMeta.total_stroke_count = parseInt( 95 | naiveHTMLNodeInnerText($totalStrokeCount.parentElement) 96 | .replaceAll(/^.+\s+/g, '') 97 | .trim() 98 | )); 99 | 100 | // 部首、部外笔画数 101 | const $radical = $doc.querySelectorAll('.ziif .dsk .z_bs2 .z_ts2'); 102 | $radical.forEach(($el) => { 103 | const text = naiveHTMLNodeInnerText($el.parentElement); 104 | const value = text.replaceAll(/^.+\s+/g, '').trim(); 105 | 106 | if (text.includes('部首')) { 107 | wordMeta.radical = value; 108 | } else if (text.includes('部外')) { 109 | wordMeta.radical_stroke_count = Math.max( 110 | 0, 111 | wordMeta.total_stroke_count - parseInt(value) 112 | ); 113 | } 114 | }); 115 | 116 | // 简繁字 117 | const $jianfan = $doc.querySelectorAll('.ziif .dsk .z_jfz > p > a'); 118 | $jianfan.forEach(($el) => { 119 | if ($el.querySelector('img')) { 120 | return; 121 | } 122 | 123 | const parentText = naiveHTMLNodeInnerText($el.parentElement); 124 | const value = naiveHTMLNodeInnerText($el).trim(); 125 | 126 | if (parentText.includes('繁体')) { 127 | wordMeta.traditional = false; 128 | wordMeta.traditional_words = value.split(/\s+/g); 129 | } else if (parentText.includes('简体')) { 130 | wordMeta.traditional = true; 131 | wordMeta.simple_words = value.split(/\s+/g); 132 | } 133 | }); 134 | 135 | // 异体字 136 | const $variant = $doc.querySelectorAll('.ziif .dsk .z_ytz2 > a'); 137 | $variant.forEach(($el) => { 138 | if ($el.querySelector('img')) { 139 | return; 140 | } 141 | 142 | const value = naiveHTMLNodeInnerText($el).trim(); 143 | wordMeta.variant_words.push(value); 144 | }); 145 | 146 | // 笔顺 147 | const $strokeOrder = $doc.querySelector('.ziif .dsk .z_bis2'); 148 | $strokeOrder && 149 | (wordMeta.stroke_order = naiveHTMLNodeInnerText($strokeOrder).trim()); 150 | 151 | // 编码信息 152 | const codeTitles = []; 153 | const $codeTitle = 
/**
 * Fetch the readings of a phrase from its zdic.net page.
 *
 * A phrase may consist of several parts separated by punctuation
 * (e.g. https://www.zdic.net/hans/不塞不流,不止不行); one entry is returned per
 * part that is longer than a single character.
 *
 * @param {string} phrase - the phrase as listed on the word page
 * @returns {Promise<Array>} one `{ value, src_url, pinyins, zhuyins }` per part,
 *   `value` being the part split into characters. When the page cannot be
 *   parsed the entries carry no `src_url` and empty readings; when the page
 *   lists no pinyin at all the result is an empty array.
 */
async function fetchPhraseMeta(phrase) {
  // Separators are spelled with escapes so that both the ASCII and the
  // full-width comma (U+FF0C) and semicolon (U+FF1B) are matched,
  // plus the ideographic comma (U+3001) between phrase parts
  const phraseSeparators = /[,\uFF0C\u3001;\uFF1B]/;
  const readingSeparators = /[,\uFF0C;\uFF1B]/;

  const srcUrl = baseUrl + phrase;
  // const html = await (await fetch(srcUrl)).text();
  const html = await got(srcUrl, gotOptions).text();
  const $dom = new JSDOM(html);
  const $doc = (($dom || {}).window || {}).document;

  const phraseCharsArray = phrase
    .split(phraseSeparators)
    .map((part) => splitChars(part))
    .filter((chars) => chars.length > 1);
  if (!$doc || phraseCharsArray.length === 0) {
    return phraseCharsArray.map((chars) => ({
      value: chars,
      pinyins: [],
      zhuyins: []
    }));
  }

  // Pinyin and zhuyin: each labelled row of the page holds one or more readings
  const pinyinsArray = [];
  const zhuyinsArray = [];
  $doc.querySelectorAll('.ciif p .z_ts2').forEach(($el) => {
    const label = naiveHTMLNodeInnerText($el);
    const target =
      label === '拼音' ? pinyinsArray : label === '注音' ? zhuyinsArray : null;
    if (!target) {
      return;
    }

    $el.parentElement.querySelectorAll('.dicpy').forEach(($e) => {
      naiveHTMLNodeInnerText($e)
        .split(readingSeparators)
        .forEach((val) => {
          const splits = val
            .replaceAll(/[·]/g, '')
            // re-attach a tone mark that got separated from its syllable
            .replaceAll(/\s+([ˊˇˋˉ])/g, '$1')
            .split(/\s+/g);

          target.push({ value: splits });
        });
    });
  });

  if (pinyinsArray.length === 0) {
    return [];
  }

  // Readings are matched to phrase parts by position. The page may list fewer
  // readings than the phrase has parts; such a part gets an empty list (the
  // same shape as the fallback above) instead of a list holding `undefined`.
  return phraseCharsArray.map((chars, i) => ({
    value: chars,
    src_url: srcUrl,
    pinyins: pinyinsArray[i] ? [pinyinsArray[i]] : [],
    zhuyins: zhuyinsArray[i] ? [zhuyinsArray[i]] : []
  }));
}