├── .gitignore ├── LICENSE-2.0.txt ├── README.md ├── README.org ├── dartsclone ├── bitvector.go ├── da.go ├── da_test.go ├── dabuilder.go └── dawgbuilder.go ├── data ├── assets.go ├── assets_generate.go ├── assets_vfsdata.go ├── data.go └── root │ ├── char.def │ ├── rewrite.def │ ├── sudachi.json │ ├── sudachi_fulldict.json │ └── unk.def ├── definputtextplugin.go ├── dicbuilder └── main.go ├── dicconv └── main.go ├── dictionary.go ├── dictionary ├── binarydict.go ├── bytes.go ├── charcategory.go ├── dalexicon.go ├── dicbuilder.go ├── dicheader.go ├── dicprinter.go ├── dicversion.go ├── grammar.go ├── lexiconset.go └── wordinfo.go ├── go.mod ├── go.sum ├── gosudachicli └── main.go ├── inhibitconnectioncostplugin.go ├── inputtext.go ├── internal ├── lnreader │ └── lnreader.go └── mmap │ ├── mmap_unix.go │ └── mmap_windows.go ├── joinkatakanaoovplugin.go ├── joinnumericplugin.go ├── lattice.go ├── mecaboovproviderplugin.go ├── morpheme.go ├── numericparser.go ├── plugin.go ├── printdic └── main.go ├── printdicheader └── main.go ├── prolongedsoundmarkinputtextplugin.go ├── scripts ├── build.sh ├── mksystemdic.sh └── mksystemdicutf16.sh ├── settingsjson.go ├── settingsjson_test.go ├── simpleoovproviderplugin.go ├── tokenizer.go └── userdicbuilder └── main.go /.gitignore: -------------------------------------------------------------------------------- 1 | dist/ 2 | SudachiDict/ 3 | -------------------------------------------------------------------------------- /LICENSE-2.0.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # gosudachi 2 | 3 | gosudachiは日本語形態素解析器である[Sudachi](https://github.com/WorksApplications/Sudachi)のGo移植版です。 4 | 5 | 以下では、株式会社ワークスアプリケーションズ徳島人工知能NLP研究所が開発公開しているオリジナルのSudachiを「Java版Sudachi」「Java版」、Java版sudachi用の辞書ファイルを「Java版sudachi辞書」と表記します。 6 | 7 | gosudachiは、Java版sudachiのバージョン0.3.0相当です。 8 | 9 | 10 | ## 特徴 11 | 12 | 現時点のJava版Sudachiが持つ機能や特徴をすべて移植しました。よって詳しい情報は[Java版の文書](https://github.com/WorksApplications/Sudachi)を参照してください。この文書にはGo版のみに該当する内容が記述されています。 13 | 14 | - Java版と同じコマンドラインオプション 15 | - Java版と同じく分割モード指定が可能 16 | - Java版と同じシステム提供プラグイン同梱 17 | - Java版と同等のプラグインの仕組みを提供 18 | - Java版と同じ設定ファイルが利用可能 19 | - ユーザー辞書の作成および利用が可能 20 | 21 | 22 | ## Java版とGo版の違い 23 | 24 | - 辞書の文字列エンコード 25 | - 設定ファイルに指定するプラグイン名 26 | - 設定ファイルに辞書の文字列エンコードを指定する設定値を新設 27 | 28 | 29 | ### 辞書の文字列エンコードを変更した理由 30 | 31 | Java版Sudachiは、辞書の作成時に文字列をUTF-16エンコードのバイト列として記録します。辞書を利用するときは、辞書ファイルをメモリにマップし、バイト列をそのまま(文字コード変換をせずに)文字列として扱います。 32 | 33 | Goの文字列はUTF-8エンコードのバイト列であることが一般的です。GoでJavaと同様に辞書中のバイト列をそのまま文字列として扱うには、UTF-8エンコードで記録された辞書を準備する必要があります。 34 | 35 | Go版ではシステム辞書作成ツールとして `dicbuilder` 、ユーザー辞書作成ツールとして `userdicbuilder` を準備しており、どちらもUTF-8エンコードの辞書を作成します。(UTF-16エンコードの辞書を作成することもできます。 `dicconv` を使って相互に変換することも可能です。) 36 | 37 | ただし、UTF-8エンコードの辞書はUTF-16エンコードの辞書よりもサイズが大きくなります。以下の2点がその理由です。 38 | 39 | - 日本語に使用される文字の多くが、1文字あたりUTF-16では2byte長であり、UTF-8では3byte長 40 | - 文字列のバイト長を記録するための領域に2byteを使用する頻度が高い 41 | 42 | UTF-8エンコードでのバイト長が127を超える文字列の場合、2byteを使用してバイト長を記録します。なお、UTF-16エンコードの辞書ではバイト長ではなくUTF-16表現でのint16配列の長さを記録しており、記録可能な文字列の長さはUTF-8の方が短くなります。 43 | 44 | ちなみに辞書中に記録される文字列とは、品詞情報リストおよび単語情報です。 45 | 46 | Go版においても、UTF-16エンコードの辞書を利用することが可能です。この場合、辞書から文字列を読み出す処理においてUTF-16からUTF-8への文字コード変換が行われます。利用する辞書のエンコードを設定ファイルに設定できます。 47 | 48 | 49 | ### 設定ファイルの違い 50 | 51 | Go版でのみ利用できる設定値に関する記述です。 52 | 53 | 54 | #### utf16String 55 | 56 | `utf16String` が `true` になっている場合、UTF-16エンコードの辞書であると判断します。デフォルトはfalseです。 57 | 58 | { 59 | "systemDict" : "system_core_utf16.dic", 60 | "utf16String" : true, 61 | ... 62 | } 63 | 64 | 65 | #### プラグイン名 66 | 67 | Go版ではJava版の設定ファイルをそのまま利用することが可能ですが、プラグイン名に省略形を用いることもできます。 68 | 69 | Java版と同様にデフォルトで利用できるプラグインは以下の7つがあります。省略形とはJavaのクラス階層を省いたプラグイン名です。また、設定先は `class` ではなく `name` にすることも可能です。 70 | 71 | | 処理部分 | プラグイン | プラグイン名 | 省略形 | 72 | |-------- |------------ |--------------------------------------------------------- |--------------------------------- | 73 | | 入力テキスト修正 | 文字列正規化 | com.worksap.nlp.sudachi.DefaultInputTextPlugin | DefaultInputTextPlugin | 74 | | | 長音正規化 | com.worksap.nlp.sudachi.ProlongedSoundMarkInputTextPlugin | ProlongedSoundMarkInputTextPlugin | 75 | | 未知語処理 | 1文字未知語 | com.worksap.nlp.sudachi.SimpleOovProviderPlugin | SimpleOovProviderPlugin | 76 | | | MeCab互換 | com.worksap.nlp.sudachi.MeCabOovProviderPlugin | MeCabOovProviderPlugin | 77 | | 単語接続処理 | 品詞接続禁制 | com.worksap.nlp.sudachi.InhibitConnectionPlugin | InhibitConnectionPlugin | 78 | | 出力解修正 | カタカナ未知語まとめ上げ | com.worksap.nlp.sudachi.JoinKatakanaOovPlugin | JoinKatakanaOovPlugin | 79 | | | 数詞まとめ上げ | com.worksap.nlp.sudachi.JoinNumericPlugin | JoinNumericPlugin | 80 | 81 | { 82 | "systemDict" : "system_core.dic", 83 | "inputTextPlugin" : [ 84 | { "name" : "DefaultInputTextPlugin" }, 85 | { "name" : "ProlongedSoundMarkInputTextPlugin", 86 | "prolongedSoundMarks": ["ー", "-", "⁓", "〜", "〰"], 87 | "replacementSymbol": "ー"} 88 | ], 89 | "oovProviderPlugin" : [ 90 | { "name" : "MeCabOovProviderPlugin" }, 91 | { "name" : "SimpleOovProviderPlugin", 92 | "oovPOS" : [ "補助記号", "一般", "*", "*", "*", "*" ], 93 | "leftId" : 5968, 94 | "rightId" : 5968, 95 | "cost" : 3857 } 96 | ], 97 | "pathRewritePlugin" : [ 98 | { "name" : "JoinNumericPlugin", 99 | "joinKanjiNumeric" : true }, 100 | { "name" : "JoinKatakanaOovPlugin", 101 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], 102 | "minLength" : 3 103 | } 104 | ] 105 | } 106 | 107 | 108 | ## Goへのポーティング指針 109 | 110 | 以下の指針のもと、移植作業を行っています。 111 | 112 | 1. なるべくJavaのコードに似たような構成にする 113 | - オリジナルに修正が入ったときに追随しやすいように 114 | 115 | 2. Java版Sudachiと同じ設定ファイルが利用できるように 116 | 117 | 3. Java版Sudachiのコマンドラインインターフェースも同じにする 118 | 119 | 4. Java版Sudachi用に作成された辞書ファイルをGo版でも使えるように 120 | 121 | 5. Java版Sudachi用の辞書が作れるように 122 | 123 | 124 | ## ビルド 125 | 126 | プログラムと辞書を作成する方法です。 127 | 128 | 129 | ### プログラムのビルド 130 | 131 | このリポジトリをcloneします。 cloneしたディレクトリに移動し、ビルドスクリプトを実行します。 132 | 133 | $ git clone https://github.com/msnoigrs/gosudachi 134 | $ cd gosudachi 135 | $ bash scripts/build.sh 136 | 137 | distディレクトリにバイナリが作成されます。作成されるバイナリは以下の通りです。 138 | 139 | - **gosudachicli:** Sudachiコマンドライン 140 | - **dicbuilder:** システム辞書作成ツール 141 | - **userdicbuilder:** ユーザー辞書作成ツール 142 | - **printdic:** 辞書ファイルに登録されている単語リスト表示プログラム 143 | - **printdicheader:** 辞書ファイルヘッダ情報表示プログラム 144 | - **dicconv:** 辞書の文字列エンコードをUTF-16とUTF-8間で相互に変換するプログラム 145 | 146 | ビルドスクリプトを使わない場合は、コマンドプロンプト上で以下を実行してください。Windowsでも作成可能です。 147 | 148 | $ git clone https://github.com/msnoigrs/gosudachi 149 | $ cd gosudachi/data 150 | $ go generate 151 | $ cd .. 152 | $ cd gosudachicli 153 | $ go build 154 | $ cd .. 155 | $ cd dicbuilder 156 | $ go build 157 | $ cd .. 158 | $ cd userdicbuilder 159 | $ go build 160 | $ cd .. 161 | $ cd printdic 162 | $ go build 163 | $ cd .. 164 | $ go printdicheader 165 | $ go build 166 | $ cd .. 167 | $ cd dicconv 168 | $ go build 169 | 170 | 171 | ### 辞書の作成 172 | 173 | 辞書のソースもJava版Sudachiのものを利用します。 [SudachiDict](https://github.com/WorksApplications/SudachiDict)をgithubからcloneした後、git lfs pullで取得します。 辞書のソースファイルは、 `small_lex.csv` と `core_lex.csv` と `notcore_lex.csv` の3つです。 174 | 175 | 辞書を作成するスクリプトを利用する場合、以下を実行してください。 176 | 177 | $ git clone https://github.com/WorksApplications/SudachiDict.git 178 | $ cd SudachiDict 179 | $ git lfs pull 180 | $ cd ../dist 181 | $ bash ../scripts/mksystemdic.sh ../SudachiDict 182 | 183 | distディレクトリに `system_small.dic` 、 `system_core.dic` および `system_full.dic` ファイルが作成されます。 184 | 185 | 辞書作成スクリプトを使わない場合は、コマンドプロンプト上で以下を実行してください。 186 | 187 | $ dicbuilder -o system_small.dic -m matrix.def small_lex.csv 188 | $ dicbuilder -o system_core.dic -m matrix.def small_lex.csv core_lex.csv 189 | $ dicbuilder -o system_full.dic -m matrix.def small_lex.csv core_lex.csv notcore_lex.csv 190 | 191 | 192 | ## コマンド 193 | 194 | Go版で提供するコマンドの説明です。 195 | 196 | 197 | ### gosudachicli 198 | 199 | Sudachiコマンドラインです。オプションを指定せずに実行する場合、 `system_core.dic` ファイルが実行時のディレクトリに存在する必要があります。辞書ファイルの場所は設定ファイルに指定可能です。 200 | 201 | $ gosudachicli [-r conf] [-m mode] [-a] [-d] [-o output] [-j] [file...] 202 | 203 | 204 | #### オプション 205 | 206 | - -r conf設定ファイルを指定 207 | - -s デフォルト設定を上書きする設定(json文字列) 208 | - -p リソースディレクトリ(設定ファイル内の各種リソースのベースディレクトリ、デフォルトは実行時ディレクトリ) 209 | - -m {A|B|C}分割モード 210 | - -a 読み、辞書形も出力 211 | - -d デバッグ情報の出力 212 | - -o 出力ファイル(指定がない場合は標準出力) 213 | - -f エラーを無視して処理を続行する 214 | - -j UTF-16エンコードの辞書ファイルを利用する 215 | 216 | 217 | #### 出力例 218 | 219 | $ echo 東京都へ行く | gosudachicli 220 | 東京都 名詞,固有名詞,地名,一般,*,* 東京都 221 | へ 助詞,格助詞,*,*,*,* へ 222 | 行く 動詞,非自立可能,*,*,五段-カ行,終止形-一般 行く 223 | EOS 224 | 225 | $ echo 東京都へ行く | gosudachicli -a 226 | 東京都 名詞,固有名詞,地名,一般,*,* 東京都 東京都 トウキョウト 227 | へ 助詞,格助詞,*,*,*,* へ へ エ 228 | 行く 動詞,非自立可能,*,*,五段-カ行,終止形-一般 行く 行く イク 229 | EOS 230 | 231 | $ echo 東京都へ行く | gosudachicli -m A 232 | 東京 名詞,固有名詞,地名,一般,*,* 東京 233 | 都 名詞,普通名詞,一般,*,*,* 都 234 | へ 助詞,格助詞,*,*,*,* へ 235 | 行く 動詞,非自立可能,*,*,五段-カ行,終止形-一般 行く 236 | EOS 237 | 238 | - **Java版:** com.worksap.nlp.sudachi.SudachiCommandLine 239 | 240 | 241 | ### dicbuilder 242 | 243 | 辞書ソースファイルからシステム辞書を作成します。デフォルトではUTF-8エンコードの辞書が作成されます。 244 | 245 | $ dicbuilder -o outputdic -m matrix.def [-d description] [-j] filecsv1 [filecsv2...] 246 | 247 | 248 | #### オプション 249 | 250 | - -o 出力ファイル(必須) 251 | - -m matrix.defファイル(必須) 252 | - -d 辞書ヘッダ情報に埋め込む文字 253 | - -j UTF-16エンコードの辞書ファイルを生成する 254 | 255 | - **Java版:** com.worksap.nlp.sudachi.dictionary.DictionaryBuilder 256 | 257 | 258 | ### userdicbuilder 259 | 260 | ユーザー辞書ソースファイルからユーザー辞書を作成します。デフォルトではUTF-8エンコードの辞書が作成されます。 261 | 262 | $ userdicbuilder -o outputdic -s systemdic [-d description] [-j] filecsv1 [filecsv2...] 263 | 264 | 265 | #### オプション 266 | 267 | - -o 出力ファイル(必須) 268 | - -s システム辞書ファイル(必須) 269 | - -d 辞書ヘッダ情報に埋め込む文字 270 | - -j UTF-16エンコードの辞書ファイルを生成する 271 | 272 | - **Java版:** com.worksap.nlp.sudachi.dictionary.UserDictionaryBuilder 273 | 274 | 275 | ### printdic 276 | 277 | 辞書ファイルに登録されている単語リストを表示します。 278 | 279 | $ printdic [-s systemdic] [-j] inputdic 280 | 281 | 282 | #### オプション 283 | 284 | - -s システム辞書ファイル(ユーザー辞書の情報を出力する場合に必要) 285 | - -j UTF-16エンコードの辞書を読み込み 286 | 287 | - **Java版:** com.worksap.nlp.sudachi.dictionary.DictionaryPrinter 288 | 289 | 290 | ### printdicheader 291 | 292 | 辞書ファイルのヘッダ情報を表示します。 293 | 294 | $ printdicheader inputdic 295 | 296 | - **java版:** com.worksap.nlp.sudachi.dictionary.DictionaryHeaderPrinter 297 | 298 | 299 | ### dicconv 300 | 301 | 辞書ファイルに記録されている文字列のエンコードを変換します。オプションを指定しない場合、UTF-16エンコード(Java版)からUTF-8エンコード(Go版)に変換します。 302 | 303 | $ dicconv [-o outputdic] [-j] inputdic 304 | 305 | 306 | #### オプション 307 | 308 | - -o 出力ファイル、省略すると `out_utf16.dic` もしくは `out_utf8.dic` に出力 309 | - -j UTF-8エンコードからUTF-16エンコードに変換する 310 | 311 | 312 | ## ライセンス 313 | 314 | Java版Sudachiと同じ[Apache License, Version2.0](http://www.apache.org/licenses/LICENSE-2.0.html) 315 | 316 | 317 | ## 謝辞 318 | 319 | [Sudachi](https://github.com/WorksApplications/Sudachi)においてプログラムや辞書をOSSとして公開されている、株式会社ワークスアプリケーションズ徳島人工知能NLP研究所およびその開発者の方々に感謝いたします。 320 | -------------------------------------------------------------------------------- /README.org: -------------------------------------------------------------------------------- 1 | #+TITLE: gosudachi 2 | #+AUTHOR: 五十嵐 正尚 3 | #+EMAIL: syoux2@gmail.com 4 | #+DATE: 2019/08/03 5 | #+DESCRIPTION: Go porting of Sudachi 6 | #+KEYWORDS: 7 | #+LANGUAGE: ja 8 | #+OPTIONS: H:4 num:nil toc:nil ::t |:t ^:t -:t f:t *:t <:t 9 | #+OPTIONS: tex:t todo:t pri:nil tags:t texht:nil 10 | #+OPTIONS: author:t creator:nil email:nil date:t 11 | 12 | * gosudachi 13 | 14 | gosudachiは日本語形態素解析器である[[https://github.com/WorksApplications/Sudachi][Sudachi]]のGo移植版です。 15 | 16 | 以下では、株式会社ワークスアプリケーションズ徳島人工知能NLP研究所が開発公開しているオリジナルのSudachiを「Java版Sudachi」「Java版」、Java版sudachi用の辞書ファイルを「Java版sudachi辞書」と表記します。 17 | 18 | gosudachiは、Java版sudachiのバージョン0.3.0相当です。 19 | 20 | ** 特徴 21 | 22 | 現時点のJava版Sudachiが持つ機能や特徴をすべて移植しました。よって詳しい情報は[[https://github.com/WorksApplications/Sudachi][Java版の文書]]を参照してください。この文書にはGo版のみに該当する内容が記述されています。 23 | 24 | - Java版と同じコマンドラインオプション 25 | - Java版と同じく分割モード指定が可能 26 | - Java版と同じシステム提供プラグイン同梱 27 | - Java版と同等のプラグインの仕組みを提供 28 | - Java版と同じ設定ファイルが利用可能 29 | - ユーザー辞書の作成および利用が可能 30 | 31 | ** Java版とGo版の違い 32 | 33 | - 辞書の文字列エンコード 34 | - 設定ファイルに指定するプラグイン名 35 | - 設定ファイルに辞書の文字列エンコードを指定する設定値を新設 36 | 37 | *** 辞書の文字列エンコードを変更した理由 38 | 39 | Java版Sudachiは、辞書の作成時に文字列をUTF-16エンコードのバイト列として記録します。辞書を利用するときは、辞書ファイルをメモリにマップし、バイト列をそのまま(文字コード変換をせずに)文字列として扱います。 40 | 41 | Goの文字列はUTF-8エンコードのバイト列であることが一般的です。GoでJavaと同様に辞書中のバイト列をそのまま文字列として扱うには、UTF-8エンコードで記録された辞書を準備する必要があります。 42 | 43 | Go版ではシステム辞書作成ツールとして ~dicbuilder~ 、ユーザー辞書作成ツールとして ~userdicbuilder~ を準備しており、どちらもUTF-8エンコードの辞書を作成します。(UTF-16エンコードの辞書を作成することもできます。 ~dicconv~ を使って相互に変換することも可能です。) 44 | 45 | ただし、UTF-8エンコードの辞書はUTF-16エンコードの辞書よりもサイズが大きくなります。以下の2点がその理由です。 46 | 47 | - 日本語に使用される文字の多くが、1文字あたりUTF-16では2byte長であり、UTF-8では3byte長 48 | - 文字列のバイト長を記録するための領域に2byteを使用する頻度が高い 49 | 50 | UTF-8エンコードでのバイト長が127を超える文字列の場合、2byteを使用してバイト長を記録します。なお、UTF-16エンコードの辞書ではバイト長ではなくUTF-16表現でのint16配列の長さを記録しており、記録可能な文字列の長さはUTF-8の方が短くなります。 51 | 52 | ちなみに辞書中に記録される文字列とは、品詞情報リストおよび単語情報です。 53 | 54 | Go版においても、UTF-16エンコードの辞書を利用することが可能です。この場合、辞書から文字列を読み出す処理においてUTF-16からUTF-8への文字コード変換が行われます。利用する辞書のエンコードを設定ファイルに設定できます。 55 | 56 | *** 設定ファイルの違い 57 | 58 | Go版でのみ利用できる設定値に関する記述です。 59 | 60 | **** utf16String 61 | 62 | ~utf16String~ が ~true~ になっている場合、UTF-16エンコードの辞書であると判断します。デフォルトはfalseです。 63 | 64 | #+BEGIN_EXAMPLE 65 | { 66 | "systemDict" : "system_core_utf16.dic", 67 | "utf16String" : true, 68 | ... 69 | } 70 | #+END_EXAMPLE 71 | 72 | **** プラグイン名 73 | 74 | Go版ではJava版の設定ファイルをそのまま利用することが可能ですが、プラグイン名に省略形を用いることもできます。 75 | 76 | Java版と同様にデフォルトで利用できるプラグインは以下の7つがあります。省略形とはJavaのクラス階層を省いたプラグイン名です。また、設定先は ~class~ ではなく ~name~ にすることも可能です。 77 | 78 | | 処理部分 | プラグイン | プラグイン名 | 省略形 | 79 | |------------------+--------------------------+-----------------------------------------------------------+-----------------------------------| 80 | | 入力テキスト修正 | 文字列正規化 | com.worksap.nlp.sudachi.DefaultInputTextPlugin | DefaultInputTextPlugin | 81 | | | 長音正規化 | com.worksap.nlp.sudachi.ProlongedSoundMarkInputTextPlugin | ProlongedSoundMarkInputTextPlugin | 82 | | 未知語処理 | 1文字未知語 | com.worksap.nlp.sudachi.SimpleOovProviderPlugin | SimpleOovProviderPlugin | 83 | | | MeCab互換 | com.worksap.nlp.sudachi.MeCabOovProviderPlugin | MeCabOovProviderPlugin | 84 | | 単語接続処理 | 品詞接続禁制 | com.worksap.nlp.sudachi.InhibitConnectionPlugin | InhibitConnectionPlugin | 85 | | 出力解修正 | カタカナ未知語まとめ上げ | com.worksap.nlp.sudachi.JoinKatakanaOovPlugin | JoinKatakanaOovPlugin | 86 | | | 数詞まとめ上げ | com.worksap.nlp.sudachi.JoinNumericPlugin | JoinNumericPlugin | 87 | 88 | #+BEGIN_EXAMPLE 89 | { 90 | "systemDict" : "system_core.dic", 91 | "inputTextPlugin" : [ 92 | { "name" : "DefaultInputTextPlugin" }, 93 | { "name" : "ProlongedSoundMarkInputTextPlugin", 94 | "prolongedSoundMarks": ["ー", "-", "⁓", "〜", "〰"], 95 | "replacementSymbol": "ー"} 96 | ], 97 | "oovProviderPlugin" : [ 98 | { "name" : "MeCabOovProviderPlugin" }, 99 | { "name" : "SimpleOovProviderPlugin", 100 | "oovPOS" : [ "補助記号", "一般", "*", "*", "*", "*" ], 101 | "leftId" : 5968, 102 | "rightId" : 5968, 103 | "cost" : 3857 } 104 | ], 105 | "pathRewritePlugin" : [ 106 | { "name" : "JoinNumericPlugin", 107 | "joinKanjiNumeric" : true }, 108 | { "name" : "JoinKatakanaOovPlugin", 109 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], 110 | "minLength" : 3 111 | } 112 | ] 113 | } 114 | #+END_EXAMPLE 115 | 116 | ** Goへのポーティング指針 117 | 118 | 以下の指針のもと、移植作業を行っています。 119 | 120 | 1. なるべくJavaのコードに似たような構成にする 121 | + オリジナルに修正が入ったときに追随しやすいように 122 | 123 | 2. Java版Sudachiと同じ設定ファイルが利用できるように 124 | 125 | 3. Java版Sudachiのコマンドラインインターフェースも同じにする 126 | 127 | 4. Java版Sudachi用に作成された辞書ファイルをGo版でも使えるように 128 | 129 | 5. Java版Sudachi用の辞書が作れるように 130 | 131 | ** ビルド 132 | 133 | プログラムと辞書を作成する方法です。 134 | 135 | *** プログラムのビルド 136 | 137 | このリポジトリをcloneします。 138 | cloneしたディレクトリに移動し、ビルドスクリプトを実行します。 139 | 140 | #+BEGIN_EXAMPLE 141 | $ git clone https://github.com/msnoigrs/gosudachi 142 | $ cd gosudachi 143 | $ bash scripts/build.sh 144 | #+END_EXAMPLE 145 | 146 | distディレクトリにバイナリが作成されます。作成されるバイナリは以下の通りです。 147 | 148 | - gosudachicli :: Sudachiコマンドライン 149 | - dicbuilder :: システム辞書作成ツール 150 | - userdicbuilder :: ユーザー辞書作成ツール 151 | - printdic :: 辞書ファイルに登録されている単語リスト表示プログラム 152 | - printdicheader :: 辞書ファイルヘッダ情報表示プログラム 153 | - dicconv :: 辞書の文字列エンコードをUTF-16とUTF-8間で相互に変換するプログラム 154 | 155 | ビルドスクリプトを使わない場合は、コマンドプロンプト上で以下を実行してください。Windowsでも作成可能です。 156 | 157 | #+BEGIN_EXAMPLE 158 | $ git clone https://github.com/msnoigrs/gosudachi 159 | $ cd gosudachi/data 160 | $ go generate 161 | $ cd .. 162 | $ cd gosudachicli 163 | $ go build 164 | $ cd .. 165 | $ cd dicbuilder 166 | $ go build 167 | $ cd .. 168 | $ cd userdicbuilder 169 | $ go build 170 | $ cd .. 171 | $ cd printdic 172 | $ go build 173 | $ cd .. 174 | $ go printdicheader 175 | $ go build 176 | $ cd .. 177 | $ cd dicconv 178 | $ go build 179 | #+END_EXAMPLE 180 | 181 | *** 辞書の作成 182 | 183 | 辞書のソースもJava版Sudachiのものを利用します。 184 | [[https://github.com/WorksApplications/SudachiDict][SudachiDict]]をgithubからcloneした後、git lfs pullで取得します。 185 | 辞書のソースファイルは、 ~small_lex.csv~ と ~core_lex.csv~ と ~notcore_lex.csv~ の3つです。 186 | 187 | 辞書を作成するスクリプトを利用する場合、以下を実行してください。 188 | 189 | #+BEGIN_EXAMPLE 190 | $ git clone https://github.com/WorksApplications/SudachiDict.git 191 | $ cd SudachiDict 192 | $ git lfs pull 193 | $ cd ../dist 194 | $ bash ../scripts/mksystemdic.sh ../SudachiDict 195 | #+END_EXAMPLE 196 | 197 | distディレクトリに ~system_small.dic~ 、 ~system_core.dic~ および ~system_full.dic~ ファイルが作成されます。 198 | 199 | 辞書作成スクリプトを使わない場合は、コマンドプロンプト上で以下を実行してください。 200 | 201 | #+BEGIN_EXAMPLE 202 | $ dicbuilder -o system_small.dic -m matrix.def small_lex.csv 203 | $ dicbuilder -o system_core.dic -m matrix.def small_lex.csv core_lex.csv 204 | $ dicbuilder -o system_full.dic -m matrix.def small_lex.csv core_lex.csv notcore_lex.csv 205 | #+END_EXAMPLE 206 | 207 | ** コマンド 208 | 209 | Go版で提供するコマンドの説明です。 210 | 211 | *** gosudachicli 212 | 213 | Sudachiコマンドラインです。オプションを指定せずに実行する場合、 ~system_core.dic~ ファイルが実行時のディレクトリに存在する必要があります。辞書ファイルの場所は設定ファイルに指定可能です。 214 | 215 | #+BEGIN_EXAMPLE 216 | $ gosudachicli [-r conf] [-m mode] [-a] [-d] [-o output] [-j] [file...] 217 | #+END_EXAMPLE 218 | 219 | **** オプション 220 | 221 | - -r conf設定ファイルを指定 222 | - -s デフォルト設定を上書きする設定(json文字列) 223 | - -p リソースディレクトリ(設定ファイル内の各種リソースのベースディレクトリ、デフォルトは実行時ディレクトリ) 224 | - -m {A|B|C}分割モード 225 | - -a 読み、辞書形も出力 226 | - -d デバッグ情報の出力 227 | - -o 出力ファイル(指定がない場合は標準出力) 228 | - -f エラーを無視して処理を続行する 229 | - -j UTF-16エンコードの辞書ファイルを利用する 230 | 231 | **** 出力例 232 | 233 | #+BEGIN_EXAMPLE 234 | $ echo 東京都へ行く | gosudachicli 235 | 東京都 名詞,固有名詞,地名,一般,*,* 東京都 236 | へ 助詞,格助詞,*,*,*,* へ 237 | 行く 動詞,非自立可能,*,*,五段-カ行,終止形-一般 行く 238 | EOS 239 | 240 | $ echo 東京都へ行く | gosudachicli -a 241 | 東京都 名詞,固有名詞,地名,一般,*,* 東京都 東京都 トウキョウト 242 | へ 助詞,格助詞,*,*,*,* へ へ エ 243 | 行く 動詞,非自立可能,*,*,五段-カ行,終止形-一般 行く 行く イク 244 | EOS 245 | 246 | $ echo 東京都へ行く | gosudachicli -m A 247 | 東京 名詞,固有名詞,地名,一般,*,* 東京 248 | 都 名詞,普通名詞,一般,*,*,* 都 249 | へ 助詞,格助詞,*,*,*,* へ 250 | 行く 動詞,非自立可能,*,*,五段-カ行,終止形-一般 行く 251 | EOS 252 | #+END_EXAMPLE 253 | 254 | - Java版 :: com.worksap.nlp.sudachi.SudachiCommandLine 255 | 256 | *** dicbuilder 257 | 258 | 辞書ソースファイルからシステム辞書を作成します。デフォルトではUTF-8エンコードの辞書が作成されます。 259 | 260 | #+BEGIN_EXAMPLE 261 | $ dicbuilder -o outputdic -m matrix.def [-d description] [-j] filecsv1 [filecsv2...] 262 | #+END_EXAMPLE 263 | 264 | **** オプション 265 | 266 | - -o 出力ファイル(必須) 267 | - -m matrix.defファイル(必須) 268 | - -d 辞書ヘッダ情報に埋め込む文字 269 | - -j UTF-16エンコードの辞書ファイルを生成する 270 | 271 | - Java版 :: com.worksap.nlp.sudachi.dictionary.DictionaryBuilder 272 | 273 | *** userdicbuilder 274 | 275 | ユーザー辞書ソースファイルからユーザー辞書を作成します。デフォルトではUTF-8エンコードの辞書が作成されます。 276 | 277 | #+BEGIN_EXAMPLE 278 | $ userdicbuilder -o outputdic -s systemdic [-d description] [-j] filecsv1 [filecsv2...] 279 | #+END_EXAMPLE 280 | 281 | **** オプション 282 | 283 | - -o 出力ファイル(必須) 284 | - -s システム辞書ファイル(必須) 285 | - -d 辞書ヘッダ情報に埋め込む文字 286 | - -j UTF-16エンコードの辞書ファイルを生成する 287 | 288 | - Java版 :: com.worksap.nlp.sudachi.dictionary.UserDictionaryBuilder 289 | 290 | *** printdic 291 | 292 | 辞書ファイルに登録されている単語リストを表示します。 293 | 294 | #+BEGIN_EXAMPLE 295 | $ printdic [-s systemdic] [-j] inputdic 296 | #+END_EXAMPLE 297 | 298 | **** オプション 299 | 300 | - -s システム辞書ファイル(ユーザー辞書の情報を出力する場合に必要) 301 | - -j UTF-16エンコードの辞書を読み込み 302 | 303 | - Java版 :: com.worksap.nlp.sudachi.dictionary.DictionaryPrinter 304 | 305 | *** printdicheader 306 | 307 | 辞書ファイルのヘッダ情報を表示します。 308 | 309 | #+BEGIN_EXAMPLE 310 | $ printdicheader inputdic 311 | #+END_EXAMPLE 312 | 313 | - java版 :: com.worksap.nlp.sudachi.dictionary.DictionaryHeaderPrinter 314 | 315 | *** dicconv 316 | 317 | 辞書ファイルに記録されている文字列のエンコードを変換します。オプションを指定しない場合、UTF-16エンコード(Java版)からUTF-8エンコード(Go版)に変換します。 318 | 319 | #+BEGIN_EXAMPLE 320 | $ dicconv [-o outputdic] [-j] inputdic 321 | #+END_EXAMPLE 322 | 323 | **** オプション 324 | 325 | - -o 出力ファイル、省略すると ~out_utf16.dic~ もしくは ~out_utf8.dic~ に出力 326 | - -j UTF-8エンコードからUTF-16エンコードに変換する 327 | 328 | ** ライセンス 329 | 330 | Java版Sudachiと同じ[[http://www.apache.org/licenses/LICENSE-2.0.html][Apache License, Version2.0]] 331 | 332 | ** 謝辞 333 | 334 | [[https://github.com/WorksApplications/Sudachi][Sudachi]]においてプログラムや辞書をOSSとして公開されている、株式会社ワークスアプリケーションズ徳島人工知能NLP研究所およびその開発者の方々に感謝いたします。 335 | -------------------------------------------------------------------------------- /dartsclone/bitvector.go: -------------------------------------------------------------------------------- 1 | package dartsclone 2 | 3 | const ( 4 | unitLength = 32 5 | ) 6 | 7 | type bitVector struct { 8 | units []uint32 9 | ranks []int 10 | numOnes int 11 | length int 12 | } 13 | 14 | func newBitVector() *bitVector { 15 | return &bitVector{} 16 | } 17 | 18 | func (v *bitVector) get(id int) bool { 19 | return v.units[id/unitLength]>>((uint(id)%unitLength)&1) == 1 20 | } 21 | 22 | func (v *bitVector) rank(id int) int { 23 | const mask = uint32(0xffffffff) 24 | unitId := id / unitLength 25 | offset := uint(id % unitLength) 26 | return v.ranks[unitId] + popCount(v.units[unitId] & ^(mask<> 1) + (unit & 0x55555555) 60 | unit = ((unit & 0xCCCCCCCC) >> 2) + (unit & 0x33333333) 61 | unit = ((unit >> 4) + unit) & 0x0F0F0F0F 62 | unit += unit >> 8 63 | unit += unit >> 16 64 | return int(unit & 0xFF) 65 | } 66 | -------------------------------------------------------------------------------- /dartsclone/da.go: -------------------------------------------------------------------------------- 1 | package dartsclone 2 | 3 | import ( 4 | "errors" 5 | "io" 6 | "os" 7 | 8 | // "math" 9 | "unsafe" 10 | 11 | "github.com/msnoigrs/gosudachi/internal/mmap" 12 | ) 13 | 14 | type DoubleArray struct { 15 | array []uint32 16 | buffer []byte 17 | } 18 | 19 | func NewDoubleArray() *DoubleArray { 20 | return &DoubleArray{} 21 | } 22 | 23 | func (da *DoubleArray) SetArray(array []uint32) { 24 | da.array = array 25 | da.buffer = asByteArray(array) 26 | } 27 | 28 | func (da *DoubleArray) SetBuffer(buffer []byte) { 29 | da.buffer = buffer 30 | da.array = asUInt32Array(buffer) 31 | } 32 | 33 | func (da *DoubleArray) Array() []uint32 { 34 | return da.array 35 | } 36 | 37 | func (da *DoubleArray) ByteArray() []byte { 38 | return da.buffer 39 | } 40 | 41 | func (da *DoubleArray) Clear() { 42 | da.buffer = []byte{} 43 | da.array = []uint32{} 44 | } 45 | 46 | func (da *DoubleArray) Length() int { 47 | return len(da.array) 48 | } 49 | 50 | func (da *DoubleArray) TotalSize() int { 51 | return len(da.buffer) 52 | } 53 | 54 | func (da *DoubleArray) Build(keys [][]byte, values []int, f ProgressFunc) error { 55 | var err error 56 | dab := newDoubleArrayBuilder(f) 57 | da.array, err = dab.build(newKeySet(keys, values)) 58 | if err != nil { 59 | return err 60 | } 61 | da.buffer = asByteArray(da.array) 62 | 63 | return nil 64 | } 65 | 66 | func (da *DoubleArray) Open(f *os.File, position int64, totalSize int64) (err error) { 67 | if position < 0 { 68 | position = 0 69 | } 70 | if totalSize <= 0 { 71 | finfo, err := f.Stat() 72 | if err != nil { 73 | return err 74 | } 75 | totalSize = finfo.Size() 76 | } 77 | da.buffer, err = mmap.Mmap(f, false, position, totalSize) 78 | if err != nil { 79 | return err 80 | } 81 | // err = mmap.Madvise(da.buffer, false) 82 | // if err != nil { 83 | // return err 84 | // } 85 | da.array = asUInt32Array(da.buffer) 86 | 87 | return nil 88 | } 89 | 90 | func (da *DoubleArray) Close() error { 91 | err := mmap.Munmap(da.buffer) 92 | if err != nil { 93 | return err 94 | } 95 | da.buffer = []byte{} 96 | da.array = []uint32{} 97 | 98 | return nil 99 | } 100 | 101 | func (da *DoubleArray) Save(writer io.Writer) (int, error) { 102 | return writer.Write(da.buffer) 103 | } 104 | 105 | func (da *DoubleArray) ExactMatchSearch(key []byte) (int, int) { 106 | var nodePos uint32 107 | u := daunit(da.array[0]) 108 | 109 | for _, k := range key { 110 | nodePos ^= u.offset() ^ uint32(k) 111 | u = daunit(da.array[int(nodePos)]) 112 | if u.label() != uint32(k) { 113 | return -1, 0 114 | } 115 | } 116 | if !u.hasLeaf() { 117 | return -1, 0 118 | } 119 | u = daunit(da.array[int(nodePos^u.offset())]) 120 | return u.value(), len(key) 121 | } 122 | 123 | func (da *DoubleArray) CommonPrefixSearch(key []byte, offset int, maxNumResult int) [][2]int { 124 | result := make([][2]int, 0) 125 | 126 | var nodePos uint32 127 | u := daunit(da.array[0]) 128 | nodePos ^= u.offset() 129 | for i := offset; i < len(key); i++ { 130 | k := uint32(key[i]) 131 | nodePos ^= k 132 | u = daunit(da.array[int(nodePos)]) 133 | if u.label() != k { 134 | return result 135 | } 136 | 137 | nodePos ^= u.offset() 138 | if u.hasLeaf() && len(result) < maxNumResult { 139 | result = append(result, [2]int{daunit(da.array[int(nodePos)]).value(), i + 1}) 140 | } 141 | } 142 | return result 143 | } 144 | 145 | func (da *DoubleArray) CommonPrefixSearchItr(key []byte, offset int) *Iterator { 146 | return newIterator(da.array, key, offset) 147 | } 148 | 149 | type Iterator struct { 150 | array []uint32 151 | key []byte 152 | offset int 153 | nodePos uint32 154 | rvalue int 155 | roffset int 156 | err error 157 | } 158 | 159 | func newIterator(array []uint32, key []byte, offset int) *Iterator { 160 | var nodePos uint32 161 | u := daunit(array[0]) 162 | nodePos ^= u.offset() 163 | return &Iterator{ 164 | array: array, 165 | key: key, 166 | offset: offset, 167 | nodePos: nodePos, 168 | rvalue: -1, 169 | } 170 | } 171 | 172 | func (it *Iterator) Next() bool { 173 | if it.err != nil { 174 | return false 175 | } 176 | if it.rvalue == -1 { 177 | it.rvalue, it.roffset = it.getNext() 178 | } 179 | return it.rvalue != -1 180 | } 181 | 182 | func (it *Iterator) Get() (int, int) { 183 | var ( 184 | rvalue int 185 | roffset int 186 | ) 187 | if it.rvalue == -1 { 188 | rvalue, roffset = it.getNext() 189 | if rvalue == -1 { 190 | it.err = errors.New("No more element") 191 | return rvalue, roffset 192 | } 193 | } else { 194 | rvalue = it.rvalue 195 | roffset = it.roffset 196 | it.rvalue = -1 197 | it.roffset = 0 198 | } 199 | return rvalue, roffset 200 | } 201 | 202 | func (it *Iterator) Err() error { 203 | return it.err 204 | } 205 | 206 | func (it *Iterator) getNext() (int, int) { 207 | for ; it.offset < len(it.key); it.offset++ { 208 | k := uint32(it.key[it.offset]) 209 | it.nodePos ^= k 210 | u := daunit(it.array[int(it.nodePos)]) 211 | if u.label() != k { 212 | it.offset = len(it.key) // no more loop 213 | return -1, 0 214 | } 215 | 216 | it.nodePos ^= u.offset() 217 | if u.hasLeaf() { 218 | it.offset++ 219 | rvalue := daunit(it.array[int(it.nodePos)]).value() 220 | roffset := it.offset 221 | return rvalue, roffset 222 | } 223 | } 224 | return -1, 0 225 | } 226 | 227 | type TraverseResult struct { 228 | Result int 229 | Offset int 230 | NodePosition int 231 | } 232 | 233 | func (da *DoubleArray) Traverse(key []byte, offset int, length int, nodePosition int) *TraverseResult { 234 | nodePos := uint32(nodePosition) 235 | id := nodePos 236 | u := daunit(da.array[0]) 237 | 238 | for i := offset; i < length; i++ { 239 | k := uint32(key[i]) 240 | id ^= u.offset() ^ k 241 | u = daunit(da.array[int(id)]) 242 | if u.label() != k { 243 | return &TraverseResult{ 244 | -2, 245 | i, 246 | int(nodePos), 247 | } 248 | } 249 | nodePos = id 250 | } 251 | if !u.hasLeaf() { 252 | return &TraverseResult{ 253 | -1, 254 | length, 255 | int(nodePos), 256 | } 257 | } 258 | u = daunit(da.array[int(nodePos^u.offset())]) 259 | return &TraverseResult{ 260 | u.value(), 261 | length, 262 | int(nodePos), 263 | } 264 | } 265 | 266 | func asUInt32Array(data []byte) []uint32 { 267 | var sl = struct { 268 | addr uintptr 269 | len int 270 | cap int 271 | }{uintptr(unsafe.Pointer(&data[0])), len(data) / 4, len(data) / 4} 272 | return *(*[]uint32)(unsafe.Pointer(&sl)) 273 | // return (*[math.MaxUint32 / 4]uint32)(unsafe.Pointer(&data[0]))[:len(data) / 4] 274 | } 275 | 276 | func asByteArray(data []uint32) []byte { 277 | // Slice memory layout 278 | // Copied this snippet from golang/sys package 279 | var sl = struct { 280 | addr uintptr 281 | len int 282 | cap int 283 | }{uintptr(unsafe.Pointer(&data[0])), len(data) * 4, len(data) * 4} 284 | return *(*[]byte)(unsafe.Pointer(&sl)) 285 | // return (*[math.MaxUint32]byte)(unsafe.Pointer(&data[0]))[:len(data) * 4] 286 | } 287 | 288 | type daunit uint32 289 | 290 | func (u daunit) hasLeaf() bool { 291 | return ((uint32(u) >> 8) & uint32(1)) == 1 292 | } 293 | 294 | func (u daunit) value() int { 295 | return int(uint32(u) & ((uint32(1) << 31) - 1)) 296 | } 297 | 298 | func (u daunit) label() uint32 { 299 | return uint32(u) & (uint32(1) << 31 | 0xFF) 300 | } 301 | 302 | func (u daunit) offset() uint32 { 303 | return (uint32(u) >> 10) << ((uint32(u) & (uint32(1) << 9)) >> 6) 304 | } 305 | -------------------------------------------------------------------------------- /dartsclone/da_test.go: -------------------------------------------------------------------------------- 1 | package dartsclone 2 | 3 | import ( 4 | "bytes" 5 | "testing" 6 | ) 7 | 8 | func TestAsUInt32Array(t *testing.T) { 9 | ba := []byte{0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00} 10 | ia := asUInt32Array(ba) 11 | if len(ia) != 2 { 12 | t.Errorf("length is %d", len(ia)) 13 | } 14 | if ia[0] != 1 { 15 | t.Errorf("unexpected error %v", ia[0]) 16 | } 17 | if ia[1] != 2 { 18 | t.Errorf("unexpected error %v", ia[1]) 19 | } 20 | } 21 | 22 | func TestAsByteArray(t *testing.T) { 23 | ia := []uint32{1, 2} 24 | ba := asByteArray(ia) 25 | if len(ba) != 8 { 26 | t.Errorf("length is %d", len(ba)) 27 | } 28 | if !bytes.Equal(ba, []byte{0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00}) { 29 | t.Errorf("unexpected error %v", ba) 30 | } 31 | } 32 | 33 | func TestBuild(t *testing.T) { 34 | keys := [][]byte{ 35 | []byte("電気"), 36 | []byte("電気通信"), 37 | []byte("電気通信大学"), 38 | []byte("電気通信大学大学院"), 39 | []byte("電気通信大学大学院大学"), 40 | } 41 | values := []int{ 42 | 0, 43 | 1, 44 | 2, 45 | 3, 46 | 4, 47 | } 48 | t.Run("Build", func(t *testing.T) { 49 | trie := NewDoubleArray() 50 | err := trie.Build(keys, values, func(state int, max int) { 51 | return 52 | }) 53 | if err != nil { 54 | t.Errorf("unexpected error: %v", err) 55 | } 56 | t.Run("CommonPrefixSearch", func(t *testing.T) { 57 | ret := trie.CommonPrefixSearch([]byte("電気通信大学大学院大学"), 0, 5) 58 | for i := 0; i < len(ret); i++ { 59 | if got, expected := ret[i][0], i; got != expected { 60 | t.Errorf("got %v, expected %v", got, expected) 61 | } 62 | if got, expected := []byte("電気通信大学大学院大学")[0:ret[i][1]], keys[i]; string(got) != string(expected) { 63 | t.Errorf("got %v, expected %v", string(got), string(expected)) 64 | } 65 | } 66 | }) 67 | t.Run("CommonPrefixSearchItr", func(t *testing.T) { 68 | it := trie.CommonPrefixSearchItr([]byte("電気通信大学大学院大学"), 0) 69 | i := 0 70 | for it.Next() { 71 | if it.Err() != nil { 72 | t.Errorf("unexpected error: %v", err) 73 | } 74 | got1, got2 := it.Get() 75 | if got1 != i { 76 | t.Errorf("got %v, expected %v", got1, i) 77 | } 78 | if string([]byte("電気通信大学大学院大学")[0:got2]) != string(keys[i]) { 79 | t.Errorf("got %v, expected %v", string([]byte("電気通信大学大学院大学")[0:got2]), string(keys[i])) 80 | } 81 | i++ 82 | } 83 | if it.Err() != nil { 84 | t.Errorf("unexpected error: %v", err) 85 | } 86 | if i != 5 { 87 | t.Errorf("no match") 88 | } 89 | }) 90 | t.Run("CommonPrefixSearchItr offset", func(t *testing.T) { 91 | it := trie.CommonPrefixSearchItr([]byte("あ電気通信大学大学院大学"), 3) 92 | i := 0 93 | for it.Next() { 94 | if it.Err() != nil { 95 | t.Errorf("unexpected error: %v", err) 96 | } 97 | got1, got2 := it.Get() 98 | if got1 != i { 99 | t.Errorf("got %v, expected %v", got1, i) 100 | } 101 | if string([]byte("あ電気通信大学大学院大学")[3:got2]) != string(keys[i]) { 102 | t.Errorf("got %v, expected %v", string([]byte("あ電気通信大学大学院大学")[3:got2]), string(keys[i])) 103 | } 104 | i++ 105 | } 106 | if it.Err() != nil && i != 5 { 107 | t.Errorf("unexpected error: %v", err) 108 | } 109 | if i != 5 { 110 | t.Errorf("no match") 111 | } 112 | }) 113 | }) 114 | } 115 | -------------------------------------------------------------------------------- /dartsclone/dawgbuilder.go: -------------------------------------------------------------------------------- 1 | package dartsclone 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | const ( 8 | initialTableSize = 1 << 10 9 | dawgRoot = 0 10 | ) 11 | 12 | type node struct { 13 | child int 14 | sibling int 15 | label byte 16 | isState bool 17 | hasSibling bool 18 | } 19 | 20 | func (n *node) reset() { 21 | n.child = 0 22 | n.sibling = 0 23 | n.label = 0 24 | n.isState = false 25 | n.hasSibling = false 26 | } 27 | 28 | func (n *node) unit() uint32 { 29 | var sibling uint32 30 | if n.hasSibling { 31 | sibling = 1 32 | } 33 | if n.label == 0 { 34 | return uint32(n.child)<<1 | sibling 35 | } 36 | var state uint32 37 | if n.isState { 38 | state = 2 39 | } 40 | return uint32(n.child)<<2 | state | sibling 41 | } 42 | 43 | type unit uint32 44 | 45 | func (u unit) child() uint32 { 46 | return uint32(u) >> 2 47 | } 48 | 49 | func (u unit) hasSibling() bool { 50 | return (uint32(u) & 1) == 1 51 | } 52 | 53 | func (u unit) value() uint32 { 54 | return uint32(u) >> 1 55 | } 56 | 57 | func (u unit) isState() bool { 58 | return (uint32(u) & 2) == 2 59 | } 60 | 61 | type stack []int 62 | 63 | func (s stack) top() int { 64 | return s[len(s)-1] 65 | } 66 | 67 | func (s stack) pop() stack { 68 | return s[:len(s)-1] 69 | } 70 | 71 | type dawgBuilder struct { 72 | nodes []node 73 | units []uint32 74 | labels []byte 75 | isIntersections *bitVector 76 | table []int 77 | nodeStack stack 78 | recycleBin stack 79 | numStates int 80 | } 81 | 82 | func newDAWGBuilder() *dawgBuilder { 83 | return &dawgBuilder{ 84 | isIntersections: newBitVector(), 85 | table: make([]int, initialTableSize, initialTableSize), 86 | } 87 | } 88 | 89 | func (b *dawgBuilder) child(id int) uint32 { 90 | return unit(b.units[id]).child() 91 | } 92 | 93 | func (b *dawgBuilder) sibling(id int) int { 94 | if unit(b.units[id]).hasSibling() { 95 | return id + 1 96 | } 97 | return 0 98 | } 99 | 100 | func (b *dawgBuilder) value(id int) uint32 { 101 | return unit(b.units[id]).value() 102 | } 103 | 104 | func (b *dawgBuilder) isLeaf(id int) bool { 105 | return b.labels[id] == 0 106 | } 107 | 108 | func (b *dawgBuilder) label(id int) byte { 109 | return b.labels[id] 110 | } 111 | 112 | func (b *dawgBuilder) isIntersection(id int) bool { 113 | return b.isIntersections.get(id) 114 | } 115 | 116 | func (b *dawgBuilder) intersectionId(id int) int { 117 | return b.isIntersections.rank(id) - 1 118 | } 119 | 120 | func (b *dawgBuilder) numIntersections() int { 121 | return b.isIntersections.numOnes 122 | } 123 | 124 | func (b *dawgBuilder) length() int { 125 | return len(b.units) 126 | } 127 | 128 | func (b *dawgBuilder) initialize() { 129 | b.appendNode() 130 | b.appendUnit() 131 | 132 | b.numStates = 1 133 | 134 | b.nodes[0].label = 0xFF 135 | b.nodeStack = append(b.nodeStack, 0) 136 | } 137 | 138 | func (b *dawgBuilder) finish() { 139 | b.flush(0) 140 | 141 | b.units[0] = b.nodes[0].unit() 142 | b.labels[0] = b.nodes[0].label 143 | 144 | b.nodes = []node{} 145 | b.table = []int{} 146 | b.nodeStack = []int{} 147 | b.recycleBin = []int{} 148 | 149 | b.isIntersections.build() 150 | } 151 | 152 | func (b *dawgBuilder) insert(key []byte, value int) error { 153 | if value < 0 { 154 | return fmt.Errorf("negative value") 155 | } 156 | keylen := len(key) 157 | if keylen == 0 { 158 | return fmt.Errorf("zero-length key") 159 | } 160 | 161 | var id int 162 | var keyPos int 163 | 164 | for ; keyPos <= keylen; keyPos++ { 165 | childId := b.nodes[id].child 166 | if childId == 0 { 167 | break 168 | } 169 | 170 | var keyLabel byte 171 | if keyPos <= keylen { 172 | keyLabel = key[keyPos] 173 | } 174 | if keyPos < keylen && keyLabel == 0 { 175 | return fmt.Errorf("invalid null character") 176 | } 177 | 178 | unitLabel := b.nodes[childId].label 179 | if keyLabel < unitLabel { 180 | return fmt.Errorf("wrong key order") 181 | } else if keyLabel > unitLabel { 182 | b.nodes[childId].hasSibling = true 183 | b.flush(childId) 184 | break 185 | } 186 | id = childId 187 | } 188 | 189 | if keyPos > keylen { 190 | return nil 191 | } 192 | 193 | for ; keyPos <= keylen; keyPos++ { 194 | var keyLabel byte 195 | if keyPos < keylen { 196 | keyLabel = key[keyPos] 197 | } 198 | childId := b.appendNode() 199 | 200 | if b.nodes[id].child == 0 { 201 | b.nodes[childId].isState = true 202 | } 203 | b.nodes[childId].sibling = b.nodes[id].child 204 | b.nodes[childId].label = keyLabel 205 | b.nodes[id].child = childId 206 | b.nodeStack = append(b.nodeStack, childId) 207 | 208 | id = childId 209 | } 210 | b.nodes[id].child = value 211 | 212 | return nil 213 | } 214 | 215 | func (b *dawgBuilder) clear() { 216 | b.nodes = []node{} 217 | b.units = []uint32{} 218 | b.labels = []byte{} 219 | b.isIntersections = nil 220 | b.table = []int{} 221 | b.nodeStack = []int{} 222 | b.recycleBin = []int{} 223 | } 224 | 225 | func (b *dawgBuilder) flush(id int) { 226 | for { 227 | nodeId := b.nodeStack.top() 228 | if nodeId == id { 229 | break 230 | } 231 | b.nodeStack = b.nodeStack.pop() 232 | 233 | if b.numStates >= len(b.table)-len(b.table)/4 { 234 | b.expandTable() 235 | } 236 | 237 | var numSiblings int 238 | for i := nodeId; i != 0; i = b.nodes[i].sibling { 239 | numSiblings++ 240 | } 241 | 242 | matchId, hashId := b.findNode(nodeId) 243 | 244 | if matchId != 0 { 245 | b.isIntersections.set(matchId, true) 246 | } else { 247 | var unitId int 248 | for i := 0; i < numSiblings; i++ { 249 | unitId = b.appendUnit() 250 | } 251 | for i := nodeId; i != 0; i = b.nodes[i].sibling { 252 | b.units[unitId] = b.nodes[i].unit() 253 | b.labels[unitId] = b.nodes[i].label 254 | unitId-- 255 | } 256 | matchId = unitId + 1 257 | b.table[hashId] = matchId 258 | b.numStates++ 259 | } 260 | 261 | var next int 262 | for i := nodeId; i != 0; i = next { 263 | next = b.nodes[i].sibling 264 | b.freeNode(i) 265 | } 266 | 267 | b.nodes[b.nodeStack.top()].child = matchId 268 | } 269 | b.nodeStack = b.nodeStack.pop() 270 | } 271 | 272 | func (b *dawgBuilder) expandTable() { 273 | tablesize := len(b.table) * 2 274 | b.table = make([]int, tablesize, tablesize) 275 | for id := 1; id < len(b.units); id++ { 276 | if b.labels[id] == 0 || unit(b.units[id]).isState() { 277 | hashId := b.findUnit(id) 278 | b.table[hashId] = id 279 | } 280 | } 281 | } 282 | 283 | func (b *dawgBuilder) findUnit(id int) int { 284 | hashId := b.hashUnit(id) % len(b.table) 285 | for ; ; hashId = (hashId + 1) % len(b.table) { 286 | unitId := b.table[hashId] 287 | if unitId == 0 { 288 | break 289 | } 290 | } 291 | return hashId 292 | } 293 | 294 | func (b *dawgBuilder) findNode(nodeId int) (int, int) { 295 | hashId := b.hashNode(nodeId) % len(b.table) 296 | for ; ; hashId = (hashId + 1) % len(b.table) { 297 | unitId := b.table[hashId] 298 | if unitId == 0 { 299 | break 300 | } 301 | 302 | if b.areEqual(nodeId, unitId) { 303 | return unitId, hashId 304 | } 305 | } 306 | return 0, hashId 307 | } 308 | 309 | func (b *dawgBuilder) areEqual(nodeId int, unitId int) bool { 310 | for i := b.nodes[nodeId].sibling; i != 0; i = b.nodes[i].sibling { 311 | if !unit(b.units[unitId]).hasSibling() { 312 | return false 313 | } 314 | unitId++ 315 | } 316 | if unit(b.units[unitId]).hasSibling() { 317 | return false 318 | } 319 | 320 | for i := nodeId; i != 0; i = b.nodes[i].sibling { 321 | if b.nodes[i].unit() != b.units[unitId] || 322 | b.nodes[i].label != b.labels[unitId] { 323 | return false 324 | } 325 | unitId-- 326 | } 327 | return true 328 | } 329 | 330 | func (b *dawgBuilder) hashUnit(id int) int { 331 | var hashValue int 332 | for ; id != 0; id++ { 333 | u := b.units[id] 334 | label := b.labels[id] 335 | hashValue ^= hash((uint32(label) << 24) ^ u) 336 | 337 | if !unit(u).hasSibling() { 338 | break 339 | } 340 | } 341 | return hashValue 342 | } 343 | 344 | func (b *dawgBuilder) hashNode(id int) int { 345 | var hashValue int 346 | for ; id != 0; id = b.nodes[id].sibling { 347 | u := b.nodes[id].unit() 348 | label := b.nodes[id].label 349 | hashValue ^= hash((uint32(label) << 24) ^ u) 350 | } 351 | return hashValue 352 | } 353 | 354 | func (b *dawgBuilder) appendUnit() int { 355 | b.isIntersections.extend() 356 | b.units = append(b.units, 0) 357 | b.labels = append(b.labels, 0) 358 | return b.isIntersections.length - 1 359 | } 360 | 361 | func (b *dawgBuilder) appendNode() int { 362 | var id int 363 | if len(b.recycleBin) == 0 { 364 | id = len(b.nodes) 365 | b.nodes = append(b.nodes, node{}) 366 | } else { 367 | id = b.recycleBin.top() 368 | b.nodes[id].reset() 369 | b.recycleBin = b.recycleBin.pop() 370 | } 371 | return id 372 | } 373 | 374 | func (b *dawgBuilder) freeNode(id int) { 375 | b.recycleBin = append(b.recycleBin, id) 376 | } 377 | 378 | func hash(key uint32) int { 379 | key = ^key + (key << 15) 380 | key = key ^ (key >> 12) 381 | key = key + (key << 2) 382 | key = key ^ (key >> 4) 383 | key = key * 2057 384 | key = key ^ (key >> 16) 385 | return int(key) 386 | } 387 | -------------------------------------------------------------------------------- /data/assets.go: -------------------------------------------------------------------------------- 1 | // +build dev 2 | 3 | package data 4 | 5 | import ( 6 | "net/http" 7 | ) 8 | 9 | var Assets http.FileSystem = http.Dir("./root") 10 | -------------------------------------------------------------------------------- /data/assets_generate.go: -------------------------------------------------------------------------------- 1 | // +build ignore 2 | 3 | package main 4 | 5 | import ( 6 | "log" 7 | 8 | "github.com/msnoigrs/gosudachi/data" 9 | "github.com/shurcooL/vfsgen" 10 | ) 11 | 12 | func main() { 13 | err := vfsgen.Generate(data.Assets, vfsgen.Options{ 14 | BuildTags: "!dev", 15 | PackageName: "data", 16 | VariableName: "Assets", 17 | }) 18 | 19 | if err != nil { 20 | log.Fatalln(err) 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /data/data.go: -------------------------------------------------------------------------------- 1 | // +build !dev 2 | 3 | package data 4 | 5 | //go:generate go run -tags=dev assets_generate.go 6 | -------------------------------------------------------------------------------- /data/root/char.def: -------------------------------------------------------------------------------- 1 | # 2 | # Japanese charcter category map 3 | # 4 | # $Id: char.def 9 2012-12-12 04:13:15Z togiso $; 5 | # 6 | 7 | ################################################################################### 8 | # 9 | # CHARACTER CATEGORY DEFINITION 10 | # 11 | # CATEGORY_NAME INVOKE GROUP LENGTH 12 | # 13 | # - CATEGORY_NAME: Name of category. you have to define DEFAULT class. 14 | # - INVOKE: 1/0: always invoke unknown word processing, evan when the word can be found in the lexicon 15 | # - GROUP: 1/0: make a new word by grouping the same chracter category 16 | # - LENGTH: n: 1 to n length new words are added 17 | # 18 | DEFAULT 0 1 0 # DEFAULT is a mandatory category! 19 | SPACE 0 1 0 20 | KANJI 0 0 2 21 | SYMBOL 1 1 0 22 | NUMERIC 1 1 0 23 | ALPHA 1 1 0 24 | HIRAGANA 0 1 2 25 | KATAKANA 1 1 2 26 | KANJINUMERIC 0 1 0 #change INVOKE 1->0 27 | GREEK 1 1 0 28 | CYRILLIC 1 1 0 29 | 30 | ################################################################################### 31 | # 32 | # CODE(UCS2) TO CATEGORY MAPPING 33 | # 34 | 35 | # SPACE 36 | 0x0020 SPACE # DO NOT REMOVE THIS LINE, 0x0020 is reserved for SPACE 37 | 0x000D SPACE 38 | 0x0009 SPACE 39 | 0x000B SPACE 40 | 0x000A SPACE 41 | 42 | # ASCII 43 | 0x0021..0x002F SYMBOL #!"#$%&'()*+,-./ 44 | 0x0030..0x0039 NUMERIC #0-9 45 | 0x003A..0x0040 SYMBOL #:;<=>?@ 46 | 0x0041..0x005A ALPHA #A-Z 47 | 0x005B..0x0060 SYMBOL #[\]^_` 48 | 0x0061..0x007A ALPHA #a-z 49 | 0x007B..0x007E SYMBOL #{|}~ 50 | 51 | # Latin 52 | 0x00A1..0x00BF SYMBOL # Latin 1 #¡->¿ 53 | 0x00C0..0x00D6 ALPHA # Latin 1 #À->Ö 54 | 0x00D7 SYMBOL # Latin 1 #× 55 | 0x00D8..0x00F6 ALPHA # Latin 1 #Ø->ö 56 | 0x00F7 SYMBOL # Latin 1 #÷ 57 | 0x00F8..0x00FF ALPHA # Latin 1 #ø->ÿ 58 | 0x0100..0x017F ALPHA # Latin Extended A 59 | 0x0180..0x0236 ALPHA # Latin Extended B 60 | 0x1E00..0x1EF9 ALPHA # Latin Extended Additional 61 | 62 | # CYRILLIC 63 | 0x0400..0x04F9 CYRILLIC #Ѐ->ӹ 64 | 0x0500..0x050F CYRILLIC # Cyrillic supplementary 65 | 66 | # GREEK 67 | 0x0374..0x03FB GREEK # Greek and Coptic #ʹ->ϻ 68 | 69 | # HIRAGANA 70 | 0x3041..0x309F HIRAGANA 71 | 72 | # KATAKANA 73 | #0x30A1..0x30FF KATAKANA 74 | 0x30A1..0x30FA KATAKANA 75 | 0x30FC..0x30FF KATAKANA 76 | 0x31F0..0x31FF KATAKANA # Small KU .. Small RO 77 | # 0x30FC KATAKANA HIRAGANA # ー 78 | 0x30A1 NOOOVBOW # Small A 79 | 0x30A3 NOOOVBOW 80 | 0x30A5 NOOOVBOW 81 | 0x30A7 NOOOVBOW 82 | 0x30A9 NOOOVBOW 83 | 0x30E3 NOOOVBOW 84 | 0x30E5 NOOOVBOW 85 | 0x30E7 NOOOVBOW 86 | 0x30EE NOOOVBOW 87 | 0x30FB..0x30FE NOOOVBOW 88 | 89 | # Half KATAKANA 90 | 0xFF66..0xFF9D KATAKANA 91 | 0xFF9E..0xFF9F KATAKANA 92 | 93 | # KANJI 94 | 0x2E80..0x2EF3 KANJI # CJK Raidcals Supplement 95 | 0x2F00..0x2FD5 KANJI 96 | 0x3005 KANJI NOOOVBOW 97 | 0x3007 KANJI 98 | 0x3400..0x4DB5 KANJI # CJK Unified Ideographs Extention 99 | #0x4E00..0x9FA5 KANJI 100 | 0x4E00..0x9FFF KANJI 101 | 0xF900..0xFA2D KANJI 102 | 0xFA30..0xFA6A KANJI 103 | 104 | 105 | # KANJI-NUMERIC (一 二 三 四 五 六 七 八 九 十 百 千 万 億 兆) 106 | 0x4E00 KANJINUMERIC KANJI 107 | 0x4E8C KANJINUMERIC KANJI 108 | 0x4E09 KANJINUMERIC KANJI 109 | 0x56DB KANJINUMERIC KANJI 110 | 0x4E94 KANJINUMERIC KANJI 111 | 0x516D KANJINUMERIC KANJI 112 | 0x4E03 KANJINUMERIC KANJI 113 | 0x516B KANJINUMERIC KANJI 114 | 0x4E5D KANJINUMERIC KANJI 115 | 0x5341 KANJINUMERIC KANJI 116 | 0x767E KANJINUMERIC KANJI 117 | 0x5343 KANJINUMERIC KANJI 118 | 0x4E07 KANJINUMERIC KANJI 119 | 0x5104 KANJINUMERIC KANJI 120 | 0x5146 KANJINUMERIC KANJI 121 | 122 | # ZENKAKU 123 | 0xFF10..0xFF19 NUMERIC 124 | 0xFF21..0xFF3A ALPHA 125 | 0xFF41..0xFF5A ALPHA 126 | 0xFF01..0xFF0F SYMBOL #!->/ 127 | 0xFF1A..0xFF20 SYMBOL #:->@ 128 | 0xFF3B..0xFF40 SYMBOL #[->` 129 | 0xFF5B..0xFF65 SYMBOL #{->・ 130 | 0xFFE0..0xFFEF SYMBOL # HalfWidth and Full width Form 131 | 132 | # OTHER SYMBOLS 133 | 0x2000..0x206F SYMBOL # General Punctuation 134 | 0x2070..0x209F NUMERIC # Superscripts and Subscripts 135 | 0x20A0..0x20CF SYMBOL # Currency Symbols 136 | 0x20D0..0x20FF SYMBOL # Combining Diaritical Marks for Symbols 137 | 0x2100..0x214F SYMBOL # Letterlike Symbols 138 | 0x2150..0x218F NUMERIC # Number forms 139 | 0x2100..0x214B SYMBOL # Letterlike Symbols 140 | 0x2190..0x21FF SYMBOL # Arrow 141 | 0x2200..0x22FF SYMBOL # Mathematical Operators 142 | 0x2300..0x23FF SYMBOL # Miscellaneuos Technical 143 | 0x2460..0x24FF SYMBOL # Enclosed NUMERICs 144 | 0x2501..0x257F SYMBOL # Box Drawing 145 | 0x2580..0x259F SYMBOL # Block Elements 146 | 0x25A0..0x25FF SYMBOL # Geometric Shapes 147 | 0x2600..0x26FE SYMBOL # Miscellaneous Symbols 148 | 0x2700..0x27BF SYMBOL # Dingbats 149 | 0x27F0..0x27FF SYMBOL # Supplemental Arrows A 150 | 0x27C0..0x27EF SYMBOL # Miscellaneous Mathematical Symbols-A 151 | 0x2800..0x28FF SYMBOL # Braille Patterns 152 | 0x2900..0x297F SYMBOL # Supplemental Arrows B 153 | 0x2B00..0x2BFF SYMBOL # Miscellaneous Symbols and Arrows 154 | 0x2A00..0x2AFF SYMBOL # Supplemental Mathematical Operators 155 | 0x3300..0x33FF SYMBOL 156 | 0x3200..0x32FE SYMBOL # ENclosed CJK Letters and Months 157 | 0x3000..0x303F SYMBOL # CJK Symbol and Punctuation 158 | 0xFE30..0xFE4F SYMBOL # CJK Compatibility Forms 159 | 0xFE50..0xFE6B SYMBOL # Small Form Variants 160 | 161 | # added 2006/3/13 162 | 0x3007 SYMBOL KANJINUMERIC 163 | 164 | # added 2018/11/30 165 | 0x309b..0x309c HIRAGANA KATAKANA # voiced/semi-voiced sound marks 166 | 167 | # END OF TABLE 168 | -------------------------------------------------------------------------------- /data/root/rewrite.def: -------------------------------------------------------------------------------- 1 | # ignore normalize list 2 | # ^{char}%n 3 | Ⅰ 4 | Ⅱ 5 | Ⅲ 6 | Ⅳ 7 | Ⅴ 8 | Ⅵ 9 | Ⅶ 10 | Ⅷ 11 | Ⅸ 12 | Ⅹ 13 | Ⅺ 14 | Ⅻ 15 | Ⅼ 16 | Ⅽ 17 | Ⅾ 18 | Ⅿ 19 | ⅰ 20 | ⅱ 21 | ⅲ 22 | ⅳ 23 | ⅴ 24 | ⅵ 25 | ⅶ 26 | ⅷ 27 | ⅸ 28 | ⅹ 29 | ⅺ 30 | ⅻ 31 | ⅼ 32 | ⅽ 33 | ⅾ 34 | ⅿ 35 | ⺀ 36 | ⺁ 37 | ⺂ 38 | ⺃ 39 | ⺄ 40 | ⺅ 41 | ⺆ 42 | ⺇ 43 | ⺈ 44 | ⺉ 45 | ⺊ 46 | ⺋ 47 | ⺌ 48 | ⺍ 49 | ⺎ 50 | ⺏ 51 | ⺐ 52 | ⺑ 53 | ⺒ 54 | ⺓ 55 | ⺔ 56 | ⺕ 57 | ⺖ 58 | ⺗ 59 | ⺘ 60 | ⺙ 61 | ⺛ 62 | ⺜ 63 | ⺝ 64 | ⺞ 65 | ⺟ 66 | ⺠ 67 | ⺡ 68 | ⺢ 69 | ⺣ 70 | ⺤ 71 | ⺥ 72 | ⺦ 73 | ⺧ 74 | ⺨ 75 | ⺩ 76 | ⺪ 77 | ⺫ 78 | ⺬ 79 | ⺭ 80 | ⺮ 81 | ⺯ 82 | ⺰ 83 | ⺱ 84 | ⺲ 85 | ⺳ 86 | ⺴ 87 | ⺵ 88 | ⺶ 89 | ⺷ 90 | ⺸ 91 | ⺹ 92 | ⺺ 93 | ⺻ 94 | ⺼ 95 | ⺽ 96 | ⺾ 97 | ⺿ 98 | ⻀ 99 | ⻁ 100 | ⻂ 101 | ⻃ 102 | ⻄ 103 | ⻅ 104 | ⻆ 105 | ⻇ 106 | ⻈ 107 | ⻉ 108 | ⻊ 109 | ⻋ 110 | ⻌ 111 | ⻍ 112 | ⻎ 113 | ⻏ 114 | ⻐ 115 | ⻑ 116 | ⻒ 117 | ⻓ 118 | ⻔ 119 | ⻕ 120 | ⻖ 121 | ⻗ 122 | ⻘ 123 | ⻙ 124 | ⻚ 125 | ⻛ 126 | ⻜ 127 | ⻝ 128 | ⻞ 129 | ⻟ 130 | ⻠ 131 | ⻡ 132 | ⻢ 133 | ⻣ 134 | ⻤ 135 | ⻥ 136 | ⻦ 137 | ⻧ 138 | ⻨ 139 | ⻩ 140 | ⻪ 141 | ⻫ 142 | ⻬ 143 | ⻭ 144 | ⻮ 145 | ⻯ 146 | ⻰ 147 | ⻱ 148 | ⻲ 149 | ⻳ 150 | ⼀ 151 | ⼁ 152 | ⼂ 153 | ⼃ 154 | ⼄ 155 | ⼅ 156 | ⼆ 157 | ⼇ 158 | ⼈ 159 | ⼉ 160 | ⼊ 161 | ⼋ 162 | ⼌ 163 | ⼍ 164 | ⼎ 165 | ⼏ 166 | ⼐ 167 | ⼑ 168 | ⼒ 169 | ⼓ 170 | ⼔ 171 | ⼕ 172 | ⼖ 173 | ⼗ 174 | ⼘ 175 | ⼙ 176 | ⼚ 177 | ⼛ 178 | ⼜ 179 | ⼝ 180 | ⼞ 181 | ⼟ 182 | ⼠ 183 | ⼡ 184 | ⼢ 185 | ⼣ 186 | ⼤ 187 | ⼥ 188 | ⼦ 189 | ⼧ 190 | ⼨ 191 | ⼩ 192 | ⼪ 193 | ⼫ 194 | ⼬ 195 | ⼭ 196 | ⼮ 197 | ⼯ 198 | ⼰ 199 | ⼱ 200 | ⼲ 201 | ⼳ 202 | ⼴ 203 | ⼵ 204 | ⼶ 205 | ⼷ 206 | ⼸ 207 | ⼹ 208 | ⼺ 209 | ⼻ 210 | ⼼ 211 | ⼽ 212 | ⼾ 213 | ⼿ 214 | ⽀ 215 | ⽁ 216 | ⽂ 217 | ⽃ 218 | ⽄ 219 | ⽅ 220 | ⽆ 221 | ⽇ 222 | ⽈ 223 | ⽉ 224 | ⽊ 225 | ⽋ 226 | ⽌ 227 | ⽍ 228 | ⽎ 229 | ⽏ 230 | ⽐ 231 | ⽑ 232 | ⽒ 233 | ⽓ 234 | ⽔ 235 | ⽕ 236 | ⽖ 237 | ⽗ 238 | ⽘ 239 | ⽙ 240 | ⽚ 241 | ⽛ 242 | ⽜ 243 | ⽝ 244 | ⽞ 245 | ⽟ 246 | ⽠ 247 | ⽡ 248 | ⽢ 249 | ⽣ 250 | ⽤ 251 | ⽥ 252 | ⽦ 253 | ⽧ 254 | ⽨ 255 | ⽩ 256 | ⽪ 257 | ⽫ 258 | ⽬ 259 | ⽭ 260 | ⽮ 261 | ⽯ 262 | ⽰ 263 | ⽱ 264 | ⽲ 265 | ⽳ 266 | ⽴ 267 | ⽵ 268 | ⽶ 269 | ⽷ 270 | ⽸ 271 | ⽹ 272 | ⽺ 273 | ⽻ 274 | ⽼ 275 | ⽽ 276 | ⽾ 277 | ⽿ 278 | ⾀ 279 | ⾁ 280 | ⾂ 281 | ⾃ 282 | ⾄ 283 | ⾅ 284 | ⾆ 285 | ⾇ 286 | ⾈ 287 | ⾉ 288 | ⾊ 289 | ⾋ 290 | ⾌ 291 | ⾍ 292 | ⾎ 293 | ⾏ 294 | ⾐ 295 | ⾑ 296 | ⾒ 297 | ⾓ 298 | ⾔ 299 | ⾕ 300 | ⾖ 301 | ⾗ 302 | ⾘ 303 | ⾙ 304 | ⾚ 305 | ⾛ 306 | ⾜ 307 | ⾝ 308 | ⾞ 309 | ⾟ 310 | ⾠ 311 | ⾡ 312 | ⾢ 313 | ⾣ 314 | ⾤ 315 | ⾥ 316 | ⾦ 317 | ⾧ 318 | ⾨ 319 | ⾩ 320 | ⾪ 321 | ⾫ 322 | ⾬ 323 | ⾭ 324 | ⾮ 325 | ⾯ 326 | ⾰ 327 | ⾱ 328 | ⾲ 329 | ⾳ 330 | ⾴ 331 | ⾵ 332 | ⾶ 333 | ⾷ 334 | ⾸ 335 | ⾹ 336 | ⾺ 337 | ⾻ 338 | ⾼ 339 | ⾽ 340 | ⾾ 341 | ⾿ 342 | ⿀ 343 | ⿁ 344 | ⿂ 345 | ⿃ 346 | ⿄ 347 | ⿅ 348 | ⿆ 349 | ⿇ 350 | ⿈ 351 | ⿉ 352 | ⿊ 353 | ⿋ 354 | ⿌ 355 | ⿍ 356 | ⿎ 357 | ⿏ 358 | ⿐ 359 | ⿑ 360 | ⿒ 361 | ⿓ 362 | ⿔ 363 | ⿕ 364 | 豈 365 | 更 366 | 車 367 | 賈 368 | 滑 369 | 串 370 | 句 371 | 龜 372 | 龜 373 | 契 374 | 金 375 | 喇 376 | 奈 377 | 懶 378 | 癩 379 | 羅 380 | 蘿 381 | 螺 382 | 裸 383 | 邏 384 | 樂 385 | 洛 386 | 烙 387 | 珞 388 | 落 389 | 酪 390 | 駱 391 | 亂 392 | 卵 393 | 欄 394 | 爛 395 | 蘭 396 | 鸞 397 | 嵐 398 | 濫 399 | 藍 400 | 襤 401 | 拉 402 | 臘 403 | 蠟 404 | 廊 405 | 朗 406 | 浪 407 | 狼 408 | 郎 409 | 來 410 | 冷 411 | 勞 412 | 擄 413 | 櫓 414 | 爐 415 | 盧 416 | 老 417 | 蘆 418 | 虜 419 | 路 420 | 露 421 | 魯 422 | 鷺 423 | 碌 424 | 祿 425 | 綠 426 | 菉 427 | 錄 428 | 鹿 429 | 論 430 | 壟 431 | 弄 432 | 籠 433 | 聾 434 | 牢 435 | 磊 436 | 賂 437 | 雷 438 | 壘 439 | 屢 440 | 樓 441 | 淚 442 | 漏 443 | 累 444 | 縷 445 | 陋 446 | 勒 447 | 肋 448 | 凜 449 | 凌 450 | 稜 451 | 綾 452 | 菱 453 | 陵 454 | 讀 455 | 拏 456 | 樂 457 | 諾 458 | 丹 459 | 寧 460 | 怒 461 | 率 462 | 異 463 | 北 464 | 磻 465 | 便 466 | 復 467 | 不 468 | 泌 469 | 數 470 | 索 471 | 參 472 | 塞 473 | 省 474 | 葉 475 | 說 476 | 殺 477 | 辰 478 | 沈 479 | 拾 480 | 若 481 | 掠 482 | 略 483 | 亮 484 | 兩 485 | 凉 486 | 梁 487 | 糧 488 | 良 489 | 諒 490 | 量 491 | 勵 492 | 呂 493 | 女 494 | 廬 495 | 旅 496 | 濾 497 | 礪 498 | 閭 499 | 驪 500 | 麗 501 | 黎 502 | 力 503 | 曆 504 | 歷 505 | 轢 506 | 年 507 | 憐 508 | 戀 509 | 撚 510 | 漣 511 | 煉 512 | 璉 513 | 秊 514 | 練 515 | 聯 516 | 輦 517 | 蓮 518 | 連 519 | 鍊 520 | 列 521 | 劣 522 | 咽 523 | 烈 524 | 裂 525 | 說 526 | 廉 527 | 念 528 | 捻 529 | 殮 530 | 簾 531 | 獵 532 | 令 533 | 囹 534 | 寧 535 | 嶺 536 | 怜 537 | 玲 538 | 瑩 539 | 羚 540 | 聆 541 | 鈴 542 | 零 543 | 靈 544 | 領 545 | 例 546 | 禮 547 | 醴 548 | 隸 549 | 惡 550 | 了 551 | 僚 552 | 寮 553 | 尿 554 | 料 555 | 樂 556 | 燎 557 | 療 558 | 蓼 559 | 遼 560 | 龍 561 | 暈 562 | 阮 563 | 劉 564 | 杻 565 | 柳 566 | 流 567 | 溜 568 | 琉 569 | 留 570 | 硫 571 | 紐 572 | 類 573 | 六 574 | 戮 575 | 陸 576 | 倫 577 | 崙 578 | 淪 579 | 輪 580 | 律 581 | 慄 582 | 栗 583 | 率 584 | 隆 585 | 利 586 | 吏 587 | 履 588 | 易 589 | 李 590 | 梨 591 | 泥 592 | 理 593 | 痢 594 | 罹 595 | 裏 596 | 裡 597 | 里 598 | 離 599 | 匿 600 | 溺 601 | 吝 602 | 燐 603 | 璘 604 | 藺 605 | 隣 606 | 鱗 607 | 麟 608 | 林 609 | 淋 610 | 臨 611 | 立 612 | 笠 613 | 粒 614 | 狀 615 | 炙 616 | 識 617 | 什 618 | 茶 619 | 刺 620 | 切 621 | 度 622 | 拓 623 | 糖 624 | 宅 625 | 洞 626 | 暴 627 | 輻 628 | 行 629 | 降 630 | 見 631 | 廓 632 | 兀 633 | 嗀 634 | 﨎 635 | 﨏 636 | 塚 637 | 﨑 638 | 晴 639 | 﨓 640 | 﨔 641 | 凞 642 | 猪 643 | 益 644 | 礼 645 | 神 646 | 祥 647 | 福 648 | 靖 649 | 精 650 | 羽 651 | 﨟 652 | 蘒 653 | 﨡 654 | 諸 655 | 﨣 656 | 﨤 657 | 逸 658 | 都 659 | 﨧 660 | 﨨 661 | 﨩 662 | 飯 663 | 飼 664 | 館 665 | 鶴 666 | 郞 667 | 隷 668 | 侮 669 | 僧 670 | 免 671 | 勉 672 | 勤 673 | 卑 674 | 喝 675 | 嘆 676 | 器 677 | 塀 678 | 墨 679 | 層 680 | 屮 681 | 悔 682 | 慨 683 | 憎 684 | 懲 685 | 敏 686 | 既 687 | 暑 688 | 梅 689 | 海 690 | 渚 691 | 漢 692 | 煮 693 | 爫 694 | 琢 695 | 碑 696 | 社 697 | 祉 698 | 祈 699 | 祐 700 | 祖 701 | 祝 702 | 禍 703 | 禎 704 | 穀 705 | 突 706 | 節 707 | 練 708 | 縉 709 | 繁 710 | 署 711 | 者 712 | 臭 713 | 艹 714 | 艹 715 | 著 716 | 褐 717 | 視 718 | 謁 719 | 謹 720 | 賓 721 | 贈 722 | 辶 723 | 逸 724 | 難 725 | 響 726 | 頻 727 | 恵 728 | 𤋮 729 | 舘 730 | 並 731 | 况 732 | 全 733 | 侀 734 | 充 735 | 冀 736 | 勇 737 | 勺 738 | 喝 739 | 啕 740 | 喙 741 | 嗢 742 | 塚 743 | 墳 744 | 奄 745 | 奔 746 | 婢 747 | 嬨 748 | 廒 749 | 廙 750 | 彩 751 | 徭 752 | 惘 753 | 慎 754 | 愈 755 | 憎 756 | 慠 757 | 懲 758 | 戴 759 | 揄 760 | 搜 761 | 摒 762 | 敖 763 | 晴 764 | 朗 765 | 望 766 | 杖 767 | 歹 768 | 殺 769 | 流 770 | 滛 771 | 滋 772 | 漢 773 | 瀞 774 | 煮 775 | 瞧 776 | 爵 777 | 犯 778 | 猪 779 | 瑱 780 | 甆 781 | 画 782 | 瘝 783 | 瘟 784 | 益 785 | 盛 786 | 直 787 | 睊 788 | 着 789 | 磌 790 | 窱 791 | 節 792 | 类 793 | 絛 794 | 練 795 | 缾 796 | 者 797 | 荒 798 | 華 799 | 蝹 800 | 襁 801 | 覆 802 | 視 803 | 調 804 | 諸 805 | 請 806 | 謁 807 | 諾 808 | 諭 809 | 謹 810 | 變 811 | 贈 812 | 輸 813 | 遲 814 | 醙 815 | 鉶 816 | 陼 817 | 難 818 | 靖 819 | 韛 820 | 響 821 | 頋 822 | 頻 823 | 鬒 824 | 龜 825 | 𢡊 826 | 𢡄 827 | 𣏕 828 | 㮝 829 | 䀘 830 | 䀹 831 | 𥉉 832 | 𥳐 833 | 𧻓 834 | 齃 835 | 龎 836 | ゛ 837 | ゜ 838 | 839 | # replace char list 840 | # ^{before}\s{after}%n 841 | ヴ ヴ 842 | ガ ガ 843 | ギ ギ 844 | グ グ 845 | ゲ ゲ 846 | ゴ ゴ 847 | ザ ザ 848 | ジ ジ 849 | ズ ズ 850 | ゼ ゼ 851 | ゾ ゾ 852 | ダ ダ 853 | ヂ ヂ 854 | ヅ ヅ 855 | デ デ 856 | ド ド 857 | バ バ 858 | ビ ビ 859 | ブ ブ 860 | ベ ベ 861 | ボ ボ 862 | パ パ 863 | ピ ピ 864 | プ プ 865 | ペ ペ 866 | ポ ポ 867 | ゔ ゔ 868 | が が 869 | ぎ ぎ 870 | ぐ ぐ 871 | げ げ 872 | ご ご 873 | ざ ざ 874 | じ じ 875 | ず ず 876 | ぜ ぜ 877 | ぞ ぞ 878 | だ だ 879 | ぢ ぢ 880 | づ づ 881 | で で 882 | ど ど 883 | ば ば 884 | び び 885 | ぶ ぶ 886 | べ べ 887 | ぼ ぼ 888 | ぱ ぱ 889 | ぴ ぴ 890 | ぷ ぷ 891 | ぺ ぺ 892 | ぽ ぽ 893 | ヴ ヴ 894 | ガ ガ 895 | ギ ギ 896 | グ グ 897 | ゲ ゲ 898 | ゴ ゴ 899 | ザ ザ 900 | ジ ジ 901 | ズ ズ 902 | ゼ ゼ 903 | ゾ ゾ 904 | ダ ダ 905 | ヂ ヂ 906 | ヅ ヅ 907 | デ デ 908 | ド ド 909 | バ バ 910 | ビ ビ 911 | ブ ブ 912 | ベ ベ 913 | ボ ボ 914 | パ パ 915 | ピ ピ 916 | プ プ 917 | ペ ペ 918 | ポ ポ 919 | ゔ ゔ 920 | が が 921 | ぎ ぎ 922 | ぐ ぐ 923 | げ げ 924 | ご ご 925 | ざ ざ 926 | じ じ 927 | ず ず 928 | ぜ ぜ 929 | ぞ ぞ 930 | だ だ 931 | ぢ ぢ 932 | づ づ 933 | で で 934 | ど ど 935 | ば ば 936 | び び 937 | ぶ ぶ 938 | べ べ 939 | ぼ ぼ 940 | ぱ ぱ 941 | ぴ ぴ 942 | ぷ ぷ 943 | ぺ ぺ 944 | ぽ ぽ 945 | ヴ ヴ 946 | ガ ガ 947 | ギ ギ 948 | グ グ 949 | ゲ ゲ 950 | ゴ ゴ 951 | ザ ザ 952 | ジ ジ 953 | ズ ズ 954 | ゼ ゼ 955 | ゾ ゾ 956 | ダ ダ 957 | ヂ ヂ 958 | ヅ ヅ 959 | デ デ 960 | ド ド 961 | バ バ 962 | ビ ビ 963 | ブ ブ 964 | ベ ベ 965 | ボ ボ 966 | パ パ 967 | ピ ピ 968 | プ プ 969 | ペ ペ 970 | ポ ポ 971 | う゛ ゔ 972 | か゛ が 973 | き゛ ぎ 974 | く゛ ぐ 975 | け゛ げ 976 | こ゛ ご 977 | さ゛ ざ 978 | し゛ じ 979 | す゛ ず 980 | せ゛ ぜ 981 | そ゛ ぞ 982 | た゛ だ 983 | ち゛ ぢ 984 | つ゛ づ 985 | て゛ で 986 | と゛ ど 987 | は゛ ば 988 | ひ゛ び 989 | ふ゛ ぶ 990 | へ゛ べ 991 | ほ゛ ぼ 992 | は゜ ぱ 993 | ひ゜ ぴ 994 | ふ゜ ぷ 995 | へ゜ ぺ 996 | ほ゜ ぽ 997 | ウ゛ ヴ 998 | カ゛ ガ 999 | キ゛ ギ 1000 | ク゛ グ 1001 | ケ゛ ゲ 1002 | コ゛ ゴ 1003 | サ゛ ザ 1004 | シ゛ ジ 1005 | ス゛ ズ 1006 | セ゛ ゼ 1007 | ソ゛ ゾ 1008 | タ゛ ダ 1009 | チ゛ ヂ 1010 | ツ゛ ヅ 1011 | テ゛ デ 1012 | ト゛ ド 1013 | ハ゛ バ 1014 | ヒ゛ ビ 1015 | フ゛ ブ 1016 | ヘ゛ ベ 1017 | ホ゛ ボ 1018 | ハ゜ パ 1019 | ヒ゜ ピ 1020 | フ゜ プ 1021 | ヘ゜ ペ 1022 | ホ゜ ポ 1023 | -------------------------------------------------------------------------------- /data/root/sudachi.json: -------------------------------------------------------------------------------- 1 | { 2 | "systemDict" : "system_core.dic", 3 | "inputTextPlugin" : [ 4 | { "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin" }, 5 | { "class" : "com.worksap.nlp.sudachi.ProlongedSoundMarkInputTextPlugin", 6 | "prolongedSoundMarks": ["ー", "-", "⁓", "〜", "〰"], 7 | "replacementSymbol": "ー"} 8 | ], 9 | "oovProviderPlugin" : [ 10 | { "class" : "com.worksap.nlp.sudachi.MeCabOovProviderPlugin" }, 11 | { "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin", 12 | "oovPOS" : [ "補助記号", "一般", "*", "*", "*", "*" ], 13 | "leftId" : 5968, 14 | "rightId" : 5968, 15 | "cost" : 3857 } 16 | ], 17 | "pathRewritePlugin" : [ 18 | { "class" : "com.worksap.nlp.sudachi.JoinNumericPlugin", 19 | "joinKanjiNumeric" : true }, 20 | { "class" : "com.worksap.nlp.sudachi.JoinKatakanaOovPlugin", 21 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], 22 | "minLength" : 3 23 | } 24 | ] 25 | } 26 | -------------------------------------------------------------------------------- /data/root/sudachi_fulldict.json: -------------------------------------------------------------------------------- 1 | { 2 | "systemDict" : "system_full.dic", 3 | "inputTextPlugin" : [ 4 | { "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin" }, 5 | { "class" : "com.worksap.nlp.sudachi.ProlongedSoundMarkInputTextPlugin", 6 | "prolongedSoundMarks": ["ー", "-", "⁓", "〜", "〰"], 7 | "replacementSymbol": "ー"} 8 | ], 9 | "oovProviderPlugin" : [ 10 | { "class" : "com.worksap.nlp.sudachi.MeCabOovProviderPlugin" }, 11 | { "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin", 12 | "oovPOS" : [ "補助記号", "一般", "*", "*", "*", "*" ], 13 | "leftId" : 5968, 14 | "rightId" : 5968, 15 | "cost" : 3857 } 16 | ], 17 | "pathRewritePlugin" : [ 18 | { "class" : "com.worksap.nlp.sudachi.JoinNumericPlugin", 19 | "joinKanjiNumeric" : true }, 20 | { "class" : "com.worksap.nlp.sudachi.JoinKatakanaOovPlugin", 21 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], 22 | "minLength" : 3 23 | } 24 | ] 25 | } 26 | -------------------------------------------------------------------------------- /data/root/unk.def: -------------------------------------------------------------------------------- 1 | DEFAULT,5968,5968,3857,補助記号,一般,*,*,*,* 2 | SPACE,5966,5966,6056,空白,*,*,*,*,* 3 | KANJI,5139,5139,14657,名詞,普通名詞,一般,*,*,* 4 | KANJI,5129,5129,17308,名詞,普通名詞,サ変可能,*,*,* 5 | KANJI,4785,4785,18181,名詞,固有名詞,一般,*,*,* 6 | KANJI,4787,4787,18086,名詞,固有名詞,人名,一般,*,* 7 | KANJI,4791,4791,19198,名詞,固有名詞,地名,一般,*,* 8 | SYMBOL,5129,5129,17094,名詞,普通名詞,サ変可能,*,*,* 9 | NUMERIC,4794,4794,12450,名詞,数詞,*,*,*,* 10 | ALPHA,5139,5139,11633,名詞,普通名詞,一般,*,*,* 11 | ALPHA,4785,4785,13620,名詞,固有名詞,一般,*,*,* 12 | ALPHA,4787,4787,14228,名詞,固有名詞,人名,一般,*,* 13 | ALPHA,4791,4791,15793,名詞,固有名詞,地名,一般,*,* 14 | ALPHA,5687,5687,15246,感動詞,一般,*,*,*,* 15 | HIRAGANA,5139,5139,16012,名詞,普通名詞,一般,*,*,* 16 | HIRAGANA,5129,5129,20012,名詞,普通名詞,サ変可能,*,*,* 17 | HIRAGANA,4785,4785,18282,名詞,固有名詞,一般,*,*,* 18 | HIRAGANA,4787,4787,18269,名詞,固有名詞,人名,一般,*,* 19 | HIRAGANA,4791,4791,20474,名詞,固有名詞,地名,一般,*,* 20 | HIRAGANA,5687,5687,17786,感動詞,一般,*,*,*,* 21 | KATAKANA,5139,5139,10980,名詞,普通名詞,一般,*,*,* 22 | KATAKANA,5129,5129,14802,名詞,普通名詞,サ変可能,*,*,* 23 | KATAKANA,4785,4785,13451,名詞,固有名詞,一般,*,*,* 24 | KATAKANA,4787,4787,13759,名詞,固有名詞,人名,一般,*,* 25 | KATAKANA,4791,4791,14554,名詞,固有名詞,地名,一般,*,* 26 | KATAKANA,5687,5687,15272,感動詞,一般,*,*,*,* 27 | KANJINUMERIC,4794,4794,14170,名詞,数詞,*,*,*,* 28 | GREEK,5139,5139,11051,名詞,普通名詞,一般,*,*,* 29 | GREEK,4785,4785,13353,名詞,固有名詞,一般,*,*,* 30 | GREEK,4787,4787,13671,名詞,固有名詞,人名,一般,*,* 31 | GREEK,4791,4791,14862,名詞,固有名詞,地名,一般,*,* 32 | CYRILLIC,5139,5139,11140,名詞,普通名詞,一般,*,*,* 33 | CYRILLIC,4785,4785,13174,名詞,固有名詞,一般,*,*,* 34 | CYRILLIC,4787,4787,13495,名詞,固有名詞,人名,一般,*,* 35 | CYRILLIC,4791,4791,14700,名詞,固有名詞,地名,一般,*,* 36 | -------------------------------------------------------------------------------- /definputtextplugin.go: -------------------------------------------------------------------------------- 1 | package gosudachi 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "os" 7 | "strings" 8 | "unicode" 9 | "unicode/utf8" 10 | 11 | "github.com/msnoigrs/gosudachi/data" 12 | "github.com/msnoigrs/gosudachi/internal/lnreader" 13 | "golang.org/x/text/unicode/norm" 14 | ) 15 | 16 | type DefaultInputTextPluginConfig struct { 17 | RewriteDef string 18 | } 19 | 20 | type DefaultInputTextPlugin struct { 21 | config *DefaultInputTextPluginConfig 22 | rewriteDef string 23 | ignoreNormalizeMap map[rune]bool 24 | keyLengths map[rune]int 25 | replaceCharMap map[string][]rune 26 | } 27 | 28 | func NewDefaultInputTextPlugin(config *DefaultInputTextPluginConfig) *DefaultInputTextPlugin { 29 | if config == nil { 30 | config = &DefaultInputTextPluginConfig{} 31 | } 32 | return &DefaultInputTextPlugin{ 33 | config: config, 34 | ignoreNormalizeMap: map[rune]bool{}, 35 | keyLengths: map[rune]int{}, 36 | replaceCharMap: map[string][]rune{}, 37 | } 38 | } 39 | 40 | func (p *DefaultInputTextPlugin) GetConfigStruct() interface{} { 41 | if p.config == nil { 42 | p.config = &DefaultInputTextPluginConfig{} 43 | } 44 | return p.config 45 | } 46 | 47 | func (p *DefaultInputTextPlugin) SetUp() error { 48 | if p.rewriteDef == "" { 49 | p.rewriteDef = p.config.RewriteDef 50 | } 51 | p.config = nil 52 | if p.ignoreNormalizeMap == nil { 53 | p.ignoreNormalizeMap = map[rune]bool{} 54 | } 55 | if p.keyLengths == nil { 56 | p.keyLengths = map[rune]int{} 57 | } 58 | if p.replaceCharMap == nil { 59 | p.replaceCharMap = map[string][]rune{} 60 | } 61 | err := p.readRewriteLists(p.rewriteDef) 62 | if err != nil { 63 | return fmt.Errorf("DefaultInputTextPlugin: %s", err) 64 | } 65 | return nil 66 | } 67 | 68 | func (p *DefaultInputTextPlugin) getKeyLength(key rune, def int) int { 69 | l, ok := p.keyLengths[key] 70 | if !ok { 71 | return def 72 | } 73 | return l 74 | } 75 | 76 | func (p *DefaultInputTextPlugin) Rewrite(builder *InputTextBuilder) error { 77 | runes := builder.GetText() 78 | runelen := len(runes) 79 | 80 | utf8buf := make([]byte, 8, 8) 81 | 82 | offset := 0 83 | nextOffset := 0 84 | TEXTLOOP: 85 | for i := 0; i < runelen; i++ { 86 | offset += nextOffset 87 | nextOffset = 0 88 | // 1. replace char without normalize 89 | for l := minInt(p.getKeyLength(runes[i], 0), runelen-i); l > 0; l-- { 90 | replace, ok := p.replaceCharMap[string(runes[i:i+l])] 91 | if ok { 92 | builder.Replace(i+offset, i+l+offset, replace) 93 | nextOffset += len(replace) - l 94 | i += l - 1 95 | continue TEXTLOOP 96 | } 97 | } 98 | 99 | // 2. normalize 100 | original := runes[i] 101 | 102 | // 2-1. capital alphabet (not only latin but greek, cyrillic, etc) -> small 103 | lower := unicode.ToLower(original) 104 | var replace []rune 105 | _, ok := p.ignoreNormalizeMap[lower] 106 | if ok { 107 | if original == lower { 108 | continue 109 | } 110 | replace = []rune{lower} 111 | } else { 112 | // 2-2. normalize (except in ignoreNormalize) 113 | // e.g. full-width alphabet -> half-width / ligature / etc. 114 | size := utf8.EncodeRune(utf8buf, lower) 115 | replace = []rune(string(norm.NFKC.Bytes(utf8buf[:size]))) 116 | } 117 | nextOffset = len(replace) - 1 118 | if len(replace) != 1 || original != replace[0] { 119 | builder.Replace(i+offset, i+1+offset, replace) 120 | } 121 | } 122 | return nil 123 | } 124 | 125 | func minInt(a, b int) int { 126 | if a < b { 127 | return a 128 | } 129 | return b 130 | } 131 | 132 | func (p *DefaultInputTextPlugin) readRewriteLists(rewriteDef string) error { 133 | var rewriteDefReader io.Reader 134 | if rewriteDef != "" { 135 | rewriteDefFd, err := os.OpenFile(rewriteDef, os.O_RDONLY, 0644) 136 | if err != nil { 137 | return fmt.Errorf("DefaultInputTextPlugin: %s: %s", err, rewriteDef) 138 | } 139 | defer rewriteDefFd.Close() 140 | rewriteDefReader = rewriteDefFd 141 | } else { 142 | rewiteDefF, err := data.Assets.Open("rewrite.def") 143 | if err != nil { 144 | return fmt.Errorf("DefaultInputTextPlugin: %s: (data.Assets)rewrite.def", err) 145 | } 146 | defer rewiteDefF.Close() 147 | rewriteDefReader = rewiteDefF 148 | } 149 | 150 | r := lnreader.NewLineNumberReader(rewriteDefReader) 151 | for { 152 | line, err := r.ReadLine() 153 | if err == io.EOF { 154 | break 155 | } 156 | if err != nil { 157 | return fmt.Errorf("DefaultInputTextPlugin: %s", err) 158 | } 159 | if lnreader.IsSkipLine(line) { 160 | continue 161 | } 162 | cols := strings.Fields(string(line)) 163 | if len(cols) == 1 { 164 | // ignored normalize list 165 | key := []rune(cols[0]) 166 | if len(key) != 1 { 167 | return fmt.Errorf("DefaultInputTextPlugin: %s is already defined at line %d", cols[0], r.NumLine) 168 | } 169 | p.ignoreNormalizeMap[key[0]] = true 170 | } else if len(cols) == 2 { 171 | // replace char list 172 | _, ok := p.replaceCharMap[cols[0]] 173 | if ok { 174 | return fmt.Errorf("DefaultInputTextPlugin: %s is already defined at line %d", cols[0], r.NumLine) 175 | } 176 | key := []rune(cols[0]) 177 | if p.getKeyLength(key[0], -1) < len(key) { 178 | // store the longest key length 179 | p.keyLengths[key[0]] = len(key) 180 | } 181 | p.replaceCharMap[cols[0]] = []rune(cols[1]) 182 | } else { 183 | return fmt.Errorf("DefaultInputTextPlugin: invalid format at line %d", r.NumLine) 184 | } 185 | } 186 | return nil 187 | } 188 | -------------------------------------------------------------------------------- /dicbuilder/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "flag" 6 | "fmt" 7 | "os" 8 | "time" 9 | 10 | "github.com/msnoigrs/gosudachi/dictionary" 11 | "golang.org/x/text/language" 12 | "golang.org/x/text/message" 13 | ) 14 | 15 | func main() { 16 | flag.Usage = func() { 17 | fmt.Fprintf(os.Stderr, `Usage of %s: 18 | %s -o file -m file [-d description] [-j] file1 [file2 ...] 19 | 20 | Options: 21 | `, os.Args[0], os.Args[0]) 22 | flag.PrintDefaults() 23 | } 24 | 25 | var ( 26 | outputpath string 27 | matrixpath string 28 | description string 29 | utf16string bool 30 | ) 31 | flag.StringVar(&outputpath, "o", "", "output to file") 32 | flag.StringVar(&matrixpath, "m", "", "connection matrix file") 33 | flag.StringVar(&description, "d", "", "comment") 34 | flag.BoolVar(&utf16string, "j", false, "use UTF-16 string") 35 | 36 | flag.Parse() 37 | 38 | if outputpath == "" || matrixpath == "" || len(flag.Args()) == 0 { 39 | flag.Usage() 40 | os.Exit(1) 41 | } 42 | 43 | dh := dictionary.NewDictionaryHeader( 44 | dictionary.SystemDictVersion, 45 | time.Now().Unix(), 46 | description, 47 | ) 48 | 49 | hb, err := dh.ToBytes() 50 | if err != nil { 51 | fmt.Fprintf(os.Stderr, "%s\n", err) 52 | os.Exit(1) 53 | } 54 | 55 | outputWriter, err := os.OpenFile(outputpath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) 56 | if err != nil { 57 | fmt.Fprintf(os.Stderr, "%s: %s\n", outputpath, err) 58 | os.Exit(1) 59 | } 60 | defer outputWriter.Close() 61 | 62 | bufout := bufio.NewWriter(outputWriter) 63 | n, err := bufout.Write(hb) 64 | if err != nil { 65 | fmt.Fprintf(os.Stderr, "fail to write header: %s\n", err) 66 | os.Exit(1) 67 | } 68 | err = bufout.Flush() 69 | if err != nil { 70 | fmt.Fprintf(os.Stderr, "fail to write header: %s\n", err) 71 | os.Exit(1) 72 | } 73 | 74 | matrixReader, err := os.OpenFile(matrixpath, os.O_RDONLY, 0644) 75 | if err != nil { 76 | fmt.Fprintf(os.Stderr, "%s: %s\n", matrixpath, err) 77 | os.Exit(1) 78 | } 79 | defer matrixReader.Close() 80 | 81 | dicbuilder := dictionary.NewDictionaryBuilder(int64(n), nil, utf16string) 82 | store := dictionary.NewPosTable() 83 | 84 | fmt.Fprint(os.Stderr, "reading the source file...") 85 | for _, lexiconpath := range flag.Args() { 86 | err := build(dicbuilder, store, lexiconpath) 87 | if err != nil { 88 | fmt.Fprintf(os.Stderr, "%s: %s\n", lexiconpath, err) 89 | os.Exit(1) 90 | } 91 | } 92 | p := message.NewPrinter(language.English) 93 | p.Fprintf(os.Stderr, " %d words\n", dicbuilder.EntrySize()) 94 | 95 | err = dicbuilder.WriteGrammar(store, matrixReader, outputWriter) 96 | if err != nil { 97 | fmt.Fprintf(os.Stderr, "fail to write grammar: %s\n", err) 98 | os.Exit(1) 99 | } 100 | 101 | err = dicbuilder.WriteLexicon(outputWriter, store) 102 | if err != nil { 103 | fmt.Fprintf(os.Stderr, "fail to write lexicon: %s\n", err) 104 | os.Exit(1) 105 | } 106 | } 107 | 108 | func build(dicbuilder *dictionary.DictionaryBuilder, store dictionary.PosIdStore, lexiconpath string) error { 109 | lexiconReader, err := os.OpenFile(lexiconpath, os.O_RDONLY, 0644) 110 | if err != nil { 111 | return err 112 | } 113 | defer lexiconReader.Close() 114 | 115 | err = dicbuilder.BuildLexicon(store, lexiconReader) 116 | if err != nil { 117 | return err 118 | } 119 | return nil 120 | } 121 | -------------------------------------------------------------------------------- /dicconv/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "bytes" 6 | "flag" 7 | "fmt" 8 | "io" 9 | "os" 10 | "path/filepath" 11 | 12 | "github.com/msnoigrs/gosudachi/dictionary" 13 | "golang.org/x/text/language" 14 | "golang.org/x/text/message" 15 | ) 16 | 17 | func main() { 18 | flag.Usage = func() { 19 | fmt.Fprintf(os.Stderr, `Usage of %s: 20 | %s [-o file] [-j] file 21 | 22 | Options: 23 | `, os.Args[0], os.Args[0]) 24 | flag.PrintDefaults() 25 | } 26 | 27 | var ( 28 | outputfile string 29 | utf16string bool 30 | ) 31 | flag.StringVar(&outputfile, "o", "", "output to file") 32 | flag.BoolVar(&utf16string, "j", false, "from UTF-8 to UTF-16") 33 | 34 | flag.Parse() 35 | 36 | if len(flag.Args()) == 0 { 37 | flag.Usage() 38 | os.Exit(1) 39 | } 40 | 41 | if outputfile == "" { 42 | if utf16string { 43 | outputfile = "out_utf16.dic" 44 | } else { 45 | outputfile = "out_utf8.dic" 46 | } 47 | } 48 | if !filepath.IsAbs(outputfile) { 49 | var err error 50 | outputfile, err = filepath.Abs(outputfile) 51 | if err != nil { 52 | fmt.Fprintln(os.Stderr, err) 53 | os.Exit(1) 54 | } 55 | } 56 | outputfd, err := os.OpenFile(outputfile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) 57 | if err != nil { 58 | fmt.Fprintf(os.Stderr, "%s: %s\n", outputfile, err) 59 | os.Exit(1) 60 | } 61 | defer outputfd.Close() 62 | bufiooutput := bufio.NewWriter(outputfd) 63 | 64 | args := flag.Args() 65 | fromdic, err := dictionary.NewBinaryDictionary(args[0], !utf16string) 66 | if err != nil { 67 | fmt.Fprintln(os.Stderr, err) 68 | } 69 | defer fromdic.Close() 70 | 71 | hb, err := fromdic.Header.ToBytes() 72 | if err != nil { 73 | fmt.Fprintln(os.Stderr, err) 74 | os.Exit(1) 75 | } 76 | 77 | var offset int64 78 | n, err := bufiooutput.Write(hb) 79 | if err != nil { 80 | fmt.Fprintf(os.Stderr, "fail to write header: %s\n", err) 81 | os.Exit(1) 82 | } 83 | offset = int64(n) 84 | 85 | var n64 int64 86 | p := message.NewPrinter(language.English) 87 | if fromdic.Grammar != nil { 88 | fmt.Fprint(os.Stderr, "writting the POS table...") 89 | buffer := bytes.NewBuffer([]byte{}) 90 | err = fromdic.Grammar.WritePOSTableTo(buffer, utf16string) 91 | if err != nil { 92 | fmt.Fprintln(os.Stderr, err) 93 | os.Exit(1) 94 | } 95 | n64, err = buffer.WriteTo(bufiooutput) 96 | if err != nil { 97 | fmt.Fprintln(os.Stderr, err) 98 | os.Exit(1) 99 | } 100 | p.Fprintf(os.Stderr, " %d bytes\n", n64) 101 | buffer.Reset() 102 | offset += n64 103 | 104 | fmt.Fprint(os.Stderr, "writting the connection matrix...") 105 | n, err = fromdic.Grammar.WriteConnMatrixTo(bufiooutput) 106 | if err != nil { 107 | fmt.Fprintln(os.Stderr, err) 108 | os.Exit(1) 109 | } 110 | p.Fprintf(os.Stderr, " %d bytes\n", n) 111 | offset += int64(n) 112 | } 113 | 114 | fmt.Fprint(os.Stderr, "writting the trie...") 115 | n, err = fromdic.Lexicon.WriteTrieTo(bufiooutput) 116 | if err != nil { 117 | fmt.Fprintln(os.Stderr, err) 118 | os.Exit(1) 119 | } 120 | p.Fprintf(os.Stderr, " %d bytes\n", n) 121 | offset += int64(n) 122 | 123 | fmt.Fprint(os.Stderr, "writting the word-ID table...") 124 | n, err = fromdic.Lexicon.WriteWordIdTableTo(bufiooutput) 125 | if err != nil { 126 | fmt.Fprintln(os.Stderr, err) 127 | os.Exit(1) 128 | } 129 | p.Fprintf(os.Stderr, " %d bytes\n", n) 130 | offset += int64(n) 131 | 132 | fmt.Fprint(os.Stderr, "writting the word parameters...") 133 | n, err = fromdic.Lexicon.WriteWordParamsTo(bufiooutput) 134 | if err != nil { 135 | fmt.Fprintln(os.Stderr, err) 136 | os.Exit(1) 137 | } 138 | p.Fprintf(os.Stderr, " %d bytes\n", n) 139 | offset += int64(n) 140 | 141 | err = bufiooutput.Flush() 142 | if err != nil { 143 | fmt.Fprintln(os.Stderr, err) 144 | os.Exit(1) 145 | } 146 | 147 | fmt.Fprint(os.Stderr, "writting the wordInfos...") 148 | offsetlen := int64(4 * fromdic.Lexicon.Size()) 149 | _, err = outputfd.Seek(offsetlen, io.SeekCurrent) 150 | if err != nil { 151 | fmt.Fprintln(os.Stderr, err) 152 | os.Exit(1) 153 | } 154 | bufiooutput = bufio.NewWriter(outputfd) 155 | 156 | n, offsets, err := fromdic.Lexicon.WriteWordInfos(bufiooutput, offset, offsetlen, utf16string) 157 | if err != nil { 158 | fmt.Fprintln(os.Stderr, err) 159 | os.Exit(1) 160 | } 161 | p.Fprintf(os.Stderr, " %d bytes\n", n) 162 | 163 | err = bufiooutput.Flush() 164 | if err != nil { 165 | fmt.Fprintln(os.Stderr, err) 166 | os.Exit(1) 167 | } 168 | 169 | fmt.Fprint(os.Stderr, "writting wordInfo offsets...") 170 | _, err = outputfd.Seek(offset, io.SeekStart) 171 | if err != nil { 172 | fmt.Fprintln(os.Stderr, err) 173 | os.Exit(1) 174 | } 175 | bufiooutput = bufio.NewWriter(outputfd) 176 | 177 | n64, err = offsets.WriteTo(bufiooutput) 178 | if err != nil { 179 | fmt.Fprintln(os.Stderr, err) 180 | os.Exit(1) 181 | } 182 | p.Fprintf(os.Stderr, " %d bytes\n", n64) 183 | 184 | err = bufiooutput.Flush() 185 | if err != nil { 186 | fmt.Fprintln(os.Stderr, err) 187 | os.Exit(1) 188 | } 189 | } 190 | -------------------------------------------------------------------------------- /dictionary.go: -------------------------------------------------------------------------------- 1 | package gosudachi 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "os" 7 | 8 | "github.com/msnoigrs/gosudachi/data" 9 | "github.com/msnoigrs/gosudachi/dictionary" 10 | ) 11 | 12 | const ( 13 | UserDictCostParMorph = -20 14 | ) 15 | 16 | const maxcost = int(int16(^uint16(0) >> 1)) 17 | const mincost = int(-maxcost - 1) 18 | 19 | type JapaneseDictionary struct { 20 | grammar *dictionary.Grammar 21 | lexicon *dictionary.LexiconSet 22 | inputTextPlugins []InputTextPlugin 23 | oovProviderPlugins []OovProviderPlugin 24 | pathRewritePlugins []PathRewritePlugin 25 | dictionaries []*dictionary.BinaryDictionary 26 | } 27 | 28 | func NewJapaneseDictionary(config *BaseConfig, inputTextPlugins []InputTextPlugin, oovProviderPlugins []OovProviderPlugin, pathRewritePlugins []PathRewritePlugin, editConnectionCostPlugins []EditConnectionCostPlugin) (*JapaneseDictionary, error) { 29 | if len(oovProviderPlugins) == 0 { 30 | return nil, fmt.Errorf("no OOV provider") 31 | } 32 | 33 | d := &JapaneseDictionary{ 34 | inputTextPlugins: inputTextPlugins, 35 | oovProviderPlugins: oovProviderPlugins, 36 | pathRewritePlugins: pathRewritePlugins, 37 | } 38 | 39 | err := d.ReadSystemDictionary(config.SystemDict, config.Utf16String) 40 | if err != nil { 41 | return nil, fmt.Errorf("fail to read a system dictionary: %s", err) 42 | } 43 | 44 | for _, plugin := range editConnectionCostPlugins { 45 | err := plugin.SetUp(d.grammar) 46 | if err != nil { 47 | return nil, err 48 | } 49 | err = plugin.Edit(d.grammar) 50 | if err != nil { 51 | return nil, err 52 | } 53 | } 54 | 55 | err = d.ReadCharacterDefinition(config.CharacterDefinitionFile) 56 | if err != nil { 57 | return nil, fmt.Errorf("fail to read a character defition file: %s", err) 58 | } 59 | 60 | for _, plugin := range inputTextPlugins { 61 | err := plugin.SetUp() 62 | if err != nil { 63 | return nil, err 64 | } 65 | } 66 | for _, plugin := range oovProviderPlugins { 67 | err := plugin.SetUp(d.grammar) 68 | if err != nil { 69 | return nil, err 70 | } 71 | } 72 | for _, plugin := range pathRewritePlugins { 73 | err := plugin.SetUp(d.grammar) 74 | if err != nil { 75 | return nil, err 76 | } 77 | } 78 | 79 | for _, ud := range config.UserDict { 80 | err := d.ReadUserDictionary(ud, config.Utf16String) 81 | if err != nil { 82 | return nil, fmt.Errorf("fail to read a user dictionary: %s", err) 83 | } 84 | } 85 | return d, nil 86 | } 87 | 88 | func (d *JapaneseDictionary) ReadSystemDictionary(filename string, utf16string bool) error { 89 | dict, err := dictionary.ReadSystemDictionary(filename, utf16string) 90 | if err != nil { 91 | return err 92 | } 93 | 94 | d.dictionaries = append(d.dictionaries, dict) 95 | d.grammar = dict.Grammar 96 | d.lexicon = dictionary.NewLexiconSet(dict.Lexicon) 97 | return nil 98 | } 99 | 100 | func (d *JapaneseDictionary) ReadUserDictionary(filename string, utf16string bool) error { 101 | if d.lexicon.IsFull() { 102 | return fmt.Errorf("too many dictionaries") 103 | } 104 | 105 | dict, err := dictionary.ReadUserDictionary(filename, utf16string) 106 | if err != nil { 107 | return err 108 | } 109 | 110 | d.dictionaries = append(d.dictionaries, dict) 111 | 112 | userLexicon := dict.Lexicon 113 | tokenizer := NewJapaneseTokenizer( 114 | d.grammar, 115 | d.lexicon, 116 | d.inputTextPlugins, 117 | d.oovProviderPlugins, 118 | []PathRewritePlugin{}, 119 | ) 120 | userLexicon.CalculateCost(func(text string) (int16, error) { 121 | ms, err := tokenizer.Tokenize("C", text) 122 | if err != nil { 123 | return int16(mincost), err 124 | } 125 | cost := ms.GetInternalCost() + UserDictCostParMorph*ms.Length() 126 | if cost > maxcost { 127 | cost = maxcost 128 | } else if cost < mincost { 129 | cost = mincost 130 | } 131 | return int16(cost), nil 132 | }) 133 | d.lexicon.Add(userLexicon, int32(d.grammar.GetPartOfSpeechSize())) 134 | d.grammar.AddPosList(dict.Grammar) 135 | return nil 136 | } 137 | 138 | func (d *JapaneseDictionary) ReadCharacterDefinition(charDef string) error { 139 | var charDefReader io.Reader 140 | if charDef != "" { 141 | charDefFd, err := os.OpenFile(charDef, os.O_RDONLY, 0644) 142 | if err != nil { 143 | return fmt.Errorf("%s: %s", err, charDef) 144 | } 145 | defer charDefFd.Close() 146 | charDefReader = charDefFd 147 | } else { 148 | charDefF, err := data.Assets.Open("char.def") 149 | if err != nil { 150 | return fmt.Errorf("%s: (data.Assets)char.def", err) 151 | } 152 | defer charDefF.Close() 153 | charDefReader = charDefF 154 | } 155 | 156 | cat := dictionary.NewCharacterCategory() 157 | err := cat.ReadCharacterDefinition(charDefReader) 158 | if err != nil { 159 | return err 160 | } 161 | d.grammar.CharCategory = cat 162 | return nil 163 | } 164 | 165 | func (d *JapaneseDictionary) Close() { 166 | d.grammar = nil 167 | d.lexicon = nil 168 | for _, dict := range d.dictionaries { 169 | dict.Close() 170 | } 171 | d.dictionaries = d.dictionaries[:0] 172 | } 173 | 174 | func (d *JapaneseDictionary) Create() *JapaneseTokenizer { 175 | return NewJapaneseTokenizer( 176 | d.grammar, 177 | d.lexicon, 178 | d.inputTextPlugins, 179 | d.oovProviderPlugins, 180 | d.pathRewritePlugins, 181 | ) 182 | } 183 | 184 | func (d *JapaneseDictionary) GetPartOfSpeechSize() int { 185 | return d.grammar.GetPartOfSpeechSize() 186 | } 187 | 188 | func (d *JapaneseDictionary) GetPartOfSpeechString(posId int16) []string { 189 | return d.grammar.GetPartOfSpeechString(posId) 190 | } 191 | -------------------------------------------------------------------------------- /dictionary/binarydict.go: -------------------------------------------------------------------------------- 1 | package dictionary 2 | 3 | import ( 4 | "fmt" 5 | "github.com/msnoigrs/gosudachi/internal/mmap" 6 | "os" 7 | ) 8 | 9 | type BinaryDictionary struct { 10 | fd *os.File 11 | fmap []byte 12 | Header *DictionaryHeader 13 | Grammar *Grammar 14 | Lexicon *DoubleArrayLexicon 15 | } 16 | 17 | func NewBinaryDictionary(filename string, utf16string bool) (*BinaryDictionary, error) { 18 | fd, err := os.OpenFile(filename, os.O_RDONLY, 0644) 19 | if err != nil { 20 | return nil, err 21 | } 22 | 23 | finfo, err := fd.Stat() 24 | if err != nil { 25 | _ = fd.Close() 26 | return nil, err 27 | } 28 | fmap, err := mmap.Mmap(fd, false, 0, finfo.Size()) 29 | if err != nil { 30 | _ = fd.Close() 31 | return nil, err 32 | } 33 | 34 | offset := 0 35 | header := ParseDictionaryHeader(fmap, offset) 36 | if header == nil { 37 | return nil, fmt.Errorf("invalid header: %s", filename) 38 | } 39 | 40 | offset += HeaderStorageSize 41 | var grammar *Grammar 42 | if header.Version == SystemDictVersion || header.Version == UserDictVersion2 { 43 | grammar = NewGrammar(fmap, offset, utf16string) 44 | offset += grammar.StorageSize 45 | } else if header.Version != UserDictVersion { 46 | _ = mmap.Munmap(fmap) 47 | _ = fd.Close() 48 | return nil, fmt.Errorf("invalid dictionary: %s", filename) 49 | } 50 | 51 | lexicon := NewDoubleArrayLexicon(fmap, offset, utf16string) 52 | 53 | return &BinaryDictionary{ 54 | fd, 55 | fmap, 56 | header, 57 | grammar, 58 | lexicon, 59 | }, nil 60 | } 61 | 62 | func ReadSystemDictionary(filename string, utf16string bool) (*BinaryDictionary, error) { 63 | dict, err := NewBinaryDictionary(filename, utf16string) 64 | if err != nil { 65 | return nil, err 66 | } 67 | if dict.Header.Version != SystemDictVersion { 68 | _ = dict.Close() 69 | return nil, fmt.Errorf("invalid systemd dictionary: %s", filename) 70 | } 71 | return dict, nil 72 | } 73 | 74 | func ReadUserDictionary(filename string, utf16string bool) (*BinaryDictionary, error) { 75 | dict, err := NewBinaryDictionary(filename, utf16string) 76 | if err != nil { 77 | return nil, err 78 | } 79 | if !IsUserDictionary(dict.Header.Version) { 80 | _ = dict.Close() 81 | return nil, fmt.Errorf("invalid user dictionary: %s", filename) 82 | } 83 | return dict, nil 84 | } 85 | 86 | func (bd *BinaryDictionary) Close() error { 87 | err := mmap.Munmap(bd.fmap) 88 | if err != nil { 89 | return err 90 | } 91 | return bd.fd.Close() 92 | } 93 | -------------------------------------------------------------------------------- /dictionary/bytes.go: -------------------------------------------------------------------------------- 1 | package dictionary 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | "unicode/utf16" 7 | ) 8 | 9 | func bufferToInt16(bytebuffer []byte, offset int) (int, int16) { 10 | var ret int16 11 | offsetend := offset + 2 12 | _ = binary.Read(bytes.NewBuffer(bytebuffer[offset:offsetend]), binary.LittleEndian, &ret) 13 | return offsetend, ret 14 | } 15 | 16 | func bufferToUint16(bytebuffer []byte, offset int) (int, uint16) { 17 | var ret uint16 18 | offsetend := offset + 2 19 | _ = binary.Read(bytes.NewBuffer(bytebuffer[offset:offsetend]), binary.LittleEndian, &ret) 20 | return offsetend, ret 21 | } 22 | 23 | func bufferToInt32(bytebuffer []byte, offset int) (int, int32) { 24 | var ret int32 25 | offsetend := offset + 4 26 | _ = binary.Read(bytes.NewBuffer(bytebuffer[offset:offsetend]), binary.LittleEndian, &ret) 27 | return offsetend, ret 28 | } 29 | 30 | func bufferToUint32(bytebuffer []byte, offset int) (int, uint32) { 31 | var ret uint32 32 | offsetend := offset + 4 33 | _ = binary.Read(bytes.NewBuffer(bytebuffer[offset:offsetend]), binary.LittleEndian, &ret) 34 | return offsetend, ret 35 | } 36 | 37 | func bufferToInt64(bytebuffer []byte, offset int) (int, int64) { 38 | var ret int64 39 | offsetend := offset + 8 40 | _ = binary.Read(bytes.NewBuffer(bytebuffer[offset:offsetend]), binary.LittleEndian, &ret) 41 | return offsetend, ret 42 | } 43 | 44 | func bufferToUint64(bytebuffer []byte, offset int) (int, uint64) { 45 | var ret uint64 46 | offsetend := offset + 8 47 | _ = binary.Read(bytes.NewBuffer(bytebuffer[offset:offsetend]), binary.LittleEndian, &ret) 48 | return offsetend, ret 49 | } 50 | 51 | func bufferToStringLength(bytebuffer []byte, offset int) (int, int) { 52 | length := bytebuffer[offset] 53 | if (length & 0x80) == 0x80 { 54 | high := int16(length & 0x7F) 55 | low := int16(bytebuffer[offset+1]) 56 | return offset + 2, int(high<<8 | low) 57 | } 58 | return offset + 1, int(length) 59 | } 60 | 61 | type bufferToStringFunc func(bytebuffer []byte, offset int) (int, string) 62 | 63 | func bufferToString(bytebuffer []byte, offset int) (int, string) { 64 | offset, length := bufferToStringLength(bytebuffer, offset) 65 | offsetend := offset + int(length) 66 | return offsetend, string(bytebuffer[offset:offsetend]) 67 | } 68 | 69 | func bufferToStringUtf16(bytebuffer []byte, offset int) (int, string) { 70 | // java compatible 71 | offset, length := bufferToStringLength(bytebuffer, offset) 72 | javainternal := make([]uint16, length, length) 73 | for i := 0; i < length; i++ { 74 | s := offset + 2*i 75 | _ = binary.Read(bytes.NewBuffer(bytebuffer[s:s+2]), binary.LittleEndian, &javainternal[i]) 76 | } 77 | return offset + length*2, string(utf16.Decode(javainternal)) 78 | } 79 | 80 | func bufferToInt32Array(bytebuffer []byte, offset int) (int, []int32) { 81 | length := int(bytebuffer[offset]) 82 | offset++ 83 | array := make([]int32, length, length) 84 | for i := 0; i < length; i++ { 85 | s := offset + 4*i 86 | _ = binary.Read(bytes.NewBuffer(bytebuffer[s:s+4]), binary.LittleEndian, &array[i]) 87 | } 88 | return offset + 4*length, array 89 | } 90 | -------------------------------------------------------------------------------- /dictionary/charcategory.go: -------------------------------------------------------------------------------- 1 | package dictionary 2 | 3 | import ( 4 | "encoding/hex" 5 | "fmt" 6 | "io" 7 | "strings" 8 | "unicode/utf8" 9 | 10 | "github.com/msnoigrs/gosudachi/internal/lnreader" 11 | ) 12 | 13 | // Categories of characters 14 | const ( 15 | DEFAULT uint32 = 1 // The fall back category 16 | SPACE uint32 = 1 << 1 // WhiteSpaces 17 | KANJI uint32 = 1 << 2 // CJKV ideographic characters 18 | SYMBOL uint32 = 1 << 3 // Symbols 19 | NUMERIC uint32 = 1 << 4 // Numerical characters 20 | ALPHA uint32 = 1 << 5 // Latin alphabets 21 | HIRAGANA uint32 = 1 << 6 // Hiragana characters 22 | KATAKANA uint32 = 1 << 7 // Katakana characters 23 | KANJINUMERIC uint32 = 1 << 8 // Knaji numeric characters 24 | GREEK uint32 = 1 << 9 // Greek alphabets 25 | CYRILLIC uint32 = 1 << 10 // Cyrillic alphabets 26 | USER1 uint32 = 1 << 11 // User defined category 27 | USER2 uint32 = 1 << 12 // User defined category 28 | USER3 uint32 = 1 << 13 // User defined category 29 | USER4 uint32 = 1 << 14 // User defined category 30 | NOOOVBOW uint32 = 1 << 15 // Characters that cannot be the beginning of word 31 | ) 32 | 33 | func GetCategoryType(s string) (uint32, error) { 34 | switch s { 35 | case "DEFAULT": 36 | return DEFAULT, nil 37 | case "SPACE": 38 | return SPACE, nil 39 | case "KANJI": 40 | return KANJI, nil 41 | case "SYMBOL": 42 | return SYMBOL, nil 43 | case "NUMERIC": 44 | return NUMERIC, nil 45 | case "ALPHA": 46 | return ALPHA, nil 47 | case "HIRAGANA": 48 | return HIRAGANA, nil 49 | case "KATAKANA": 50 | return KATAKANA, nil 51 | case "KANJINUMERIC": 52 | return KANJINUMERIC, nil 53 | case "GREEK": 54 | return GREEK, nil 55 | case "CYRILLIC": 56 | return CYRILLIC, nil 57 | case "USER1": 58 | return USER1, nil 59 | case "USER2": 60 | return USER2, nil 61 | case "USER3": 62 | return USER3, nil 63 | case "USER4": 64 | return USER4, nil 65 | case "NOOOVBOW": 66 | return NOOOVBOW, nil 67 | } 68 | return 0, fmt.Errorf("%s is invalid type", s) 69 | } 70 | 71 | type categoryRange struct { 72 | low int32 73 | high int32 74 | categories uint32 75 | } 76 | 77 | func (r *categoryRange) contains(cp rune) bool { 78 | if int32(cp) >= r.low && int32(cp) <= r.high { 79 | return true 80 | } 81 | return false 82 | } 83 | 84 | func (r *categoryRange) containingLength(text string) int { 85 | for i, c := range text { 86 | if int32(c) < r.low || int32(c) > r.high { 87 | return i 88 | } 89 | } 90 | return utf8.RuneCountInString(text) 91 | } 92 | 93 | type CharacterCategory struct { 94 | rangeList []*categoryRange 95 | } 96 | 97 | func NewCharacterCategory() *CharacterCategory { 98 | return &CharacterCategory{} 99 | } 100 | 101 | func (cc *CharacterCategory) GetCategoryTypes(codePoint rune) uint32 { 102 | var categories uint32 103 | for _, cr := range cc.rangeList { 104 | if cr.contains(codePoint) { 105 | categories |= cr.categories 106 | } 107 | } 108 | 109 | if categories == 0 { 110 | categories = DEFAULT 111 | } 112 | return categories 113 | } 114 | 115 | func (cc *CharacterCategory) ReadCharacterDefinition(charDefReader io.Reader) error { 116 | r := lnreader.NewLineNumberReader(charDefReader) 117 | for { 118 | line, err := r.ReadLine() 119 | if err == io.EOF { 120 | break 121 | } 122 | if err != nil { 123 | return err 124 | } 125 | if lnreader.IsSkipLine(line) { 126 | continue 127 | } 128 | cols := strings.Fields(string(line)) 129 | if len(cols) < 2 { 130 | return fmt.Errorf("invalid format at line %d: too short fields", r.NumLine) 131 | } 132 | if !strings.HasPrefix(cols[0], "0x") { 133 | continue 134 | } 135 | 136 | catrange := new(categoryRange) 137 | rs := strings.Split(cols[0], "..") 138 | low, err := decodeHexStrToInt32(rs[0]) 139 | if err != nil { 140 | return fmt.Errorf("invalid format at line %d: %s", r.NumLine, err) 141 | } 142 | catrange.low = low 143 | if len(rs) > 1 { 144 | high, err := decodeHexStrToInt32(rs[1]) 145 | if err != nil { 146 | return fmt.Errorf("invalid format at line %d: %s", r.NumLine, err) 147 | } 148 | catrange.high = high 149 | } else { 150 | catrange.high = catrange.low 151 | } 152 | if catrange.low > catrange.high { 153 | return fmt.Errorf("invalid format at line %d: low > high", r.NumLine) 154 | } 155 | for i := 1; i < len(cols); i++ { 156 | if strings.HasPrefix(cols[i], "#") { 157 | break 158 | } 159 | t, err := GetCategoryType(cols[i]) 160 | if err != nil { 161 | return fmt.Errorf("%s at line %d: %s", err, r.NumLine, err) 162 | } 163 | catrange.categories |= t 164 | } 165 | cc.rangeList = append(cc.rangeList, catrange) 166 | } 167 | 168 | return nil 169 | } 170 | 171 | func decodeHexStrToInt32(s string) (int32, error) { 172 | if len(s) < 3 { 173 | return 0, fmt.Errorf("invalid hex string: too short") 174 | } 175 | src := []byte(s[2:]) 176 | dst := make([]byte, hex.DecodedLen(len(src))) 177 | n, err := hex.Decode(dst, src) 178 | if err != nil { 179 | return 0, err 180 | } 181 | if n > 4 { 182 | return 0, fmt.Errorf("invalid hex string: too long") 183 | } 184 | var ret int32 185 | switch n { 186 | case 4: 187 | ret = int32(dst[0])*16777216 + int32(dst[1])*65536 + int32(dst[2])*256 + int32(dst[3]) 188 | case 3: 189 | ret = int32(dst[0])*65536 + int32(dst[1])*256 + int32(dst[2]) 190 | case 2: 191 | ret = int32(dst[0])*256 + int32(dst[1]) 192 | case 1: 193 | ret = int32(dst[0]) 194 | } 195 | return ret, nil 196 | } 197 | -------------------------------------------------------------------------------- /dictionary/dalexicon.go: -------------------------------------------------------------------------------- 1 | package dictionary 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | "io" 7 | 8 | "github.com/msnoigrs/gosudachi/dartsclone" 9 | ) 10 | 11 | const ( 12 | wordParameterListElementSize = 2 * 3 13 | ) 14 | 15 | type wordIdTable struct { 16 | bytebuffer []byte 17 | size int32 18 | //offset int 19 | } 20 | 21 | func newWordIdTable(bytebuffer []byte, offset int) *wordIdTable { 22 | _, size := bufferToInt32(bytebuffer, offset) 23 | return &wordIdTable{ 24 | bytebuffer: bytebuffer[offset+4 : offset+4+int(size)], 25 | size: size, 26 | //offset: offset + 4, 27 | } 28 | } 29 | 30 | func (t *wordIdTable) storageSize() int { 31 | return 4 + int(t.size) 32 | } 33 | 34 | func (t *wordIdTable) get(index int) []int32 { 35 | _, result := bufferToInt32Array(t.bytebuffer, index) 36 | return result 37 | } 38 | 39 | type wordParameterList struct { 40 | bytebuffer []byte 41 | size int32 42 | offset int 43 | isCopied bool 44 | } 45 | 46 | func newWordParameterList(bytebuffer []byte, offset int) *wordParameterList { 47 | offset, size := bufferToInt32(bytebuffer, offset) 48 | return &wordParameterList{ 49 | bytebuffer: bytebuffer, 50 | size: size, 51 | offset: offset, 52 | isCopied: false, 53 | } 54 | } 55 | 56 | func (l *wordParameterList) storageSize() int { 57 | return 4 + wordParameterListElementSize*int(l.size) 58 | } 59 | 60 | func (l *wordParameterList) getLeftId(wordId int32) int16 { 61 | _, ret := bufferToInt16(l.bytebuffer, l.offset+wordParameterListElementSize*int(wordId)) 62 | return ret 63 | } 64 | 65 | func (l *wordParameterList) getRightId(wordId int32) int16 { 66 | _, ret := bufferToInt16(l.bytebuffer, l.offset+wordParameterListElementSize*int(wordId)+2) 67 | return ret 68 | } 69 | 70 | func (l *wordParameterList) getCost(wordId int32) int16 { 71 | _, ret := bufferToInt16(l.bytebuffer, l.offset+wordParameterListElementSize*int(wordId)+4) 72 | return ret 73 | } 74 | 75 | func (l *wordParameterList) setCost(wordId int32, cost int16) { 76 | if !l.isCopied { 77 | l.copyBuffer() 78 | } 79 | 80 | s := l.offset + wordParameterListElementSize*int(wordId) + 4 81 | binary.LittleEndian.PutUint16(l.bytebuffer[s:], uint16(cost)) 82 | } 83 | 84 | // syncronized ??? 85 | func (l *wordParameterList) copyBuffer() { 86 | nl := int(wordParameterListElementSize) * int(l.size) 87 | newBuffer := make([]byte, nl, nl) 88 | s := l.offset 89 | copy(newBuffer, l.bytebuffer[s:s+nl]) 90 | l.bytebuffer = newBuffer 91 | l.offset = 0 92 | l.isCopied = true 93 | } 94 | 95 | type wordInfoList struct { 96 | bytebuffer []byte 97 | offset int 98 | wordSize int32 99 | bufferToStringF bufferToStringFunc 100 | } 101 | 102 | func newWordInfoList(bytebuffer []byte, offset int, wordSize int32, bufferToStringF bufferToStringFunc) *wordInfoList { 103 | return &wordInfoList{ 104 | bytebuffer: bytebuffer, 105 | offset: offset, 106 | wordSize: wordSize, 107 | bufferToStringF: bufferToStringF, 108 | } 109 | } 110 | 111 | func (l *wordInfoList) getWordInfo(wordId int32) *WordInfo { 112 | index := l.wordIdToOffset(wordId) 113 | 114 | index, surface := l.bufferToStringF(l.bytebuffer, index) 115 | index, headwordLength := bufferToStringLength(l.bytebuffer, index) 116 | index, posId := bufferToInt16(l.bytebuffer, index) 117 | index, normalizedForm := l.bufferToStringF(l.bytebuffer, index) 118 | if normalizedForm == "" { 119 | normalizedForm = surface 120 | } 121 | index, dictionaryFormWordId := bufferToInt32(l.bytebuffer, index) 122 | index, readingForm := l.bufferToStringF(l.bytebuffer, index) 123 | if readingForm == "" { 124 | readingForm = surface 125 | } 126 | index, aUnitSplit := bufferToInt32Array(l.bytebuffer, index) 127 | index, bUnitSplit := bufferToInt32Array(l.bytebuffer, index) 128 | index, wordStructure := bufferToInt32Array(l.bytebuffer, index) 129 | 130 | dictionaryForm := surface 131 | if dictionaryFormWordId >= 0 && dictionaryFormWordId != wordId { 132 | wi := l.getWordInfo(dictionaryFormWordId) 133 | dictionaryForm = wi.Surface 134 | } 135 | 136 | return &WordInfo{ 137 | Surface: surface, 138 | HeadwordLength: int16(headwordLength), 139 | PosId: posId, 140 | NormalizedForm: normalizedForm, 141 | DictionaryFormWordId: dictionaryFormWordId, 142 | DictionaryForm: dictionaryForm, 143 | ReadingForm: readingForm, 144 | AUnitSplit: aUnitSplit, 145 | BUnitSplit: bUnitSplit, 146 | WordStructure: wordStructure, 147 | } 148 | } 149 | 150 | func (l *wordInfoList) wordIdToOffset(wordId int32) int { 151 | s := l.offset + 4*int(wordId) 152 | _, ret := bufferToInt32(l.bytebuffer, s) 153 | return int(ret) 154 | } 155 | 156 | type DoubleArrayLexicon struct { 157 | wordIdT *wordIdTable 158 | wordParams *wordParameterList 159 | wordInfos *wordInfoList 160 | trie *dartsclone.DoubleArray 161 | } 162 | 163 | func NewDoubleArrayLexicon(bytebuffer []byte, offset int, utf16string bool) *DoubleArrayLexicon { 164 | var size uint32 165 | trie := dartsclone.NewDoubleArray() 166 | offset, size = bufferToUint32(bytebuffer, offset) 167 | trie.SetBuffer(bytebuffer[offset : offset+int(size)*4]) 168 | offset += trie.TotalSize() 169 | 170 | wordIdT := newWordIdTable(bytebuffer, offset) 171 | offset += wordIdT.storageSize() 172 | 173 | wordParams := newWordParameterList(bytebuffer, offset) 174 | offset += wordParams.storageSize() 175 | 176 | var wordInfos *wordInfoList 177 | if utf16string { 178 | wordInfos = newWordInfoList(bytebuffer, offset, wordParams.size, bufferToStringUtf16) 179 | } else { 180 | wordInfos = newWordInfoList(bytebuffer, offset, wordParams.size, bufferToString) 181 | } 182 | 183 | return &DoubleArrayLexicon{ 184 | wordIdT: wordIdT, 185 | wordParams: wordParams, 186 | wordInfos: wordInfos, 187 | trie: trie, 188 | } 189 | } 190 | 191 | func (lexicon *DoubleArrayLexicon) Lookup(text []byte, offset int) *DoubleArrayLexiconIterator { 192 | it := lexicon.trie.CommonPrefixSearchItr(text, offset) 193 | return newDoubleArrayLexiconIterator(it, lexicon.wordIdT) 194 | } 195 | 196 | func (lexicon *DoubleArrayLexicon) GetWordId(headword string, posId int16, readingForm string) int32 { 197 | var wid int32 198 | for ; wid < lexicon.wordInfos.wordSize; wid++ { 199 | wi := lexicon.wordInfos.getWordInfo(wid) 200 | if wi.Surface == headword && 201 | wi.PosId == posId && 202 | wi.ReadingForm == readingForm { 203 | return wid 204 | } 205 | } 206 | return -1 207 | } 208 | 209 | func (lexicon *DoubleArrayLexicon) GetLeftId(wordId int32) int16 { 210 | return lexicon.wordParams.getLeftId(wordId) 211 | } 212 | 213 | func (lexicon *DoubleArrayLexicon) GetRightId(wordId int32) int16 { 214 | return lexicon.wordParams.getRightId(wordId) 215 | } 216 | 217 | func (lexicon *DoubleArrayLexicon) GetCost(wordId int32) int16 { 218 | return lexicon.wordParams.getCost(wordId) 219 | } 220 | 221 | func (lexicon *DoubleArrayLexicon) GetWordInfo(wordId int32) *WordInfo { 222 | return lexicon.wordInfos.getWordInfo(wordId) 223 | } 224 | 225 | func (lexicon *DoubleArrayLexicon) GetDictionaryId(wordId int32) int { 226 | return 0 227 | } 228 | 229 | func (lexicon *DoubleArrayLexicon) Size() int32 { 230 | return lexicon.wordParams.size 231 | } 232 | 233 | const maxint16 = int16(^uint16(0) >> 1) 234 | const minint16 = -maxint16 - 1 235 | 236 | type CalculateCostFunc func(text string) (int16, error) 237 | 238 | func (lexicon *DoubleArrayLexicon) CalculateCost(cf CalculateCostFunc) error { 239 | var wordId int32 240 | for ; wordId < lexicon.wordParams.size; wordId++ { 241 | if lexicon.wordParams.getCost(wordId) != minint16 { 242 | continue 243 | } 244 | wi := lexicon.wordInfos.getWordInfo(wordId) 245 | cost, err := cf(wi.Surface) 246 | if err != nil { 247 | return err 248 | } 249 | lexicon.wordParams.setCost(wordId, cost) 250 | } 251 | return nil 252 | } 253 | 254 | func (lexicon *DoubleArrayLexicon) WriteTrieTo(writer io.Writer) (int, error) { 255 | err := binary.Write(writer, binary.LittleEndian, uint32(lexicon.trie.Length())) 256 | if err != nil { 257 | return 0, err 258 | } 259 | n, err := writer.Write(lexicon.trie.ByteArray()) 260 | if err != nil { 261 | return 4, err 262 | } 263 | return n + 4, nil 264 | } 265 | 266 | func (lexicon *DoubleArrayLexicon) WriteWordIdTableTo(writer io.Writer) (int, error) { 267 | err := binary.Write(writer, binary.LittleEndian, uint32(lexicon.wordIdT.size)) 268 | if err != nil { 269 | return 0, err 270 | } 271 | n, err := writer.Write(lexicon.wordIdT.bytebuffer) 272 | if err != nil { 273 | return 4, err 274 | } 275 | return n + 4, nil 276 | } 277 | 278 | func (lexicon *DoubleArrayLexicon) WriteWordParamsTo(writer io.Writer) (int, error) { 279 | size := lexicon.wordParams.size 280 | err := binary.Write(writer, binary.LittleEndian, uint32(size)) 281 | if err != nil { 282 | return 0, err 283 | } 284 | n, err := writer.Write(lexicon.wordParams.bytebuffer[lexicon.wordParams.offset : lexicon.wordParams.offset+wordParameterListElementSize*int(size)]) 285 | if err != nil { 286 | return 4, err 287 | } 288 | return n + 4, nil 289 | } 290 | 291 | func (lexicon *DoubleArrayLexicon) WriteWordInfos(writer io.Writer, offset int64, offsetlen int64, utf16string bool) (int, *bytes.Buffer, error) { 292 | var writeStringF writeStringFunc 293 | if utf16string { 294 | writeStringF = writeStringUtf16 295 | } else { 296 | writeStringF = writeString 297 | } 298 | 299 | buffer := bytes.NewBuffer([]byte{}) 300 | 301 | offsets := bytes.NewBuffer(make([]byte, 0, offsetlen)) 302 | base := offset + offsetlen 303 | position := base 304 | for wordId := int32(0); wordId < lexicon.Size(); wordId++ { 305 | wi := lexicon.GetWordInfo(wordId) 306 | err := binary.Write(offsets, binary.LittleEndian, uint32(position)) 307 | if err != nil { 308 | return 0, offsets, err 309 | } 310 | err = writeStringF(buffer, wi.Surface) 311 | if err != nil { 312 | return 0, offsets, err 313 | } 314 | err = writeStringLength(buffer, wi.HeadwordLength) 315 | if err != nil { 316 | return 0, offsets, err 317 | } 318 | err = binary.Write(buffer, binary.LittleEndian, uint16(wi.PosId)) 319 | if err != nil { 320 | return 0, offsets, err 321 | } 322 | var normalizedForm string 323 | if wi.NormalizedForm != wi.Surface { 324 | normalizedForm = wi.NormalizedForm 325 | } 326 | err = writeStringF(buffer, normalizedForm) 327 | if err != nil { 328 | return 0, offsets, err 329 | } 330 | err = binary.Write(buffer, binary.LittleEndian, uint32(wi.DictionaryFormWordId)) 331 | if err != nil { 332 | return 0, offsets, err 333 | } 334 | var readingForm string 335 | if wi.ReadingForm != wi.Surface { 336 | readingForm = wi.ReadingForm 337 | } 338 | err = writeStringF(buffer, readingForm) 339 | if err != nil { 340 | return 0, offsets, err 341 | } 342 | err = writeIntArray(buffer, wi.AUnitSplit) 343 | if err != nil { 344 | return 0, offsets, err 345 | } 346 | err = writeIntArray(buffer, wi.BUnitSplit) 347 | if err != nil { 348 | return 0, offsets, err 349 | } 350 | err = writeIntArray(buffer, wi.WordStructure) 351 | if err != nil { 352 | return 0, offsets, err 353 | } 354 | n, err := buffer.WriteTo(writer) 355 | buffer.Reset() 356 | position += n 357 | } 358 | return int(position - base), offsets, nil 359 | } 360 | 361 | type DoubleArrayLexiconIterator struct { 362 | wordIdT *wordIdTable 363 | dait *dartsclone.Iterator 364 | wordIds []int32 365 | length int 366 | index int 367 | } 368 | 369 | func newDoubleArrayLexiconIterator(dait *dartsclone.Iterator, wordIdT *wordIdTable) *DoubleArrayLexiconIterator { 370 | return &DoubleArrayLexiconIterator{ 371 | wordIdT: wordIdT, 372 | dait: dait, 373 | index: -1, 374 | } 375 | } 376 | 377 | func (it *DoubleArrayLexiconIterator) Next() bool { 378 | if it.dait.Err() != nil { 379 | return false 380 | } 381 | if it.index < 0 { 382 | return it.dait.Next() 383 | } else { 384 | return it.index < len(it.wordIds) || it.dait.Next() 385 | } 386 | } 387 | 388 | func (it *DoubleArrayLexiconIterator) Get() (int32, int) { 389 | if it.index < 0 || it.index >= len(it.wordIds) { 390 | tindex, length := it.dait.Get() 391 | if it.dait.Err() != nil { 392 | return -1, 0 393 | } 394 | it.wordIds = it.wordIdT.get(tindex) 395 | it.length = length 396 | it.index = 0 397 | } 398 | wordId := it.wordIds[it.index] 399 | it.index++ 400 | return wordId, it.length 401 | } 402 | 403 | func (it *DoubleArrayLexiconIterator) Err() error { 404 | return it.dait.Err() 405 | } 406 | -------------------------------------------------------------------------------- /dictionary/dicheader.go: -------------------------------------------------------------------------------- 1 | package dictionary 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | "errors" 7 | ) 8 | 9 | const ( 10 | DescriptionSize = 256 11 | HeaderStorageSize = 8 + 8 + DescriptionSize 12 | ) 13 | 14 | type DictionaryHeader struct { 15 | Version uint64 16 | CreateTime int64 17 | Description string 18 | } 19 | 20 | func NewDictionaryHeader(version uint64, createTime int64, description string) *DictionaryHeader { 21 | return &DictionaryHeader{ 22 | Version: version, 23 | CreateTime: createTime, 24 | Description: description, 25 | } 26 | } 27 | 28 | func ParseDictionaryHeader(input []byte, offset int) *DictionaryHeader { 29 | offset, version := bufferToUint64(input, offset) 30 | offset, createTime := bufferToInt64(input, offset) 31 | 32 | i := offset 33 | for ; i < HeaderStorageSize; i++ { 34 | if input[i] == 0 { 35 | break 36 | } 37 | } 38 | // UTF-8 39 | description := string(input[offset:i]) 40 | 41 | return &DictionaryHeader{ 42 | Version: version, 43 | CreateTime: createTime, 44 | Description: description, 45 | } 46 | } 47 | 48 | func (dh *DictionaryHeader) ToBytes() ([]byte, error) { 49 | desc := []byte(dh.Description) 50 | if len(desc) > DescriptionSize { 51 | return nil, errors.New("description is too long") 52 | } 53 | 54 | buf := bytes.NewBuffer(make([]byte, 0, HeaderStorageSize)) 55 | err := binary.Write(buf, binary.LittleEndian, uint64(dh.Version)) 56 | if err != nil { 57 | return nil, err 58 | } 59 | err = binary.Write(buf, binary.LittleEndian, uint64(dh.CreateTime)) 60 | if err != nil { 61 | return nil, err 62 | } 63 | _, err = buf.Write(desc) 64 | if err != nil { 65 | return nil, err 66 | } 67 | 68 | if len(desc) < DescriptionSize { 69 | padding := make([]byte, DescriptionSize-len(desc)) 70 | _, err = buf.Write(padding) 71 | if err != nil { 72 | return nil, err 73 | } 74 | } 75 | return buf.Bytes(), nil 76 | } 77 | -------------------------------------------------------------------------------- /dictionary/dicprinter.go: -------------------------------------------------------------------------------- 1 | package dictionary 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "io" 7 | "os" 8 | "strconv" 9 | "strings" 10 | "time" 11 | 12 | "github.com/msnoigrs/gosudachi/internal/mmap" 13 | ) 14 | 15 | func PrintDictionary(filename string, utf16string bool, systemDict *BinaryDictionary, output io.Writer) error { 16 | var grammar *Grammar 17 | 18 | dic, err := NewBinaryDictionary(filename, utf16string) 19 | if err != nil { 20 | return err 21 | } 22 | defer dic.Close() 23 | if dic.Header.Version == SystemDictVersion { 24 | grammar = dic.Grammar 25 | } else if systemDict == nil { 26 | return errors.New("the system dictionary is not specified") 27 | } else { 28 | grammar = systemDict.Grammar 29 | if dic.Header.Version == UserDictVersion2 { 30 | grammar.AddPosList(dic.Grammar) 31 | } 32 | } 33 | 34 | possize := grammar.GetPartOfSpeechSize() 35 | posStrings := make([]string, possize, possize) 36 | for pid := 0; pid < possize; pid++ { 37 | posStrings = append(posStrings, strings.Join(grammar.GetPartOfSpeechString(int16(pid)), ",")) 38 | } 39 | 40 | lexicon := dic.Lexicon 41 | for wordId := int32(0); wordId < lexicon.Size(); wordId++ { 42 | leftId := lexicon.GetLeftId(wordId) 43 | rightId := lexicon.GetRightId(wordId) 44 | cost := lexicon.GetCost(wordId) 45 | wi := lexicon.GetWordInfo(wordId) 46 | 47 | unitType := getUnitType(wi) 48 | 49 | fmt.Fprintf(output, 50 | "%s,%d,%d,%d,%s,%s,%s,%s,%s,%s,%s,%s,%s\n", 51 | wi.Surface, 52 | leftId, 53 | rightId, 54 | cost, 55 | wi.Surface, 56 | posStrings[int(wi.PosId)], 57 | wi.ReadingForm, 58 | wi.NormalizedForm, 59 | wordIdToString(int(wi.DictionaryFormWordId)), 60 | unitType, 61 | splitToString(wi.AUnitSplit), 62 | splitToString(wi.BUnitSplit), 63 | splitToString(wi.WordStructure), 64 | ) 65 | } 66 | return nil 67 | } 68 | 69 | func wordIdToString(wid int) string { 70 | if wid < 0 { 71 | return "*" 72 | } 73 | return strconv.Itoa(wid) 74 | } 75 | 76 | func getUnitType(wi *WordInfo) string { 77 | if len(wi.AUnitSplit) == 0 { 78 | return "A" 79 | } else if len(wi.BUnitSplit) == 0 { 80 | return "B" 81 | } 82 | return "C" 83 | } 84 | 85 | func splitToString(split []int32) string { 86 | if len(split) == 0 { 87 | return "*" 88 | } 89 | splitstrs := make([]string, len(split), len(split)) 90 | for _, i := range split { 91 | splitstrs = append(splitstrs, strconv.Itoa(int(i))) 92 | } 93 | return strings.Join(splitstrs, "/") 94 | } 95 | 96 | func PrintHeader(dictfile string, output io.Writer) error { 97 | dictfd, err := os.OpenFile(dictfile, os.O_RDONLY, 0644) 98 | if err != nil { 99 | return err 100 | } 101 | defer dictfd.Close() 102 | 103 | finfo, err := dictfd.Stat() 104 | if err != nil { 105 | return err 106 | } 107 | 108 | bytebuffer, err := mmap.Mmap(dictfd, false, 0, finfo.Size()) 109 | if err != nil { 110 | return err 111 | } 112 | defer mmap.Munmap(bytebuffer) 113 | 114 | dh := ParseDictionaryHeader(bytebuffer, 0) 115 | 116 | fmt.Fprintf(output, "filename: %s\n", dictfile) 117 | 118 | switch dh.Version { 119 | case SystemDictVersion: 120 | fmt.Fprintln(output, "type: system dictionary") 121 | case UserDictVersion, UserDictVersion2: 122 | fmt.Fprintln(output, "type: user dictionary") 123 | default: 124 | fmt.Fprintln(output, "invalid file") 125 | os.Exit(1) 126 | } 127 | 128 | ctime := time.Unix(dh.CreateTime, 0) 129 | zone, _ := ctime.Zone() 130 | fmt.Fprintf(output, "createTime: %s[%s]\n", ctime.Format(time.RFC3339), zone) 131 | fmt.Fprintf(output, "description: %s\n", dh.Description) 132 | 133 | return nil 134 | } 135 | -------------------------------------------------------------------------------- /dictionary/dicversion.go: -------------------------------------------------------------------------------- 1 | package dictionary 2 | 3 | const ( 4 | SystemDictVersion = 0x7366d3f18bd111e7 5 | UserDictVersion = 0xa50f31188bd211e7 6 | UserDictVersion2 = 0x9fdeb5a90168d868 7 | ) 8 | 9 | func IsUserDictionary(version uint64) bool { 10 | return version == UserDictVersion || version == UserDictVersion2 11 | } 12 | -------------------------------------------------------------------------------- /dictionary/grammar.go: -------------------------------------------------------------------------------- 1 | package dictionary 2 | 3 | import ( 4 | "bytes" 5 | "encoding/binary" 6 | "io" 7 | "math" 8 | ) 9 | 10 | const ( 11 | posDepth = 6 12 | InhibitedConnection = math.MaxInt16 13 | ) 14 | 15 | var ( 16 | BosParameter = []int16{0, 0, 0} 17 | EosParameter = []int16{0, 0, 0} 18 | ) 19 | 20 | type Grammar struct { 21 | bytebuffer []byte 22 | posList [][]string 23 | connectTableBytes []byte 24 | isCopiedConnectTable bool 25 | connectTableOffset int 26 | leftIdSize int16 27 | rightIdSize int16 28 | CharCategory *CharacterCategory 29 | StorageSize int 30 | } 31 | 32 | func NewGrammar(bytebuffer []byte, offset int, utf16string bool) *Grammar { 33 | var bufferToStringF bufferToStringFunc 34 | if utf16string { 35 | bufferToStringF = bufferToStringUtf16 36 | } else { 37 | bufferToStringF = bufferToString 38 | } 39 | originalOffset := offset 40 | var posLen uint16 41 | offset, posLen = bufferToUint16(bytebuffer, offset) 42 | posLeni := int(posLen) 43 | posList := make([][]string, posLeni, posLeni) 44 | for i := 0; i < posLeni; i++ { 45 | pos := make([]string, posDepth, posDepth) 46 | for j := 0; j < posDepth; j++ { 47 | offset, pos[j] = bufferToStringF(bytebuffer, offset) 48 | } 49 | posList[i] = pos 50 | } 51 | var ( 52 | leftIdSize int16 53 | rightIdSize int16 54 | ) 55 | offset, leftIdSize = bufferToInt16(bytebuffer, offset) 56 | offset, rightIdSize = bufferToInt16(bytebuffer, offset) 57 | 58 | return &Grammar{ 59 | bytebuffer: bytebuffer, 60 | posList: posList, 61 | connectTableBytes: bytebuffer, 62 | isCopiedConnectTable: false, 63 | connectTableOffset: offset, 64 | leftIdSize: leftIdSize, 65 | rightIdSize: rightIdSize, 66 | StorageSize: (offset - originalOffset) + 2*int(leftIdSize)*int(rightIdSize), 67 | } 68 | } 69 | 70 | func (g *Grammar) AddPosList(fromg *Grammar) { 71 | g.posList = append(g.posList, fromg.posList...) 72 | } 73 | 74 | func (g *Grammar) GetPartOfSpeechSize() int { 75 | return len(g.posList) 76 | } 77 | 78 | func (g *Grammar) GetPartOfSpeechString(posId int16) []string { 79 | return g.posList[posId] 80 | } 81 | 82 | func (g *Grammar) GetPartOfSpeechId(pos []string) int16 { 83 | L: 84 | for i, p := range g.posList { 85 | for j := 0; j < posDepth; j++ { 86 | if p[j] != pos[j] { 87 | continue L 88 | } 89 | } 90 | return int16(i) 91 | } 92 | return int16(-1) 93 | } 94 | 95 | func (g *Grammar) GetPosId(posstrings ...string) int16 { 96 | return g.GetPartOfSpeechId(posstrings) 97 | } 98 | 99 | func (g *Grammar) GetConnectCost(leftId int16, rightId int16) int16 { 100 | s := g.connectTableOffset + int(leftId)*2 + 2*int(g.leftIdSize)*int(rightId) 101 | _, cost := bufferToInt16(g.connectTableBytes, s) 102 | return cost 103 | } 104 | 105 | func (g *Grammar) SetConnectCost(leftId int16, rightId int16, cost int16) { 106 | if !g.isCopiedConnectTable { 107 | g.copyConnectTable() 108 | } 109 | s := g.connectTableOffset + int(leftId)*2 + 2*int(g.leftIdSize)*int(rightId) 110 | binary.LittleEndian.PutUint16(g.connectTableBytes[s:], uint16(cost)) 111 | } 112 | 113 | // syncronized ??? 114 | func (g *Grammar) copyConnectTable() { 115 | l := 2 * int(g.leftIdSize) * int(g.rightIdSize) 116 | newbuffer := make([]byte, l, l) 117 | s := g.connectTableOffset 118 | copy(newbuffer, g.connectTableBytes[s:s+l]) 119 | g.connectTableBytes = newbuffer 120 | g.connectTableOffset = 0 121 | g.isCopiedConnectTable = true 122 | } 123 | 124 | func (g *Grammar) WritePOSTableTo(buffer *bytes.Buffer, utf16string bool) error { 125 | var writeStringF writeStringFunc 126 | if utf16string { 127 | writeStringF = writeStringUtf16 128 | } else { 129 | writeStringF = writeString 130 | } 131 | err := binary.Write(buffer, binary.LittleEndian, uint16(len(g.posList))) 132 | if err != nil { 133 | return err 134 | } 135 | 136 | for _, pos := range g.posList { 137 | for _, t := range pos { 138 | err := writeStringF(buffer, t) 139 | if err != nil { 140 | return err 141 | } 142 | } 143 | } 144 | return nil 145 | } 146 | 147 | func (g *Grammar) WriteConnMatrixTo(writer io.Writer) (int, error) { 148 | err := binary.Write(writer, binary.LittleEndian, uint16(g.leftIdSize)) 149 | if err != nil { 150 | return 0, err 151 | } 152 | err = binary.Write(writer, binary.LittleEndian, uint16(g.rightIdSize)) 153 | if err != nil { 154 | return 2, err 155 | } 156 | var n int 157 | l := 2 * int(g.leftIdSize) * int(g.rightIdSize) 158 | if l > 0 { 159 | var err error 160 | n, err = writer.Write(g.connectTableBytes[g.connectTableOffset : g.connectTableOffset+l]) 161 | if err != nil { 162 | return 4, err 163 | } 164 | } 165 | return n + 4, nil 166 | } 167 | -------------------------------------------------------------------------------- /dictionary/lexiconset.go: -------------------------------------------------------------------------------- 1 | package dictionary 2 | 3 | const ( 4 | LexiconSetMaxDictionaries = 16 5 | ) 6 | 7 | type LexiconSet struct { 8 | lexicons []*DoubleArrayLexicon 9 | posOffsets []int32 10 | } 11 | 12 | func NewLexiconSet(systemLexicon *DoubleArrayLexicon) *LexiconSet { 13 | return &LexiconSet{ 14 | lexicons: []*DoubleArrayLexicon{systemLexicon}, 15 | posOffsets: []int32{0}, 16 | } 17 | } 18 | 19 | func (s *LexiconSet) Add(lexicon *DoubleArrayLexicon, posOffset int32) { 20 | s.lexicons = append(s.lexicons, lexicon) 21 | s.posOffsets = append(s.posOffsets, posOffset) 22 | } 23 | 24 | func (s *LexiconSet) IsFull() bool { 25 | return len(s.lexicons) >= LexiconSetMaxDictionaries 26 | } 27 | 28 | func (s *LexiconSet) Lookup(text []byte, offset int) *LexiconSetIterator { 29 | return newLexiconSetIterator(text, offset, s.lexicons) 30 | } 31 | 32 | func (s *LexiconSet) GetWordId(headword string, posId int16, readingForm string) int32 { 33 | for dictId := 1; dictId < len(s.lexicons); dictId++ { 34 | wordId := s.lexicons[dictId].GetWordId(headword, posId, readingForm) 35 | if wordId >= 0 { 36 | // buildWordId 37 | return int32(uint32(dictId)<<28) | wordId 38 | } 39 | } 40 | return s.lexicons[0].GetWordId(headword, posId, readingForm) 41 | } 42 | 43 | func (s *LexiconSet) GetLeftId(wordId int32) int16 { 44 | dictId := int(uint32(wordId) >> 28) 45 | wordId = int32(uint32(wordId) & 0xfffffff) 46 | return s.lexicons[dictId].GetLeftId(wordId) 47 | } 48 | 49 | func (s *LexiconSet) GetRightId(wordId int32) int16 { 50 | dictId := int(uint32(wordId) >> 28) 51 | wordId = int32(uint32(wordId) & 0xfffffff) 52 | return s.lexicons[dictId].GetRightId(wordId) 53 | } 54 | 55 | func (s *LexiconSet) GetCost(wordId int32) int16 { 56 | dictId := int(uint32(wordId) >> 28) 57 | wordId = int32(uint32(wordId) & 0xfffffff) 58 | return s.lexicons[dictId].GetCost(wordId) 59 | } 60 | 61 | func (s *LexiconSet) GetWordInfo(wordId int32) *WordInfo { 62 | dictId := int(uint32(wordId) >> 28) 63 | wordId = int32(uint32(wordId) & 0xfffffff) 64 | wi := s.lexicons[dictId].GetWordInfo(wordId) 65 | if dictId > 0 && int32(wi.PosId) >= s.posOffsets[1] { 66 | // user defined part-of-speech 67 | wi.PosId = int16(int32(wi.PosId) - s.posOffsets[1] + s.posOffsets[dictId]) 68 | } 69 | s.convertSplit(wi.AUnitSplit, dictId) 70 | s.convertSplit(wi.BUnitSplit, dictId) 71 | s.convertSplit(wi.WordStructure, dictId) 72 | return wi 73 | } 74 | 75 | func (s *LexiconSet) GetDictionaryId(wordId int32) int { 76 | return int(uint32(wordId) >> 28) 77 | } 78 | 79 | func (s *LexiconSet) Size() int32 { 80 | var n int32 81 | for _, l := range s.lexicons { 82 | n += l.Size() 83 | } 84 | return n 85 | } 86 | 87 | func (s *LexiconSet) convertSplit(split []int32, dictId int) { 88 | for i, id := range split { 89 | if s.GetDictionaryId(id) > 0 { 90 | wordId := uint32(id) & 0xfffffff 91 | // buildWordId 92 | split[i] = int32(uint32(dictId<<28) | wordId) 93 | } 94 | } 95 | } 96 | 97 | type LexiconSetIterator struct { 98 | text []byte 99 | offset int 100 | dictId int 101 | lexicons []*DoubleArrayLexicon 102 | dalit *DoubleArrayLexiconIterator 103 | } 104 | 105 | func newLexiconSetIterator(text []byte, offset int, lexicons []*DoubleArrayLexicon) *LexiconSetIterator { 106 | var ( 107 | dalit *DoubleArrayLexiconIterator 108 | dictId int 109 | ) 110 | if len(lexicons) == 1 { 111 | dictId = 0 112 | } else { 113 | dictId = 1 114 | } 115 | dalit = lexicons[dictId].Lookup(text, offset) 116 | 117 | return &LexiconSetIterator{ 118 | text: text, 119 | offset: offset, 120 | dictId: dictId, 121 | lexicons: lexicons, 122 | dalit: dalit, 123 | } 124 | } 125 | 126 | func (it *LexiconSetIterator) Next() bool { 127 | if it.dalit.Err() != nil { 128 | return false 129 | } 130 | for !it.dalit.Next() { 131 | if it.dictId == 0 { 132 | return false 133 | } 134 | it.dictId++ 135 | if it.dictId >= len(it.lexicons) { 136 | it.dictId = 0 137 | } 138 | it.dalit = it.lexicons[it.dictId].Lookup(it.text, it.offset) 139 | } 140 | return true 141 | } 142 | 143 | func (it *LexiconSetIterator) Get() (int32, int) { 144 | rvalue, roffset := it.dalit.Get() 145 | if it.dalit.Err() != nil { 146 | return -1, 0 147 | } 148 | if it.dictId > 0 { 149 | // buildWordId 150 | rvalue = int32(uint32(it.dictId<<28) | uint32(rvalue)) 151 | } 152 | return rvalue, roffset 153 | } 154 | 155 | func (it *LexiconSetIterator) Err() error { 156 | return it.dalit.Err() 157 | } 158 | -------------------------------------------------------------------------------- /dictionary/wordinfo.go: -------------------------------------------------------------------------------- 1 | package dictionary 2 | 3 | type WordInfo struct { 4 | Surface string 5 | HeadwordLength int16 6 | PosId int16 7 | NormalizedForm string 8 | DictionaryFormWordId int32 9 | DictionaryForm string 10 | ReadingForm string 11 | AUnitSplit []int32 12 | BUnitSplit []int32 13 | WordStructure []int32 14 | } 15 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/msnoigrs/gosudachi 2 | 3 | go 1.12 4 | 5 | require ( 6 | github.com/emirpasic/gods v1.12.0 7 | github.com/shurcooL/httpfs v0.0.0-20190707220628-8d4bc4ba7749 // indirect 8 | github.com/shurcooL/vfsgen v0.0.0-20181202132449-6a9ea43bcacd // indirect 9 | golang.org/x/sys v0.0.0-20190310054646-10058d7d4faa 10 | golang.org/x/text v0.3.0 11 | ) 12 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | github.com/emirpasic/gods v1.12.0 h1:QAUIPSaCu4G+POclxeqb3F+WPpdKqFGlw36+yOzGlrg= 2 | github.com/emirpasic/gods v1.12.0/go.mod h1:YfzfFFoVP/catgzJb4IKIqXjX78Ha8FMSDh3ymbK86o= 3 | github.com/shurcooL/httpfs v0.0.0-20190707220628-8d4bc4ba7749 h1:bUGsEnyNbVPw06Bs80sCeARAlK8lhwqGyi6UT8ymuGk= 4 | github.com/shurcooL/httpfs v0.0.0-20190707220628-8d4bc4ba7749/go.mod h1:ZY1cvUeJuFPAdZ/B6v7RHavJWZn2YPVFQ1OSXhCGOkg= 5 | github.com/shurcooL/vfsgen v0.0.0-20181202132449-6a9ea43bcacd h1:ug7PpSOB5RBPK1Kg6qskGBoP3Vnj/aNYFTznWvlkGo0= 6 | github.com/shurcooL/vfsgen v0.0.0-20181202132449-6a9ea43bcacd/go.mod h1:TrYk7fJVaAttu97ZZKrO9UbRa8izdowaMIZcxYMbVaw= 7 | golang.org/x/sys v0.0.0-20190310054646-10058d7d4faa h1:lqti/xP+yD/6zH5TqEwx2MilNIJY5Vbc6Qr8J3qyPIQ= 8 | golang.org/x/sys v0.0.0-20190310054646-10058d7d4faa/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 9 | golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg= 10 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 11 | -------------------------------------------------------------------------------- /inhibitconnectioncostplugin.go: -------------------------------------------------------------------------------- 1 | package gosudachi 2 | 3 | import ( 4 | "github.com/msnoigrs/gosudachi/dictionary" 5 | ) 6 | 7 | type InhibitConnectionPlugin struct { 8 | inhibitedPair []*[]int 9 | } 10 | 11 | func NewInhibitConnectionPlugin(inhibitedPair []*[]int) *InhibitConnectionPlugin { 12 | return &InhibitConnectionPlugin{ 13 | inhibitedPair: inhibitedPair, 14 | } 15 | } 16 | 17 | func (p *InhibitConnectionPlugin) GetConfigStruct() interface{} { 18 | return p 19 | } 20 | 21 | func (p *InhibitConnectionPlugin) SetUp(grammar *dictionary.Grammar) error { 22 | return nil 23 | } 24 | 25 | func (p *InhibitConnectionPlugin) Edit(grammar *dictionary.Grammar) error { 26 | for _, pair := range p.inhibitedPair { 27 | if len(*pair) < 2 { 28 | continue 29 | } 30 | InhibitConnection(grammar, int16((*pair)[0]), int16((*pair)[1])) 31 | } 32 | return nil 33 | } 34 | -------------------------------------------------------------------------------- /inputtext.go: -------------------------------------------------------------------------------- 1 | package gosudachi 2 | 3 | import ( 4 | "unicode/utf8" 5 | 6 | "github.com/msnoigrs/gosudachi/dictionary" 7 | ) 8 | 9 | type InputText struct { 10 | OriginalText string 11 | ModifiedText string 12 | Bytea []byte 13 | offsets []int 14 | byteIndexes []int 15 | charCategories []uint32 16 | charCategoryContinuities []int 17 | canBowList []bool 18 | } 19 | 20 | func NewInputText(originalText string, modifiedText string, bytea []byte, offsets []int, byteIndexes []int, charCategories []uint32, charCategoryContinuities []int, canBowList []bool) *InputText { 21 | return &InputText{ 22 | OriginalText: originalText, 23 | ModifiedText: modifiedText, 24 | Bytea: bytea, 25 | offsets: offsets, 26 | byteIndexes: byteIndexes, 27 | charCategories: charCategories, 28 | charCategoryContinuities: charCategoryContinuities, 29 | canBowList: canBowList, 30 | } 31 | } 32 | 33 | func (t *InputText) GetText() string { 34 | return t.ModifiedText 35 | } 36 | 37 | func (t *InputText) GetByteText() []byte { 38 | return t.Bytea 39 | } 40 | 41 | func (t *InputText) GetSubstring(begin int, end int) string { 42 | return string([]rune(t.ModifiedText)[t.byteIndexes[begin]:t.byteIndexes[end]]) 43 | } 44 | 45 | func (t *InputText) GetOffsetTextLength(index int) int { 46 | return t.byteIndexes[index] 47 | } 48 | 49 | func (t *InputText) GetOriginalIndex(index int) int { 50 | return t.offsets[index] 51 | } 52 | 53 | func (t *InputText) GetCharCategoryTypes(index int) uint32 { 54 | return t.charCategories[t.byteIndexes[index]] 55 | } 56 | 57 | func (t *InputText) GetCharCategoryTypesRange(begin int, end int) uint32 { 58 | if begin+t.charCategoryContinuities[begin] < end { 59 | return uint32(0) 60 | } 61 | b := t.byteIndexes[begin] 62 | e := t.byteIndexes[end] 63 | continuousCategory := t.charCategories[b] 64 | for i := b + 1; i < e; i++ { 65 | continuousCategory &= t.charCategories[i] 66 | } 67 | return continuousCategory 68 | } 69 | 70 | func (t *InputText) GetCharCategoryContinuousLength(index int) int { 71 | return t.charCategoryContinuities[index] 72 | } 73 | 74 | func (t *InputText) GetCodePointsOffsetLength(index int, codePointOffset int) int { 75 | length := 0 76 | target := t.byteIndexes[index] + codePointOffset 77 | for i := index; i < len(t.Bytea); i++ { 78 | if t.byteIndexes[i] >= target { 79 | return length 80 | } 81 | length++ 82 | } 83 | return length 84 | } 85 | 86 | func (t *InputText) CodePointCount(begin int, end int) int { 87 | return t.byteIndexes[end] - t.byteIndexes[begin] 88 | } 89 | 90 | func (t *InputText) CanBow(index int) bool { 91 | return t.IsCharAlignment(index) && t.canBowList[t.byteIndexes[index]] 92 | } 93 | 94 | func (t *InputText) IsCharAlignment(index int) bool { 95 | return (t.Bytea[index] & 0xC0) != 0x80 96 | } 97 | 98 | type InputTextBuilder struct { 99 | OriginalText string 100 | modifiedRunes []rune 101 | textOffsets []int 102 | grammar *dictionary.Grammar 103 | } 104 | 105 | func NewInputTextBuilder(text string, grammar *dictionary.Grammar) *InputTextBuilder { 106 | modifiedRunes := []rune(text) 107 | offsetslen := len(modifiedRunes) + 1 108 | textOffsets := make([]int, offsetslen, offsetslen) 109 | for i := 0; i < len(modifiedRunes); i++ { 110 | textOffsets[i] = i 111 | } 112 | textOffsets[len(modifiedRunes)] = len(modifiedRunes) 113 | return &InputTextBuilder{ 114 | OriginalText: text, 115 | modifiedRunes: modifiedRunes, 116 | textOffsets: textOffsets, 117 | grammar: grammar, 118 | } 119 | } 120 | 121 | func (builder *InputTextBuilder) GetText() []rune { 122 | ret := make([]rune, len(builder.modifiedRunes)) 123 | copy(ret, builder.modifiedRunes) 124 | return ret 125 | } 126 | 127 | func (builder *InputTextBuilder) Replace(begin int, end int, runes []rune) { 128 | rl := len(runes) 129 | tlen := end - begin 130 | 131 | offset := builder.textOffsets[begin] 132 | 133 | if rl < tlen { 134 | ol := len(builder.modifiedRunes) 135 | copy(builder.modifiedRunes[begin+rl:], builder.modifiedRunes[end:]) 136 | copy(builder.modifiedRunes[begin:], runes) 137 | builder.modifiedRunes = builder.modifiedRunes[:ol-tlen+rl] 138 | 139 | tolen := len(builder.textOffsets) 140 | copy(builder.textOffsets[begin+rl:], builder.textOffsets[end:]) 141 | builder.textOffsets = builder.textOffsets[:tolen-tlen+rl] 142 | } else if rl == tlen { 143 | copy(builder.modifiedRunes[begin:], runes) 144 | } else { 145 | builder.modifiedRunes = append(builder.modifiedRunes, make([]rune, rl-tlen)...) 146 | copy(builder.modifiedRunes[begin+rl:], builder.modifiedRunes[end:]) 147 | copy(builder.modifiedRunes[begin:], runes) 148 | 149 | builder.textOffsets = append(builder.textOffsets, make([]int, rl-tlen)...) 150 | copy(builder.textOffsets[begin+rl:], builder.textOffsets[end:]) 151 | } 152 | 153 | for i := 0; i < rl; i++ { 154 | builder.textOffsets[begin+i] = offset 155 | } 156 | } 157 | 158 | func (builder *InputTextBuilder) Build() *InputText { 159 | // getCharCategoryTypes 160 | runeCount := len(builder.modifiedRunes) 161 | charCategoryTypes := make([]uint32, runeCount, runeCount) 162 | for i := 0; i < runeCount; i++ { 163 | charCategoryTypes[i] = builder.grammar.CharCategory.GetCategoryTypes(builder.modifiedRunes[i]) 164 | } 165 | 166 | modifiedText := string(builder.modifiedRunes) 167 | p := []byte(modifiedText) 168 | keepp := p 169 | bytelength := len(p) 170 | size := bytelength + 1 171 | indexes := make([]int, size, size) 172 | offsets := make([]int, size, size) 173 | 174 | sizes := make([]int, runeCount, runeCount) 175 | 176 | pi := 0 177 | for i := 0; len(p) > 0; i++ { 178 | _, size := utf8.DecodeRune(p) 179 | sizes[i] = size 180 | for j := 0; j < size; j++ { 181 | indexes[pi] = i 182 | offsets[pi] = builder.textOffsets[i] 183 | pi++ 184 | } 185 | p = p[size:] 186 | } 187 | indexes[bytelength] = runeCount 188 | offsets[bytelength] = builder.textOffsets[len(builder.textOffsets)-1] 189 | 190 | // getCharCategoryContinuities 191 | charCategoryContinuities := make([]int, bytelength, bytelength) 192 | pi = 0 193 | for i := 0; i < runeCount; { 194 | next := i + getCharCategoryContinuousLength(charCategoryTypes, i) 195 | var length int 196 | for j := i; j < next; j++ { 197 | length += sizes[j] 198 | } 199 | for k := length; k > 0; k-- { 200 | charCategoryContinuities[pi] = k 201 | pi++ 202 | } 203 | i = next 204 | } 205 | 206 | // buildCanBowList 207 | canBowList := make([]bool, runeCount, runeCount) 208 | if runeCount > 0 { 209 | canBowList[0] = true 210 | for i := 1; i < runeCount; i++ { 211 | types := charCategoryTypes[i] 212 | if (types&dictionary.ALPHA == dictionary.ALPHA) || 213 | (types&dictionary.GREEK == dictionary.GREEK) || 214 | (types&dictionary.CYRILLIC == dictionary.CYRILLIC) { 215 | cc := charCategoryTypes[i-1] & types 216 | canBowList[i] = cc == 0 217 | continue 218 | } 219 | canBowList[i] = true 220 | } 221 | } 222 | 223 | return &InputText{ 224 | builder.OriginalText, 225 | modifiedText, 226 | keepp, 227 | offsets, 228 | indexes, 229 | charCategoryTypes, 230 | charCategoryContinuities, 231 | canBowList, 232 | } 233 | } 234 | 235 | func getCharCategoryContinuousLength(charCategories []uint32, offset int) int { 236 | continuousCategory := charCategories[offset] 237 | var length int 238 | for length = 1; length < len(charCategories)-offset; length++ { 239 | cc := continuousCategory & charCategories[offset+length] 240 | if cc == 0 { 241 | return length 242 | } 243 | } 244 | return length 245 | } 246 | -------------------------------------------------------------------------------- /internal/lnreader/lnreader.go: -------------------------------------------------------------------------------- 1 | package lnreader 2 | 3 | import ( 4 | "bufio" 5 | "io" 6 | ) 7 | 8 | type LineNumberReader struct { 9 | r *bufio.Reader 10 | rawBuffer []byte 11 | NumLine int 12 | } 13 | 14 | func NewLineNumberReader(r io.Reader) *LineNumberReader { 15 | return &LineNumberReader{ 16 | r: bufio.NewReader(r), 17 | } 18 | } 19 | 20 | func (r *LineNumberReader) ReadLine() ([]byte, error) { 21 | line, err := r.r.ReadSlice('\n') 22 | if err == bufio.ErrBufferFull { 23 | r.rawBuffer = append(r.rawBuffer[:0], line...) 24 | for err == bufio.ErrBufferFull { 25 | line, err = r.r.ReadSlice('\n') 26 | r.rawBuffer = append(r.rawBuffer, line...) 27 | } 28 | line = r.rawBuffer 29 | } 30 | if len(line) > 0 && err == io.EOF { 31 | err = nil 32 | } else if err == nil { 33 | n := len(line) 34 | if n >= 2 && line[n-2] == '\r' && line[n-1] == '\n' { 35 | line = line[:n-2] 36 | } else { 37 | line = line[:n-1] 38 | } 39 | } 40 | if err == nil { 41 | r.NumLine++ 42 | } 43 | return line, err 44 | } 45 | 46 | func IsSkipLine(l []byte) bool { 47 | for i, c := range l { 48 | if i == 0 && c == '#' { 49 | return true 50 | } else { 51 | if c != ' ' && c != '\n' && c != '\t' { 52 | return false 53 | } 54 | } 55 | } 56 | return true 57 | } 58 | 59 | func IsEmptyLine(l []byte) bool { 60 | for _, c := range l { 61 | if c != ' ' && c != '\n' && c != '\t' { 62 | return false 63 | } 64 | } 65 | return true 66 | } 67 | -------------------------------------------------------------------------------- /internal/mmap/mmap_unix.go: -------------------------------------------------------------------------------- 1 | // +build !windows 2 | 3 | package mmap 4 | 5 | import ( 6 | "os" 7 | "syscall" 8 | "unsafe" 9 | 10 | "golang.org/x/sys/unix" 11 | ) 12 | 13 | func Mmap(fd *os.File, writable bool, offset int64, size int64) ([]byte, error) { 14 | mtype := unix.PROT_READ 15 | if writable { 16 | mtype |= unix.PROT_WRITE 17 | } 18 | return unix.Mmap(int(fd.Fd()), offset, int(size), mtype, unix.MAP_SHARED) 19 | } 20 | 21 | func Munmap(b []byte) error { 22 | return unix.Munmap(b) 23 | } 24 | 25 | func Madvise(b []byte, readahead bool) error { 26 | flags := unix.MADV_NORMAL 27 | if !readahead { 28 | flags = unix.MADV_RANDOM 29 | } 30 | return madvise(b, flags) 31 | } 32 | 33 | // This is required because the unix package does not support the madvise system call on OS X 34 | func madvise(b []byte, advice int) (err error) { 35 | _, _, e1 := syscall.Syscall(syscall.SYS_MADVISE, uintptr(unsafe.Pointer(&b[0])), 36 | uintptr(len(b)), uintptr(advice)) 37 | if e1 != 0 { 38 | err = e1 39 | } 40 | return 41 | } 42 | -------------------------------------------------------------------------------- /internal/mmap/mmap_windows.go: -------------------------------------------------------------------------------- 1 | // +build windows 2 | 3 | package mmap 4 | 5 | import ( 6 | "fmt" 7 | "os" 8 | "syscall" 9 | "unsafe" 10 | ) 11 | 12 | func Mmap(fd *os.File, write bool, offset int64, size int64) ([]byte, error) { 13 | protect := syscall.PAGE_READONLY 14 | access := syscall.FILE_MAP_READ 15 | 16 | if write { 17 | protect = syscall.PAGE_READWRITE 18 | access = syscall.FILE_MAP_WRITE 19 | } 20 | fi, err := fd.Stat() 21 | if err != nil { 22 | return nil, err 23 | } 24 | 25 | if fi.Size() < size { 26 | if err := fd.Truncate(size); err != nil { 27 | return nil, fmt.Errorf("truncate: %s", err) 28 | } 29 | } 30 | 31 | maxsize := size + offset 32 | maxsizehi := uint32(maxsize >> 32) 33 | maxsizelo := uint32(maxsize & 0xffffffff) 34 | 35 | handle, err := syscall.CreateFileMapping(syscall.Handle(fd.Fd()), nil, 36 | uint32(protect), maxsizehi, maxsizelo, nil) 37 | if err != nil { 38 | return nil, os.NewSyscallError("CreateFileMapping", err) 39 | } 40 | 41 | offsethi := uint32(offset >> 32) 42 | offsetlo := uint32(offset & 0xffffffff) 43 | addr, err := syscall.MapViewOfFile(handle, uint32(access), offsethi, offsetlo, uintptr(size)) 44 | if addr == 0 { 45 | return nil, os.NewSyscallError("MapViewOfFile", err) 46 | } 47 | 48 | if err := syscall.CloseHandle(syscall.Handle(handle)); err != nil { 49 | return nil, os.NewSyscallError("CloseHandle", err) 50 | } 51 | 52 | // Slice memory layout 53 | // Copied this snippet from golang/sys package 54 | var sl = struct { 55 | addr uintptr 56 | len int 57 | cap int 58 | }{addr, int(size), int(size)} 59 | 60 | // Use unsafe to turn sl into a []byte 61 | data := *(*[]byte)(unsafe.Pointer(&sl)) 62 | 63 | return data, nil 64 | } 65 | 66 | func Munmap(b []byte) error { 67 | return syscall.UnmapViewOfFile(uintptr(unsafe.Pointer(&b[0]))) 68 | } 69 | 70 | func Madvise(b []byte, readahead bool) error { 71 | // Do Nothing. We don't care about this setting on Windows 72 | return nil 73 | } 74 | -------------------------------------------------------------------------------- /joinkatakanaoovplugin.go: -------------------------------------------------------------------------------- 1 | package gosudachi 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/msnoigrs/gosudachi/dictionary" 7 | ) 8 | 9 | type JoinKatakanaOovPluginConfig struct { 10 | OovPOS *[]string 11 | MinLength *int 12 | } 13 | 14 | type JoinKatakanaOovPlugin struct { 15 | config *JoinKatakanaOovPluginConfig 16 | oovPosId int16 17 | minLength int 18 | } 19 | 20 | func NewJoinKatakanaOovPlugin(config *JoinKatakanaOovPluginConfig) *JoinKatakanaOovPlugin { 21 | if config == nil { 22 | config = &JoinKatakanaOovPluginConfig{} 23 | } 24 | return &JoinKatakanaOovPlugin{ 25 | config: config, 26 | } 27 | } 28 | 29 | func (p *JoinKatakanaOovPlugin) GetConfigStruct() interface{} { 30 | if p.config == nil { 31 | p.config = &JoinKatakanaOovPluginConfig{} 32 | } 33 | return p.config 34 | } 35 | 36 | func (p *JoinKatakanaOovPlugin) SetUp(grammar *dictionary.Grammar) error { 37 | if p.config.OovPOS == nil || len(*p.config.OovPOS) == 0 { 38 | return fmt.Errorf("JoinKatakanaOovPlugin: oovPOS is not specified") 39 | } 40 | p.oovPosId = grammar.GetPartOfSpeechId(*p.config.OovPOS) 41 | if p.oovPosId < 0 { 42 | return fmt.Errorf("JoinKatakanaOovPlugin: oovPOS is invalid") 43 | } 44 | minLength := 1 45 | if p.config.MinLength != nil { 46 | minLength = *p.config.MinLength 47 | if minLength < 0 { 48 | return fmt.Errorf("JoinKatakanaOovPlugin: minLength is negative") 49 | } 50 | } 51 | p.minLength = minLength 52 | p.config = nil 53 | return nil 54 | } 55 | 56 | func isShorter(length int, text *InputText, node *LatticeNode) bool { 57 | return text.CodePointCount(node.Begin, node.End) < length 58 | } 59 | 60 | func isKatakanaNode(text *InputText, node *LatticeNode) bool { 61 | types := GetCharCategoryTypes(text, node) 62 | return (types & dictionary.KATAKANA) == dictionary.KATAKANA 63 | } 64 | 65 | func canOovBowNode(text *InputText, node *LatticeNode) bool { 66 | types := GetCharCategoryTypes(text, node) 67 | return types&dictionary.NOOOVBOW != dictionary.NOOOVBOW 68 | } 69 | 70 | func (p *JoinKatakanaOovPlugin) Rewrite(text *InputText, path *[]*LatticeNode, lattice *Lattice) error { 71 | for i := 0; i < len(*path); i++ { 72 | node := (*path)[i] 73 | if (node.IsOov || isShorter(p.minLength, text, node)) && 74 | isKatakanaNode(text, node) { 75 | begin := i - 1 76 | for ; begin >= 0; begin-- { 77 | if !isKatakanaNode(text, (*path)[begin]) { 78 | begin++ 79 | break 80 | } 81 | } 82 | if begin < 0 { 83 | begin = 0 84 | } 85 | end := i + 1 86 | for ; end < len(*path); end++ { 87 | if !isKatakanaNode(text, (*path)[end]) { 88 | break 89 | } 90 | } 91 | for begin != end && !canOovBowNode(text, (*path)[begin]) { 92 | begin++ 93 | } 94 | if end-begin > 1 { 95 | _, err := ConcatenateOov(path, begin, end, p.oovPosId, lattice) 96 | if err != nil { 97 | return fmt.Errorf("JoinKatakanaOovPlugin: %s", err) 98 | } 99 | i = begin + 1 100 | } 101 | } 102 | } 103 | return nil 104 | } 105 | -------------------------------------------------------------------------------- /joinnumericplugin.go: -------------------------------------------------------------------------------- 1 | package gosudachi 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/msnoigrs/gosudachi/dictionary" 7 | ) 8 | 9 | type JoinNumericPluginConfig struct { 10 | EnableNormalize *bool 11 | } 12 | 13 | type JoinNumericPlugin struct { 14 | config *JoinNumericPluginConfig 15 | enableNormalize bool 16 | numericPosId int16 17 | } 18 | 19 | func NewJoinNumericPlugin(config *JoinNumericPluginConfig) *JoinNumericPlugin { 20 | if config == nil { 21 | config = &JoinNumericPluginConfig{} 22 | } 23 | return &JoinNumericPlugin{ 24 | config: config, 25 | } 26 | } 27 | 28 | func (p *JoinNumericPlugin) GetConfigStruct() interface{} { 29 | if p.config == nil { 30 | p.config = &JoinNumericPluginConfig{} 31 | } 32 | return p.config 33 | } 34 | 35 | func (p *JoinNumericPlugin) SetUp(grammar *dictionary.Grammar) error { 36 | p.numericPosId = grammar.GetPartOfSpeechId(NumericPos) 37 | if p.config.EnableNormalize == nil { 38 | p.enableNormalize = true 39 | } else { 40 | p.enableNormalize = *p.config.EnableNormalize 41 | } 42 | p.config = nil 43 | return nil 44 | } 45 | 46 | func (p *JoinNumericPlugin) concatNodes(path *[]*LatticeNode, begin int, end int, lattice *Lattice, parser *numericParser) error { 47 | tpath := *path 48 | wi := tpath[begin].GetWordInfo() 49 | if wi.PosId != p.numericPosId { 50 | return nil 51 | } 52 | if p.enableNormalize { 53 | normalizedForm := parser.getNormalized() 54 | if end-begin > 1 || 55 | normalizedForm != wi.NormalizedForm { 56 | _, err := ConcatenateNodes(path, begin, end, lattice, normalizedForm) 57 | if err != nil { 58 | return err 59 | } 60 | } 61 | } else { 62 | if end-begin > 1 { 63 | _, err := ConcatenateNodes(path, begin, end, lattice, "") 64 | if err != nil { 65 | return err 66 | } 67 | } 68 | } 69 | return nil 70 | } 71 | 72 | func (p *JoinNumericPlugin) Rewrite(text *InputText, path *[]*LatticeNode, lattice *Lattice) error { 73 | beginIndex := -1 74 | commaAsDigit := true 75 | periodAsDigit := true 76 | parser := newNumericParser() 77 | 78 | for i := 0; i < len(*path); i++ { 79 | node := (*path)[i] 80 | types := GetCharCategoryTypes(text, node) 81 | wi := node.GetWordInfo() 82 | s := wi.NormalizedForm 83 | if (types&dictionary.NUMERIC) == dictionary.NUMERIC || 84 | (types&dictionary.KANJINUMERIC) == dictionary.KANJINUMERIC || 85 | (periodAsDigit && s == ".") || 86 | (commaAsDigit && s == ",") { 87 | 88 | if beginIndex < 0 { 89 | parser.clear() 90 | beginIndex = i 91 | } 92 | 93 | for _, c := range s { 94 | if !parser.append(c) { 95 | if beginIndex >= 0 { 96 | if parser.errorState == errComma { 97 | commaAsDigit = false 98 | i = beginIndex - 1 99 | } else if parser.errorState == errPoint { 100 | periodAsDigit = false 101 | i = beginIndex - 1 102 | } 103 | beginIndex = -1 104 | } 105 | break 106 | } 107 | } 108 | } else { 109 | if beginIndex >= 0 { 110 | if parser.done() { 111 | err := p.concatNodes(path, beginIndex, i, lattice, parser) 112 | if err != nil { 113 | return fmt.Errorf("JoinNumericPlugin: %s", err) 114 | } 115 | i = beginIndex + 1 116 | } else { 117 | wi := (*path)[i-1].GetWordInfo() 118 | ss := wi.NormalizedForm 119 | if (parser.errorState == errComma && ss == ",") || 120 | (parser.errorState == errPoint && ss == ".") { 121 | err := p.concatNodes(path, beginIndex, i-1, lattice, parser) 122 | if err != nil { 123 | return fmt.Errorf("JoinNumericPlugin: %s", err) 124 | } 125 | i = beginIndex + 2 126 | } 127 | } 128 | } 129 | beginIndex = -1 130 | if !commaAsDigit && s != "," { 131 | commaAsDigit = true 132 | } 133 | if !periodAsDigit && s != "." { 134 | periodAsDigit = true 135 | } 136 | } 137 | } 138 | 139 | if beginIndex >= 0 { 140 | if parser.done() { 141 | p.concatNodes(path, beginIndex, len(*path), lattice, parser) 142 | } else { 143 | wi := (*path)[len(*path)-1].GetWordInfo() 144 | ss := wi.NormalizedForm 145 | if (parser.errorState == errComma && ss == ",") || 146 | (parser.errorState == errPoint && ss == ".") { 147 | p.concatNodes(path, beginIndex, len(*path)-1, lattice, parser) 148 | } 149 | } 150 | } 151 | return nil 152 | } 153 | 154 | var NumericPos []string = []string{"名詞", "数詞", "*", "*", "*", "*"} 155 | -------------------------------------------------------------------------------- /lattice.go: -------------------------------------------------------------------------------- 1 | package gosudachi 2 | 3 | import ( 4 | "errors" 5 | "fmt" 6 | "io" 7 | "math" 8 | "strings" 9 | 10 | "github.com/msnoigrs/gosudachi/dictionary" 11 | ) 12 | 13 | const ( 14 | NullSurface = "(null)" 15 | ) 16 | 17 | var UndefinedWordInfo = &dictionary.WordInfo{ 18 | Surface: NullSurface, 19 | HeadwordLength: 0, 20 | PosId: -1, 21 | NormalizedForm: NullSurface, 22 | DictionaryFormWordId: -1, 23 | DictionaryForm: NullSurface, 24 | ReadingForm: NullSurface, 25 | } 26 | 27 | type LatticeNode struct { 28 | Begin int 29 | End int 30 | leftId int16 31 | rightId int16 32 | cost int16 33 | wordId int32 34 | totalCost int 35 | bestPreviousNode *LatticeNode 36 | isConnectedToBOS bool 37 | isDefined bool 38 | IsOov bool 39 | extraWordInfo *dictionary.WordInfo 40 | lexicon *dictionary.LexiconSet 41 | } 42 | 43 | func NewLatticeNode(lexicon *dictionary.LexiconSet, leftId int16, rightId int16, cost int16, wordId int32) *LatticeNode { 44 | return &LatticeNode{ 45 | lexicon: lexicon, 46 | leftId: leftId, 47 | rightId: rightId, 48 | cost: cost, 49 | wordId: wordId, 50 | isDefined: true, 51 | } 52 | } 53 | 54 | func (ln *LatticeNode) SetParameter(leftId int16, rightId int16, cost int16) { 55 | ln.leftId = leftId 56 | ln.rightId = rightId 57 | ln.cost = cost 58 | } 59 | 60 | func (ln *LatticeNode) GetBegin() int { 61 | return ln.Begin 62 | } 63 | 64 | func (ln *LatticeNode) GetEnd() int { 65 | return ln.End 66 | } 67 | 68 | func (ln *LatticeNode) SetRange(begin int, end int) { 69 | ln.Begin = begin 70 | ln.End = end 71 | } 72 | 73 | func (ln *LatticeNode) IsOOV() bool { 74 | return ln.IsOov 75 | } 76 | 77 | func (ln *LatticeNode) SetOOV() { 78 | ln.IsOov = true 79 | } 80 | 81 | func (ln *LatticeNode) GetWordInfo() *dictionary.WordInfo { 82 | if !ln.isDefined { 83 | return UndefinedWordInfo 84 | } 85 | if ln.extraWordInfo != nil { 86 | return ln.extraWordInfo 87 | } 88 | return ln.lexicon.GetWordInfo(ln.wordId) 89 | } 90 | 91 | func (ln *LatticeNode) SetWordInfo(wordInfo *dictionary.WordInfo) { 92 | ln.extraWordInfo = wordInfo 93 | ln.isDefined = true 94 | } 95 | 96 | func (ln *LatticeNode) GetPathCost() int { 97 | return int(ln.cost) 98 | } 99 | 100 | func (ln *LatticeNode) GetWordId() int { 101 | return int(uint32(ln.wordId)) 102 | } 103 | 104 | func (ln *LatticeNode) GetDictionaryId() int { 105 | if !ln.isDefined || ln.extraWordInfo != nil { 106 | return -1 107 | } 108 | return ln.lexicon.GetDictionaryId(ln.wordId) 109 | } 110 | 111 | func (ln *LatticeNode) String() string { 112 | var ( 113 | surface string 114 | pos int16 115 | ) 116 | 117 | wi := ln.GetWordInfo() 118 | surface = wi.Surface 119 | pos = wi.PosId 120 | 121 | return fmt.Sprintf("%d %d %s(%d) %d %d %d %d", ln.Begin, ln.End, surface, ln.wordId, pos, ln.leftId, ln.rightId, ln.cost) 122 | } 123 | 124 | type Lattice struct { 125 | endLists [][]*LatticeNode 126 | eosNode *LatticeNode 127 | grammar *dictionary.Grammar 128 | eosParams []int16 129 | } 130 | 131 | func NewLattice(grammar *dictionary.Grammar) *Lattice { 132 | bosNode := &LatticeNode{} 133 | bosParams := dictionary.BosParameter 134 | bosNode.SetParameter(bosParams[0], bosParams[1], bosParams[2]) 135 | bosNode.isConnectedToBOS = true 136 | endLists := make([][]*LatticeNode, 1) 137 | singletonList := make([]*LatticeNode, 1) 138 | singletonList[0] = bosNode 139 | endLists[0] = singletonList 140 | return &Lattice{ 141 | endLists: endLists, 142 | grammar: grammar, 143 | eosParams: dictionary.EosParameter, 144 | } 145 | } 146 | 147 | func (l *Lattice) resize(size int) { 148 | if size > len(l.endLists)-1 { 149 | l.expand(size) 150 | } 151 | l.eosNode = &LatticeNode{} 152 | l.eosNode.SetParameter(l.eosParams[0], l.eosParams[1], l.eosParams[2]) 153 | l.eosNode.Begin = size 154 | l.eosNode.End = size 155 | } 156 | 157 | func (l *Lattice) clear() { 158 | for i := 1; i < len(l.endLists); i++ { 159 | l.endLists[i] = l.endLists[i][:0] 160 | } 161 | } 162 | 163 | func (l *Lattice) expand(newSize int) { 164 | reallen := newSize + 1 165 | oldlen := len(l.endLists) 166 | if oldlen < reallen { 167 | l.endLists = append(l.endLists, make([][]*LatticeNode, reallen-oldlen)...) 168 | for i := oldlen; i < reallen; i++ { 169 | l.endLists[i] = []*LatticeNode{} 170 | } 171 | } 172 | } 173 | 174 | func (l *Lattice) GetNodesWithEnd(end int) []*LatticeNode { 175 | return l.endLists[end] 176 | } 177 | 178 | func (l *Lattice) GetNodes(begin int, end int) []*LatticeNode { 179 | ret := make([]*LatticeNode, 0) 180 | for _, n := range l.endLists[end] { 181 | if n.Begin == begin { 182 | ret = append(ret, n) 183 | } 184 | } 185 | return ret 186 | } 187 | 188 | func (l *Lattice) GetMinimumNode(begin int, end int) *LatticeNode { 189 | var ( 190 | ret *LatticeNode 191 | mincost int16 192 | ) 193 | for _, n := range l.endLists[end] { 194 | if n.Begin == begin { 195 | if ret == nil || mincost > n.cost { 196 | ret = n 197 | mincost = n.cost 198 | } 199 | } 200 | } 201 | return ret 202 | } 203 | 204 | func (l *Lattice) Insert(begin int, end int, node *LatticeNode) { 205 | l.endLists[end] = append(l.endLists[end], node) 206 | node.Begin = begin 207 | node.End = end 208 | 209 | l.connectNode(node) 210 | } 211 | 212 | func (l *Lattice) Remove(begin int, end int, node *LatticeNode) { 213 | t := l.endLists[end] 214 | for i, n := range t { 215 | if n == node { 216 | if len(t) > 1 { 217 | copy(t[i:], t[i+1:]) 218 | } 219 | t[len(t)-1] = nil 220 | l.endLists[end] = t[:len(t)-1] 221 | } 222 | } 223 | } 224 | 225 | func (l *Lattice) HasPreviousNode(index int) bool { 226 | return len(l.endLists[index]) > 0 227 | } 228 | 229 | func (l *Lattice) connectNode(rNode *LatticeNode) { 230 | begin := rNode.Begin 231 | rNode.totalCost = math.MaxInt32 232 | for _, lNode := range l.endLists[begin] { 233 | if !lNode.isConnectedToBOS { 234 | continue 235 | } 236 | connectCost := l.grammar.GetConnectCost(lNode.rightId, rNode.leftId) 237 | if connectCost == dictionary.InhibitedConnection { 238 | continue // this connection is not allowed 239 | } 240 | cost := lNode.totalCost + int(connectCost) 241 | if cost < rNode.totalCost { 242 | rNode.totalCost = cost 243 | rNode.bestPreviousNode = lNode 244 | } 245 | } 246 | rNode.isConnectedToBOS = rNode.bestPreviousNode != nil 247 | rNode.totalCost += int(rNode.cost) 248 | } 249 | 250 | func (l *Lattice) connectEosNode() { 251 | l.connectNode(l.eosNode) 252 | } 253 | 254 | func (l *Lattice) GetBestPath() ([]*LatticeNode, error) { 255 | if !l.eosNode.isConnectedToBOS { // EOS node 256 | return nil, errors.New("EOS isn't connected to BOS") 257 | } 258 | ret := make([]*LatticeNode, 0) 259 | for node := l.eosNode.bestPreviousNode; node != l.endLists[0][0]; node = node.bestPreviousNode { 260 | ret = append(ret, node) 261 | } 262 | 263 | if len(ret) > 1 { 264 | // reverse 265 | for i := len(ret)/2 - 1; i >= 0; i-- { 266 | opp := len(ret) - 1 - i 267 | ret[i], ret[opp] = ret[opp], ret[i] 268 | } 269 | } 270 | return ret, nil 271 | } 272 | 273 | func (l *Lattice) Dump(w io.Writer) { 274 | index := 0 275 | for i := len(l.endLists); i >= 0; i-- { 276 | var rNodes []*LatticeNode 277 | if i <= len(l.endLists)-1 { 278 | rNodes = l.endLists[i] 279 | } else { 280 | rNodes = []*LatticeNode{l.eosNode} 281 | } 282 | for _, rNode := range rNodes { 283 | var ( 284 | surface, pos string 285 | ) 286 | if !rNode.isDefined { 287 | surface = "(null)" 288 | pos = "BOS/EOS" 289 | } else { 290 | wi := rNode.GetWordInfo() 291 | surface = wi.Surface 292 | posId := wi.PosId 293 | if posId < 0 { 294 | pos = "(null)" 295 | } else { 296 | pos = strings.Join(l.grammar.GetPartOfSpeechString(posId), ",") 297 | } 298 | } 299 | 300 | fmt.Fprintf(w, "%d: %d %d %s(%d) %s %d %d %d: ", index, rNode.Begin, rNode.End, surface, rNode.wordId, pos, rNode.leftId, rNode.rightId, rNode.cost) 301 | index++ 302 | 303 | for _, lNode := range l.endLists[rNode.Begin] { 304 | cost := l.grammar.GetConnectCost(lNode.rightId, rNode.leftId) 305 | fmt.Fprintf(w, "%d ", cost) 306 | } 307 | fmt.Fprintln(w, "") 308 | } 309 | } 310 | } 311 | -------------------------------------------------------------------------------- /mecaboovproviderplugin.go: -------------------------------------------------------------------------------- 1 | package gosudachi 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | "os" 7 | "strconv" 8 | "strings" 9 | 10 | "github.com/msnoigrs/gosudachi/data" 11 | "github.com/msnoigrs/gosudachi/dictionary" 12 | "github.com/msnoigrs/gosudachi/internal/lnreader" 13 | ) 14 | 15 | type categoryInfo struct { 16 | catType uint32 17 | isInvoke bool 18 | isGroup bool 19 | length int 20 | } 21 | 22 | type oov struct { 23 | leftId int16 24 | rightId int16 25 | cost int16 26 | posId int16 27 | } 28 | 29 | type MeCabOovProviderPluginConfig struct { 30 | CharDef *string 31 | UnkDef *string 32 | } 33 | 34 | type MeCabOovProviderPlugin struct { 35 | config *MeCabOovProviderPluginConfig 36 | categories map[uint32]*categoryInfo 37 | oovList map[uint32]*[]*oov 38 | } 39 | 40 | func NewMeCabOovProviderPlugin(config *MeCabOovProviderPluginConfig) *MeCabOovProviderPlugin { 41 | if config == nil { 42 | config = &MeCabOovProviderPluginConfig{} 43 | } 44 | return &MeCabOovProviderPlugin{ 45 | config: config, 46 | categories: map[uint32]*categoryInfo{}, 47 | oovList: map[uint32]*[]*oov{}, 48 | } 49 | } 50 | 51 | func (p *MeCabOovProviderPlugin) GetConfigStruct() interface{} { 52 | if p.config == nil { 53 | p.config = &MeCabOovProviderPluginConfig{} 54 | } 55 | return p.config 56 | } 57 | 58 | func (p *MeCabOovProviderPlugin) SetUp(grammar *dictionary.Grammar) error { 59 | if p.config.CharDef == nil { 60 | zstr := "" 61 | p.config.CharDef = &zstr 62 | } 63 | if p.config.UnkDef == nil { 64 | zstr := "" 65 | p.config.UnkDef = &zstr 66 | } 67 | if p.categories == nil { 68 | p.categories = map[uint32]*categoryInfo{} 69 | } 70 | if p.oovList == nil { 71 | p.oovList = map[uint32]*[]*oov{} 72 | } 73 | err := p.readCharacterProperty(*p.config.CharDef) 74 | if err != nil { 75 | return fmt.Errorf("MeCabOovProviderPlugin: %s", err) 76 | } 77 | err = p.readOov(*p.config.UnkDef, grammar) 78 | if err != nil { 79 | return fmt.Errorf("MeCabOovProviderPlugin: %s", err) 80 | } 81 | p.config = nil 82 | return nil 83 | } 84 | 85 | func (p *MeCabOovProviderPlugin) ProvideOOV(inputText *InputText, offset int, hasOtherWords bool) ([]*LatticeNode, error) { 86 | nodes := []*LatticeNode{} 87 | length := inputText.GetCharCategoryContinuousLength(offset) 88 | if length > 0 { 89 | catTypes := inputText.GetCharCategoryTypes(offset) 90 | for t := dictionary.DEFAULT; t <= dictionary.NOOOVBOW; t *= 2 { 91 | if (catTypes & t) != t { 92 | continue 93 | } 94 | cinfo, ok := p.categories[t] 95 | if !ok { 96 | continue 97 | } 98 | llength := length 99 | oovs, ok := p.oovList[t] 100 | if !ok { 101 | continue 102 | } 103 | if cinfo.isGroup && (cinfo.isInvoke || !hasOtherWords) { 104 | s := inputText.GetSubstring(offset, offset+length) 105 | for _, oov := range *oovs { 106 | nodes = append(nodes, p.getOovNode(s, oov, length)) 107 | } 108 | llength -= 1 109 | } 110 | if cinfo.isInvoke || !hasOtherWords { 111 | for i := 1; i <= cinfo.length; i++ { 112 | sublength := inputText.GetCodePointsOffsetLength(offset, i) 113 | if sublength > llength { 114 | break 115 | } 116 | s := inputText.GetSubstring(offset, offset+sublength) 117 | for _, oov := range *oovs { 118 | nodes = append(nodes, p.getOovNode(s, oov, sublength)) 119 | } 120 | } 121 | } 122 | } 123 | } 124 | return nodes, nil 125 | } 126 | 127 | func (p *MeCabOovProviderPlugin) getOovNode(text string, oov *oov, length int) *LatticeNode { 128 | node := CreateNodeOfOOV() 129 | node.SetParameter(oov.leftId, oov.rightId, oov.cost) 130 | wi := &dictionary.WordInfo{ 131 | Surface: text, 132 | HeadwordLength: int16(length), 133 | PosId: oov.posId, 134 | NormalizedForm: text, 135 | DictionaryForm: text, 136 | ReadingForm: "", 137 | } 138 | node.SetWordInfo(wi) 139 | return node 140 | } 141 | 142 | func (p *MeCabOovProviderPlugin) readCharacterProperty(charDef string) error { 143 | var charDefReader io.Reader 144 | if charDef != "" { 145 | charDefFd, err := os.OpenFile(charDef, os.O_RDONLY, 0644) 146 | if err != nil { 147 | return fmt.Errorf("%s: %s", err, charDef) 148 | } 149 | defer charDefFd.Close() 150 | charDefReader = charDefFd 151 | } else { 152 | charDefF, err := data.Assets.Open("char.def") 153 | if err != nil { 154 | return fmt.Errorf("%s: (data.Assets)char.def", err) 155 | } 156 | defer charDefF.Close() 157 | charDefReader = charDefF 158 | } 159 | 160 | r := lnreader.NewLineNumberReader(charDefReader) 161 | for { 162 | line, err := r.ReadLine() 163 | if err == io.EOF { 164 | break 165 | } 166 | if err != nil { 167 | return err 168 | } 169 | if lnreader.IsSkipLine(line) { 170 | continue 171 | } 172 | if len(line) > 2 && line[0] == '0' && line[1] == 'x' { 173 | continue 174 | } 175 | cols := strings.Fields(string(line)) 176 | if len(cols) < 4 { 177 | return fmt.Errorf("char.def: invalid format at line %d", r.NumLine) 178 | } 179 | catType, err := dictionary.GetCategoryType(cols[0]) 180 | if err != nil { 181 | return fmt.Errorf("char.def: %s is invalid type at line %d", cols[0], r.NumLine) 182 | } 183 | _, ok := p.categories[catType] 184 | if ok { 185 | return fmt.Errorf("char.def: %s is already defined at line %d", cols[0], r.NumLine) 186 | } 187 | l, err := strconv.Atoi(cols[3]) 188 | if err != nil { 189 | return fmt.Errorf("char.def: %s is invalid number at line %d", cols[3], r.NumLine) 190 | } 191 | catinfo := &categoryInfo{ 192 | catType: catType, 193 | isInvoke: cols[1] != "0", 194 | isGroup: cols[2] != "0", 195 | length: l, 196 | } 197 | p.categories[catType] = catinfo 198 | } 199 | return nil 200 | } 201 | 202 | func (p *MeCabOovProviderPlugin) readOov(unkDef string, grammar *dictionary.Grammar) error { 203 | var unkDefReader io.Reader 204 | if unkDef != "" { 205 | unkDefFd, err := os.OpenFile(unkDef, os.O_RDONLY, 0644) 206 | if err != nil { 207 | return err 208 | } 209 | defer unkDefFd.Close() 210 | unkDefReader = unkDefFd 211 | } else { 212 | unkDefF, err := data.Assets.Open("unk.def") 213 | if err != nil { 214 | return err 215 | } 216 | defer unkDefF.Close() 217 | unkDefReader = unkDefF 218 | } 219 | 220 | r := lnreader.NewLineNumberReader(unkDefReader) 221 | for { 222 | line, err := r.ReadLine() 223 | if err == io.EOF { 224 | break 225 | } 226 | if err != nil { 227 | return err 228 | } 229 | cols := strings.Split(string(line), ",") 230 | if len(cols) < 10 { 231 | return fmt.Errorf("unk.def: invalid format at line %d", r.NumLine) 232 | } 233 | catType, err := dictionary.GetCategoryType(cols[0]) 234 | if err != nil { 235 | return fmt.Errorf("unk.def: %s is invalid type at line %d", cols[0], r.NumLine) 236 | } 237 | _, ok := p.categories[catType] 238 | if !ok { 239 | return fmt.Errorf("unk.def: %s is undefined at line %d", cols[0], r.NumLine) 240 | } 241 | 242 | leftId, err := strconv.ParseInt(cols[1], 10, 16) 243 | if err != nil { 244 | return fmt.Errorf("unk.def: %s is invalid number at line %d", cols[1], r.NumLine) 245 | } 246 | rightId, err := strconv.ParseInt(cols[2], 10, 16) 247 | if err != nil { 248 | return fmt.Errorf("unk.def: %s is invalid number at line %d", cols[2], r.NumLine) 249 | } 250 | cost, err := strconv.ParseInt(cols[3], 10, 16) 251 | if err != nil { 252 | return fmt.Errorf("unk.def: %s is invalid number at line %d", cols[3], r.NumLine) 253 | } 254 | pos := []string{cols[4], cols[5], cols[6], cols[7], cols[8], cols[9]} 255 | posId := grammar.GetPartOfSpeechId(pos) 256 | if posId == -1 { 257 | return fmt.Errorf("unk.def: unknown Part Of Speech at line %d", r.NumLine) 258 | } 259 | poov := &oov{ 260 | leftId: int16(leftId), 261 | rightId: int16(rightId), 262 | cost: int16(cost), 263 | posId: posId, 264 | } 265 | 266 | l, ok := p.oovList[catType] 267 | if !ok { 268 | ll := []*oov{} 269 | l = &ll 270 | p.oovList[catType] = l 271 | } 272 | *l = append(*l, poov) 273 | } 274 | return nil 275 | } 276 | -------------------------------------------------------------------------------- /morpheme.go: -------------------------------------------------------------------------------- 1 | package gosudachi 2 | 3 | import ( 4 | "github.com/msnoigrs/gosudachi/dictionary" 5 | ) 6 | 7 | type Morpheme struct { 8 | list *MorphemeList 9 | index int 10 | wordInfo *dictionary.WordInfo 11 | } 12 | 13 | func newMorpheme(list *MorphemeList, index int) *Morpheme { 14 | return &Morpheme{ 15 | list: list, 16 | index: index, 17 | } 18 | } 19 | 20 | func (m *Morpheme) Begin() int { 21 | return m.list.GetBegin(m.index) 22 | } 23 | 24 | func (m *Morpheme) End() int { 25 | return m.list.GetEnd(m.index) 26 | } 27 | 28 | func (m *Morpheme) Surface() string { 29 | return m.list.GetSurface(m.index) 30 | } 31 | 32 | func (m *Morpheme) PartOfSpeech() []string { 33 | wi := m.GetWordInfo() 34 | return m.list.grammar.GetPartOfSpeechString(wi.PosId) 35 | } 36 | 37 | func (m *Morpheme) DictionaryForm() string { 38 | wi := m.GetWordInfo() 39 | return wi.DictionaryForm 40 | } 41 | 42 | func (m *Morpheme) NormalizedForm() string { 43 | wi := m.GetWordInfo() 44 | return wi.NormalizedForm 45 | } 46 | 47 | func (m *Morpheme) ReadingForm() string { 48 | wi := m.GetWordInfo() 49 | return wi.ReadingForm 50 | } 51 | 52 | func (m *Morpheme) Split(mode string) *MorphemeList { 53 | wi := m.GetWordInfo() 54 | return m.list.Split(mode, m.index, wi) 55 | } 56 | 57 | func (m *Morpheme) IsOOV() bool { 58 | return m.list.IsOOV(m.index) 59 | } 60 | 61 | func (m *Morpheme) GetWordId() int { 62 | return m.list.GetWordId(m.index) 63 | } 64 | 65 | func (m *Morpheme) GetDictionaryId() int { 66 | return m.list.GetDictionaryId(m.index) 67 | } 68 | 69 | func (m *Morpheme) GetWordInfo() *dictionary.WordInfo { 70 | if m.wordInfo == nil { 71 | wordInfo := m.list.GetWordInfo(m.index) 72 | m.wordInfo = wordInfo 73 | } 74 | return m.wordInfo 75 | } 76 | 77 | type MorphemeList struct { 78 | inputText *InputText 79 | grammar *dictionary.Grammar 80 | lexicon *dictionary.LexiconSet 81 | path []*LatticeNode 82 | } 83 | 84 | func NewMorphemeList(inputText *InputText, grammar *dictionary.Grammar, lexicon *dictionary.LexiconSet, path []*LatticeNode) *MorphemeList { 85 | return &MorphemeList{ 86 | inputText: inputText, 87 | grammar: grammar, 88 | lexicon: lexicon, 89 | path: path, 90 | } 91 | } 92 | 93 | func (l *MorphemeList) Length() int { 94 | return len(l.path) 95 | } 96 | 97 | func (l *MorphemeList) Get(index int) *Morpheme { 98 | return newMorpheme(l, index) 99 | } 100 | 101 | func (l *MorphemeList) GetBegin(index int) int { 102 | return l.inputText.GetOriginalIndex(l.path[index].Begin) 103 | } 104 | 105 | func (l *MorphemeList) GetEnd(index int) int { 106 | return l.inputText.GetOriginalIndex(l.path[index].End) 107 | } 108 | 109 | func (l *MorphemeList) GetSurface(index int) string { 110 | begin := l.GetBegin(index) 111 | end := l.GetEnd(index) 112 | return string([]rune(l.inputText.OriginalText)[begin:end]) 113 | } 114 | 115 | func (l *MorphemeList) GetWordInfo(index int) *dictionary.WordInfo { 116 | return l.path[index].GetWordInfo() 117 | } 118 | 119 | func (l *MorphemeList) Split(mode string, index int, wi *dictionary.WordInfo) *MorphemeList { 120 | var wordIds []int32 121 | switch mode { 122 | case "A": 123 | wordIds = wi.AUnitSplit 124 | case "B": 125 | wordIds = wi.BUnitSplit 126 | default: 127 | return NewMorphemeList(l.inputText, l.grammar, l.lexicon, []*LatticeNode{l.path[index]}) 128 | } 129 | if len(wordIds) == 0 || len(wordIds) == 1 { 130 | return NewMorphemeList(l.inputText, l.grammar, l.lexicon, []*LatticeNode{l.path[index]}) 131 | } 132 | 133 | offset := l.path[index].Begin 134 | nodes := make([]*LatticeNode, len(wordIds), len(wordIds)) 135 | for i, wid := range wordIds { 136 | n := NewLatticeNode(l.lexicon, 0, 0, 0, wid) 137 | n.Begin = offset 138 | wi := n.GetWordInfo() 139 | offset += int(wi.HeadwordLength) 140 | n.End = offset 141 | nodes[i] = n 142 | } 143 | 144 | return NewMorphemeList(l.inputText, l.grammar, l.lexicon, nodes) 145 | } 146 | 147 | func (l *MorphemeList) IsOOV(index int) bool { 148 | return l.path[index].IsOOV() 149 | } 150 | 151 | func (l *MorphemeList) GetWordId(index int) int { 152 | return l.path[index].GetWordId() 153 | } 154 | 155 | func (l *MorphemeList) GetDictionaryId(index int) int { 156 | return l.path[index].GetDictionaryId() 157 | } 158 | 159 | func (l *MorphemeList) GetInternalCost() int { 160 | return l.path[len(l.path)-1].GetPathCost() - l.path[0].GetPathCost() 161 | } 162 | -------------------------------------------------------------------------------- /numericparser.go: -------------------------------------------------------------------------------- 1 | package gosudachi 2 | 3 | type errState int 4 | 5 | const ( 6 | errNone errState = iota 7 | errPoint 8 | errComma 9 | errOther 10 | ) 11 | 12 | type numericParser struct { 13 | digitLength int 14 | isFirstDigit bool 15 | hasComma bool 16 | hasHangingPoint bool 17 | errorState errState 18 | total *stringNumber 19 | subtotal *stringNumber 20 | tmp *stringNumber 21 | } 22 | 23 | func newNumericParser() *numericParser { 24 | return &numericParser{ 25 | isFirstDigit: true, 26 | total: newStringNumber(), 27 | subtotal: newStringNumber(), 28 | tmp: newStringNumber(), 29 | } 30 | } 31 | 32 | type stringNumber struct { 33 | significand []rune 34 | scale int 35 | point int 36 | IsAllZero bool 37 | } 38 | 39 | func newStringNumber() *stringNumber { 40 | return &stringNumber{ 41 | point: -1, 42 | IsAllZero: true, 43 | } 44 | } 45 | 46 | func (n *stringNumber) clear() { 47 | n.significand = n.significand[:0] 48 | n.scale = 0 49 | n.point = -1 50 | n.IsAllZero = true 51 | } 52 | 53 | func (n *stringNumber) append(i int) { 54 | if i != 0 { 55 | n.IsAllZero = false 56 | } 57 | n.significand = append(n.significand, intToRune(i)) 58 | } 59 | 60 | func (n *stringNumber) shiftScale(i int) { 61 | if len(n.significand) == 0 { 62 | n.significand = append(n.significand, '1') 63 | } 64 | n.scale += i 65 | } 66 | 67 | func (n *stringNumber) add(t *stringNumber) bool { 68 | if len(t.significand) == 0 { 69 | return true 70 | } 71 | 72 | if len(n.significand) == 0 { 73 | n.significand = append(n.significand, t.significand...) 74 | n.scale = t.scale 75 | n.point = t.point 76 | return true 77 | } 78 | 79 | l := t.intLength() 80 | if n.scale >= l { 81 | n.fillZero(n.scale - l) 82 | if t.point >= 0 { 83 | n.point = len(n.significand) + t.point 84 | } 85 | _ = t.String() 86 | n.significand = append(n.significand, t.significand...) 87 | n.scale = t.scale 88 | return true 89 | } 90 | 91 | return false 92 | } 93 | 94 | func (n *stringNumber) setPoint() bool { 95 | if n.scale == 0 && n.point < 0 { 96 | n.point = len(n.significand) 97 | return true 98 | } 99 | return false 100 | } 101 | 102 | func (n *stringNumber) intLength() int { 103 | n.normalizeScale() 104 | if n.point >= 0 { 105 | return n.point 106 | } 107 | return len(n.significand) + n.scale 108 | } 109 | 110 | func (n *stringNumber) isZero() bool { 111 | return len(n.significand) == 0 112 | } 113 | 114 | func (n *stringNumber) String() string { 115 | if len(n.significand) == 0 { 116 | return "0" 117 | } 118 | 119 | n.normalizeScale() 120 | if n.scale > 0 { 121 | n.fillZero(n.scale) 122 | } else if n.point >= 0 { 123 | if n.point == 0 { 124 | n.significand = append(n.significand, []rune{0, 0}...) 125 | copy(n.significand[2:], n.significand[:len(n.significand)-2]) 126 | n.significand[0] = '0' 127 | n.significand[1] = '.' 128 | } else { 129 | n.significand = append(n.significand, rune(0)) 130 | copy(n.significand[n.point+1:], n.significand[n.point:]) 131 | n.significand[n.point] = '.' 132 | } 133 | i := len(n.significand) - 1 134 | j := 0 135 | for i >= 0 && n.significand[i] == '0' { 136 | i-- 137 | j++ 138 | } 139 | if n.significand[i] == '.' { 140 | i-- 141 | j++ 142 | } 143 | if j > 0 { 144 | n.significand = n.significand[:i+1] 145 | } 146 | } 147 | 148 | return string(n.significand) 149 | } 150 | 151 | func (n *stringNumber) normalizeScale() { 152 | if n.point >= 0 { 153 | nScale := len(n.significand) - n.point 154 | if nScale > n.scale { 155 | n.point += n.scale 156 | n.scale = 0 157 | } else { 158 | n.scale -= nScale 159 | n.point = -1 160 | } 161 | } 162 | } 163 | 164 | func (n *stringNumber) fillZero(length int) { 165 | for i := 0; i < length; i++ { 166 | n.significand = append(n.significand, '0') 167 | } 168 | } 169 | 170 | func intToRune(i int) rune { 171 | return rune(int32('0') + int32(i)) 172 | } 173 | 174 | func (p *numericParser) clear() { 175 | p.digitLength = 0 176 | p.isFirstDigit = true 177 | p.hasComma = false 178 | p.hasHangingPoint = false 179 | p.errorState = errNone 180 | p.total.clear() 181 | p.subtotal.clear() 182 | p.tmp.clear() 183 | } 184 | 185 | func (p *numericParser) checkComma() bool { 186 | if p.isFirstDigit { 187 | return false 188 | } else if !p.hasComma { 189 | return p.digitLength <= 3 && !p.tmp.isZero() && !p.tmp.IsAllZero 190 | } else { 191 | return p.digitLength == 3 192 | } 193 | } 194 | 195 | func (p *numericParser) append(c rune) bool { 196 | if c == '.' { 197 | p.hasHangingPoint = true 198 | if p.isFirstDigit { 199 | p.errorState = errPoint 200 | return false 201 | } else if p.hasComma && !p.checkComma() { 202 | p.errorState = errComma 203 | return false 204 | 205 | } else if p.tmp.setPoint() { 206 | p.errorState = errPoint 207 | return false 208 | } 209 | p.hasComma = false 210 | return true 211 | } else if c == ',' { 212 | if !p.checkComma() { 213 | p.errorState = errComma 214 | return false 215 | } 216 | p.hasComma = true 217 | p.digitLength = 0 218 | return true 219 | } 220 | 221 | n, ok := runeToNumMap[c] 222 | if !ok { 223 | return false 224 | } 225 | if n < 0 && n >= -3 { // isSmallUnit 226 | p.tmp.shiftScale(-n) 227 | if !p.subtotal.add(p.tmp) { 228 | return false 229 | } 230 | p.tmp.clear() 231 | p.isFirstDigit = true 232 | p.digitLength = 0 233 | p.hasComma = false 234 | } else if n <= -4 { // isLargeUnit 235 | if !p.subtotal.add(p.tmp) || p.subtotal.isZero() { 236 | return false 237 | } 238 | p.subtotal.shiftScale(-n) 239 | if !p.total.add(p.subtotal) { 240 | return false 241 | } 242 | p.subtotal.clear() 243 | p.tmp.clear() 244 | p.isFirstDigit = true 245 | p.digitLength = 0 246 | p.hasComma = false 247 | } else { 248 | p.tmp.append(n) 249 | p.isFirstDigit = false 250 | p.digitLength++ 251 | p.hasHangingPoint = false 252 | } 253 | 254 | return true 255 | } 256 | 257 | func (p *numericParser) done() bool { 258 | ret := p.subtotal.add(p.tmp) && p.total.add(p.subtotal) 259 | if p.hasHangingPoint { 260 | p.errorState = errPoint 261 | return false 262 | } else if p.hasComma && p.digitLength != 3 { 263 | p.errorState = errComma 264 | return false 265 | } 266 | return ret 267 | } 268 | 269 | func (p *numericParser) getNormalized() string { 270 | return p.total.String() 271 | } 272 | 273 | var runeToNumMap = map[rune]int{ 274 | '0': 0, 275 | '1': 1, 276 | '2': 2, 277 | '3': 3, 278 | '4': 4, 279 | '5': 5, 280 | '6': 6, 281 | '7': 7, 282 | '8': 8, 283 | '9': 9, 284 | '〇': 0, 285 | '一': 1, 286 | '二': 2, 287 | '三': 3, 288 | '四': 4, 289 | '五': 5, 290 | '六': 6, 291 | '七': 7, 292 | '八': 8, 293 | '九': 9, 294 | '十': -1, 295 | '百': -2, 296 | '千': -3, 297 | '万': -4, 298 | '億': -8, 299 | '兆': -12, 300 | } 301 | -------------------------------------------------------------------------------- /plugin.go: -------------------------------------------------------------------------------- 1 | package gosudachi 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/msnoigrs/gosudachi/dictionary" 7 | ) 8 | 9 | type Settings interface { 10 | GetBaseConfig() *BaseConfig 11 | } 12 | 13 | type BaseConfig struct { 14 | SystemDict string 15 | CharacterDefinitionFile string 16 | UserDict []string 17 | Utf16String bool 18 | } 19 | 20 | type PluginMaker interface { 21 | GetInputTextPluginArray(f MakeInputTextPluginFunc) ([]InputTextPlugin, error) 22 | GetOovProviderPluginArray(f MakeOovProviderPluginFunc) ([]OovProviderPlugin, error) 23 | GetPathRewritePluginArray(f MakePathRewritePluginFunc) ([]PathRewritePlugin, error) 24 | GetEditConnectionCostPluginArray(f MakeEditConnectionCostPluginFunc) ([]EditConnectionCostPlugin, error) 25 | } 26 | 27 | type Plugin interface { 28 | GetConfigStruct() interface{} 29 | } 30 | 31 | type MakeInputTextPluginFunc func(n string) InputTextPlugin 32 | type MakeEditConnectionCostPluginFunc func(n string) EditConnectionCostPlugin 33 | type MakeOovProviderPluginFunc func(n string) OovProviderPlugin 34 | type MakePathRewritePluginFunc func(n string) PathRewritePlugin 35 | 36 | func DefMakeInputTextPlugin(k string) InputTextPlugin { 37 | switch k { 38 | case "DefaultInputTextPlugin", "com.worksap.nlp.sudachi.DefaultInputTextPlugin": 39 | return NewDefaultInputTextPlugin(nil) 40 | case "ProlongedSoundMarkInputTextPlugin", "com.worksap.nlp.sudachi.ProlongedSoundMarkInputTextPlugin": 41 | return NewProlongedSoundMarkInputTextPlugin(nil) 42 | } 43 | return nil 44 | } 45 | 46 | func DefMakeEditConnectionCostPlugin(k string) EditConnectionCostPlugin { 47 | switch k { 48 | case "InhibitConnectionPlugin", "com.worksap.nlp.sudachi.InhibitConnectionPlugin": 49 | return NewInhibitConnectionPlugin([]*[]int{}) 50 | } 51 | return nil 52 | } 53 | 54 | func DefMakeOovProviderPlugin(k string) OovProviderPlugin { 55 | switch k { 56 | case "MeCabOovProviderPlugin", "com.worksap.nlp.sudachi.MeCabOovProviderPlugin": 57 | return NewMeCabOovProviderPlugin(nil) 58 | case "SimpleOovProviderPlugin", "com.worksap.nlp.sudachi.SimpleOovProviderPlugin": 59 | return NewSimpleOovProviderPlugin(nil) 60 | } 61 | return nil 62 | } 63 | 64 | func DefMakePathRewritePlugin(k string) PathRewritePlugin { 65 | switch k { 66 | case "JoinNumericPlugin", "com.worksap.nlp.sudachi.JoinNumericPlugin": 67 | return NewJoinNumericPlugin(nil) 68 | case "JoinKatakanaOovPlugin", "com.worksap.nlp.sudachi.JoinKatakanaOovPlugin": 69 | return NewJoinKatakanaOovPlugin(nil) 70 | } 71 | return nil 72 | } 73 | 74 | type EditConnectionCostPlugin interface { 75 | Plugin 76 | SetUp(grammar *dictionary.Grammar) error 77 | Edit(grammar *dictionary.Grammar) error 78 | } 79 | 80 | func InhibitConnection(grammar *dictionary.Grammar, leftId int16, rightId int16) { 81 | grammar.SetConnectCost(leftId, rightId, dictionary.InhibitedConnection) 82 | } 83 | 84 | type PathRewritePlugin interface { 85 | Plugin 86 | SetUp(grammar *dictionary.Grammar) error 87 | Rewrite(text *InputText, path *[]*LatticeNode, lattice *Lattice) error 88 | } 89 | 90 | func ConcatenateNodes(path *[]*LatticeNode, begin int, end int, lattice *Lattice, normalizedForm string) (*LatticeNode, error) { 91 | if begin >= end { 92 | return nil, fmt.Errorf("begin >= end") 93 | } 94 | tpath := *path 95 | b := tpath[begin].GetBegin() 96 | e := tpath[end-1].GetEnd() 97 | bwi := tpath[begin].GetWordInfo() 98 | posId := bwi.PosId 99 | var ( 100 | surfaceLen int 101 | normalizedFormLen int 102 | dictionaryFormLen int 103 | readingFormLen int 104 | length int16 105 | ) 106 | wilist := make([]*dictionary.WordInfo, 0, end - begin) 107 | for i := begin; i < end; i++ { 108 | info := tpath[i].GetWordInfo() 109 | wilist = append(wilist, info) 110 | surfaceLen += len(info.Surface) 111 | length += info.HeadwordLength 112 | if normalizedForm == "" { 113 | normalizedFormLen += len(info.NormalizedForm) 114 | } 115 | dictionaryFormLen += len(info.DictionaryForm) 116 | readingFormLen += len(info.ReadingForm) 117 | } 118 | csurface := make([]byte, 0, surfaceLen) 119 | var cnormalizedForm []byte 120 | if normalizedForm == "" { 121 | cnormalizedForm = make([]byte, 0, normalizedFormLen) 122 | } 123 | cdictionaryForm := make([]byte, 0, dictionaryFormLen) 124 | creadingForm := make([]byte, 0, readingFormLen) 125 | for _, wi := range wilist { 126 | csurface = append(csurface, []byte(wi.Surface)...) 127 | if normalizedForm == "" { 128 | cnormalizedForm = append(cnormalizedForm, []byte(wi.NormalizedForm)...) 129 | } 130 | cdictionaryForm = append(cdictionaryForm, []byte(wi.DictionaryForm)...) 131 | creadingForm = append(creadingForm, []byte(wi.ReadingForm)...) 132 | } 133 | if normalizedForm == "" { 134 | normalizedForm = string(cnormalizedForm) 135 | } 136 | wi := &dictionary.WordInfo{ 137 | Surface: string(csurface), 138 | HeadwordLength: length, 139 | PosId: posId, 140 | NormalizedForm: normalizedForm, 141 | DictionaryForm: string(cdictionaryForm), 142 | ReadingForm: string(creadingForm), 143 | } 144 | 145 | node := &LatticeNode{} 146 | node.SetRange(b, e) 147 | node.SetWordInfo(wi) 148 | *path = replaceNode(tpath, begin, end, node) 149 | return node, nil 150 | } 151 | 152 | func ConcatenateOov(path *[]*LatticeNode, begin int, end int, posId int16, lattice *Lattice) (*LatticeNode, error) { 153 | if begin >= end { 154 | return nil, fmt.Errorf("begin >= end") 155 | } 156 | tpath := *path 157 | b := tpath[begin].GetBegin() 158 | e := tpath[end-1].GetEnd() 159 | 160 | n := lattice.GetMinimumNode(b, e) 161 | if n != nil { 162 | *path = replaceNode(tpath, begin, end, n) 163 | return n, nil 164 | } 165 | 166 | var ( 167 | surfaceLen int 168 | length int16 169 | ) 170 | wilist := make([]*dictionary.WordInfo, 0, end - begin) 171 | for i := begin; i < end; i++ { 172 | info := tpath[i].GetWordInfo() 173 | wilist = append(wilist, info) 174 | surfaceLen += len(info.Surface) 175 | length += info.HeadwordLength 176 | } 177 | csurface := make([]byte, 0, surfaceLen) 178 | for _, wi := range wilist { 179 | csurface = append(csurface, []byte(wi.Surface)...) 180 | } 181 | s := string(csurface) 182 | wi := &dictionary.WordInfo{ 183 | Surface: s, 184 | HeadwordLength: length, 185 | PosId: posId, 186 | NormalizedForm: s, 187 | DictionaryForm: s, 188 | ReadingForm: "", 189 | } 190 | 191 | node := &LatticeNode{} 192 | node.SetRange(b, e) 193 | node.SetWordInfo(wi) 194 | node.IsOov = true 195 | *path = replaceNode(tpath, begin, end, node) 196 | return node, nil 197 | } 198 | 199 | func GetCharCategoryTypes(text *InputText, node *LatticeNode) uint32 { 200 | return text.GetCharCategoryTypesRange(node.Begin, node.End) 201 | } 202 | 203 | func replaceNode(path []*LatticeNode, begin int, end int, node *LatticeNode) []*LatticeNode { 204 | d := end - begin 205 | if d > 1 { 206 | if end < len(path) { 207 | copy(path[begin+1:], path[end:]) 208 | } 209 | path = path[:len(path)-d+1] 210 | } 211 | path[begin] = node 212 | return path 213 | } 214 | 215 | type InputTextPlugin interface { 216 | Plugin 217 | SetUp() error 218 | Rewrite(builder *InputTextBuilder) error 219 | } 220 | 221 | type OovProviderPlugin interface { 222 | Plugin 223 | SetUp(grammar *dictionary.Grammar) error 224 | ProvideOOV(inputText *InputText, offset int, hasOtherWords bool) ([]*LatticeNode, error) 225 | } 226 | 227 | func GetOOV(p OovProviderPlugin, inputText *InputText, offset int, hasOtherWords bool) ([]*LatticeNode, error) { 228 | nodes, err := p.ProvideOOV(inputText, offset, hasOtherWords) 229 | if err != nil { 230 | return []*LatticeNode{}, err 231 | } 232 | for _, node := range nodes { 233 | wi := node.GetWordInfo() 234 | node.Begin = offset 235 | node.End = offset + int(wi.HeadwordLength) 236 | } 237 | return nodes, nil 238 | } 239 | 240 | func CreateNodeOfOOV() *LatticeNode { 241 | return &LatticeNode{ 242 | IsOov: true, 243 | } 244 | } 245 | -------------------------------------------------------------------------------- /printdic/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "os" 7 | 8 | "github.com/msnoigrs/gosudachi/dictionary" 9 | ) 10 | 11 | func main() { 12 | flag.Usage = func() { 13 | fmt.Fprintf(os.Stderr, `Usage of %s: 14 | %s [-s file] [-j] file 15 | 16 | Options: 17 | `, os.Args[0], os.Args[0]) 18 | flag.PrintDefaults() 19 | } 20 | 21 | var ( 22 | systemdict string 23 | utf16string bool 24 | ) 25 | flag.StringVar(&systemdict, "s", "", "system dictionary") 26 | flag.BoolVar(&utf16string, "j", false, "use UTF-16 string") 27 | 28 | flag.Parse() 29 | 30 | if len(flag.Args()) == 0 { 31 | flag.Usage() 32 | os.Exit(1) 33 | } 34 | 35 | var ( 36 | sdic *dictionary.BinaryDictionary 37 | err error 38 | ) 39 | if systemdict != "" { 40 | sdic, err = dictionary.ReadSystemDictionary(systemdict, utf16string) 41 | if err != nil { 42 | fmt.Fprintln(os.Stderr, err) 43 | os.Exit(1) 44 | } 45 | defer sdic.Close() 46 | } 47 | 48 | err = dictionary.PrintDictionary(flag.Args()[0], utf16string, sdic, os.Stdout) 49 | if err != nil { 50 | fmt.Fprintln(os.Stderr, err) 51 | os.Exit(1) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /printdicheader/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "flag" 5 | "fmt" 6 | "os" 7 | 8 | "github.com/msnoigrs/gosudachi/dictionary" 9 | ) 10 | 11 | func main() { 12 | flag.Usage = func() { 13 | fmt.Fprintf(os.Stderr, `Usage of %s: 14 | %s file 15 | `, os.Args[0], os.Args[0]) 16 | flag.PrintDefaults() 17 | } 18 | 19 | flag.Parse() 20 | 21 | if len(flag.Args()) == 0 { 22 | flag.Usage() 23 | os.Exit(1) 24 | } 25 | 26 | err := dictionary.PrintHeader(flag.Arg(0), os.Stdout) 27 | if err != nil { 28 | fmt.Fprintln(os.Stderr, err) 29 | os.Exit(1) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /prolongedsoundmarkinputtextplugin.go: -------------------------------------------------------------------------------- 1 | package gosudachi 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | type ProlongedSoundMarkInputTextPluginConfig struct { 8 | ProlongedSoundMarks *[]string 9 | ReplacementSymbol *string 10 | } 11 | 12 | type ProlongedSoundMarkInputTextPlugin struct { 13 | config *ProlongedSoundMarkInputTextPluginConfig 14 | prolongedSoundMarkMap map[rune]bool 15 | replacementSymbol []rune 16 | } 17 | 18 | func NewProlongedSoundMarkInputTextPlugin(config *ProlongedSoundMarkInputTextPluginConfig) *ProlongedSoundMarkInputTextPlugin { 19 | if config == nil { 20 | config = &ProlongedSoundMarkInputTextPluginConfig{} 21 | } 22 | return &ProlongedSoundMarkInputTextPlugin{ 23 | config: config, 24 | prolongedSoundMarkMap: map[rune]bool{}, 25 | } 26 | } 27 | 28 | func (p *ProlongedSoundMarkInputTextPlugin) GetConfigStruct() interface{} { 29 | if p.config == nil { 30 | p.config = &ProlongedSoundMarkInputTextPluginConfig{} 31 | } 32 | return p.config 33 | } 34 | 35 | func (p *ProlongedSoundMarkInputTextPlugin) SetUp() error { 36 | if p.config.ProlongedSoundMarks == nil || len(*p.config.ProlongedSoundMarks) == 0 { 37 | return fmt.Errorf("ProlongedSoundMarkInputTextPlugin: prolongedSoundMarkStrings is not specified") 38 | } 39 | if p.config.ReplacementSymbol == nil { 40 | return fmt.Errorf("ProlongedSoundMarkInputTextPlugin: replacementSymbol is not specified") 41 | } 42 | if p.prolongedSoundMarkMap == nil { 43 | p.prolongedSoundMarkMap = map[rune]bool{} 44 | } 45 | for _, s := range *p.config.ProlongedSoundMarks { 46 | runes := []rune(s) 47 | if len(runes) > 0 { 48 | p.prolongedSoundMarkMap[runes[0]] = true 49 | } 50 | } 51 | p.replacementSymbol = []rune(*p.config.ReplacementSymbol) 52 | p.config = nil 53 | return nil 54 | } 55 | 56 | func (p *ProlongedSoundMarkInputTextPlugin) Rewrite(builder *InputTextBuilder) error { 57 | runes := builder.GetText() 58 | 59 | runelen := len(runes) 60 | offset := 0 61 | markStartIndex := runelen 62 | isProlongedSoundMark := false 63 | for i := 0; i < runelen; i++ { 64 | _, ok := p.prolongedSoundMarkMap[runes[i]] 65 | if !isProlongedSoundMark && ok { 66 | isProlongedSoundMark = true 67 | markStartIndex = i 68 | } else if isProlongedSoundMark && !ok { 69 | if (i - markStartIndex) > 1 { 70 | builder.Replace(markStartIndex-offset, i-offset, p.replacementSymbol) 71 | offset += i - markStartIndex - 1 72 | } 73 | isProlongedSoundMark = false 74 | } 75 | } 76 | if isProlongedSoundMark && (runelen-markStartIndex) > 1 { 77 | builder.Replace(markStartIndex-offset, runelen-offset, p.replacementSymbol) 78 | } 79 | return nil 80 | } 81 | -------------------------------------------------------------------------------- /scripts/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SRC_DIR="${PWD}" 4 | BUILD_DIR="${PWD}" 5 | DIST="${BUILD_DIR}/dist" 6 | CMDDIRS="gosudachicli dicbuilder userdicbuilder printdic printdicheader dicconv" 7 | 8 | build() { 9 | cd "${SRC_DIR}/$1" 10 | echo -n "Building $1..." 11 | go build -o "${DIST}/$1" 12 | echo "done" 13 | cd "${BUILD_DIR}" 14 | } 15 | 16 | assets() { 17 | cd "${SRC_DIR}/data" 18 | go generate 19 | cd "${BUILD_DIR}" 20 | } 21 | 22 | assets 23 | 24 | if [ ! -d "${DIST}" ]; then 25 | mkdir "${DIST}" 26 | fi 27 | 28 | for f in ${CMDDIRS}; do 29 | build "${f}" 30 | done 31 | -------------------------------------------------------------------------------- /scripts/mksystemdic.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROXY="" 4 | VERSION="" 5 | UNIDICVER="2.1.2" 6 | UNIDICZIP="unidic-mecab-${UNIDICVER}_src.zip" 7 | UNIDICURL="https://unidic.ninjal.ac.jp/unidic_archive/cwj/${UNIDICVER}/${UNIDICZIP}" 8 | POMXML="pom.xml" 9 | MATRIXDEF="matrix.def" 10 | SYSSMALLDIC="system_small.dic" 11 | SYSCOREDIC="system_core.dic" 12 | SYSFULLDIC="system_full.dic" 13 | SMALLCSV="small_lex.csv" 14 | CORECSV="core_lex.csv" 15 | NOTCORECSV="notcore_lex.csv" 16 | 17 | init_env() { 18 | if [ -f "${1}/pom.xml" ]; then 19 | POMXML="${1}/pom.xml" 20 | fi 21 | if [ -f "${1}/src/main/text/${MATRIXDEF}.zip" ]; then 22 | cp "${1}/src/main/text/${MATRIXDEF}.zip" . 23 | fi 24 | if [ -f "${1}/src/main/text/${SMALLCSV}" ]; then 25 | SMALLCSV="${1}/src/main/text/${SMALLCSV}" 26 | fi 27 | if [ -f "${1}/src/main/text/${CORECSV}" ]; then 28 | CORECSV="${1}/src/main/text/${CORECSV}" 29 | fi 30 | if [ -f "${1}/src/main/text/${NOTCORECSV}" ]; then 31 | NOTCORECSV="${1}/src/main/text/${NOTCORECSV}" 32 | fi 33 | } 34 | 35 | if [ -n "${1}" ]; then 36 | init_env "${1}" 37 | elif [ -d "../SudachiDict" ]; then 38 | init_env "../SudachiDict" 39 | fi 40 | 41 | if [ ! -f "${MATRIXDEF}" ]; then 42 | if [ ! -f "${MATRIXDEF}.zip" ]; then 43 | if [ -z "${PROXY}" ]; then 44 | curl "${UNIDICURL}" -o "${UNIDICZIP}" 45 | else 46 | curl "${UNIDICURL}" -x "${PROXY}" -o "${UNIDICZIP}" 47 | fi 48 | unzip "${UNIDICZIP}" 49 | cp "unidic-mecab-${UNIDICVER}_src/matrix.def" "${MATRIXDEF}" 50 | else 51 | unzip "${MATRIXDEF}.zip" 52 | fi 53 | fi 54 | 55 | if [ -f "${POMXML}" ]; then 56 | VERSION=$(grep -oP -m 1 '\K([^<]+)' "${POMXML}") 57 | fi 58 | 59 | if [ -z "${VERSION}" ]; then 60 | VERSION="go" 61 | fi 62 | 63 | if [ ! -f "${SMALLCSV}" -o ! -f "${CORECSV}" -o ! -f "${NOTCORECSV}" ]; then 64 | echo "dictionary files are needed: ${SMALLCSV}, ${CORECSV}, ${NOTCORECSV}" 1>&2 65 | fi 66 | 67 | ./dicbuilder -o "${SYSSMALLDIC}" -m "${MATRIXDEF}" -d "${VERSION}" "${SMALLCSV}" 68 | ./dicbuilder -o "${SYSCOREDIC}" -m "${MATRIXDEF}" -d "${VERSION}" "${SMALLCSV}" "${CORECSV}" 69 | ./dicbuilder -o "${SYSFULLDIC}" -m "${MATRIXDEF}" -d "${VERSION}" "${SMALLCSV}" "${CORECSV}" "${NOTCORECSV}" 70 | -------------------------------------------------------------------------------- /scripts/mksystemdicutf16.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROXY="" 4 | VERSION="" 5 | UNIDICVER="2.1.2" 6 | UNIDICZIP="unidic-mecab-${UNIDICVER}_src.zip" 7 | UNIDICURL="https://unidic.ninjal.ac.jp/unidic_archive/cwj/${UNIDICVER}/${UNIDICZIP}" 8 | POMXML="pom.xml" 9 | MATRIXDEF="matrix.def" 10 | SYSSMALLDIC="system_small.dic" 11 | SYSCOREDIC="system_core.dic" 12 | SYSFULLDIC="system_full.dic" 13 | SMALLCSV="small_lex.csv" 14 | CORECSV="core_lex.csv" 15 | NOTCORECSV="notcore_lex.csv" 16 | 17 | init_env() { 18 | if [ -f "${1}/pom.xml" ]; then 19 | POMXML="${1}/pom.xml" 20 | fi 21 | if [ -f "${1}/src/main/text/${MATRIXDEF}.zip" ]; then 22 | cp "${1}/src/main/text/${MATRIXDEF}.zip" . 23 | fi 24 | if [ -f "${1}/src/main/text/${SMALLCSV}" ]; then 25 | SMALLCSV="${1}/src/main/text/${SMALLCSV}" 26 | fi 27 | if [ -f "${1}/src/main/text/${CORECSV}" ]; then 28 | CORECSV="${1}/src/main/text/${CORECSV}" 29 | fi 30 | if [ -f "${1}/src/main/text/${NOTCORECSV}" ]; then 31 | NOTCORECSV="${1}/src/main/text/${NOTCORECSV}" 32 | fi 33 | } 34 | 35 | if [ -n "${1}" ]; then 36 | init_env "${1}" 37 | elif [ -d "../SudachiDict" ]; then 38 | init_env "../SudachiDict" 39 | fi 40 | 41 | if [ ! -f "${MATRIXDEF}" ]; then 42 | if [ ! -f "${MATRIXDEF}.zip" ]; then 43 | if [ -z "${PROXY}" ]; then 44 | curl "${UNIDICURL}" -o "${UNIDICZIP}" 45 | else 46 | curl "${UNIDICURL}" -x "${PROXY}" -o "${UNIDICZIP}" 47 | fi 48 | unzip "${UNIDICZIP}" 49 | cp "unidic-mecab-${UNIDICVER}_src/matrix.def" "${MATRIXDEF}" 50 | else 51 | unzip "${MATRIXDEF}.zip" 52 | fi 53 | fi 54 | 55 | if [ -f "${POMXML}" ]; then 56 | VERSION=$(grep -oP -m 1 '\K([^<]+)' "${POMXML}") 57 | fi 58 | 59 | if [ -z "${VERSION}" ]; then 60 | VERSION="go" 61 | fi 62 | 63 | if [ ! -f "${SMALLCSV}" -o ! -f "${CORECSV}" -o ! -f "${NOTCORECSV}" ]; then 64 | echo "dictionary files are needed: ${SMALLCSV}, ${CORECSV}, ${NOTCORECSV}" 1>&2 65 | fi 66 | 67 | ./dicbuilder -o "${SYSSMALLDIC}" -m "${MATRIXDEF}" -d "${VERSION}" -j "${SMALLCSV}" 68 | ./dicbuilder -o "${SYSCOREDIC}" -m "${MATRIXDEF}" -d "${VERSION}" -j "${SMALLCSV}" "${CORECSV}" 69 | ./dicbuilder -o "${SYSFULLDIC}" -m "${MATRIXDEF}" -d "${VERSION}" -j "${SMALLCSV}" "${CORECSV}" "${NOTCORECSV}" 70 | -------------------------------------------------------------------------------- /settingsjson.go: -------------------------------------------------------------------------------- 1 | package gosudachi 2 | 3 | import ( 4 | "encoding/json" 5 | "fmt" 6 | "io" 7 | "path/filepath" 8 | ) 9 | 10 | type SettingsJSON struct { 11 | BaseConfig 12 | path string 13 | inputTextPlugin []json.RawMessage 14 | oovProviderPlugin []json.RawMessage 15 | pathRewritePlugin []json.RawMessage 16 | editConnectionCostPlugin []json.RawMessage 17 | } 18 | 19 | func NewSettingsJSON() *SettingsJSON { 20 | return &SettingsJSON{} 21 | } 22 | 23 | func (settings *SettingsJSON) GetBaseConfig() *BaseConfig { 24 | return &settings.BaseConfig 25 | } 26 | 27 | func (settings *SettingsJSON) ParseSettingsJSON(defpath string, reader io.Reader) error { 28 | internalBaseConfig := &struct { 29 | Path *string 30 | SystemDict *string 31 | CharacterDefinitionFile *string 32 | Utf16String *bool 33 | UserDict *[]string 34 | InputTextPlugin *[]json.RawMessage 35 | OovProviderPlugin *[]json.RawMessage 36 | PathRewritePlugin *[]json.RawMessage 37 | EditConnectionCostPlugin *[]json.RawMessage 38 | }{} 39 | 40 | decoder := json.NewDecoder(reader) 41 | err := decoder.Decode(internalBaseConfig) 42 | if err != nil { 43 | return err 44 | } 45 | if internalBaseConfig.Path == nil && settings.path != "" { 46 | settings.path = defpath 47 | } else if internalBaseConfig.Path != nil { 48 | settings.path = *internalBaseConfig.Path 49 | } 50 | if internalBaseConfig.SystemDict != nil { 51 | settings.SystemDict = settings.getPath(*internalBaseConfig.SystemDict) 52 | } 53 | if internalBaseConfig.CharacterDefinitionFile != nil { 54 | settings.CharacterDefinitionFile = settings.getPath(*internalBaseConfig.CharacterDefinitionFile) 55 | } 56 | if internalBaseConfig.Utf16String != nil { 57 | settings.Utf16String = *internalBaseConfig.Utf16String 58 | } 59 | if internalBaseConfig.UserDict != nil { 60 | for _, ud := range *internalBaseConfig.UserDict { 61 | settings.UserDict = append(settings.UserDict, settings.getPath(ud)) 62 | } 63 | } 64 | 65 | if internalBaseConfig.InputTextPlugin != nil { 66 | settings.inputTextPlugin = *internalBaseConfig.InputTextPlugin 67 | } 68 | if internalBaseConfig.OovProviderPlugin != nil { 69 | settings.oovProviderPlugin = *internalBaseConfig.OovProviderPlugin 70 | } 71 | if internalBaseConfig.PathRewritePlugin != nil { 72 | settings.pathRewritePlugin = *internalBaseConfig.PathRewritePlugin 73 | } 74 | if internalBaseConfig.EditConnectionCostPlugin != nil { 75 | settings.editConnectionCostPlugin = *internalBaseConfig.EditConnectionCostPlugin 76 | } 77 | return nil 78 | } 79 | 80 | func (settings *SettingsJSON) getPath(path string) string { 81 | if path == "" || filepath.IsAbs(path) || settings.path == "" { 82 | return path 83 | } 84 | return filepath.Join(settings.path, path) 85 | } 86 | 87 | func (settings *SettingsJSON) GetInputTextPluginArray(makeproc MakeInputTextPluginFunc) ([]InputTextPlugin, error) { 88 | ret := []InputTextPlugin{} 89 | pname := &struct { 90 | Class *string 91 | Name *string 92 | }{} 93 | for _, raw := range settings.inputTextPlugin { 94 | err := json.Unmarshal(raw, pname) 95 | if err != nil { 96 | return ret, err 97 | } 98 | var name string 99 | if pname.Class != nil { 100 | name = *pname.Class 101 | } 102 | if pname.Name != nil { 103 | name = *pname.Name 104 | } 105 | plugin := makeproc(name) 106 | if plugin == nil { 107 | return ret, fmt.Errorf("InputTextPlugin: %s is unknown", name) 108 | } 109 | err = json.Unmarshal(raw, plugin.GetConfigStruct()) 110 | if err != nil { 111 | return ret, err 112 | } 113 | ret = append(ret, plugin) 114 | } 115 | return ret, nil 116 | } 117 | 118 | func (settings *SettingsJSON) GetOovProviderPluginArray(makeproc MakeOovProviderPluginFunc) ([]OovProviderPlugin, error) { 119 | ret := []OovProviderPlugin{} 120 | pname := &struct { 121 | Class *string 122 | Name *string 123 | }{} 124 | for _, raw := range settings.oovProviderPlugin { 125 | err := json.Unmarshal(raw, pname) 126 | if err != nil { 127 | return ret, err 128 | } 129 | var name string 130 | if pname.Class != nil { 131 | name = *pname.Class 132 | } 133 | if pname.Name != nil { 134 | name = *pname.Name 135 | } 136 | plugin := makeproc(name) 137 | if plugin == nil { 138 | return ret, fmt.Errorf("OovProviderPlugin: %s is unknown", name) 139 | } 140 | err = json.Unmarshal(raw, plugin.GetConfigStruct()) 141 | if err != nil { 142 | return ret, err 143 | } 144 | ret = append(ret, plugin) 145 | } 146 | return ret, nil 147 | } 148 | 149 | func (settings *SettingsJSON) GetEditConnectionCostPluginArray(makeproc MakeEditConnectionCostPluginFunc) ([]EditConnectionCostPlugin, error) { 150 | ret := []EditConnectionCostPlugin{} 151 | pname := &struct { 152 | Class *string 153 | Name *string 154 | }{} 155 | for _, raw := range settings.editConnectionCostPlugin { 156 | err := json.Unmarshal(raw, pname) 157 | if err != nil { 158 | return ret, err 159 | } 160 | var name string 161 | if pname.Class != nil { 162 | name = *pname.Class 163 | } 164 | if pname.Name != nil { 165 | name = *pname.Name 166 | } 167 | plugin := makeproc(name) 168 | if plugin == nil { 169 | return ret, fmt.Errorf("EditConnectionCostPlugin: %s is unknown", name) 170 | } 171 | err = json.Unmarshal(raw, plugin.GetConfigStruct()) 172 | if err != nil { 173 | return ret, err 174 | } 175 | ret = append(ret, plugin) 176 | } 177 | return ret, nil 178 | } 179 | 180 | func (settings *SettingsJSON) GetPathRewritePluginArray(makeproc MakePathRewritePluginFunc) ([]PathRewritePlugin, error) { 181 | ret := []PathRewritePlugin{} 182 | pname := &struct { 183 | Class *string 184 | Name *string 185 | }{} 186 | for _, raw := range settings.pathRewritePlugin { 187 | err := json.Unmarshal(raw, pname) 188 | if err != nil { 189 | return ret, err 190 | } 191 | var name string 192 | if pname.Class != nil { 193 | name = *pname.Class 194 | } 195 | if pname.Name != nil { 196 | name = *pname.Name 197 | } 198 | plugin := makeproc(name) 199 | if plugin == nil { 200 | return ret, fmt.Errorf("PathRewritePlugin: %s is unknown", name) 201 | } 202 | err = json.Unmarshal(raw, plugin.GetConfigStruct()) 203 | if err != nil { 204 | return ret, err 205 | } 206 | ret = append(ret, plugin) 207 | } 208 | return ret, nil 209 | } 210 | -------------------------------------------------------------------------------- /settingsjson_test.go: -------------------------------------------------------------------------------- 1 | package gosudachi 2 | 3 | import ( 4 | "strings" 5 | "testing" 6 | ) 7 | 8 | var s string = ` 9 | { 10 | "path" : "/usr/local/share/sudachi", 11 | "systemDict" : "system.dic", 12 | "characterDefinitionFile" : "char.def", 13 | "inputTextPlugin" : [ 14 | { "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin" }, 15 | { "class" : "com.worksap.nlp.sudachi.ProlongedSoundMarkInputTextPlugin", 16 | "prolongedSoundMarks" : ["ー", "-", "⁓", "〜", "〰"], 17 | "replacementSymbol" : "ー" 18 | } 19 | ], 20 | "oovProviderPlugin" : [ 21 | { 22 | "class" : "com.worksap.nlp.sudachi.MeCabOovProviderPlugin", 23 | "charDef" : "char.def", 24 | "unkDef" : "unk.def" 25 | }, 26 | { 27 | "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin", 28 | "oovPOSStrings" : [ "補助記号", "一般", "*", "*", "*", "*" ], 29 | "leftId" : 5968, 30 | "rightId" : 5968, 31 | "cost" : 3857 32 | } 33 | ], 34 | "pathRewritePlugin" : [ 35 | { 36 | "name" : "JoinNumericPlugin", 37 | "enableNormalize" : false 38 | }, 39 | { 40 | "name" : "JoinKatakanaOovPlugin", 41 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], 42 | "minLength" : 3 43 | } 44 | ] 45 | } 46 | ` 47 | 48 | // TestSettingsJSON_ParseSettingsJSON 49 | func TestSettingsJSON_ParseSettingsJSON(t *testing.T) { 50 | settings := NewSettingsJSON() 51 | err := settings.ParseSettingsJSON("", strings.NewReader(s)) 52 | if err != nil { 53 | t.Errorf("fail to parse json: %s", err) 54 | } 55 | 56 | bc := settings.GetBaseConfig() 57 | want := "/usr/local/share/sudachi/system.dic" 58 | if bc.SystemDict != want { 59 | t.Errorf("invalid result. want = %s, got = %s", want, bc.SystemDict) 60 | } 61 | 62 | iplugins, err := settings.GetInputTextPluginArray(DefMakeInputTextPlugin) 63 | if err != nil { 64 | t.Errorf("GetInputTextPluginArray: %s", err) 65 | } 66 | if len(iplugins) != 2 { 67 | t.Errorf("invalid result. want = 2, got = %d", len(iplugins)) 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /simpleoovproviderplugin.go: -------------------------------------------------------------------------------- 1 | package gosudachi 2 | 3 | import ( 4 | "fmt" 5 | 6 | "github.com/msnoigrs/gosudachi/dictionary" 7 | ) 8 | 9 | type SimpleOovProviderPluginConfig struct { 10 | OovPos *[]string 11 | LeftId *int16 12 | RightId *int16 13 | Cost *int16 14 | } 15 | 16 | type SimpleOovProviderPlugin struct { 17 | config *SimpleOovProviderPluginConfig 18 | oovPosId int16 19 | leftId int16 20 | rightId int16 21 | cost int16 22 | } 23 | 24 | func NewSimpleOovProviderPlugin(config *SimpleOovProviderPluginConfig) *SimpleOovProviderPlugin { 25 | if config == nil { 26 | config = &SimpleOovProviderPluginConfig{} 27 | } 28 | return &SimpleOovProviderPlugin{ 29 | config: config, 30 | } 31 | } 32 | 33 | func (p *SimpleOovProviderPlugin) GetConfigStruct() interface{} { 34 | if p.config == nil { 35 | p.config = &SimpleOovProviderPluginConfig{} 36 | } 37 | return p.config 38 | } 39 | 40 | func (p *SimpleOovProviderPlugin) SetUp(grammar *dictionary.Grammar) error { 41 | if p.config.OovPos == nil { 42 | return fmt.Errorf("SimpleOovProviderPlugin: oovPOS is not specified") 43 | } 44 | if p.config.LeftId == nil { 45 | return fmt.Errorf("SimpleOovProviderPlugin: leftId is not specified") 46 | } 47 | if p.config.RightId == nil { 48 | return fmt.Errorf("SimpleOovProviderPlugin: rightId is not specified") 49 | } 50 | if p.config.Cost == nil { 51 | return fmt.Errorf("SimpleOovProviderPlugin: cost is not specified") 52 | } 53 | if len(*(p.config.OovPos)) == 0 { 54 | return fmt.Errorf("SimpleOovProviderPlugin: oovPOS is zero length") 55 | } 56 | oovPosId := grammar.GetPartOfSpeechId(*p.config.OovPos) 57 | if oovPosId < 0 { 58 | return fmt.Errorf("SimpleOovProviderPlugin: oovPOS is invalid") 59 | } 60 | p.oovPosId = oovPosId 61 | p.leftId = *p.config.LeftId 62 | p.rightId = *p.config.RightId 63 | p.cost = *p.config.Cost 64 | p.config = nil 65 | return nil 66 | } 67 | 68 | func (p *SimpleOovProviderPlugin) ProvideOOV(inputText *InputText, offset int, hasOtherWords bool) ([]*LatticeNode, error) { 69 | if !hasOtherWords { 70 | node := CreateNodeOfOOV() 71 | node.SetParameter(p.leftId, p.rightId, p.cost) 72 | length := inputText.GetCodePointsOffsetLength(offset, 1) 73 | s := inputText.GetSubstring(offset, offset+length) 74 | wi := &dictionary.WordInfo{ 75 | Surface: s, 76 | HeadwordLength: int16(length), 77 | PosId: p.oovPosId, 78 | NormalizedForm: s, 79 | DictionaryForm: s, 80 | ReadingForm: "", 81 | } 82 | node.SetWordInfo(wi) 83 | return []*LatticeNode{node}, nil 84 | } else { 85 | return []*LatticeNode{}, nil 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /tokenizer.go: -------------------------------------------------------------------------------- 1 | package gosudachi 2 | 3 | import ( 4 | "fmt" 5 | "io" 6 | 7 | "github.com/msnoigrs/gosudachi/dictionary" 8 | ) 9 | 10 | type JapaneseTokenizer struct { 11 | grammar *dictionary.Grammar 12 | lexicon *dictionary.LexiconSet 13 | inputTextPlugins []InputTextPlugin 14 | oovProviderPlugins []OovProviderPlugin 15 | pathRewritePlugins []PathRewritePlugin 16 | defaultOovProvider OovProviderPlugin 17 | 18 | DumpOutput io.Writer 19 | lattice *Lattice 20 | } 21 | 22 | func NewJapaneseTokenizer(grammar *dictionary.Grammar, lexicon *dictionary.LexiconSet, inputTextPlugins []InputTextPlugin, oovProviderPlugins []OovProviderPlugin, pathRewritePlugins []PathRewritePlugin) *JapaneseTokenizer { 23 | ret := &JapaneseTokenizer{ 24 | grammar: grammar, 25 | lexicon: lexicon, 26 | inputTextPlugins: inputTextPlugins, 27 | oovProviderPlugins: oovProviderPlugins, 28 | pathRewritePlugins: pathRewritePlugins, 29 | lattice: NewLattice(grammar), 30 | } 31 | if len(oovProviderPlugins) > 0 { 32 | ret.defaultOovProvider = oovProviderPlugins[0] 33 | } 34 | return ret 35 | } 36 | 37 | func (t *JapaneseTokenizer) Tokenize(mode string, text string) (*MorphemeList, error) { 38 | inputTextBuilder := NewInputTextBuilder(text, t.grammar) 39 | 40 | if len(text) == 0 { 41 | return NewMorphemeList(inputTextBuilder.Build(), t.grammar, t.lexicon, []*LatticeNode{}), nil 42 | } 43 | 44 | for _, plugin := range t.inputTextPlugins { 45 | err := plugin.Rewrite(inputTextBuilder) 46 | if err != nil { 47 | return nil, err 48 | } 49 | } 50 | input := inputTextBuilder.Build() 51 | 52 | if t.DumpOutput != nil { 53 | fmt.Fprintln(t.DumpOutput, "=== Input dump") 54 | fmt.Fprintln(t.DumpOutput, input.GetText()) 55 | } 56 | 57 | err := t.buildLattice(input) 58 | if err != nil { 59 | return nil, err 60 | } 61 | 62 | if t.DumpOutput != nil { 63 | fmt.Fprintln(t.DumpOutput, "=== Lattice dump") 64 | t.lattice.Dump(t.DumpOutput) 65 | } 66 | 67 | path, err := t.lattice.GetBestPath() 68 | if err != nil { 69 | return nil, err 70 | } 71 | 72 | if t.DumpOutput != nil { 73 | fmt.Fprintln(t.DumpOutput, "=== Before rewriting:") 74 | t.dumpPath(path) 75 | } 76 | 77 | for _, plugin := range t.pathRewritePlugins { 78 | err := plugin.Rewrite(input, &path, t.lattice) 79 | if err != nil { 80 | return nil, err 81 | } 82 | } 83 | t.lattice.clear() 84 | 85 | if mode != "C" { 86 | path = t.splitPath(path, mode) 87 | } 88 | 89 | if t.DumpOutput != nil { 90 | fmt.Fprintln(t.DumpOutput, "=== After rewriting:") 91 | t.dumpPath(path) 92 | fmt.Fprintln(t.DumpOutput, "===") 93 | } 94 | 95 | return NewMorphemeList(input, t.grammar, t.lexicon, path), nil 96 | } 97 | 98 | func (t *JapaneseTokenizer) buildLattice(input *InputText) error { 99 | bytea := input.Bytea 100 | t.lattice.resize(len(bytea)) 101 | for i, _ := range bytea { 102 | if !input.CanBow(i) || !t.lattice.HasPreviousNode(i) { 103 | continue 104 | } 105 | iterator := t.lexicon.Lookup(bytea, i) 106 | hasWords := iterator.Next() 107 | for iterator.Next() { 108 | wordId, end := iterator.Get() 109 | if err := iterator.Err(); err != nil { 110 | break 111 | } 112 | n := NewLatticeNode( 113 | t.lexicon, 114 | t.lexicon.GetLeftId(wordId), 115 | t.lexicon.GetRightId(wordId), 116 | t.lexicon.GetCost(wordId), 117 | wordId, 118 | ) 119 | t.lattice.Insert(i, end, n) 120 | } 121 | if err := iterator.Err(); err != nil { 122 | return err 123 | } 124 | 125 | // OOV 126 | types := input.GetCharCategoryTypes(i) 127 | if (types & dictionary.NOOOVBOW) != dictionary.NOOOVBOW { 128 | for _, plugin := range t.oovProviderPlugins { 129 | nodes, err := GetOOV(plugin, input, i, hasWords) 130 | if err != nil { 131 | return err 132 | } 133 | for _, node := range nodes { 134 | hasWords = true 135 | t.lattice.Insert(node.Begin, node.End, node) 136 | } 137 | } 138 | } 139 | if !hasWords && t.defaultOovProvider != nil { 140 | nodes, err := GetOOV(t.defaultOovProvider, input, i, hasWords) 141 | if err != nil { 142 | return err 143 | } 144 | for _, node := range nodes { 145 | hasWords = true 146 | t.lattice.Insert(node.Begin, node.End, node) 147 | } 148 | } 149 | if !hasWords { 150 | return fmt.Errorf("there is no morpheme at %d", i) 151 | } 152 | } 153 | t.lattice.connectEosNode() 154 | 155 | return nil 156 | } 157 | 158 | func (t *JapaneseTokenizer) splitPath(path []*LatticeNode, mode string) []*LatticeNode { 159 | newPath := []*LatticeNode{} 160 | for _, node := range path { 161 | wi := node.GetWordInfo() 162 | var wids []int32 163 | if mode == "A" { 164 | wids = wi.AUnitSplit 165 | } else { 166 | wids = wi.BUnitSplit 167 | } 168 | if len(wids) == 0 || len(wids) == 1 { 169 | newPath = append(newPath, node) 170 | } else { 171 | offset := node.Begin 172 | for _, wid := range wids { 173 | n := NewLatticeNode(t.lexicon, 0, 0, 0, wid) 174 | n.Begin = offset 175 | nwi := n.GetWordInfo() 176 | offset += int(nwi.HeadwordLength) 177 | n.End = offset 178 | newPath = append(newPath, n) 179 | } 180 | } 181 | } 182 | return newPath 183 | } 184 | 185 | func (t *JapaneseTokenizer) dumpPath(path []*LatticeNode) { 186 | for i, node := range path { 187 | fmt.Fprintf(t.DumpOutput, "%d: %s\n", i, node.String()) 188 | } 189 | } 190 | -------------------------------------------------------------------------------- /userdicbuilder/main.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "bufio" 5 | "flag" 6 | "fmt" 7 | "os" 8 | "time" 9 | 10 | "github.com/msnoigrs/gosudachi/dictionary" 11 | "golang.org/x/text/language" 12 | "golang.org/x/text/message" 13 | ) 14 | 15 | func main() { 16 | flag.Usage = func() { 17 | fmt.Fprintf(os.Stderr, `Usage of %s: 18 | %s -o file -s file [-d description] [-j] file1 [file2 ...] 19 | 20 | Options: 21 | `, os.Args[0], os.Args[0]) 22 | flag.PrintDefaults() 23 | } 24 | 25 | var ( 26 | outputpath string 27 | systemdict string 28 | description string 29 | utf16string bool 30 | ) 31 | flag.StringVar(&outputpath, "o", "", "output to file") 32 | flag.StringVar(&systemdict, "s", "", "system dictionary") 33 | flag.StringVar(&description, "d", "", "comment") 34 | flag.BoolVar(&utf16string, "j", false, "use UTF-16 string") 35 | 36 | flag.Parse() 37 | 38 | if outputpath == "" || systemdict == "" || len(flag.Args()) == 0 { 39 | flag.Usage() 40 | os.Exit(1) 41 | } 42 | 43 | dh := dictionary.NewDictionaryHeader( 44 | dictionary.UserDictVersion2, 45 | time.Now().Unix(), 46 | description, 47 | ) 48 | 49 | hb, err := dh.ToBytes() 50 | if err != nil { 51 | fmt.Fprintln(os.Stderr, err) 52 | os.Exit(1) 53 | } 54 | 55 | sdic, err := dictionary.ReadSystemDictionary(systemdict, utf16string) 56 | if err != nil { 57 | fmt.Fprintln(os.Stderr, err) 58 | os.Exit(1) 59 | } 60 | defer sdic.Close() 61 | 62 | outputWriter, err := os.OpenFile(outputpath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644) 63 | if err != nil { 64 | fmt.Fprintln(os.Stderr, err) 65 | os.Exit(1) 66 | } 67 | defer outputWriter.Close() 68 | 69 | bufout := bufio.NewWriter(outputWriter) 70 | n, err := bufout.Write(hb) 71 | if err != nil { 72 | fmt.Fprintf(os.Stderr, "fail to write header: %s\n", err) 73 | os.Exit(1) 74 | } 75 | err = bufout.Flush() 76 | if err != nil { 77 | fmt.Fprintf(os.Stderr, "fail to write header: %s\n", err) 78 | os.Exit(1) 79 | } 80 | 81 | dicbuilder := dictionary.NewDictionaryBuilder(int64(n), sdic.Lexicon, utf16string) 82 | store := dictionary.NewPosTableUser(sdic.Grammar) 83 | 84 | fmt.Fprint(os.Stderr, "reading the source file...") 85 | for _, lexiconpath := range flag.Args() { 86 | err := build(dicbuilder, store, lexiconpath) 87 | if err != nil { 88 | fmt.Fprintf(os.Stderr, "%s: %s", err, lexiconpath) 89 | os.Exit(1) 90 | } 91 | } 92 | p := message.NewPrinter(language.English) 93 | p.Fprintf(os.Stderr, " %d words\n", dicbuilder.EntrySize()) 94 | 95 | err = dicbuilder.WriteGrammarUser(&store.PosTable, outputWriter) 96 | if err != nil { 97 | fmt.Fprintf(os.Stderr, "fail to write grammar: %s\n", err) 98 | os.Exit(1) 99 | } 100 | 101 | err = dicbuilder.WriteLexicon(outputWriter, store) 102 | if err != nil { 103 | fmt.Fprintf(os.Stderr, "fail to write lexicon: %s\n", err) 104 | os.Exit(1) 105 | } 106 | } 107 | 108 | func build(dicbuilder *dictionary.DictionaryBuilder, store dictionary.PosIdStore, lexiconpath string) error { 109 | lexiconReader, err := os.OpenFile(lexiconpath, os.O_RDONLY, 0644) 110 | if err != nil { 111 | return err 112 | } 113 | defer lexiconReader.Close() 114 | 115 | err = dicbuilder.BuildLexicon(store, lexiconReader) 116 | if err != nil { 117 | return err 118 | } 119 | return nil 120 | } 121 | --------------------------------------------------------------------------------