├── .gitignore
├── LICENSE-2.0.txt
├── README.md
├── README.org
├── dartsclone
    ├── bitvector.go
    ├── da.go
    ├── da_test.go
    ├── dabuilder.go
    └── dawgbuilder.go
├── data
    ├── assets.go
    ├── assets_generate.go
    ├── assets_vfsdata.go
    ├── data.go
    └── root
    │   ├── char.def
    │   ├── rewrite.def
    │   ├── sudachi.json
    │   ├── sudachi_fulldict.json
    │   └── unk.def
├── definputtextplugin.go
├── dicbuilder
    └── main.go
├── dicconv
    └── main.go
├── dictionary.go
├── dictionary
    ├── binarydict.go
    ├── bytes.go
    ├── charcategory.go
    ├── dalexicon.go
    ├── dicbuilder.go
    ├── dicheader.go
    ├── dicprinter.go
    ├── dicversion.go
    ├── grammar.go
    ├── lexiconset.go
    └── wordinfo.go
├── go.mod
├── go.sum
├── gosudachicli
    └── main.go
├── inhibitconnectioncostplugin.go
├── inputtext.go
├── internal
    ├── lnreader
    │   └── lnreader.go
    └── mmap
    │   ├── mmap_unix.go
    │   └── mmap_windows.go
├── joinkatakanaoovplugin.go
├── joinnumericplugin.go
├── lattice.go
├── mecaboovproviderplugin.go
├── morpheme.go
├── numericparser.go
├── plugin.go
├── printdic
    └── main.go
├── printdicheader
    └── main.go
├── prolongedsoundmarkinputtextplugin.go
├── scripts
    ├── build.sh
    ├── mksystemdic.sh
    └── mksystemdicutf16.sh
├── settingsjson.go
├── settingsjson_test.go
├── simpleoovproviderplugin.go
├── tokenizer.go
└── userdicbuilder
    └── main.go


/.gitignore:
--------------------------------------------------------------------------------
1 | dist/
2 | SudachiDict/
3 | 


--------------------------------------------------------------------------------
/LICENSE-2.0.txt:
--------------------------------------------------------------------------------
  1 | 
  2 |                                  Apache License
  3 |                            Version 2.0, January 2004
  4 |                         http://www.apache.org/licenses/
  5 | 
  6 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  7 | 
  8 |    1. Definitions.
  9 | 
 10 |       "License" shall mean the terms and conditions for use, reproduction,
 11 |       and distribution as defined by Sections 1 through 9 of this document.
 12 | 
 13 |       "Licensor" shall mean the copyright owner or entity authorized by
 14 |       the copyright owner that is granting the License.
 15 | 
 16 |       "Legal Entity" shall mean the union of the acting entity and all
 17 |       other entities that control, are controlled by, or are under common
 18 |       control with that entity. For the purposes of this definition,
 19 |       "control" means (i) the power, direct or indirect, to cause the
 20 |       direction or management of such entity, whether by contract or
 21 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 22 |       outstanding shares, or (iii) beneficial ownership of such entity.
 23 | 
 24 |       "You" (or "Your") shall mean an individual or Legal Entity
 25 |       exercising permissions granted by this License.
 26 | 
 27 |       "Source" form shall mean the preferred form for making modifications,
 28 |       including but not limited to software source code, documentation
 29 |       source, and configuration files.
 30 | 
 31 |       "Object" form shall mean any form resulting from mechanical
 32 |       transformation or translation of a Source form, including but
 33 |       not limited to compiled object code, generated documentation,
 34 |       and conversions to other media types.
 35 | 
 36 |       "Work" shall mean the work of authorship, whether in Source or
 37 |       Object form, made available under the License, as indicated by a
 38 |       copyright notice that is included in or attached to the work
 39 |       (an example is provided in the Appendix below).
 40 | 
 41 |       "Derivative Works" shall mean any work, whether in Source or Object
 42 |       form, that is based on (or derived from) the Work and for which the
 43 |       editorial revisions, annotations, elaborations, or other modifications
 44 |       represent, as a whole, an original work of authorship. For the purposes
 45 |       of this License, Derivative Works shall not include works that remain
 46 |       separable from, or merely link (or bind by name) to the interfaces of,
 47 |       the Work and Derivative Works thereof.
 48 | 
 49 |       "Contribution" shall mean any work of authorship, including
 50 |       the original version of the Work and any modifications or additions
 51 |       to that Work or Derivative Works thereof, that is intentionally
 52 |       submitted to Licensor for inclusion in the Work by the copyright owner
 53 |       or by an individual or Legal Entity authorized to submit on behalf of
 54 |       the copyright owner. For the purposes of this definition, "submitted"
 55 |       means any form of electronic, verbal, or written communication sent
 56 |       to the Licensor or its representatives, including but not limited to
 57 |       communication on electronic mailing lists, source code control systems,
 58 |       and issue tracking systems that are managed by, or on behalf of, the
 59 |       Licensor for the purpose of discussing and improving the Work, but
 60 |       excluding communication that is conspicuously marked or otherwise
 61 |       designated in writing by the copyright owner as "Not a Contribution."
 62 | 
 63 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 64 |       on behalf of whom a Contribution has been received by Licensor and
 65 |       subsequently incorporated within the Work.
 66 | 
 67 |    2. Grant of Copyright License. Subject to the terms and conditions of
 68 |       this License, each Contributor hereby grants to You a perpetual,
 69 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 70 |       copyright license to reproduce, prepare Derivative Works of,
 71 |       publicly display, publicly perform, sublicense, and distribute the
 72 |       Work and such Derivative Works in Source or Object form.
 73 | 
 74 |    3. Grant of Patent License. Subject to the terms and conditions of
 75 |       this License, each Contributor hereby grants to You a perpetual,
 76 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 77 |       (except as stated in this section) patent license to make, have made,
 78 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 79 |       where such license applies only to those patent claims licensable
 80 |       by such Contributor that are necessarily infringed by their
 81 |       Contribution(s) alone or by combination of their Contribution(s)
 82 |       with the Work to which such Contribution(s) was submitted. If You
 83 |       institute patent litigation against any entity (including a
 84 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 85 |       or a Contribution incorporated within the Work constitutes direct
 86 |       or contributory patent infringement, then any patent licenses
 87 |       granted to You under this License for that Work shall terminate
 88 |       as of the date such litigation is filed.
 89 | 
 90 |    4. Redistribution. You may reproduce and distribute copies of the
 91 |       Work or Derivative Works thereof in any medium, with or without
 92 |       modifications, and in Source or Object form, provided that You
 93 |       meet the following conditions:
 94 | 
 95 |       (a) You must give any other recipients of the Work or
 96 |           Derivative Works a copy of this License; and
 97 | 
 98 |       (b) You must cause any modified files to carry prominent notices
 99 |           stating that You changed the files; and
100 | 
101 |       (c) You must retain, in the Source form of any Derivative Works
102 |           that You distribute, all copyright, patent, trademark, and
103 |           attribution notices from the Source form of the Work,
104 |           excluding those notices that do not pertain to any part of
105 |           the Derivative Works; and
106 | 
107 |       (d) If the Work includes a "NOTICE" text file as part of its
108 |           distribution, then any Derivative Works that You distribute must
109 |           include a readable copy of the attribution notices contained
110 |           within such NOTICE file, excluding those notices that do not
111 |           pertain to any part of the Derivative Works, in at least one
112 |           of the following places: within a NOTICE text file distributed
113 |           as part of the Derivative Works; within the Source form or
114 |           documentation, if provided along with the Derivative Works; or,
115 |           within a display generated by the Derivative Works, if and
116 |           wherever such third-party notices normally appear. The contents
117 |           of the NOTICE file are for informational purposes only and
118 |           do not modify the License. You may add Your own attribution
119 |           notices within Derivative Works that You distribute, alongside
120 |           or as an addendum to the NOTICE text from the Work, provided
121 |           that such additional attribution notices cannot be construed
122 |           as modifying the License.
123 | 
124 |       You may add Your own copyright statement to Your modifications and
125 |       may provide additional or different license terms and conditions
126 |       for use, reproduction, or distribution of Your modifications, or
127 |       for any such Derivative Works as a whole, provided Your use,
128 |       reproduction, and distribution of the Work otherwise complies with
129 |       the conditions stated in this License.
130 | 
131 |    5. Submission of Contributions. Unless You explicitly state otherwise,
132 |       any Contribution intentionally submitted for inclusion in the Work
133 |       by You to the Licensor shall be under the terms and conditions of
134 |       this License, without any additional terms or conditions.
135 |       Notwithstanding the above, nothing herein shall supersede or modify
136 |       the terms of any separate license agreement you may have executed
137 |       with Licensor regarding such Contributions.
138 | 
139 |    6. Trademarks. This License does not grant permission to use the trade
140 |       names, trademarks, service marks, or product names of the Licensor,
141 |       except as required for reasonable and customary use in describing the
142 |       origin of the Work and reproducing the content of the NOTICE file.
143 | 
144 |    7. Disclaimer of Warranty. Unless required by applicable law or
145 |       agreed to in writing, Licensor provides the Work (and each
146 |       Contributor provides its Contributions) on an "AS IS" BASIS,
147 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 |       implied, including, without limitation, any warranties or conditions
149 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 |       PARTICULAR PURPOSE. You are solely responsible for determining the
151 |       appropriateness of using or redistributing the Work and assume any
152 |       risks associated with Your exercise of permissions under this License.
153 | 
154 |    8. Limitation of Liability. In no event and under no legal theory,
155 |       whether in tort (including negligence), contract, or otherwise,
156 |       unless required by applicable law (such as deliberate and grossly
157 |       negligent acts) or agreed to in writing, shall any Contributor be
158 |       liable to You for damages, including any direct, indirect, special,
159 |       incidental, or consequential damages of any character arising as a
160 |       result of this License or out of the use or inability to use the
161 |       Work (including but not limited to damages for loss of goodwill,
162 |       work stoppage, computer failure or malfunction, or any and all
163 |       other commercial damages or losses), even if such Contributor
164 |       has been advised of the possibility of such damages.
165 | 
166 |    9. Accepting Warranty or Additional Liability. While redistributing
167 |       the Work or Derivative Works thereof, You may choose to offer,
168 |       and charge a fee for, acceptance of support, warranty, indemnity,
169 |       or other liability obligations and/or rights consistent with this
170 |       License. However, in accepting such obligations, You may act only
171 |       on Your own behalf and on Your sole responsibility, not on behalf
172 |       of any other Contributor, and only if You agree to indemnify,
173 |       defend, and hold each Contributor harmless for any liability
174 |       incurred by, or claims asserted against, such Contributor by reason
175 |       of your accepting any such warranty or additional liability.
176 | 
177 |    END OF TERMS AND CONDITIONS
178 | 
179 |    APPENDIX: How to apply the Apache License to your work.
180 | 
181 |       To apply the Apache License to your work, attach the following
182 |       boilerplate notice, with the fields enclosed by brackets "[]"
183 |       replaced with your own identifying information. (Don't include
184 |       the brackets!)  The text should be enclosed in the appropriate
185 |       comment syntax for the file format. We also recommend that a
186 |       file or class name and description of purpose be included on the
187 |       same "printed page" as the copyright notice for easier
188 |       identification within third-party archives.
189 | 
190 |    Copyright [yyyy] [name of copyright owner]
191 | 
192 |    Licensed under the Apache License, Version 2.0 (the "License");
193 |    you may not use this file except in compliance with the License.
194 |    You may obtain a copy of the License at
195 | 
196 |        http://www.apache.org/licenses/LICENSE-2.0
197 | 
198 |    Unless required by applicable law or agreed to in writing, software
199 |    distributed under the License is distributed on an "AS IS" BASIS,
200 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 |    See the License for the specific language governing permissions and
202 |    limitations under the License.
203 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # gosudachi
  2 | 
  3 | gosudachiは日本語形態素解析器である[Sudachi](https://github.com/WorksApplications/Sudachi)のGo移植版です。
  4 | 
  5 | 以下では、株式会社ワークスアプリケーションズ徳島人工知能NLP研究所が開発公開しているオリジナルのSudachiを「Java版Sudachi」「Java版」、Java版Sudachi用の辞書ファイルを「Java版Sudachi辞書」と表記します。
  6 | 
  7 | gosudachiは、Java版Sudachiのバージョン0.3.0相当です。
  8 | 
  9 | 
 10 | ## 特徴
 11 | 
 12 | 現時点のJava版Sudachiが持つ機能や特徴をすべて移植しました。よって詳しい情報は[Java版の文書](https://github.com/WorksApplications/Sudachi)を参照してください。この文書にはGo版のみに該当する内容が記述されています。
 13 | 
 14 | -   Java版と同じコマンドラインオプション
 15 | -   Java版と同じく分割モード指定が可能
 16 | -   Java版と同じシステム提供プラグイン同梱
 17 | -   Java版と同等のプラグインの仕組みを提供
 18 | -   Java版と同じ設定ファイルが利用可能
 19 | -   ユーザー辞書の作成および利用が可能
 20 | 
 21 | 
 22 | ## Java版とGo版の違い
 23 | 
 24 | -   辞書の文字列エンコード
 25 | -   設定ファイルに指定するプラグイン名
 26 | -   設定ファイルに辞書の文字列エンコードを指定する設定値を新設
 27 | 
 28 | 
 29 | ### 辞書の文字列エンコードを変更した理由
 30 | 
 31 | Java版Sudachiは、辞書の作成時に文字列をUTF-16エンコードのバイト列として記録します。辞書を利用するときは、辞書ファイルをメモリにマップし、バイト列をそのまま（文字コード変換をせずに）文字列として扱います。
 32 | 
 33 | Goの文字列はUTF-8エンコードのバイト列であることが一般的です。GoでJavaと同様に辞書中のバイト列をそのまま文字列として扱うには、UTF-8エンコードで記録された辞書を準備する必要があります。
 34 | 
 35 | Go版ではシステム辞書作成ツールとして `dicbuilder` 、ユーザー辞書作成ツールとして `userdicbuilder` を準備しており、どちらもUTF-8エンコードの辞書を作成します。（UTF-16エンコードの辞書を作成することもできます。 `dicconv` を使って相互に変換することも可能です。）
 36 | 
 37 | ただし、UTF-8エンコードの辞書はUTF-16エンコードの辞書よりもサイズが大きくなります。以下の2点がその理由です。
 38 | 
 39 | -   日本語に使用される文字の多くが、1文字あたりUTF-16では2byte長であり、UTF-8では3byte長
 40 | -   文字列のバイト長を記録するための領域に2byteを使用する頻度が高い
 41 | 
 42 | UTF-8エンコードでのバイト長が127を超える文字列の場合、2byteを使用してバイト長を記録します。なお、UTF-16エンコードの辞書ではバイト長ではなくUTF-16表現でのint16配列の長さを記録しており、記録可能な文字列の長さはUTF-8の方が短くなります。
 43 | 
 44 | ちなみに辞書中に記録される文字列とは、品詞情報リストおよび単語情報です。
 45 | 
 46 | Go版においても、UTF-16エンコードの辞書を利用することが可能です。この場合、辞書から文字列を読み出す処理においてUTF-16からUTF-8への文字コード変換が行われます。利用する辞書のエンコードを設定ファイルに設定できます。
 47 | 
 48 | 
 49 | ### 設定ファイルの違い
 50 | 
 51 | Go版でのみ利用できる設定値に関する記述です。
 52 | 
 53 | 
 54 | #### utf16String
 55 | 
 56 | `utf16String` が `true` になっている場合、UTF-16エンコードの辞書であると判断します。デフォルトはfalseです。
 57 | 
 58 |     {
 59 |         "systemDict" : "system_core_utf16.dic",
 60 |         "utf16String" : true,
 61 |         ...
 62 |     }
 63 | 
 64 | 
 65 | #### プラグイン名
 66 | 
 67 | Go版ではJava版の設定ファイルをそのまま利用することが可能ですが、プラグイン名に省略形を用いることもできます。
 68 | 
 69 | Java版と同様にデフォルトで利用できるプラグインは以下の7つです。省略形とはJavaのクラス階層を省いたプラグイン名です。また、設定先は `class` ではなく `name` にすることも可能です。
 70 | 
 71 | | 処理部分 | プラグイン   | プラグイン名                                              | 省略形                            |
 72 | |-------- |------------ |--------------------------------------------------------- |--------------------------------- |
 73 | | 入力テキスト修正 | 文字列正規化 | com.worksap.nlp.sudachi.DefaultInputTextPlugin            | DefaultInputTextPlugin            |
 74 | |          | 長音正規化   | com.worksap.nlp.sudachi.ProlongedSoundMarkInputTextPlugin | ProlongedSoundMarkInputTextPlugin |
 75 | | 未知語処理 | 1文字未知語  | com.worksap.nlp.sudachi.SimpleOovProviderPlugin           | SimpleOovProviderPlugin           |
 76 | |          | MeCab互換    | com.worksap.nlp.sudachi.MeCabOovProviderPlugin            | MeCabOovProviderPlugin            |
 77 | | 単語接続処理 | 品詞接続禁制 | com.worksap.nlp.sudachi.InhibitConnectionPlugin           | InhibitConnectionPlugin           |
 78 | | 出力解修正 | カタカナ未知語まとめ上げ | com.worksap.nlp.sudachi.JoinKatakanaOovPlugin             | JoinKatakanaOovPlugin             |
 79 | |          | 数詞まとめ上げ | com.worksap.nlp.sudachi.JoinNumericPlugin                 | JoinNumericPlugin                 |
 80 | 
 81 |     {
 82 |         "systemDict" : "system_core.dic",
 83 |         "inputTextPlugin" : [
 84 |             { "name" : "DefaultInputTextPlugin" },
 85 |             { "name" : "ProlongedSoundMarkInputTextPlugin",
 86 |               "prolongedSoundMarks": ["ー", "-", "⁓", "〜", "〰"],
 87 |               "replacementSymbol": "ー"}
 88 |         ],
 89 |         "oovProviderPlugin" : [
 90 |             { "name" : "MeCabOovProviderPlugin" },
 91 |             { "name" : "SimpleOovProviderPlugin",
 92 |               "oovPOS" : [ "補助記号", "一般", "*", "*", "*", "*" ],
 93 |               "leftId" : 5968,
 94 |               "rightId" : 5968,
 95 |               "cost" : 3857 }
 96 |         ],
 97 |         "pathRewritePlugin" : [
 98 |             { "name" : "JoinNumericPlugin",
 99 |               "joinKanjiNumeric" : true },
100 |             { "name" : "JoinKatakanaOovPlugin",
101 |               "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ],
102 |               "minLength" : 3
103 |             }
104 |         ]
105 |     }
106 | 
107 | 
108 | ## Goへのポーティング指針
109 | 
110 | 以下の指針のもと、移植作業を行っています。
111 | 
112 | 1.  なるべくJavaのコードに似たような構成にする
113 |     -   オリジナルに修正が入ったときに追随しやすいように
114 | 
115 | 2.  Java版Sudachiと同じ設定ファイルが利用できるように
116 | 
117 | 3.  Java版Sudachiのコマンドラインインターフェースも同じにする
118 | 
119 | 4.  Java版Sudachi用に作成された辞書ファイルをGo版でも使えるように
120 | 
121 | 5.  Java版Sudachi用の辞書が作れるように
122 | 
123 | 
124 | ## ビルド
125 | 
126 | プログラムと辞書を作成する方法です。
127 | 
128 | 
129 | ### プログラムのビルド
130 | 
131 | このリポジトリをcloneします。 cloneしたディレクトリに移動し、ビルドスクリプトを実行します。
132 | 
133 |     $ git clone https://github.com/msnoigrs/gosudachi
134 |     $ cd gosudachi
135 |     $ bash scripts/build.sh
136 | 
137 | distディレクトリにバイナリが作成されます。作成されるバイナリは以下の通りです。
138 | 
139 | -   **gosudachicli:** Sudachiコマンドライン
140 | -   **dicbuilder:** システム辞書作成ツール
141 | -   **userdicbuilder:** ユーザー辞書作成ツール
142 | -   **printdic:** 辞書ファイルに登録されている単語リスト表示プログラム
143 | -   **printdicheader:** 辞書ファイルヘッダ情報表示プログラム
144 | -   **dicconv:** 辞書の文字列エンコードをUTF-16とUTF-8間で相互に変換するプログラム
145 | 
146 | ビルドスクリプトを使わない場合は、コマンドプロンプト上で以下を実行してください。Windowsでも作成可能です。
147 | 
148 |     $ git clone https://github.com/msnoigrs/gosudachi
149 |     $ cd gosudachi/data
150 |     $ go generate
151 |     $ cd ..
152 |     $ cd gosudachicli
153 |     $ go build
154 |     $ cd ..
155 |     $ cd dicbuilder
156 |     $ go build
157 |     $ cd ..
158 |     $ cd userdicbuilder
159 |     $ go build
160 |     $ cd ..
161 |     $ cd printdic
162 |     $ go build
163 |     $ cd ..
164 |     $ cd printdicheader
165 |     $ go build
166 |     $ cd ..
167 |     $ cd dicconv
168 |     $ go build
169 | 
170 | 
171 | ### 辞書の作成
172 | 
173 | 辞書のソースもJava版Sudachiのものを利用します。 [SudachiDict](https://github.com/WorksApplications/SudachiDict)をgithubからcloneした後、git lfs pullで取得します。 辞書のソースファイルは、 `small_lex.csv` と `core_lex.csv` と `notcore_lex.csv` の3つです。
174 | 
175 | 辞書を作成するスクリプトを利用する場合、以下を実行してください。
176 | 
177 |     $ git clone https://github.com/WorksApplications/SudachiDict.git
178 |     $ cd SudachiDict
179 |     $ git lfs pull
180 |     $ cd ../dist
181 |     $ bash ../scripts/mksystemdic.sh ../SudachiDict
182 | 
183 | distディレクトリに `system_small.dic` 、 `system_core.dic` および `system_full.dic` ファイルが作成されます。
184 | 
185 | 辞書作成スクリプトを使わない場合は、コマンドプロンプト上で以下を実行してください。
186 | 
187 |     $ dicbuilder -o system_small.dic -m matrix.def small_lex.csv
188 |     $ dicbuilder -o system_core.dic -m matrix.def small_lex.csv core_lex.csv
189 |     $ dicbuilder -o system_full.dic -m matrix.def small_lex.csv core_lex.csv notcore_lex.csv
190 | 
191 | 
192 | ## コマンド
193 | 
194 | Go版で提供するコマンドの説明です。
195 | 
196 | 
197 | ### gosudachicli
198 | 
199 | Sudachiコマンドラインです。オプションを指定せずに実行する場合、 `system_core.dic` ファイルが実行時のディレクトリに存在する必要があります。辞書ファイルの場所は設定ファイルに指定可能です。
200 | 
201 |     $ gosudachicli [-r conf] [-s json] [-p dir] [-m mode] [-a] [-d] [-o output] [-f] [-j] [file...]
202 | 
203 | 
204 | #### オプション
205 | 
206 | -   -r conf 設定ファイルを指定
207 | -   -s デフォルト設定を上書きする設定(json文字列)
208 | -   -p リソースディレクトリ(設定ファイル内の各種リソースのベースディレクトリ、デフォルトは実行時ディレクトリ)
209 | -   -m {A|B|C} 分割モード
210 | -   -a 読み、辞書形も出力
211 | -   -d デバッグ情報の出力
212 | -   -o 出力ファイル（指定がない場合は標準出力）
213 | -   -f エラーを無視して処理を続行する
214 | -   -j UTF-16エンコードの辞書ファイルを利用する
215 | 
216 | 
217 | #### 出力例
218 | 
219 |     $ echo 東京都へ行く | gosudachicli
220 |     東京都  名詞,固有名詞,地名,一般,*,*     東京都
221 |     へ      助詞,格助詞,*,*,*,*     へ
222 |     行く    動詞,非自立可能,*,*,五段-カ行,終止形-一般       行く
223 |     EOS
224 |     
225 |     $ echo 東京都へ行く | gosudachicli -a
226 |     東京都  名詞,固有名詞,地名,一般,*,*     東京都  東京都  トウキョウト
227 |     へ      助詞,格助詞,*,*,*,*     へ      へ      エ
228 |     行く    動詞,非自立可能,*,*,五段-カ行,終止形-一般       行く    行く    イク
229 |     EOS
230 |     
231 |     $ echo 東京都へ行く | gosudachicli -m A
232 |     東京    名詞,固有名詞,地名,一般,*,*     東京
233 |     都      名詞,普通名詞,一般,*,*,*        都
234 |     へ      助詞,格助詞,*,*,*,*     へ
235 |     行く    動詞,非自立可能,*,*,五段-カ行,終止形-一般       行く
236 |     EOS
237 | 
238 | -   **Java版:** com.worksap.nlp.sudachi.SudachiCommandLine
239 | 
240 | 
241 | ### dicbuilder
242 | 
243 | 辞書ソースファイルからシステム辞書を作成します。デフォルトではUTF-8エンコードの辞書が作成されます。
244 | 
245 |     $ dicbuilder -o outputdic -m matrix.def [-d description] [-j] filecsv1 [filecsv2...]
246 | 
247 | 
248 | #### オプション
249 | 
250 | -   -o 出力ファイル（必須）
251 | -   -m matrix.defファイル（必須）
252 | -   -d 辞書ヘッダ情報に埋め込む文字
253 | -   -j UTF-16エンコードの辞書ファイルを生成する
254 | 
255 | -   **Java版:** com.worksap.nlp.sudachi.dictionary.DictionaryBuilder
256 | 
257 | 
258 | ### userdicbuilder
259 | 
260 | ユーザー辞書ソースファイルからユーザー辞書を作成します。デフォルトではUTF-8エンコードの辞書が作成されます。
261 | 
262 |     $ userdicbuilder -o outputdic -s systemdic [-d description] [-j] filecsv1 [filecsv2...]
263 | 
264 | 
265 | #### オプション
266 | 
267 | -   -o 出力ファイル（必須）
268 | -   -s システム辞書ファイル（必須）
269 | -   -d 辞書ヘッダ情報に埋め込む文字
270 | -   -j UTF-16エンコードの辞書ファイルを生成する
271 | 
272 | -   **Java版:** com.worksap.nlp.sudachi.dictionary.UserDictionaryBuilder
273 | 
274 | 
275 | ### printdic
276 | 
277 | 辞書ファイルに登録されている単語リストを表示します。
278 | 
279 |     $ printdic [-s systemdic] [-j] inputdic
280 | 
281 | 
282 | #### オプション
283 | 
284 | -   -s システム辞書ファイル（ユーザー辞書の情報を出力する場合に必要）
285 | -   -j UTF-16エンコードの辞書を読み込み
286 | 
287 | -   **Java版:** com.worksap.nlp.sudachi.dictionary.DictionaryPrinter
288 | 
289 | 
290 | ### printdicheader
291 | 
292 | 辞書ファイルのヘッダ情報を表示します。
293 | 
294 |     $ printdicheader inputdic
295 | 
296 | -   **Java版:** com.worksap.nlp.sudachi.dictionary.DictionaryHeaderPrinter
297 | 
298 | 
299 | ### dicconv
300 | 
301 | 辞書ファイルに記録されている文字列のエンコードを変換します。オプションを指定しない場合、UTF-16エンコード（Java版）からUTF-8エンコード（Go版）に変換します。
302 | 
303 |     $ dicconv [-o outputdic] [-j] inputdic
304 | 
305 | 
306 | #### オプション
307 | 
308 | -   -o 出力ファイル、省略すると `out_utf16.dic` もしくは `out_utf8.dic` に出力
309 | -   -j UTF-8エンコードからUTF-16エンコードに変換する
310 | 
311 | 
312 | ## ライセンス
313 | 
314 | Java版Sudachiと同じ[Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0.html)
315 | 
316 | 
317 | ## 謝辞
318 | 
319 | [Sudachi](https://github.com/WorksApplications/Sudachi)においてプログラムや辞書をOSSとして公開されている、株式会社ワークスアプリケーションズ徳島人工知能NLP研究所およびその開発者の方々に感謝いたします。
320 | 


--------------------------------------------------------------------------------
/README.org:
--------------------------------------------------------------------------------
  1 | #+TITLE: gosudachi
  2 | #+AUTHOR: 五十嵐 正尚
  3 | #+EMAIL: syoux2@gmail.com
  4 | #+DATE: 2019/08/03
  5 | #+DESCRIPTION: Go porting of Sudachi
  6 | #+KEYWORDS:
  7 | #+LANGUAGE:  ja
  8 | #+OPTIONS: H:4 num:nil toc:nil ::t |:t ^:t -:t f:t *:t <:t
  9 | #+OPTIONS: tex:t todo:t pri:nil tags:t texht:nil
 10 | #+OPTIONS: author:t creator:nil email:nil date:t
 11 | 
 12 | * gosudachi
 13 | 
 14 | gosudachiは日本語形態素解析器である[[https://github.com/WorksApplications/Sudachi][Sudachi]]のGo移植版です。
 15 | 
 16 | 以下では、株式会社ワークスアプリケーションズ徳島人工知能NLP研究所が開発公開しているオリジナルのSudachiを「Java版Sudachi」「Java版」、Java版Sudachi用の辞書ファイルを「Java版Sudachi辞書」と表記します。
 17 | 
 18 | gosudachiは、Java版Sudachiのバージョン0.3.0相当です。
 19 | 
 20 | ** 特徴
 21 | 
 22 | 現時点のJava版Sudachiが持つ機能や特徴をすべて移植しました。よって詳しい情報は[[https://github.com/WorksApplications/Sudachi][Java版の文書]]を参照してください。この文書にはGo版のみに該当する内容が記述されています。
 23 | 
 24 | - Java版と同じコマンドラインオプション
 25 | - Java版と同じく分割モード指定が可能
 26 | - Java版と同じシステム提供プラグイン同梱
 27 | - Java版と同等のプラグインの仕組みを提供
 28 | - Java版と同じ設定ファイルが利用可能
 29 | - ユーザー辞書の作成および利用が可能
 30 | 
 31 | ** Java版とGo版の違い
 32 | 
 33 | - 辞書の文字列エンコード
 34 | - 設定ファイルに指定するプラグイン名
 35 | - 設定ファイルに辞書の文字列エンコードを指定する設定値を新設
 36 | 
 37 | *** 辞書の文字列エンコードを変更した理由
 38 | 
 39 | Java版Sudachiは、辞書の作成時に文字列をUTF-16エンコードのバイト列として記録します。辞書を利用するときは、辞書ファイルをメモリにマップし、バイト列をそのまま（文字コード変換をせずに）文字列として扱います。
 40 | 
 41 | Goの文字列はUTF-8エンコードのバイト列であることが一般的です。GoでJavaと同様に辞書中のバイト列をそのまま文字列として扱うには、UTF-8エンコードで記録された辞書を準備する必要があります。
 42 | 
 43 | Go版ではシステム辞書作成ツールとして ~dicbuilder~ 、ユーザー辞書作成ツールとして ~userdicbuilder~ を準備しており、どちらもUTF-8エンコードの辞書を作成します。（UTF-16エンコードの辞書を作成することもできます。 ~dicconv~ を使って相互に変換することも可能です。）
 44 | 
 45 | ただし、UTF-8エンコードの辞書はUTF-16エンコードの辞書よりもサイズが大きくなります。以下の2点がその理由です。
 46 | 
 47 | - 日本語に使用される文字の多くが、1文字あたりUTF-16では2byte長であり、UTF-8では3byte長
 48 | - 文字列のバイト長を記録するための領域に2byteを使用する頻度が高い
 49 | 
 50 | UTF-8エンコードでのバイト長が127を超える文字列の場合、2byteを使用してバイト長を記録します。なお、UTF-16エンコードの辞書ではバイト長ではなくUTF-16表現でのint16配列の長さを記録しており、記録可能な文字列の長さはUTF-8の方が短くなります。
 51 | 
 52 | ちなみに辞書中に記録される文字列とは、品詞情報リストおよび単語情報です。
 53 | 
 54 | Go版においても、UTF-16エンコードの辞書を利用することが可能です。この場合、辞書から文字列を読み出す処理においてUTF-16からUTF-8への文字コード変換が行われます。利用する辞書のエンコードを設定ファイルに設定できます。
 55 | 
 56 | *** 設定ファイルの違い
 57 | 
 58 | Go版でのみ利用できる設定値に関する記述です。
 59 | 
 60 | **** utf16String
 61 | 
 62 | ~utf16String~ が ~true~ になっている場合、UTF-16エンコードの辞書であると判断します。デフォルトはfalseです。
 63 | 
 64 | #+BEGIN_EXAMPLE
 65 | {
 66 |     "systemDict" : "system_core_utf16.dic",
 67 |     "utf16String" : true,
 68 |     ...
 69 | }
 70 | #+END_EXAMPLE
 71 | 
 72 | **** プラグイン名
 73 | 
 74 | Go版ではJava版の設定ファイルをそのまま利用することが可能ですが、プラグイン名に省略形を用いることもできます。
 75 | 
 76 | Java版と同様にデフォルトで利用できるプラグインは以下の7つです。省略形とはJavaのクラス階層を省いたプラグイン名です。また、設定先は ~class~ ではなく ~name~ にすることも可能です。
 77 | 
 78 | | 処理部分         | プラグイン               | プラグイン名                                              | 省略形                            |
 79 | |------------------+--------------------------+-----------------------------------------------------------+-----------------------------------|
 80 | | 入力テキスト修正 | 文字列正規化             | com.worksap.nlp.sudachi.DefaultInputTextPlugin            | DefaultInputTextPlugin            |
 81 | |                  | 長音正規化               | com.worksap.nlp.sudachi.ProlongedSoundMarkInputTextPlugin | ProlongedSoundMarkInputTextPlugin |
 82 | | 未知語処理       | 1文字未知語              | com.worksap.nlp.sudachi.SimpleOovProviderPlugin           | SimpleOovProviderPlugin           |
 83 | |                  | MeCab互換                | com.worksap.nlp.sudachi.MeCabOovProviderPlugin            | MeCabOovProviderPlugin            |
 84 | | 単語接続処理     | 品詞接続禁制             | com.worksap.nlp.sudachi.InhibitConnectionPlugin           | InhibitConnectionPlugin           |
 85 | | 出力解修正       | カタカナ未知語まとめ上げ | com.worksap.nlp.sudachi.JoinKatakanaOovPlugin             | JoinKatakanaOovPlugin             |
 86 | |                  | 数詞まとめ上げ           | com.worksap.nlp.sudachi.JoinNumericPlugin                 | JoinNumericPlugin                 |
 87 | 
 88 | #+BEGIN_EXAMPLE
 89 | {
 90 |     "systemDict" : "system_core.dic",
 91 |     "inputTextPlugin" : [
 92 |         { "name" : "DefaultInputTextPlugin" },
 93 |         { "name" : "ProlongedSoundMarkInputTextPlugin",
 94 |           "prolongedSoundMarks": ["ー", "-", "⁓", "〜", "〰"],
 95 |           "replacementSymbol": "ー"}
 96 |     ],
 97 |     "oovProviderPlugin" : [
 98 |         { "name" : "MeCabOovProviderPlugin" },
 99 |         { "name" : "SimpleOovProviderPlugin",
100 |           "oovPOS" : [ "補助記号", "一般", "*", "*", "*", "*" ],
101 |           "leftId" : 5968,
102 |           "rightId" : 5968,
103 |           "cost" : 3857 }
104 |     ],
105 |     "pathRewritePlugin" : [
106 |         { "name" : "JoinNumericPlugin",
107 |           "joinKanjiNumeric" : true },
108 |         { "name" : "JoinKatakanaOovPlugin",
109 |           "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ],
110 |           "minLength" : 3
111 |         }
112 |     ]
113 | }
114 | #+END_EXAMPLE
115 | 
116 | ** Goへのポーティング指針
117 | 
118 | 以下の指針のもと、移植作業を行っています。
119 | 
120 | 1. なるべくJavaのコードに似たような構成にする
121 |   + オリジナルに修正が入ったときに追随しやすいように
122 | 
123 | 2. Java版Sudachiと同じ設定ファイルが利用できるように
124 | 
125 | 3. Java版Sudachiのコマンドラインインターフェースも同じにする
126 | 
127 | 4. Java版Sudachi用に作成された辞書ファイルをGo版でも使えるように
128 | 
129 | 5. Java版Sudachi用の辞書が作れるように
130 | 
131 | ** ビルド
132 | 
133 | プログラムと辞書を作成する方法です。
134 | 
135 | *** プログラムのビルド
136 | 
137 | このリポジトリをcloneします。
138 | cloneしたディレクトリに移動し、ビルドスクリプトを実行します。
139 | 
140 | #+BEGIN_EXAMPLE
141 | $ git clone https://github.com/msnoigrs/gosudachi
142 | $ cd gosudachi
143 | $ bash scripts/build.sh
144 | #+END_EXAMPLE
145 | 
146 | distディレクトリにバイナリが作成されます。作成されるバイナリは以下の通りです。
147 | 
148 | - gosudachicli :: Sudachiコマンドライン
149 | - dicbuilder :: システム辞書作成ツール
150 | - userdicbuilder :: ユーザー辞書作成ツール
151 | - printdic :: 辞書ファイルに登録されている単語リスト表示プログラム
152 | - printdicheader :: 辞書ファイルヘッダ情報表示プログラム
153 | - dicconv :: 辞書の文字列エンコードをUTF-16とUTF-8間で相互に変換するプログラム
154 | 
155 | ビルドスクリプトを使わない場合は、コマンドプロンプト上で以下を実行してください。Windowsでも作成可能です。
156 | 
157 | #+BEGIN_EXAMPLE
158 | $ git clone https://github.com/msnoigrs/gosudachi
159 | $ cd gosudachi/data
160 | $ go generate
161 | $ cd ..
162 | $ cd gosudachicli
163 | $ go build
164 | $ cd ..
165 | $ cd dicbuilder
166 | $ go build
167 | $ cd ..
168 | $ cd userdicbuilder
169 | $ go build
170 | $ cd ..
171 | $ cd printdic
172 | $ go build
173 | $ cd ..
174 | $ cd printdicheader
175 | $ go build
176 | $ cd ..
177 | $ cd dicconv
178 | $ go build
179 | #+END_EXAMPLE
180 | 
181 | *** 辞書の作成
182 | 
183 | 辞書のソースもJava版Sudachiのものを利用します。
184 | [[https://github.com/WorksApplications/SudachiDict][SudachiDict]]をgithubからcloneした後、git lfs pullで取得します。
185 | 辞書のソースファイルは、 ~small_lex.csv~ と ~core_lex.csv~ と ~notcore_lex.csv~ の3つです。
186 | 
187 | 辞書を作成するスクリプトを利用する場合、以下を実行してください。
188 | 
189 | #+BEGIN_EXAMPLE
190 | $ git clone https://github.com/WorksApplications/SudachiDict.git
191 | $ cd SudachiDict
192 | $ git lfs pull
193 | $ cd ../dist
194 | $ bash ../scripts/mksystemdic.sh ../SudachiDict
195 | #+END_EXAMPLE
196 | 
197 | distディレクトリに ~system_small.dic~ 、 ~system_core.dic~ および ~system_full.dic~ ファイルが作成されます。
198 | 
199 | 辞書作成スクリプトを使わない場合は、コマンドプロンプト上で以下を実行してください。
200 | 
201 | #+BEGIN_EXAMPLE
202 | $ dicbuilder -o system_small.dic -m matrix.def small_lex.csv
203 | $ dicbuilder -o system_core.dic -m matrix.def small_lex.csv core_lex.csv
204 | $ dicbuilder -o system_full.dic -m matrix.def small_lex.csv core_lex.csv notcore_lex.csv
205 | #+END_EXAMPLE
206 | 
207 | ** コマンド
208 | 
209 | Go版で提供するコマンドの説明です。
210 | 
211 | *** gosudachicli
212 | 
213 | Sudachiコマンドラインです。オプションを指定せずに実行する場合、 ~system_core.dic~ ファイルが実行時のディレクトリに存在する必要があります。辞書ファイルの場所は設定ファイルに指定可能です。
214 | 
215 | #+BEGIN_EXAMPLE
216 | $ gosudachicli [-r conf] [-s json] [-p dir] [-m mode] [-a] [-d] [-o output] [-f] [-j] [file...]
217 | #+END_EXAMPLE
218 | 
219 | **** オプション
220 | 
221 | - -r conf 設定ファイルを指定
222 | - -s デフォルト設定を上書きする設定(json文字列)
223 | - -p リソースディレクトリ(設定ファイル内の各種リソースのベースディレクトリ、デフォルトは実行時ディレクトリ)
224 | - -m {A|B|C} 分割モード
225 | - -a 読み、辞書形も出力
226 | - -d デバッグ情報の出力
227 | - -o 出力ファイル（指定がない場合は標準出力）
228 | - -f エラーを無視して処理を続行する
229 | - -j UTF-16エンコードの辞書ファイルを利用する
230 | 
231 | **** 出力例
232 | 
233 | #+BEGIN_EXAMPLE
234 | $ echo 東京都へ行く | gosudachicli
235 | 東京都  名詞,固有名詞,地名,一般,*,*     東京都
236 | へ      助詞,格助詞,*,*,*,*     へ
237 | 行く    動詞,非自立可能,*,*,五段-カ行,終止形-一般       行く
238 | EOS
239 | 
240 | $ echo 東京都へ行く | gosudachicli -a
241 | 東京都  名詞,固有名詞,地名,一般,*,*     東京都  東京都  トウキョウト
242 | へ      助詞,格助詞,*,*,*,*     へ      へ      エ
243 | 行く    動詞,非自立可能,*,*,五段-カ行,終止形-一般       行く    行く    イク
244 | EOS
245 | 
246 | $ echo 東京都へ行く | gosudachicli -m A
247 | 東京    名詞,固有名詞,地名,一般,*,*     東京
248 | 都      名詞,普通名詞,一般,*,*,*        都
249 | へ      助詞,格助詞,*,*,*,*     へ
250 | 行く    動詞,非自立可能,*,*,五段-カ行,終止形-一般       行く
251 | EOS
252 | #+END_EXAMPLE
253 | 
254 | - Java版 :: com.worksap.nlp.sudachi.SudachiCommandLine
255 | 
256 | *** dicbuilder
257 | 
258 | 辞書ソースファイルからシステム辞書を作成します。デフォルトではUTF-8エンコードの辞書が作成されます。
259 | 
260 | #+BEGIN_EXAMPLE
261 | $ dicbuilder -o outputdic -m matrix.def [-d description] [-j] filecsv1 [filecsv2...]
262 | #+END_EXAMPLE
263 | 
264 | **** オプション
265 | 
266 | - -o 出力ファイル（必須）
267 | - -m matrix.defファイル（必須）
268 | - -d 辞書ヘッダ情報に埋め込む文字
269 | - -j UTF-16エンコードの辞書ファイルを生成する
270 | 
271 | - Java版 :: com.worksap.nlp.sudachi.dictionary.DictionaryBuilder
272 | 
273 | *** userdicbuilder
274 | 
275 | ユーザー辞書ソースファイルからユーザー辞書を作成します。デフォルトではUTF-8エンコードの辞書が作成されます。
276 | 
277 | #+BEGIN_EXAMPLE
278 | $ userdicbuilder -o outputdic -s systemdic [-d description] [-j] filecsv1 [filecsv2...]
279 | #+END_EXAMPLE
280 | 
281 | **** オプション
282 | 
283 | - -o 出力ファイル（必須）
284 | - -s システム辞書ファイル（必須）
285 | - -d 辞書ヘッダ情報に埋め込む文字
286 | - -j UTF-16エンコードの辞書ファイルを生成する
287 | 
288 | - Java版 :: com.worksap.nlp.sudachi.dictionary.UserDictionaryBuilder
289 | 
290 | *** printdic
291 | 
292 | 辞書ファイルに登録されている単語リストを表示します。
293 | 
294 | #+BEGIN_EXAMPLE
295 | $ printdic [-s systemdic] [-j] inputdic
296 | #+END_EXAMPLE
297 | 
298 | **** オプション
299 | 
300 | - -s システム辞書ファイル（ユーザー辞書の情報を出力する場合に必要）
301 | - -j UTF-16エンコードの辞書を読み込み
302 | 
303 | - Java版 :: com.worksap.nlp.sudachi.dictionary.DictionaryPrinter
304 | 
305 | *** printdicheader
306 | 
307 | 辞書ファイルのヘッダ情報を表示します。
308 | 
309 | #+BEGIN_EXAMPLE
310 | $ printdicheader inputdic
311 | #+END_EXAMPLE
312 | 
313 | - Java版 :: com.worksap.nlp.sudachi.dictionary.DictionaryHeaderPrinter
314 | 
315 | *** dicconv
316 | 
317 | 辞書ファイルに記録されている文字列のエンコードを変換します。オプションを指定しない場合、UTF-16エンコード（Java版）からUTF-8エンコード（Go版）に変換します。
318 | 
319 | #+BEGIN_EXAMPLE
320 | $ dicconv [-o outputdic] [-j] inputdic
321 | #+END_EXAMPLE
322 | 
323 | **** オプション
324 | 
325 | - -o 出力ファイル、省略すると ~out_utf16.dic~ もしくは ~out_utf8.dic~ に出力
326 | - -j UTF-8エンコードからUTF-16エンコードに変換する
327 | 
328 | ** ライセンス
329 | 
330 | Java版Sudachiと同じ[[http://www.apache.org/licenses/LICENSE-2.0.html][Apache License, Version 2.0]]
331 | 
332 | ** 謝辞
333 | 
334 | [[https://github.com/WorksApplications/Sudachi][Sudachi]]においてプログラムや辞書をOSSとして公開されている、株式会社ワークスアプリケーションズ徳島人工知能NLP研究所およびその開発者の方々に感謝いたします。
335 | 


--------------------------------------------------------------------------------
/dartsclone/bitvector.go:
--------------------------------------------------------------------------------
 1 | package dartsclone
 2 | 
 3 | const (
 4 | 	unitLength = 32
 5 | )
 6 | 
 7 | type bitVector struct {
 8 | 	units   []uint32
 9 | 	ranks   []int
10 | 	numOnes int
11 | 	length  int
12 | }
13 | 
14 | func newBitVector() *bitVector {
15 | 	return &bitVector{}
16 | }
17 | 
18 | func (v *bitVector) get(id int) bool {
19 | 	return v.units[id/unitLength]>>((uint(id)%unitLength)&1) == 1
20 | }
21 | 
22 | func (v *bitVector) rank(id int) int {
23 | 	const mask = uint32(0xffffffff)
24 | 	unitId := id / unitLength
25 | 	offset := uint(id % unitLength)
26 | 	return v.ranks[unitId] + popCount(v.units[unitId] & ^(mask<<offset))
27 | }
28 | 
29 | func (v *bitVector) set(id int, bit bool) {
30 | 	if bit {
31 | 		v.units[id/unitLength] |= uint32(1) << uint(id%unitLength)
32 | 	} else {
33 | 		v.units[id/unitLength] &= ^(uint32(1) << uint(id%unitLength))
34 | 	}
35 | }
36 | 
37 | func (v *bitVector) extend() {
38 | 	if (v.length % unitLength) == 0 {
39 | 		v.units = append(v.units, 0)
40 | 	}
41 | 	v.length++
42 | }
43 | 
44 | func (v *bitVector) build() {
45 | 	v.ranks = make([]int, len(v.units), len(v.units))
46 | 	v.numOnes = 0
47 | 	for i := 0; i < len(v.units); i++ {
48 | 		v.ranks[i] = v.numOnes
49 | 		v.numOnes += popCount(v.units[i])
50 | 	}
51 | }
52 | 
53 | func (v *bitVector) clear() {
54 | 	v.units = v.units[:0]
55 | 	v.ranks = []int{}
56 | }
57 | 
58 | func popCount(unit uint32) int {
59 | 	unit = ((unit & 0xAAAAAAAA) >> 1) + (unit & 0x55555555)
60 | 	unit = ((unit & 0xCCCCCCCC) >> 2) + (unit & 0x33333333)
61 | 	unit = ((unit >> 4) + unit) & 0x0F0F0F0F
62 | 	unit += unit >> 8
63 | 	unit += unit >> 16
64 | 	return int(unit & 0xFF)
65 | }
66 | 


--------------------------------------------------------------------------------
/dartsclone/da.go:
--------------------------------------------------------------------------------
  1 | package dartsclone
  2 | 
  3 | import (
  4 | 	"errors"
  5 | 	"io"
  6 | 	"os"
  7 | 
  8 | 	// "math"
  9 | 	"unsafe"
 10 | 
 11 | 	"github.com/msnoigrs/gosudachi/internal/mmap"
 12 | )
 13 | 
 14 | type DoubleArray struct {
 15 | 	array  []uint32
 16 | 	buffer []byte
 17 | }
 18 | 
 19 | func NewDoubleArray() *DoubleArray {
 20 | 	return &DoubleArray{}
 21 | }
 22 | 
 23 | func (da *DoubleArray) SetArray(array []uint32) {
 24 | 	da.array = array
 25 | 	da.buffer = asByteArray(array)
 26 | }
 27 | 
 28 | func (da *DoubleArray) SetBuffer(buffer []byte) {
 29 | 	da.buffer = buffer
 30 | 	da.array = asUInt32Array(buffer)
 31 | }
 32 | 
 33 | func (da *DoubleArray) Array() []uint32 {
 34 | 	return da.array
 35 | }
 36 | 
 37 | func (da *DoubleArray) ByteArray() []byte {
 38 | 	return da.buffer
 39 | }
 40 | 
 41 | func (da *DoubleArray) Clear() {
 42 | 	da.buffer = []byte{}
 43 | 	da.array = []uint32{}
 44 | }
 45 | 
 46 | func (da *DoubleArray) Length() int {
 47 | 	return len(da.array)
 48 | }
 49 | 
 50 | func (da *DoubleArray) TotalSize() int {
 51 | 	return len(da.buffer)
 52 | }
 53 | 
 54 | func (da *DoubleArray) Build(keys [][]byte, values []int, f ProgressFunc) error {
 55 | 	var err error
 56 | 	dab := newDoubleArrayBuilder(f)
 57 | 	da.array, err = dab.build(newKeySet(keys, values))
 58 | 	if err != nil {
 59 | 		return err
 60 | 	}
 61 | 	da.buffer = asByteArray(da.array)
 62 | 
 63 | 	return nil
 64 | }
 65 | 
 66 | func (da *DoubleArray) Open(f *os.File, position int64, totalSize int64) (err error) {
 67 | 	if position < 0 {
 68 | 		position = 0
 69 | 	}
 70 | 	if totalSize <= 0 {
 71 | 		finfo, err := f.Stat()
 72 | 		if err != nil {
 73 | 			return err
 74 | 		}
 75 | 		totalSize = finfo.Size()
 76 | 	}
 77 | 	da.buffer, err = mmap.Mmap(f, false, position, totalSize)
 78 | 	if err != nil {
 79 | 		return err
 80 | 	}
 81 | 	// err = mmap.Madvise(da.buffer, false)
 82 | 	// if err != nil {
 83 | 	// 	return err
 84 | 	// }
 85 | 	da.array = asUInt32Array(da.buffer)
 86 | 
 87 | 	return nil
 88 | }
 89 | 
 90 | func (da *DoubleArray) Close() error {
 91 | 	err := mmap.Munmap(da.buffer)
 92 | 	if err != nil {
 93 | 		return err
 94 | 	}
 95 | 	da.buffer = []byte{}
 96 | 	da.array = []uint32{}
 97 | 
 98 | 	return nil
 99 | }
100 | 
101 | func (da *DoubleArray) Save(writer io.Writer) (int, error) {
102 | 	return writer.Write(da.buffer)
103 | }
104 | 
105 | func (da *DoubleArray) ExactMatchSearch(key []byte) (int, int) {
106 | 	var nodePos uint32
107 | 	u := daunit(da.array[0])
108 | 
109 | 	for _, k := range key {
110 | 		nodePos ^= u.offset() ^ uint32(k)
111 | 		u = daunit(da.array[int(nodePos)])
112 | 		if u.label() != uint32(k) {
113 | 			return -1, 0
114 | 		}
115 | 	}
116 | 	if !u.hasLeaf() {
117 | 		return -1, 0
118 | 	}
119 | 	u = daunit(da.array[int(nodePos^u.offset())])
120 | 	return u.value(), len(key)
121 | }
122 | 
123 | func (da *DoubleArray) CommonPrefixSearch(key []byte, offset int, maxNumResult int) [][2]int {
124 | 	result := make([][2]int, 0)
125 | 
126 | 	var nodePos uint32
127 | 	u := daunit(da.array[0])
128 | 	nodePos ^= u.offset()
129 | 	for i := offset; i < len(key); i++ {
130 | 		k := uint32(key[i])
131 | 		nodePos ^= k
132 | 		u = daunit(da.array[int(nodePos)])
133 | 		if u.label() != k {
134 | 			return result
135 | 		}
136 | 
137 | 		nodePos ^= u.offset()
138 | 		if u.hasLeaf() && len(result) < maxNumResult {
139 | 			result = append(result, [2]int{daunit(da.array[int(nodePos)]).value(), i + 1})
140 | 		}
141 | 	}
142 | 	return result
143 | }
144 | 
145 | func (da *DoubleArray) CommonPrefixSearchItr(key []byte, offset int) *Iterator {
146 | 	return newIterator(da.array, key, offset)
147 | }
148 | 
149 | type Iterator struct {
150 | 	array   []uint32
151 | 	key     []byte
152 | 	offset  int
153 | 	nodePos uint32
154 | 	rvalue  int
155 | 	roffset int
156 | 	err     error
157 | }
158 | 
159 | func newIterator(array []uint32, key []byte, offset int) *Iterator {
160 | 	var nodePos uint32
161 | 	u := daunit(array[0])
162 | 	nodePos ^= u.offset()
163 | 	return &Iterator{
164 | 		array:   array,
165 | 		key:     key,
166 | 		offset:  offset,
167 | 		nodePos: nodePos,
168 | 		rvalue:  -1,
169 | 	}
170 | }
171 | 
172 | func (it *Iterator) Next() bool {
173 | 	if it.err != nil {
174 | 		return false
175 | 	}
176 | 	if it.rvalue == -1 {
177 | 		it.rvalue, it.roffset = it.getNext()
178 | 	}
179 | 	return it.rvalue != -1
180 | }
181 | 
182 | func (it *Iterator) Get() (int, int) {
183 | 	var (
184 | 		rvalue  int
185 | 		roffset int
186 | 	)
187 | 	if it.rvalue == -1 {
188 | 		rvalue, roffset = it.getNext()
189 | 		if rvalue == -1 {
190 | 			it.err = errors.New("No more element")
191 | 			return rvalue, roffset
192 | 		}
193 | 	} else {
194 | 		rvalue = it.rvalue
195 | 		roffset = it.roffset
196 | 		it.rvalue = -1
197 | 		it.roffset = 0
198 | 	}
199 | 	return rvalue, roffset
200 | }
201 | 
202 | func (it *Iterator) Err() error {
203 | 	return it.err
204 | }
205 | 
206 | func (it *Iterator) getNext() (int, int) {
207 | 	for ; it.offset < len(it.key); it.offset++ {
208 | 		k := uint32(it.key[it.offset])
209 | 		it.nodePos ^= k
210 | 		u := daunit(it.array[int(it.nodePos)])
211 | 		if u.label() != k {
212 | 			it.offset = len(it.key) // no more loop
213 | 			return -1, 0
214 | 		}
215 | 
216 | 		it.nodePos ^= u.offset()
217 | 		if u.hasLeaf() {
218 | 			it.offset++
219 | 			rvalue := daunit(it.array[int(it.nodePos)]).value()
220 | 			roffset := it.offset
221 | 			return rvalue, roffset
222 | 		}
223 | 	}
224 | 	return -1, 0
225 | }
226 | 
227 | type TraverseResult struct {
228 | 	Result       int
229 | 	Offset       int
230 | 	NodePosition int
231 | }
232 | 
233 | func (da *DoubleArray) Traverse(key []byte, offset int, length int, nodePosition int) *TraverseResult {
234 | 	nodePos := uint32(nodePosition)
235 | 	id := nodePos
236 | 	u := daunit(da.array[0])
237 | 
238 | 	for i := offset; i < length; i++ {
239 | 		k := uint32(key[i])
240 | 		id ^= u.offset() ^ k
241 | 		u = daunit(da.array[int(id)])
242 | 		if u.label() != k {
243 | 			return &TraverseResult{
244 | 				-2,
245 | 				i,
246 | 				int(nodePos),
247 | 			}
248 | 		}
249 | 		nodePos = id
250 | 	}
251 | 	if !u.hasLeaf() {
252 | 		return &TraverseResult{
253 | 			-1,
254 | 			length,
255 | 			int(nodePos),
256 | 		}
257 | 	}
258 | 	u = daunit(da.array[int(nodePos^u.offset())])
259 | 	return &TraverseResult{
260 | 		u.value(),
261 | 		length,
262 | 		int(nodePos),
263 | 	}
264 | }
265 | 
// asUInt32Array reinterprets data as a slice of uint32 without copying. The
// result shares memory with data and has len(data)/4 elements; trailing
// bytes that do not fill a whole unit are ignored. It returns nil when data
// holds fewer than 4 bytes, so empty input no longer panics.
//
// The units are read in the byte order of the host, and data is assumed to
// be suitably aligned for uint32 access.
func asUInt32Array(data []byte) []uint32 {
	n := len(data) / 4
	if n == 0 {
		return nil
	}
	// Slice memory layout. The address is kept as an unsafe.Pointer (not a
	// uintptr) so the garbage collector keeps seeing a reference to the
	// backing array while the header is converted.
	hdr := struct {
		addr unsafe.Pointer
		len  int
		cap  int
	}{unsafe.Pointer(&data[0]), n, n}
	return *(*[]uint32)(unsafe.Pointer(&hdr))
}
275 | 
// asByteArray reinterprets data as a slice of bytes without copying. The
// result shares memory with data and has len(data)*4 elements, laid out in
// the byte order of the host. It returns nil for empty input instead of
// panicking.
func asByteArray(data []uint32) []byte {
	if len(data) == 0 {
		return nil
	}
	n := len(data) * 4
	// Slice memory layout, after the snippet in the golang/sys package. The
	// address is kept as an unsafe.Pointer (not a uintptr) so the garbage
	// collector keeps seeing a reference to the backing array while the
	// header is converted.
	hdr := struct {
		addr unsafe.Pointer
		len  int
		cap  int
	}{unsafe.Pointer(&data[0]), n, n}
	return *(*[]byte)(unsafe.Pointer(&hdr))
}
287 | 
// daunit is one 32-bit unit of the double array, with accessors for the
// fields packed into it.
type daunit uint32

// hasLeaf reports whether bit 8 is set.
func (u daunit) hasLeaf() bool {
	return uint32(u)&(1<<8) != 0
}

// value returns the low 31 bits of the unit.
func (u daunit) value() int {
	return int(uint32(u) &^ (1 << 31))
}

// label returns the low 8 bits together with bit 31.
func (u daunit) label() uint32 {
	return uint32(u) & 0x800000FF
}

// offset returns the bits above bit 9, shifted left by 8 when bit 9 is set.
func (u daunit) offset() uint32 {
	base := uint32(u) >> 10
	if uint32(u)&(1<<9) != 0 {
		return base << 8
	}
	return base
}
305 | 


--------------------------------------------------------------------------------
/dartsclone/da_test.go:
--------------------------------------------------------------------------------
  1 | package dartsclone
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"testing"
  6 | )
  7 | 
  8 | func TestAsUInt32Array(t *testing.T) {
  9 | 	ba := []byte{0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00}
 10 | 	ia := asUInt32Array(ba)
 11 | 	if len(ia) != 2 {
 12 | 		t.Errorf("length is %d", len(ia))
 13 | 	}
 14 | 	if ia[0] != 1 {
 15 | 		t.Errorf("unexpected error %v", ia[0])
 16 | 	}
 17 | 	if ia[1] != 2 {
 18 | 		t.Errorf("unexpected error %v", ia[1])
 19 | 	}
 20 | }
 21 | 
 22 | func TestAsByteArray(t *testing.T) {
 23 | 	ia := []uint32{1, 2}
 24 | 	ba := asByteArray(ia)
 25 | 	if len(ba) != 8 {
 26 | 		t.Errorf("length is %d", len(ba))
 27 | 	}
 28 | 	if !bytes.Equal(ba, []byte{0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00}) {
 29 | 		t.Errorf("unexpected error %v", ba)
 30 | 	}
 31 | }
 32 | 
 33 | func TestBuild(t *testing.T) {
 34 | 	keys := [][]byte{
 35 | 		[]byte("電気"),
 36 | 		[]byte("電気通信"),
 37 | 		[]byte("電気通信大学"),
 38 | 		[]byte("電気通信大学大学院"),
 39 | 		[]byte("電気通信大学大学院大学"),
 40 | 	}
 41 | 	values := []int{
 42 | 		0,
 43 | 		1,
 44 | 		2,
 45 | 		3,
 46 | 		4,
 47 | 	}
 48 | 	t.Run("Build", func(t *testing.T) {
 49 | 		trie := NewDoubleArray()
 50 | 		err := trie.Build(keys, values, func(state int, max int) {
 51 | 			return
 52 | 		})
 53 | 		if err != nil {
 54 | 			t.Errorf("unexpected error: %v", err)
 55 | 		}
 56 | 		t.Run("CommonPrefixSearch", func(t *testing.T) {
 57 | 			ret := trie.CommonPrefixSearch([]byte("電気通信大学大学院大学"), 0, 5)
 58 | 			for i := 0; i < len(ret); i++ {
 59 | 				if got, expected := ret[i][0], i; got != expected {
 60 | 					t.Errorf("got %v, expected %v", got, expected)
 61 | 				}
 62 | 				if got, expected := []byte("電気通信大学大学院大学")[0:ret[i][1]], keys[i]; string(got) != string(expected) {
 63 | 					t.Errorf("got %v, expected %v", string(got), string(expected))
 64 | 				}
 65 | 			}
 66 | 		})
 67 | 		t.Run("CommonPrefixSearchItr", func(t *testing.T) {
 68 | 			it := trie.CommonPrefixSearchItr([]byte("電気通信大学大学院大学"), 0)
 69 | 			i := 0
 70 | 			for it.Next() {
 71 | 				if it.Err() != nil {
 72 | 					t.Errorf("unexpected error: %v", err)
 73 | 				}
 74 | 				got1, got2 := it.Get()
 75 | 				if got1 != i {
 76 | 					t.Errorf("got %v, expected %v", got1, i)
 77 | 				}
 78 | 				if string([]byte("電気通信大学大学院大学")[0:got2]) != string(keys[i]) {
 79 | 					t.Errorf("got %v, expected %v", string([]byte("電気通信大学大学院大学")[0:got2]), string(keys[i]))
 80 | 				}
 81 | 				i++
 82 | 			}
 83 | 			if it.Err() != nil {
 84 | 				t.Errorf("unexpected error: %v", err)
 85 | 			}
 86 | 			if i != 5 {
 87 | 				t.Errorf("no match")
 88 | 			}
 89 | 		})
 90 | 		t.Run("CommonPrefixSearchItr offset", func(t *testing.T) {
 91 | 			it := trie.CommonPrefixSearchItr([]byte("あ電気通信大学大学院大学"), 3)
 92 | 			i := 0
 93 | 			for it.Next() {
 94 | 				if it.Err() != nil {
 95 | 					t.Errorf("unexpected error: %v", err)
 96 | 				}
 97 | 				got1, got2 := it.Get()
 98 | 				if got1 != i {
 99 | 					t.Errorf("got %v, expected %v", got1, i)
100 | 				}
101 | 				if string([]byte("あ電気通信大学大学院大学")[3:got2]) != string(keys[i]) {
102 | 					t.Errorf("got %v, expected %v", string([]byte("あ電気通信大学大学院大学")[3:got2]), string(keys[i]))
103 | 				}
104 | 				i++
105 | 			}
106 | 			if it.Err() != nil && i != 5 {
107 | 				t.Errorf("unexpected error: %v", err)
108 | 			}
109 | 			if i != 5 {
110 | 				t.Errorf("no match")
111 | 			}
112 | 		})
113 | 	})
114 | }
115 | 


--------------------------------------------------------------------------------
/dartsclone/dawgbuilder.go:
--------------------------------------------------------------------------------
  1 | package dartsclone
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | )
  6 | 
const (
	// initialTableSize is the number of slots dawgBuilder.table starts with.
	initialTableSize = 1 << 10
	// dawgRoot is 0; presumably the id of the root unit. It is not
	// referenced in this file — TODO confirm against its users.
	dawgRoot         = 0
)
 11 | 
 12 | type node struct {
 13 | 	child      int
 14 | 	sibling    int
 15 | 	label      byte
 16 | 	isState    bool
 17 | 	hasSibling bool
 18 | }
 19 | 
 20 | func (n *node) reset() {
 21 | 	n.child = 0
 22 | 	n.sibling = 0
 23 | 	n.label = 0
 24 | 	n.isState = false
 25 | 	n.hasSibling = false
 26 | }
 27 | 
 28 | func (n *node) unit() uint32 {
 29 | 	var sibling uint32
 30 | 	if n.hasSibling {
 31 | 		sibling = 1
 32 | 	}
 33 | 	if n.label == 0 {
 34 | 		return uint32(n.child)<<1 | sibling
 35 | 	}
 36 | 	var state uint32
 37 | 	if n.isState {
 38 | 		state = 2
 39 | 	}
 40 | 	return uint32(n.child)<<2 | state | sibling
 41 | }
 42 | 
 43 | type unit uint32
 44 | 
 45 | func (u unit) child() uint32 {
 46 | 	return uint32(u) >> 2
 47 | }
 48 | 
 49 | func (u unit) hasSibling() bool {
 50 | 	return (uint32(u) & 1) == 1
 51 | }
 52 | 
 53 | func (u unit) value() uint32 {
 54 | 	return uint32(u) >> 1
 55 | }
 56 | 
 57 | func (u unit) isState() bool {
 58 | 	return (uint32(u) & 2) == 2
 59 | }
 60 | 
 61 | type stack []int
 62 | 
 63 | func (s stack) top() int {
 64 | 	return s[len(s)-1]
 65 | }
 66 | 
 67 | func (s stack) pop() stack {
 68 | 	return s[:len(s)-1]
 69 | }
 70 | 
 71 | type dawgBuilder struct {
 72 | 	nodes           []node
 73 | 	units           []uint32
 74 | 	labels          []byte
 75 | 	isIntersections *bitVector
 76 | 	table           []int
 77 | 	nodeStack       stack
 78 | 	recycleBin      stack
 79 | 	numStates       int
 80 | }
 81 | 
 82 | func newDAWGBuilder() *dawgBuilder {
 83 | 	return &dawgBuilder{
 84 | 		isIntersections: newBitVector(),
 85 | 		table:           make([]int, initialTableSize, initialTableSize),
 86 | 	}
 87 | }
 88 | 
 89 | func (b *dawgBuilder) child(id int) uint32 {
 90 | 	return unit(b.units[id]).child()
 91 | }
 92 | 
 93 | func (b *dawgBuilder) sibling(id int) int {
 94 | 	if unit(b.units[id]).hasSibling() {
 95 | 		return id + 1
 96 | 	}
 97 | 	return 0
 98 | }
 99 | 
100 | func (b *dawgBuilder) value(id int) uint32 {
101 | 	return unit(b.units[id]).value()
102 | }
103 | 
104 | func (b *dawgBuilder) isLeaf(id int) bool {
105 | 	return b.labels[id] == 0
106 | }
107 | 
108 | func (b *dawgBuilder) label(id int) byte {
109 | 	return b.labels[id]
110 | }
111 | 
112 | func (b *dawgBuilder) isIntersection(id int) bool {
113 | 	return b.isIntersections.get(id)
114 | }
115 | 
116 | func (b *dawgBuilder) intersectionId(id int) int {
117 | 	return b.isIntersections.rank(id) - 1
118 | }
119 | 
120 | func (b *dawgBuilder) numIntersections() int {
121 | 	return b.isIntersections.numOnes
122 | }
123 | 
124 | func (b *dawgBuilder) length() int {
125 | 	return len(b.units)
126 | }
127 | 
128 | func (b *dawgBuilder) initialize() {
129 | 	b.appendNode()
130 | 	b.appendUnit()
131 | 
132 | 	b.numStates = 1
133 | 
134 | 	b.nodes[0].label = 0xFF
135 | 	b.nodeStack = append(b.nodeStack, 0)
136 | }
137 | 
138 | func (b *dawgBuilder) finish() {
139 | 	b.flush(0)
140 | 
141 | 	b.units[0] = b.nodes[0].unit()
142 | 	b.labels[0] = b.nodes[0].label
143 | 
144 | 	b.nodes = []node{}
145 | 	b.table = []int{}
146 | 	b.nodeStack = []int{}
147 | 	b.recycleBin = []int{}
148 | 
149 | 	b.isIntersections.build()
150 | }
151 | 
152 | func (b *dawgBuilder) insert(key []byte, value int) error {
153 | 	if value < 0 {
154 | 		return fmt.Errorf("negative value")
155 | 	}
156 | 	keylen := len(key)
157 | 	if keylen == 0 {
158 | 		return fmt.Errorf("zero-length key")
159 | 	}
160 | 
161 | 	var id int
162 | 	var keyPos int
163 | 
164 | 	for ; keyPos <= keylen; keyPos++ {
165 | 		childId := b.nodes[id].child
166 | 		if childId == 0 {
167 | 			break
168 | 		}
169 | 
170 | 		var keyLabel byte
171 | 		if keyPos <= keylen {
172 | 			keyLabel = key[keyPos]
173 | 		}
174 | 		if keyPos < keylen && keyLabel == 0 {
175 | 			return fmt.Errorf("invalid null character")
176 | 		}
177 | 
178 | 		unitLabel := b.nodes[childId].label
179 | 		if keyLabel < unitLabel {
180 | 			return fmt.Errorf("wrong key order")
181 | 		} else if keyLabel > unitLabel {
182 | 			b.nodes[childId].hasSibling = true
183 | 			b.flush(childId)
184 | 			break
185 | 		}
186 | 		id = childId
187 | 	}
188 | 
189 | 	if keyPos > keylen {
190 | 		return nil
191 | 	}
192 | 
193 | 	for ; keyPos <= keylen; keyPos++ {
194 | 		var keyLabel byte
195 | 		if keyPos < keylen {
196 | 			keyLabel = key[keyPos]
197 | 		}
198 | 		childId := b.appendNode()
199 | 
200 | 		if b.nodes[id].child == 0 {
201 | 			b.nodes[childId].isState = true
202 | 		}
203 | 		b.nodes[childId].sibling = b.nodes[id].child
204 | 		b.nodes[childId].label = keyLabel
205 | 		b.nodes[id].child = childId
206 | 		b.nodeStack = append(b.nodeStack, childId)
207 | 
208 | 		id = childId
209 | 	}
210 | 	b.nodes[id].child = value
211 | 
212 | 	return nil
213 | }
214 | 
215 | func (b *dawgBuilder) clear() {
216 | 	b.nodes = []node{}
217 | 	b.units = []uint32{}
218 | 	b.labels = []byte{}
219 | 	b.isIntersections = nil
220 | 	b.table = []int{}
221 | 	b.nodeStack = []int{}
222 | 	b.recycleBin = []int{}
223 | }
224 | 
// flush finalizes every node above id on the node stack and finally pops id
// itself. Each popped node heads a sibling list; the list is either matched
// against an identical group that is already stored as units, or written out
// as a new run of units.
func (b *dawgBuilder) flush(id int) {
	for {
		nodeId := b.nodeStack.top()
		if nodeId == id {
			break
		}
		b.nodeStack = b.nodeStack.pop()

		// Grow the hash table once it is three quarters full.
		if b.numStates >= len(b.table)-len(b.table)/4 {
			b.expandTable()
		}

		// Count the nodes in the sibling list headed by nodeId.
		var numSiblings int
		for i := nodeId; i != 0; i = b.nodes[i].sibling {
			numSiblings++
		}

		matchId, hashId := b.findNode(nodeId)

		if matchId != 0 {
			// An identical group already exists: mark it as shared.
			b.isIntersections.set(matchId, true)
		} else {
			// Reserve one unit per sibling; unitId ends up as the id of the
			// last reserved unit.
			var unitId int
			for i := 0; i < numSiblings; i++ {
				unitId = b.appendUnit()
			}
			// Fill the reserved units from the last one backwards while
			// following the sibling list forwards.
			for i := nodeId; i != 0; i = b.nodes[i].sibling {
				b.units[unitId] = b.nodes[i].unit()
				b.labels[unitId] = b.nodes[i].label
				unitId--
			}
			// unitId now points just before the group's first unit.
			matchId = unitId + 1
			b.table[hashId] = matchId
			b.numStates++
		}

		// Recycle the nodes; read the sibling link before freeing each one.
		var next int
		for i := nodeId; i != 0; i = next {
			next = b.nodes[i].sibling
			b.freeNode(i)
		}

		// The parent, now on top of the stack, refers to the stored group.
		b.nodes[b.nodeStack.top()].child = matchId
	}
	b.nodeStack = b.nodeStack.pop()
}
271 | 
272 | func (b *dawgBuilder) expandTable() {
273 | 	tablesize := len(b.table) * 2
274 | 	b.table = make([]int, tablesize, tablesize)
275 | 	for id := 1; id < len(b.units); id++ {
276 | 		if b.labels[id] == 0 || unit(b.units[id]).isState() {
277 | 			hashId := b.findUnit(id)
278 | 			b.table[hashId] = id
279 | 		}
280 | 	}
281 | }
282 | 
283 | func (b *dawgBuilder) findUnit(id int) int {
284 | 	hashId := b.hashUnit(id) % len(b.table)
285 | 	for ; ; hashId = (hashId + 1) % len(b.table) {
286 | 		unitId := b.table[hashId]
287 | 		if unitId == 0 {
288 | 			break
289 | 		}
290 | 	}
291 | 	return hashId
292 | }
293 | 
294 | func (b *dawgBuilder) findNode(nodeId int) (int, int) {
295 | 	hashId := b.hashNode(nodeId) % len(b.table)
296 | 	for ; ; hashId = (hashId + 1) % len(b.table) {
297 | 		unitId := b.table[hashId]
298 | 		if unitId == 0 {
299 | 			break
300 | 		}
301 | 
302 | 		if b.areEqual(nodeId, unitId) {
303 | 			return unitId, hashId
304 | 		}
305 | 	}
306 | 	return 0, hashId
307 | }
308 | 
309 | func (b *dawgBuilder) areEqual(nodeId int, unitId int) bool {
310 | 	for i := b.nodes[nodeId].sibling; i != 0; i = b.nodes[i].sibling {
311 | 		if !unit(b.units[unitId]).hasSibling() {
312 | 			return false
313 | 		}
314 | 		unitId++
315 | 	}
316 | 	if unit(b.units[unitId]).hasSibling() {
317 | 		return false
318 | 	}
319 | 
320 | 	for i := nodeId; i != 0; i = b.nodes[i].sibling {
321 | 		if b.nodes[i].unit() != b.units[unitId] ||
322 | 			b.nodes[i].label != b.labels[unitId] {
323 | 			return false
324 | 		}
325 | 		unitId--
326 | 	}
327 | 	return true
328 | }
329 | 
330 | func (b *dawgBuilder) hashUnit(id int) int {
331 | 	var hashValue int
332 | 	for ; id != 0; id++ {
333 | 		u := b.units[id]
334 | 		label := b.labels[id]
335 | 		hashValue ^= hash((uint32(label) << 24) ^ u)
336 | 
337 | 		if !unit(u).hasSibling() {
338 | 			break
339 | 		}
340 | 	}
341 | 	return hashValue
342 | }
343 | 
344 | func (b *dawgBuilder) hashNode(id int) int {
345 | 	var hashValue int
346 | 	for ; id != 0; id = b.nodes[id].sibling {
347 | 		u := b.nodes[id].unit()
348 | 		label := b.nodes[id].label
349 | 		hashValue ^= hash((uint32(label) << 24) ^ u)
350 | 	}
351 | 	return hashValue
352 | }
353 | 
354 | func (b *dawgBuilder) appendUnit() int {
355 | 	b.isIntersections.extend()
356 | 	b.units = append(b.units, 0)
357 | 	b.labels = append(b.labels, 0)
358 | 	return b.isIntersections.length - 1
359 | }
360 | 
361 | func (b *dawgBuilder) appendNode() int {
362 | 	var id int
363 | 	if len(b.recycleBin) == 0 {
364 | 		id = len(b.nodes)
365 | 		b.nodes = append(b.nodes, node{})
366 | 	} else {
367 | 		id = b.recycleBin.top()
368 | 		b.nodes[id].reset()
369 | 		b.recycleBin = b.recycleBin.pop()
370 | 	}
371 | 	return id
372 | }
373 | 
374 | func (b *dawgBuilder) freeNode(id int) {
375 | 	b.recycleBin = append(b.recycleBin, id)
376 | }
377 | 
// hash mixes the bits of key with a fixed sequence of shifts, additions and
// one multiplication, all in 32-bit arithmetic, and returns the result
// converted to int.
func hash(key uint32) int {
	key = ^key + key<<15
	key ^= key >> 12
	key += key << 2
	key ^= key >> 4
	key *= 2057
	key ^= key >> 16
	return int(key)
}
387 | 


--------------------------------------------------------------------------------
/data/assets.go:
--------------------------------------------------------------------------------
 1 | // +build dev
 2 | 
 3 | package data
 4 | 
 5 | import (
 6 | 	"net/http"
 7 | )
 8 | 
// Assets serves the files below ./root (resolved against the working
// directory of the process) directly from disk. This definition is compiled
// only when the "dev" build tag is set.
var Assets http.FileSystem = http.Dir("./root")
10 | 


--------------------------------------------------------------------------------
/data/assets_generate.go:
--------------------------------------------------------------------------------
 1 | // +build ignore
 2 | 
 3 | package main
 4 | 
 5 | import (
 6 | 	"log"
 7 | 
 8 | 	"github.com/msnoigrs/gosudachi/data"
 9 | 	"github.com/shurcooL/vfsgen"
10 | )
11 | 
12 | func main() {
13 | 	err := vfsgen.Generate(data.Assets, vfsgen.Options{
14 | 		BuildTags:    "!dev",
15 | 		PackageName:  "data",
16 | 		VariableName: "Assets",
17 | 	})
18 | 
19 | 	if err != nil {
20 | 		log.Fatalln(err)
21 | 	}
22 | }
23 | 


--------------------------------------------------------------------------------
/data/data.go:
--------------------------------------------------------------------------------
1 | // +build !dev
2 | 
3 | package data
4 | 
5 | //go:generate go run -tags=dev assets_generate.go
6 | 


--------------------------------------------------------------------------------
/data/root/char.def:
--------------------------------------------------------------------------------
  1 | #
  2 | #   Japanese character category map
  3 | #
  4 | #   $Id: char.def 9 2012-12-12 04:13:15Z togiso $;
  5 | #
  6 | 
  7 | ###################################################################################
  8 | # 
  9 | #  CHARACTER CATEGORY DEFINITION
 10 | #
 11 | #  CATEGORY_NAME INVOKE GROUP LENGTH
 12 | #
 13 | #   - CATEGORY_NAME: Name of category. you have to define DEFAULT class.
 14 | #   - INVOKE: 1/0:   always invoke unknown word processing, even when the word can be found in the lexicon
 15 | #   - GROUP:  1/0:   make a new word by grouping the same character category
 16 | #   - LENGTH: n:     1 to n length new words are added
 17 | #
 18 | DEFAULT         0 1 0  # DEFAULT is a mandatory category!
 19 | SPACE           0 1 0  
 20 | KANJI           0 0 2
 21 | SYMBOL          1 1 0
 22 | NUMERIC         1 1 0
 23 | ALPHA           1 1 0
 24 | HIRAGANA        0 1 2
 25 | KATAKANA        1 1 2
 26 | KANJINUMERIC    0 1 0  #change INVOKE 1->0
 27 | GREEK           1 1 0
 28 | CYRILLIC        1 1 0
 29 | 
 30 | ###################################################################################
 31 | #
 32 | # CODE(UCS2) TO CATEGORY MAPPING
 33 | #
 34 | 
 35 | # SPACE
 36 | 0x0020 SPACE  # DO NOT REMOVE THIS LINE, 0x0020 is reserved for SPACE
 37 | 0x000D SPACE
 38 | 0x0009 SPACE
 39 | 0x000B SPACE
 40 | 0x000A SPACE
 41 | 
 42 | # ASCII
 43 | 0x0021..0x002F SYMBOL   #!"#$%&'()*+,-./
 44 | 0x0030..0x0039 NUMERIC  #0-9
 45 | 0x003A..0x0040 SYMBOL   #:;<=>?@
 46 | 0x0041..0x005A ALPHA    #A-Z
 47 | 0x005B..0x0060 SYMBOL   #[\]^_`
 48 | 0x0061..0x007A ALPHA    #a-z
 49 | 0x007B..0x007E SYMBOL   #{|}~
 50 | 
 51 | # Latin
 52 | 0x00A1..0x00BF SYMBOL # Latin 1 #¡->¿
 53 | 0x00C0..0x00D6 ALPHA  # Latin 1 #À->Ö
 54 | 0x00D7         SYMBOL # Latin 1 #×
 55 | 0x00D8..0x00F6 ALPHA  # Latin 1 #Ø->ö
 56 | 0x00F7         SYMBOL # Latin 1 #÷
 57 | 0x00F8..0x00FF ALPHA  # Latin 1 #ø->ÿ
 58 | 0x0100..0x017F ALPHA  # Latin Extended A
 59 | 0x0180..0x0236 ALPHA  # Latin Extended B
 60 | 0x1E00..0x1EF9 ALPHA  # Latin Extended Additional
 61 | 
 62 | # CYRILLIC
 63 | 0x0400..0x04F9 CYRILLIC #Ѐ->ӹ
 64 | 0x0500..0x050F CYRILLIC # Cyrillic supplementary
 65 | 
 66 | # GREEK
 67 | 0x0374..0x03FB GREEK # Greek and Coptic　#ʹ->ϻ
 68 | 
 69 | # HIRAGANA
 70 | 0x3041..0x309F  HIRAGANA
 71 | 
 72 | # KATAKANA
 73 | #0x30A1..0x30FF  KATAKANA
 74 | 0x30A1..0x30FA  KATAKANA
 75 | 0x30FC..0x30FF  KATAKANA
 76 | 0x31F0..0x31FF  KATAKANA  # Small KU .. Small RO
 77 | # 0x30FC          KATAKANA HIRAGANA  # ー
 78 | 0x30A1          NOOOVBOW # Small A
 79 | 0x30A3          NOOOVBOW
 80 | 0x30A5          NOOOVBOW
 81 | 0x30A7          NOOOVBOW
 82 | 0x30A9          NOOOVBOW
 83 | 0x30E3          NOOOVBOW
 84 | 0x30E5          NOOOVBOW
 85 | 0x30E7          NOOOVBOW
 86 | 0x30EE          NOOOVBOW
 87 | 0x30FB..0x30FE  NOOOVBOW
 88 | 
 89 | # Half KATAKANA
 90 | 0xFF66..0xFF9D  KATAKANA
 91 | 0xFF9E..0xFF9F  KATAKANA
 92 | 
 93 | # KANJI
 94 | 0x2E80..0x2EF3  KANJI # CJK Radicals Supplement
 95 | 0x2F00..0x2FD5  KANJI
 96 | 0x3005          KANJI NOOOVBOW
 97 | 0x3007          KANJI
 98 | 0x3400..0x4DB5  KANJI # CJK Unified Ideographs Extension
 99 | #0x4E00..0x9FA5  KANJI
100 | 0x4E00..0x9FFF  KANJI
101 | 0xF900..0xFA2D  KANJI
102 | 0xFA30..0xFA6A  KANJI
103 | 
104 | 
105 | # KANJI-NUMERIC (一 二 三 四 五 六 七 八 九 十 百 千 万 億 兆)
106 | 0x4E00 KANJINUMERIC KANJI
107 | 0x4E8C KANJINUMERIC KANJI
108 | 0x4E09 KANJINUMERIC KANJI
109 | 0x56DB KANJINUMERIC KANJI
110 | 0x4E94 KANJINUMERIC KANJI
111 | 0x516D KANJINUMERIC KANJI
112 | 0x4E03 KANJINUMERIC KANJI
113 | 0x516B KANJINUMERIC KANJI
114 | 0x4E5D KANJINUMERIC KANJI
115 | 0x5341 KANJINUMERIC KANJI
116 | 0x767E KANJINUMERIC KANJI
117 | 0x5343 KANJINUMERIC KANJI
118 | 0x4E07 KANJINUMERIC KANJI
119 | 0x5104 KANJINUMERIC KANJI
120 | 0x5146 KANJINUMERIC KANJI
121 | 
122 | # ZENKAKU 
123 | 0xFF10..0xFF19 NUMERIC
124 | 0xFF21..0xFF3A ALPHA
125 | 0xFF41..0xFF5A ALPHA
126 | 0xFF01..0xFF0F SYMBOL   #！->／
127 | 0xFF1A..0xFF20 SYMBOL   #：->＠
128 | 0xFF3B..0xFF40 SYMBOL   #［->｀
129 | 0xFF5B..0xFF65 SYMBOL   #｛->･
130 | 0xFFE0..0xFFEF SYMBOL # HalfWidth and Full width Form
131 | 
132 | # OTHER SYMBOLS
133 | 0x2000..0x206F  SYMBOL # General Punctuation
134 | 0x2070..0x209F  NUMERIC # Superscripts and Subscripts
135 | 0x20A0..0x20CF  SYMBOL # Currency Symbols
136 | 0x20D0..0x20FF  SYMBOL # Combining Diacritical Marks for Symbols
137 | 0x2100..0x214F  SYMBOL # Letterlike Symbols
138 | 0x2150..0x218F  NUMERIC # Number forms
139 | 0x2100..0x214B  SYMBOL # Letterlike Symbols
140 | 0x2190..0x21FF  SYMBOL # Arrow
141 | 0x2200..0x22FF  SYMBOL # Mathematical Operators
142 | 0x2300..0x23FF  SYMBOL # Miscellaneous Technical
143 | 0x2460..0x24FF  SYMBOL # Enclosed NUMERICs
144 | 0x2501..0x257F  SYMBOL # Box Drawing
145 | 0x2580..0x259F  SYMBOL # Block Elements
146 | 0x25A0..0x25FF  SYMBOL # Geometric Shapes
147 | 0x2600..0x26FE  SYMBOL # Miscellaneous Symbols
148 | 0x2700..0x27BF  SYMBOL # Dingbats
149 | 0x27F0..0x27FF  SYMBOL # Supplemental Arrows A
150 | 0x27C0..0x27EF  SYMBOL # Miscellaneous Mathematical Symbols-A
151 | 0x2800..0x28FF  SYMBOL # Braille Patterns
152 | 0x2900..0x297F  SYMBOL # Supplemental Arrows B
153 | 0x2B00..0x2BFF  SYMBOL # Miscellaneous Symbols and Arrows
154 | 0x2A00..0x2AFF  SYMBOL # Supplemental Mathematical Operators
155 | 0x3300..0x33FF  SYMBOL
156 | 0x3200..0x32FE  SYMBOL # Enclosed CJK Letters and Months
157 | 0x3000..0x303F  SYMBOL # CJK Symbol and Punctuation
158 | 0xFE30..0xFE4F  SYMBOL # CJK Compatibility Forms
159 | 0xFE50..0xFE6B  SYMBOL # Small Form Variants
160 | 
161 | # added 2006/3/13 
162 | 0x3007 SYMBOL KANJINUMERIC
163 | 
164 | # added 2018/11/30
165 | 0x309b..0x309c HIRAGANA KATAKANA # voiced/semi-voiced sound marks
166 | 
167 | # END OF TABLE
168 | 


--------------------------------------------------------------------------------
/data/root/rewrite.def:
--------------------------------------------------------------------------------
   1 | # ignore normalize list
   2 | #   ^{char}%n
   3 | Ⅰ
   4 | Ⅱ
   5 | Ⅲ
   6 | Ⅳ
   7 | Ⅴ
   8 | Ⅵ
   9 | Ⅶ
  10 | Ⅷ
  11 | Ⅸ
  12 | Ⅹ
  13 | Ⅺ
  14 | Ⅻ
  15 | Ⅼ
  16 | Ⅽ
  17 | Ⅾ
  18 | Ⅿ
  19 | ⅰ
  20 | ⅱ
  21 | ⅲ
  22 | ⅳ
  23 | ⅴ
  24 | ⅵ
  25 | ⅶ
  26 | ⅷ
  27 | ⅸ
  28 | ⅹ
  29 | ⅺ
  30 | ⅻ
  31 | ⅼ
  32 | ⅽ
  33 | ⅾ
  34 | ⅿ
  35 | ⺀
  36 | ⺁
  37 | ⺂
  38 | ⺃
  39 | ⺄
  40 | ⺅
  41 | ⺆
  42 | ⺇
  43 | ⺈
  44 | ⺉
  45 | ⺊
  46 | ⺋
  47 | ⺌
  48 | ⺍
  49 | ⺎
  50 | ⺏
  51 | ⺐
  52 | ⺑
  53 | ⺒
  54 | ⺓
  55 | ⺔
  56 | ⺕
  57 | ⺖
  58 | ⺗
  59 | ⺘
  60 | ⺙
  61 | ⺛
  62 | ⺜
  63 | ⺝
  64 | ⺞
  65 | ⺟
  66 | ⺠
  67 | ⺡
  68 | ⺢
  69 | ⺣
  70 | ⺤
  71 | ⺥
  72 | ⺦
  73 | ⺧
  74 | ⺨
  75 | ⺩
  76 | ⺪
  77 | ⺫
  78 | ⺬
  79 | ⺭
  80 | ⺮
  81 | ⺯
  82 | ⺰
  83 | ⺱
  84 | ⺲
  85 | ⺳
  86 | ⺴
  87 | ⺵
  88 | ⺶
  89 | ⺷
  90 | ⺸
  91 | ⺹
  92 | ⺺
  93 | ⺻
  94 | ⺼
  95 | ⺽
  96 | ⺾
  97 | ⺿
  98 | ⻀
  99 | ⻁
 100 | ⻂
 101 | ⻃
 102 | ⻄
 103 | ⻅
 104 | ⻆
 105 | ⻇
 106 | ⻈
 107 | ⻉
 108 | ⻊
 109 | ⻋
 110 | ⻌
 111 | ⻍
 112 | ⻎
 113 | ⻏
 114 | ⻐
 115 | ⻑
 116 | ⻒
 117 | ⻓
 118 | ⻔
 119 | ⻕
 120 | ⻖
 121 | ⻗
 122 | ⻘
 123 | ⻙
 124 | ⻚
 125 | ⻛
 126 | ⻜
 127 | ⻝
 128 | ⻞
 129 | ⻟
 130 | ⻠
 131 | ⻡
 132 | ⻢
 133 | ⻣
 134 | ⻤
 135 | ⻥
 136 | ⻦
 137 | ⻧
 138 | ⻨
 139 | ⻩
 140 | ⻪
 141 | ⻫
 142 | ⻬
 143 | ⻭
 144 | ⻮
 145 | ⻯
 146 | ⻰
 147 | ⻱
 148 | ⻲
 149 | ⻳
 150 | ⼀
 151 | ⼁
 152 | ⼂
 153 | ⼃
 154 | ⼄
 155 | ⼅
 156 | ⼆
 157 | ⼇
 158 | ⼈
 159 | ⼉
 160 | ⼊
 161 | ⼋
 162 | ⼌
 163 | ⼍
 164 | ⼎
 165 | ⼏
 166 | ⼐
 167 | ⼑
 168 | ⼒
 169 | ⼓
 170 | ⼔
 171 | ⼕
 172 | ⼖
 173 | ⼗
 174 | ⼘
 175 | ⼙
 176 | ⼚
 177 | ⼛
 178 | ⼜
 179 | ⼝
 180 | ⼞
 181 | ⼟
 182 | ⼠
 183 | ⼡
 184 | ⼢
 185 | ⼣
 186 | ⼤
 187 | ⼥
 188 | ⼦
 189 | ⼧
 190 | ⼨
 191 | ⼩
 192 | ⼪
 193 | ⼫
 194 | ⼬
 195 | ⼭
 196 | ⼮
 197 | ⼯
 198 | ⼰
 199 | ⼱
 200 | ⼲
 201 | ⼳
 202 | ⼴
 203 | ⼵
 204 | ⼶
 205 | ⼷
 206 | ⼸
 207 | ⼹
 208 | ⼺
 209 | ⼻
 210 | ⼼
 211 | ⼽
 212 | ⼾
 213 | ⼿
 214 | ⽀
 215 | ⽁
 216 | ⽂
 217 | ⽃
 218 | ⽄
 219 | ⽅
 220 | ⽆
 221 | ⽇
 222 | ⽈
 223 | ⽉
 224 | ⽊
 225 | ⽋
 226 | ⽌
 227 | ⽍
 228 | ⽎
 229 | ⽏
 230 | ⽐
 231 | ⽑
 232 | ⽒
 233 | ⽓
 234 | ⽔
 235 | ⽕
 236 | ⽖
 237 | ⽗
 238 | ⽘
 239 | ⽙
 240 | ⽚
 241 | ⽛
 242 | ⽜
 243 | ⽝
 244 | ⽞
 245 | ⽟
 246 | ⽠
 247 | ⽡
 248 | ⽢
 249 | ⽣
 250 | ⽤
 251 | ⽥
 252 | ⽦
 253 | ⽧
 254 | ⽨
 255 | ⽩
 256 | ⽪
 257 | ⽫
 258 | ⽬
 259 | ⽭
 260 | ⽮
 261 | ⽯
 262 | ⽰
 263 | ⽱
 264 | ⽲
 265 | ⽳
 266 | ⽴
 267 | ⽵
 268 | ⽶
 269 | ⽷
 270 | ⽸
 271 | ⽹
 272 | ⽺
 273 | ⽻
 274 | ⽼
 275 | ⽽
 276 | ⽾
 277 | ⽿
 278 | ⾀
 279 | ⾁
 280 | ⾂
 281 | ⾃
 282 | ⾄
 283 | ⾅
 284 | ⾆
 285 | ⾇
 286 | ⾈
 287 | ⾉
 288 | ⾊
 289 | ⾋
 290 | ⾌
 291 | ⾍
 292 | ⾎
 293 | ⾏
 294 | ⾐
 295 | ⾑
 296 | ⾒
 297 | ⾓
 298 | ⾔
 299 | ⾕
 300 | ⾖
 301 | ⾗
 302 | ⾘
 303 | ⾙
 304 | ⾚
 305 | ⾛
 306 | ⾜
 307 | ⾝
 308 | ⾞
 309 | ⾟
 310 | ⾠
 311 | ⾡
 312 | ⾢
 313 | ⾣
 314 | ⾤
 315 | ⾥
 316 | ⾦
 317 | ⾧
 318 | ⾨
 319 | ⾩
 320 | ⾪
 321 | ⾫
 322 | ⾬
 323 | ⾭
 324 | ⾮
 325 | ⾯
 326 | ⾰
 327 | ⾱
 328 | ⾲
 329 | ⾳
 330 | ⾴
 331 | ⾵
 332 | ⾶
 333 | ⾷
 334 | ⾸
 335 | ⾹
 336 | ⾺
 337 | ⾻
 338 | ⾼
 339 | ⾽
 340 | ⾾
 341 | ⾿
 342 | ⿀
 343 | ⿁
 344 | ⿂
 345 | ⿃
 346 | ⿄
 347 | ⿅
 348 | ⿆
 349 | ⿇
 350 | ⿈
 351 | ⿉
 352 | ⿊
 353 | ⿋
 354 | ⿌
 355 | ⿍
 356 | ⿎
 357 | ⿏
 358 | ⿐
 359 | ⿑
 360 | ⿒
 361 | ⿓
 362 | ⿔
 363 | ⿕
 364 | 豈
 365 | 更
 366 | 車
 367 | 賈
 368 | 滑
 369 | 串
 370 | 句
 371 | 龜
 372 | 龜
 373 | 契
 374 | 金
 375 | 喇
 376 | 奈
 377 | 懶
 378 | 癩
 379 | 羅
 380 | 蘿
 381 | 螺
 382 | 裸
 383 | 邏
 384 | 樂
 385 | 洛
 386 | 烙
 387 | 珞
 388 | 落
 389 | 酪
 390 | 駱
 391 | 亂
 392 | 卵
 393 | 欄
 394 | 爛
 395 | 蘭
 396 | 鸞
 397 | 嵐
 398 | 濫
 399 | 藍
 400 | 襤
 401 | 拉
 402 | 臘
 403 | 蠟
 404 | 廊
 405 | 朗
 406 | 浪
 407 | 狼
 408 | 郎
 409 | 來
 410 | 冷
 411 | 勞
 412 | 擄
 413 | 櫓
 414 | 爐
 415 | 盧
 416 | 老
 417 | 蘆
 418 | 虜
 419 | 路
 420 | 露
 421 | 魯
 422 | 鷺
 423 | 碌
 424 | 祿
 425 | 綠
 426 | 菉
 427 | 錄
 428 | 鹿
 429 | 論
 430 | 壟
 431 | 弄
 432 | 籠
 433 | 聾
 434 | 牢
 435 | 磊
 436 | 賂
 437 | 雷
 438 | 壘
 439 | 屢
 440 | 樓
 441 | 淚
 442 | 漏
 443 | 累
 444 | 縷
 445 | 陋
 446 | 勒
 447 | 肋
 448 | 凜
 449 | 凌
 450 | 稜
 451 | 綾
 452 | 菱
 453 | 陵
 454 | 讀
 455 | 拏
 456 | 樂
 457 | 諾
 458 | 丹
 459 | 寧
 460 | 怒
 461 | 率
 462 | 異
 463 | 北
 464 | 磻
 465 | 便
 466 | 復
 467 | 不
 468 | 泌
 469 | 數
 470 | 索
 471 | 參
 472 | 塞
 473 | 省
 474 | 葉
 475 | 說
 476 | 殺
 477 | 辰
 478 | 沈
 479 | 拾
 480 | 若
 481 | 掠
 482 | 略
 483 | 亮
 484 | 兩
 485 | 凉
 486 | 梁
 487 | 糧
 488 | 良
 489 | 諒
 490 | 量
 491 | 勵
 492 | 呂
 493 | 女
 494 | 廬
 495 | 旅
 496 | 濾
 497 | 礪
 498 | 閭
 499 | 驪
 500 | 麗
 501 | 黎
 502 | 力
 503 | 曆
 504 | 歷
 505 | 轢
 506 | 年
 507 | 憐
 508 | 戀
 509 | 撚
 510 | 漣
 511 | 煉
 512 | 璉
 513 | 秊
 514 | 練
 515 | 聯
 516 | 輦
 517 | 蓮
 518 | 連
 519 | 鍊
 520 | 列
 521 | 劣
 522 | 咽
 523 | 烈
 524 | 裂
 525 | 說
 526 | 廉
 527 | 念
 528 | 捻
 529 | 殮
 530 | 簾
 531 | 獵
 532 | 令
 533 | 囹
 534 | 寧
 535 | 嶺
 536 | 怜
 537 | 玲
 538 | 瑩
 539 | 羚
 540 | 聆
 541 | 鈴
 542 | 零
 543 | 靈
 544 | 領
 545 | 例
 546 | 禮
 547 | 醴
 548 | 隸
 549 | 惡
 550 | 了
 551 | 僚
 552 | 寮
 553 | 尿
 554 | 料
 555 | 樂
 556 | 燎
 557 | 療
 558 | 蓼
 559 | 遼
 560 | 龍
 561 | 暈
 562 | 阮
 563 | 劉
 564 | 杻
 565 | 柳
 566 | 流
 567 | 溜
 568 | 琉
 569 | 留
 570 | 硫
 571 | 紐
 572 | 類
 573 | 六
 574 | 戮
 575 | 陸
 576 | 倫
 577 | 崙
 578 | 淪
 579 | 輪
 580 | 律
 581 | 慄
 582 | 栗
 583 | 率
 584 | 隆
 585 | 利
 586 | 吏
 587 | 履
 588 | 易
 589 | 李
 590 | 梨
 591 | 泥
 592 | 理
 593 | 痢
 594 | 罹
 595 | 裏
 596 | 裡
 597 | 里
 598 | 離
 599 | 匿
 600 | 溺
 601 | 吝
 602 | 燐
 603 | 璘
 604 | 藺
 605 | 隣
 606 | 鱗
 607 | 麟
 608 | 林
 609 | 淋
 610 | 臨
 611 | 立
 612 | 笠
 613 | 粒
 614 | 狀
 615 | 炙
 616 | 識
 617 | 什
 618 | 茶
 619 | 刺
 620 | 切
 621 | 度
 622 | 拓
 623 | 糖
 624 | 宅
 625 | 洞
 626 | 暴
 627 | 輻
 628 | 行
 629 | 降
 630 | 見
 631 | 廓
 632 | 兀
 633 | 嗀
 634 | 﨎
 635 | 﨏
 636 | 塚
 637 | 﨑
 638 | 晴
 639 | 﨓
 640 | 﨔
 641 | 凞
 642 | 猪
 643 | 益
 644 | 礼
 645 | 神
 646 | 祥
 647 | 福
 648 | 靖
 649 | 精
 650 | 羽
 651 | 﨟
 652 | 蘒
 653 | 﨡
 654 | 諸
 655 | 﨣
 656 | 﨤
 657 | 逸
 658 | 都
 659 | 﨧
 660 | 﨨
 661 | 﨩
 662 | 飯
 663 | 飼
 664 | 館
 665 | 鶴
 666 | 郞
 667 | 隷
 668 | 侮
 669 | 僧
 670 | 免
 671 | 勉
 672 | 勤
 673 | 卑
 674 | 喝
 675 | 嘆
 676 | 器
 677 | 塀
 678 | 墨
 679 | 層
 680 | 屮
 681 | 悔
 682 | 慨
 683 | 憎
 684 | 懲
 685 | 敏
 686 | 既
 687 | 暑
 688 | 梅
 689 | 海
 690 | 渚
 691 | 漢
 692 | 煮
 693 | 爫
 694 | 琢
 695 | 碑
 696 | 社
 697 | 祉
 698 | 祈
 699 | 祐
 700 | 祖
 701 | 祝
 702 | 禍
 703 | 禎
 704 | 穀
 705 | 突
 706 | 節
 707 | 練
 708 | 縉
 709 | 繁
 710 | 署
 711 | 者
 712 | 臭
 713 | 艹
 714 | 艹
 715 | 著
 716 | 褐
 717 | 視
 718 | 謁
 719 | 謹
 720 | 賓
 721 | 贈
 722 | 辶
 723 | 逸
 724 | 難
 725 | 響
 726 | 頻
 727 | 恵
 728 | 𤋮
 729 | 舘
 730 | 並
 731 | 况
 732 | 全
 733 | 侀
 734 | 充
 735 | 冀
 736 | 勇
 737 | 勺
 738 | 喝
 739 | 啕
 740 | 喙
 741 | 嗢
 742 | 塚
 743 | 墳
 744 | 奄
 745 | 奔
 746 | 婢
 747 | 嬨
 748 | 廒
 749 | 廙
 750 | 彩
 751 | 徭
 752 | 惘
 753 | 慎
 754 | 愈
 755 | 憎
 756 | 慠
 757 | 懲
 758 | 戴
 759 | 揄
 760 | 搜
 761 | 摒
 762 | 敖
 763 | 晴
 764 | 朗
 765 | 望
 766 | 杖
 767 | 歹
 768 | 殺
 769 | 流
 770 | 滛
 771 | 滋
 772 | 漢
 773 | 瀞
 774 | 煮
 775 | 瞧
 776 | 爵
 777 | 犯
 778 | 猪
 779 | 瑱
 780 | 甆
 781 | 画
 782 | 瘝
 783 | 瘟
 784 | 益
 785 | 盛
 786 | 直
 787 | 睊
 788 | 着
 789 | 磌
 790 | 窱
 791 | 節
 792 | 类
 793 | 絛
 794 | 練
 795 | 缾
 796 | 者
 797 | 荒
 798 | 華
 799 | 蝹
 800 | 襁
 801 | 覆
 802 | 視
 803 | 調
 804 | 諸
 805 | 請
 806 | 謁
 807 | 諾
 808 | 諭
 809 | 謹
 810 | 變
 811 | 贈
 812 | 輸
 813 | 遲
 814 | 醙
 815 | 鉶
 816 | 陼
 817 | 難
 818 | 靖
 819 | 韛
 820 | 響
 821 | 頋
 822 | 頻
 823 | 鬒
 824 | 龜
 825 | 𢡊
 826 | 𢡄
 827 | 𣏕
 828 | 㮝
 829 | 䀘
 830 | 䀹
 831 | 𥉉
 832 | 𥳐
 833 | 𧻓
 834 | 齃
 835 | 龎
 836 | ゛
 837 | ゜
 838 | 
 839 | # replace char list
 840 | #   ^{before}\s{after}%n
 841 | ｳﾞ	ヴ
 842 | ｶﾞ	ガ
 843 | ｷﾞ	ギ
 844 | ｸﾞ	グ
 845 | ｹﾞ	ゲ
 846 | ｺﾞ	ゴ
 847 | ｻﾞ	ザ
 848 | ｼﾞ	ジ
 849 | ｽﾞ	ズ
 850 | ｾﾞ	ゼ
 851 | ｿﾞ	ゾ
 852 | ﾀﾞ	ダ
 853 | ﾁﾞ	ヂ
 854 | ﾂﾞ	ヅ
 855 | ﾃﾞ	デ
 856 | ﾄﾞ	ド
 857 | ﾊﾞ	バ
 858 | ﾋﾞ	ビ
 859 | ﾌﾞ	ブ
 860 | ﾍﾞ	ベ
 861 | ﾎﾞ	ボ
 862 | ﾊﾟ	パ
 863 | ﾋﾟ	ピ
 864 | ﾌﾟ	プ
 865 | ﾍﾟ	ペ
 866 | ﾎﾟ	ポ
 867 | うﾞ	ゔ
 868 | かﾞ	が
 869 | きﾞ	ぎ
 870 | くﾞ	ぐ
 871 | けﾞ	げ
 872 | こﾞ	ご
 873 | さﾞ	ざ
 874 | しﾞ	じ
 875 | すﾞ	ず
 876 | せﾞ	ぜ
 877 | そﾞ	ぞ
 878 | たﾞ	だ
 879 | ちﾞ	ぢ
 880 | つﾞ	づ
 881 | てﾞ	で
 882 | とﾞ	ど
 883 | はﾞ	ば
 884 | ひﾞ	び
 885 | ふﾞ	ぶ
 886 | へﾞ	べ
 887 | ほﾞ	ぼ
 888 | はﾟ	ぱ
 889 | ひﾟ	ぴ
 890 | ふﾟ	ぷ
 891 | へﾟ	ぺ
 892 | ほﾟ	ぽ
 893 | ウﾞ	ヴ
 894 | カﾞ	ガ
 895 | キﾞ	ギ
 896 | クﾞ	グ
 897 | ケﾞ	ゲ
 898 | コﾞ	ゴ
 899 | サﾞ	ザ
 900 | シﾞ	ジ
 901 | スﾞ	ズ
 902 | セﾞ	ゼ
 903 | ソﾞ	ゾ
 904 | タﾞ	ダ
 905 | チﾞ	ヂ
 906 | ツﾞ	ヅ
 907 | テﾞ	デ
 908 | トﾞ	ド
 909 | ハﾞ	バ
 910 | ヒﾞ	ビ
 911 | フﾞ	ブ
 912 | ヘﾞ	ベ
 913 | ホﾞ	ボ
 914 | ハﾟ	パ
 915 | ヒﾟ	ピ
 916 | フﾟ	プ
 917 | ヘﾟ	ペ
 918 | ホﾟ	ポ
 919 | ゔ	ゔ
 920 | が	が
 921 | ぎ	ぎ
 922 | ぐ	ぐ
 923 | げ	げ
 924 | ご	ご
 925 | ざ	ざ
 926 | じ	じ
 927 | ず	ず
 928 | ぜ	ぜ
 929 | ぞ	ぞ
 930 | だ	だ
 931 | ぢ	ぢ
 932 | づ	づ
 933 | で	で
 934 | ど	ど
 935 | ば	ば
 936 | び	び
 937 | ぶ	ぶ
 938 | べ	べ
 939 | ぼ	ぼ
 940 | ぱ	ぱ
 941 | ぴ	ぴ
 942 | ぷ	ぷ
 943 | ぺ	ぺ
 944 | ぽ	ぽ
 945 | ヴ	ヴ
 946 | ガ	ガ
 947 | ギ	ギ
 948 | グ	グ
 949 | ゲ	ゲ
 950 | ゴ	ゴ
 951 | ザ	ザ
 952 | ジ	ジ
 953 | ズ	ズ
 954 | ゼ	ゼ
 955 | ゾ	ゾ
 956 | ダ	ダ
 957 | ヂ	ヂ
 958 | ヅ	ヅ
 959 | デ	デ
 960 | ド	ド
 961 | バ	バ
 962 | ビ	ビ
 963 | ブ	ブ
 964 | ベ	ベ
 965 | ボ	ボ
 966 | パ	パ
 967 | ピ	ピ
 968 | プ	プ
 969 | ペ	ペ
 970 | ポ	ポ
 971 | う゛	ゔ
 972 | か゛	が
 973 | き゛	ぎ
 974 | く゛	ぐ
 975 | け゛	げ
 976 | こ゛	ご
 977 | さ゛	ざ
 978 | し゛	じ
 979 | す゛	ず
 980 | せ゛	ぜ
 981 | そ゛	ぞ
 982 | た゛	だ
 983 | ち゛	ぢ
 984 | つ゛	づ
 985 | て゛	で
 986 | と゛	ど
 987 | は゛	ば
 988 | ひ゛	び
 989 | ふ゛	ぶ
 990 | へ゛	べ
 991 | ほ゛	ぼ
 992 | は゜	ぱ
 993 | ひ゜	ぴ
 994 | ふ゜	ぷ
 995 | へ゜	ぺ
 996 | ほ゜	ぽ
 997 | ウ゛	ヴ
 998 | カ゛	ガ
 999 | キ゛	ギ
1000 | ク゛	グ
1001 | ケ゛	ゲ
1002 | コ゛	ゴ
1003 | サ゛	ザ
1004 | シ゛	ジ
1005 | ス゛	ズ
1006 | セ゛	ゼ
1007 | ソ゛	ゾ
1008 | タ゛	ダ
1009 | チ゛	ヂ
1010 | ツ゛	ヅ
1011 | テ゛	デ
1012 | ト゛	ド
1013 | ハ゛	バ
1014 | ヒ゛	ビ
1015 | フ゛	ブ
1016 | ヘ゛	ベ
1017 | ホ゛	ボ
1018 | ハ゜	パ
1019 | ヒ゜	ピ
1020 | フ゜	プ
1021 | ヘ゜	ペ
1022 | ホ゜	ポ
1023 | 


--------------------------------------------------------------------------------
/data/root/sudachi.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "systemDict" : "system_core.dic",
 3 |     "inputTextPlugin" : [
 4 |         { "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin" },
 5 |         { "class" : "com.worksap.nlp.sudachi.ProlongedSoundMarkInputTextPlugin",
 6 |           "prolongedSoundMarks": ["ー", "-", "⁓", "〜", "〰"],
 7 |           "replacementSymbol": "ー"}
 8 |     ],
 9 |     "oovProviderPlugin" : [
10 |         { "class" : "com.worksap.nlp.sudachi.MeCabOovProviderPlugin" },
11 |         { "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin",
12 |           "oovPOS" : [ "補助記号", "一般", "*", "*", "*", "*" ],
13 |           "leftId" : 5968,
14 |           "rightId" : 5968,
15 |           "cost" : 3857 }
16 |     ],
17 |     "pathRewritePlugin" : [
18 |         { "class" : "com.worksap.nlp.sudachi.JoinNumericPlugin",
19 |           "joinKanjiNumeric" : true },
20 |         { "class" : "com.worksap.nlp.sudachi.JoinKatakanaOovPlugin",
21 |           "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ],
22 |           "minLength" : 3
23 |         }
24 |     ]
25 | }
26 | 


--------------------------------------------------------------------------------
/data/root/sudachi_fulldict.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "systemDict" : "system_full.dic",
 3 |     "inputTextPlugin" : [
 4 |         { "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin" },
 5 |         { "class" : "com.worksap.nlp.sudachi.ProlongedSoundMarkInputTextPlugin",
 6 |           "prolongedSoundMarks": ["ー", "-", "⁓", "〜", "〰"],
 7 |           "replacementSymbol": "ー"}
 8 |     ],
 9 |     "oovProviderPlugin" : [
10 |         { "class" : "com.worksap.nlp.sudachi.MeCabOovProviderPlugin" },
11 |         { "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin",
12 |           "oovPOS" : [ "補助記号", "一般", "*", "*", "*", "*" ],
13 |           "leftId" : 5968,
14 |           "rightId" : 5968,
15 |           "cost" : 3857 }
16 |     ],
17 |     "pathRewritePlugin" : [
18 |         { "class" : "com.worksap.nlp.sudachi.JoinNumericPlugin",
19 |           "joinKanjiNumeric" : true },
20 |         { "class" : "com.worksap.nlp.sudachi.JoinKatakanaOovPlugin",
21 |           "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ],
22 |           "minLength" : 3
23 |         }
24 |     ]
25 | }
26 | 


--------------------------------------------------------------------------------
/data/root/unk.def:
--------------------------------------------------------------------------------
 1 | DEFAULT,5968,5968,3857,補助記号,一般,*,*,*,*
 2 | SPACE,5966,5966,6056,空白,*,*,*,*,*
 3 | KANJI,5139,5139,14657,名詞,普通名詞,一般,*,*,*
 4 | KANJI,5129,5129,17308,名詞,普通名詞,サ変可能,*,*,*
 5 | KANJI,4785,4785,18181,名詞,固有名詞,一般,*,*,*
 6 | KANJI,4787,4787,18086,名詞,固有名詞,人名,一般,*,*
 7 | KANJI,4791,4791,19198,名詞,固有名詞,地名,一般,*,*
 8 | SYMBOL,5129,5129,17094,名詞,普通名詞,サ変可能,*,*,*
 9 | NUMERIC,4794,4794,12450,名詞,数詞,*,*,*,*
10 | ALPHA,5139,5139,11633,名詞,普通名詞,一般,*,*,*
11 | ALPHA,4785,4785,13620,名詞,固有名詞,一般,*,*,*
12 | ALPHA,4787,4787,14228,名詞,固有名詞,人名,一般,*,*
13 | ALPHA,4791,4791,15793,名詞,固有名詞,地名,一般,*,*
14 | ALPHA,5687,5687,15246,感動詞,一般,*,*,*,*
15 | HIRAGANA,5139,5139,16012,名詞,普通名詞,一般,*,*,*
16 | HIRAGANA,5129,5129,20012,名詞,普通名詞,サ変可能,*,*,*
17 | HIRAGANA,4785,4785,18282,名詞,固有名詞,一般,*,*,*
18 | HIRAGANA,4787,4787,18269,名詞,固有名詞,人名,一般,*,*
19 | HIRAGANA,4791,4791,20474,名詞,固有名詞,地名,一般,*,*
20 | HIRAGANA,5687,5687,17786,感動詞,一般,*,*,*,*
21 | KATAKANA,5139,5139,10980,名詞,普通名詞,一般,*,*,*
22 | KATAKANA,5129,5129,14802,名詞,普通名詞,サ変可能,*,*,*
23 | KATAKANA,4785,4785,13451,名詞,固有名詞,一般,*,*,*
24 | KATAKANA,4787,4787,13759,名詞,固有名詞,人名,一般,*,*
25 | KATAKANA,4791,4791,14554,名詞,固有名詞,地名,一般,*,*
26 | KATAKANA,5687,5687,15272,感動詞,一般,*,*,*,*
27 | KANJINUMERIC,4794,4794,14170,名詞,数詞,*,*,*,*
28 | GREEK,5139,5139,11051,名詞,普通名詞,一般,*,*,*
29 | GREEK,4785,4785,13353,名詞,固有名詞,一般,*,*,*
30 | GREEK,4787,4787,13671,名詞,固有名詞,人名,一般,*,*
31 | GREEK,4791,4791,14862,名詞,固有名詞,地名,一般,*,*
32 | CYRILLIC,5139,5139,11140,名詞,普通名詞,一般,*,*,*
33 | CYRILLIC,4785,4785,13174,名詞,固有名詞,一般,*,*,*
34 | CYRILLIC,4787,4787,13495,名詞,固有名詞,人名,一般,*,*
35 | CYRILLIC,4791,4791,14700,名詞,固有名詞,地名,一般,*,*
36 | 


--------------------------------------------------------------------------------
/definputtextplugin.go:
--------------------------------------------------------------------------------
  1 | package gosudachi
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"io"
  6 | 	"os"
  7 | 	"strings"
  8 | 	"unicode"
  9 | 	"unicode/utf8"
 10 | 
 11 | 	"github.com/msnoigrs/gosudachi/data"
 12 | 	"github.com/msnoigrs/gosudachi/internal/lnreader"
 13 | 	"golang.org/x/text/unicode/norm"
 14 | )
 15 | 
// DefaultInputTextPluginConfig holds the settings of DefaultInputTextPlugin.
//
// RewriteDef is the path of the rewrite definition file. When it is empty,
// the plugin reads the "rewrite.def" bundled in data.Assets instead.
type DefaultInputTextPluginConfig struct {
	RewriteDef string
}
 19 | 
// DefaultInputTextPlugin rewrites the text held by an InputTextBuilder using
// a replacement table, lower-casing and NFKC normalization.
//
// Fields:
//   - config: configuration handed out by GetConfigStruct; SetUp sets it to nil.
//   - rewriteDef: path of the rewrite definition file ("" selects the bundled one).
//   - ignoreNormalizeMap: runes that must not be NFKC-normalized.
//   - keyLengths: for each first rune of a replacement key, the length in
//     runes of the longest key starting with that rune.
//   - replaceCharMap: replacement key -> replacement runes.
type DefaultInputTextPlugin struct {
	config             *DefaultInputTextPluginConfig
	rewriteDef         string
	ignoreNormalizeMap map[rune]bool
	keyLengths         map[rune]int
	replaceCharMap     map[string][]rune
}
 27 | 
 28 | func NewDefaultInputTextPlugin(config *DefaultInputTextPluginConfig) *DefaultInputTextPlugin {
 29 | 	if config == nil {
 30 | 		config = &DefaultInputTextPluginConfig{}
 31 | 	}
 32 | 	return &DefaultInputTextPlugin{
 33 | 		config:             config,
 34 | 		ignoreNormalizeMap: map[rune]bool{},
 35 | 		keyLengths:         map[rune]int{},
 36 | 		replaceCharMap:     map[string][]rune{},
 37 | 	}
 38 | }
 39 | 
 40 | func (p *DefaultInputTextPlugin) GetConfigStruct() interface{} {
 41 | 	if p.config == nil {
 42 | 		p.config = &DefaultInputTextPluginConfig{}
 43 | 	}
 44 | 	return p.config
 45 | }
 46 | 
 47 | func (p *DefaultInputTextPlugin) SetUp() error {
 48 | 	if p.rewriteDef == "" {
 49 | 		p.rewriteDef = p.config.RewriteDef
 50 | 	}
 51 | 	p.config = nil
 52 | 	if p.ignoreNormalizeMap == nil {
 53 | 		p.ignoreNormalizeMap = map[rune]bool{}
 54 | 	}
 55 | 	if p.keyLengths == nil {
 56 | 		p.keyLengths = map[rune]int{}
 57 | 	}
 58 | 	if p.replaceCharMap == nil {
 59 | 		p.replaceCharMap = map[string][]rune{}
 60 | 	}
 61 | 	err := p.readRewriteLists(p.rewriteDef)
 62 | 	if err != nil {
 63 | 		return fmt.Errorf("DefaultInputTextPlugin: %s", err)
 64 | 	}
 65 | 	return nil
 66 | }
 67 | 
 68 | func (p *DefaultInputTextPlugin) getKeyLength(key rune, def int) int {
 69 | 	l, ok := p.keyLengths[key]
 70 | 	if !ok {
 71 | 		return def
 72 | 	}
 73 | 	return l
 74 | }
 75 | 
// Rewrite applies the replacement table and normalization to the text held
// by builder, one position at a time:
//
//  1. If a key of replaceCharMap starts at the current position, the longest
//     such key is replaced by its mapped runes, without further normalization.
//  2. Otherwise the rune is lower-cased and, unless the lower-cased rune is
//     listed in ignoreNormalizeMap, NFKC-normalized.
//
// It always returns nil.
func (p *DefaultInputTextPlugin) Rewrite(builder *InputTextBuilder) error {
	// runes is read once up front; i always indexes into this slice.
	runes := builder.GetText()
	runelen := len(runes)

	// Scratch buffer for the UTF-8 encoding of a single rune.
	utf8buf := make([]byte, 8, 8)

	// offset is the accumulated length difference caused by the replacements
	// made so far; it is added to i for every position passed to
	// builder.Replace. nextOffset is the difference produced by the current
	// iteration and is folded into offset at the start of the next one.
	offset := 0
	nextOffset := 0
TEXTLOOP:
	for i := 0; i < runelen; i++ {
		offset += nextOffset
		nextOffset = 0
		// 1. replace char without normalize
		// Try the longest candidate key first, never reading past the end
		// of the text.
		for l := minInt(p.getKeyLength(runes[i], 0), runelen-i); l > 0; l-- {
			replace, ok := p.replaceCharMap[string(runes[i:i+l])]
			if ok {
				builder.Replace(i+offset, i+l+offset, replace)
				nextOffset += len(replace) - l
				// Skip the runes consumed by the key; the loop's i++
				// accounts for the last one.
				i += l - 1
				continue TEXTLOOP
			}
		}

		// 2. normalize
		original := runes[i]

		// 2-1. capital alphabet (not only latin but greek, cyrillic, etc) -> small
		lower := unicode.ToLower(original)
		var replace []rune
		_, ok := p.ignoreNormalizeMap[lower]
		if ok {
			// Listed runes are only lower-cased, never NFKC-normalized.
			if original == lower {
				continue
			}
			replace = []rune{lower}
		} else {
			// 2-2. normalize (except in ignoreNormalize)
			//    e.g. full-width alphabet -> half-width / ligature / etc.
			size := utf8.EncodeRune(utf8buf, lower)
			replace = []rune(string(norm.NFKC.Bytes(utf8buf[:size])))
		}
		// Normalization may yield a different number of runes than the one
		// it replaces.
		nextOffset = len(replace) - 1
		// Only touch the builder when the rune actually changed.
		if len(replace) != 1 || original != replace[0] {
			builder.Replace(i+offset, i+1+offset, replace)
		}
	}
	return nil
}
124 | 
// minInt returns the smaller of a and b.
func minInt(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
131 | 
132 | func (p *DefaultInputTextPlugin) readRewriteLists(rewriteDef string) error {
133 | 	var rewriteDefReader io.Reader
134 | 	if rewriteDef != "" {
135 | 		rewriteDefFd, err := os.OpenFile(rewriteDef, os.O_RDONLY, 0644)
136 | 		if err != nil {
137 | 			return fmt.Errorf("DefaultInputTextPlugin: %s: %s", err, rewriteDef)
138 | 		}
139 | 		defer rewriteDefFd.Close()
140 | 		rewriteDefReader = rewriteDefFd
141 | 	} else {
142 | 		rewiteDefF, err := data.Assets.Open("rewrite.def")
143 | 		if err != nil {
144 | 			return fmt.Errorf("DefaultInputTextPlugin: %s: (data.Assets)rewrite.def", err)
145 | 		}
146 | 		defer rewiteDefF.Close()
147 | 		rewriteDefReader = rewiteDefF
148 | 	}
149 | 
150 | 	r := lnreader.NewLineNumberReader(rewriteDefReader)
151 | 	for {
152 | 		line, err := r.ReadLine()
153 | 		if err == io.EOF {
154 | 			break
155 | 		}
156 | 		if err != nil {
157 | 			return fmt.Errorf("DefaultInputTextPlugin: %s", err)
158 | 		}
159 | 		if lnreader.IsSkipLine(line) {
160 | 			continue
161 | 		}
162 | 		cols := strings.Fields(string(line))
163 | 		if len(cols) == 1 {
164 | 			// ignored normalize list
165 | 			key := []rune(cols[0])
166 | 			if len(key) != 1 {
167 | 				return fmt.Errorf("DefaultInputTextPlugin: %s is already defined at line %d", cols[0], r.NumLine)
168 | 			}
169 | 			p.ignoreNormalizeMap[key[0]] = true
170 | 		} else if len(cols) == 2 {
171 | 			// replace char list
172 | 			_, ok := p.replaceCharMap[cols[0]]
173 | 			if ok {
174 | 				return fmt.Errorf("DefaultInputTextPlugin: %s is already defined at line %d", cols[0], r.NumLine)
175 | 			}
176 | 			key := []rune(cols[0])
177 | 			if p.getKeyLength(key[0], -1) < len(key) {
178 | 				// store the longest key length
179 | 				p.keyLengths[key[0]] = len(key)
180 | 			}
181 | 			p.replaceCharMap[cols[0]] = []rune(cols[1])
182 | 		} else {
183 | 			return fmt.Errorf("DefaultInputTextPlugin: invalid format at line %d", r.NumLine)
184 | 		}
185 | 	}
186 | 	return nil
187 | }
188 | 


--------------------------------------------------------------------------------
/dicbuilder/main.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"flag"
  6 | 	"fmt"
  7 | 	"os"
  8 | 	"time"
  9 | 
 10 | 	"github.com/msnoigrs/gosudachi/dictionary"
 11 | 	"golang.org/x/text/language"
 12 | 	"golang.org/x/text/message"
 13 | )
 14 | 
// main builds a dictionary file from a connection matrix file and one or more
// lexicon source files.
//
// Command line: -o output file (required), -m connection matrix file
// (required), -d description, -j use UTF-16 strings, followed by at least one
// lexicon source file. On any failure a message is printed to stderr and the
// process exits with status 1.
func main() {
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, `Usage of %s:
	%s -o file -m file [-d description] [-j] file1 [file2 ...]

Options:
`, os.Args[0], os.Args[0])
		flag.PrintDefaults()
	}

	var (
		outputpath  string
		matrixpath  string
		description string
		utf16string bool
	)
	flag.StringVar(&outputpath, "o", "", "output to file")
	flag.StringVar(&matrixpath, "m", "", "connection matrix file")
	flag.StringVar(&description, "d", "", "comment")
	flag.BoolVar(&utf16string, "j", false, "use UTF-16 string")

	flag.Parse()

	// The output file, the matrix file and at least one lexicon file are
	// mandatory.
	if outputpath == "" || matrixpath == "" || len(flag.Args()) == 0 {
		flag.Usage()
		os.Exit(1)
	}

	// The header records the system dictionary version, the current time and
	// the description given with -d.
	dh := dictionary.NewDictionaryHeader(
		dictionary.SystemDictVersion,
		time.Now().Unix(),
		description,
	)

	hb, err := dh.ToBytes()
	if err != nil {
		fmt.Fprintf(os.Stderr, "%s\n", err)
		os.Exit(1)
	}

	outputWriter, err := os.OpenFile(outputpath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
	if err != nil {
		fmt.Fprintf(os.Stderr, "%s: %s\n", outputpath, err)
		os.Exit(1)
	}
	// Note: os.Exit does not run deferred calls, so this Close only runs on
	// the normal return path.
	defer outputWriter.Close()

	// Write the header through a buffered writer and flush it right away:
	// the later sections are written to outputWriter directly.
	bufout := bufio.NewWriter(outputWriter)
	n, err := bufout.Write(hb)
	if err != nil {
		fmt.Fprintf(os.Stderr, "fail to write header: %s\n", err)
		os.Exit(1)
	}
	err = bufout.Flush()
	if err != nil {
		fmt.Fprintf(os.Stderr, "fail to write header: %s\n", err)
		os.Exit(1)
	}

	matrixReader, err := os.OpenFile(matrixpath, os.O_RDONLY, 0644)
	if err != nil {
		fmt.Fprintf(os.Stderr, "%s: %s\n", matrixpath, err)
		os.Exit(1)
	}
	defer matrixReader.Close()

	// The builder is given the number of header bytes already written.
	// NOTE(review): presumably used as the starting offset of the data that
	// follows the header; confirm in dictionary.NewDictionaryBuilder.
	dicbuilder := dictionary.NewDictionaryBuilder(int64(n), nil, utf16string)
	store := dictionary.NewPosTable()

	// Feed every lexicon source file into the same builder and POS table.
	fmt.Fprint(os.Stderr, "reading the source file...")
	for _, lexiconpath := range flag.Args() {
		err := build(dicbuilder, store, lexiconpath)
		if err != nil {
			fmt.Fprintf(os.Stderr, "%s: %s\n", lexiconpath, err)
			os.Exit(1)
		}
	}
	// message.Printer formats the count for the English locale.
	p := message.NewPrinter(language.English)
	p.Fprintf(os.Stderr, " %d words\n", dicbuilder.EntrySize())

	// The grammar section is written first, then the lexicon.
	err = dicbuilder.WriteGrammar(store, matrixReader, outputWriter)
	if err != nil {
		fmt.Fprintf(os.Stderr, "fail to write grammar: %s\n", err)
		os.Exit(1)
	}

	err = dicbuilder.WriteLexicon(outputWriter, store)
	if err != nil {
		fmt.Fprintf(os.Stderr, "fail to write lexicon: %s\n", err)
		os.Exit(1)
	}
}
107 | 
108 | func build(dicbuilder *dictionary.DictionaryBuilder, store dictionary.PosIdStore, lexiconpath string) error {
109 | 	lexiconReader, err := os.OpenFile(lexiconpath, os.O_RDONLY, 0644)
110 | 	if err != nil {
111 | 		return err
112 | 	}
113 | 	defer lexiconReader.Close()
114 | 
115 | 	err = dicbuilder.BuildLexicon(store, lexiconReader)
116 | 	if err != nil {
117 | 		return err
118 | 	}
119 | 	return nil
120 | }
121 | 


--------------------------------------------------------------------------------
/dicconv/main.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"bytes"
  6 | 	"flag"
  7 | 	"fmt"
  8 | 	"io"
  9 | 	"os"
 10 | 	"path/filepath"
 11 | 
 12 | 	"github.com/msnoigrs/gosudachi/dictionary"
 13 | 	"golang.org/x/text/language"
 14 | 	"golang.org/x/text/message"
 15 | )
 16 | 
 17 | func main() {
 18 | 	flag.Usage = func() {
 19 | 		fmt.Fprintf(os.Stderr, `Usage of %s:
 20 | 	%s [-o file] [-j] file
 21 | 
 22 | Options:
 23 | `, os.Args[0], os.Args[0])
 24 | 		flag.PrintDefaults()
 25 | 	}
 26 | 
 27 | 	var (
 28 | 		outputfile  string
 29 | 		utf16string bool
 30 | 	)
 31 | 	flag.StringVar(&outputfile, "o", "", "output to file")
 32 | 	flag.BoolVar(&utf16string, "j", false, "from UTF-8 to UTF-16")
 33 | 
 34 | 	flag.Parse()
 35 | 
 36 | 	if len(flag.Args()) == 0 {
 37 | 		flag.Usage()
 38 | 		os.Exit(1)
 39 | 	}
 40 | 
 41 | 	if outputfile == "" {
 42 | 		if utf16string {
 43 | 			outputfile = "out_utf16.dic"
 44 | 		} else {
 45 | 			outputfile = "out_utf8.dic"
 46 | 		}
 47 | 	}
 48 | 	if !filepath.IsAbs(outputfile) {
 49 | 		var err error
 50 | 		outputfile, err = filepath.Abs(outputfile)
 51 | 		if err != nil {
 52 | 			fmt.Fprintln(os.Stderr, err)
 53 | 			os.Exit(1)
 54 | 		}
 55 | 	}
 56 | 	outputfd, err := os.OpenFile(outputfile, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
 57 | 	if err != nil {
 58 | 		fmt.Fprintf(os.Stderr, "%s: %s\n", outputfile, err)
 59 | 		os.Exit(1)
 60 | 	}
 61 | 	defer outputfd.Close()
 62 | 	bufiooutput := bufio.NewWriter(outputfd)
 63 | 
 64 | 	args := flag.Args()
 65 | 	fromdic, err := dictionary.NewBinaryDictionary(args[0], !utf16string)
 66 | 	if err != nil {
 67 | 		fmt.Fprintln(os.Stderr, err)
 68 | 	}
 69 | 	defer fromdic.Close()
 70 | 
 71 | 	hb, err := fromdic.Header.ToBytes()
 72 | 	if err != nil {
 73 | 		fmt.Fprintln(os.Stderr, err)
 74 | 		os.Exit(1)
 75 | 	}
 76 | 
 77 | 	var offset int64
 78 | 	n, err := bufiooutput.Write(hb)
 79 | 	if err != nil {
 80 | 		fmt.Fprintf(os.Stderr, "fail to write header: %s\n", err)
 81 | 		os.Exit(1)
 82 | 	}
 83 | 	offset = int64(n)
 84 | 
 85 | 	var n64 int64
 86 | 	p := message.NewPrinter(language.English)
 87 | 	if fromdic.Grammar != nil {
 88 | 		fmt.Fprint(os.Stderr, "writting the POS table...")
 89 | 		buffer := bytes.NewBuffer([]byte{})
 90 | 		err = fromdic.Grammar.WritePOSTableTo(buffer, utf16string)
 91 | 		if err != nil {
 92 | 			fmt.Fprintln(os.Stderr, err)
 93 | 			os.Exit(1)
 94 | 		}
 95 | 		n64, err = buffer.WriteTo(bufiooutput)
 96 | 		if err != nil {
 97 | 			fmt.Fprintln(os.Stderr, err)
 98 | 			os.Exit(1)
 99 | 		}
100 | 		p.Fprintf(os.Stderr, " %d bytes\n", n64)
101 | 		buffer.Reset()
102 | 		offset += n64
103 | 
104 | 		fmt.Fprint(os.Stderr, "writting the connection matrix...")
105 | 		n, err = fromdic.Grammar.WriteConnMatrixTo(bufiooutput)
106 | 		if err != nil {
107 | 			fmt.Fprintln(os.Stderr, err)
108 | 			os.Exit(1)
109 | 		}
110 | 		p.Fprintf(os.Stderr, " %d bytes\n", n)
111 | 		offset += int64(n)
112 | 	}
113 | 
114 | 	fmt.Fprint(os.Stderr, "writting the trie...")
115 | 	n, err = fromdic.Lexicon.WriteTrieTo(bufiooutput)
116 | 	if err != nil {
117 | 		fmt.Fprintln(os.Stderr, err)
118 | 		os.Exit(1)
119 | 	}
120 | 	p.Fprintf(os.Stderr, " %d bytes\n", n)
121 | 	offset += int64(n)
122 | 
123 | 	fmt.Fprint(os.Stderr, "writting the word-ID table...")
124 | 	n, err = fromdic.Lexicon.WriteWordIdTableTo(bufiooutput)
125 | 	if err != nil {
126 | 		fmt.Fprintln(os.Stderr, err)
127 | 		os.Exit(1)
128 | 	}
129 | 	p.Fprintf(os.Stderr, " %d bytes\n", n)
130 | 	offset += int64(n)
131 | 
132 | 	fmt.Fprint(os.Stderr, "writting the word parameters...")
133 | 	n, err = fromdic.Lexicon.WriteWordParamsTo(bufiooutput)
134 | 	if err != nil {
135 | 		fmt.Fprintln(os.Stderr, err)
136 | 		os.Exit(1)
137 | 	}
138 | 	p.Fprintf(os.Stderr, " %d bytes\n", n)
139 | 	offset += int64(n)
140 | 
141 | 	err = bufiooutput.Flush()
142 | 	if err != nil {
143 | 		fmt.Fprintln(os.Stderr, err)
144 | 		os.Exit(1)
145 | 	}
146 | 
147 | 	fmt.Fprint(os.Stderr, "writting the wordInfos...")
148 | 	offsetlen := int64(4 * fromdic.Lexicon.Size())
149 | 	_, err = outputfd.Seek(offsetlen, io.SeekCurrent)
150 | 	if err != nil {
151 | 		fmt.Fprintln(os.Stderr, err)
152 | 		os.Exit(1)
153 | 	}
154 | 	bufiooutput = bufio.NewWriter(outputfd)
155 | 
156 | 	n, offsets, err := fromdic.Lexicon.WriteWordInfos(bufiooutput, offset, offsetlen, utf16string)
157 | 	if err != nil {
158 | 		fmt.Fprintln(os.Stderr, err)
159 | 		os.Exit(1)
160 | 	}
161 | 	p.Fprintf(os.Stderr, " %d bytes\n", n)
162 | 
163 | 	err = bufiooutput.Flush()
164 | 	if err != nil {
165 | 		fmt.Fprintln(os.Stderr, err)
166 | 		os.Exit(1)
167 | 	}
168 | 
169 | 	fmt.Fprint(os.Stderr, "writting wordInfo offsets...")
170 | 	_, err = outputfd.Seek(offset, io.SeekStart)
171 | 	if err != nil {
172 | 		fmt.Fprintln(os.Stderr, err)
173 | 		os.Exit(1)
174 | 	}
175 | 	bufiooutput = bufio.NewWriter(outputfd)
176 | 
177 | 	n64, err = offsets.WriteTo(bufiooutput)
178 | 	if err != nil {
179 | 		fmt.Fprintln(os.Stderr, err)
180 | 		os.Exit(1)
181 | 	}
182 | 	p.Fprintf(os.Stderr, " %d bytes\n", n64)
183 | 
184 | 	err = bufiooutput.Flush()
185 | 	if err != nil {
186 | 		fmt.Fprintln(os.Stderr, err)
187 | 		os.Exit(1)
188 | 	}
189 | }
190 | 


--------------------------------------------------------------------------------
/dictionary.go:
--------------------------------------------------------------------------------
  1 | package gosudachi
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"io"
  6 | 	"os"
  7 | 
  8 | 	"github.com/msnoigrs/gosudachi/data"
  9 | 	"github.com/msnoigrs/gosudachi/dictionary"
 10 | )
 11 | 
const (
	// UserDictCostParMorph is multiplied by the number of morphemes a
	// user-dictionary headword tokenizes into and added to the tokenizer's
	// internal cost when that word's cost is computed.
	UserDictCostParMorph = -20
)

// maxcost and mincost are the largest and smallest values an int16 can hold
// (32767 and -32768); computed word costs are clamped to this range.
const maxcost = int(int16(^uint16(0) >> 1))
const mincost = int(-maxcost - 1)
 18 | 
// JapaneseDictionary holds the grammar and lexicon loaded from the system and
// user dictionaries together with the plugins passed to each tokenizer it
// creates.
type JapaneseDictionary struct {
	grammar            *dictionary.Grammar
	lexicon            *dictionary.LexiconSet
	inputTextPlugins   []InputTextPlugin
	oovProviderPlugins []OovProviderPlugin
	pathRewritePlugins []PathRewritePlugin
	// dictionaries lists every binary dictionary opened so far; Close
	// releases each of them.
	dictionaries []*dictionary.BinaryDictionary
}
 27 | 
 28 | func NewJapaneseDictionary(config *BaseConfig, inputTextPlugins []InputTextPlugin, oovProviderPlugins []OovProviderPlugin, pathRewritePlugins []PathRewritePlugin, editConnectionCostPlugins []EditConnectionCostPlugin) (*JapaneseDictionary, error) {
 29 | 	if len(oovProviderPlugins) == 0 {
 30 | 		return nil, fmt.Errorf("no OOV provider")
 31 | 	}
 32 | 
 33 | 	d := &JapaneseDictionary{
 34 | 		inputTextPlugins:   inputTextPlugins,
 35 | 		oovProviderPlugins: oovProviderPlugins,
 36 | 		pathRewritePlugins: pathRewritePlugins,
 37 | 	}
 38 | 
 39 | 	err := d.ReadSystemDictionary(config.SystemDict, config.Utf16String)
 40 | 	if err != nil {
 41 | 		return nil, fmt.Errorf("fail to read a system dictionary: %s", err)
 42 | 	}
 43 | 
 44 | 	for _, plugin := range editConnectionCostPlugins {
 45 | 		err := plugin.SetUp(d.grammar)
 46 | 		if err != nil {
 47 | 			return nil, err
 48 | 		}
 49 | 		err = plugin.Edit(d.grammar)
 50 | 		if err != nil {
 51 | 			return nil, err
 52 | 		}
 53 | 	}
 54 | 
 55 | 	err = d.ReadCharacterDefinition(config.CharacterDefinitionFile)
 56 | 	if err != nil {
 57 | 		return nil, fmt.Errorf("fail to read a character defition file: %s", err)
 58 | 	}
 59 | 
 60 | 	for _, plugin := range inputTextPlugins {
 61 | 		err := plugin.SetUp()
 62 | 		if err != nil {
 63 | 			return nil, err
 64 | 		}
 65 | 	}
 66 | 	for _, plugin := range oovProviderPlugins {
 67 | 		err := plugin.SetUp(d.grammar)
 68 | 		if err != nil {
 69 | 			return nil, err
 70 | 		}
 71 | 	}
 72 | 	for _, plugin := range pathRewritePlugins {
 73 | 		err := plugin.SetUp(d.grammar)
 74 | 		if err != nil {
 75 | 			return nil, err
 76 | 		}
 77 | 	}
 78 | 
 79 | 	for _, ud := range config.UserDict {
 80 | 		err := d.ReadUserDictionary(ud, config.Utf16String)
 81 | 		if err != nil {
 82 | 			return nil, fmt.Errorf("fail to read a user dictionary: %s", err)
 83 | 		}
 84 | 	}
 85 | 	return d, nil
 86 | }
 87 | 
 88 | func (d *JapaneseDictionary) ReadSystemDictionary(filename string, utf16string bool) error {
 89 | 	dict, err := dictionary.ReadSystemDictionary(filename, utf16string)
 90 | 	if err != nil {
 91 | 		return err
 92 | 	}
 93 | 
 94 | 	d.dictionaries = append(d.dictionaries, dict)
 95 | 	d.grammar = dict.Grammar
 96 | 	d.lexicon = dictionary.NewLexiconSet(dict.Lexicon)
 97 | 	return nil
 98 | }
 99 | 
100 | func (d *JapaneseDictionary) ReadUserDictionary(filename string, utf16string bool) error {
101 | 	if d.lexicon.IsFull() {
102 | 		return fmt.Errorf("too many dictionaries")
103 | 	}
104 | 
105 | 	dict, err := dictionary.ReadUserDictionary(filename, utf16string)
106 | 	if err != nil {
107 | 		return err
108 | 	}
109 | 
110 | 	d.dictionaries = append(d.dictionaries, dict)
111 | 
112 | 	userLexicon := dict.Lexicon
113 | 	tokenizer := NewJapaneseTokenizer(
114 | 		d.grammar,
115 | 		d.lexicon,
116 | 		d.inputTextPlugins,
117 | 		d.oovProviderPlugins,
118 | 		[]PathRewritePlugin{},
119 | 	)
120 | 	userLexicon.CalculateCost(func(text string) (int16, error) {
121 | 		ms, err := tokenizer.Tokenize("C", text)
122 | 		if err != nil {
123 | 			return int16(mincost), err
124 | 		}
125 | 		cost := ms.GetInternalCost() + UserDictCostParMorph*ms.Length()
126 | 		if cost > maxcost {
127 | 			cost = maxcost
128 | 		} else if cost < mincost {
129 | 			cost = mincost
130 | 		}
131 | 		return int16(cost), nil
132 | 	})
133 | 	d.lexicon.Add(userLexicon, int32(d.grammar.GetPartOfSpeechSize()))
134 | 	d.grammar.AddPosList(dict.Grammar)
135 | 	return nil
136 | }
137 | 
138 | func (d *JapaneseDictionary) ReadCharacterDefinition(charDef string) error {
139 | 	var charDefReader io.Reader
140 | 	if charDef != "" {
141 | 		charDefFd, err := os.OpenFile(charDef, os.O_RDONLY, 0644)
142 | 		if err != nil {
143 | 			return fmt.Errorf("%s: %s", err, charDef)
144 | 		}
145 | 		defer charDefFd.Close()
146 | 		charDefReader = charDefFd
147 | 	} else {
148 | 		charDefF, err := data.Assets.Open("char.def")
149 | 		if err != nil {
150 | 			return fmt.Errorf("%s: (data.Assets)char.def", err)
151 | 		}
152 | 		defer charDefF.Close()
153 | 		charDefReader = charDefF
154 | 	}
155 | 
156 | 	cat := dictionary.NewCharacterCategory()
157 | 	err := cat.ReadCharacterDefinition(charDefReader)
158 | 	if err != nil {
159 | 		return err
160 | 	}
161 | 	d.grammar.CharCategory = cat
162 | 	return nil
163 | }
164 | 
165 | func (d *JapaneseDictionary) Close() {
166 | 	d.grammar = nil
167 | 	d.lexicon = nil
168 | 	for _, dict := range d.dictionaries {
169 | 		dict.Close()
170 | 	}
171 | 	d.dictionaries = d.dictionaries[:0]
172 | }
173 | 
174 | func (d *JapaneseDictionary) Create() *JapaneseTokenizer {
175 | 	return NewJapaneseTokenizer(
176 | 		d.grammar,
177 | 		d.lexicon,
178 | 		d.inputTextPlugins,
179 | 		d.oovProviderPlugins,
180 | 		d.pathRewritePlugins,
181 | 	)
182 | }
183 | 
184 | func (d *JapaneseDictionary) GetPartOfSpeechSize() int {
185 | 	return d.grammar.GetPartOfSpeechSize()
186 | }
187 | 
188 | func (d *JapaneseDictionary) GetPartOfSpeechString(posId int16) []string {
189 | 	return d.grammar.GetPartOfSpeechString(posId)
190 | }
191 | 


--------------------------------------------------------------------------------
/dictionary/binarydict.go:
--------------------------------------------------------------------------------
 1 | package dictionary
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 	"github.com/msnoigrs/gosudachi/internal/mmap"
 6 | 	"os"
 7 | )
 8 | 
// BinaryDictionary is a dictionary file that has been opened and mapped into
// memory, together with the structures parsed from it.
type BinaryDictionary struct {
	fd   *os.File // the open dictionary file
	fmap []byte   // memory mapping of the whole file
	// Header is parsed from the start of the file.
	Header *DictionaryHeader
	// Grammar is only parsed for SystemDictVersion and UserDictVersion2
	// files; it stays nil for UserDictVersion files.
	Grammar *Grammar
	// Lexicon is parsed from the bytes that follow the header (and the
	// grammar, when present).
	Lexicon *DoubleArrayLexicon
}
16 | 
17 | func NewBinaryDictionary(filename string, utf16string bool) (*BinaryDictionary, error) {
18 | 	fd, err := os.OpenFile(filename, os.O_RDONLY, 0644)
19 | 	if err != nil {
20 | 		return nil, err
21 | 	}
22 | 
23 | 	finfo, err := fd.Stat()
24 | 	if err != nil {
25 | 		_ = fd.Close()
26 | 		return nil, err
27 | 	}
28 | 	fmap, err := mmap.Mmap(fd, false, 0, finfo.Size())
29 | 	if err != nil {
30 | 		_ = fd.Close()
31 | 		return nil, err
32 | 	}
33 | 
34 | 	offset := 0
35 | 	header := ParseDictionaryHeader(fmap, offset)
36 | 	if header == nil {
37 | 		return nil, fmt.Errorf("invalid header: %s", filename)
38 | 	}
39 | 
40 | 	offset += HeaderStorageSize
41 | 	var grammar *Grammar
42 | 	if header.Version == SystemDictVersion || header.Version == UserDictVersion2 {
43 | 		grammar = NewGrammar(fmap, offset, utf16string)
44 | 		offset += grammar.StorageSize
45 | 	} else if header.Version != UserDictVersion {
46 | 		_ = mmap.Munmap(fmap)
47 | 		_ = fd.Close()
48 | 		return nil, fmt.Errorf("invalid dictionary: %s", filename)
49 | 	}
50 | 
51 | 	lexicon := NewDoubleArrayLexicon(fmap, offset, utf16string)
52 | 
53 | 	return &BinaryDictionary{
54 | 		fd,
55 | 		fmap,
56 | 		header,
57 | 		grammar,
58 | 		lexicon,
59 | 	}, nil
60 | }
61 | 
62 | func ReadSystemDictionary(filename string, utf16string bool) (*BinaryDictionary, error) {
63 | 	dict, err := NewBinaryDictionary(filename, utf16string)
64 | 	if err != nil {
65 | 		return nil, err
66 | 	}
67 | 	if dict.Header.Version != SystemDictVersion {
68 | 		_ = dict.Close()
69 | 		return nil, fmt.Errorf("invalid systemd dictionary: %s", filename)
70 | 	}
71 | 	return dict, nil
72 | }
73 | 
74 | func ReadUserDictionary(filename string, utf16string bool) (*BinaryDictionary, error) {
75 | 	dict, err := NewBinaryDictionary(filename, utf16string)
76 | 	if err != nil {
77 | 		return nil, err
78 | 	}
79 | 	if !IsUserDictionary(dict.Header.Version) {
80 | 		_ = dict.Close()
81 | 		return nil, fmt.Errorf("invalid user dictionary: %s", filename)
82 | 	}
83 | 	return dict, nil
84 | }
85 | 
86 | func (bd *BinaryDictionary) Close() error {
87 | 	err := mmap.Munmap(bd.fmap)
88 | 	if err != nil {
89 | 		return err
90 | 	}
91 | 	return bd.fd.Close()
92 | }
93 | 


--------------------------------------------------------------------------------
/dictionary/bytes.go:
--------------------------------------------------------------------------------
 1 | package dictionary
 2 | 
 3 | import (
 4 | 	"bytes"
 5 | 	"encoding/binary"
 6 | 	"unicode/utf16"
 7 | )
 8 | 
 9 | func bufferToInt16(bytebuffer []byte, offset int) (int, int16) {
10 | 	var ret int16
11 | 	offsetend := offset + 2
12 | 	_ = binary.Read(bytes.NewBuffer(bytebuffer[offset:offsetend]), binary.LittleEndian, &ret)
13 | 	return offsetend, ret
14 | }
15 | 
// bufferToUint16 decodes the little-endian uint16 stored at offset in
// bytebuffer. It returns the offset just past the value and the value itself.
//
// The bytes are decoded in place; no reader is allocated per call.
func bufferToUint16(bytebuffer []byte, offset int) (int, uint16) {
	offsetend := offset + 2
	return offsetend, binary.LittleEndian.Uint16(bytebuffer[offset:offsetend])
}
22 | 
// bufferToInt32 decodes the little-endian int32 stored at offset in
// bytebuffer. It returns the offset just past the value and the value itself.
//
// The bytes are decoded in place; no reader is allocated per call.
func bufferToInt32(bytebuffer []byte, offset int) (int, int32) {
	offsetend := offset + 4
	return offsetend, int32(binary.LittleEndian.Uint32(bytebuffer[offset:offsetend]))
}
29 | 
// bufferToUint32 decodes the little-endian uint32 stored at offset in
// bytebuffer. It returns the offset just past the value and the value itself.
//
// The bytes are decoded in place; no reader is allocated per call.
func bufferToUint32(bytebuffer []byte, offset int) (int, uint32) {
	offsetend := offset + 4
	return offsetend, binary.LittleEndian.Uint32(bytebuffer[offset:offsetend])
}
36 | 
// bufferToInt64 decodes the little-endian int64 stored at offset in
// bytebuffer. It returns the offset just past the value and the value itself.
//
// The bytes are decoded in place; no reader is allocated per call.
func bufferToInt64(bytebuffer []byte, offset int) (int, int64) {
	offsetend := offset + 8
	return offsetend, int64(binary.LittleEndian.Uint64(bytebuffer[offset:offsetend]))
}
43 | 
// bufferToUint64 decodes the little-endian uint64 stored at offset in
// bytebuffer. It returns the offset just past the value and the value itself.
//
// The bytes are decoded in place; no reader is allocated per call.
func bufferToUint64(bytebuffer []byte, offset int) (int, uint64) {
	offsetend := offset + 8
	return offsetend, binary.LittleEndian.Uint64(bytebuffer[offset:offsetend])
}
50 | 
// bufferToStringLength decodes a length stored in one or two bytes at offset.
//
// When the high bit of the first byte is clear, that byte is the length.
// Otherwise the low seven bits of the first byte are the high part and the
// following byte is the low part of a 15-bit length. It returns the offset
// just past the encoded length and the length itself.
func bufferToStringLength(bytebuffer []byte, offset int) (int, int) {
	first := int(bytebuffer[offset])
	if first&0x80 == 0 {
		return offset + 1, first
	}
	second := int(bytebuffer[offset+1])
	return offset + 2, (first&0x7F)<<8 | second
}
60 | 
// bufferToStringFunc decodes a string stored at offset in bytebuffer and
// returns the offset just past it together with the decoded string.
// bufferToString and bufferToStringUtf16 are the two implementations.
type bufferToStringFunc func(bytebuffer []byte, offset int) (int, string)
62 | 
63 | func bufferToString(bytebuffer []byte, offset int) (int, string) {
64 | 	offset, length := bufferToStringLength(bytebuffer, offset)
65 | 	offsetend := offset + int(length)
66 | 	return offsetend, string(bytebuffer[offset:offsetend])
67 | }
68 | 
69 | func bufferToStringUtf16(bytebuffer []byte, offset int) (int, string) {
70 | 	// java compatible
71 | 	offset, length := bufferToStringLength(bytebuffer, offset)
72 | 	javainternal := make([]uint16, length, length)
73 | 	for i := 0; i < length; i++ {
74 | 		s := offset + 2*i
75 | 		_ = binary.Read(bytes.NewBuffer(bytebuffer[s:s+2]), binary.LittleEndian, &javainternal[i])
76 | 	}
77 | 	return offset + length*2, string(utf16.Decode(javainternal))
78 | }
79 | 
// bufferToInt32Array decodes an array stored at offset as a one-byte element
// count followed by that many little-endian int32 values. It returns the
// offset just past the array and the decoded values.
//
// Elements are decoded in place; no reader is allocated per element.
func bufferToInt32Array(bytebuffer []byte, offset int) (int, []int32) {
	length := int(bytebuffer[offset])
	offset++
	array := make([]int32, length)
	for i := range array {
		s := offset + 4*i
		array[i] = int32(binary.LittleEndian.Uint32(bytebuffer[s : s+4]))
	}
	return offset + 4*length, array
}
90 | 


--------------------------------------------------------------------------------
/dictionary/charcategory.go:
--------------------------------------------------------------------------------
  1 | package dictionary
  2 | 
  3 | import (
  4 | 	"encoding/hex"
  5 | 	"fmt"
  6 | 	"io"
  7 | 	"strings"
  8 | 	"unicode/utf8"
  9 | 
 10 | 	"github.com/msnoigrs/gosudachi/internal/lnreader"
 11 | )
 12 | 
// Categories of characters. Each category is a distinct bit so that several
// categories can be combined into one uint32 mask.
const (
	DEFAULT      uint32 = 1       // The fall back category
	SPACE        uint32 = 1 << 1  // WhiteSpaces
	KANJI        uint32 = 1 << 2  // CJKV ideographic characters
	SYMBOL       uint32 = 1 << 3  // Symbols
	NUMERIC      uint32 = 1 << 4  // Numerical characters
	ALPHA        uint32 = 1 << 5  // Latin alphabets
	HIRAGANA     uint32 = 1 << 6  // Hiragana characters
	KATAKANA     uint32 = 1 << 7  // Katakana characters
	KANJINUMERIC uint32 = 1 << 8  // Kanji numeric characters
	GREEK        uint32 = 1 << 9  // Greek alphabets
	CYRILLIC     uint32 = 1 << 10 // Cyrillic alphabets
	USER1        uint32 = 1 << 11 // User defined category
	USER2        uint32 = 1 << 12 // User defined category
	USER3        uint32 = 1 << 13 // User defined category
	USER4        uint32 = 1 << 14 // User defined category
	NOOOVBOW     uint32 = 1 << 15 // Characters that cannot be the beginning of word
)
 32 | 
 33 | func GetCategoryType(s string) (uint32, error) {
 34 | 	switch s {
 35 | 	case "DEFAULT":
 36 | 		return DEFAULT, nil
 37 | 	case "SPACE":
 38 | 		return SPACE, nil
 39 | 	case "KANJI":
 40 | 		return KANJI, nil
 41 | 	case "SYMBOL":
 42 | 		return SYMBOL, nil
 43 | 	case "NUMERIC":
 44 | 		return NUMERIC, nil
 45 | 	case "ALPHA":
 46 | 		return ALPHA, nil
 47 | 	case "HIRAGANA":
 48 | 		return HIRAGANA, nil
 49 | 	case "KATAKANA":
 50 | 		return KATAKANA, nil
 51 | 	case "KANJINUMERIC":
 52 | 		return KANJINUMERIC, nil
 53 | 	case "GREEK":
 54 | 		return GREEK, nil
 55 | 	case "CYRILLIC":
 56 | 		return CYRILLIC, nil
 57 | 	case "USER1":
 58 | 		return USER1, nil
 59 | 	case "USER2":
 60 | 		return USER2, nil
 61 | 	case "USER3":
 62 | 		return USER3, nil
 63 | 	case "USER4":
 64 | 		return USER4, nil
 65 | 	case "NOOOVBOW":
 66 | 		return NOOOVBOW, nil
 67 | 	}
 68 | 	return 0, fmt.Errorf("%s is invalid type", s)
 69 | }
 70 | 
 71 | type categoryRange struct {
 72 | 	low        int32
 73 | 	high       int32
 74 | 	categories uint32
 75 | }
 76 | 
 77 | func (r *categoryRange) contains(cp rune) bool {
 78 | 	if int32(cp) >= r.low && int32(cp) <= r.high {
 79 | 		return true
 80 | 	}
 81 | 	return false
 82 | }
 83 | 
 84 | func (r *categoryRange) containingLength(text string) int {
 85 | 	for i, c := range text {
 86 | 		if int32(c) < r.low || int32(c) > r.high {
 87 | 			return i
 88 | 		}
 89 | 	}
 90 | 	return utf8.RuneCountInString(text)
 91 | }
 92 | 
// CharacterCategory maps code points to character categories by means of a
// list of code point ranges, filled in by ReadCharacterDefinition.
type CharacterCategory struct {
	rangeList []*categoryRange
}
 96 | 
 97 | func NewCharacterCategory() *CharacterCategory {
 98 | 	return &CharacterCategory{}
 99 | }
100 | 
101 | func (cc *CharacterCategory) GetCategoryTypes(codePoint rune) uint32 {
102 | 	var categories uint32
103 | 	for _, cr := range cc.rangeList {
104 | 		if cr.contains(codePoint) {
105 | 			categories |= cr.categories
106 | 		}
107 | 	}
108 | 
109 | 	if categories == 0 {
110 | 		categories = DEFAULT
111 | 	}
112 | 	return categories
113 | }
114 | 
115 | func (cc *CharacterCategory) ReadCharacterDefinition(charDefReader io.Reader) error {
116 | 	r := lnreader.NewLineNumberReader(charDefReader)
117 | 	for {
118 | 		line, err := r.ReadLine()
119 | 		if err == io.EOF {
120 | 			break
121 | 		}
122 | 		if err != nil {
123 | 			return err
124 | 		}
125 | 		if lnreader.IsSkipLine(line) {
126 | 			continue
127 | 		}
128 | 		cols := strings.Fields(string(line))
129 | 		if len(cols) < 2 {
130 | 			return fmt.Errorf("invalid format at line %d: too short fields", r.NumLine)
131 | 		}
132 | 		if !strings.HasPrefix(cols[0], "0x") {
133 | 			continue
134 | 		}
135 | 
136 | 		catrange := new(categoryRange)
137 | 		rs := strings.Split(cols[0], "..")
138 | 		low, err := decodeHexStrToInt32(rs[0])
139 | 		if err != nil {
140 | 			return fmt.Errorf("invalid format at line %d: %s", r.NumLine, err)
141 | 		}
142 | 		catrange.low = low
143 | 		if len(rs) > 1 {
144 | 			high, err := decodeHexStrToInt32(rs[1])
145 | 			if err != nil {
146 | 				return fmt.Errorf("invalid format at line %d: %s", r.NumLine, err)
147 | 			}
148 | 			catrange.high = high
149 | 		} else {
150 | 			catrange.high = catrange.low
151 | 		}
152 | 		if catrange.low > catrange.high {
153 | 			return fmt.Errorf("invalid format at line %d: low > high", r.NumLine)
154 | 		}
155 | 		for i := 1; i < len(cols); i++ {
156 | 			if strings.HasPrefix(cols[i], "#") {
157 | 				break
158 | 			}
159 | 			t, err := GetCategoryType(cols[i])
160 | 			if err != nil {
161 | 				return fmt.Errorf("%s at line %d: %s", err, r.NumLine, err)
162 | 			}
163 | 			catrange.categories |= t
164 | 		}
165 | 		cc.rangeList = append(cc.rangeList, catrange)
166 | 	}
167 | 
168 | 	return nil
169 | }
170 | 
// decodeHexStrToInt32 converts a hex literal such as "0x3041" to an int32.
//
// The first two characters are dropped without being inspected. The rest must
// be an even number of hex digits describing one to four bytes, which are
// combined most significant byte first.
func decodeHexStrToInt32(s string) (int32, error) {
	if len(s) < 3 {
		return 0, fmt.Errorf("invalid hex string: too short")
	}
	raw, err := hex.DecodeString(s[2:])
	if err != nil {
		return 0, err
	}
	if len(raw) > 4 {
		return 0, fmt.Errorf("invalid hex string: too long")
	}
	var value int32
	for _, b := range raw {
		value = value<<8 | int32(b)
	}
	return value, nil
}
197 | 


--------------------------------------------------------------------------------
/dictionary/dalexicon.go:
--------------------------------------------------------------------------------
  1 | package dictionary
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"encoding/binary"
  6 | 	"io"
  7 | 
  8 | 	"github.com/msnoigrs/gosudachi/dartsclone"
  9 | )
 10 | 
const (
	// wordParameterListElementSize is the size in bytes of one word
	// parameter entry: three 2-byte values (left id, right id and cost).
	wordParameterListElementSize = 2 * 3
)
 14 | 
// wordIdTable gives access to the word id arrays of a lexicon.
type wordIdTable struct {
	bytebuffer []byte // the table contents, without the leading size field
	size       int32  // length of the table contents in bytes
	//offset     int
}
 20 | 
 21 | func newWordIdTable(bytebuffer []byte, offset int) *wordIdTable {
 22 | 	_, size := bufferToInt32(bytebuffer, offset)
 23 | 	return &wordIdTable{
 24 | 		bytebuffer: bytebuffer[offset+4 : offset+4+int(size)],
 25 | 		size:       size,
 26 | 		//offset:     offset + 4,
 27 | 	}
 28 | }
 29 | 
 30 | func (t *wordIdTable) storageSize() int {
 31 | 	return 4 + int(t.size)
 32 | }
 33 | 
 34 | func (t *wordIdTable) get(index int) []int32 {
 35 | 	_, result := bufferToInt32Array(t.bytebuffer, index)
 36 | 	return result
 37 | }
 38 | 
// wordParameterList gives access to the left id, right id and cost of each
// word. It reads the dictionary buffer directly until a cost is modified, at
// which point the entries are copied into a private buffer.
type wordParameterList struct {
	bytebuffer []byte // dictionary buffer, or the private copy once isCopied is set
	size       int32  // number of entries
	offset     int    // position of the first entry in bytebuffer
	isCopied   bool   // whether bytebuffer is the private copy
}
 45 | 
 46 | func newWordParameterList(bytebuffer []byte, offset int) *wordParameterList {
 47 | 	offset, size := bufferToInt32(bytebuffer, offset)
 48 | 	return &wordParameterList{
 49 | 		bytebuffer: bytebuffer,
 50 | 		size:       size,
 51 | 		offset:     offset,
 52 | 		isCopied:   false,
 53 | 	}
 54 | }
 55 | 
 56 | func (l *wordParameterList) storageSize() int {
 57 | 	return 4 + wordParameterListElementSize*int(l.size)
 58 | }
 59 | 
 60 | func (l *wordParameterList) getLeftId(wordId int32) int16 {
 61 | 	_, ret := bufferToInt16(l.bytebuffer, l.offset+wordParameterListElementSize*int(wordId))
 62 | 	return ret
 63 | }
 64 | 
 65 | func (l *wordParameterList) getRightId(wordId int32) int16 {
 66 | 	_, ret := bufferToInt16(l.bytebuffer, l.offset+wordParameterListElementSize*int(wordId)+2)
 67 | 	return ret
 68 | }
 69 | 
 70 | func (l *wordParameterList) getCost(wordId int32) int16 {
 71 | 	_, ret := bufferToInt16(l.bytebuffer, l.offset+wordParameterListElementSize*int(wordId)+4)
 72 | 	return ret
 73 | }
 74 | 
 75 | func (l *wordParameterList) setCost(wordId int32, cost int16) {
 76 | 	if !l.isCopied {
 77 | 		l.copyBuffer()
 78 | 	}
 79 | 
 80 | 	s := l.offset + wordParameterListElementSize*int(wordId) + 4
 81 | 	binary.LittleEndian.PutUint16(l.bytebuffer[s:], uint16(cost))
 82 | }
 83 | 
 84 | // syncronized ???
 85 | func (l *wordParameterList) copyBuffer() {
 86 | 	nl := int(wordParameterListElementSize) * int(l.size)
 87 | 	newBuffer := make([]byte, nl, nl)
 88 | 	s := l.offset
 89 | 	copy(newBuffer, l.bytebuffer[s:s+nl])
 90 | 	l.bytebuffer = newBuffer
 91 | 	l.offset = 0
 92 | 	l.isCopied = true
 93 | }
 94 | 
// wordInfoList gives access to the word infos stored in a dictionary buffer.
type wordInfoList struct {
	bytebuffer      []byte             // the dictionary buffer
	offset          int                // position of the table of 4-byte word info offsets
	wordSize        int32              // number of words
	bufferToStringF bufferToStringFunc // decoder used for every stored string
}
101 | 
102 | func newWordInfoList(bytebuffer []byte, offset int, wordSize int32, bufferToStringF bufferToStringFunc) *wordInfoList {
103 | 	return &wordInfoList{
104 | 		bytebuffer:      bytebuffer,
105 | 		offset:          offset,
106 | 		wordSize:        wordSize,
107 | 		bufferToStringF: bufferToStringF,
108 | 	}
109 | }
110 | 
111 | func (l *wordInfoList) getWordInfo(wordId int32) *WordInfo {
112 | 	index := l.wordIdToOffset(wordId)
113 | 
114 | 	index, surface := l.bufferToStringF(l.bytebuffer, index)
115 | 	index, headwordLength := bufferToStringLength(l.bytebuffer, index)
116 | 	index, posId := bufferToInt16(l.bytebuffer, index)
117 | 	index, normalizedForm := l.bufferToStringF(l.bytebuffer, index)
118 | 	if normalizedForm == "" {
119 | 		normalizedForm = surface
120 | 	}
121 | 	index, dictionaryFormWordId := bufferToInt32(l.bytebuffer, index)
122 | 	index, readingForm := l.bufferToStringF(l.bytebuffer, index)
123 | 	if readingForm == "" {
124 | 		readingForm = surface
125 | 	}
126 | 	index, aUnitSplit := bufferToInt32Array(l.bytebuffer, index)
127 | 	index, bUnitSplit := bufferToInt32Array(l.bytebuffer, index)
128 | 	index, wordStructure := bufferToInt32Array(l.bytebuffer, index)
129 | 
130 | 	dictionaryForm := surface
131 | 	if dictionaryFormWordId >= 0 && dictionaryFormWordId != wordId {
132 | 		wi := l.getWordInfo(dictionaryFormWordId)
133 | 		dictionaryForm = wi.Surface
134 | 	}
135 | 
136 | 	return &WordInfo{
137 | 		Surface:              surface,
138 | 		HeadwordLength:       int16(headwordLength),
139 | 		PosId:                posId,
140 | 		NormalizedForm:       normalizedForm,
141 | 		DictionaryFormWordId: dictionaryFormWordId,
142 | 		DictionaryForm:       dictionaryForm,
143 | 		ReadingForm:          readingForm,
144 | 		AUnitSplit:           aUnitSplit,
145 | 		BUnitSplit:           bUnitSplit,
146 | 		WordStructure:        wordStructure,
147 | 	}
148 | }
149 | 
150 | func (l *wordInfoList) wordIdToOffset(wordId int32) int {
151 | 	s := l.offset + 4*int(wordId)
152 | 	_, ret := bufferToInt32(l.bytebuffer, s)
153 | 	return int(ret)
154 | }
155 | 
// DoubleArrayLexicon is a lexicon read from a dictionary buffer. Headwords
// are looked up in a double-array trie; the values found there index the word
// id table, and word ids in turn index the word parameters and word infos.
type DoubleArrayLexicon struct {
	wordIdT    *wordIdTable
	wordParams *wordParameterList
	wordInfos  *wordInfoList
	trie       *dartsclone.DoubleArray
}
162 | 
163 | func NewDoubleArrayLexicon(bytebuffer []byte, offset int, utf16string bool) *DoubleArrayLexicon {
164 | 	var size uint32
165 | 	trie := dartsclone.NewDoubleArray()
166 | 	offset, size = bufferToUint32(bytebuffer, offset)
167 | 	trie.SetBuffer(bytebuffer[offset : offset+int(size)*4])
168 | 	offset += trie.TotalSize()
169 | 
170 | 	wordIdT := newWordIdTable(bytebuffer, offset)
171 | 	offset += wordIdT.storageSize()
172 | 
173 | 	wordParams := newWordParameterList(bytebuffer, offset)
174 | 	offset += wordParams.storageSize()
175 | 
176 | 	var wordInfos *wordInfoList
177 | 	if utf16string {
178 | 		wordInfos = newWordInfoList(bytebuffer, offset, wordParams.size, bufferToStringUtf16)
179 | 	} else {
180 | 		wordInfos = newWordInfoList(bytebuffer, offset, wordParams.size, bufferToString)
181 | 	}
182 | 
183 | 	return &DoubleArrayLexicon{
184 | 		wordIdT:    wordIdT,
185 | 		wordParams: wordParams,
186 | 		wordInfos:  wordInfos,
187 | 		trie:       trie,
188 | 	}
189 | }
190 | 
191 | func (lexicon *DoubleArrayLexicon) Lookup(text []byte, offset int) *DoubleArrayLexiconIterator {
192 | 	it := lexicon.trie.CommonPrefixSearchItr(text, offset)
193 | 	return newDoubleArrayLexiconIterator(it, lexicon.wordIdT)
194 | }
195 | 
196 | func (lexicon *DoubleArrayLexicon) GetWordId(headword string, posId int16, readingForm string) int32 {
197 | 	var wid int32
198 | 	for ; wid < lexicon.wordInfos.wordSize; wid++ {
199 | 		wi := lexicon.wordInfos.getWordInfo(wid)
200 | 		if wi.Surface == headword &&
201 | 			wi.PosId == posId &&
202 | 			wi.ReadingForm == readingForm {
203 | 			return wid
204 | 		}
205 | 	}
206 | 	return -1
207 | }
208 | 
209 | func (lexicon *DoubleArrayLexicon) GetLeftId(wordId int32) int16 {
210 | 	return lexicon.wordParams.getLeftId(wordId)
211 | }
212 | 
213 | func (lexicon *DoubleArrayLexicon) GetRightId(wordId int32) int16 {
214 | 	return lexicon.wordParams.getRightId(wordId)
215 | }
216 | 
217 | func (lexicon *DoubleArrayLexicon) GetCost(wordId int32) int16 {
218 | 	return lexicon.wordParams.getCost(wordId)
219 | }
220 | 
221 | func (lexicon *DoubleArrayLexicon) GetWordInfo(wordId int32) *WordInfo {
222 | 	return lexicon.wordInfos.getWordInfo(wordId)
223 | }
224 | 
225 | func (lexicon *DoubleArrayLexicon) GetDictionaryId(wordId int32) int {
226 | 	return 0
227 | }
228 | 
229 | func (lexicon *DoubleArrayLexicon) Size() int32 {
230 | 	return lexicon.wordParams.size
231 | }
232 | 
// maxint16 and minint16 are the largest and smallest int16 values. A stored
// cost equal to minint16 marks a word whose cost CalculateCost has to compute.
const maxint16 = int16(^uint16(0) >> 1)
const minint16 = -maxint16 - 1
235 | 
// CalculateCostFunc computes the cost to store for a word from its surface
// text. CalculateCost stops at the first error it returns.
type CalculateCostFunc func(text string) (int16, error)
237 | 
238 | func (lexicon *DoubleArrayLexicon) CalculateCost(cf CalculateCostFunc) error {
239 | 	var wordId int32
240 | 	for ; wordId < lexicon.wordParams.size; wordId++ {
241 | 		if lexicon.wordParams.getCost(wordId) != minint16 {
242 | 			continue
243 | 		}
244 | 		wi := lexicon.wordInfos.getWordInfo(wordId)
245 | 		cost, err := cf(wi.Surface)
246 | 		if err != nil {
247 | 			return err
248 | 		}
249 | 		lexicon.wordParams.setCost(wordId, cost)
250 | 	}
251 | 	return nil
252 | }
253 | 
254 | func (lexicon *DoubleArrayLexicon) WriteTrieTo(writer io.Writer) (int, error) {
255 | 	err := binary.Write(writer, binary.LittleEndian, uint32(lexicon.trie.Length()))
256 | 	if err != nil {
257 | 		return 0, err
258 | 	}
259 | 	n, err := writer.Write(lexicon.trie.ByteArray())
260 | 	if err != nil {
261 | 		return 4, err
262 | 	}
263 | 	return n + 4, nil
264 | }
265 | 
266 | func (lexicon *DoubleArrayLexicon) WriteWordIdTableTo(writer io.Writer) (int, error) {
267 | 	err := binary.Write(writer, binary.LittleEndian, uint32(lexicon.wordIdT.size))
268 | 	if err != nil {
269 | 		return 0, err
270 | 	}
271 | 	n, err := writer.Write(lexicon.wordIdT.bytebuffer)
272 | 	if err != nil {
273 | 		return 4, err
274 | 	}
275 | 	return n + 4, nil
276 | }
277 | 
278 | func (lexicon *DoubleArrayLexicon) WriteWordParamsTo(writer io.Writer) (int, error) {
279 | 	size := lexicon.wordParams.size
280 | 	err := binary.Write(writer, binary.LittleEndian, uint32(size))
281 | 	if err != nil {
282 | 		return 0, err
283 | 	}
284 | 	n, err := writer.Write(lexicon.wordParams.bytebuffer[lexicon.wordParams.offset : lexicon.wordParams.offset+wordParameterListElementSize*int(size)])
285 | 	if err != nil {
286 | 		return 4, err
287 | 	}
288 | 	return n + 4, nil
289 | }
290 | 
291 | func (lexicon *DoubleArrayLexicon) WriteWordInfos(writer io.Writer, offset int64, offsetlen int64, utf16string bool) (int, *bytes.Buffer, error) {
292 | 	var writeStringF writeStringFunc
293 | 	if utf16string {
294 | 		writeStringF = writeStringUtf16
295 | 	} else {
296 | 		writeStringF = writeString
297 | 	}
298 | 
299 | 	buffer := bytes.NewBuffer([]byte{})
300 | 
301 | 	offsets := bytes.NewBuffer(make([]byte, 0, offsetlen))
302 | 	base := offset + offsetlen
303 | 	position := base
304 | 	for wordId := int32(0); wordId < lexicon.Size(); wordId++ {
305 | 		wi := lexicon.GetWordInfo(wordId)
306 | 		err := binary.Write(offsets, binary.LittleEndian, uint32(position))
307 | 		if err != nil {
308 | 			return 0, offsets, err
309 | 		}
310 | 		err = writeStringF(buffer, wi.Surface)
311 | 		if err != nil {
312 | 			return 0, offsets, err
313 | 		}
314 | 		err = writeStringLength(buffer, wi.HeadwordLength)
315 | 		if err != nil {
316 | 			return 0, offsets, err
317 | 		}
318 | 		err = binary.Write(buffer, binary.LittleEndian, uint16(wi.PosId))
319 | 		if err != nil {
320 | 			return 0, offsets, err
321 | 		}
322 | 		var normalizedForm string
323 | 		if wi.NormalizedForm != wi.Surface {
324 | 			normalizedForm = wi.NormalizedForm
325 | 		}
326 | 		err = writeStringF(buffer, normalizedForm)
327 | 		if err != nil {
328 | 			return 0, offsets, err
329 | 		}
330 | 		err = binary.Write(buffer, binary.LittleEndian, uint32(wi.DictionaryFormWordId))
331 | 		if err != nil {
332 | 			return 0, offsets, err
333 | 		}
334 | 		var readingForm string
335 | 		if wi.ReadingForm != wi.Surface {
336 | 			readingForm = wi.ReadingForm
337 | 		}
338 | 		err = writeStringF(buffer, readingForm)
339 | 		if err != nil {
340 | 			return 0, offsets, err
341 | 		}
342 | 		err = writeIntArray(buffer, wi.AUnitSplit)
343 | 		if err != nil {
344 | 			return 0, offsets, err
345 | 		}
346 | 		err = writeIntArray(buffer, wi.BUnitSplit)
347 | 		if err != nil {
348 | 			return 0, offsets, err
349 | 		}
350 | 		err = writeIntArray(buffer, wi.WordStructure)
351 | 		if err != nil {
352 | 			return 0, offsets, err
353 | 		}
354 | 		n, err := buffer.WriteTo(writer)
355 | 		buffer.Reset()
356 | 		position += n
357 | 	}
358 | 	return int(position - base), offsets, nil
359 | }
360 | 
// DoubleArrayLexiconIterator walks over the words found by a trie search.
// Each trie result refers to an array of word ids that share one length; the
// iterator hands these word ids out one at a time.
type DoubleArrayLexiconIterator struct {
	wordIdT *wordIdTable
	dait    *dartsclone.Iterator
	wordIds []int32 // word ids of the trie result currently handed out
	length  int     // length reported by the trie for that result
	index   int     // next position in wordIds; -1 before the first Get
}
368 | 
369 | func newDoubleArrayLexiconIterator(dait *dartsclone.Iterator, wordIdT *wordIdTable) *DoubleArrayLexiconIterator {
370 | 	return &DoubleArrayLexiconIterator{
371 | 		wordIdT: wordIdT,
372 | 		dait:    dait,
373 | 		index:   -1,
374 | 	}
375 | }
376 | 
377 | func (it *DoubleArrayLexiconIterator) Next() bool {
378 | 	if it.dait.Err() != nil {
379 | 		return false
380 | 	}
381 | 	if it.index < 0 {
382 | 		return it.dait.Next()
383 | 	} else {
384 | 		return it.index < len(it.wordIds) || it.dait.Next()
385 | 	}
386 | }
387 | 
388 | func (it *DoubleArrayLexiconIterator) Get() (int32, int) {
389 | 	if it.index < 0 || it.index >= len(it.wordIds) {
390 | 		tindex, length := it.dait.Get()
391 | 		if it.dait.Err() != nil {
392 | 			return -1, 0
393 | 		}
394 | 		it.wordIds = it.wordIdT.get(tindex)
395 | 		it.length = length
396 | 		it.index = 0
397 | 	}
398 | 	wordId := it.wordIds[it.index]
399 | 	it.index++
400 | 	return wordId, it.length
401 | }
402 | 
403 | func (it *DoubleArrayLexiconIterator) Err() error {
404 | 	return it.dait.Err()
405 | }
406 | 


--------------------------------------------------------------------------------
/dictionary/dicheader.go:
--------------------------------------------------------------------------------
 1 | package dictionary
 2 | 
 3 | import (
 4 | 	"bytes"
 5 | 	"encoding/binary"
 6 | 	"errors"
 7 | )
 8 | 
const (
	// DescriptionSize is the fixed number of bytes reserved for the
	// description in a stored header.
	DescriptionSize = 256
	// HeaderStorageSize is the size in bytes of a stored header: the 8-byte
	// version, the 8-byte creation time and the description.
	HeaderStorageSize = 8 + 8 + DescriptionSize
)
13 | 
// DictionaryHeader is the header stored at the start of a dictionary file.
type DictionaryHeader struct {
	Version     uint64 // dictionary format version
	CreateTime  int64  // creation time as stored in the file
	Description string // free text, at most DescriptionSize bytes when stored
}
19 | 
20 | func NewDictionaryHeader(version uint64, createTime int64, description string) *DictionaryHeader {
21 | 	return &DictionaryHeader{
22 | 		Version:     version,
23 | 		CreateTime:  createTime,
24 | 		Description: description,
25 | 	}
26 | }
27 | 
28 | func ParseDictionaryHeader(input []byte, offset int) *DictionaryHeader {
29 | 	offset, version := bufferToUint64(input, offset)
30 | 	offset, createTime := bufferToInt64(input, offset)
31 | 
32 | 	i := offset
33 | 	for ; i < HeaderStorageSize; i++ {
34 | 		if input[i] == 0 {
35 | 			break
36 | 		}
37 | 	}
38 | 	// UTF-8
39 | 	description := string(input[offset:i])
40 | 
41 | 	return &DictionaryHeader{
42 | 		Version:     version,
43 | 		CreateTime:  createTime,
44 | 		Description: description,
45 | 	}
46 | }
47 | 
48 | func (dh *DictionaryHeader) ToBytes() ([]byte, error) {
49 | 	desc := []byte(dh.Description)
50 | 	if len(desc) > DescriptionSize {
51 | 		return nil, errors.New("description is too long")
52 | 	}
53 | 
54 | 	buf := bytes.NewBuffer(make([]byte, 0, HeaderStorageSize))
55 | 	err := binary.Write(buf, binary.LittleEndian, uint64(dh.Version))
56 | 	if err != nil {
57 | 		return nil, err
58 | 	}
59 | 	err = binary.Write(buf, binary.LittleEndian, uint64(dh.CreateTime))
60 | 	if err != nil {
61 | 		return nil, err
62 | 	}
63 | 	_, err = buf.Write(desc)
64 | 	if err != nil {
65 | 		return nil, err
66 | 	}
67 | 
68 | 	if len(desc) < DescriptionSize {
69 | 		padding := make([]byte, DescriptionSize-len(desc))
70 | 		_, err = buf.Write(padding)
71 | 		if err != nil {
72 | 			return nil, err
73 | 		}
74 | 	}
75 | 	return buf.Bytes(), nil
76 | }
77 | 


--------------------------------------------------------------------------------
/dictionary/dicprinter.go:
--------------------------------------------------------------------------------
  1 | package dictionary
  2 | 
  3 | import (
  4 | 	"errors"
  5 | 	"fmt"
  6 | 	"io"
  7 | 	"os"
  8 | 	"strconv"
  9 | 	"strings"
 10 | 	"time"
 11 | 
 12 | 	"github.com/msnoigrs/gosudachi/internal/mmap"
 13 | )
 14 | 
 15 | func PrintDictionary(filename string, utf16string bool, systemDict *BinaryDictionary, output io.Writer) error {
 16 | 	var grammar *Grammar
 17 | 
 18 | 	dic, err := NewBinaryDictionary(filename, utf16string)
 19 | 	if err != nil {
 20 | 		return err
 21 | 	}
 22 | 	defer dic.Close()
 23 | 	if dic.Header.Version == SystemDictVersion {
 24 | 		grammar = dic.Grammar
 25 | 	} else if systemDict == nil {
 26 | 		return errors.New("the system dictionary is not specified")
 27 | 	} else {
 28 | 		grammar = systemDict.Grammar
 29 | 		if dic.Header.Version == UserDictVersion2 {
 30 | 			grammar.AddPosList(dic.Grammar)
 31 | 		}
 32 | 	}
 33 | 
 34 | 	possize := grammar.GetPartOfSpeechSize()
 35 | 	posStrings := make([]string, possize, possize)
 36 | 	for pid := 0; pid < possize; pid++ {
 37 | 		posStrings = append(posStrings, strings.Join(grammar.GetPartOfSpeechString(int16(pid)), ","))
 38 | 	}
 39 | 
 40 | 	lexicon := dic.Lexicon
 41 | 	for wordId := int32(0); wordId < lexicon.Size(); wordId++ {
 42 | 		leftId := lexicon.GetLeftId(wordId)
 43 | 		rightId := lexicon.GetRightId(wordId)
 44 | 		cost := lexicon.GetCost(wordId)
 45 | 		wi := lexicon.GetWordInfo(wordId)
 46 | 
 47 | 		unitType := getUnitType(wi)
 48 | 
 49 | 		fmt.Fprintf(output,
 50 | 			"%s,%d,%d,%d,%s,%s,%s,%s,%s,%s,%s,%s,%s\n",
 51 | 			wi.Surface,
 52 | 			leftId,
 53 | 			rightId,
 54 | 			cost,
 55 | 			wi.Surface,
 56 | 			posStrings[int(wi.PosId)],
 57 | 			wi.ReadingForm,
 58 | 			wi.NormalizedForm,
 59 | 			wordIdToString(int(wi.DictionaryFormWordId)),
 60 | 			unitType,
 61 | 			splitToString(wi.AUnitSplit),
 62 | 			splitToString(wi.BUnitSplit),
 63 | 			splitToString(wi.WordStructure),
 64 | 		)
 65 | 	}
 66 | 	return nil
 67 | }
 68 | 
 69 | func wordIdToString(wid int) string {
 70 | 	if wid < 0 {
 71 | 		return "*"
 72 | 	}
 73 | 	return strconv.Itoa(wid)
 74 | }
 75 | 
 76 | func getUnitType(wi *WordInfo) string {
 77 | 	if len(wi.AUnitSplit) == 0 {
 78 | 		return "A"
 79 | 	} else if len(wi.BUnitSplit) == 0 {
 80 | 		return "B"
 81 | 	}
 82 | 	return "C"
 83 | }
 84 | 
 85 | func splitToString(split []int32) string {
 86 | 	if len(split) == 0 {
 87 | 		return "*"
 88 | 	}
 89 | 	splitstrs := make([]string, len(split), len(split))
 90 | 	for _, i := range split {
 91 | 		splitstrs = append(splitstrs, strconv.Itoa(int(i)))
 92 | 	}
 93 | 	return strings.Join(splitstrs, "/")
 94 | }
 95 | 
 96 | func PrintHeader(dictfile string, output io.Writer) error {
 97 | 	dictfd, err := os.OpenFile(dictfile, os.O_RDONLY, 0644)
 98 | 	if err != nil {
 99 | 		return err
100 | 	}
101 | 	defer dictfd.Close()
102 | 
103 | 	finfo, err := dictfd.Stat()
104 | 	if err != nil {
105 | 		return err
106 | 	}
107 | 
108 | 	bytebuffer, err := mmap.Mmap(dictfd, false, 0, finfo.Size())
109 | 	if err != nil {
110 | 		return err
111 | 	}
112 | 	defer mmap.Munmap(bytebuffer)
113 | 
114 | 	dh := ParseDictionaryHeader(bytebuffer, 0)
115 | 
116 | 	fmt.Fprintf(output, "filename: %s\n", dictfile)
117 | 
118 | 	switch dh.Version {
119 | 	case SystemDictVersion:
120 | 		fmt.Fprintln(output, "type: system dictionary")
121 | 	case UserDictVersion, UserDictVersion2:
122 | 		fmt.Fprintln(output, "type: user dictionary")
123 | 	default:
124 | 		fmt.Fprintln(output, "invalid file")
125 | 		os.Exit(1)
126 | 	}
127 | 
128 | 	ctime := time.Unix(dh.CreateTime, 0)
129 | 	zone, _ := ctime.Zone()
130 | 	fmt.Fprintf(output, "createTime: %s[%s]\n", ctime.Format(time.RFC3339), zone)
131 | 	fmt.Fprintf(output, "description: %s\n", dh.Description)
132 | 
133 | 	return nil
134 | }
135 | 


--------------------------------------------------------------------------------
/dictionary/dicversion.go:
--------------------------------------------------------------------------------
 1 | package dictionary
 2 | 
// Magic numbers stored in the Version field of a dictionary header.
const (
	// SystemDictVersion marks a system dictionary.
	SystemDictVersion = 0x7366d3f18bd111e7
	// UserDictVersion marks a user dictionary.
	UserDictVersion   = 0xa50f31188bd211e7
	// UserDictVersion2 marks a user dictionary whose own part-of-speech list
	// is appended to the system grammar's list (see PrintDictionary).
	UserDictVersion2  = 0x9fdeb5a90168d868
)
 8 | 
 9 | func IsUserDictionary(version uint64) bool {
10 | 	return version == UserDictVersion || version == UserDictVersion2
11 | }
12 | 


--------------------------------------------------------------------------------
/dictionary/grammar.go:
--------------------------------------------------------------------------------
  1 | package dictionary
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"encoding/binary"
  6 | 	"io"
  7 | 	"math"
  8 | )
  9 | 
const (
	// posDepth is the number of strings that make up one part-of-speech
	// entry; NewGrammar reads exactly this many per entry.
	posDepth            = 6
	// InhibitedConnection is the largest int16 value.
	// NOTE(review): presumably the connection cost used to forbid a
	// connection; confirm against InhibitConnection, defined elsewhere.
	InhibitedConnection = math.MaxInt16
)
 14 | 
// BosParameter and EosParameter are all-zero three-element parameter sets.
// NOTE(review): presumably {left ID, right ID, cost} for the
// beginning-of-sentence and end-of-sentence nodes; confirm against the
// lattice code that consumes them.
var (
	BosParameter = []int16{0, 0, 0}
	EosParameter = []int16{0, 0, 0}
)
 19 | 
// Grammar holds the part-of-speech list and the connection cost table
// decoded from a dictionary buffer.
type Grammar struct {
	// bytebuffer is the buffer the grammar was parsed from.
	bytebuffer           []byte
	// posList holds one entry of posDepth strings per part-of-speech ID.
	posList              [][]string
	// connectTableBytes holds the connection costs as 2-byte entries. It
	// aliases bytebuffer until the first SetConnectCost, which replaces it
	// with a private copy.
	connectTableBytes    []byte
	// isCopiedConnectTable records whether that private copy has been made.
	isCopiedConnectTable bool
	// connectTableOffset is the position of the first cost entry inside
	// connectTableBytes (0 once the table has been copied).
	connectTableOffset   int
	// leftIdSize and rightIdSize are the table dimensions read from the
	// buffer.
	leftIdSize           int16
	rightIdSize          int16
	// CharCategory is not set by NewGrammar; it must be assigned by the
	// caller before it is used.
	CharCategory         *CharacterCategory
	// StorageSize is the number of bytes the grammar occupies in the buffer:
	// the part-of-speech list, the two sizes and the cost table.
	StorageSize          int
}
 31 | 
 32 | func NewGrammar(bytebuffer []byte, offset int, utf16string bool) *Grammar {
 33 | 	var bufferToStringF bufferToStringFunc
 34 | 	if utf16string {
 35 | 		bufferToStringF = bufferToStringUtf16
 36 | 	} else {
 37 | 		bufferToStringF = bufferToString
 38 | 	}
 39 | 	originalOffset := offset
 40 | 	var posLen uint16
 41 | 	offset, posLen = bufferToUint16(bytebuffer, offset)
 42 | 	posLeni := int(posLen)
 43 | 	posList := make([][]string, posLeni, posLeni)
 44 | 	for i := 0; i < posLeni; i++ {
 45 | 		pos := make([]string, posDepth, posDepth)
 46 | 		for j := 0; j < posDepth; j++ {
 47 | 			offset, pos[j] = bufferToStringF(bytebuffer, offset)
 48 | 		}
 49 | 		posList[i] = pos
 50 | 	}
 51 | 	var (
 52 | 		leftIdSize  int16
 53 | 		rightIdSize int16
 54 | 	)
 55 | 	offset, leftIdSize = bufferToInt16(bytebuffer, offset)
 56 | 	offset, rightIdSize = bufferToInt16(bytebuffer, offset)
 57 | 
 58 | 	return &Grammar{
 59 | 		bytebuffer:           bytebuffer,
 60 | 		posList:              posList,
 61 | 		connectTableBytes:    bytebuffer,
 62 | 		isCopiedConnectTable: false,
 63 | 		connectTableOffset:   offset,
 64 | 		leftIdSize:           leftIdSize,
 65 | 		rightIdSize:          rightIdSize,
 66 | 		StorageSize:          (offset - originalOffset) + 2*int(leftIdSize)*int(rightIdSize),
 67 | 	}
 68 | }
 69 | 
// AddPosList appends all part-of-speech entries of fromg to g's list, after
// g's existing entries. The entries themselves are shared, not copied.
func (g *Grammar) AddPosList(fromg *Grammar) {
	g.posList = append(g.posList, fromg.posList...)
}
 73 | 
// GetPartOfSpeechSize returns the number of part-of-speech entries.
func (g *Grammar) GetPartOfSpeechSize() int {
	return len(g.posList)
}
 77 | 
// GetPartOfSpeechString returns the strings of the part-of-speech entry with
// ID posId. The returned slice is the grammar's own; it panics if posId is
// out of range.
func (g *Grammar) GetPartOfSpeechString(posId int16) []string {
	return g.posList[posId]
}
 81 | 
 82 | func (g *Grammar) GetPartOfSpeechId(pos []string) int16 {
 83 | L:
 84 | 	for i, p := range g.posList {
 85 | 		for j := 0; j < posDepth; j++ {
 86 | 			if p[j] != pos[j] {
 87 | 				continue L
 88 | 			}
 89 | 		}
 90 | 		return int16(i)
 91 | 	}
 92 | 	return int16(-1)
 93 | }
 94 | 
// GetPosId is the variadic form of GetPartOfSpeechId: it looks up the ID of
// the part-of-speech entry made of the given strings.
func (g *Grammar) GetPosId(posstrings ...string) int16 {
	return g.GetPartOfSpeechId(posstrings)
}
 98 | 
 99 | func (g *Grammar) GetConnectCost(leftId int16, rightId int16) int16 {
100 | 	s := g.connectTableOffset + int(leftId)*2 + 2*int(g.leftIdSize)*int(rightId)
101 | 	_, cost := bufferToInt16(g.connectTableBytes, s)
102 | 	return cost
103 | }
104 | 
105 | func (g *Grammar) SetConnectCost(leftId int16, rightId int16, cost int16) {
106 | 	if !g.isCopiedConnectTable {
107 | 		g.copyConnectTable()
108 | 	}
109 | 	s := g.connectTableOffset + int(leftId)*2 + 2*int(g.leftIdSize)*int(rightId)
110 | 	binary.LittleEndian.PutUint16(g.connectTableBytes[s:], uint16(cost))
111 | }
112 | 
113 | // syncronized ???
114 | func (g *Grammar) copyConnectTable() {
115 | 	l := 2 * int(g.leftIdSize) * int(g.rightIdSize)
116 | 	newbuffer := make([]byte, l, l)
117 | 	s := g.connectTableOffset
118 | 	copy(newbuffer, g.connectTableBytes[s:s+l])
119 | 	g.connectTableBytes = newbuffer
120 | 	g.connectTableOffset = 0
121 | 	g.isCopiedConnectTable = true
122 | }
123 | 
124 | func (g *Grammar) WritePOSTableTo(buffer *bytes.Buffer, utf16string bool) error {
125 | 	var writeStringF writeStringFunc
126 | 	if utf16string {
127 | 		writeStringF = writeStringUtf16
128 | 	} else {
129 | 		writeStringF = writeString
130 | 	}
131 | 	err := binary.Write(buffer, binary.LittleEndian, uint16(len(g.posList)))
132 | 	if err != nil {
133 | 		return err
134 | 	}
135 | 
136 | 	for _, pos := range g.posList {
137 | 		for _, t := range pos {
138 | 			err := writeStringF(buffer, t)
139 | 			if err != nil {
140 | 				return err
141 | 			}
142 | 		}
143 | 	}
144 | 	return nil
145 | }
146 | 
147 | func (g *Grammar) WriteConnMatrixTo(writer io.Writer) (int, error) {
148 | 	err := binary.Write(writer, binary.LittleEndian, uint16(g.leftIdSize))
149 | 	if err != nil {
150 | 		return 0, err
151 | 	}
152 | 	err = binary.Write(writer, binary.LittleEndian, uint16(g.rightIdSize))
153 | 	if err != nil {
154 | 		return 2, err
155 | 	}
156 | 	var n int
157 | 	l := 2 * int(g.leftIdSize) * int(g.rightIdSize)
158 | 	if l > 0 {
159 | 		var err error
160 | 		n, err = writer.Write(g.connectTableBytes[g.connectTableOffset : g.connectTableOffset+l])
161 | 		if err != nil {
162 | 			return 4, err
163 | 		}
164 | 	}
165 | 	return n + 4, nil
166 | }
167 | 


--------------------------------------------------------------------------------
/dictionary/lexiconset.go:
--------------------------------------------------------------------------------
  1 | package dictionary
  2 | 
const (
	// LexiconSetMaxDictionaries is the number of lexicons at which IsFull
	// starts reporting true. Combined word IDs keep the dictionary ID in
	// their top 4 bits, which can tell exactly 16 dictionaries apart.
	LexiconSetMaxDictionaries = 16
)
  6 | 
// LexiconSet groups a system lexicon with additional lexicons and addresses
// their words through combined word IDs: the top 4 bits select the lexicon
// (the dictionary ID) and the low 28 bits are the word ID inside it.
type LexiconSet struct {
	// lexicons[0] is the system lexicon; later entries are added with Add.
	lexicons   []*DoubleArrayLexicon
	// posOffsets holds the posOffset given to Add for each lexicon, in the
	// same order as lexicons; the system lexicon's entry is 0.
	posOffsets []int32
}
 11 | 
// NewLexiconSet returns a set containing only systemLexicon, registered as
// dictionary ID 0 with a part-of-speech offset of 0.
func NewLexiconSet(systemLexicon *DoubleArrayLexicon) *LexiconSet {
	return &LexiconSet{
		lexicons:   []*DoubleArrayLexicon{systemLexicon},
		posOffsets: []int32{0},
	}
}
 18 | 
// Add appends lexicon to the set together with its part-of-speech offset.
// The lexicon's dictionary ID is its position in the set. Add does not check
// IsFull; callers are expected to do so.
func (s *LexiconSet) Add(lexicon *DoubleArrayLexicon, posOffset int32) {
	s.lexicons = append(s.lexicons, lexicon)
	s.posOffsets = append(s.posOffsets, posOffset)
}
 23 | 
// IsFull reports whether the set already holds LexiconSetMaxDictionaries
// lexicons.
func (s *LexiconSet) IsFull() bool {
	return len(s.lexicons) >= LexiconSetMaxDictionaries
}
 27 | 
// Lookup returns an iterator over the matches for text at offset across all
// lexicons in the set.
func (s *LexiconSet) Lookup(text []byte, offset int) *LexiconSetIterator {
	return newLexiconSetIterator(text, offset, s.lexicons)
}
 31 | 
 32 | func (s *LexiconSet) GetWordId(headword string, posId int16, readingForm string) int32 {
 33 | 	for dictId := 1; dictId < len(s.lexicons); dictId++ {
 34 | 		wordId := s.lexicons[dictId].GetWordId(headword, posId, readingForm)
 35 | 		if wordId >= 0 {
 36 | 			// buildWordId
 37 | 			return int32(uint32(dictId)<<28) | wordId
 38 | 		}
 39 | 	}
 40 | 	return s.lexicons[0].GetWordId(headword, posId, readingForm)
 41 | }
 42 | 
 43 | func (s *LexiconSet) GetLeftId(wordId int32) int16 {
 44 | 	dictId := int(uint32(wordId) >> 28)
 45 | 	wordId = int32(uint32(wordId) & 0xfffffff)
 46 | 	return s.lexicons[dictId].GetLeftId(wordId)
 47 | }
 48 | 
 49 | func (s *LexiconSet) GetRightId(wordId int32) int16 {
 50 | 	dictId := int(uint32(wordId) >> 28)
 51 | 	wordId = int32(uint32(wordId) & 0xfffffff)
 52 | 	return s.lexicons[dictId].GetRightId(wordId)
 53 | }
 54 | 
 55 | func (s *LexiconSet) GetCost(wordId int32) int16 {
 56 | 	dictId := int(uint32(wordId) >> 28)
 57 | 	wordId = int32(uint32(wordId) & 0xfffffff)
 58 | 	return s.lexicons[dictId].GetCost(wordId)
 59 | }
 60 | 
 61 | func (s *LexiconSet) GetWordInfo(wordId int32) *WordInfo {
 62 | 	dictId := int(uint32(wordId) >> 28)
 63 | 	wordId = int32(uint32(wordId) & 0xfffffff)
 64 | 	wi := s.lexicons[dictId].GetWordInfo(wordId)
 65 | 	if dictId > 0 && int32(wi.PosId) >= s.posOffsets[1] {
 66 | 		// user defined part-of-speech
 67 | 		wi.PosId = int16(int32(wi.PosId) - s.posOffsets[1] + s.posOffsets[dictId])
 68 | 	}
 69 | 	s.convertSplit(wi.AUnitSplit, dictId)
 70 | 	s.convertSplit(wi.BUnitSplit, dictId)
 71 | 	s.convertSplit(wi.WordStructure, dictId)
 72 | 	return wi
 73 | }
 74 | 
// GetDictionaryId extracts the dictionary ID, the top 4 bits, from a
// combined word ID.
func (s *LexiconSet) GetDictionaryId(wordId int32) int {
	return int(uint32(wordId) >> 28)
}
 78 | 
 79 | func (s *LexiconSet) Size() int32 {
 80 | 	var n int32
 81 | 	for _, l := range s.lexicons {
 82 | 		n += l.Size()
 83 | 	}
 84 | 	return n
 85 | }
 86 | 
 87 | func (s *LexiconSet) convertSplit(split []int32, dictId int) {
 88 | 	for i, id := range split {
 89 | 		if s.GetDictionaryId(id) > 0 {
 90 | 			wordId := uint32(id) & 0xfffffff
 91 | 			// buildWordId
 92 | 			split[i] = int32(uint32(dictId<<28) | wordId)
 93 | 		}
 94 | 	}
 95 | }
 96 | 
// LexiconSetIterator walks the lookup results of every lexicon in a set for
// one (text, offset) query.
type LexiconSetIterator struct {
	// text and offset are the query, kept to start lookups in later lexicons.
	text     []byte
	offset   int
	// dictId is the index of the lexicon currently being iterated.
	dictId   int
	lexicons []*DoubleArrayLexicon
	// dalit is the iterator over the current lexicon's results.
	dalit    *DoubleArrayLexiconIterator
}
104 | 
105 | func newLexiconSetIterator(text []byte, offset int, lexicons []*DoubleArrayLexicon) *LexiconSetIterator {
106 | 	var (
107 | 		dalit  *DoubleArrayLexiconIterator
108 | 		dictId int
109 | 	)
110 | 	if len(lexicons) == 1 {
111 | 		dictId = 0
112 | 	} else {
113 | 		dictId = 1
114 | 	}
115 | 	dalit = lexicons[dictId].Lookup(text, offset)
116 | 
117 | 	return &LexiconSetIterator{
118 | 		text:     text,
119 | 		offset:   offset,
120 | 		dictId:   dictId,
121 | 		lexicons: lexicons,
122 | 		dalit:    dalit,
123 | 	}
124 | }
125 | 
// Next reports whether another result is available, moving on to the next
// lexicon whenever the current one is exhausted.
//
// Lexicons are visited in the order 1, 2, ..., n-1 and finally 0, so lexicon
// 0 is always the last one searched. It returns false once lexicon 0 is
// exhausted, or when the current lexicon iterator has reported an error.
func (it *LexiconSetIterator) Next() bool {
	if it.dalit.Err() != nil {
		return false
	}
	for !it.dalit.Next() {
		if it.dictId == 0 {
			// Lexicon 0 is visited last; nothing is left after it.
			return false
		}
		it.dictId++
		if it.dictId >= len(it.lexicons) {
			// All added lexicons are done; wrap around to lexicon 0.
			it.dictId = 0
		}
		it.dalit = it.lexicons[it.dictId].Lookup(it.text, it.offset)
	}
	return true
}
142 | 
// Get returns the current result as a combined word ID and the length
// reported by the lexicon iterator. For lexicons other than lexicon 0 the
// dictionary ID is placed in the top 4 bits of the word ID. If the lexicon
// iterator reports an error, Get returns (-1, 0).
func (it *LexiconSetIterator) Get() (int32, int) {
	rvalue, roffset := it.dalit.Get()
	if it.dalit.Err() != nil {
		return -1, 0
	}
	if it.dictId > 0 {
		// buildWordId
		rvalue = int32(uint32(it.dictId<<28) | uint32(rvalue))
	}
	return rvalue, roffset
}
154 | 
// Err returns the error reported by the iterator over the current lexicon,
// or nil if there is none.
func (it *LexiconSetIterator) Err() error {
	return it.dalit.Err()
}
158 | 


--------------------------------------------------------------------------------
/dictionary/wordinfo.go:
--------------------------------------------------------------------------------
 1 | package dictionary
 2 | 
 3 | type WordInfo struct {
 4 | 	Surface string
 5 | 	HeadwordLength int16
 6 | 	PosId int16
 7 | 	NormalizedForm string
 8 | 	DictionaryFormWordId int32
 9 | 	DictionaryForm string
10 | 	ReadingForm string
11 | 	AUnitSplit []int32
12 | 	BUnitSplit []int32
13 | 	WordStructure []int32
14 | }
15 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
 1 | module github.com/msnoigrs/gosudachi
 2 | 
 3 | go 1.12
 4 | 
 5 | require (
 6 | 	github.com/emirpasic/gods v1.12.0
 7 | 	github.com/shurcooL/httpfs v0.0.0-20190707220628-8d4bc4ba7749 // indirect
 8 | 	github.com/shurcooL/vfsgen v0.0.0-20181202132449-6a9ea43bcacd // indirect
 9 | 	golang.org/x/sys v0.0.0-20190310054646-10058d7d4faa
10 | 	golang.org/x/text v0.3.0
11 | )
12 | 


--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
 1 | github.com/emirpasic/gods v1.12.0 h1:QAUIPSaCu4G+POclxeqb3F+WPpdKqFGlw36+yOzGlrg=
 2 | github.com/emirpasic/gods v1.12.0/go.mod h1:YfzfFFoVP/catgzJb4IKIqXjX78Ha8FMSDh3ymbK86o=
 3 | github.com/shurcooL/httpfs v0.0.0-20190707220628-8d4bc4ba7749 h1:bUGsEnyNbVPw06Bs80sCeARAlK8lhwqGyi6UT8ymuGk=
 4 | github.com/shurcooL/httpfs v0.0.0-20190707220628-8d4bc4ba7749/go.mod h1:ZY1cvUeJuFPAdZ/B6v7RHavJWZn2YPVFQ1OSXhCGOkg=
 5 | github.com/shurcooL/vfsgen v0.0.0-20181202132449-6a9ea43bcacd h1:ug7PpSOB5RBPK1Kg6qskGBoP3Vnj/aNYFTznWvlkGo0=
 6 | github.com/shurcooL/vfsgen v0.0.0-20181202132449-6a9ea43bcacd/go.mod h1:TrYk7fJVaAttu97ZZKrO9UbRa8izdowaMIZcxYMbVaw=
 7 | golang.org/x/sys v0.0.0-20190310054646-10058d7d4faa h1:lqti/xP+yD/6zH5TqEwx2MilNIJY5Vbc6Qr8J3qyPIQ=
 8 | golang.org/x/sys v0.0.0-20190310054646-10058d7d4faa/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
 9 | golang.org/x/text v0.3.0 h1:g61tztE5qeGQ89tm6NTjjM9VPIm088od1l6aSorWRWg=
10 | golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
11 | 


--------------------------------------------------------------------------------
/inhibitconnectioncostplugin.go:
--------------------------------------------------------------------------------
 1 | package gosudachi
 2 | 
 3 | import (
 4 | 	"github.com/msnoigrs/gosudachi/dictionary"
 5 | )
 6 | 
// InhibitConnectionPlugin is a grammar-editing plugin that inhibits the
// connection between configured pairs of IDs.
type InhibitConnectionPlugin struct {
	// inhibitedPair lists the pairs to inhibit. Only the first two values of
	// each entry are used; Edit skips entries with fewer than two.
	inhibitedPair []*[]int
}
10 | 
// NewInhibitConnectionPlugin returns a plugin that will inhibit the given
// pairs when Edit is called. The slice is stored as is, not copied.
func NewInhibitConnectionPlugin(inhibitedPair []*[]int) *InhibitConnectionPlugin {
	return &InhibitConnectionPlugin{
		inhibitedPair: inhibitedPair,
	}
}
16 | 
// GetConfigStruct returns the value that receives this plugin's
// configuration, which is the plugin itself.
// NOTE(review): inhibitedPair is unexported; confirm that the settings loader
// can actually populate it through this value.
func (p *InhibitConnectionPlugin) GetConfigStruct() interface{} {
	return p
}
20 | 
// SetUp does nothing for this plugin and always returns nil.
func (p *InhibitConnectionPlugin) SetUp(grammar *dictionary.Grammar) error {
	return nil
}
24 | 
25 | func (p *InhibitConnectionPlugin) Edit(grammar *dictionary.Grammar) error {
26 | 	for _, pair := range p.inhibitedPair {
27 | 		if len(*pair) < 2 {
28 | 			continue
29 | 		}
30 | 		InhibitConnection(grammar, int16((*pair)[0]), int16((*pair)[1]))
31 | 	}
32 | 	return nil
33 | }
34 | 


--------------------------------------------------------------------------------
/inputtext.go:
--------------------------------------------------------------------------------
  1 | package gosudachi
  2 | 
  3 | import (
  4 | 	"unicode/utf8"
  5 | 
  6 | 	"github.com/msnoigrs/gosudachi/dictionary"
  7 | )
  8 | 
// InputText is the text handed to the analyzer, together with lookup tables
// that map byte positions of the modified text to characters and to positions
// in the original text. The field descriptions below reflect how
// InputTextBuilder.Build fills them.
type InputText struct {
	// OriginalText is the text as given to the builder.
	OriginalText             string
	// ModifiedText is the text after the builder's replacements.
	ModifiedText             string
	// Bytea is the UTF-8 encoding of ModifiedText.
	Bytea                    []byte
	// offsets maps each byte position of Bytea (plus one end position) to the
	// builder's text offset for the character containing that byte.
	offsets                  []int
	// byteIndexes maps each byte position of Bytea (plus one end position) to
	// the index of the character containing that byte.
	byteIndexes              []int
	// charCategories holds the category bit set of each character.
	charCategories           []uint32
	// charCategoryContinuities holds, per byte position, the number of bytes
	// left in the current run of characters of continuous category.
	charCategoryContinuities []int
	// canBowList holds, per character, whether a word may begin there.
	canBowList               []bool
}
 19 | 
// NewInputText assembles an InputText from already computed parts. The
// slices are stored as given, not copied, and no consistency between them is
// checked.
func NewInputText(originalText string, modifiedText string, bytea []byte, offsets []int, byteIndexes []int, charCategories []uint32, charCategoryContinuities []int, canBowList []bool) *InputText {
	return &InputText{
		OriginalText:             originalText,
		ModifiedText:             modifiedText,
		Bytea:                    bytea,
		offsets:                  offsets,
		byteIndexes:              byteIndexes,
		charCategories:           charCategories,
		charCategoryContinuities: charCategoryContinuities,
		canBowList:               canBowList,
	}
}
 32 | 
// GetText returns the modified text.
func (t *InputText) GetText() string {
	return t.ModifiedText
}
 36 | 
// GetByteText returns the byte form of the modified text. The returned slice
// is the InputText's own storage, not a copy.
func (t *InputText) GetByteText() []byte {
	return t.Bytea
}
 40 | 
 41 | func (t *InputText) GetSubstring(begin int, end int) string {
 42 | 	return string([]rune(t.ModifiedText)[t.byteIndexes[begin]:t.byteIndexes[end]])
 43 | }
 44 | 
// GetOffsetTextLength returns the character index that the byte position
// index maps to, i.e. the number of characters before that position.
func (t *InputText) GetOffsetTextLength(index int) int {
	return t.byteIndexes[index]
}
 48 | 
// GetOriginalIndex returns the original-text offset recorded for the byte
// position index of the modified text.
func (t *InputText) GetOriginalIndex(index int) int {
	return t.offsets[index]
}
 52 | 
// GetCharCategoryTypes returns the category bit set of the character that
// contains the byte position index.
func (t *InputText) GetCharCategoryTypes(index int) uint32 {
	return t.charCategories[t.byteIndexes[index]]
}
 56 | 
 57 | func (t *InputText) GetCharCategoryTypesRange(begin int, end int) uint32 {
 58 | 	if begin+t.charCategoryContinuities[begin] < end {
 59 | 		return uint32(0)
 60 | 	}
 61 | 	b := t.byteIndexes[begin]
 62 | 	e := t.byteIndexes[end]
 63 | 	continuousCategory := t.charCategories[b]
 64 | 	for i := b + 1; i < e; i++ {
 65 | 		continuousCategory &= t.charCategories[i]
 66 | 	}
 67 | 	return continuousCategory
 68 | }
 69 | 
// GetCharCategoryContinuousLength returns the number of bytes, counted from
// the byte position index, that remain in the current run of characters of
// continuous category.
func (t *InputText) GetCharCategoryContinuousLength(index int) int {
	return t.charCategoryContinuities[index]
}
 73 | 
 74 | func (t *InputText) GetCodePointsOffsetLength(index int, codePointOffset int) int {
 75 | 	length := 0
 76 | 	target := t.byteIndexes[index] + codePointOffset
 77 | 	for i := index; i < len(t.Bytea); i++ {
 78 | 		if t.byteIndexes[i] >= target {
 79 | 			return length
 80 | 		}
 81 | 		length++
 82 | 	}
 83 | 	return length
 84 | }
 85 | 
// CodePointCount returns the difference between the character indexes of the
// byte positions end and begin, i.e. the number of characters between them.
func (t *InputText) CodePointCount(begin int, end int) int {
	return t.byteIndexes[end] - t.byteIndexes[begin]
}
 89 | 
// CanBow reports whether a word may begin at the byte position index: the
// position must be the first byte of a character, and that character must be
// marked in canBowList.
func (t *InputText) CanBow(index int) bool {
	return t.IsCharAlignment(index) && t.canBowList[t.byteIndexes[index]]
}
 93 | 
// IsCharAlignment reports whether the byte at position index is not a UTF-8
// continuation byte (bit pattern 10xxxxxx), i.e. whether it starts a
// character.
func (t *InputText) IsCharAlignment(index int) bool {
	return (t.Bytea[index] & 0xC0) != 0x80
}
 97 | 
// InputTextBuilder accumulates replacements on a text and then builds the
// InputText for it.
type InputTextBuilder struct {
	// OriginalText is the text the builder was created with.
	OriginalText  string
	// modifiedRunes is the current text, one element per character.
	modifiedRunes []rune
	// textOffsets has one more element than modifiedRunes: element i is the
	// original-text character offset attributed to modified character i, and
	// the last element is the offset attributed to the end of the text.
	textOffsets   []int
	// grammar provides the character category lookup used by Build.
	grammar       *dictionary.Grammar
}
104 | 
105 | func NewInputTextBuilder(text string, grammar *dictionary.Grammar) *InputTextBuilder {
106 | 	modifiedRunes := []rune(text)
107 | 	offsetslen := len(modifiedRunes) + 1
108 | 	textOffsets := make([]int, offsetslen, offsetslen)
109 | 	for i := 0; i < len(modifiedRunes); i++ {
110 | 		textOffsets[i] = i
111 | 	}
112 | 	textOffsets[len(modifiedRunes)] = len(modifiedRunes)
113 | 	return &InputTextBuilder{
114 | 		OriginalText:  text,
115 | 		modifiedRunes: modifiedRunes,
116 | 		textOffsets:   textOffsets,
117 | 		grammar:       grammar,
118 | 	}
119 | }
120 | 
// GetText returns a copy of the current, modified text as runes. Changing the
// returned slice does not affect the builder.
func (builder *InputTextBuilder) GetText() []rune {
	ret := make([]rune, len(builder.modifiedRunes))
	copy(ret, builder.modifiedRunes)
	return ret
}
126 | 
// Replace substitutes the characters in [begin, end) of the current text with
// runes, and keeps textOffsets in step: every inserted character is
// attributed to the original offset of the character that was at begin.
//
// begin and end are character (rune) indexes into the current text. They are
// not validated; out-of-range values panic.
func (builder *InputTextBuilder) Replace(begin int, end int, runes []rune) {
	rl := len(runes)
	tlen := end - begin

	// Remember the original offset of the replaced span before the offset
	// table is shifted.
	offset := builder.textOffsets[begin]

	if rl < tlen {
		// Shrinking: move the tail left, then cut both slices to the new
		// length.
		ol := len(builder.modifiedRunes)
		copy(builder.modifiedRunes[begin+rl:], builder.modifiedRunes[end:])
		copy(builder.modifiedRunes[begin:], runes)
		builder.modifiedRunes = builder.modifiedRunes[:ol-tlen+rl]

		tolen := len(builder.textOffsets)
		copy(builder.textOffsets[begin+rl:], builder.textOffsets[end:])
		builder.textOffsets = builder.textOffsets[:tolen-tlen+rl]
	} else if rl == tlen {
		// Same length: overwrite in place; offsets are rewritten below.
		copy(builder.modifiedRunes[begin:], runes)
	} else {
		// Growing: extend both slices first, then move the tail right. The
		// source and destination overlap, which copy handles correctly.
		builder.modifiedRunes = append(builder.modifiedRunes, make([]rune, rl-tlen)...)
		copy(builder.modifiedRunes[begin+rl:], builder.modifiedRunes[end:])
		copy(builder.modifiedRunes[begin:], runes)

		builder.textOffsets = append(builder.textOffsets, make([]int, rl-tlen)...)
		copy(builder.textOffsets[begin+rl:], builder.textOffsets[end:])
	}

	for i := 0; i < rl; i++ {
		builder.textOffsets[begin+i] = offset
	}
}
157 | 
// Build produces the InputText for the builder's current text.
//
// It computes, in order: the category bit set of every character; the UTF-8
// bytes of the text with per-byte tables giving the character index and the
// original offset of each byte; the per-byte remaining length of each run of
// continuous category; and the per-character flags telling whether a word may
// begin there.
func (builder *InputTextBuilder) Build() *InputText {
	// getCharCategoryTypes: one category bit set per character.
	runeCount := len(builder.modifiedRunes)
	charCategoryTypes := make([]uint32, runeCount, runeCount)
	for i := 0; i < runeCount; i++ {
		charCategoryTypes[i] = builder.grammar.CharCategory.GetCategoryTypes(builder.modifiedRunes[i])
	}

	modifiedText := string(builder.modifiedRunes)
	p := []byte(modifiedText)
	// p is consumed by the decoding loop below; keepp retains the full slice.
	keepp := p
	bytelength := len(p)
	// The byte-indexed tables get one extra slot for the end-of-text position.
	size := bytelength + 1
	indexes := make([]int, size, size)
	offsets := make([]int, size, size)

	// sizes[i] is the encoded length in bytes of character i.
	sizes := make([]int, runeCount, runeCount)

	// Walk the encoded text character by character; pi is the byte position.
	pi := 0
	for i := 0; len(p) > 0; i++ {
		_, size := utf8.DecodeRune(p)
		sizes[i] = size
		for j := 0; j < size; j++ {
			indexes[pi] = i
			offsets[pi] = builder.textOffsets[i]
			pi++
		}
		p = p[size:]
	}
	indexes[bytelength] = runeCount
	offsets[bytelength] = builder.textOffsets[len(builder.textOffsets)-1]

	// getCharCategoryContinuities: for every run of characters of continuous
	// category, store at each byte the number of bytes left in the run.
	charCategoryContinuities := make([]int, bytelength, bytelength)
	pi = 0
	for i := 0; i < runeCount; {
		next := i + getCharCategoryContinuousLength(charCategoryTypes, i)
		// length is the size of the run in bytes.
		var length int
		for j := i; j < next; j++ {
			length += sizes[j]
		}
		for k := length; k > 0; k-- {
			charCategoryContinuities[pi] = k
			pi++
		}
		i = next
	}

	// buildCanBowList: a word may begin at every character, except at an
	// ALPHA, GREEK or CYRILLIC character that shares a category with the
	// character before it.
	canBowList := make([]bool, runeCount, runeCount)
	if runeCount > 0 {
		canBowList[0] = true
		for i := 1; i < runeCount; i++ {
			types := charCategoryTypes[i]
			if (types&dictionary.ALPHA == dictionary.ALPHA) ||
				(types&dictionary.GREEK == dictionary.GREEK) ||
				(types&dictionary.CYRILLIC == dictionary.CYRILLIC) {
				cc := charCategoryTypes[i-1] & types
				canBowList[i] = cc == 0
				continue
			}
			canBowList[i] = true
		}
	}

	// Positional literal: the values follow the field order of InputText.
	return &InputText{
		builder.OriginalText,
		modifiedText,
		keepp,
		offsets,
		indexes,
		charCategoryTypes,
		charCategoryContinuities,
		canBowList,
	}
}
234 | 
// getCharCategoryContinuousLength returns the number of consecutive
// characters, starting at offset, that all share at least one category.
//
// charCategories holds one category bit set per character. The intersection
// of the sets is narrowed with every character added to the run, and the run
// ends just before the character that would leave it empty. The result is at
// least 1 and at most len(charCategories)-offset.
func getCharCategoryContinuousLength(charCategories []uint32, offset int) int {
	common := charCategories[offset]
	remaining := len(charCategories) - offset
	for length := 1; length < remaining; length++ {
		// Accumulate the intersection: a run is continuous only while one
		// category is common to all of its characters, not merely to the
		// first character and the current one.
		common &= charCategories[offset+length]
		if common == 0 {
			return length
		}
	}
	return remaining
}
246 | 


--------------------------------------------------------------------------------
/internal/lnreader/lnreader.go:
--------------------------------------------------------------------------------
 1 | package lnreader
 2 | 
 3 | import (
 4 | 	"bufio"
 5 | 	"io"
 6 | )
 7 | 
// LineNumberReader reads lines from a buffered reader and counts them.
type LineNumberReader struct {
	r         *bufio.Reader
	// rawBuffer collects the pieces of a line that is longer than the
	// buffered reader's internal buffer; it is reused between calls.
	rawBuffer []byte
	// NumLine is the number of lines successfully returned by ReadLine so
	// far.
	NumLine   int
}
13 | 
// NewLineNumberReader returns a LineNumberReader that reads from r through a
// bufio.Reader with the default buffer size. NumLine starts at 0.
func NewLineNumberReader(r io.Reader) *LineNumberReader {
	return &LineNumberReader{
		r: bufio.NewReader(r),
	}
}
19 | 
// ReadLine returns the next line without its line terminator ("\n" or
// "\r\n") and increments NumLine.
//
// A final line that is not terminated by a newline is returned as is, with a
// nil error. When no more data is available it returns an empty line and
// io.EOF; other read errors are returned unchanged and NumLine is not
// incremented.
//
// The returned slice refers to an internal buffer and is only valid until
// the next call to ReadLine.
func (r *LineNumberReader) ReadLine() ([]byte, error) {
	line, err := r.r.ReadSlice('\n')
	if err == bufio.ErrBufferFull {
		// The line does not fit into bufio's buffer: collect its pieces in
		// rawBuffer until the delimiter, EOF or another error is reached.
		r.rawBuffer = append(r.rawBuffer[:0], line...)
		for err == bufio.ErrBufferFull {
			line, err = r.r.ReadSlice('\n')
			r.rawBuffer = append(r.rawBuffer, line...)
		}
		line = r.rawBuffer
	}
	if len(line) > 0 && err == io.EOF {
		// Last line without a trailing newline: report it as a normal line.
		err = nil
	} else if err == nil {
		// ReadSlice succeeded, so line ends with '\n'; strip "\r\n" or "\n".
		n := len(line)
		if n >= 2 && line[n-2] == '\r' && line[n-1] == '\n' {
			line = line[:n-2]
		} else {
			line = line[:n-1]
		}
	}
	if err == nil {
		r.NumLine++
	}
	return line, err
}
45 | 
// IsSkipLine reports whether the line l should be skipped: it either starts
// with '#' in its very first byte, or consists only of spaces, tabs and
// newlines (which includes the empty line).
func IsSkipLine(l []byte) bool {
	if len(l) > 0 && l[0] == '#' {
		return true
	}
	for _, c := range l {
		switch c {
		case ' ', '\n', '\t':
			// blank so far; keep scanning
		default:
			return false
		}
	}
	return true
}
58 | 
// IsEmptyLine reports whether the line l contains nothing but spaces, tabs
// and newlines. An empty or nil line counts as empty.
func IsEmptyLine(l []byte) bool {
	for i := 0; i < len(l); i++ {
		switch l[i] {
		case ' ', '\n', '\t':
			continue
		}
		return false
	}
	return true
}
67 | 


--------------------------------------------------------------------------------
/internal/mmap/mmap_unix.go:
--------------------------------------------------------------------------------
 1 | // +build !windows
 2 | 
 3 | package mmap
 4 | 
 5 | import (
 6 | 	"os"
 7 | 	"syscall"
 8 | 	"unsafe"
 9 | 
10 | 	"golang.org/x/sys/unix"
11 | )
12 | 
// Mmap maps size bytes of the file fd, starting at offset, into memory as a
// shared mapping and returns the mapped bytes. The mapping is readable, and
// also writable when writable is true. Release it with Munmap.
func Mmap(fd *os.File, writable bool, offset int64, size int64) ([]byte, error) {
	mtype := unix.PROT_READ
	if writable {
		mtype |= unix.PROT_WRITE
	}
	return unix.Mmap(int(fd.Fd()), offset, int(size), mtype, unix.MAP_SHARED)
}
20 | 
// Munmap releases a mapping previously returned by Mmap.
func Munmap(b []byte) error {
	return unix.Munmap(b)
}
24 | 
25 | func Madvise(b []byte, readahead bool) error {
26 | 	flags := unix.MADV_NORMAL
27 | 	if !readahead {
28 | 		flags = unix.MADV_RANDOM
29 | 	}
30 | 	return madvise(b, flags)
31 | }
32 | 
// madvise issues the madvise system call for the memory backing b with the
// given advice value and returns the resulting errno, if any. An empty slice
// has no memory to advise on, so it is a no-op that returns nil.
//
// This is required because the unix package does not support the madvise
// system call on OS X.
func madvise(b []byte, advice int) (err error) {
	if len(b) == 0 {
		// &b[0] below would panic on an empty slice.
		return nil
	}
	_, _, e1 := syscall.Syscall(syscall.SYS_MADVISE, uintptr(unsafe.Pointer(&b[0])),
		uintptr(len(b)), uintptr(advice))
	if e1 != 0 {
		err = e1
	}
	return
}
42 | 


--------------------------------------------------------------------------------
/internal/mmap/mmap_windows.go:
--------------------------------------------------------------------------------
 1 | // +build windows
 2 | 
 3 | package mmap
 4 | 
 5 | import (
 6 | 	"fmt"
 7 | 	"os"
 8 | 	"syscall"
 9 | 	"unsafe"
10 | )
11 | 
12 | func Mmap(fd *os.File, write bool, offset int64, size int64) ([]byte, error) {
13 | 	protect := syscall.PAGE_READONLY
14 | 	access := syscall.FILE_MAP_READ
15 | 
16 | 	if write {
17 | 		protect = syscall.PAGE_READWRITE
18 | 		access = syscall.FILE_MAP_WRITE
19 | 	}
20 | 	fi, err := fd.Stat()
21 | 	if err != nil {
22 | 		return nil, err
23 | 	}
24 | 
25 | 	if fi.Size() < size {
26 | 		if err := fd.Truncate(size); err != nil {
27 | 			return nil, fmt.Errorf("truncate: %s", err)
28 | 		}
29 | 	}
30 | 
31 | 	maxsize := size + offset
32 | 	maxsizehi := uint32(maxsize >> 32)
33 | 	maxsizelo := uint32(maxsize & 0xffffffff)
34 | 
35 | 	handle, err := syscall.CreateFileMapping(syscall.Handle(fd.Fd()), nil,
36 | 		uint32(protect), maxsizehi, maxsizelo, nil)
37 | 	if err != nil {
38 | 		return nil, os.NewSyscallError("CreateFileMapping", err)
39 | 	}
40 | 
41 | 	offsethi := uint32(offset >> 32)
42 | 	offsetlo := uint32(offset & 0xffffffff)
43 | 	addr, err := syscall.MapViewOfFile(handle, uint32(access), offsethi, offsetlo, uintptr(size))
44 | 	if addr == 0 {
45 | 		return nil, os.NewSyscallError("MapViewOfFile", err)
46 | 	}
47 | 
48 | 	if err := syscall.CloseHandle(syscall.Handle(handle)); err != nil {
49 | 		return nil, os.NewSyscallError("CloseHandle", err)
50 | 	}
51 | 
52 | 	// Slice memory layout
53 | 	// Copied this snippet from golang/sys package
54 | 	var sl = struct {
55 | 		addr uintptr
56 | 		len int
57 | 		cap int
58 | 	}{addr, int(size), int(size)}
59 | 
60 | 	// Use unsafe to turn sl into a []byte
61 | 	data := *(*[]byte)(unsafe.Pointer(&sl))
62 | 
63 | 	return data, nil
64 | }
65 | 
66 | func Munmap(b []byte) error {
67 | 	return syscall.UnmapViewOfFile(uintptr(unsafe.Pointer(&b[0])))
68 | }
69 | 
// Madvise is a no-op on Windows: both arguments are ignored and nil is
// always returned.
func Madvise(b []byte, readahead bool) error {
	// Do Nothing. We don't care about this setting on Windows
	return nil
}
74 | 


--------------------------------------------------------------------------------
/joinkatakanaoovplugin.go:
--------------------------------------------------------------------------------
  1 | package gosudachi
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 
  6 | 	"github.com/msnoigrs/gosudachi/dictionary"
  7 | )
  8 | 
// JoinKatakanaOovPluginConfig holds the optional settings read by
// JoinKatakanaOovPlugin.SetUp. A nil field means "not specified".
type JoinKatakanaOovPluginConfig struct {
	// OovPOS is the part-of-speech tuple looked up in the grammar for joined nodes.
	OovPOS    *[]string
	// MinLength is compared against a node's code point count; SetUp defaults it to 1.
	MinLength *int
}
 13 | 
// JoinKatakanaOovPlugin is a path rewriter that concatenates runs of
// katakana nodes into a single OOV node (see Rewrite).
type JoinKatakanaOovPlugin struct {
	// config is consumed by SetUp, which resets it to nil afterwards.
	config    *JoinKatakanaOovPluginConfig
	// oovPosId is the grammar id resolved from config.OovPOS.
	oovPosId  int16
	// minLength is the code point threshold passed to isShorter.
	minLength int
}
 19 | 
 20 | func NewJoinKatakanaOovPlugin(config *JoinKatakanaOovPluginConfig) *JoinKatakanaOovPlugin {
 21 | 	if config == nil {
 22 | 		config = &JoinKatakanaOovPluginConfig{}
 23 | 	}
 24 | 	return &JoinKatakanaOovPlugin{
 25 | 		config: config,
 26 | 	}
 27 | }
 28 | 
 29 | func (p *JoinKatakanaOovPlugin) GetConfigStruct() interface{} {
 30 | 	if p.config == nil {
 31 | 		p.config = &JoinKatakanaOovPluginConfig{}
 32 | 	}
 33 | 	return p.config
 34 | }
 35 | 
 36 | func (p *JoinKatakanaOovPlugin) SetUp(grammar *dictionary.Grammar) error {
 37 | 	if p.config.OovPOS == nil || len(*p.config.OovPOS) == 0 {
 38 | 		return fmt.Errorf("JoinKatakanaOovPlugin: oovPOS is not specified")
 39 | 	}
 40 | 	p.oovPosId = grammar.GetPartOfSpeechId(*p.config.OovPOS)
 41 | 	if p.oovPosId < 0 {
 42 | 		return fmt.Errorf("JoinKatakanaOovPlugin: oovPOS is invalid")
 43 | 	}
 44 | 	minLength := 1
 45 | 	if p.config.MinLength != nil {
 46 | 		minLength = *p.config.MinLength
 47 | 		if minLength < 0 {
 48 | 			return fmt.Errorf("JoinKatakanaOovPlugin: minLength is negative")
 49 | 		}
 50 | 	}
 51 | 	p.minLength = minLength
 52 | 	p.config = nil
 53 | 	return nil
 54 | }
 55 | 
 56 | func isShorter(length int, text *InputText, node *LatticeNode) bool {
 57 | 	return text.CodePointCount(node.Begin, node.End) < length
 58 | }
 59 | 
 60 | func isKatakanaNode(text *InputText, node *LatticeNode) bool {
 61 | 	types := GetCharCategoryTypes(text, node)
 62 | 	return (types & dictionary.KATAKANA) == dictionary.KATAKANA
 63 | }
 64 | 
 65 | func canOovBowNode(text *InputText, node *LatticeNode) bool {
 66 | 	types := GetCharCategoryTypes(text, node)
 67 | 	return types&dictionary.NOOOVBOW != dictionary.NOOOVBOW
 68 | }
 69 | 
// Rewrite scans path for katakana nodes that are OOV or shorter than
// minLength and merges each such node with its neighbouring katakana nodes
// into one OOV node via ConcatenateOov. path is modified in place. An error
// from ConcatenateOov is returned with a "JoinKatakanaOovPlugin: " prefix.
func (p *JoinKatakanaOovPlugin) Rewrite(text *InputText, path *[]*LatticeNode, lattice *Lattice) error {
	for i := 0; i < len(*path); i++ {
		node := (*path)[i]
		if (node.IsOov || isShorter(p.minLength, text, node)) &&
			isKatakanaNode(text, node) {
			// Walk left to the first node of the katakana run containing i.
			begin := i - 1
			for ; begin >= 0; begin-- {
				if !isKatakanaNode(text, (*path)[begin]) {
					begin++
					break
				}
			}
			// The run reached the start of the path without hitting a non-katakana node.
			if begin < 0 {
				begin = 0
			}
			// Walk right to one past the last node of the run.
			end := i + 1
			for ; end < len(*path); end++ {
				if !isKatakanaNode(text, (*path)[end]) {
					break
				}
			}
			// Drop leading nodes flagged NOOOVBOW from the run.
			for begin != end && !canOovBowNode(text, (*path)[begin]) {
				begin++
			}
			// Only a run of two or more nodes is worth concatenating.
			if end-begin > 1 {
				_, err := ConcatenateOov(path, begin, end, p.oovPosId, lattice)
				if err != nil {
					return fmt.Errorf("JoinKatakanaOovPlugin: %s", err)
				}
				// Continue after the merged node; the loop's i++ then skips the
				// node that ended the run, which is not katakana.
				i = begin + 1
			}
		}
	}
	return nil
}
105 | 


--------------------------------------------------------------------------------
/joinnumericplugin.go:
--------------------------------------------------------------------------------
  1 | package gosudachi
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 
  6 | 	"github.com/msnoigrs/gosudachi/dictionary"
  7 | )
  8 | 
// JoinNumericPluginConfig holds the optional settings read by
// JoinNumericPlugin.SetUp.
type JoinNumericPluginConfig struct {
	// EnableNormalize toggles normalized-form rewriting; nil is treated as true by SetUp.
	EnableNormalize *bool
}
 12 | 
// JoinNumericPlugin is a path rewriter that concatenates consecutive numeric
// nodes (see Rewrite).
type JoinNumericPlugin struct {
	// config is consumed by SetUp, which resets it to nil afterwards.
	config          *JoinNumericPluginConfig
	// enableNormalize selects whether joined nodes get the parser's normalized form.
	enableNormalize bool
	// numericPosId is the grammar id resolved from NumericPos.
	numericPosId    int16
}
 18 | 
 19 | func NewJoinNumericPlugin(config *JoinNumericPluginConfig) *JoinNumericPlugin {
 20 | 	if config == nil {
 21 | 		config = &JoinNumericPluginConfig{}
 22 | 	}
 23 | 	return &JoinNumericPlugin{
 24 | 		config: config,
 25 | 	}
 26 | }
 27 | 
 28 | func (p *JoinNumericPlugin) GetConfigStruct() interface{} {
 29 | 	if p.config == nil {
 30 | 		p.config = &JoinNumericPluginConfig{}
 31 | 	}
 32 | 	return p.config
 33 | }
 34 | 
 35 | func (p *JoinNumericPlugin) SetUp(grammar *dictionary.Grammar) error {
 36 | 	p.numericPosId = grammar.GetPartOfSpeechId(NumericPos)
 37 | 	if p.config.EnableNormalize == nil {
 38 | 		p.enableNormalize = true
 39 | 	} else {
 40 | 		p.enableNormalize = *p.config.EnableNormalize
 41 | 	}
 42 | 	p.config = nil
 43 | 	return nil
 44 | }
 45 | 
 46 | func (p *JoinNumericPlugin) concatNodes(path *[]*LatticeNode, begin int, end int, lattice *Lattice, parser *numericParser) error {
 47 | 	tpath := *path
 48 | 	wi := tpath[begin].GetWordInfo()
 49 | 	if wi.PosId != p.numericPosId {
 50 | 		return nil
 51 | 	}
 52 | 	if p.enableNormalize {
 53 | 		normalizedForm := parser.getNormalized()
 54 | 		if end-begin > 1 ||
 55 | 			normalizedForm != wi.NormalizedForm {
 56 | 			_, err := ConcatenateNodes(path, begin, end, lattice, normalizedForm)
 57 | 			if err != nil {
 58 | 				return err
 59 | 			}
 60 | 		}
 61 | 	} else {
 62 | 		if end-begin > 1 {
 63 | 			_, err := ConcatenateNodes(path, begin, end, lattice, "")
 64 | 			if err != nil {
 65 | 				return err
 66 | 			}
 67 | 		}
 68 | 	}
 69 | 	return nil
 70 | }
 71 | 
 72 | func (p *JoinNumericPlugin) Rewrite(text *InputText, path *[]*LatticeNode, lattice *Lattice) error {
 73 | 	beginIndex := -1
 74 | 	commaAsDigit := true
 75 | 	periodAsDigit := true
 76 | 	parser := newNumericParser()
 77 | 
 78 | 	for i := 0; i < len(*path); i++ {
 79 | 		node := (*path)[i]
 80 | 		types := GetCharCategoryTypes(text, node)
 81 | 		wi := node.GetWordInfo()
 82 | 		s := wi.NormalizedForm
 83 | 		if (types&dictionary.NUMERIC) == dictionary.NUMERIC ||
 84 | 			(types&dictionary.KANJINUMERIC) == dictionary.KANJINUMERIC ||
 85 | 			(periodAsDigit && s == ".") ||
 86 | 			(commaAsDigit && s == ",") {
 87 | 
 88 | 			if beginIndex < 0 {
 89 | 				parser.clear()
 90 | 				beginIndex = i
 91 | 			}
 92 | 
 93 | 			for _, c := range s {
 94 | 				if !parser.append(c) {
 95 | 					if beginIndex >= 0 {
 96 | 						if parser.errorState == errComma {
 97 | 							commaAsDigit = false
 98 | 							i = beginIndex - 1
 99 | 						} else if parser.errorState == errPoint {
100 | 							periodAsDigit = false
101 | 							i = beginIndex - 1
102 | 						}
103 | 						beginIndex = -1
104 | 					}
105 | 					break
106 | 				}
107 | 			}
108 | 		} else {
109 | 			if beginIndex >= 0 {
110 | 				if parser.done() {
111 | 					err := p.concatNodes(path, beginIndex, i, lattice, parser)
112 | 					if err != nil {
113 | 						return fmt.Errorf("JoinNumericPlugin: %s", err)
114 | 					}
115 | 					i = beginIndex + 1
116 | 				} else {
117 | 					wi := (*path)[i-1].GetWordInfo()
118 | 					ss := wi.NormalizedForm
119 | 					if (parser.errorState == errComma && ss == ",") ||
120 | 						(parser.errorState == errPoint && ss == ".") {
121 | 						err := p.concatNodes(path, beginIndex, i-1, lattice, parser)
122 | 						if err != nil {
123 | 							return fmt.Errorf("JoinNumericPlugin: %s", err)
124 | 						}
125 | 						i = beginIndex + 2
126 | 					}
127 | 				}
128 | 			}
129 | 			beginIndex = -1
130 | 			if !commaAsDigit && s != "," {
131 | 				commaAsDigit = true
132 | 			}
133 | 			if !periodAsDigit && s != "." {
134 | 				periodAsDigit = true
135 | 			}
136 | 		}
137 | 	}
138 | 
139 | 	if beginIndex >= 0 {
140 | 		if parser.done() {
141 | 			p.concatNodes(path, beginIndex, len(*path), lattice, parser)
142 | 		} else {
143 | 			wi := (*path)[len(*path)-1].GetWordInfo()
144 | 			ss := wi.NormalizedForm
145 | 			if (parser.errorState == errComma && ss == ",") ||
146 | 				(parser.errorState == errPoint && ss == ".") {
147 | 				p.concatNodes(path, beginIndex, len(*path)-1, lattice, parser)
148 | 			}
149 | 		}
150 | 	}
151 | 	return nil
152 | }
153 | 
// NumericPos is the six-element part-of-speech tuple JoinNumericPlugin looks
// up in the grammar.
var NumericPos = []string{
	"名詞",
	"数詞",
	"*",
	"*",
	"*",
	"*",
}
155 | 


--------------------------------------------------------------------------------
/lattice.go:
--------------------------------------------------------------------------------
  1 | package gosudachi
  2 | 
  3 | import (
  4 | 	"errors"
  5 | 	"fmt"
  6 | 	"io"
  7 | 	"math"
  8 | 	"strings"
  9 | 
 10 | 	"github.com/msnoigrs/gosudachi/dictionary"
 11 | )
 12 | 
const (
	// NullSurface is the placeholder string used for the text fields of
	// UndefinedWordInfo.
	NullSurface = "(null)"
)
 16 | 
 17 | var UndefinedWordInfo = &dictionary.WordInfo{
 18 | 		Surface: NullSurface,
 19 | 		HeadwordLength: 0,
 20 | 		PosId: -1,
 21 | 		NormalizedForm: NullSurface,
 22 | 		DictionaryFormWordId: -1,
 23 | 		DictionaryForm: NullSurface,
 24 | 		ReadingForm: NullSurface,
 25 | 	}
 26 | 
// LatticeNode is one candidate word in a Lattice, covering the input range
// [Begin, End).
type LatticeNode struct {
	Begin            int
	End              int
	// leftId and rightId are the connection ids passed to Grammar.GetConnectCost.
	leftId           int16
	rightId          int16
	// cost is the node's own cost; totalCost is filled in by Lattice.connectNode.
	cost             int16
	wordId           int32
	totalCost        int
	// bestPreviousNode is the cheapest predecessor chosen by Lattice.connectNode.
	bestPreviousNode *LatticeNode
	isConnectedToBOS bool
	// isDefined is false for nodes built without word information (GetWordInfo
	// then returns UndefinedWordInfo).
	isDefined        bool
	IsOov            bool
	// extraWordInfo, when non-nil, overrides the lexicon lookup in GetWordInfo.
	extraWordInfo    *dictionary.WordInfo
	lexicon          *dictionary.LexiconSet
}
 42 | 
 43 | func NewLatticeNode(lexicon *dictionary.LexiconSet, leftId int16, rightId int16, cost int16, wordId int32) *LatticeNode {
 44 | 	return &LatticeNode{
 45 | 		lexicon:   lexicon,
 46 | 		leftId:    leftId,
 47 | 		rightId:   rightId,
 48 | 		cost:      cost,
 49 | 		wordId:    wordId,
 50 | 		isDefined: true,
 51 | 	}
 52 | }
 53 | 
 54 | func (ln *LatticeNode) SetParameter(leftId int16, rightId int16, cost int16) {
 55 | 	ln.leftId = leftId
 56 | 	ln.rightId = rightId
 57 | 	ln.cost = cost
 58 | }
 59 | 
// GetBegin returns the node's Begin field.
func (ln *LatticeNode) GetBegin() int {
	return ln.Begin
}
 63 | 
// GetEnd returns the node's End field.
func (ln *LatticeNode) GetEnd() int {
	return ln.End
}
 67 | 
 68 | func (ln *LatticeNode) SetRange(begin int, end int) {
 69 | 	ln.Begin = begin
 70 | 	ln.End = end
 71 | }
 72 | 
// IsOOV returns the node's IsOov flag.
func (ln *LatticeNode) IsOOV() bool {
	return ln.IsOov
}
 76 | 
// SetOOV sets the node's IsOov flag to true.
func (ln *LatticeNode) SetOOV() {
	ln.IsOov = true
}
 80 | 
 81 | func (ln *LatticeNode) GetWordInfo() *dictionary.WordInfo {
 82 | 	if !ln.isDefined {
 83 | 		return UndefinedWordInfo
 84 | 	}
 85 | 	if ln.extraWordInfo != nil {
 86 | 		return ln.extraWordInfo
 87 | 	}
 88 | 	return ln.lexicon.GetWordInfo(ln.wordId)
 89 | }
 90 | 
 91 | func (ln *LatticeNode) SetWordInfo(wordInfo *dictionary.WordInfo) {
 92 | 	ln.extraWordInfo = wordInfo
 93 | 	ln.isDefined = true
 94 | }
 95 | 
// GetPathCost returns the node's cost field as an int (not totalCost).
func (ln *LatticeNode) GetPathCost() int {
	return int(ln.cost)
}
 99 | 
// GetWordId returns the node's word id, reinterpreted as an unsigned 32-bit
// value before widening to int.
func (ln *LatticeNode) GetWordId() int {
	return int(uint32(ln.wordId))
}
103 | 
104 | func (ln *LatticeNode) GetDictionaryId() int {
105 | 	if !ln.isDefined || ln.extraWordInfo != nil {
106 | 		return -1
107 | 	}
108 | 	return ln.lexicon.GetDictionaryId(ln.wordId)
109 | }
110 | 
111 | func (ln *LatticeNode) String() string {
112 | 	var (
113 | 		surface string
114 | 		pos     int16
115 | 	)
116 | 
117 | 	wi := ln.GetWordInfo()
118 | 	surface = wi.Surface
119 | 	pos = wi.PosId
120 | 
121 | 	return fmt.Sprintf("%d %d %s(%d) %d %d %d %d", ln.Begin, ln.End, surface, ln.wordId, pos, ln.leftId, ln.rightId, ln.cost)
122 | }
123 | 
// Lattice stores candidate nodes indexed by their end position and links
// them into least-cost paths.
type Lattice struct {
	// endLists[i] holds the nodes whose End is i; endLists[0] holds the BOS node.
	endLists  [][]*LatticeNode
	// eosNode is created by resize.
	eosNode   *LatticeNode
	grammar   *dictionary.Grammar
	// eosParams are the left id, right id and cost applied to eosNode.
	eosParams []int16
}
130 | 
131 | func NewLattice(grammar *dictionary.Grammar) *Lattice {
132 | 	bosNode := &LatticeNode{}
133 | 	bosParams := dictionary.BosParameter
134 | 	bosNode.SetParameter(bosParams[0], bosParams[1], bosParams[2])
135 | 	bosNode.isConnectedToBOS = true
136 | 	endLists := make([][]*LatticeNode, 1)
137 | 	singletonList := make([]*LatticeNode, 1)
138 | 	singletonList[0] = bosNode
139 | 	endLists[0] = singletonList
140 | 	return &Lattice{
141 | 		endLists:  endLists,
142 | 		grammar:   grammar,
143 | 		eosParams: dictionary.EosParameter,
144 | 	}
145 | }
146 | 
147 | func (l *Lattice) resize(size int) {
148 | 	if size > len(l.endLists)-1 {
149 | 		l.expand(size)
150 | 	}
151 | 	l.eosNode = &LatticeNode{}
152 | 	l.eosNode.SetParameter(l.eosParams[0], l.eosParams[1], l.eosParams[2])
153 | 	l.eosNode.Begin = size
154 | 	l.eosNode.End = size
155 | }
156 | 
157 | func (l *Lattice) clear() {
158 | 	for i := 1; i < len(l.endLists); i++ {
159 | 		l.endLists[i] = l.endLists[i][:0]
160 | 	}
161 | }
162 | 
163 | func (l *Lattice) expand(newSize int) {
164 | 	reallen := newSize + 1
165 | 	oldlen := len(l.endLists)
166 | 	if oldlen < reallen {
167 | 		l.endLists = append(l.endLists, make([][]*LatticeNode, reallen-oldlen)...)
168 | 		for i := oldlen; i < reallen; i++ {
169 | 			l.endLists[i] = []*LatticeNode{}
170 | 		}
171 | 	}
172 | }
173 | 
// GetNodesWithEnd returns the lattice's own end list for position end (not a
// copy).
func (l *Lattice) GetNodesWithEnd(end int) []*LatticeNode {
	return l.endLists[end]
}
177 | 
178 | func (l *Lattice) GetNodes(begin int, end int) []*LatticeNode {
179 | 	ret := make([]*LatticeNode, 0)
180 | 	for _, n := range l.endLists[end] {
181 | 		if n.Begin == begin {
182 | 			ret = append(ret, n)
183 | 		}
184 | 	}
185 | 	return ret
186 | }
187 | 
188 | func (l *Lattice) GetMinimumNode(begin int, end int) *LatticeNode {
189 | 	var (
190 | 		ret     *LatticeNode
191 | 		mincost int16
192 | 	)
193 | 	for _, n := range l.endLists[end] {
194 | 		if n.Begin == begin {
195 | 			if ret == nil || mincost > n.cost {
196 | 				ret = n
197 | 				mincost = n.cost
198 | 			}
199 | 		}
200 | 	}
201 | 	return ret
202 | }
203 | 
204 | func (l *Lattice) Insert(begin int, end int, node *LatticeNode) {
205 | 	l.endLists[end] = append(l.endLists[end], node)
206 | 	node.Begin = begin
207 | 	node.End = end
208 | 
209 | 	l.connectNode(node)
210 | }
211 | 
212 | func (l *Lattice) Remove(begin int, end int, node *LatticeNode) {
213 | 	t := l.endLists[end]
214 | 	for i, n := range t {
215 | 		if n == node {
216 | 			if len(t) > 1 {
217 | 				copy(t[i:], t[i+1:])
218 | 			}
219 | 			t[len(t)-1] = nil
220 | 			l.endLists[end] = t[:len(t)-1]
221 | 		}
222 | 	}
223 | }
224 | 
// HasPreviousNode reports whether at least one node ends at index.
func (l *Lattice) HasPreviousNode(index int) bool {
	return len(l.endLists[index]) > 0
}
228 | 
// connectNode picks the cheapest predecessor for rNode among the nodes that
// end where rNode begins and are themselves connected to BOS. It stores that
// predecessor in bestPreviousNode, sets isConnectedToBOS accordingly, and
// sets totalCost to the predecessor's total cost plus the connection cost
// plus rNode's own cost.
func (l *Lattice) connectNode(rNode *LatticeNode) {
	begin := rNode.Begin
	// Sentinel: any reachable predecessor is expected to beat this value.
	rNode.totalCost = math.MaxInt32
	for _, lNode := range l.endLists[begin] {
		if !lNode.isConnectedToBOS {
			continue
		}
		connectCost := l.grammar.GetConnectCost(lNode.rightId, rNode.leftId)
		if connectCost == dictionary.InhibitedConnection {
			continue // this connection is not allowed
		}
		cost := lNode.totalCost + int(connectCost)
		if cost < rNode.totalCost {
			rNode.totalCost = cost
			rNode.bestPreviousNode = lNode
		}
	}
	// NOTE(review): bestPreviousNode is not reset before the loop, so a value
	// left over from an earlier call also counts as "connected" — confirm
	// nodes are never reconnected.
	rNode.isConnectedToBOS = rNode.bestPreviousNode != nil
	rNode.totalCost += int(rNode.cost)
}
249 | 
// connectEosNode runs connectNode on the lattice's EOS node.
func (l *Lattice) connectEosNode() {
	l.connectNode(l.eosNode)
}
253 | 
254 | func (l *Lattice) GetBestPath() ([]*LatticeNode, error) {
255 | 	if !l.eosNode.isConnectedToBOS { // EOS node
256 | 		return nil, errors.New("EOS isn't connected to BOS")
257 | 	}
258 | 	ret := make([]*LatticeNode, 0)
259 | 	for node := l.eosNode.bestPreviousNode; node != l.endLists[0][0]; node = node.bestPreviousNode {
260 | 		ret = append(ret, node)
261 | 	}
262 | 
263 | 	if len(ret) > 1 {
264 | 		// reverse
265 | 		for i := len(ret)/2 - 1; i >= 0; i-- {
266 | 			opp := len(ret) - 1 - i
267 | 			ret[i], ret[opp] = ret[opp], ret[i]
268 | 		}
269 | 	}
270 | 	return ret, nil
271 | }
272 | 
// Dump writes one line per node to w, starting with the EOS node and then
// walking the end lists from the last position down to 0. Each line has a
// running index, the node's range, surface, word id, part of speech, ids and
// cost, followed by the connection cost from every node that ends where this
// node begins.
func (l *Lattice) Dump(w io.Writer) {
	index := 0
	// i == len(l.endLists) is a virtual slot that stands for the EOS node.
	for i := len(l.endLists); i >= 0; i-- {
		var rNodes []*LatticeNode
		if i <= len(l.endLists)-1 {
			rNodes = l.endLists[i]
		} else {
			rNodes = []*LatticeNode{l.eosNode}
		}
		for _, rNode := range rNodes {
			var (
				surface, pos string
			)
			// Nodes without word information are printed as BOS/EOS.
			if !rNode.isDefined {
				surface = "(null)"
				pos = "BOS/EOS"
			} else {
				wi := rNode.GetWordInfo()
				surface = wi.Surface
				posId := wi.PosId
				if posId < 0 {
					pos = "(null)"
				} else {
					pos = strings.Join(l.grammar.GetPartOfSpeechString(posId), ",")
				}
			}

			fmt.Fprintf(w, "%d: %d %d %s(%d) %s %d %d %d: ", index, rNode.Begin, rNode.End, surface, rNode.wordId, pos, rNode.leftId, rNode.rightId, rNode.cost)
			index++

			// Connection costs from every possible left neighbour.
			for _, lNode := range l.endLists[rNode.Begin] {
				cost := l.grammar.GetConnectCost(lNode.rightId, rNode.leftId)
				fmt.Fprintf(w, "%d ", cost)
			}
			fmt.Fprintln(w, "")
		}
	}
}
311 | 


--------------------------------------------------------------------------------
/mecaboovproviderplugin.go:
--------------------------------------------------------------------------------
  1 | package gosudachi
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"io"
  6 | 	"os"
  7 | 	"strconv"
  8 | 	"strings"
  9 | 
 10 | 	"github.com/msnoigrs/gosudachi/data"
 11 | 	"github.com/msnoigrs/gosudachi/dictionary"
 12 | 	"github.com/msnoigrs/gosudachi/internal/lnreader"
 13 | )
 14 | 
// categoryInfo is one category definition parsed from char.def by
// readCharacterProperty.
type categoryInfo struct {
	catType  uint32
	// isInvoke and isGroup come from columns 2 and 3 of char.def ("0" means false).
	isInvoke bool
	isGroup  bool
	// length bounds the number of code points tried per OOV candidate in ProvideOOV.
	length   int
}
 21 | 
// oov is one entry parsed from unk.def by readOov: the connection ids, cost
// and part-of-speech id given to generated OOV nodes.
type oov struct {
	leftId  int16
	rightId int16
	cost    int16
	posId   int16
}
 28 | 
// MeCabOovProviderPluginConfig holds the optional file paths read by
// MeCabOovProviderPlugin.SetUp. A nil or empty path selects the definition
// bundled in data.Assets.
type MeCabOovProviderPluginConfig struct {
	CharDef *string
	UnkDef  *string
}
 33 | 
// MeCabOovProviderPlugin generates OOV nodes from char.def / unk.def style
// definitions.
type MeCabOovProviderPlugin struct {
	// config is consumed by SetUp, which resets it to nil afterwards.
	config     *MeCabOovProviderPluginConfig
	// categories and oovList are both keyed by category type.
	categories map[uint32]*categoryInfo
	oovList    map[uint32]*[]*oov
}
 39 | 
 40 | func NewMeCabOovProviderPlugin(config *MeCabOovProviderPluginConfig) *MeCabOovProviderPlugin {
 41 | 	if config == nil {
 42 | 		config = &MeCabOovProviderPluginConfig{}
 43 | 	}
 44 | 	return &MeCabOovProviderPlugin{
 45 | 		config:     config,
 46 | 		categories: map[uint32]*categoryInfo{},
 47 | 		oovList:    map[uint32]*[]*oov{},
 48 | 	}
 49 | }
 50 | 
 51 | func (p *MeCabOovProviderPlugin) GetConfigStruct() interface{} {
 52 | 	if p.config == nil {
 53 | 		p.config = &MeCabOovProviderPluginConfig{}
 54 | 	}
 55 | 	return p.config
 56 | }
 57 | 
 58 | func (p *MeCabOovProviderPlugin) SetUp(grammar *dictionary.Grammar) error {
 59 | 	if p.config.CharDef == nil {
 60 | 		zstr := ""
 61 | 		p.config.CharDef = &zstr
 62 | 	}
 63 | 	if p.config.UnkDef == nil {
 64 | 		zstr := ""
 65 | 		p.config.UnkDef = &zstr
 66 | 	}
 67 | 	if p.categories == nil {
 68 | 		p.categories = map[uint32]*categoryInfo{}
 69 | 	}
 70 | 	if p.oovList == nil {
 71 | 		p.oovList = map[uint32]*[]*oov{}
 72 | 	}
 73 | 	err := p.readCharacterProperty(*p.config.CharDef)
 74 | 	if err != nil {
 75 | 		return fmt.Errorf("MeCabOovProviderPlugin: %s", err)
 76 | 	}
 77 | 	err = p.readOov(*p.config.UnkDef, grammar)
 78 | 	if err != nil {
 79 | 		return fmt.Errorf("MeCabOovProviderPlugin: %s", err)
 80 | 	}
 81 | 	p.config = nil
 82 | 	return nil
 83 | }
 84 | 
 85 | func (p *MeCabOovProviderPlugin) ProvideOOV(inputText *InputText, offset int, hasOtherWords bool) ([]*LatticeNode, error) {
 86 | 	nodes := []*LatticeNode{}
 87 | 	length := inputText.GetCharCategoryContinuousLength(offset)
 88 | 	if length > 0 {
 89 | 		catTypes := inputText.GetCharCategoryTypes(offset)
 90 | 		for t := dictionary.DEFAULT; t <= dictionary.NOOOVBOW; t *= 2 {
 91 | 			if (catTypes & t) != t {
 92 | 				continue
 93 | 			}
 94 | 			cinfo, ok := p.categories[t]
 95 | 			if !ok {
 96 | 				continue
 97 | 			}
 98 | 			llength := length
 99 | 			oovs, ok := p.oovList[t]
100 | 			if !ok {
101 | 				continue
102 | 			}
103 | 			if cinfo.isGroup && (cinfo.isInvoke || !hasOtherWords) {
104 | 				s := inputText.GetSubstring(offset, offset+length)
105 | 				for _, oov := range *oovs {
106 | 					nodes = append(nodes, p.getOovNode(s, oov, length))
107 | 				}
108 | 				llength -= 1
109 | 			}
110 | 			if cinfo.isInvoke || !hasOtherWords {
111 | 				for i := 1; i <= cinfo.length; i++ {
112 | 					sublength := inputText.GetCodePointsOffsetLength(offset, i)
113 | 					if sublength > llength {
114 | 						break
115 | 					}
116 | 					s := inputText.GetSubstring(offset, offset+sublength)
117 | 					for _, oov := range *oovs {
118 | 						nodes = append(nodes, p.getOovNode(s, oov, sublength))
119 | 					}
120 | 				}
121 | 			}
122 | 		}
123 | 	}
124 | 	return nodes, nil
125 | }
126 | 
127 | func (p *MeCabOovProviderPlugin) getOovNode(text string, oov *oov, length int) *LatticeNode {
128 | 	node := CreateNodeOfOOV()
129 | 	node.SetParameter(oov.leftId, oov.rightId, oov.cost)
130 | 	wi := &dictionary.WordInfo{
131 | 		Surface:        text,
132 | 		HeadwordLength: int16(length),
133 | 		PosId:          oov.posId,
134 | 		NormalizedForm: text,
135 | 		DictionaryForm: text,
136 | 		ReadingForm:    "",
137 | 	}
138 | 	node.SetWordInfo(wi)
139 | 	return node
140 | }
141 | 
// readCharacterProperty parses category definitions into p.categories. The
// definitions are read from the file at charDef, or from the bundled
// "char.def" asset when charDef is empty.
//
// Each definition line must have at least four whitespace-separated columns:
// category name, invoke flag, group flag and length. A flag is true unless
// the column is "0". It returns an error for an unreadable source, a
// malformed line, an unknown or duplicate category, or a non-numeric length.
func (p *MeCabOovProviderPlugin) readCharacterProperty(charDef string) error {
	var charDefReader io.Reader
	if charDef != "" {
		charDefFd, err := os.OpenFile(charDef, os.O_RDONLY, 0644)
		if err != nil {
			return fmt.Errorf("%s: %s", err, charDef)
		}
		defer charDefFd.Close()
		charDefReader = charDefFd
	} else {
		charDefF, err := data.Assets.Open("char.def")
		if err != nil {
			return fmt.Errorf("%s: (data.Assets)char.def", err)
		}
		defer charDefF.Close()
		charDefReader = charDefF
	}

	r := lnreader.NewLineNumberReader(charDefReader)
	for {
		line, err := r.ReadLine()
		if err == io.EOF {
			break
		}
		if err != nil {
			return err
		}
		if lnreader.IsSkipLine(line) {
			continue
		}
		// Lines starting with "0x" are not category definitions and are
		// ignored here (presumably code point mappings — verify against the
		// char.def format).
		if len(line) > 2 && line[0] == '0' && line[1] == 'x' {
			continue
		}
		cols := strings.Fields(string(line))
		if len(cols) < 4 {
			return fmt.Errorf("char.def: invalid format at line %d", r.NumLine)
		}
		catType, err := dictionary.GetCategoryType(cols[0])
		if err != nil {
			return fmt.Errorf("char.def: %s is invalid type at line %d", cols[0], r.NumLine)
		}
		_, ok := p.categories[catType]
		if ok {
			return fmt.Errorf("char.def: %s is already defined at line %d", cols[0], r.NumLine)
		}
		l, err := strconv.Atoi(cols[3])
		if err != nil {
			return fmt.Errorf("char.def: %s is invalid number at line %d", cols[3], r.NumLine)
		}
		catinfo := &categoryInfo{
			catType:  catType,
			isInvoke: cols[1] != "0",
			isGroup:  cols[2] != "0",
			length:   l,
		}
		p.categories[catType] = catinfo
	}
	return nil
}
201 | 
// readOov parses OOV entries into p.oovList. The entries are read from the
// file at unkDef, or from the bundled "unk.def" asset when unkDef is empty.
//
// Every line must have at least ten comma-separated columns: category name,
// left id, right id, cost, and six part-of-speech elements. The category must
// already be present in p.categories, so readCharacterProperty has to run
// first. It returns an error for an unreadable source, a malformed line, an
// unknown or undefined category, a number that does not fit in 16 bits, or a
// part of speech the grammar does not know.
//
// NOTE(review): unlike readCharacterProperty, no lines are skipped here, so a
// blank or comment line is reported as an invalid format — confirm that is
// intended.
func (p *MeCabOovProviderPlugin) readOov(unkDef string, grammar *dictionary.Grammar) error {
	var unkDefReader io.Reader
	if unkDef != "" {
		unkDefFd, err := os.OpenFile(unkDef, os.O_RDONLY, 0644)
		if err != nil {
			return err
		}
		defer unkDefFd.Close()
		unkDefReader = unkDefFd
	} else {
		unkDefF, err := data.Assets.Open("unk.def")
		if err != nil {
			return err
		}
		defer unkDefF.Close()
		unkDefReader = unkDefF
	}

	r := lnreader.NewLineNumberReader(unkDefReader)
	for {
		line, err := r.ReadLine()
		if err == io.EOF {
			break
		}
		if err != nil {
			return err
		}
		cols := strings.Split(string(line), ",")
		if len(cols) < 10 {
			return fmt.Errorf("unk.def: invalid format at line %d", r.NumLine)
		}
		catType, err := dictionary.GetCategoryType(cols[0])
		if err != nil {
			return fmt.Errorf("unk.def: %s is invalid type at line %d", cols[0], r.NumLine)
		}
		_, ok := p.categories[catType]
		if !ok {
			return fmt.Errorf("unk.def: %s is undefined at line %d", cols[0], r.NumLine)
		}

		leftId, err := strconv.ParseInt(cols[1], 10, 16)
		if err != nil {
			return fmt.Errorf("unk.def: %s is invalid number at line %d", cols[1], r.NumLine)
		}
		rightId, err := strconv.ParseInt(cols[2], 10, 16)
		if err != nil {
			return fmt.Errorf("unk.def: %s is invalid number at line %d", cols[2], r.NumLine)
		}
		cost, err := strconv.ParseInt(cols[3], 10, 16)
		if err != nil {
			return fmt.Errorf("unk.def: %s is invalid number at line %d", cols[3], r.NumLine)
		}
		pos := []string{cols[4], cols[5], cols[6], cols[7], cols[8], cols[9]}
		posId := grammar.GetPartOfSpeechId(pos)
		if posId == -1 {
			return fmt.Errorf("unk.def: unknown Part Of Speech at line %d", r.NumLine)
		}
		poov := &oov{
			leftId:  int16(leftId),
			rightId: int16(rightId),
			cost:    int16(cost),
			posId:   posId,
		}

		// The map stores a pointer to the slice so appends are visible
		// through the existing map entry.
		l, ok := p.oovList[catType]
		if !ok {
			ll := []*oov{}
			l = &ll
			p.oovList[catType] = l
		}
		*l = append(*l, poov)
	}
	return nil
}
276 | 


--------------------------------------------------------------------------------
/morpheme.go:
--------------------------------------------------------------------------------
  1 | package gosudachi
  2 | 
  3 | import (
  4 | 	"github.com/msnoigrs/gosudachi/dictionary"
  5 | )
  6 | 
// Morpheme is a view of one entry of a MorphemeList, identified by its index.
type Morpheme struct {
	list     *MorphemeList
	index    int
	// wordInfo caches the result of the first GetWordInfo call.
	wordInfo *dictionary.WordInfo
}
 12 | 
 13 | func newMorpheme(list *MorphemeList, index int) *Morpheme {
 14 | 	return &Morpheme{
 15 | 		list:  list,
 16 | 		index: index,
 17 | 	}
 18 | }
 19 | 
// Begin returns the list's GetBegin value for this morpheme.
func (m *Morpheme) Begin() int {
	return m.list.GetBegin(m.index)
}
 23 | 
// End returns the list's GetEnd value for this morpheme.
func (m *Morpheme) End() int {
	return m.list.GetEnd(m.index)
}
 27 | 
// Surface returns the list's GetSurface value for this morpheme.
func (m *Morpheme) Surface() string {
	return m.list.GetSurface(m.index)
}
 31 | 
 32 | func (m *Morpheme) PartOfSpeech() []string {
 33 | 	wi := m.GetWordInfo()
 34 | 	return m.list.grammar.GetPartOfSpeechString(wi.PosId)
 35 | }
 36 | 
 37 | func (m *Morpheme) DictionaryForm() string {
 38 | 	wi := m.GetWordInfo()
 39 | 	return wi.DictionaryForm
 40 | }
 41 | 
 42 | func (m *Morpheme) NormalizedForm() string {
 43 | 	wi := m.GetWordInfo()
 44 | 	return wi.NormalizedForm
 45 | }
 46 | 
 47 | func (m *Morpheme) ReadingForm() string {
 48 | 	wi := m.GetWordInfo()
 49 | 	return wi.ReadingForm
 50 | }
 51 | 
 52 | func (m *Morpheme) Split(mode string) *MorphemeList {
 53 | 	wi := m.GetWordInfo()
 54 | 	return m.list.Split(mode, m.index, wi)
 55 | }
 56 | 
// IsOOV returns the list's IsOOV value for this morpheme.
func (m *Morpheme) IsOOV() bool {
	return m.list.IsOOV(m.index)
}
 60 | 
// GetWordId returns the list's GetWordId value for this morpheme.
func (m *Morpheme) GetWordId() int {
	return m.list.GetWordId(m.index)
}
 64 | 
// GetDictionaryId returns the list's GetDictionaryId value for this morpheme.
func (m *Morpheme) GetDictionaryId() int {
	return m.list.GetDictionaryId(m.index)
}
 68 | 
 69 | func (m *Morpheme) GetWordInfo() *dictionary.WordInfo {
 70 | 	if m.wordInfo == nil {
 71 | 		wordInfo := m.list.GetWordInfo(m.index)
 72 | 		m.wordInfo = wordInfo
 73 | 	}
 74 | 	return m.wordInfo
 75 | }
 76 | 
// MorphemeList is a sequence of lattice nodes (path) over inputText, together
// with the grammar and lexicon needed to describe them.
type MorphemeList struct {
	inputText *InputText
	grammar   *dictionary.Grammar
	lexicon   *dictionary.LexiconSet
	path      []*LatticeNode
}
 83 | 
 84 | func NewMorphemeList(inputText *InputText, grammar *dictionary.Grammar, lexicon *dictionary.LexiconSet, path []*LatticeNode) *MorphemeList {
 85 | 	return &MorphemeList{
 86 | 		inputText: inputText,
 87 | 		grammar:   grammar,
 88 | 		lexicon:   lexicon,
 89 | 		path:      path,
 90 | 	}
 91 | }
 92 | 
// Length returns the number of nodes in the path.
func (l *MorphemeList) Length() int {
	return len(l.path)
}
 96 | 
// Get returns a new Morpheme for entry index. The index is not validated here.
func (l *MorphemeList) Get(index int) *Morpheme {
	return newMorpheme(l, index)
}
100 | 
// GetBegin maps the Begin of node index through the input text's
// GetOriginalIndex.
func (l *MorphemeList) GetBegin(index int) int {
	return l.inputText.GetOriginalIndex(l.path[index].Begin)
}
104 | 
// GetEnd maps the End of node index through the input text's
// GetOriginalIndex.
func (l *MorphemeList) GetEnd(index int) int {
	return l.inputText.GetOriginalIndex(l.path[index].End)
}
108 | 
109 | func (l *MorphemeList) GetSurface(index int) string {
110 | 	begin := l.GetBegin(index)
111 | 	end := l.GetEnd(index)
112 | 	return string([]rune(l.inputText.OriginalText)[begin:end])
113 | }
114 | 
// GetWordInfo returns the word info of node index.
func (l *MorphemeList) GetWordInfo(index int) *dictionary.WordInfo {
	return l.path[index].GetWordInfo()
}
118 | 
119 | func (l *MorphemeList) Split(mode string, index int, wi *dictionary.WordInfo) *MorphemeList {
120 | 	var wordIds []int32
121 | 	switch mode {
122 | 	case "A":
123 | 		wordIds = wi.AUnitSplit
124 | 	case "B":
125 | 		wordIds = wi.BUnitSplit
126 | 	default:
127 | 		return NewMorphemeList(l.inputText, l.grammar, l.lexicon, []*LatticeNode{l.path[index]})
128 | 	}
129 | 	if len(wordIds) == 0 || len(wordIds) == 1 {
130 | 		return NewMorphemeList(l.inputText, l.grammar, l.lexicon, []*LatticeNode{l.path[index]})
131 | 	}
132 | 
133 | 	offset := l.path[index].Begin
134 | 	nodes := make([]*LatticeNode, len(wordIds), len(wordIds))
135 | 	for i, wid := range wordIds {
136 | 		n := NewLatticeNode(l.lexicon, 0, 0, 0, wid)
137 | 		n.Begin = offset
138 | 		wi := n.GetWordInfo()
139 | 		offset += int(wi.HeadwordLength)
140 | 		n.End = offset
141 | 		nodes[i] = n
142 | 	}
143 | 
144 | 	return NewMorphemeList(l.inputText, l.grammar, l.lexicon, nodes)
145 | }
146 | 
// IsOOV returns the IsOOV value of node index.
func (l *MorphemeList) IsOOV(index int) bool {
	return l.path[index].IsOOV()
}
150 | 
// GetWordId returns the word id of node index.
func (l *MorphemeList) GetWordId(index int) int {
	return l.path[index].GetWordId()
}
154 | 
// GetDictionaryId returns the dictionary id of node index.
func (l *MorphemeList) GetDictionaryId(index int) int {
	return l.path[index].GetDictionaryId()
}
158 | 
159 | func (l *MorphemeList) GetInternalCost() int {
160 | 	return l.path[len(l.path)-1].GetPathCost() - l.path[0].GetPathCost()
161 | }
162 | 


--------------------------------------------------------------------------------
/numericparser.go:
--------------------------------------------------------------------------------
  1 | package gosudachi
  2 | 
// errState is the error condition recorded in numericParser.errorState.
type errState int

const (
	// errNone is the zero value.
	errNone errState = iota
	// errPoint and errComma are checked by JoinNumericPlugin.Rewrite together
	// with the "." and "," separators respectively.
	errPoint
	errComma
	// errOther presumably covers every remaining failure — confirm against
	// the parser's append logic.
	errOther
)
 11 | 
// numericParser holds the state of the numeric parser used by
// JoinNumericPlugin: bookkeeping flags, the last error state, and three
// stringNumber accumulators.
type numericParser struct {
	digitLength     int
	isFirstDigit    bool
	hasComma        bool
	hasHangingPoint bool
	// errorState is read by callers after a failed append or done.
	errorState      errState
	total           *stringNumber
	subtotal        *stringNumber
	tmp             *stringNumber
}
 22 | 
 23 | func newNumericParser() *numericParser {
 24 | 	return &numericParser{
 25 | 		isFirstDigit: true,
 26 | 		total:        newStringNumber(),
 27 | 		subtotal:     newStringNumber(),
 28 | 		tmp:          newStringNumber(),
 29 | 	}
 30 | }
 31 | 
// stringNumber is a decimal number kept as a sequence of digit runes.
type stringNumber struct {
	// significand holds the digits appended so far.
	significand []rune
	// scale is a pending power-of-ten shift accumulated by shiftScale.
	scale       int
	// point is the index of the decimal point within significand, or -1 when unset.
	point       int
	// IsAllZero stays true until a non-zero digit is appended.
	IsAllZero   bool
}
 38 | 
 39 | func newStringNumber() *stringNumber {
 40 | 	return &stringNumber{
 41 | 		point:     -1,
 42 | 		IsAllZero: true,
 43 | 	}
 44 | }
 45 | 
 46 | func (n *stringNumber) clear() {
 47 | 	n.significand = n.significand[:0]
 48 | 	n.scale = 0
 49 | 	n.point = -1
 50 | 	n.IsAllZero = true
 51 | }
 52 | 
 53 | func (n *stringNumber) append(i int) {
 54 | 	if i != 0 {
 55 | 		n.IsAllZero = false
 56 | 	}
 57 | 	n.significand = append(n.significand, intToRune(i))
 58 | }
 59 | 
 60 | func (n *stringNumber) shiftScale(i int) {
 61 | 	if len(n.significand) == 0 {
 62 | 		n.significand = append(n.significand, '1')
 63 | 	}
 64 | 	n.scale += i
 65 | }
 66 | 
 67 | func (n *stringNumber) add(t *stringNumber) bool {
 68 | 	if len(t.significand) == 0 {
 69 | 		return true
 70 | 	}
 71 | 
 72 | 	if len(n.significand) == 0 {
 73 | 		n.significand = append(n.significand, t.significand...)
 74 | 		n.scale = t.scale
 75 | 		n.point = t.point
 76 | 		return true
 77 | 	}
 78 | 
 79 | 	l := t.intLength()
 80 | 	if n.scale >= l {
 81 | 		n.fillZero(n.scale - l)
 82 | 		if t.point >= 0 {
 83 | 			n.point = len(n.significand) + t.point
 84 | 		}
 85 | 		_ = t.String()
 86 | 		n.significand = append(n.significand, t.significand...)
 87 | 		n.scale = t.scale
 88 | 		return true
 89 | 	}
 90 | 
 91 | 	return false
 92 | }
 93 | 
 94 | func (n *stringNumber) setPoint() bool {
 95 | 	if n.scale == 0 && n.point < 0 {
 96 | 		n.point = len(n.significand)
 97 | 		return true
 98 | 	}
 99 | 	return false
100 | }
101 | 
102 | func (n *stringNumber) intLength() int {
103 | 	n.normalizeScale()
104 | 	if n.point >= 0 {
105 | 		return n.point
106 | 	}
107 | 	return len(n.significand) + n.scale
108 | }
109 | 
110 | func (n *stringNumber) isZero() bool {
111 | 	return len(n.significand) == 0
112 | }
113 | 
114 | func (n *stringNumber) String() string {
115 | 	if len(n.significand) == 0 {
116 | 		return "0"
117 | 	}
118 | 
119 | 	n.normalizeScale()
120 | 	if n.scale > 0 {
121 | 		n.fillZero(n.scale)
122 | 	} else if n.point >= 0 {
123 | 		if n.point == 0 {
124 | 			n.significand = append(n.significand, []rune{0, 0}...)
125 | 			copy(n.significand[2:], n.significand[:len(n.significand)-2])
126 | 			n.significand[0] = '0'
127 | 			n.significand[1] = '.'
128 | 		} else {
129 | 			n.significand = append(n.significand, rune(0))
130 | 			copy(n.significand[n.point+1:], n.significand[n.point:])
131 | 			n.significand[n.point] = '.'
132 | 		}
133 | 		i := len(n.significand) - 1
134 | 		j := 0
135 | 		for i >= 0 && n.significand[i] == '0' {
136 | 			i--
137 | 			j++
138 | 		}
139 | 		if n.significand[i] == '.' {
140 | 			i--
141 | 			j++
142 | 		}
143 | 		if j > 0 {
144 | 			n.significand = n.significand[:i+1]
145 | 		}
146 | 	}
147 | 
148 | 	return string(n.significand)
149 | }
150 | 
151 | func (n *stringNumber) normalizeScale() {
152 | 	if n.point >= 0 {
153 | 		nScale := len(n.significand) - n.point
154 | 		if nScale > n.scale {
155 | 			n.point += n.scale
156 | 			n.scale = 0
157 | 		} else {
158 | 			n.scale -= nScale
159 | 			n.point = -1
160 | 		}
161 | 	}
162 | }
163 | 
164 | func (n *stringNumber) fillZero(length int) {
165 | 	for i := 0; i < length; i++ {
166 | 		n.significand = append(n.significand, '0')
167 | 	}
168 | }
169 | 
170 | func intToRune(i int) rune {
171 | 	return rune(int32('0') + int32(i))
172 | }
173 | 
174 | func (p *numericParser) clear() {
175 | 	p.digitLength = 0
176 | 	p.isFirstDigit = true
177 | 	p.hasComma = false
178 | 	p.hasHangingPoint = false
179 | 	p.errorState = errNone
180 | 	p.total.clear()
181 | 	p.subtotal.clear()
182 | 	p.tmp.clear()
183 | }
184 | 
185 | func (p *numericParser) checkComma() bool {
186 | 	if p.isFirstDigit {
187 | 		return false
188 | 	} else if !p.hasComma {
189 | 		return p.digitLength <= 3 && !p.tmp.isZero() && !p.tmp.IsAllZero
190 | 	} else {
191 | 		return p.digitLength == 3
192 | 	}
193 | }
194 | 
195 | func (p *numericParser) append(c rune) bool {
196 | 	if c == '.' {
197 | 		p.hasHangingPoint = true
198 | 		if p.isFirstDigit {
199 | 			p.errorState = errPoint
200 | 			return false
201 | 		} else if p.hasComma && !p.checkComma() {
202 | 			p.errorState = errComma
203 | 			return false
204 | 
205 | 		} else if p.tmp.setPoint() {
206 | 			p.errorState = errPoint
207 | 			return false
208 | 		}
209 | 		p.hasComma = false
210 | 		return true
211 | 	} else if c == ',' {
212 | 		if !p.checkComma() {
213 | 			p.errorState = errComma
214 | 			return false
215 | 		}
216 | 		p.hasComma = true
217 | 		p.digitLength = 0
218 | 		return true
219 | 	}
220 | 
221 | 	n, ok := runeToNumMap[c]
222 | 	if !ok {
223 | 		return false
224 | 	}
225 | 	if n < 0 && n >= -3 { // isSmallUnit
226 | 		p.tmp.shiftScale(-n)
227 | 		if !p.subtotal.add(p.tmp) {
228 | 			return false
229 | 		}
230 | 		p.tmp.clear()
231 | 		p.isFirstDigit = true
232 | 		p.digitLength = 0
233 | 		p.hasComma = false
234 | 	} else if n <= -4 { // isLargeUnit
235 | 		if !p.subtotal.add(p.tmp) || p.subtotal.isZero() {
236 | 			return false
237 | 		}
238 | 		p.subtotal.shiftScale(-n)
239 | 		if !p.total.add(p.subtotal) {
240 | 			return false
241 | 		}
242 | 		p.subtotal.clear()
243 | 		p.tmp.clear()
244 | 		p.isFirstDigit = true
245 | 		p.digitLength = 0
246 | 		p.hasComma = false
247 | 	} else {
248 | 		p.tmp.append(n)
249 | 		p.isFirstDigit = false
250 | 		p.digitLength++
251 | 		p.hasHangingPoint = false
252 | 	}
253 | 
254 | 	return true
255 | }
256 | 
257 | func (p *numericParser) done() bool {
258 | 	ret := p.subtotal.add(p.tmp) && p.total.add(p.subtotal)
259 | 	if p.hasHangingPoint {
260 | 		p.errorState = errPoint
261 | 		return false
262 | 	} else if p.hasComma && p.digitLength != 3 {
263 | 		p.errorState = errComma
264 | 		return false
265 | 	}
266 | 	return ret
267 | }
268 | 
269 | func (p *numericParser) getNormalized() string {
270 | 	return p.total.String()
271 | }
272 | 
273 | var runeToNumMap = map[rune]int{
274 | 	'0': 0,
275 | 	'1': 1,
276 | 	'2': 2,
277 | 	'3': 3,
278 | 	'4': 4,
279 | 	'5': 5,
280 | 	'6': 6,
281 | 	'7': 7,
282 | 	'8': 8,
283 | 	'9': 9,
284 | 	'〇': 0,
285 | 	'一': 1,
286 | 	'二': 2,
287 | 	'三': 3,
288 | 	'四': 4,
289 | 	'五': 5,
290 | 	'六': 6,
291 | 	'七': 7,
292 | 	'八': 8,
293 | 	'九': 9,
294 | 	'十': -1,
295 | 	'百': -2,
296 | 	'千': -3,
297 | 	'万': -4,
298 | 	'億': -8,
299 | 	'兆': -12,
300 | }
301 | 


--------------------------------------------------------------------------------
/plugin.go:
--------------------------------------------------------------------------------
  1 | package gosudachi
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 
  6 | 	"github.com/msnoigrs/gosudachi/dictionary"
  7 | )
  8 | 
// Settings is implemented by configuration sources that can supply the
// plugin-independent base configuration.
type Settings interface {
	GetBaseConfig() *BaseConfig
}

// BaseConfig holds the settings that do not belong to any plugin.
type BaseConfig struct {
	// SystemDict is the system dictionary file path.
	SystemDict              string
	// CharacterDefinitionFile is the character definition file path.
	CharacterDefinitionFile string
	// UserDict lists user dictionary file paths.
	UserDict                []string
	// Utf16String presumably selects UTF-16 string encoding in dictionaries
	// (cf. the "-j" command line flags) — confirm against the dictionary package.
	Utf16String             bool
}

// PluginMaker builds the configured plugins of each kind. Every method
// receives a factory that turns a plugin name into a plugin instance.
type PluginMaker interface {
	GetInputTextPluginArray(f MakeInputTextPluginFunc) ([]InputTextPlugin, error)
	GetOovProviderPluginArray(f MakeOovProviderPluginFunc) ([]OovProviderPlugin, error)
	GetPathRewritePluginArray(f MakePathRewritePluginFunc) ([]PathRewritePlugin, error)
	GetEditConnectionCostPluginArray(f MakeEditConnectionCostPluginFunc) ([]EditConnectionCostPlugin, error)
}

// Plugin is the part common to all plugin kinds. GetConfigStruct returns the
// value that the plugin's settings are decoded into.
type Plugin interface {
	GetConfigStruct() interface{}
}

// Factory function types: each maps a plugin name n to a new plugin of the
// given kind. The default factories in this file return nil for unknown names.
type MakeInputTextPluginFunc func(n string) InputTextPlugin
type MakeEditConnectionCostPluginFunc func(n string) EditConnectionCostPlugin
type MakeOovProviderPluginFunc func(n string) OovProviderPlugin
type MakePathRewritePluginFunc func(n string) PathRewritePlugin
 35 | 
 36 | func DefMakeInputTextPlugin(k string) InputTextPlugin {
 37 | 	switch k {
 38 | 	case "DefaultInputTextPlugin", "com.worksap.nlp.sudachi.DefaultInputTextPlugin":
 39 | 		return NewDefaultInputTextPlugin(nil)
 40 | 	case "ProlongedSoundMarkInputTextPlugin", "com.worksap.nlp.sudachi.ProlongedSoundMarkInputTextPlugin":
 41 | 		return NewProlongedSoundMarkInputTextPlugin(nil)
 42 | 	}
 43 | 	return nil
 44 | }
 45 | 
 46 | func DefMakeEditConnectionCostPlugin(k string) EditConnectionCostPlugin {
 47 | 	switch k {
 48 | 	case "InhibitConnectionPlugin", "com.worksap.nlp.sudachi.InhibitConnectionPlugin":
 49 | 		return NewInhibitConnectionPlugin([]*[]int{})
 50 | 	}
 51 | 	return nil
 52 | }
 53 | 
 54 | func DefMakeOovProviderPlugin(k string) OovProviderPlugin {
 55 | 	switch k {
 56 | 	case "MeCabOovProviderPlugin", "com.worksap.nlp.sudachi.MeCabOovProviderPlugin":
 57 | 		return NewMeCabOovProviderPlugin(nil)
 58 | 	case "SimpleOovProviderPlugin", "com.worksap.nlp.sudachi.SimpleOovProviderPlugin":
 59 | 		return NewSimpleOovProviderPlugin(nil)
 60 | 	}
 61 | 	return nil
 62 | }
 63 | 
 64 | func DefMakePathRewritePlugin(k string) PathRewritePlugin {
 65 | 	switch k {
 66 | 	case "JoinNumericPlugin", "com.worksap.nlp.sudachi.JoinNumericPlugin":
 67 | 		return NewJoinNumericPlugin(nil)
 68 | 	case "JoinKatakanaOovPlugin", "com.worksap.nlp.sudachi.JoinKatakanaOovPlugin":
 69 | 		return NewJoinKatakanaOovPlugin(nil)
 70 | 	}
 71 | 	return nil
 72 | }
 73 | 
// EditConnectionCostPlugin is a plugin that works on a grammar's connection
// costs. Both methods receive the grammar and may fail with an error.
// NOTE(review): presumably SetUp runs once before Edit — confirm with the caller.
type EditConnectionCostPlugin interface {
	Plugin
	SetUp(grammar *dictionary.Grammar) error
	Edit(grammar *dictionary.Grammar) error
}
 79 | 
// InhibitConnection sets the connection cost between leftId and rightId in
// grammar to dictionary.InhibitedConnection.
func InhibitConnection(grammar *dictionary.Grammar, leftId int16, rightId int16) {
	grammar.SetConnectCost(leftId, rightId, dictionary.InhibitedConnection)
}
 83 | 
// PathRewritePlugin is a plugin that can modify a path of lattice nodes.
// Rewrite receives the path by pointer so that it can replace the slice, for
// example through ConcatenateNodes or ConcatenateOov.
type PathRewritePlugin interface {
	Plugin
	SetUp(grammar *dictionary.Grammar) error
	Rewrite(text *InputText, path *[]*LatticeNode, lattice *Lattice) error
}
 89 | 
 90 | func ConcatenateNodes(path *[]*LatticeNode, begin int, end int, lattice *Lattice, normalizedForm string) (*LatticeNode, error) {
 91 | 	if begin >= end {
 92 | 		return nil, fmt.Errorf("begin >= end")
 93 | 	}
 94 | 	tpath := *path
 95 | 	b := tpath[begin].GetBegin()
 96 | 	e := tpath[end-1].GetEnd()
 97 | 	bwi := tpath[begin].GetWordInfo()
 98 | 	posId := bwi.PosId
 99 | 	var (
100 | 		surfaceLen        int
101 | 		normalizedFormLen int
102 | 		dictionaryFormLen int
103 | 		readingFormLen    int
104 | 		length            int16
105 | 	)
106 | 	wilist := make([]*dictionary.WordInfo, 0, end - begin)
107 | 	for i := begin; i < end; i++ {
108 | 		info := tpath[i].GetWordInfo()
109 | 		wilist = append(wilist, info)
110 | 		surfaceLen += len(info.Surface)
111 | 		length += info.HeadwordLength
112 | 		if normalizedForm == "" {
113 | 			normalizedFormLen += len(info.NormalizedForm)
114 | 		}
115 | 		dictionaryFormLen += len(info.DictionaryForm)
116 | 		readingFormLen += len(info.ReadingForm)
117 | 	}
118 | 	csurface := make([]byte, 0, surfaceLen)
119 | 	var cnormalizedForm []byte
120 | 	if normalizedForm == "" {
121 | 		cnormalizedForm = make([]byte, 0, normalizedFormLen)
122 | 	}
123 | 	cdictionaryForm := make([]byte, 0, dictionaryFormLen)
124 | 	creadingForm := make([]byte, 0, readingFormLen)
125 | 	for _, wi := range wilist {
126 | 		csurface = append(csurface, []byte(wi.Surface)...)
127 | 		if normalizedForm == "" {
128 | 			cnormalizedForm = append(cnormalizedForm, []byte(wi.NormalizedForm)...)
129 | 		}
130 | 		cdictionaryForm = append(cdictionaryForm, []byte(wi.DictionaryForm)...)
131 | 		creadingForm = append(creadingForm, []byte(wi.ReadingForm)...)
132 | 	}
133 | 	if normalizedForm == "" {
134 | 		normalizedForm = string(cnormalizedForm)
135 | 	}
136 | 	wi := &dictionary.WordInfo{
137 | 		Surface:        string(csurface),
138 | 		HeadwordLength: length,
139 | 		PosId:          posId,
140 | 		NormalizedForm: normalizedForm,
141 | 		DictionaryForm: string(cdictionaryForm),
142 | 		ReadingForm:    string(creadingForm),
143 | 	}
144 | 
145 | 	node := &LatticeNode{}
146 | 	node.SetRange(b, e)
147 | 	node.SetWordInfo(wi)
148 | 	*path = replaceNode(tpath, begin, end, node)
149 | 	return node, nil
150 | }
151 | 
152 | func ConcatenateOov(path *[]*LatticeNode, begin int, end int, posId int16, lattice *Lattice) (*LatticeNode, error) {
153 | 	if begin >= end {
154 | 		return nil, fmt.Errorf("begin >= end")
155 | 	}
156 | 	tpath := *path
157 | 	b := tpath[begin].GetBegin()
158 | 	e := tpath[end-1].GetEnd()
159 | 
160 | 	n := lattice.GetMinimumNode(b, e)
161 | 	if n != nil {
162 | 		*path = replaceNode(tpath, begin, end, n)
163 | 		return n, nil
164 | 	}
165 | 
166 | 	var (
167 | 		surfaceLen int
168 | 		length     int16
169 | 	)
170 | 	wilist := make([]*dictionary.WordInfo, 0, end - begin)
171 | 	for i := begin; i < end; i++ {
172 | 		info := tpath[i].GetWordInfo()
173 | 		wilist = append(wilist, info)
174 | 		surfaceLen += len(info.Surface)
175 | 		length += info.HeadwordLength
176 | 	}
177 | 	csurface := make([]byte, 0, surfaceLen)
178 | 	for _, wi := range wilist {
179 | 		csurface = append(csurface, []byte(wi.Surface)...)
180 | 	}
181 | 	s := string(csurface)
182 | 	wi := &dictionary.WordInfo{
183 | 		Surface:        s,
184 | 		HeadwordLength: length,
185 | 		PosId:          posId,
186 | 		NormalizedForm: s,
187 | 		DictionaryForm: s,
188 | 		ReadingForm:    "",
189 | 	}
190 | 
191 | 	node := &LatticeNode{}
192 | 	node.SetRange(b, e)
193 | 	node.SetWordInfo(wi)
194 | 	node.IsOov = true
195 | 	*path = replaceNode(tpath, begin, end, node)
196 | 	return node, nil
197 | }
198 | 
199 | func GetCharCategoryTypes(text *InputText, node *LatticeNode) uint32 {
200 | 	return text.GetCharCategoryTypesRange(node.Begin, node.End)
201 | }
202 | 
203 | func replaceNode(path []*LatticeNode, begin int, end int, node *LatticeNode) []*LatticeNode {
204 | 	d := end - begin
205 | 	if d > 1 {
206 | 		if end < len(path) {
207 | 			copy(path[begin+1:], path[end:])
208 | 		}
209 | 		path = path[:len(path)-d+1]
210 | 	}
211 | 	path[begin] = node
212 | 	return path
213 | }
214 | 
// InputTextPlugin is a plugin that can modify the input text through an
// InputTextBuilder.
// NOTE(review): presumably SetUp runs once before Rewrite — confirm with the caller.
type InputTextPlugin interface {
	Plugin
	SetUp() error
	Rewrite(builder *InputTextBuilder) error
}
220 | 
// OovProviderPlugin is a plugin that supplies lattice nodes for text at a
// given offset. GetOOV wraps ProvideOOV and fills in the nodes' Begin and End.
type OovProviderPlugin interface {
	Plugin
	SetUp(grammar *dictionary.Grammar) error
	ProvideOOV(inputText *InputText, offset int, hasOtherWords bool) ([]*LatticeNode, error)
}
226 | 
227 | func GetOOV(p OovProviderPlugin, inputText *InputText, offset int, hasOtherWords bool) ([]*LatticeNode, error) {
228 | 	nodes, err := p.ProvideOOV(inputText, offset, hasOtherWords)
229 | 	if err != nil {
230 | 		return []*LatticeNode{}, err
231 | 	}
232 | 	for _, node := range nodes {
233 | 		wi := node.GetWordInfo()
234 | 		node.Begin = offset
235 | 		node.End = offset + int(wi.HeadwordLength)
236 | 	}
237 | 	return nodes, nil
238 | }
239 | 
240 | func CreateNodeOfOOV() *LatticeNode {
241 | 	return &LatticeNode{
242 | 		IsOov: true,
243 | 	}
244 | }
245 | 


--------------------------------------------------------------------------------
/printdic/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"flag"
 5 | 	"fmt"
 6 | 	"os"
 7 | 
 8 | 	"github.com/msnoigrs/gosudachi/dictionary"
 9 | )
10 | 
11 | func main() {
12 | 	flag.Usage = func() {
13 | 		fmt.Fprintf(os.Stderr, `Usage of %s:
14 | 	%s [-s file] [-j] file
15 | 
16 | Options:
17 | `, os.Args[0], os.Args[0])
18 | 		flag.PrintDefaults()
19 | 	}
20 | 
21 | 	var (
22 | 		systemdict  string
23 | 		utf16string bool
24 | 	)
25 | 	flag.StringVar(&systemdict, "s", "", "system dictionary")
26 | 	flag.BoolVar(&utf16string, "j", false, "use UTF-16 string")
27 | 
28 | 	flag.Parse()
29 | 
30 | 	if len(flag.Args()) == 0 {
31 | 		flag.Usage()
32 | 		os.Exit(1)
33 | 	}
34 | 
35 | 	var (
36 | 		sdic *dictionary.BinaryDictionary
37 | 		err  error
38 | 	)
39 | 	if systemdict != "" {
40 | 		sdic, err = dictionary.ReadSystemDictionary(systemdict, utf16string)
41 | 		if err != nil {
42 | 			fmt.Fprintln(os.Stderr, err)
43 | 			os.Exit(1)
44 | 		}
45 | 		defer sdic.Close()
46 | 	}
47 | 
48 | 	err = dictionary.PrintDictionary(flag.Args()[0], utf16string, sdic, os.Stdout)
49 | 	if err != nil {
50 | 		fmt.Fprintln(os.Stderr, err)
51 | 		os.Exit(1)
52 | 	}
53 | }
54 | 


--------------------------------------------------------------------------------
/printdicheader/main.go:
--------------------------------------------------------------------------------
 1 | package main
 2 | 
 3 | import (
 4 | 	"flag"
 5 | 	"fmt"
 6 | 	"os"
 7 | 
 8 | 	"github.com/msnoigrs/gosudachi/dictionary"
 9 | )
10 | 
11 | func main() {
12 | 	flag.Usage = func() {
13 | 		fmt.Fprintf(os.Stderr, `Usage of %s:
14 | 	%s file
15 | `, os.Args[0], os.Args[0])
16 | 		flag.PrintDefaults()
17 | 	}
18 | 
19 | 	flag.Parse()
20 | 
21 | 	if len(flag.Args()) == 0 {
22 | 		flag.Usage()
23 | 		os.Exit(1)
24 | 	}
25 | 
26 | 	err := dictionary.PrintHeader(flag.Arg(0), os.Stdout)
27 | 	if err != nil {
28 | 		fmt.Fprintln(os.Stderr, err)
29 | 		os.Exit(1)
30 | 	}
31 | }
32 | 


--------------------------------------------------------------------------------
/prolongedsoundmarkinputtextplugin.go:
--------------------------------------------------------------------------------
 1 | package gosudachi
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | )
 6 | 
// ProlongedSoundMarkInputTextPluginConfig is the configuration of
// ProlongedSoundMarkInputTextPlugin. Both fields are pointers so that SetUp
// can tell a missing setting from an empty one.
type ProlongedSoundMarkInputTextPluginConfig struct {
	// ProlongedSoundMarks lists the marks; only the first rune of each
	// string is used.
	ProlongedSoundMarks *[]string
	// ReplacementSymbol is the text that replaces a run of marks.
	ReplacementSymbol   *string
}

// ProlongedSoundMarkInputTextPlugin replaces each run of two or more
// prolonged sound marks in the input text with the replacement symbol.
type ProlongedSoundMarkInputTextPlugin struct {
	// config is consumed by SetUp, which sets it to nil afterwards.
	config                *ProlongedSoundMarkInputTextPluginConfig
	// prolongedSoundMarkMap has an entry for every configured mark rune.
	prolongedSoundMarkMap map[rune]bool
	replacementSymbol     []rune
}
17 | 
18 | func NewProlongedSoundMarkInputTextPlugin(config *ProlongedSoundMarkInputTextPluginConfig) *ProlongedSoundMarkInputTextPlugin {
19 | 	if config == nil {
20 | 		config = &ProlongedSoundMarkInputTextPluginConfig{}
21 | 	}
22 | 	return &ProlongedSoundMarkInputTextPlugin{
23 | 		config:                config,
24 | 		prolongedSoundMarkMap: map[rune]bool{},
25 | 	}
26 | }
27 | 
28 | func (p *ProlongedSoundMarkInputTextPlugin) GetConfigStruct() interface{} {
29 | 	if p.config == nil {
30 | 		p.config = &ProlongedSoundMarkInputTextPluginConfig{}
31 | 	}
32 | 	return p.config
33 | }
34 | 
35 | func (p *ProlongedSoundMarkInputTextPlugin) SetUp() error {
36 | 	if p.config.ProlongedSoundMarks == nil || len(*p.config.ProlongedSoundMarks) == 0 {
37 | 		return fmt.Errorf("ProlongedSoundMarkInputTextPlugin: prolongedSoundMarkStrings is not specified")
38 | 	}
39 | 	if p.config.ReplacementSymbol == nil {
40 | 		return fmt.Errorf("ProlongedSoundMarkInputTextPlugin: replacementSymbol is not specified")
41 | 	}
42 | 	if p.prolongedSoundMarkMap == nil {
43 | 		p.prolongedSoundMarkMap = map[rune]bool{}
44 | 	}
45 | 	for _, s := range *p.config.ProlongedSoundMarks {
46 | 		runes := []rune(s)
47 | 		if len(runes) > 0 {
48 | 			p.prolongedSoundMarkMap[runes[0]] = true
49 | 		}
50 | 	}
51 | 	p.replacementSymbol = []rune(*p.config.ReplacementSymbol)
52 | 	p.config = nil
53 | 	return nil
54 | }
55 | 
56 | func (p *ProlongedSoundMarkInputTextPlugin) Rewrite(builder *InputTextBuilder) error {
57 | 	runes := builder.GetText()
58 | 
59 | 	runelen := len(runes)
60 | 	offset := 0
61 | 	markStartIndex := runelen
62 | 	isProlongedSoundMark := false
63 | 	for i := 0; i < runelen; i++ {
64 | 		_, ok := p.prolongedSoundMarkMap[runes[i]]
65 | 		if !isProlongedSoundMark && ok {
66 | 			isProlongedSoundMark = true
67 | 			markStartIndex = i
68 | 		} else if isProlongedSoundMark && !ok {
69 | 			if (i - markStartIndex) > 1 {
70 | 				builder.Replace(markStartIndex-offset, i-offset, p.replacementSymbol)
71 | 				offset += i - markStartIndex - 1
72 | 			}
73 | 			isProlongedSoundMark = false
74 | 		}
75 | 	}
76 | 	if isProlongedSoundMark && (runelen-markStartIndex) > 1 {
77 | 		builder.Replace(markStartIndex-offset, runelen-offset, p.replacementSymbol)
78 | 	}
79 | 	return nil
80 | }
81 | 


--------------------------------------------------------------------------------
/scripts/build.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | SRC_DIR="${PWD}"
 4 | BUILD_DIR="${PWD}"
 5 | DIST="${BUILD_DIR}/dist"
 6 | CMDDIRS="gosudachicli dicbuilder userdicbuilder printdic printdicheader dicconv"
 7 | 
# build DIR
# Runs "go build" inside ${SRC_DIR}/DIR, writes the binary to ${DIST}/DIR and
# changes back to ${BUILD_DIR}. Failures of cd or go build are not checked;
# "done" is printed either way.
build() {
    cd "${SRC_DIR}/$1"
    echo -n "Building $1..."
    go build -o "${DIST}/$1"
    echo "done"
    cd "${BUILD_DIR}"
}
15 | 
# assets
# Runs "go generate" inside ${SRC_DIR}/data and changes back to ${BUILD_DIR}.
assets() {
    cd "${SRC_DIR}/data"
    go generate
    cd "${BUILD_DIR}"
}
21 | 
# Generate the assets first, then build every command into ${DIST}.
assets

# Create the output directory on first use.
if [ ! -d "${DIST}" ]; then
    mkdir "${DIST}"
fi

# CMDDIRS is left unquoted on purpose so that it splits into words.
for f in ${CMDDIRS}; do
    build "${f}"
done
31 | 


--------------------------------------------------------------------------------
/scripts/mksystemdic.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | PROXY=""
 4 | VERSION=""
 5 | UNIDICVER="2.1.2"
 6 | UNIDICZIP="unidic-mecab-${UNIDICVER}_src.zip"
 7 | UNIDICURL="https://unidic.ninjal.ac.jp/unidic_archive/cwj/${UNIDICVER}/${UNIDICZIP}"
 8 | POMXML="pom.xml"
 9 | MATRIXDEF="matrix.def"
10 | SYSSMALLDIC="system_small.dic"
11 | SYSCOREDIC="system_core.dic"
12 | SYSFULLDIC="system_full.dic"
13 | SMALLCSV="small_lex.csv"
14 | CORECSV="core_lex.csv"
15 | NOTCORECSV="notcore_lex.csv"
16 | 
# init_env DIR
# Points the input variables at files inside DIR when they exist there:
# pom.xml directly in DIR and the lexicon CSVs in DIR/src/main/text.
# A zipped matrix.def found in DIR/src/main/text is copied to the current
# directory. Variables whose file is missing keep their current value.
init_env() {
    if [ -f "${1}/pom.xml" ]; then
        POMXML="${1}/pom.xml"
    fi
    if [ -f "${1}/src/main/text/${MATRIXDEF}.zip" ]; then
        cp "${1}/src/main/text/${MATRIXDEF}.zip" .
    fi
    if [ -f "${1}/src/main/text/${SMALLCSV}" ]; then
        SMALLCSV="${1}/src/main/text/${SMALLCSV}"
    fi
    if [ -f "${1}/src/main/text/${CORECSV}" ]; then
        CORECSV="${1}/src/main/text/${CORECSV}"
    fi
    if [ -f "${1}/src/main/text/${NOTCORECSV}" ]; then
        NOTCORECSV="${1}/src/main/text/${NOTCORECSV}"
    fi
}
34 | 
# Use the directory given as first argument as the SudachiDict checkout, or
# fall back to ../SudachiDict when it exists.
if [ -n "${1}" ]; then
    init_env "${1}"
elif [ -d "../SudachiDict" ]; then
    init_env "../SudachiDict"
fi

# Obtain matrix.def: unzip a local matrix.def.zip if present, otherwise
# download the UniDic source archive and copy matrix.def out of it.
if [ ! -f "${MATRIXDEF}" ]; then
    if [ ! -f "${MATRIXDEF}.zip" ]; then
        if [ -z "${PROXY}" ]; then
            curl "${UNIDICURL}" -o "${UNIDICZIP}"
        else
            curl "${UNIDICURL}" -x "${PROXY}" -o "${UNIDICZIP}"
        fi
        unzip "${UNIDICZIP}"
        cp "unidic-mecab-${UNIDICVER}_src/matrix.def" "${MATRIXDEF}"
    else
        unzip "${MATRIXDEF}.zip"
    fi
fi

# Take the first <version> element of pom.xml as the dictionary description
# (grep -P needs a grep with PCRE support).
if [ -f "${POMXML}" ]; then
    VERSION=$(grep -oP -m 1 '<version>\K([^<]+)' "${POMXML}")
fi

if [ -z "${VERSION}" ]; then
    VERSION="go"
fi

# Stop here when a lexicon source is missing instead of running dicbuilder
# on files that do not exist.
if [ ! -f "${SMALLCSV}" ] || [ ! -f "${CORECSV}" ] || [ ! -f "${NOTCORECSV}" ]; then
    echo "dictionary files are needed: ${SMALLCSV}, ${CORECSV}, ${NOTCORECSV}" 1>&2
    exit 1
fi

# Build the small, core and full system dictionaries; each one adds one more
# lexicon source to the previous set.
./dicbuilder -o "${SYSSMALLDIC}" -m "${MATRIXDEF}" -d "${VERSION}" "${SMALLCSV}"
./dicbuilder -o "${SYSCOREDIC}" -m "${MATRIXDEF}" -d "${VERSION}" "${SMALLCSV}" "${CORECSV}"
./dicbuilder -o "${SYSFULLDIC}" -m "${MATRIXDEF}" -d "${VERSION}" "${SMALLCSV}" "${CORECSV}" "${NOTCORECSV}"
70 | 


--------------------------------------------------------------------------------
/scripts/mksystemdicutf16.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | PROXY=""
 4 | VERSION=""
 5 | UNIDICVER="2.1.2"
 6 | UNIDICZIP="unidic-mecab-${UNIDICVER}_src.zip"
 7 | UNIDICURL="https://unidic.ninjal.ac.jp/unidic_archive/cwj/${UNIDICVER}/${UNIDICZIP}"
 8 | POMXML="pom.xml"
 9 | MATRIXDEF="matrix.def"
10 | SYSSMALLDIC="system_small.dic"
11 | SYSCOREDIC="system_core.dic"
12 | SYSFULLDIC="system_full.dic"
13 | SMALLCSV="small_lex.csv"
14 | CORECSV="core_lex.csv"
15 | NOTCORECSV="notcore_lex.csv"
16 | 
# init_env DIR
# Points the input variables at files inside DIR when they exist there:
# pom.xml directly in DIR and the lexicon CSVs in DIR/src/main/text.
# A zipped matrix.def found in DIR/src/main/text is copied to the current
# directory. Variables whose file is missing keep their current value.
init_env() {
    if [ -f "${1}/pom.xml" ]; then
        POMXML="${1}/pom.xml"
    fi
    if [ -f "${1}/src/main/text/${MATRIXDEF}.zip" ]; then
        cp "${1}/src/main/text/${MATRIXDEF}.zip" .
    fi
    if [ -f "${1}/src/main/text/${SMALLCSV}" ]; then
        SMALLCSV="${1}/src/main/text/${SMALLCSV}"
    fi
    if [ -f "${1}/src/main/text/${CORECSV}" ]; then
        CORECSV="${1}/src/main/text/${CORECSV}"
    fi
    if [ -f "${1}/src/main/text/${NOTCORECSV}" ]; then
        NOTCORECSV="${1}/src/main/text/${NOTCORECSV}"
    fi
}
34 | 
# Use the directory given as first argument as the SudachiDict checkout, or
# fall back to ../SudachiDict when it exists.
if [ -n "${1}" ]; then
    init_env "${1}"
elif [ -d "../SudachiDict" ]; then
    init_env "../SudachiDict"
fi

# Obtain matrix.def: unzip a local matrix.def.zip if present, otherwise
# download the UniDic source archive and copy matrix.def out of it.
if [ ! -f "${MATRIXDEF}" ]; then
    if [ ! -f "${MATRIXDEF}.zip" ]; then
        if [ -z "${PROXY}" ]; then
            curl "${UNIDICURL}" -o "${UNIDICZIP}"
        else
            curl "${UNIDICURL}" -x "${PROXY}" -o "${UNIDICZIP}"
        fi
        unzip "${UNIDICZIP}"
        cp "unidic-mecab-${UNIDICVER}_src/matrix.def" "${MATRIXDEF}"
    else
        unzip "${MATRIXDEF}.zip"
    fi
fi

# Take the first <version> element of pom.xml as the dictionary description
# (grep -P needs a grep with PCRE support).
if [ -f "${POMXML}" ]; then
    VERSION=$(grep -oP -m 1 '<version>\K([^<]+)' "${POMXML}")
fi

if [ -z "${VERSION}" ]; then
    VERSION="go"
fi

# Stop here when a lexicon source is missing instead of running dicbuilder
# on files that do not exist.
if [ ! -f "${SMALLCSV}" ] || [ ! -f "${CORECSV}" ] || [ ! -f "${NOTCORECSV}" ]; then
    echo "dictionary files are needed: ${SMALLCSV}, ${CORECSV}, ${NOTCORECSV}" 1>&2
    exit 1
fi

# Build the small, core and full system dictionaries with -j (UTF-16
# strings); each one adds one more lexicon source to the previous set.
./dicbuilder -o "${SYSSMALLDIC}" -m "${MATRIXDEF}" -d "${VERSION}" -j "${SMALLCSV}"
./dicbuilder -o "${SYSCOREDIC}" -m "${MATRIXDEF}" -d "${VERSION}" -j "${SMALLCSV}" "${CORECSV}"
./dicbuilder -o "${SYSFULLDIC}" -m "${MATRIXDEF}" -d "${VERSION}" -j "${SMALLCSV}" "${CORECSV}" "${NOTCORECSV}"
70 | 


--------------------------------------------------------------------------------
/settingsjson.go:
--------------------------------------------------------------------------------
  1 | package gosudachi
  2 | 
  3 | import (
  4 | 	"encoding/json"
  5 | 	"fmt"
  6 | 	"io"
  7 | 	"path/filepath"
  8 | )
  9 | 
// SettingsJSON holds settings read from JSON. It embeds the base
// configuration and keeps each plugin section as raw JSON until the plugins
// are created by the Get...PluginArray methods.
type SettingsJSON struct {
	BaseConfig
	// path is the directory that relative file paths are joined to.
	path                     string
	inputTextPlugin          []json.RawMessage
	oovProviderPlugin        []json.RawMessage
	pathRewritePlugin        []json.RawMessage
	editConnectionCostPlugin []json.RawMessage
}
 18 | 
 19 | func NewSettingsJSON() *SettingsJSON {
 20 | 	return &SettingsJSON{}
 21 | }
 22 | 
 23 | func (settings *SettingsJSON) GetBaseConfig() *BaseConfig {
 24 | 	return &settings.BaseConfig
 25 | }
 26 | 
 27 | func (settings *SettingsJSON) ParseSettingsJSON(defpath string, reader io.Reader) error {
 28 | 	internalBaseConfig := &struct {
 29 | 		Path                     *string
 30 | 		SystemDict               *string
 31 | 		CharacterDefinitionFile  *string
 32 | 		Utf16String              *bool
 33 | 		UserDict                 *[]string
 34 | 		InputTextPlugin          *[]json.RawMessage
 35 | 		OovProviderPlugin        *[]json.RawMessage
 36 | 		PathRewritePlugin        *[]json.RawMessage
 37 | 		EditConnectionCostPlugin *[]json.RawMessage
 38 | 	}{}
 39 | 
 40 | 	decoder := json.NewDecoder(reader)
 41 | 	err := decoder.Decode(internalBaseConfig)
 42 | 	if err != nil {
 43 | 		return err
 44 | 	}
 45 | 	if internalBaseConfig.Path == nil && settings.path != "" {
 46 | 		settings.path = defpath
 47 | 	} else if internalBaseConfig.Path != nil {
 48 | 		settings.path = *internalBaseConfig.Path
 49 | 	}
 50 | 	if internalBaseConfig.SystemDict != nil {
 51 | 		settings.SystemDict = settings.getPath(*internalBaseConfig.SystemDict)
 52 | 	}
 53 | 	if internalBaseConfig.CharacterDefinitionFile != nil {
 54 | 		settings.CharacterDefinitionFile = settings.getPath(*internalBaseConfig.CharacterDefinitionFile)
 55 | 	}
 56 | 	if internalBaseConfig.Utf16String != nil {
 57 | 		settings.Utf16String = *internalBaseConfig.Utf16String
 58 | 	}
 59 | 	if internalBaseConfig.UserDict != nil {
 60 | 		for _, ud := range *internalBaseConfig.UserDict {
 61 | 			settings.UserDict = append(settings.UserDict, settings.getPath(ud))
 62 | 		}
 63 | 	}
 64 | 
 65 | 	if internalBaseConfig.InputTextPlugin != nil {
 66 | 		settings.inputTextPlugin = *internalBaseConfig.InputTextPlugin
 67 | 	}
 68 | 	if internalBaseConfig.OovProviderPlugin != nil {
 69 | 		settings.oovProviderPlugin = *internalBaseConfig.OovProviderPlugin
 70 | 	}
 71 | 	if internalBaseConfig.PathRewritePlugin != nil {
 72 | 		settings.pathRewritePlugin = *internalBaseConfig.PathRewritePlugin
 73 | 	}
 74 | 	if internalBaseConfig.EditConnectionCostPlugin != nil {
 75 | 		settings.editConnectionCostPlugin = *internalBaseConfig.EditConnectionCostPlugin
 76 | 	}
 77 | 	return nil
 78 | }
 79 | 
 80 | func (settings *SettingsJSON) getPath(path string) string {
 81 | 	if path == "" || filepath.IsAbs(path) || settings.path == "" {
 82 | 		return path
 83 | 	}
 84 | 	return filepath.Join(settings.path, path)
 85 | }
 86 | 
 87 | func (settings *SettingsJSON) GetInputTextPluginArray(makeproc MakeInputTextPluginFunc) ([]InputTextPlugin, error) {
 88 | 	ret := []InputTextPlugin{}
 89 | 	pname := &struct {
 90 | 		Class *string
 91 | 		Name  *string
 92 | 	}{}
 93 | 	for _, raw := range settings.inputTextPlugin {
 94 | 		err := json.Unmarshal(raw, pname)
 95 | 		if err != nil {
 96 | 			return ret, err
 97 | 		}
 98 | 		var name string
 99 | 		if pname.Class != nil {
100 | 			name = *pname.Class
101 | 		}
102 | 		if pname.Name != nil {
103 | 			name = *pname.Name
104 | 		}
105 | 		plugin := makeproc(name)
106 | 		if plugin == nil {
107 | 			return ret, fmt.Errorf("InputTextPlugin: %s is unknown", name)
108 | 		}
109 | 		err = json.Unmarshal(raw, plugin.GetConfigStruct())
110 | 		if err != nil {
111 | 			return ret, err
112 | 		}
113 | 		ret = append(ret, plugin)
114 | 	}
115 | 	return ret, nil
116 | }
117 | 
118 | func (settings *SettingsJSON) GetOovProviderPluginArray(makeproc MakeOovProviderPluginFunc) ([]OovProviderPlugin, error) {
119 | 	ret := []OovProviderPlugin{}
120 | 	pname := &struct {
121 | 		Class *string
122 | 		Name  *string
123 | 	}{}
124 | 	for _, raw := range settings.oovProviderPlugin {
125 | 		err := json.Unmarshal(raw, pname)
126 | 		if err != nil {
127 | 			return ret, err
128 | 		}
129 | 		var name string
130 | 		if pname.Class != nil {
131 | 			name = *pname.Class
132 | 		}
133 | 		if pname.Name != nil {
134 | 			name = *pname.Name
135 | 		}
136 | 		plugin := makeproc(name)
137 | 		if plugin == nil {
138 | 			return ret, fmt.Errorf("OovProviderPlugin: %s is unknown", name)
139 | 		}
140 | 		err = json.Unmarshal(raw, plugin.GetConfigStruct())
141 | 		if err != nil {
142 | 			return ret, err
143 | 		}
144 | 		ret = append(ret, plugin)
145 | 	}
146 | 	return ret, nil
147 | }
148 | 
149 | func (settings *SettingsJSON) GetEditConnectionCostPluginArray(makeproc MakeEditConnectionCostPluginFunc) ([]EditConnectionCostPlugin, error) {
150 | 	ret := []EditConnectionCostPlugin{}
151 | 	pname := &struct {
152 | 		Class *string
153 | 		Name  *string
154 | 	}{}
155 | 	for _, raw := range settings.editConnectionCostPlugin {
156 | 		err := json.Unmarshal(raw, pname)
157 | 		if err != nil {
158 | 			return ret, err
159 | 		}
160 | 		var name string
161 | 		if pname.Class != nil {
162 | 			name = *pname.Class
163 | 		}
164 | 		if pname.Name != nil {
165 | 			name = *pname.Name
166 | 		}
167 | 		plugin := makeproc(name)
168 | 		if plugin == nil {
169 | 			return ret, fmt.Errorf("EditConnectionCostPlugin: %s is unknown", name)
170 | 		}
171 | 		err = json.Unmarshal(raw, plugin.GetConfigStruct())
172 | 		if err != nil {
173 | 			return ret, err
174 | 		}
175 | 		ret = append(ret, plugin)
176 | 	}
177 | 	return ret, nil
178 | }
179 | 
180 | func (settings *SettingsJSON) GetPathRewritePluginArray(makeproc MakePathRewritePluginFunc) ([]PathRewritePlugin, error) {
181 | 	ret := []PathRewritePlugin{}
182 | 	pname := &struct {
183 | 		Class *string
184 | 		Name  *string
185 | 	}{}
186 | 	for _, raw := range settings.pathRewritePlugin {
187 | 		err := json.Unmarshal(raw, pname)
188 | 		if err != nil {
189 | 			return ret, err
190 | 		}
191 | 		var name string
192 | 		if pname.Class != nil {
193 | 			name = *pname.Class
194 | 		}
195 | 		if pname.Name != nil {
196 | 			name = *pname.Name
197 | 		}
198 | 		plugin := makeproc(name)
199 | 		if plugin == nil {
200 | 			return ret, fmt.Errorf("PathRewritePlugin: %s is unknown", name)
201 | 		}
202 | 		err = json.Unmarshal(raw, plugin.GetConfigStruct())
203 | 		if err != nil {
204 | 			return ret, err
205 | 		}
206 | 		ret = append(ret, plugin)
207 | 	}
208 | 	return ret, nil
209 | }
210 | 


--------------------------------------------------------------------------------
/settingsjson_test.go:
--------------------------------------------------------------------------------
 1 | package gosudachi
 2 | 
 3 | import (
 4 | 	"strings"
 5 | 	"testing"
 6 | )
 7 | 
 8 | var s string = `
 9 | {
10 |   "path" : "/usr/local/share/sudachi",
11 |   "systemDict" : "system.dic",
12 |   "characterDefinitionFile" : "char.def",
13 |   "inputTextPlugin" : [
14 |     { "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin" },
15 |     { "class" : "com.worksap.nlp.sudachi.ProlongedSoundMarkInputTextPlugin",
16 |       "prolongedSoundMarks" : ["ー", "-", "⁓", "〜", "〰"],
17 |       "replacementSymbol" : "ー"
18 |     }
19 |   ],
20 |   "oovProviderPlugin" : [
21 |     {
22 |       "class" : "com.worksap.nlp.sudachi.MeCabOovProviderPlugin",
23 |       "charDef" : "char.def",
24 |       "unkDef" : "unk.def"
25 |     },
26 |     {
27 |       "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin",
28 |       "oovPOSStrings" : [ "補助記号", "一般", "*", "*", "*", "*" ],
29 |       "leftId" : 5968,
30 |       "rightId" : 5968,
31 |       "cost" : 3857
32 |     }
33 |   ],
34 |   "pathRewritePlugin" : [
35 |     {
36 |       "name" : "JoinNumericPlugin",
37 |       "enableNormalize" : false
38 |     },
39 |     {
40 |       "name" : "JoinKatakanaOovPlugin",
41 |       "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ],
42 |       "minLength" : 3
43 |     }
44 |   ]
45 | }
46 | `
47 | 
48 | // TestSettingsJSON_ParseSettingsJSON
49 | func TestSettingsJSON_ParseSettingsJSON(t *testing.T) {
50 | 	settings := NewSettingsJSON()
51 | 	err := settings.ParseSettingsJSON("", strings.NewReader(s))
52 | 	if err != nil {
53 | 		t.Errorf("fail to parse json: %s", err)
54 | 	}
55 | 
56 | 	bc := settings.GetBaseConfig()
57 | 	want := "/usr/local/share/sudachi/system.dic"
58 | 	if bc.SystemDict != want {
59 | 		t.Errorf("invalid result. want = %s, got = %s", want, bc.SystemDict)
60 | 	}
61 | 
62 | 	iplugins, err := settings.GetInputTextPluginArray(DefMakeInputTextPlugin)
63 | 	if err != nil {
64 | 		t.Errorf("GetInputTextPluginArray: %s", err)
65 | 	}
66 | 	if len(iplugins) != 2 {
67 | 		t.Errorf("invalid result. want = 2, got = %d", len(iplugins))
68 | 	}
69 | }
70 | 


--------------------------------------------------------------------------------
/simpleoovproviderplugin.go:
--------------------------------------------------------------------------------
 1 | package gosudachi
 2 | 
 3 | import (
 4 | 	"fmt"
 5 | 
 6 | 	"github.com/msnoigrs/gosudachi/dictionary"
 7 | )
 8 | 
 9 | type SimpleOovProviderPluginConfig struct {
10 | 	OovPos  *[]string
11 | 	LeftId  *int16
12 | 	RightId *int16
13 | 	Cost    *int16
14 | }
15 | 
16 | type SimpleOovProviderPlugin struct {
17 | 	config   *SimpleOovProviderPluginConfig
18 | 	oovPosId int16
19 | 	leftId   int16
20 | 	rightId  int16
21 | 	cost     int16
22 | }
23 | 
24 | func NewSimpleOovProviderPlugin(config *SimpleOovProviderPluginConfig) *SimpleOovProviderPlugin {
25 | 	if config == nil {
26 | 		config = &SimpleOovProviderPluginConfig{}
27 | 	}
28 | 	return &SimpleOovProviderPlugin{
29 | 		config: config,
30 | 	}
31 | }
32 | 
33 | func (p *SimpleOovProviderPlugin) GetConfigStruct() interface{} {
34 | 	if p.config == nil {
35 | 		p.config = &SimpleOovProviderPluginConfig{}
36 | 	}
37 | 	return p.config
38 | }
39 | 
40 | func (p *SimpleOovProviderPlugin) SetUp(grammar *dictionary.Grammar) error {
41 | 	if p.config.OovPos == nil {
42 | 		return fmt.Errorf("SimpleOovProviderPlugin: oovPOS is not specified")
43 | 	}
44 | 	if p.config.LeftId == nil {
45 | 		return fmt.Errorf("SimpleOovProviderPlugin: leftId is not specified")
46 | 	}
47 | 	if p.config.RightId == nil {
48 | 		return fmt.Errorf("SimpleOovProviderPlugin: rightId is not specified")
49 | 	}
50 | 	if p.config.Cost == nil {
51 | 		return fmt.Errorf("SimpleOovProviderPlugin: cost is not specified")
52 | 	}
53 | 	if len(*(p.config.OovPos)) == 0 {
54 | 		return fmt.Errorf("SimpleOovProviderPlugin: oovPOS is zero length")
55 | 	}
56 | 	oovPosId := grammar.GetPartOfSpeechId(*p.config.OovPos)
57 | 	if oovPosId < 0 {
58 | 		return fmt.Errorf("SimpleOovProviderPlugin: oovPOS is invalid")
59 | 	}
60 | 	p.oovPosId = oovPosId
61 | 	p.leftId = *p.config.LeftId
62 | 	p.rightId = *p.config.RightId
63 | 	p.cost = *p.config.Cost
64 | 	p.config = nil
65 | 	return nil
66 | }
67 | 
68 | func (p *SimpleOovProviderPlugin) ProvideOOV(inputText *InputText, offset int, hasOtherWords bool) ([]*LatticeNode, error) {
69 | 	if !hasOtherWords {
70 | 		node := CreateNodeOfOOV()
71 | 		node.SetParameter(p.leftId, p.rightId, p.cost)
72 | 		length := inputText.GetCodePointsOffsetLength(offset, 1)
73 | 		s := inputText.GetSubstring(offset, offset+length)
74 | 		wi := &dictionary.WordInfo{
75 | 			Surface:        s,
76 | 			HeadwordLength: int16(length),
77 | 			PosId:          p.oovPosId,
78 | 			NormalizedForm: s,
79 | 			DictionaryForm: s,
80 | 			ReadingForm:    "",
81 | 		}
82 | 		node.SetWordInfo(wi)
83 | 		return []*LatticeNode{node}, nil
84 | 	} else {
85 | 		return []*LatticeNode{}, nil
86 | 	}
87 | }
88 | 


--------------------------------------------------------------------------------
/tokenizer.go:
--------------------------------------------------------------------------------
  1 | package gosudachi
  2 | 
  3 | import (
  4 | 	"fmt"
  5 | 	"io"
  6 | 
  7 | 	"github.com/msnoigrs/gosudachi/dictionary"
  8 | )
  9 | 
// JapaneseTokenizer splits text into morphemes using a grammar, a lexicon
// and three kinds of plugins.
//
// inputTextPlugins rewrite the input before analysis, oovProviderPlugins
// supply nodes for positions while the lattice is built, and
// pathRewritePlugins rewrite the best path afterwards. defaultOovProvider
// is the first entry of oovProviderPlugins (nil when there is none) and is
// used as a fallback for positions that got no node at all.
//
// When DumpOutput is non-nil, Tokenize writes the input, the lattice and
// the path before and after rewriting to it.
//
// The lattice is created once and reused by every Tokenize call.
// NOTE(review): because Tokenize mutates this shared lattice, a tokenizer
// is presumably not safe for concurrent use — confirm.
type JapaneseTokenizer struct {
	grammar            *dictionary.Grammar
	lexicon            *dictionary.LexiconSet
	inputTextPlugins   []InputTextPlugin
	oovProviderPlugins []OovProviderPlugin
	pathRewritePlugins []PathRewritePlugin
	defaultOovProvider OovProviderPlugin

	DumpOutput io.Writer
	lattice    *Lattice
}
 21 | 
 22 | func NewJapaneseTokenizer(grammar *dictionary.Grammar, lexicon *dictionary.LexiconSet, inputTextPlugins []InputTextPlugin, oovProviderPlugins []OovProviderPlugin, pathRewritePlugins []PathRewritePlugin) *JapaneseTokenizer {
 23 | 	ret := &JapaneseTokenizer{
 24 | 		grammar:            grammar,
 25 | 		lexicon:            lexicon,
 26 | 		inputTextPlugins:   inputTextPlugins,
 27 | 		oovProviderPlugins: oovProviderPlugins,
 28 | 		pathRewritePlugins: pathRewritePlugins,
 29 | 		lattice:            NewLattice(grammar),
 30 | 	}
 31 | 	if len(oovProviderPlugins) > 0 {
 32 | 		ret.defaultOovProvider = oovProviderPlugins[0]
 33 | 	}
 34 | 	return ret
 35 | }
 36 | 
 37 | func (t *JapaneseTokenizer) Tokenize(mode string, text string) (*MorphemeList, error) {
 38 | 	inputTextBuilder := NewInputTextBuilder(text, t.grammar)
 39 | 
 40 | 	if len(text) == 0 {
 41 | 		return NewMorphemeList(inputTextBuilder.Build(), t.grammar, t.lexicon, []*LatticeNode{}), nil
 42 | 	}
 43 | 
 44 | 	for _, plugin := range t.inputTextPlugins {
 45 | 		err := plugin.Rewrite(inputTextBuilder)
 46 | 		if err != nil {
 47 | 			return nil, err
 48 | 		}
 49 | 	}
 50 | 	input := inputTextBuilder.Build()
 51 | 
 52 | 	if t.DumpOutput != nil {
 53 | 		fmt.Fprintln(t.DumpOutput, "=== Input dump")
 54 | 		fmt.Fprintln(t.DumpOutput, input.GetText())
 55 | 	}
 56 | 
 57 | 	err := t.buildLattice(input)
 58 | 	if err != nil {
 59 | 		return nil, err
 60 | 	}
 61 | 
 62 | 	if t.DumpOutput != nil {
 63 | 		fmt.Fprintln(t.DumpOutput, "=== Lattice dump")
 64 | 		t.lattice.Dump(t.DumpOutput)
 65 | 	}
 66 | 
 67 | 	path, err := t.lattice.GetBestPath()
 68 | 	if err != nil {
 69 | 		return nil, err
 70 | 	}
 71 | 
 72 | 	if t.DumpOutput != nil {
 73 | 		fmt.Fprintln(t.DumpOutput, "=== Before rewriting:")
 74 | 		t.dumpPath(path)
 75 | 	}
 76 | 
 77 | 	for _, plugin := range t.pathRewritePlugins {
 78 | 		err := plugin.Rewrite(input, &path, t.lattice)
 79 | 		if err != nil {
 80 | 			return nil, err
 81 | 		}
 82 | 	}
 83 | 	t.lattice.clear()
 84 | 
 85 | 	if mode != "C" {
 86 | 		path = t.splitPath(path, mode)
 87 | 	}
 88 | 
 89 | 	if t.DumpOutput != nil {
 90 | 		fmt.Fprintln(t.DumpOutput, "=== After rewriting:")
 91 | 		t.dumpPath(path)
 92 | 		fmt.Fprintln(t.DumpOutput, "===")
 93 | 	}
 94 | 
 95 | 	return NewMorphemeList(input, t.grammar, t.lexicon, path), nil
 96 | }
 97 | 
 98 | func (t *JapaneseTokenizer) buildLattice(input *InputText) error {
 99 | 	bytea := input.Bytea
100 | 	t.lattice.resize(len(bytea))
101 | 	for i, _ := range bytea {
102 | 		if !input.CanBow(i) || !t.lattice.HasPreviousNode(i) {
103 | 			continue
104 | 		}
105 | 		iterator := t.lexicon.Lookup(bytea, i)
106 | 		hasWords := iterator.Next()
107 | 		for iterator.Next() {
108 | 			wordId, end := iterator.Get()
109 | 			if err := iterator.Err(); err != nil {
110 | 				break
111 | 			}
112 | 			n := NewLatticeNode(
113 | 				t.lexicon,
114 | 				t.lexicon.GetLeftId(wordId),
115 | 				t.lexicon.GetRightId(wordId),
116 | 				t.lexicon.GetCost(wordId),
117 | 				wordId,
118 | 			)
119 | 			t.lattice.Insert(i, end, n)
120 | 		}
121 | 		if err := iterator.Err(); err != nil {
122 | 			return err
123 | 		}
124 | 
125 | 		// OOV
126 | 		types := input.GetCharCategoryTypes(i)
127 | 		if (types & dictionary.NOOOVBOW) != dictionary.NOOOVBOW {
128 | 			for _, plugin := range t.oovProviderPlugins {
129 | 				nodes, err := GetOOV(plugin, input, i, hasWords)
130 | 				if err != nil {
131 | 					return err
132 | 				}
133 | 				for _, node := range nodes {
134 | 					hasWords = true
135 | 					t.lattice.Insert(node.Begin, node.End, node)
136 | 				}
137 | 			}
138 | 		}
139 | 		if !hasWords && t.defaultOovProvider != nil {
140 | 			nodes, err := GetOOV(t.defaultOovProvider, input, i, hasWords)
141 | 			if err != nil {
142 | 				return err
143 | 			}
144 | 			for _, node := range nodes {
145 | 				hasWords = true
146 | 				t.lattice.Insert(node.Begin, node.End, node)
147 | 			}
148 | 		}
149 | 		if !hasWords {
150 | 			return fmt.Errorf("there is no morpheme at %d", i)
151 | 		}
152 | 	}
153 | 	t.lattice.connectEosNode()
154 | 
155 | 	return nil
156 | }
157 | 
158 | func (t *JapaneseTokenizer) splitPath(path []*LatticeNode, mode string) []*LatticeNode {
159 | 	newPath := []*LatticeNode{}
160 | 	for _, node := range path {
161 | 		wi := node.GetWordInfo()
162 | 		var wids []int32
163 | 		if mode == "A" {
164 | 			wids = wi.AUnitSplit
165 | 		} else {
166 | 			wids = wi.BUnitSplit
167 | 		}
168 | 		if len(wids) == 0 || len(wids) == 1 {
169 | 			newPath = append(newPath, node)
170 | 		} else {
171 | 			offset := node.Begin
172 | 			for _, wid := range wids {
173 | 				n := NewLatticeNode(t.lexicon, 0, 0, 0, wid)
174 | 				n.Begin = offset
175 | 				nwi := n.GetWordInfo()
176 | 				offset += int(nwi.HeadwordLength)
177 | 				n.End = offset
178 | 				newPath = append(newPath, n)
179 | 			}
180 | 		}
181 | 	}
182 | 	return newPath
183 | }
184 | 
185 | func (t *JapaneseTokenizer) dumpPath(path []*LatticeNode) {
186 | 	for i, node := range path {
187 | 		fmt.Fprintf(t.DumpOutput, "%d: %s\n", i, node.String())
188 | 	}
189 | }
190 | 


--------------------------------------------------------------------------------
/userdicbuilder/main.go:
--------------------------------------------------------------------------------
  1 | package main
  2 | 
  3 | import (
  4 | 	"bufio"
  5 | 	"flag"
  6 | 	"fmt"
  7 | 	"os"
  8 | 	"time"
  9 | 
 10 | 	"github.com/msnoigrs/gosudachi/dictionary"
 11 | 	"golang.org/x/text/language"
 12 | 	"golang.org/x/text/message"
 13 | )
 14 | 
 15 | func main() {
 16 | 	flag.Usage = func() {
 17 | 		fmt.Fprintf(os.Stderr, `Usage of %s:
 18 | 	%s -o file -s file [-d description] [-j] file1 [file2 ...]
 19 | 
 20 | Options:
 21 | `, os.Args[0], os.Args[0])
 22 | 		flag.PrintDefaults()
 23 | 	}
 24 | 
 25 | 	var (
 26 | 		outputpath  string
 27 | 		systemdict  string
 28 | 		description string
 29 | 		utf16string bool
 30 | 	)
 31 | 	flag.StringVar(&outputpath, "o", "", "output to file")
 32 | 	flag.StringVar(&systemdict, "s", "", "system dictionary")
 33 | 	flag.StringVar(&description, "d", "", "comment")
 34 | 	flag.BoolVar(&utf16string, "j", false, "use UTF-16 string")
 35 | 
 36 | 	flag.Parse()
 37 | 
 38 | 	if outputpath == "" || systemdict == "" || len(flag.Args()) == 0 {
 39 | 		flag.Usage()
 40 | 		os.Exit(1)
 41 | 	}
 42 | 
 43 | 	dh := dictionary.NewDictionaryHeader(
 44 | 		dictionary.UserDictVersion2,
 45 | 		time.Now().Unix(),
 46 | 		description,
 47 | 	)
 48 | 
 49 | 	hb, err := dh.ToBytes()
 50 | 	if err != nil {
 51 | 		fmt.Fprintln(os.Stderr, err)
 52 | 		os.Exit(1)
 53 | 	}
 54 | 
 55 | 	sdic, err := dictionary.ReadSystemDictionary(systemdict, utf16string)
 56 | 	if err != nil {
 57 | 		fmt.Fprintln(os.Stderr, err)
 58 | 		os.Exit(1)
 59 | 	}
 60 | 	defer sdic.Close()
 61 | 
 62 | 	outputWriter, err := os.OpenFile(outputpath, os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0644)
 63 | 	if err != nil {
 64 | 		fmt.Fprintln(os.Stderr, err)
 65 | 		os.Exit(1)
 66 | 	}
 67 | 	defer outputWriter.Close()
 68 | 
 69 | 	bufout := bufio.NewWriter(outputWriter)
 70 | 	n, err := bufout.Write(hb)
 71 | 	if err != nil {
 72 | 		fmt.Fprintf(os.Stderr, "fail to write header: %s\n", err)
 73 | 		os.Exit(1)
 74 | 	}
 75 | 	err = bufout.Flush()
 76 | 	if err != nil {
 77 | 		fmt.Fprintf(os.Stderr, "fail to write header: %s\n", err)
 78 | 		os.Exit(1)
 79 | 	}
 80 | 
 81 | 	dicbuilder := dictionary.NewDictionaryBuilder(int64(n), sdic.Lexicon, utf16string)
 82 | 	store := dictionary.NewPosTableUser(sdic.Grammar)
 83 | 
 84 | 	fmt.Fprint(os.Stderr, "reading the source file...")
 85 | 	for _, lexiconpath := range flag.Args() {
 86 | 		err := build(dicbuilder, store, lexiconpath)
 87 | 		if err != nil {
 88 | 			fmt.Fprintf(os.Stderr, "%s: %s", err, lexiconpath)
 89 | 			os.Exit(1)
 90 | 		}
 91 | 	}
 92 | 	p := message.NewPrinter(language.English)
 93 | 	p.Fprintf(os.Stderr, " %d words\n", dicbuilder.EntrySize())
 94 | 
 95 | 	err = dicbuilder.WriteGrammarUser(&store.PosTable, outputWriter)
 96 | 	if err != nil {
 97 | 		fmt.Fprintf(os.Stderr, "fail to write grammar: %s\n", err)
 98 | 		os.Exit(1)
 99 | 	}
100 | 
101 | 	err = dicbuilder.WriteLexicon(outputWriter, store)
102 | 	if err != nil {
103 | 		fmt.Fprintf(os.Stderr, "fail to write lexicon: %s\n", err)
104 | 		os.Exit(1)
105 | 	}
106 | }
107 | 
108 | func build(dicbuilder *dictionary.DictionaryBuilder, store dictionary.PosIdStore, lexiconpath string) error {
109 | 	lexiconReader, err := os.OpenFile(lexiconpath, os.O_RDONLY, 0644)
110 | 	if err != nil {
111 | 		return err
112 | 	}
113 | 	defer lexiconReader.Close()
114 | 
115 | 	err = dicbuilder.BuildLexicon(store, lexiconReader)
116 | 	if err != nil {
117 | 		return err
118 | 	}
119 | 	return nil
120 | }
121 | 


--------------------------------------------------------------------------------