├── .gitignore
├── README.txt
├── build.gradle
├── cmecab-java.iml
├── cmecab-java.ipr
├── etc
└── TinySegmenter
│ ├── exporter.js
│ └── tiny_segmenter-0.1.js
├── lib
├── js.jar
└── license
│ ├── LICENCE-TinySegmenter.txt
│ ├── LICENSE-APACHE.txt
│ ├── LICENSE-BridJ.txt
│ └── cpl1.0.txt
└── src
├── main
└── java
│ └── net
│ └── moraleboost
│ ├── io
│ ├── BasicCodePointReader.java
│ ├── CharsetUtil.java
│ ├── CodePointReader.java
│ └── PushbackCodePointReader.java
│ ├── mecab
│ ├── DictionaryInfo.java
│ ├── Lattice.java
│ ├── Model.java
│ ├── Node.java
│ ├── Path.java
│ ├── Tagger.java
│ └── impl
│ │ ├── StandardDictionaryInfo.java
│ │ ├── StandardLattice.java
│ │ ├── StandardModel.java
│ │ ├── StandardNode.java
│ │ ├── StandardPath.java
│ │ └── StandardTagger.java
│ ├── tinysegmenter
│ ├── ModelExporter.java
│ ├── TinySegmenter.java
│ └── TinySegmenterConstants.java
│ └── util
│ └── CSVUtil.java
└── test
└── java
└── net
└── moraleboost
├── io
├── BasicCodePointReaderTest.java
└── PushbackCodePointReaderTest.java
├── mecab
└── impl
│ └── StandardTaggerTest.java
├── tinysegmenter
└── TinySegmenterTest.java
└── util
└── CSVUtilTest.java
/.gitignore:
--------------------------------------------------------------------------------
1 | *.class
2 | *.jar
3 | *.iws
4 | build/
5 | .gradle/
6 |
--------------------------------------------------------------------------------
/README.txt:
--------------------------------------------------------------------------------
1 | cmecab-java
2 |
3 | 1. これは何?
4 |
5 | 日本語形態素解析エンジンMeCab (http://mecab.sourceforge.net/) の
6 | Javaバインディングです。SWIGを用いず、直接MeCabのライブラリを
7 | 呼び出しています。
8 |
9 | おまけとして、以下のものを含んでいます。
10 | * TinySegmenter (http://chasen.org/~taku/software/TinySegmenter/) の
11 | Java移植版、及びそのTokenizer、TokenizerFactory
12 |
13 |
14 | 2. パッケージ構成
15 |
16 | 配布パッケージは、以下のディレクトリ構造を持ちます。
17 |
18 | build - ビルドされたJavaライブラリが書き出されるディレクトリ (build/libs)
19 | lib - ビルドおよびテストに必要なサードパーティライブラリが
20 | 格納されたディレクトリ
21 | src - Pure Javaライブラリのソースコードが格納されたディレクトリ
22 | test - テスト用データが格納されたディレクトリ
23 | etc - その他もろもろ(TinySegmenterのオリジナルソース等)
24 |
25 |
26 | 3. インストール方法
27 |
28 | 配布パッケージのルートディレクトリで、gradle buildを実行してください。
29 |
30 | % gradle build
31 |
32 | ビルドが終了すると、build/libsディレクトリに、cmecab-java-(バージョン番号).jar
33 | という名前のJARファイルが作成されます。これをお好きな場所にコピーして、
34 | Javaのクラスパスを通してください。
35 |
36 | 実行には、別途BridJ (https://github.com/nativelibs4java/BridJ) のjarを
37 | 入手し、クラスパスを通す必要があります。
38 | また、事前に、MeCabのライブラリ(libmecab.dll, libmecab.soなど)に、
39 | OSのパスを通しておく必要があります。
40 |
41 |
42 | 4. 利用方法
43 | http://code.google.com/p/cmecab-java/wiki/HowToUse
44 | をご覧ください。
45 |
46 |
47 | 5. ライセンス
48 |
49 | cmecab-java本体はパブリックドメインとします。
50 |
51 | ただし、TinySegmenter.java、TinySegmenterConstants.javaについては、
52 | TinySegmenter (http://chasen.org/~taku/software/TinySegmenter/)の
53 | 二次的著作物であるため、オリジナルと同じく修正BSDライセンスに
54 | 従います。
55 |
56 | なお、ビルドおよびテストのため、lib, etcディレクトリに以下のソフトウェアを
57 | 同梱しています。これらのソフトウェアは、それぞれのライセンスに従います。
58 |
59 | * TinySegmenter
60 | * 修正BSDライセンス
61 | * lib/license/LICENCE-TinySegmenter.txtをご覧ください
62 |
63 | 6. 連絡先
64 |
65 | MeCab、TinySegmenterに関するご質問は、それぞれのソフトウェアの
66 | メーリングリスト等へどうぞ。
67 |
68 | cmecab-java自体に関するご質問等は、武田光平 k-tak@void.in までどうぞ。
69 |
--------------------------------------------------------------------------------
/build.gradle:
--------------------------------------------------------------------------------
// Build script for the cmecab-java library: Java compilation, IDEA project
// generation and Maven deployment via the 'maven' plugin.
apply plugin: 'java'
apply plugin: 'idea'
apply plugin: 'maven'

// Coordinates and base file name of the produced artifact.
group = 'net.moraleboost.cmecab-java'
archivesBaseName = 'cmecab-java'
version = '2.1.0'

// Source and bytecode level.
sourceCompatibility = 1.6
targetCompatibility = 1.6

// Set UTF-8 explicitly for compilation and Javadoc instead of relying on the
// platform default encoding.
compileJava.options.encoding = 'UTF-8'
compileTestJava.options.encoding = 'UTF-8'
javadoc.options.encoding = 'UTF-8'
javadoc.options.charSet = 'UTF-8'

configurations {
    // Holds the jars handed to the Maven deployer in uploadArchives below.
    deployerJars
}

// NOTE(review): the two custom repositories use plain http; consider https
// once it is confirmed that the hosts serve it.
repositories {
    maven {
        url 'http://maven.restlet.org/'
    }
    maven {
        url 'http://jcenter.bintray.com/'
    }
    mavenCentral()
}

dependencies {
    compile('com.nativelibs4java:bridj:0.7.0')
    // Every jar placed directly in lib/ is added to the compile classpath.
    compile fileTree(dir: 'lib', include: '*.jar')

    testCompile('junit:junit:4.12')

    deployerJars('org.apache.maven.wagon:wagon-http:2.2')
}

// Deployment target and credentials are read from JVM system properties
// (cmecab_java.repositoryUrl / repositoryUser / repositoryPassword), so they
// are not stored in this script.
uploadArchives {
    repositories.mavenDeployer {
        configuration = configurations.deployerJars
        repository(url: System.properties['cmecab_java.repositoryUrl']) {
            authentication(
                userName: System.properties['cmecab_java.repositoryUser'],
                password: System.properties['cmecab_java.repositoryPassword'])
        }
    }
}
50 |
--------------------------------------------------------------------------------
/cmecab-java.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
--------------------------------------------------------------------------------
/cmecab-java.ipr:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
81 |
82 |
83 |
84 | -
85 |
86 |
87 | -
88 |
89 |
90 | -
91 |
92 |
93 | -
94 |
95 |
96 | -
97 |
98 |
99 |
100 |
101 |
102 | -
103 |
104 |
105 |
106 |
107 |
108 | -
109 |
110 |
111 |
112 |
113 |
114 | -
115 |
116 |
117 |
118 |
119 |
120 | -
121 |
122 |
123 |
124 |
125 | -
126 |
127 |
128 |
129 |
130 | -
131 |
132 |
133 |
134 |
135 | -
136 |
137 |
138 |
139 |
140 | -
141 |
142 |
143 |
144 |
145 | -
146 |
147 |
148 |
149 |
150 | -
151 |
152 |
153 | -
154 |
155 |
156 |
157 |
158 | -
159 |
160 |
161 |
162 |
163 | -
164 |
165 |
166 |
167 |
168 | -
169 |
170 |
171 |
172 |
173 | -
174 |
175 |
176 |
177 |
178 | -
179 |
180 |
181 | -
182 |
183 |
184 | -
185 |
186 |
187 | -
188 |
189 |
190 | -
191 |
192 |
193 |
194 |
195 | -
196 |
197 |
198 | -
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 | Android Lint
212 |
213 |
214 | Internationalization issues
215 |
216 |
217 | Internationalization issuesJava
218 |
219 |
220 | Java
221 |
222 |
223 | Python
224 |
225 |
226 | WebSocket issues
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
236 |
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
283 |
284 |
285 |
286 |
287 |
288 |
289 |
290 |
291 |
--------------------------------------------------------------------------------
/etc/TinySegmenter/exporter.js:
--------------------------------------------------------------------------------
1 | /*
2 | **
3 | ** Mar. 24, 2009
4 | **
5 | ** The author disclaims copyright to this source code.
6 | ** In place of a legal notice, here is a blessing:
7 | **
8 | ** May you do good and not evil.
9 | ** May you find forgiveness for yourself and forgive others.
10 | ** May you share freely, never taking more than you give.
11 | **
12 | ** Stolen from SQLite :-)
13 | ** Any feedback is welcome.
14 | ** Kohei TAKETA
15 | **
16 | */
// Segmenter instance whose model tables (properties named "<NAME>__") are read
// by getKeys() and getValues() below.
// NOTE(review): TinySegmenter is not defined in this file; presumably
// tiny_segmenter-0.1.js is evaluated first - confirm against the caller.
var segmenter = new TinySegmenter();
// The model's BIAS__ value, exposed as a global.
var BIAS = segmenter.BIAS__;
19 |
/**
 * Formats the keys of one model table as a brace-delimited, comma-separated
 * list of double-quoted strings, e.g. {"HH","II"}.
 *
 * Keys are emitted verbatim (no escaping) in for-in enumeration order.
 *
 * @param name table name without the trailing "__" (e.g. "BC1")
 * @return the formatted key list; "{}" when nothing is enumerated
 */
function getKeys(name) {
    var table = segmenter[name + "__"];
    var joined = "";

    for (var key in table) {
        // Separate entries with a comma, but do not emit a leading one.
        if (joined.length > 0) {
            joined += ",";
        }
        joined += "\"" + key + "\"";
    }

    return "{" + joined + "}";
}
33 |
/**
 * Formats the values of one model table as a brace-delimited, comma-separated
 * list, e.g. {6,2461}.
 *
 * Values are emitted in for-in enumeration order, i.e. the same order in which
 * getKeys() lists the corresponding keys.
 *
 * @param name table name without the trailing "__" (e.g. "BC1")
 * @return the formatted value list; "{}" when nothing is enumerated
 */
function getValues(name) {
    var table = segmenter[name + "__"];
    var joined = "";

    for (var key in table) {
        // Separate entries with a comma, but do not emit a leading one.
        if (joined.length > 0) {
            joined += ",";
        }
        joined += table[key];
    }

    return "{" + joined + "}";
}
47 |
--------------------------------------------------------------------------------
/etc/TinySegmenter/tiny_segmenter-0.1.js:
--------------------------------------------------------------------------------
1 | // TinySegmenter 0.1 -- Super compact Japanese tokenizer in Javascript
2 | // (c) 2008 Taku Kudo
3 | // TinySegmenter is freely distributable under the terms of a new BSD licence.
4 | // For details, see http://chasen.org/~taku/software/TinySegmenter/LICENCE.txt
5 |
6 | function TinySegmenter() {
7 | var patterns = {
8 | "[一二三四五六七八九十百千万億兆]":"M",
9 | "[一-龠々〆ヵヶ]":"H",
10 | "[ぁ-ん]":"I",
11 | "[ァ-ヴーア-ン゙ー]":"K",
12 | "[a-zA-Za-zA-Z]":"A",
13 | "[0-90-9]":"N"
14 | }
15 | this.chartype_ = [];
16 | for (var i in patterns) {
17 | var regexp = new RegExp;
18 | regexp.compile(i)
19 | this.chartype_.push([regexp, patterns[i]]);
20 | }
21 |
22 | this.BIAS__ = -332
23 | this.BC1__ = {"HH":6,"II":2461,"KH":406,"OH":-1378};
24 | this.BC2__ = {"AA":-3267,"AI":2744,"AN":-878,"HH":-4070,"HM":-1711,"HN":4012,"HO":3761,"IA":1327,"IH":-1184,"II":-1332,"IK":1721,"IO":5492,"KI":3831,"KK":-8741,"MH":-3132,"MK":3334,"OO":-2920};
25 | this.BC3__ = {"HH":996,"HI":626,"HK":-721,"HN":-1307,"HO":-836,"IH":-301,"KK":2762,"MK":1079,"MM":4034,"OA":-1652,"OH":266};
26 | this.BP1__ = {"BB":295,"OB":304,"OO":-125,"UB":352};
27 | this.BP2__ = {"BO":60,"OO":-1762};
28 | this.BQ1__ = {"BHH":1150,"BHM":1521,"BII":-1158,"BIM":886,"BMH":1208,"BNH":449,"BOH":-91,"BOO":-2597,"OHI":451,"OIH":-296,"OKA":1851,"OKH":-1020,"OKK":904,"OOO":2965};
29 | this.BQ2__ = {"BHH":118,"BHI":-1159,"BHM":466,"BIH":-919,"BKK":-1720,"BKO":864,"OHH":-1139,"OHM":-181,"OIH":153,"UHI":-1146};
30 | this.BQ3__ = {"BHH":-792,"BHI":2664,"BII":-299,"BKI":419,"BMH":937,"BMM":8335,"BNN":998,"BOH":775,"OHH":2174,"OHM":439,"OII":280,"OKH":1798,"OKI":-793,"OKO":-2242,"OMH":-2402,"OOO":11699};
31 | this.BQ4__ = {"BHH":-3895,"BIH":3761,"BII":-4654,"BIK":1348,"BKK":-1806,"BMI":-3385,"BOO":-12396,"OAH":926,"OHH":266,"OHK":-2036,"ONN":-973};
32 | this.BW1__ = {",と":660,",同":727,"B1あ":1404,"B1同":542,"、と":660,"、同":727,"」と":1682,"あっ":1505,"いう":1743,"いっ":-2055,"いる":672,"うし":-4817,"うん":665,"から":3472,"がら":600,"こう":-790,"こと":2083,"こん":-1262,"さら":-4143,"さん":4573,"した":2641,"して":1104,"すで":-3399,"そこ":1977,"それ":-871,"たち":1122,"ため":601,"った":3463,"つい":-802,"てい":805,"てき":1249,"でき":1127,"です":3445,"では":844,"とい":-4915,"とみ":1922,"どこ":3887,"ない":5713,"なっ":3015,"など":7379,"なん":-1113,"にし":2468,"には":1498,"にも":1671,"に対":-912,"の一":-501,"の中":741,"ませ":2448,"まで":1711,"まま":2600,"まる":-2155,"やむ":-1947,"よっ":-2565,"れた":2369,"れで":-913,"をし":1860,"を見":731,"亡く":-1886,"京都":2558,"取り":-2784,"大き":-2604,"大阪":1497,"平方":-2314,"引き":-1336,"日本":-195,"本当":-2423,"毎日":-2113,"目指":-724,"B1あ":1404,"B1同":542,"」と":1682};
33 | this.BW2__ = {"..":-11822,"11":-669,"――":-5730,"−−":-13175,"いう":-1609,"うか":2490,"かし":-1350,"かも":-602,"から":-7194,"かれ":4612,"がい":853,"がら":-3198,"きた":1941,"くな":-1597,"こと":-8392,"この":-4193,"させ":4533,"され":13168,"さん":-3977,"しい":-1819,"しか":-545,"した":5078,"して":972,"しな":939,"その":-3744,"たい":-1253,"たた":-662,"ただ":-3857,"たち":-786,"たと":1224,"たは":-939,"った":4589,"って":1647,"っと":-2094,"てい":6144,"てき":3640,"てく":2551,"ては":-3110,"ても":-3065,"でい":2666,"でき":-1528,"でし":-3828,"です":-4761,"でも":-4203,"とい":1890,"とこ":-1746,"とと":-2279,"との":720,"とみ":5168,"とも":-3941,"ない":-2488,"なが":-1313,"など":-6509,"なの":2614,"なん":3099,"にお":-1615,"にし":2748,"にな":2454,"によ":-7236,"に対":-14943,"に従":-4688,"に関":-11388,"のか":2093,"ので":-7059,"のに":-6041,"のの":-6125,"はい":1073,"はが":-1033,"はず":-2532,"ばれ":1813,"まし":-1316,"まで":-6621,"まれ":5409,"めて":-3153,"もい":2230,"もの":-10713,"らか":-944,"らし":-1611,"らに":-1897,"りし":651,"りま":1620,"れた":4270,"れて":849,"れば":4114,"ろう":6067,"われ":7901,"を通":-11877,"んだ":728,"んな":-4115,"一人":602,"一方":-1375,"一日":970,"一部":-1051,"上が":-4479,"会社":-1116,"出て":2163,"分の":-7758,"同党":970,"同日":-913,"大阪":-2471,"委員":-1250,"少な":-1050,"年度":-8669,"年間":-1626,"府県":-2363,"手権":-1982,"新聞":-4066,"日新":-722,"日本":-7068,"日米":3372,"曜日":-601,"朝鮮":-2355,"本人":-2697,"東京":-1543,"然と":-1384,"社会":-1276,"立て":-990,"第に":-1612,"米国":-4268,"11":-669};
34 | this.BW3__ = {"あた":-2194,"あり":719,"ある":3846,"い.":-1185,"い。":-1185,"いい":5308,"いえ":2079,"いく":3029,"いた":2056,"いっ":1883,"いる":5600,"いわ":1527,"うち":1117,"うと":4798,"えと":1454,"か.":2857,"か。":2857,"かけ":-743,"かっ":-4098,"かに":-669,"から":6520,"かり":-2670,"が,":1816,"が、":1816,"がき":-4855,"がけ":-1127,"がっ":-913,"がら":-4977,"がり":-2064,"きた":1645,"けど":1374,"こと":7397,"この":1542,"ころ":-2757,"さい":-714,"さを":976,"し,":1557,"し、":1557,"しい":-3714,"した":3562,"して":1449,"しな":2608,"しま":1200,"す.":-1310,"す。":-1310,"する":6521,"ず,":3426,"ず、":3426,"ずに":841,"そう":428,"た.":8875,"た。":8875,"たい":-594,"たの":812,"たり":-1183,"たる":-853,"だ.":4098,"だ。":4098,"だっ":1004,"った":-4748,"って":300,"てい":6240,"てお":855,"ても":302,"です":1437,"でに":-1482,"では":2295,"とう":-1387,"とし":2266,"との":541,"とも":-3543,"どう":4664,"ない":1796,"なく":-903,"など":2135,"に,":-1021,"に、":-1021,"にし":1771,"にな":1906,"には":2644,"の,":-724,"の、":-724,"の子":-1000,"は,":1337,"は、":1337,"べき":2181,"まし":1113,"ます":6943,"まっ":-1549,"まで":6154,"まれ":-793,"らし":1479,"られ":6820,"るる":3818,"れ,":854,"れ、":854,"れた":1850,"れて":1375,"れば":-3246,"れる":1091,"われ":-605,"んだ":606,"んで":798,"カ月":990,"会議":860,"入り":1232,"大会":2217,"始め":1681,"市":965,"新聞":-5055,"日,":974,"日、":974,"社会":2024,"カ月":990};
35 | this.TC1__ = {"AAA":1093,"HHH":1029,"HHM":580,"HII":998,"HOH":-390,"HOM":-331,"IHI":1169,"IOH":-142,"IOI":-1015,"IOM":467,"MMH":187,"OOI":-1832};
36 | this.TC2__ = {"HHO":2088,"HII":-1023,"HMM":-1154,"IHI":-1965,"KKH":703,"OII":-2649};
37 | this.TC3__ = {"AAA":-294,"HHH":346,"HHI":-341,"HII":-1088,"HIK":731,"HOH":-1486,"IHH":128,"IHI":-3041,"IHO":-1935,"IIH":-825,"IIM":-1035,"IOI":-542,"KHH":-1216,"KKA":491,"KKH":-1217,"KOK":-1009,"MHH":-2694,"MHM":-457,"MHO":123,"MMH":-471,"NNH":-1689,"NNO":662,"OHO":-3393};
38 | this.TC4__ = {"HHH":-203,"HHI":1344,"HHK":365,"HHM":-122,"HHN":182,"HHO":669,"HIH":804,"HII":679,"HOH":446,"IHH":695,"IHO":-2324,"IIH":321,"III":1497,"IIO":656,"IOO":54,"KAK":4845,"KKA":3386,"KKK":3065,"MHH":-405,"MHI":201,"MMH":-241,"MMM":661,"MOM":841};
39 | this.TQ1__ = {"BHHH":-227,"BHHI":316,"BHIH":-132,"BIHH":60,"BIII":1595,"BNHH":-744,"BOHH":225,"BOOO":-908,"OAKK":482,"OHHH":281,"OHIH":249,"OIHI":200,"OIIH":-68};
40 | this.TQ2__ = {"BIHH":-1401,"BIII":-1033,"BKAK":-543,"BOOO":-5591};
41 | this.TQ3__ = {"BHHH":478,"BHHM":-1073,"BHIH":222,"BHII":-504,"BIIH":-116,"BIII":-105,"BMHI":-863,"BMHM":-464,"BOMH":620,"OHHH":346,"OHHI":1729,"OHII":997,"OHMH":481,"OIHH":623,"OIIH":1344,"OKAK":2792,"OKHH":587,"OKKA":679,"OOHH":110,"OOII":-685};
42 | this.TQ4__ = {"BHHH":-721,"BHHM":-3604,"BHII":-966,"BIIH":-607,"BIII":-2181,"OAAA":-2763,"OAKK":180,"OHHH":-294,"OHHI":2446,"OHHO":480,"OHIH":-1573,"OIHH":1935,"OIHI":-493,"OIIH":626,"OIII":-4007,"OKAK":-8156};
43 | this.TW1__ = {"につい":-4681,"東京都":2026};
44 | this.TW2__ = {"ある程":-2049,"いった":-1256,"ころが":-2434,"しょう":3873,"その後":-4430,"だって":-1049,"ていた":1833,"として":-4657,"ともに":-4517,"もので":1882,"一気に":-792,"初めて":-1512,"同時に":-8097,"大きな":-1255,"対して":-2721,"社会党":-3216};
45 | this.TW3__ = {"いただ":-1734,"してい":1314,"として":-4314,"につい":-5483,"にとっ":-5989,"に当た":-6247,"ので,":-727,"ので、":-727,"のもの":-600,"れから":-3752,"十二月":-2287};
46 | this.TW4__ = {"いう.":8576,"いう。":8576,"からな":-2348,"してい":2958,"たが,":1516,"たが、":1516,"ている":1538,"という":1349,"ました":5543,"ません":1097,"ようと":-4258,"よると":5865};
47 | this.UC1__ = {"A":484,"K":93,"M":645,"O":-505};
48 | this.UC2__ = {"A":819,"H":1059,"I":409,"M":3987,"N":5775,"O":646};
49 | this.UC3__ = {"A":-1370,"I":2311};
50 | this.UC4__ = {"A":-2643,"H":1809,"I":-1032,"K":-3450,"M":3565,"N":3876,"O":6646};
51 | this.UC5__ = {"H":313,"I":-1238,"K":-799,"M":539,"O":-831};
52 | this.UC6__ = {"H":-506,"I":-253,"K":87,"M":247,"O":-387};
53 | this.UP1__ = {"O":-214};
54 | this.UP2__ = {"B":69,"O":935};
55 | this.UP3__ = {"B":189};
56 | this.UQ1__ = {"BH":21,"BI":-12,"BK":-99,"BN":142,"BO":-56,"OH":-95,"OI":477,"OK":410,"OO":-2422};
57 | this.UQ2__ = {"BH":216,"BI":113,"OK":1759};
58 | this.UQ3__ = {"BA":-479,"BH":42,"BI":1913,"BK":-7198,"BM":3160,"BN":6427,"BO":14761,"OI":-827,"ON":-3212};
59 | this.UW1__ = {",":156,"、":156,"「":-463,"あ":-941,"う":-127,"が":-553,"き":121,"こ":505,"で":-201,"と":-547,"ど":-123,"に":-789,"の":-185,"は":-847,"も":-466,"や":-470,"よ":182,"ら":-292,"り":208,"れ":169,"を":-446,"ん":-137,"・":-135,"主":-402,"京":-268,"区":-912,"午":871,"国":-460,"大":561,"委":729,"市":-411,"日":-141,"理":361,"生":-408,"県":-386,"都":-718,"「":-463,"・":-135};
60 | this.UW2__ = {",":-829,"、":-829,"〇":892,"「":-645,"」":3145,"あ":-538,"い":505,"う":134,"お":-502,"か":1454,"が":-856,"く":-412,"こ":1141,"さ":878,"ざ":540,"し":1529,"す":-675,"せ":300,"そ":-1011,"た":188,"だ":1837,"つ":-949,"て":-291,"で":-268,"と":-981,"ど":1273,"な":1063,"に":-1764,"の":130,"は":-409,"ひ":-1273,"べ":1261,"ま":600,"も":-1263,"や":-402,"よ":1639,"り":-579,"る":-694,"れ":571,"を":-2516,"ん":2095,"ア":-587,"カ":306,"キ":568,"ッ":831,"三":-758,"不":-2150,"世":-302,"中":-968,"主":-861,"事":492,"人":-123,"会":978,"保":362,"入":548,"初":-3025,"副":-1566,"北":-3414,"区":-422,"大":-1769,"天":-865,"太":-483,"子":-1519,"学":760,"実":1023,"小":-2009,"市":-813,"年":-1060,"強":1067,"手":-1519,"揺":-1033,"政":1522,"文":-1355,"新":-1682,"日":-1815,"明":-1462,"最":-630,"朝":-1843,"本":-1650,"東":-931,"果":-665,"次":-2378,"民":-180,"気":-1740,"理":752,"発":529,"目":-1584,"相":-242,"県":-1165,"立":-763,"第":810,"米":509,"自":-1353,"行":838,"西":-744,"見":-3874,"調":1010,"議":1198,"込":3041,"開":1758,"間":-1257,"「":-645,"」":3145,"ッ":831,"ア":-587,"カ":306,"キ":568};
61 | this.UW3__ = {",":4889,"1":-800,"−":-1723,"、":4889,"々":-2311,"〇":5827,"」":2670,"〓":-3573,"あ":-2696,"い":1006,"う":2342,"え":1983,"お":-4864,"か":-1163,"が":3271,"く":1004,"け":388,"げ":401,"こ":-3552,"ご":-3116,"さ":-1058,"し":-395,"す":584,"せ":3685,"そ":-5228,"た":842,"ち":-521,"っ":-1444,"つ":-1081,"て":6167,"で":2318,"と":1691,"ど":-899,"な":-2788,"に":2745,"の":4056,"は":4555,"ひ":-2171,"ふ":-1798,"へ":1199,"ほ":-5516,"ま":-4384,"み":-120,"め":1205,"も":2323,"や":-788,"よ":-202,"ら":727,"り":649,"る":5905,"れ":2773,"わ":-1207,"を":6620,"ん":-518,"ア":551,"グ":1319,"ス":874,"ッ":-1350,"ト":521,"ム":1109,"ル":1591,"ロ":2201,"ン":278,"・":-3794,"一":-1619,"下":-1759,"世":-2087,"両":3815,"中":653,"主":-758,"予":-1193,"二":974,"人":2742,"今":792,"他":1889,"以":-1368,"低":811,"何":4265,"作":-361,"保":-2439,"元":4858,"党":3593,"全":1574,"公":-3030,"六":755,"共":-1880,"円":5807,"再":3095,"分":457,"初":2475,"別":1129,"前":2286,"副":4437,"力":365,"動":-949,"務":-1872,"化":1327,"北":-1038,"区":4646,"千":-2309,"午":-783,"協":-1006,"口":483,"右":1233,"各":3588,"合":-241,"同":3906,"和":-837,"員":4513,"国":642,"型":1389,"場":1219,"外":-241,"妻":2016,"学":-1356,"安":-423,"実":-1008,"家":1078,"小":-513,"少":-3102,"州":1155,"市":3197,"平":-1804,"年":2416,"広":-1030,"府":1605,"度":1452,"建":-2352,"当":-3885,"得":1905,"思":-1291,"性":1822,"戸":-488,"指":-3973,"政":-2013,"教":-1479,"数":3222,"文":-1489,"新":1764,"日":2099,"旧":5792,"昨":-661,"時":-1248,"曜":-951,"最":-937,"月":4125,"期":360,"李":3094,"村":364,"東":-805,"核":5156,"森":2438,"業":484,"氏":2613,"民":-1694,"決":-1073,"法":1868,"海":-495,"無":979,"物":461,"特":-3850,"生":-273,"用":914,"町":1215,"的":7313,"直":-1835,"省":792,"県":6293,"知":-1528,"私":4231,"税":401,"立":-960,"第":1201,"米":7767,"系":3066,"約":3663,"級":1384,"統":-4229,"総":1163,"線":1255,"者":6457,"能":725,"自":-2869,"英":785,"見":1044,"調":-562,"財":-733,"費":1777,"車":1835,"軍":1375,"込":-1504,"通":-1136,"選":-681,"郎":1026,"郡":4404,"部":1200,"金":2163,"長":421,"開":-1432,"間":1302,"関":-1282,"雨":2009,"電":-1045,"非":2066,"駅":1620,"1":-800,"」":2670,"・":-3794,"ッ":-1350,"ア":551,"グ":1319,"ス":874,"ト":521,"ム":1109,"ル":1591,"ロ":2201,"ン":278};
62 | this.UW4__ = {",":3930,".":3508,"―":-4841,"、":3930,"。":3508,"〇":4999,"「":1895,"」":3798,"〓":-5156,"あ":4752,"い":-3435,"う":-640,"え":-2514,"お":2405,"か":530,"が":6006,"き":-4482,"ぎ":-3821,"く":-3788,"け":-4376,"げ":-4734,"こ":2255,"ご":1979,"さ":2864,"し":-843,"じ":-2506,"す":-731,"ず":1251,"せ":181,"そ":4091,"た":5034,"だ":5408,"ち":-3654,"っ":-5882,"つ":-1659,"て":3994,"で":7410,"と":4547,"な":5433,"に":6499,"ぬ":1853,"ね":1413,"の":7396,"は":8578,"ば":1940,"ひ":4249,"び":-4134,"ふ":1345,"へ":6665,"べ":-744,"ほ":1464,"ま":1051,"み":-2082,"む":-882,"め":-5046,"も":4169,"ゃ":-2666,"や":2795,"ょ":-1544,"よ":3351,"ら":-2922,"り":-9726,"る":-14896,"れ":-2613,"ろ":-4570,"わ":-1783,"を":13150,"ん":-2352,"カ":2145,"コ":1789,"セ":1287,"ッ":-724,"ト":-403,"メ":-1635,"ラ":-881,"リ":-541,"ル":-856,"ン":-3637,"・":-4371,"ー":-11870,"一":-2069,"中":2210,"予":782,"事":-190,"井":-1768,"人":1036,"以":544,"会":950,"体":-1286,"作":530,"側":4292,"先":601,"党":-2006,"共":-1212,"内":584,"円":788,"初":1347,"前":1623,"副":3879,"力":-302,"動":-740,"務":-2715,"化":776,"区":4517,"協":1013,"参":1555,"合":-1834,"和":-681,"員":-910,"器":-851,"回":1500,"国":-619,"園":-1200,"地":866,"場":-1410,"塁":-2094,"士":-1413,"多":1067,"大":571,"子":-4802,"学":-1397,"定":-1057,"寺":-809,"小":1910,"屋":-1328,"山":-1500,"島":-2056,"川":-2667,"市":2771,"年":374,"庁":-4556,"後":456,"性":553,"感":916,"所":-1566,"支":856,"改":787,"政":2182,"教":704,"文":522,"方":-856,"日":1798,"時":1829,"最":845,"月":-9066,"木":-485,"来":-442,"校":-360,"業":-1043,"氏":5388,"民":-2716,"気":-910,"沢":-939,"済":-543,"物":-735,"率":672,"球":-1267,"生":-1286,"産":-1101,"田":-2900,"町":1826,"的":2586,"目":922,"省":-3485,"県":2997,"空":-867,"立":-2112,"第":788,"米":2937,"系":786,"約":2171,"経":1146,"統":-1169,"総":940,"線":-994,"署":749,"者":2145,"能":-730,"般":-852,"行":-792,"規":792,"警":-1184,"議":-244,"谷":-1000,"賞":730,"車":-1481,"軍":1158,"輪":-1433,"込":-3370,"近":929,"道":-1291,"選":2596,"郎":-4866,"都":1192,"野":-1100,"銀":-2213,"長":357,"間":-2344,"院":-2297,"際":-2604,"電":-878,"領":-1659,"題":-792,"館":-1984,"首":1749,"高":2120,"「":1895,"」":3798,"・":-4371,"ッ":-724,"ー":-11870,"カ":2145,"コ":1789,"セ":1287,"ト":-40
3,"メ":-1635,"ラ":-881,"リ":-541,"ル":-856,"ン":-3637};
63 | this.UW5__ = {",":465,".":-299,"1":-514,"E2":-32768,"]":-2762,"、":465,"。":-299,"「":363,"あ":1655,"い":331,"う":-503,"え":1199,"お":527,"か":647,"が":-421,"き":1624,"ぎ":1971,"く":312,"げ":-983,"さ":-1537,"し":-1371,"す":-852,"だ":-1186,"ち":1093,"っ":52,"つ":921,"て":-18,"で":-850,"と":-127,"ど":1682,"な":-787,"に":-1224,"の":-635,"は":-578,"べ":1001,"み":502,"め":865,"ゃ":3350,"ょ":854,"り":-208,"る":429,"れ":504,"わ":419,"を":-1264,"ん":327,"イ":241,"ル":451,"ン":-343,"中":-871,"京":722,"会":-1153,"党":-654,"務":3519,"区":-901,"告":848,"員":2104,"大":-1296,"学":-548,"定":1785,"嵐":-1304,"市":-2991,"席":921,"年":1763,"思":872,"所":-814,"挙":1618,"新":-1682,"日":218,"月":-4353,"査":932,"格":1356,"機":-1508,"氏":-1347,"田":240,"町":-3912,"的":-3149,"相":1319,"省":-1052,"県":-4003,"研":-997,"社":-278,"空":-813,"統":1955,"者":-2233,"表":663,"語":-1073,"議":1219,"選":-1018,"郎":-368,"長":786,"間":1191,"題":2368,"館":-689,"1":-514,"E2":-32768,"「":363,"イ":241,"ル":451,"ン":-343};
64 | this.UW6__ = {",":227,".":808,"1":-270,"E1":306,"、":227,"。":808,"あ":-307,"う":189,"か":241,"が":-73,"く":-121,"こ":-200,"じ":1782,"す":383,"た":-428,"っ":573,"て":-1014,"で":101,"と":-105,"な":-253,"に":-149,"の":-417,"は":-236,"も":-206,"り":187,"る":-135,"を":195,"ル":-673,"ン":-496,"一":-277,"中":201,"件":-800,"会":624,"前":302,"区":1792,"員":-1212,"委":798,"学":-960,"市":887,"広":-695,"後":535,"業":-697,"相":753,"社":-507,"福":974,"空":-822,"者":1811,"連":463,"郎":1082,"1":-270,"E1":306,"ル":-673,"ン":-496};
65 |
66 | return this;
67 | }
68 |
/**
 * Classifies a single character into one of the model's character types.
 *
 * The [regexp, type] pairs in this.chartype_ are tried in order and the type
 * of the first matching pattern is returned.
 *
 * @param str the character to classify
 * @return the matching type code, or "O" when no pattern matches
 */
TinySegmenter.prototype.ctype_ = function(str) {
    var types = this.chartype_;
    // Indexed loop rather than for-in: for-in would also visit enumerable
    // properties that other scripts may have added to Array.prototype.
    for (var i = 0; i < types.length; ++i) {
        if (types[i][0].test(str)) {
            return types[i][1];
        }
    }
    return "O";
}
77 |
/**
 * Normalises a table lookup result to a score.
 *
 * @param v the value found in a model table (undefined when the key is absent)
 * @return v when it is truthy, otherwise 0
 */
TinySegmenter.prototype.ts_ = function(v) {
    return v ? v : 0;
}
82 |
/**
 * Splits a string into words using the model tables of this segmenter.
 *
 * The input is split into UTF-16 code units and padded with three sentinel
 * tokens on each side. For every position after the first character a score
 * is summed from the feature tables over a six-token window (three tokens
 * before the position, three from it onwards), their character types, and
 * the three previous boundary decisions. A positive score starts a new word
 * at that position.
 *
 * @param input the text to segment
 * @return array of words; an empty array for null, undefined or ""
 */
TinySegmenter.prototype.segment = function(input) {
    // "== null" also matches undefined.
    if (input == null || input == "") {
        return [];
    }
    var result = [];
    var seg = ["B3", "B2", "B1"];
    var ctype = ["O", "O", "O"];
    var o = input.split("");
    var i;
    for (i = 0; i < o.length; ++i) {
        seg.push(o[i]);
        ctype.push(this.ctype_(o[i]));
    }
    seg.push("E1");
    seg.push("E2");
    seg.push("E3");
    ctype.push("O");
    ctype.push("O");
    ctype.push("O");

    // seg[3] is the first real character; it always opens the first word.
    var word = seg[3];
    // Previous three decisions, oldest first: "U" = none yet, "B" = boundary,
    // "O" = no boundary.
    var p1 = "U";
    var p2 = "U";
    var p3 = "U";
    for (i = 4; i < seg.length - 3; ++i) {
        var score = this.BIAS__;
        var w1 = seg[i-3];
        var w2 = seg[i-2];
        var w3 = seg[i-1];
        var w4 = seg[i];
        var w5 = seg[i+1];
        var w6 = seg[i+2];
        var c1 = ctype[i-3];
        var c2 = ctype[i-2];
        var c3 = ctype[i-1];
        var c4 = ctype[i];
        var c5 = ctype[i+1];
        var c6 = ctype[i+2];
        // Previous-decision features.
        score += this.ts_(this.UP1__[p1]);
        score += this.ts_(this.UP2__[p2]);
        score += this.ts_(this.UP3__[p3]);
        score += this.ts_(this.BP1__[p1 + p2]);
        score += this.ts_(this.BP2__[p2 + p3]);
        // Character unigram / bigram / trigram features.
        score += this.ts_(this.UW1__[w1]);
        score += this.ts_(this.UW2__[w2]);
        score += this.ts_(this.UW3__[w3]);
        score += this.ts_(this.UW4__[w4]);
        score += this.ts_(this.UW5__[w5]);
        score += this.ts_(this.UW6__[w6]);
        score += this.ts_(this.BW1__[w2 + w3]);
        score += this.ts_(this.BW2__[w3 + w4]);
        score += this.ts_(this.BW3__[w4 + w5]);
        score += this.ts_(this.TW1__[w1 + w2 + w3]);
        score += this.ts_(this.TW2__[w2 + w3 + w4]);
        score += this.ts_(this.TW3__[w3 + w4 + w5]);
        score += this.ts_(this.TW4__[w4 + w5 + w6]);
        // Character-type unigram / bigram / trigram features.
        score += this.ts_(this.UC1__[c1]);
        score += this.ts_(this.UC2__[c2]);
        score += this.ts_(this.UC3__[c3]);
        score += this.ts_(this.UC4__[c4]);
        score += this.ts_(this.UC5__[c5]);
        score += this.ts_(this.UC6__[c6]);
        score += this.ts_(this.BC1__[c2 + c3]);
        score += this.ts_(this.BC2__[c3 + c4]);
        score += this.ts_(this.BC3__[c4 + c5]);
        score += this.ts_(this.TC1__[c1 + c2 + c3]);
        score += this.ts_(this.TC2__[c2 + c3 + c4]);
        score += this.ts_(this.TC3__[c3 + c4 + c5]);
        score += this.ts_(this.TC4__[c4 + c5 + c6]);
        // Features combining a previous decision with character types.
        score += this.ts_(this.UQ1__[p1 + c1]);
        score += this.ts_(this.UQ2__[p2 + c2]);
        // The (p3, c3) feature belongs to the UQ3__ table, following the
        // UQ1__/(p1, c1) and UQ2__/(p2, c2) pairing above.
        score += this.ts_(this.UQ3__[p3 + c3]);
        score += this.ts_(this.BQ1__[p2 + c2 + c3]);
        score += this.ts_(this.BQ2__[p2 + c3 + c4]);
        score += this.ts_(this.BQ3__[p3 + c2 + c3]);
        score += this.ts_(this.BQ4__[p3 + c3 + c4]);
        score += this.ts_(this.TQ1__[p2 + c1 + c2 + c3]);
        score += this.ts_(this.TQ2__[p2 + c2 + c3 + c4]);
        score += this.ts_(this.TQ3__[p3 + c1 + c2 + c3]);
        score += this.ts_(this.TQ4__[p3 + c2 + c3 + c4]);

        var p = "O";
        if (score > 0) {
            // Boundary before seg[i]: emit the finished word and start anew.
            result.push(word);
            word = "";
            p = "B";
        }
        // Shift the decision history by one position.
        p1 = p2;
        p2 = p3;
        p3 = p;
        word += seg[i];
    }
    result.push(word);

    return result;
}
177 |
--------------------------------------------------------------------------------
/lib/js.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/takscape/cmecab-java/2d8c160bfa884d926387e8e7b9f11b908157511b/lib/js.jar
--------------------------------------------------------------------------------
/lib/license/LICENCE-TinySegmenter.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2008, Taku Kudo
2 |
3 | All rights reserved.
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | * Redistributions of source code must retain the above copyright notice,
9 | this list of conditions and the following disclaimer.
10 | * Redistributions in binary form must reproduce the above copyright
11 | notice, this list of conditions and the following disclaimer in the
12 | documentation and/or other materials provided with the distribution.
13 | * Neither the name of the <ORGANIZATION> nor the names of its
14 | contributors may be used to endorse or promote products derived from this
15 | software without specific prior written permission.
16 |
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
21 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
22 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
23 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
24 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
25 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
26 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
28 |
--------------------------------------------------------------------------------
/lib/license/LICENSE-APACHE.txt:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
--------------------------------------------------------------------------------
/lib/license/LICENSE-BridJ.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2010-2012, Olivier Chafik
2 | All rights reserved.
3 | Redistribution and use in source and binary forms, with or without
4 | modification, are permitted provided that the following conditions are met:
5 |
6 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
7 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
8 | * Neither the name of Olivier Chafik nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
9 |
10 | THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY
11 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
12 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
13 | DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY
14 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
15 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
16 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
17 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
18 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
19 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
20 |
--------------------------------------------------------------------------------
/lib/license/cpl1.0.txt:
--------------------------------------------------------------------------------
1 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS COMMON PUBLIC
2 | LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM
3 | CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT.
4 |
5 | 1. DEFINITIONS
6 |
7 | "Contribution" means:
8 |
9 | a) in the case of the initial Contributor, the initial code and
10 | documentation distributed under this Agreement, and
11 |
12 | b) in the case of each subsequent Contributor:
13 |
14 | i) changes to the Program, and
15 |
16 | ii) additions to the Program;
17 |
18 | where such changes and/or additions to the Program originate from and are
19 | distributed by that particular Contributor. A Contribution 'originates' from a
20 | Contributor if it was added to the Program by such Contributor itself or anyone
21 | acting on such Contributor's behalf. Contributions do not include additions to
22 | the Program which: (i) are separate modules of software distributed in
23 | conjunction with the Program under their own license agreement, and (ii) are not
24 | derivative works of the Program.
25 |
26 | "Contributor" means any person or entity that distributes the Program.
27 |
28 | "Licensed Patents " mean patent claims licensable by a Contributor which are
29 | necessarily infringed by the use or sale of its Contribution alone or when
30 | combined with the Program.
31 |
32 | "Program" means the Contributions distributed in accordance with this Agreement.
33 |
34 | "Recipient" means anyone who receives the Program under this Agreement,
35 | including all Contributors.
36 |
37 | 2. GRANT OF RIGHTS
38 |
39 | a) Subject to the terms of this Agreement, each Contributor hereby grants
40 | Recipient a non-exclusive, worldwide, royalty-free copyright license to
41 | reproduce, prepare derivative works of, publicly display, publicly perform,
42 | distribute and sublicense the Contribution of such Contributor, if any, and such
43 | derivative works, in source code and object code form.
44 |
45 | b) Subject to the terms of this Agreement, each Contributor hereby grants
46 | Recipient a non-exclusive, worldwide, royalty-free patent license under Licensed
47 | Patents to make, use, sell, offer to sell, import and otherwise transfer the
48 | Contribution of such Contributor, if any, in source code and object code form.
49 | This patent license shall apply to the combination of the Contribution and the
50 | Program if, at the time the Contribution is added by the Contributor, such
51 | addition of the Contribution causes such combination to be covered by the
52 | Licensed Patents. The patent license shall not apply to any other combinations
53 | which include the Contribution. No hardware per se is licensed hereunder.
54 |
55 | c) Recipient understands that although each Contributor grants the licenses
56 | to its Contributions set forth herein, no assurances are provided by any
57 | Contributor that the Program does not infringe the patent or other intellectual
58 | property rights of any other entity. Each Contributor disclaims any liability to
59 | Recipient for claims brought by any other entity based on infringement of
60 | intellectual property rights or otherwise. As a condition to exercising the
61 | rights and licenses granted hereunder, each Recipient hereby assumes sole
62 | responsibility to secure any other intellectual property rights needed, if any.
63 | For example, if a third party patent license is required to allow Recipient to
64 | distribute the Program, it is Recipient's responsibility to acquire that license
65 | before distributing the Program.
66 |
67 | d) Each Contributor represents that to its knowledge it has sufficient
68 | copyright rights in its Contribution, if any, to grant the copyright license set
69 | forth in this Agreement.
70 |
71 | 3. REQUIREMENTS
72 |
73 | A Contributor may choose to distribute the Program in object code form under its
74 | own license agreement, provided that:
75 |
76 | a) it complies with the terms and conditions of this Agreement; and
77 |
78 | b) its license agreement:
79 |
80 | i) effectively disclaims on behalf of all Contributors all warranties and
81 | conditions, express and implied, including warranties or conditions of title and
82 | non-infringement, and implied warranties or conditions of merchantability and
83 | fitness for a particular purpose;
84 |
85 | ii) effectively excludes on behalf of all Contributors all liability for
86 | damages, including direct, indirect, special, incidental and consequential
87 | damages, such as lost profits;
88 |
89 | iii) states that any provisions which differ from this Agreement are offered
90 | by that Contributor alone and not by any other party; and
91 |
92 | iv) states that source code for the Program is available from such
93 | Contributor, and informs licensees how to obtain it in a reasonable manner on or
94 | through a medium customarily used for software exchange.
95 |
96 | When the Program is made available in source code form:
97 |
98 | a) it must be made available under this Agreement; and
99 |
100 | b) a copy of this Agreement must be included with each copy of the Program.
101 |
102 | Contributors may not remove or alter any copyright notices contained within the
103 | Program.
104 |
105 | Each Contributor must identify itself as the originator of its Contribution, if
106 | any, in a manner that reasonably allows subsequent Recipients to identify the
107 | originator of the Contribution.
108 |
109 | 4. COMMERCIAL DISTRIBUTION
110 |
111 | Commercial distributors of software may accept certain responsibilities with
112 | respect to end users, business partners and the like. While this license is
113 | intended to facilitate the commercial use of the Program, the Contributor who
114 | includes the Program in a commercial product offering should do so in a manner
115 | which does not create potential liability for other Contributors. Therefore, if
116 | a Contributor includes the Program in a commercial product offering, such
117 | Contributor ("Commercial Contributor") hereby agrees to defend and indemnify
118 | every other Contributor ("Indemnified Contributor") against any losses, damages
119 | and costs (collectively "Losses") arising from claims, lawsuits and other legal
120 | actions brought by a third party against the Indemnified Contributor to the
121 | extent caused by the acts or omissions of such Commercial Contributor in
122 | connection with its distribution of the Program in a commercial product
123 | offering. The obligations in this section do not apply to any claims or Losses
124 | relating to any actual or alleged intellectual property infringement. In order
125 | to qualify, an Indemnified Contributor must: a) promptly notify the Commercial
126 | Contributor in writing of such claim, and b) allow the Commercial Contributor to
127 | control, and cooperate with the Commercial Contributor in, the defense and any
128 | related settlement negotiations. The Indemnified Contributor may participate in
129 | any such claim at its own expense.
130 |
131 | For example, a Contributor might include the Program in a commercial product
132 | offering, Product X. That Contributor is then a Commercial Contributor. If that
133 | Commercial Contributor then makes performance claims, or offers warranties
134 | related to Product X, those performance claims and warranties are such
135 | Commercial Contributor's responsibility alone. Under this section, the
136 | Commercial Contributor would have to defend claims against the other
137 | Contributors related to those performance claims and warranties, and if a court
138 | requires any other Contributor to pay any damages as a result, the Commercial
139 | Contributor must pay those damages.
140 |
141 | 5. NO WARRANTY
142 |
143 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON AN
144 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR
145 | IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF TITLE,
146 | NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Each
147 | Recipient is solely responsible for determining the appropriateness of using and
148 | distributing the Program and assumes all risks associated with its exercise of
149 | rights under this Agreement, including but not limited to the risks and costs of
150 | program errors, compliance with applicable laws, damage to or loss of data,
151 | programs or equipment, and unavailability or interruption of operations.
152 |
153 | 6. DISCLAIMER OF LIABILITY
154 |
155 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY
156 | CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL,
157 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST
158 | PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
159 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
160 | OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE EXERCISE OF ANY RIGHTS
161 | GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
162 |
163 | 7. GENERAL
164 |
165 | If any provision of this Agreement is invalid or unenforceable under applicable
166 | law, it shall not affect the validity or enforceability of the remainder of the
167 | terms of this Agreement, and without further action by the parties hereto, such
168 | provision shall be reformed to the minimum extent necessary to make such
169 | provision valid and enforceable.
170 |
171 | If Recipient institutes patent litigation against a Contributor with respect to
172 | a patent applicable to software (including a cross-claim or counterclaim in a
173 | lawsuit), then any patent licenses granted by that Contributor to such Recipient
174 | under this Agreement shall terminate as of the date such litigation is filed. In
175 | addition, if Recipient institutes patent litigation against any entity
176 | (including a cross-claim or counterclaim in a lawsuit) alleging that the Program
177 | itself (excluding combinations of the Program with other software or hardware)
178 | infringes such Recipient's patent(s), then such Recipient's rights granted under
179 | Section 2(b) shall terminate as of the date such litigation is filed.
180 |
181 | All Recipient's rights under this Agreement shall terminate if it fails to
182 | comply with any of the material terms or conditions of this Agreement and does
183 | not cure such failure in a reasonable period of time after becoming aware of
184 | such noncompliance. If all Recipient's rights under this Agreement terminate,
185 | Recipient agrees to cease use and distribution of the Program as soon as
186 | reasonably practicable. However, Recipient's obligations under this Agreement
187 | and any licenses granted by Recipient relating to the Program shall continue and
188 | survive.
189 |
190 | Everyone is permitted to copy and distribute copies of this Agreement, but in
191 | order to avoid inconsistency the Agreement is copyrighted and may only be
192 | modified in the following manner. The Agreement Steward reserves the right to
193 | publish new versions (including revisions) of this Agreement from time to time.
194 | No one other than the Agreement Steward has the right to modify this Agreement.
195 | IBM is the initial Agreement Steward. IBM may assign the responsibility to serve
196 | as the Agreement Steward to a suitable separate entity. Each new version of the
197 | Agreement will be given a distinguishing version number. The Program (including
198 | Contributions) may always be distributed subject to the version of the Agreement
199 | under which it was received. In addition, after a new version of the Agreement
200 | is published, Contributor may elect to distribute the Program (including its
201 | Contributions) under the new version. Except as expressly stated in Sections
202 | 2(a) and 2(b) above, Recipient receives no rights or licenses to the
203 | intellectual property of any Contributor under this Agreement, whether
204 | expressly, by implication, estoppel or otherwise. All rights in the Program not
205 | expressly granted under this Agreement are reserved.
206 |
207 | This Agreement is governed by the laws of the State of New York and the
208 | intellectual property laws of the United States of America. No party to this
209 | Agreement will bring a legal action under this Agreement more than one year
210 | after the cause of action arose. Each party waives its rights to a jury trial in
211 | any resulting litigation.
212 |
--------------------------------------------------------------------------------
/src/main/java/net/moraleboost/io/BasicCodePointReader.java:
--------------------------------------------------------------------------------
1 | /*
2 | **
3 | ** Feb. 1, 2009
4 | **
5 | ** The author disclaims copyright to this source code.
6 | ** In place of a legal notice, here is a blessing:
7 | **
8 | ** May you do good and not evil.
9 | ** May you find forgiveness for yourself and forgive others.
10 | ** May you share freely, never taking more than you give.
11 | **
12 | ** Stolen from SQLite :-)
13 | ** Any feedback is welcome.
14 | ** Kohei TAKETA
15 | **
16 | */
17 | package net.moraleboost.io;
18 |
19 | import java.io.IOException;
20 | import java.io.PushbackReader;
21 | import java.io.Reader;
22 |
23 | /**
24 | * Readerをラップして、ひとつずつUnicodeコードポイントを
25 | * 取得するためのデコレータクラス。サロゲートペアを正しく認識する。
26 | *
27 | * 不正なサロゲートペアは、{@link #getAlternationCodePoint()}で得られる
28 | * 代替コードポイントに置換される。
29 | */
30 | public class BasicCodePointReader implements CodePointReader
31 | {
32 | /**
33 | * 不正なサロゲートペアを置換する文字の既定値。
34 | */
35 | public static final int DEFAULT_ALTERNATION_CODEPOINT = '〓';
36 |
37 | private PushbackReader reader;
38 | private long position;
39 | private int alternationCodePoint;
40 | private boolean eos;
41 |
42 | /**
43 | * コードポイントイテレータを構築する。
44 | *
45 | * @param reader
46 | * ソースとなるcharのシーケンス
47 | */
48 | public BasicCodePointReader(Reader reader)
49 | {
50 | this.reader = new PushbackReader(reader, 1);
51 | this.alternationCodePoint = DEFAULT_ALTERNATION_CODEPOINT;
52 | }
53 |
54 | public void setAlternationCodePoint(int cp)
55 | {
56 | this.alternationCodePoint = cp;
57 | }
58 |
59 | public int getAlternationCodePoint()
60 | {
61 | return alternationCodePoint;
62 | }
63 |
64 | public long getPosition()
65 | {
66 | return position;
67 | }
68 |
69 | public int read() throws IOException
70 | {
71 | int ci;
72 | char c, c2;
73 |
74 | if (eos) {
75 | return -1;
76 | }
77 |
78 | ci = reader.read();
79 | ++position;
80 |
81 | if (ci < 0) {
82 | // end of character stream
83 | eos = true;
84 | return -1;
85 | } else {
86 | c = (char)ci;
87 | }
88 |
89 | if (Character.isHighSurrogate(c)) {
90 | // 次の文字を検査
91 | ci = reader.read();
92 | ++position;
93 | if (ci < 0) {
94 | // シーケンスがhigh surrogateで終わっている。
95 | // 代替文字を返すと共に、EOSフラグをONにする。
96 | eos = true;
97 | --position;
98 | return alternationCodePoint;
99 | }
100 |
101 | c2 = (char)ci;
102 | if (Character.isLowSurrogate(c2)) {
103 | // サロゲートペアをコードポイントに変換して返す。
104 | return Character.toCodePoint(c, c2);
105 | } else {
106 | // high surrogateに続くcharが、low surrogateでない。
107 | // c2をプッシュバックして代替文字を返す。
108 | reader.unread(c2);
109 | --position;
110 | return alternationCodePoint;
111 | }
112 | } else if (Character.isLowSurrogate(c)) {
113 | // 単独で存在するlow surrogateを発見。
114 | // 代替文字を返す。
115 | return alternationCodePoint;
116 | } else {
117 | // 基本文字。そのまま返す。
118 | return c;
119 | }
120 | }
121 |
122 | public void reset()
123 | {
124 | position = 0;
125 | eos = false;
126 | }
127 | }
128 |
--------------------------------------------------------------------------------
/src/main/java/net/moraleboost/io/CharsetUtil.java:
--------------------------------------------------------------------------------
1 | /*
2 | **
3 | ** Mar. 1, 2008
4 | **
5 | ** The author disclaims copyright to this source code.
6 | ** In place of a legal notice, here is a blessing:
7 | **
8 | ** May you do good and not evil.
9 | ** May you find forgiveness for yourself and forgive others.
10 | ** May you share freely, never taking more than you give.
11 | **
12 | ** Stolen from SQLite :-)
13 | ** Any feedback is welcome.
14 | ** Kohei TAKETA
15 | **
16 | */
17 | package net.moraleboost.io;
18 |
19 | import java.nio.ByteBuffer;
20 | import java.nio.CharBuffer;
21 | import java.nio.charset.*;
22 |
/**
 * Helper routines for character set conversion.
 *
 * @author takedaku
 */
public abstract class CharsetUtil
{
    /**
     * Creates an encoder that converts Unicode text to the named charset.
     *
     * @param charset
     *            name of the target charset
     * @param malformedInputAction
     *            what to do on malformed input
     * @param unmappableCharacterAction
     *            what to do with characters the target charset cannot
     *            represent
     * @return the configured encoder
     * @throws java.nio.charset.IllegalCharsetNameException
     *             if the charset name is illegal
     * @throws UnsupportedCharsetException
     *             if the JVM does not support the charset, or the charset
     *             does not support encoding
     * @throws IllegalArgumentException
     *             if either action is invalid
     */
    public static CharsetEncoder createEncoder(String charset,
            CodingErrorAction malformedInputAction,
            CodingErrorAction unmappableCharacterAction)
    {
        final Charset target = Charset.forName(charset);
        if (target.canEncode()) {
            return target.newEncoder()
                    .onMalformedInput(malformedInputAction)
                    .onUnmappableCharacter(unmappableCharacterAction);
        }
        throw new UnsupportedCharsetException(charset);
    }

    /**
     * Creates a decoder that converts bytes in the named charset to Unicode
     * text.
     *
     * @param charset
     *            name of the source charset
     * @param malformedInputAction
     *            what to do on malformed input
     * @param unmappableCharacterAction
     *            what to do with unmappable input
     * @return the configured decoder
     * @throws java.nio.charset.IllegalCharsetNameException
     *             if the charset name is illegal
     * @throws UnsupportedCharsetException
     *             if the JVM does not support the charset
     * @throws IllegalArgumentException
     *             if either action is invalid
     */
    public static CharsetDecoder createDecoder(String charset,
            CodingErrorAction malformedInputAction,
            CodingErrorAction unmappableCharacterAction)
    {
        return Charset.forName(charset)
                .newDecoder()
                .onMalformedInput(malformedInputAction)
                .onUnmappableCharacter(unmappableCharacterAction);
    }

    /**
     * Encodes Unicode text into a byte array with the given encoder.
     *
     * @param encoder
     *            the encoder to use; it is reset before encoding
     * @param text
     *            the Unicode text
     * @param terminateWithNull
     *            whether a single 0 byte is appended after the encoded bytes
     * @return the encoded bytes
     * @throws CharacterCodingException
     *             if the text cannot be encoded
     */
    public static byte[] encode(CharsetEncoder encoder, CharSequence text,
            boolean terminateWithNull) throws CharacterCodingException
    {
        final ByteBuffer encoded = encoder.reset().encode(CharBuffer.wrap(text));
        final int length = encoded.limit();

        // A fresh array is zero-filled, so the optional extra slot already
        // holds the terminating \0.
        final byte[] bytes = new byte[terminateWithNull ? length + 1 : length];
        encoded.get(bytes, 0, length);
        return bytes;
    }

    /**
     * Decodes a byte array into a Unicode string with the given decoder.
     *
     * @param decoder
     *            the decoder to use; it is reset before decoding
     * @param rawText
     *            the bytes to decode
     * @return the decoded string
     * @throws CharacterCodingException
     *             if the bytes cannot be decoded
     */
    public static String decode(CharsetDecoder decoder, byte[] rawText)
            throws CharacterCodingException
    {
        return decoder.reset().decode(ByteBuffer.wrap(rawText)).toString();
    }
}
143 |
--------------------------------------------------------------------------------
/src/main/java/net/moraleboost/io/CodePointReader.java:
--------------------------------------------------------------------------------
1 | /*
2 | **
3 | ** Feb. 17, 2009
4 | **
5 | ** The author disclaims copyright to this source code.
6 | ** In place of a legal notice, here is a blessing:
7 | **
8 | ** May you do good and not evil.
9 | ** May you find forgiveness for yourself and forgive others.
10 | ** May you share freely, never taking more than you give.
11 | **
12 | ** Stolen from SQLite :-)
13 | ** Any feedback is welcome.
14 | ** Kohei TAKETA
15 | **
16 | */
17 | package net.moraleboost.io;
18 |
19 | import java.io.IOException;
20 |
/**
 * A source of Unicode code points, delivered one at a time.
 * Implementations recognize surrogate pairs.
 */
public interface CodePointReader
{
    /**
     * Sets the code point substituted for a malformed surrogate sequence.
     * When this method is never called, the default is
     * "{@value BasicCodePointReader#DEFAULT_ALTERNATION_CODEPOINT}".
     *
     * @param cp
     *            the replacement code point
     */
    void setAlternationCodePoint(int cp);

    /**
     * Returns the code point substituted for a malformed surrogate sequence.
     *
     * @return the replacement code point
     */
    int getAlternationCodePoint();

    /**
     * Returns the current position within the character stream.
     * The position is counted in chars, not code points, so a surrogate
     * pair advances it by 2.
     *
     * @return the position within the character stream
     */
    long getPosition();

    /**
     * Reads the next code point.
     *
     * @return the Unicode code point
     * @throws java.io.IOException
     *             if reading from the underlying source fails
     */
    int read() throws IOException;

    /**
     * Resets the state of this reader.
     */
    void reset();
}
66 |
--------------------------------------------------------------------------------
/src/main/java/net/moraleboost/io/PushbackCodePointReader.java:
--------------------------------------------------------------------------------
1 | /*
2 | **
3 | ** Feb. 17, 2009
4 | **
5 | ** The author disclaims copyright to this source code.
6 | ** In place of a legal notice, here is a blessing:
7 | **
8 | ** May you do good and not evil.
9 | ** May you find forgiveness for yourself and forgive others.
10 | ** May you share freely, never taking more than you give.
11 | **
12 | ** Stolen from SQLite :-)
13 | ** Any feedback is welcome.
14 | ** Kohei TAKETA
15 | **
16 | */
17 | package net.moraleboost.io;
18 |
19 | import java.io.IOException;
20 |
21 | public class PushbackCodePointReader implements CodePointReader
22 | {
23 | /**
24 | * ベースとなるCodePointReader
25 | */
26 | private CodePointReader reader;
27 | /**
28 | * ストリームに戻された各コードポイントを保持するスタック
29 | */
30 | private int codepoints[];
31 | /**
32 | * ストリームに戻された各コードポイントの幅を保持するスタック
33 | */
34 | private int lengths[];
35 | /**
36 | * readerのpositionからのoffset
37 | */
38 | private int offset;
39 | /**
40 | * スタックトップの位置
41 | */
42 | private int stackpos;
43 |
44 | public PushbackCodePointReader(CodePointReader reader, int size)
45 | {
46 | this.reader = reader;
47 | this.codepoints = new int[size];
48 | this.lengths = new int[size];
49 | this.stackpos = -1;
50 | }
51 |
52 | public void setAlternationCodePoint(int cp)
53 | {
54 | reader.setAlternationCodePoint(cp);
55 | }
56 |
57 | public int getAlternationCodePoint()
58 | {
59 | return reader.getAlternationCodePoint();
60 | }
61 |
62 | public long getPosition()
63 | {
64 | return reader.getPosition() - offset;
65 | }
66 |
67 | public int getStackSize()
68 | {
69 | return codepoints.length;
70 | }
71 |
72 | public int read() throws IOException
73 | {
74 | if (stackpos >= 0) {
75 | offset -= lengths[stackpos];
76 | return codepoints[stackpos--];
77 | } else {
78 | return reader.read();
79 | }
80 | }
81 |
82 | public void reset()
83 | {
84 | reader.reset();
85 | stackpos = -1;
86 | }
87 |
88 | /**
89 | * コードポイントを一つストリームに戻す。
90 | *
91 | * @param cp
92 | * プッシュバックするコードポイント
93 | * @param length
94 | * cpの幅をchar数単位で指定
95 | * @throws java.io.IOException
96 | */
97 | public void unread(int cp, int length) throws IOException
98 | {
99 | if (stackpos + 1 >= codepoints.length) {
100 | throw new IOException("Stack overflow.");
101 | }
102 |
103 | ++stackpos;
104 | codepoints[stackpos] = cp;
105 | lengths[stackpos] = length;
106 | offset += length;
107 | }
108 | }
109 |
--------------------------------------------------------------------------------
/src/main/java/net/moraleboost/mecab/DictionaryInfo.java:
--------------------------------------------------------------------------------
1 | package net.moraleboost.mecab;
2 |
/**
 * Read-only description of one dictionary.
 * Entries are chained: {@link #next()} leads to the following entry.
 */
public interface DictionaryInfo
{
    // Dictionary type codes, presumably the values reported by type().
    // The names suggest system, user and unknown-word dictionaries.
    int TYPE_SYS_DIC = 0;
    int TYPE_USR_DIC = 1;
    int TYPE_UNK_DIC = 2;

    /** @return the file name of the dictionary */
    String filename();

    /** @return the name of the dictionary's character set */
    String charset();

    /** @return the {@code size} value of the dictionary */
    long size();

    /** @return the type code of the dictionary */
    int type();

    /** @return the {@code lsize} value of the dictionary */
    long lsize();

    /** @return the {@code rsize} value of the dictionary */
    long rsize();

    /** @return the version number of the dictionary */
    int version();

    /** @return the next entry in the chain */
    DictionaryInfo next();
}
18 |
--------------------------------------------------------------------------------
/src/main/java/net/moraleboost/mecab/Lattice.java:
--------------------------------------------------------------------------------
1 | package net.moraleboost.mecab;
2 |
3 | public interface Lattice
4 | {
5 | int REQ_TYPE_ONE_BEST = 1;
6 | int REQ_TYPE_NBEST = 2;
7 | int REQ_TYPE_PARTIAL = 4;
8 | int REQ_TYPE_MARGINAL_PROB = 8;
9 | int REQ_TYPE_ALTERNATIVE = 16;
10 | int REQ_TYPE_ALL_MORPHS = 32;
11 | int REQ_TYPE_ALLOCATE_SENTENCE = 64;
12 |
13 | void destroy();
14 | void clear();
15 | boolean isAvailable();
16 | Node bosNode();
17 | Node eosNode();
18 | Node beginNodes(long pos);
19 | Node endNodes(long pos);
20 | String sentence();
21 | void setSentence(String sentence);
22 | long size();
23 | double Z();
24 | void setZ(double Z);
25 | double theta();
26 | void setTheta(double theta);
27 | boolean next();
28 | int requestType();
29 | boolean hasRequestType(int requestType);
30 | void setRequestType(int requestType);
31 | void addRequestType(int requestType);
32 | void removeRequestType(int requestType);
33 | String toString();
34 | String enumNBestAsString(long N);
35 | String what();
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/java/net/moraleboost/mecab/Model.java:
--------------------------------------------------------------------------------
1 | package net.moraleboost.mecab;
2 |
3 | public interface Model
4 | {
5 | void destroy();
6 | Tagger createTagger();
7 | Lattice createLattice();
8 | boolean swap(Model model);
9 | DictionaryInfo dictionaryInfo();
10 | }
11 |
--------------------------------------------------------------------------------
/src/main/java/net/moraleboost/mecab/Node.java:
--------------------------------------------------------------------------------
1 | package net.moraleboost.mecab;
2 |
3 | public interface Node
4 | {
5 | int TYPE_NOR_NODE = 0;
6 | int TYPE_UNK_NODE = 1;
7 | int TYPE_BOS_NODE = 2;
8 | int TYPE_EOS_NODE = 3;
9 | int TYPE_EON_NODE = 4;
10 |
11 | Node prev();
12 | Node next();
13 | Node enext();
14 | Node bnext();
15 | Path rpath();
16 | Path lpath();
17 | String surface();
18 | String rsurface();
19 | boolean leadingSpaceAndSurface(String[] leadingSpaceAndSurface);
20 | String feature();
21 | long id();
22 | int length();
23 | int rlength();
24 | int rcAttr();
25 | int lcAttr();
26 | int posid();
27 | int charType();
28 | int stat();
29 | boolean isbest();
30 | float alpha();
31 | float beta();
32 | float prob();
33 | short wcost();
34 | long cost();
35 | }
36 |
--------------------------------------------------------------------------------
/src/main/java/net/moraleboost/mecab/Path.java:
--------------------------------------------------------------------------------
1 | package net.moraleboost.mecab;
2 |
3 | public interface Path
4 | {
5 | Node rnode();
6 | Path rnext();
7 | Node lnode();
8 | Path lnext();
9 | int cost();
10 | float prob();
11 | }
12 |
--------------------------------------------------------------------------------
/src/main/java/net/moraleboost/mecab/Tagger.java:
--------------------------------------------------------------------------------
1 | package net.moraleboost.mecab;
2 |
3 | public interface Tagger
4 | {
5 | void destroy();
6 | Lattice createLattice();
7 | boolean parse(Lattice lattice);
8 | DictionaryInfo dictionaryInfo();
9 | String what();
10 | String version();
11 | }
12 |
--------------------------------------------------------------------------------
/src/main/java/net/moraleboost/mecab/impl/StandardDictionaryInfo.java:
--------------------------------------------------------------------------------
1 | package net.moraleboost.mecab.impl;
2 |
3 | import net.moraleboost.mecab.DictionaryInfo;
4 | import org.bridj.Platform;
5 | import org.bridj.Pointer;
6 | import org.bridj.StructObject;
7 | import org.bridj.ann.Field;
8 |
9 | import java.nio.charset.Charset;
10 |
11 | public class StandardDictionaryInfo extends StructObject implements DictionaryInfo
12 | {
13 | protected StandardDictionaryInfo(Pointer p)
14 | {
15 | super(p);
16 | }
17 |
18 | @Field(0)
19 | public Pointer _filename()
20 | {
21 | return this.io.getPointerField(this, 0);
22 | }
23 |
24 | public String filename()
25 | {
26 | Pointer p = _filename();
27 | if (p == null) {
28 | return null;
29 | }
30 |
31 | if (Platform.isWindows()) {
32 | // always UTF-8
33 | return p.getString(Pointer.StringType.C, Charset.forName("UTF-8"));
34 | } else {
35 | return p.getCString();
36 | }
37 | }
38 |
39 | @Field(1)
40 | public Pointer _charset()
41 | {
42 | return this.io.getPointerField(this, 1);
43 | }
44 |
45 | public String charset()
46 | {
47 | Pointer p = _charset();
48 | return (p == null) ? null : p.getCString();
49 | }
50 |
51 | @Field(2)
52 | public int _size()
53 | {
54 | return this.io.getIntField(this, 2);
55 | }
56 |
57 | public long size()
58 | {
59 | // convert to long
60 | return (_size() & 0xffffffffL);
61 | }
62 |
63 | @Field(3)
64 | public int type()
65 | {
66 | return this.io.getIntField(this, 3);
67 | }
68 |
69 | @Field(4)
70 | public int _lsize()
71 | {
72 | return this.io.getIntField(this, 4);
73 | }
74 |
75 | public long lsize()
76 | {
77 | return (_lsize() & 0xffffffffL);
78 | }
79 |
80 | @Field(5)
81 | public int _rsize()
82 | {
83 | return this.io.getIntField(this, 5);
84 | }
85 |
86 | public long rsize()
87 | {
88 | return (_rsize() & 0xffffffffL);
89 | }
90 |
91 | @Field(6)
92 | public short _version()
93 | {
94 | return this.io.getShortField(this, 6);
95 | }
96 |
97 | public int version()
98 | {
99 | return (_version() & 0xffff);
100 | }
101 |
102 | @Field(7)
103 | public Pointer _next()
104 | {
105 | return this.io.getPointerField(this, 7);
106 | }
107 |
108 | public StandardDictionaryInfo next()
109 | {
110 | Pointer p = _next();
111 | return (p == null) ? null : new StandardDictionaryInfo(p);
112 | }
113 | }
114 |
--------------------------------------------------------------------------------
/src/main/java/net/moraleboost/mecab/impl/StandardLattice.java:
--------------------------------------------------------------------------------
1 | package net.moraleboost.mecab.impl;
2 |
3 | import net.moraleboost.mecab.Lattice;
4 | import org.bridj.BridJ;
5 | import org.bridj.Platform;
6 | import org.bridj.Pointer;
7 | import org.bridj.SizeT;
8 | import org.bridj.ann.Library;
9 |
10 | import java.nio.charset.Charset;
11 |
12 | @Library("mecab")
13 | public class StandardLattice implements Lattice
14 | {
15 | static {
16 | if (Platform.isWindows()) {
17 | BridJ.setNativeLibraryActualName("mecab", "libmecab");
18 | }
19 | BridJ.register();
20 | }
21 |
22 | private static native Pointer> mecab_lattice_new();
23 | private static native void mecab_lattice_destroy(Pointer> pLattice);
24 | private static native void mecab_lattice_clear(Pointer> pLattice);
25 | private static native int mecab_lattice_is_available(Pointer> pLattice);
26 | private static native Pointer mecab_lattice_get_bos_node(Pointer> pLattice);
27 | private static native Pointer mecab_lattice_get_eos_node(Pointer> pLattice);
28 | private static native Pointer mecab_lattice_get_begin_nodes(Pointer> pLattice, SizeT pos);
29 | private static native Pointer mecab_lattice_get_end_nodes(Pointer> pLattice, SizeT pos);
30 | private static native Pointer mecab_lattice_get_sentence(Pointer> pLattice);
31 | private static native void mecab_lattice_set_sentence(Pointer> pLattice, Pointer sentence);
32 | private static native SizeT mecab_lattice_get_size(Pointer> pLattice);
33 | private static native double mecab_lattice_get_z(Pointer> pLattice);
34 | private static native void mecab_lattice_set_z(Pointer> pLattice, double Z);
35 | private static native double mecab_lattice_get_theta(Pointer> pLattice);
36 | private static native void mecab_lattice_set_theta(Pointer> pLattice, double theta);
37 | private static native int mecab_lattice_next(Pointer> pLattice);
38 | private static native int mecab_lattice_get_request_type(Pointer> pLattice);
39 | private static native int mecab_lattice_has_request_type(Pointer> pLattice, int requestType);
40 | private static native void mecab_lattice_set_request_type(Pointer> pLattice, int requestType);
41 | private static native void mecab_lattice_add_request_type(Pointer> pLattice, int requestType);
42 | private static native void mecab_lattice_remove_request_type(Pointer> pLattice, int requestType);
43 | private static native Pointer mecab_lattice_tostr(Pointer> pLattice);
44 | private static native Pointer mecab_lattice_nbest_tostr(Pointer> pLattice, SizeT N);
45 | private static native Pointer mecab_lattice_strerror(Pointer> pLattice);
46 |
47 | private Pointer> pLattice;
48 | private Pointer pSentence;
49 | private Charset charset;
50 |
51 | public StandardLattice(Charset charset)
52 | {
53 | pLattice = mecab_lattice_new();
54 | if (pLattice == null) {
55 | throw new OutOfMemoryError("mecab_lattice_new() failed.");
56 | }
57 | this.charset = charset;
58 | }
59 |
60 | protected StandardLattice(Pointer> p, Charset charset)
61 | {
62 | this.pLattice = p;
63 | this.charset = charset;
64 | }
65 |
66 | protected Pointer> getPointer()
67 | {
68 | return pLattice;
69 | }
70 |
71 | protected void finalize() throws Throwable
72 | {
73 | try {
74 | destroy();
75 | } finally {
76 | super.finalize();
77 | }
78 | }
79 |
80 | public void destroy()
81 | {
82 | try {
83 | if (pLattice != null) {
84 | mecab_lattice_destroy(pLattice);
85 | }
86 | if (pSentence != null) {
87 | pSentence.release();
88 | }
89 | } finally {
90 | pSentence = null;
91 | pLattice = null;
92 | }
93 | }
94 |
95 | public void clear()
96 | {
97 | try {
98 | mecab_lattice_clear(pLattice);
99 | if (pSentence != null) {
100 | pSentence.release();
101 | }
102 | } finally {
103 | pSentence = null;
104 | }
105 | }
106 |
107 | public boolean isAvailable()
108 | {
109 | return (mecab_lattice_is_available(pLattice) != 0);
110 | }
111 |
112 | public StandardNode bosNode()
113 | {
114 | Pointer p = mecab_lattice_get_bos_node(pLattice);
115 | if (p == null) {
116 | return null;
117 | } else {
118 | return new StandardNode(p, charset);
119 | }
120 | }
121 |
122 | public StandardNode eosNode()
123 | {
124 | Pointer p = mecab_lattice_get_eos_node(pLattice);
125 | if (p == null) {
126 | return null;
127 | } else {
128 | return new StandardNode(p, charset);
129 | }
130 | }
131 |
132 | public StandardNode beginNodes(long pos)
133 | {
134 | Pointer p = mecab_lattice_get_begin_nodes(pLattice, SizeT.valueOf(pos));
135 | if (p == null) {
136 | return null;
137 | } else {
138 | return new StandardNode(p, charset);
139 | }
140 | }
141 |
142 | public StandardNode endNodes(long pos)
143 | {
144 | Pointer p = mecab_lattice_get_end_nodes(pLattice, SizeT.valueOf(pos));
145 | if (p == null) {
146 | return null;
147 | } else {
148 | return new StandardNode(p, charset);
149 | }
150 | }
151 |
152 | public String sentence()
153 | {
154 | Pointer p = mecab_lattice_get_sentence(pLattice);
155 | if (p == null) {
156 | return null;
157 | }
158 |
159 | return p.getString(Pointer.StringType.C, charset);
160 | }
161 |
162 | public void setSentence(String sentence)
163 | {
164 | try {
165 | if (pSentence != null) {
166 | pSentence.release();
167 | }
168 | } finally {
169 | pSentence = null;
170 | }
171 | pSentence = Pointer.pointerToString(sentence, Pointer.StringType.C, charset).as(Byte.class);
172 | mecab_lattice_set_sentence(pLattice, pSentence);
173 | }
174 |
175 | public long size()
176 | {
177 | return mecab_lattice_get_size(pLattice).longValue();
178 | }
179 |
180 | public double Z()
181 | {
182 | return mecab_lattice_get_z(pLattice);
183 | }
184 |
185 | public void setZ(double Z)
186 | {
187 | mecab_lattice_set_z(pLattice, Z);
188 | }
189 |
190 | public double theta()
191 | {
192 | return mecab_lattice_get_theta(pLattice);
193 | }
194 |
195 | public void setTheta(double theta)
196 | {
197 | mecab_lattice_set_theta(pLattice, theta);
198 | }
199 |
200 | public boolean next()
201 | {
202 | return (mecab_lattice_next(pLattice) != 0);
203 | }
204 |
205 | public int requestType()
206 | {
207 | return mecab_lattice_get_request_type(pLattice);
208 | }
209 |
210 | public boolean hasRequestType(int requestType)
211 | {
212 | return (mecab_lattice_has_request_type(pLattice, requestType) != 0);
213 | }
214 |
215 | public void setRequestType(int requestType)
216 | {
217 | mecab_lattice_set_request_type(pLattice, requestType);
218 | }
219 |
220 | public void addRequestType(int requestType)
221 | {
222 | mecab_lattice_add_request_type(pLattice, requestType);
223 | }
224 |
225 | public void removeRequestType(int requestType)
226 | {
227 | mecab_lattice_remove_request_type(pLattice, requestType);
228 | }
229 |
230 | @Override
231 | public String toString()
232 | {
233 | Pointer p = mecab_lattice_tostr(pLattice);
234 | if (p == null) {
235 | return null;
236 | } else {
237 | return p.getString(Pointer.StringType.C, charset);
238 | }
239 | }
240 |
241 | public String enumNBestAsString(long N)
242 | {
243 | Pointer p = mecab_lattice_nbest_tostr(pLattice, SizeT.valueOf(N));
244 | if (p == null) {
245 | return null;
246 | } else {
247 | return p.getString(Pointer.StringType.C, charset);
248 | }
249 | }
250 |
251 | public String what()
252 | {
253 | Pointer p = mecab_lattice_strerror(pLattice);
254 | if (p == null) {
255 | return null;
256 | } else {
257 | return p.getString(Pointer.StringType.C, charset);
258 | }
259 | }
260 | }
261 |
--------------------------------------------------------------------------------
/src/main/java/net/moraleboost/mecab/impl/StandardModel.java:
--------------------------------------------------------------------------------
1 | package net.moraleboost.mecab.impl;
2 |
3 | import net.moraleboost.mecab.Model;
4 | import org.bridj.BridJ;
5 | import org.bridj.Platform;
6 | import org.bridj.Pointer;
7 | import org.bridj.ann.Library;
8 |
9 | import java.nio.charset.Charset;
10 |
11 | @Library("mecab")
12 | public class StandardModel implements Model
13 | {
14 | static {
15 | if (Platform.isWindows()) {
16 | BridJ.setNativeLibraryActualName("mecab", "libmecab");
17 | }
18 | BridJ.register();
19 | }
20 |
21 | private static native Pointer> mecab_model_new2(Pointer arg);
22 | private static native void mecab_model_destroy(Pointer> pModel);
23 | private static native Pointer> mecab_model_new_tagger(Pointer> pModel);
24 | private static native Pointer> mecab_model_new_lattice(Pointer> pModel);
25 | private static native int mecab_model_swap(Pointer> pModel, Pointer> pNewModel);
26 | private static native Pointer mecab_model_dictionary_info(Pointer> pModel);
27 |
28 | private Pointer> pModel;
29 | private Charset charset;
30 |
31 | public StandardModel(String arg)
32 | {
33 | Pointer parg = Pointer.pointerToCString(arg);
34 | try {
35 | pModel = mecab_model_new2(parg);
36 | } finally {
37 | Pointer.release(parg);
38 | }
39 |
40 | if (pModel == null) {
41 | throw new OutOfMemoryError("mecab_model_new2() failed.");
42 | }
43 |
44 | StandardDictionaryInfo dictInfo = dictionaryInfo();
45 | charset = Charset.forName(dictInfo.charset());
46 | }
47 |
48 | public StandardModel(String arg, Charset charset)
49 | {
50 | Pointer parg = Pointer.pointerToCString(arg);
51 | try {
52 | pModel = mecab_model_new2(parg);
53 | } finally {
54 | Pointer.release(parg);
55 | }
56 |
57 | if (pModel == null) {
58 | throw new OutOfMemoryError("mecab_model_new2() failed.");
59 | }
60 |
61 | this.charset = charset;
62 | }
63 |
64 | protected Pointer> getPointer()
65 | {
66 | return pModel;
67 | }
68 |
69 | protected void finalize() throws Throwable
70 | {
71 | try {
72 | destroy();
73 | } finally {
74 | super.finalize();
75 | }
76 | }
77 |
78 | public void destroy()
79 | {
80 | try {
81 | if (pModel != null) {
82 | mecab_model_destroy(pModel);
83 | }
84 | } finally {
85 | pModel = null;
86 | }
87 | }
88 |
89 | public StandardTagger createTagger()
90 | {
91 | Pointer> p = mecab_model_new_tagger(pModel);
92 | if (p == null) {
93 | throw new OutOfMemoryError("mecab_model_new_tagger() failed.");
94 | } else {
95 | return new StandardTagger(p, charset);
96 | }
97 | }
98 |
99 | public StandardLattice createLattice()
100 | {
101 | Pointer> p = mecab_model_new_lattice(pModel);
102 | if (p == null) {
103 | throw new OutOfMemoryError("mecab_model_new_lattice() failed.");
104 | } else {
105 | return new StandardLattice(p, charset);
106 | }
107 | }
108 |
109 | public boolean swap(Model model)
110 | {
111 | if (model != null && (model instanceof StandardModel)) {
112 | return (mecab_model_swap(pModel, ((StandardModel)model).getPointer()) != 0);
113 | } else {
114 | return false;
115 | }
116 | }
117 |
118 | public StandardDictionaryInfo dictionaryInfo()
119 | {
120 | Pointer p = mecab_model_dictionary_info(pModel);
121 | if (p == null) {
122 | throw new OutOfMemoryError("mecab_model_dictionary_info() failed.");
123 | } else {
124 | return new StandardDictionaryInfo(p);
125 | }
126 | }
127 | }
128 |
--------------------------------------------------------------------------------
/src/main/java/net/moraleboost/mecab/impl/StandardNode.java:
--------------------------------------------------------------------------------
1 | package net.moraleboost.mecab.impl;
2 |
3 | import net.moraleboost.mecab.Node;
4 | import org.bridj.Pointer;
5 | import org.bridj.StructObject;
6 | import org.bridj.ann.CLong;
7 | import org.bridj.ann.Field;
8 |
9 | import java.nio.charset.Charset;
10 |
11 | public class StandardNode extends StructObject implements Node
12 | {
13 | private Charset charset;
14 |
15 | protected StandardNode(Pointer p, Charset charset)
16 | {
17 | super(p);
18 | this.charset = charset;
19 | }
20 |
21 | @Field(0)
22 | public Pointer _prev()
23 | {
24 | return this.io.getPointerField(this, 0);
25 | }
26 |
27 | public StandardNode prev()
28 | {
29 | Pointer p = _prev();
30 | if (p == null) {
31 | return null;
32 | } else {
33 | return new StandardNode(p, charset);
34 | }
35 | }
36 |
37 | @Field(1)
38 | public Pointer _next()
39 | {
40 | return this.io.getPointerField(this, 1);
41 | }
42 |
43 | public StandardNode next()
44 | {
45 | Pointer p = _next();
46 | if (p == null) {
47 | return null;
48 | } else {
49 | return new StandardNode(p, charset);
50 | }
51 | }
52 |
53 | @Field(2)
54 | public Pointer _enext()
55 | {
56 | return this.io.getPointerField(this, 2);
57 | }
58 |
59 | public StandardNode enext()
60 | {
61 | Pointer p = _enext();
62 | if (p == null) {
63 | return null;
64 | } else {
65 | return new StandardNode(p, charset);
66 | }
67 | }
68 |
69 | @Field(3)
70 | public Pointer _bnext()
71 | {
72 | return this.io.getPointerField(this, 3);
73 | }
74 |
75 | public StandardNode bnext()
76 | {
77 | Pointer p = _bnext();
78 | if (p == null) {
79 | return null;
80 | } else {
81 | return new StandardNode(p, charset);
82 | }
83 | }
84 |
85 | @Field(4)
86 | public Pointer _rpath()
87 | {
88 | return this.io.getPointerField(this, 4);
89 | }
90 |
91 | public StandardPath rpath()
92 | {
93 | Pointer p = _rpath();
94 | if (p == null) {
95 | return null;
96 | } else {
97 | return new StandardPath(p, charset);
98 | }
99 | }
100 |
101 | @Field(5)
102 | public Pointer _lpath()
103 | {
104 | return this.io.getPointerField(this, 5);
105 | }
106 |
107 | public StandardPath lpath()
108 | {
109 | Pointer p = _lpath();
110 | if (p == null) {
111 | return null;
112 | } else {
113 | return new StandardPath(p, charset);
114 | }
115 | }
116 |
117 | @Field(6)
118 | public Pointer _surface()
119 | {
120 | return this.io.getPointerField(this, 6);
121 | }
122 |
123 | public String surface()
124 | {
125 | Pointer p = _surface();
126 | int len = length();
127 | if (p == null) {
128 | return null;
129 | } else {
130 | return new String(p.getBytes(len), charset);
131 | }
132 | }
133 |
134 | public String rsurface()
135 | {
136 | Pointer p = _surface();
137 | int rlen = rlength();
138 | int len = length();
139 | if (p == null) {
140 | return null;
141 | } else {
142 | return new String(p.offset(len-rlen).getBytes(rlen), charset);
143 | }
144 | }
145 |
146 | public boolean leadingSpaceAndSurface(String[] leadingSpaceAndSurface)
147 | {
148 | if (leadingSpaceAndSurface.length != 2) {
149 | throw new IllegalArgumentException("leadingSpaceAndSurface.length must be 2.");
150 | }
151 |
152 | Pointer p = _surface();
153 | int rlen = rlength();
154 | int len = length();
155 | int offset = rlen - len;
156 |
157 | if (p == null) {
158 | return false;
159 | } else {
160 | byte[] bytestr = p.offset(-offset).getBytes(rlen);
161 | leadingSpaceAndSurface[0] = new String(bytestr, 0, offset, charset); // leading space
162 | leadingSpaceAndSurface[1] = new String(bytestr, offset, len, charset); // surface
163 | return true;
164 | }
165 | }
166 |
167 | @Field(7)
168 | public Pointer _feature()
169 | {
170 | return this.io.getPointerField(this, 7);
171 | }
172 |
173 | public String feature()
174 | {
175 | Pointer p = _feature();
176 | if (p == null) {
177 | return null;
178 | } else {
179 | return p.getString(Pointer.StringType.C, charset);
180 | }
181 | }
182 |
183 | @Field(8)
184 | public int _id()
185 | {
186 | return this.io.getIntField(this, 8);
187 | }
188 |
189 | public long id()
190 | {
191 | return (_id() & 0xffffffffL);
192 | }
193 |
194 | @Field(9)
195 | public short _length()
196 | {
197 | return this.io.getShortField(this, 9);
198 | }
199 |
200 | public int length()
201 | {
202 | return (_length() & 0xffff);
203 | }
204 |
205 | @Field(10)
206 | public short _rlength()
207 | {
208 | return this.io.getShortField(this, 10);
209 | }
210 |
211 | public int rlength()
212 | {
213 | return (_rlength() & 0xffff);
214 | }
215 |
216 | @Field(11)
217 | public short _rcAttr()
218 | {
219 | return this.io.getShortField(this, 11);
220 | }
221 |
222 | public int rcAttr()
223 | {
224 | return (_rcAttr() & 0xffff);
225 | }
226 |
227 | @Field(12)
228 | public short _lcAttr()
229 | {
230 | return this.io.getShortField(this, 12);
231 | }
232 |
233 | public int lcAttr()
234 | {
235 | return (_lcAttr() & 0xffff);
236 | }
237 |
238 | @Field(13)
239 | public short _posid()
240 | {
241 | return this.io.getShortField(this, 13);
242 | }
243 |
244 | public int posid()
245 | {
246 | return (_posid() & 0xffff);
247 | }
248 |
249 | @Field(14)
250 | public byte _charType()
251 | {
252 | return this.io.getByteField(this, 14);
253 | }
254 |
255 | public int charType()
256 | {
257 | return (_charType() & 0xff);
258 | }
259 |
260 | @Field(15)
261 | public byte _stat()
262 | {
263 | return this.io.getByteField(this, 15);
264 | }
265 |
266 | public int stat()
267 | {
268 | return (_stat() & 0xff);
269 | }
270 |
271 | @Field(16)
272 | public byte _isbest()
273 | {
274 | return this.io.getByteField(this, 16);
275 | }
276 |
277 | public boolean isbest()
278 | {
279 | return (_isbest() != 0);
280 | }
281 |
282 | @Field(17)
283 | public float alpha()
284 | {
285 | return this.io.getFloatField(this, 17);
286 | }
287 |
288 | @Field(18)
289 | public float beta()
290 | {
291 | return this.io.getFloatField(this, 18);
292 | }
293 |
294 | @Field(19)
295 | public float prob()
296 | {
297 | return this.io.getFloatField(this, 19);
298 | }
299 |
300 | @Field(20)
301 | public short wcost()
302 | {
303 | return this.io.getShortField(this, 20);
304 | }
305 |
306 | @Field(21)
307 | @CLong
308 | public long cost()
309 | {
310 | return this.io.getCLongField(this, 21);
311 | }
312 | }
313 |
--------------------------------------------------------------------------------
/src/main/java/net/moraleboost/mecab/impl/StandardPath.java:
--------------------------------------------------------------------------------
1 | package net.moraleboost.mecab.impl;
2 |
3 | import net.moraleboost.mecab.Path;
4 | import org.bridj.Pointer;
5 | import org.bridj.StructObject;
6 | import org.bridj.ann.Field;
7 |
8 | import java.nio.charset.Charset;
9 |
10 | public class StandardPath extends StructObject implements Path
11 | {
12 | private Charset charset;
13 |
14 | protected StandardPath(Pointer p, Charset charset)
15 | {
16 | super(p);
17 | this.charset = charset;
18 | }
19 |
20 | @Field(0)
21 | public Pointer _rnode()
22 | {
23 | return this.io.getPointerField(this, 0);
24 | }
25 |
26 | public StandardNode rnode()
27 | {
28 | Pointer p = _rnode();
29 | return (p == null) ? null : new StandardNode(p, charset);
30 | }
31 |
32 | @Field(1)
33 | public Pointer _rnext()
34 | {
35 | return this.io.getPointerField(this, 1);
36 | }
37 |
38 | public StandardPath rnext()
39 | {
40 | Pointer p = _rnext();
41 | return (p == null) ? null : new StandardPath(p, charset);
42 | }
43 |
44 | @Field(2)
45 | public Pointer _lnode()
46 | {
47 | return this.io.getPointerField(this, 2);
48 | }
49 |
50 | public StandardNode lnode()
51 | {
52 | Pointer p = _lnode();
53 | return (p == null) ? null : new StandardNode(p, charset);
54 | }
55 |
56 | @Field(3)
57 | public Pointer _lnext()
58 | {
59 | return this.io.getPointerField(this, 3);
60 | }
61 |
62 | public StandardPath lnext()
63 | {
64 | Pointer p = _lnext();
65 | return (p == null) ? null : new StandardPath(p, charset);
66 | }
67 |
68 | @Field(4)
69 | public int cost()
70 | {
71 | return this.io.getIntField(this, 4);
72 | }
73 |
74 | @Field(5)
75 | public float prob()
76 | {
77 | return this.io.getFloatField(this, 5);
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/src/main/java/net/moraleboost/mecab/impl/StandardTagger.java:
--------------------------------------------------------------------------------
1 | package net.moraleboost.mecab.impl;
2 |
3 | import net.moraleboost.mecab.Lattice;
4 | import net.moraleboost.mecab.Tagger;
5 | import org.bridj.BridJ;
6 | import org.bridj.Platform;
7 | import org.bridj.Pointer;
8 | import org.bridj.ann.Library;
9 |
10 | import java.nio.charset.Charset;
11 |
12 | @Library("mecab")
13 | public class StandardTagger implements Tagger
14 | {
15 | static {
16 | if (Platform.isWindows()) {
17 | BridJ.setNativeLibraryActualName("mecab", "libmecab");
18 | }
19 | BridJ.register();
20 | }
21 |
22 | private static native Pointer> mecab_new2(Pointer arg);
23 | private static native Pointer mecab_version();
24 | private static native Pointer mecab_strerror(Pointer> pTagger);
25 | private static native void mecab_destroy(Pointer> pTagger);
26 | private static native int mecab_parse_lattice(Pointer> pTagger, Pointer> pLattice);
27 | private static native Pointer mecab_dictionary_info(Pointer> pTagger);
28 |
29 | private Pointer> pTagger;
30 | private Charset charset;
31 |
32 | public StandardTagger(String arg)
33 | {
34 | Pointer parg = Pointer.pointerToCString(arg);
35 | try {
36 | pTagger = mecab_new2(parg);
37 | } finally {
38 | Pointer.release(parg);
39 | }
40 |
41 | if (pTagger == null) {
42 | throw new OutOfMemoryError("mecab_new2() failed.");
43 | }
44 |
45 | StandardDictionaryInfo dictInfo = dictionaryInfo();
46 | charset = Charset.forName(dictInfo.charset());
47 | }
48 |
49 | public StandardTagger(String arg, Charset charset)
50 | {
51 | Pointer parg = Pointer.pointerToCString(arg);
52 | try {
53 | pTagger = mecab_new2(parg);
54 | } finally {
55 | Pointer.release(parg);
56 | }
57 |
58 | if (pTagger == null) {
59 | throw new OutOfMemoryError("mecab_new2() failed.");
60 | }
61 |
62 | this.charset = charset;
63 | }
64 |
65 | protected StandardTagger(Pointer> p, Charset charset)
66 | {
67 | this.pTagger = p;
68 | this.charset = charset;
69 | }
70 |
71 | protected void finalize() throws Throwable
72 | {
73 | try {
74 | destroy();
75 | } finally {
76 | super.finalize();
77 | }
78 | }
79 |
80 | public void destroy()
81 | {
82 | try {
83 | if (pTagger != null) {
84 | mecab_destroy(pTagger);
85 | }
86 | } finally {
87 | pTagger = null;
88 | }
89 | }
90 |
91 | public StandardLattice createLattice()
92 | {
93 | return new StandardLattice(charset);
94 | }
95 |
96 | public boolean parse(Lattice lattice)
97 | {
98 | if (lattice != null && (lattice instanceof StandardLattice)) {
99 | return (mecab_parse_lattice(pTagger, ((StandardLattice)lattice).getPointer()) != 0);
100 | } else {
101 | return false;
102 | }
103 | }
104 |
105 | public StandardDictionaryInfo dictionaryInfo()
106 | {
107 | Pointer p = mecab_dictionary_info(pTagger);
108 | if (p == null) {
109 | throw new OutOfMemoryError("mecab_dictionary_info() failed.");
110 | } else {
111 | return new StandardDictionaryInfo(p);
112 | }
113 | }
114 |
115 | public String what()
116 | {
117 | Pointer p = mecab_strerror(pTagger);
118 | if (p == null) {
119 | return null;
120 | } else {
121 | return p.getString(Pointer.StringType.C, charset);
122 | }
123 | }
124 |
125 | public String version()
126 | {
127 | Pointer p = mecab_version();
128 | if (p == null) {
129 | return null;
130 | } else {
131 | return p.getCString();
132 | }
133 | }
134 |
135 | public static void main(String[] args)
136 | {
137 | StringBuilder text = new StringBuilder();
138 | for (String arg: args) {
139 | if (text.length() != 0) {
140 | text.append(" ");
141 | }
142 | text.append(arg);
143 | }
144 |
145 | StandardTagger tagger = new StandardTagger("");
146 | Lattice lattice = tagger.createLattice();
147 | lattice.setSentence(text.toString());
148 | tagger.parse(lattice);
149 |
150 | System.out.println("MeCab version " + tagger.version());
151 | System.out.println();
152 | System.out.println("Original text: " + text.toString());
153 | System.out.println();
154 | System.out.println("Morphemes:");
155 | System.out.println(lattice.toString());
156 | }
157 | }
158 |
--------------------------------------------------------------------------------
/src/main/java/net/moraleboost/tinysegmenter/ModelExporter.java:
--------------------------------------------------------------------------------
1 | /*
2 | **
3 | ** Mar. 24, 2009
4 | **
5 | ** The author disclaims copyright to this source code.
6 | ** In place of a legal notice, here is a blessing:
7 | **
8 | ** May you do good and not evil.
9 | ** May you find forgiveness for yourself and forgive others.
10 | ** May you share freely, never taking more than you give.
11 | **
12 | ** Stolen from SQLite :-)
13 | ** Any feedback is welcome.
14 | ** Kohei TAKETA
15 | **
16 | */
17 | package net.moraleboost.tinysegmenter;
18 |
19 | import org.mozilla.javascript.Context;
20 | import org.mozilla.javascript.ContextFactory;
21 | import org.mozilla.javascript.Function;
22 | import org.mozilla.javascript.Scriptable;
23 |
24 | import java.io.*;
25 |
26 | public class ModelExporter
27 | {
28 | // java ModelExporter tiny_segmenter_source.js exporter.js out.java
29 | public static void main(String[] args)
30 | throws Exception
31 | {
32 | try {
33 | Context ctx = ContextFactory.getGlobal().enterContext();
34 | Scriptable scope = ctx.initStandardObjects();
35 |
36 | evaluateSource(ctx, scope, args[0]);
37 | evaluateSource(ctx, scope, args[1]);
38 |
39 | emit(ctx, scope, args[2]);
40 | } finally {
41 | Context.exit();
42 | }
43 | }
44 |
45 | private static void evaluateSource(Context ctx, Scriptable scope, String filename)
46 | throws Exception
47 | {
48 | FileInputStream fis = null;
49 | InputStreamReader isr = null;
50 |
51 | try {
52 | File f = new File(filename);
53 | fis = new FileInputStream(f);
54 | isr = new InputStreamReader(fis, "utf-8");
55 | ctx.evaluateReader(scope, isr, f.getName(), 1, null);
56 | } finally {
57 | if (isr != null) {
58 | try { isr.close(); } catch (Exception ignored) {}
59 | }
60 | if (fis != null) {
61 | try { fis.close(); } catch (Exception ignored) {}
62 | }
63 | }
64 | }
65 |
66 | private static void emit(Context ctx, Scriptable scope, String filename)
67 | throws Exception
68 | {
69 | FileOutputStream fos = null;
70 | OutputStreamWriter osw = null;
71 | try {
72 | File f = new File(filename);
73 | fos = new FileOutputStream(f);
74 | osw = new OutputStreamWriter(fos, "utf-8");
75 | emitToWriter(ctx, scope, osw);
76 | } finally {
77 | if (osw != null) {
78 | try { osw.close(); } catch (Exception ignored) {}
79 | }
80 | if (fos != null) {
81 | try { fos.close(); } catch (Exception ignored) {}
82 | }
83 | }
84 | }
85 |
86 | private static void emitToWriter(Context ctx, Scriptable scope, Writer w)
87 | throws Exception
88 | {
89 | emitPrologue(w);
90 |
91 | String[] names = {
92 | "BC1", "BC2", "BC3",
93 | "BP1", "BP2",
94 | "BQ1", "BQ2", "BQ3", "BQ4",
95 | "BW1", "BW2", "BW3",
96 | "TC1", "TC2", "TC3", "TC4",
97 | "TQ1", "TQ2", "TQ3", "TQ4",
98 | "TW1", "TW2", "TW3", "TW4",
99 | "UC1", "UC2", "UC3", "UC4", "UC5", "UC6",
100 | "UP1", "UP2", "UP3",
101 | "UQ1", "UQ2", "UQ3",
102 | "UW1", "UW2", "UW3", "UW4", "UW5", "UW6"
103 | };
104 |
105 | Function getKeys = (Function)scope.get("getKeys", scope);
106 | Function getValues = (Function)scope.get("getValues", scope);
107 | Object bias = scope.get("BIAS", scope);
108 |
109 | w.write(" public static final int BIAS = " + Context.toString(bias) + ";\r\n");
110 | for (String name: names) {
111 | Object keys = Context.toString(getKeys.call(ctx, scope, scope, new String[] {name}));
112 | Object vals = Context.toString(getValues.call(ctx, scope, scope, new String[] {name}));
113 | w.write(" public static final String[] " + name + "_KEYS" + " = " + keys + ";\r\n");
114 | w.write(" public static final Integer[] " + name + "_VALS" + " = " + vals + ";\r\n");
115 | }
116 |
117 | for (String name: names) {
118 | w.write(" public static final Map " + name + ";\r\n");
119 | }
120 |
121 | emitStaticBlock(w, names);
122 |
123 | emitEpilogue(w);
124 | }
125 |
126 | private static void emitPrologue(Writer w)
127 | throws Exception
128 | {
129 | w.write("// Automatically generated. Do not edit.\r\n");
130 | w.write("package net.moraleboost.tinysegmenter;\r\n\r\n");
131 | w.write("import java.util.Collections;\r\n");
132 | w.write("import java.util.Map;\r\n");
133 | w.write("import java.util.HashMap;\r\n\r\n");
134 | w.write("public class TinySegmenterConstants\r\n");
135 | w.write("{\r\n");
136 | }
137 |
138 | private static void emitEpilogue(Writer w)
139 | throws Exception
140 | {
141 | w.write("}\r\n");
142 | }
143 |
144 | private static void emitStaticBlock(Writer w, String[] names)
145 | throws Exception
146 | {
147 | w.write(" static {\r\n");
148 | w.write(" int i;\r\n");
149 | w.write(" Map m;\r\n");
150 |
151 | for (String name: names) {
152 | w.write(" ");
153 | w.write("m = new HashMap();\r\n");
154 | w.write(" ");
155 | w.write("for (i=0; i<" + name + "_KEYS.length; ++i) {\r\n");
156 | w.write(" ");
157 | w.write(" m.put(" + name + "_KEYS[i], " + name + "_VALS[i]);\r\n");
158 | w.write(" ");
159 | w.write("}\r\n");
160 | w.write(" ");
161 | w.write(name + " = Collections.unmodifiableMap(m);\r\n");
162 | }
163 |
164 | w.write(" }\r\n");
165 | }
166 | }
167 |
--------------------------------------------------------------------------------
/src/main/java/net/moraleboost/tinysegmenter/TinySegmenter.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Based on TinySegmenter 0.1 -- Super compact Japanese tokenizer in Javascript
3 | * (c) 2008 Taku Kudo
4 | * TinySegmenter is freely distributable under the terms of a new BSD licence.
5 | * For details, see http://chasen.org/~taku/software/TinySegmenter/LICENCE.txt
6 | *
7 | * Ported to Java by Kohei TAKETA
8 | */
9 | package net.moraleboost.tinysegmenter;
10 |
11 | import net.moraleboost.io.CodePointReader;
12 |
13 | import java.io.IOException;
14 | import java.util.*;
15 |
16 | import static net.moraleboost.tinysegmenter.TinySegmenterConstants.*;
17 |
18 | /**
19 | * TinySegmenterのJava移植版。
20 | *
21 | * @author taketa
22 | *
23 | */
24 | public class TinySegmenter
25 | {
26 | public static class CharInfo
27 | {
28 | public int cp;
29 | public String str;
30 | public String ctype;
31 | public long start;
32 | public long end;
33 | }
34 |
35 | public static class Token
36 | {
37 | public String str;
38 | public long start;
39 | public long end;
40 |
41 | public boolean equals(Object obj)
42 | {
43 | if (!(obj instanceof Token)) {
44 | return false;
45 | }
46 |
47 | Token another = (Token)obj;
48 | return ((str == null ? another.str == null : str.equals(another.str)) &&
49 | (start == another.start) && (end == another.end));
50 | }
51 |
52 | public String toString()
53 | {
54 | return ("(" +
55 | str + "," +
56 | Long.toString(start) + "," +
57 | Long.toString(end) + ")");
58 | }
59 | }
60 |
61 | private static String getCharType(int cp)
62 | {
63 | if (CHINESE_NUMBER_SET.contains(cp)) {
64 | // [一二三四五六七八九十百千万億兆]
65 | return "M";
66 | } else if (
67 | (0x4E00 <= cp && cp <= 0x9fa0) ||
68 | cp == '々' || cp == '〆' || cp == 'ヵ' || cp == 'ヶ') {
69 | // [一-龠々〆ヵヶ]
70 | return "H";
71 | } else if (0x3041 <= cp && cp <= 0x3093) {
72 | // [ぁ-ん]
73 | return "I";
74 | } else if (
75 | (0x30a1 <= cp && cp <= 0x30f4) || cp == 0x30fc ||
76 | (0xff71 <= cp && cp <= 0xff9e) || cp == 0xff70) {
77 | // [ァ-ヴーア-ン゙ー]
78 | return "K";
79 | } else if (
80 | ('a' <= cp && cp <= 'z') || ('A' <= cp && cp <= 'Z') ||
81 | ('a' <= cp && cp <= 'z') || ('A' <= cp && cp <= 'Z')) {
82 | // [a-zA-Za-zA-Z]
83 | return "A";
84 | } else if (
85 | ('0' <= cp && cp <= '9') || ('0' <= cp && cp <= '9')) {
86 | // [0-90-9]
87 | return "N";
88 | } else {
89 | return "O";
90 | }
91 | }
92 |
93 | private static List getCodePoints(String str)
94 | {
95 | int count = str.codePointCount(0, str.length());
96 | List result = new ArrayList(count);
97 | int charIndex = 0;
98 | while (charIndex < str.length()) {
99 | int cp = str.codePointAt(charIndex);
100 | result.add(cp);
101 | charIndex += Character.charCount(cp);
102 | }
103 |
104 | return result;
105 | }
106 |
107 | public static final int DEFAULT_BUFFER_SIZE = 4096;
108 | public static final int DEFAULT_MAX_TOKEN_SIZE = 255;
109 |
110 | @SuppressWarnings("serial")
111 | private static final Set CHINESE_NUMBER_SET =
112 | Collections.unmodifiableSet(new HashSet() {{
113 | addAll(getCodePoints("一二三四五六七八九十百千万億兆"));
114 | }});
115 |
116 | private CodePointReader reader;
117 | private boolean eos;
118 | private int eosCount;
119 |
120 | private CharInfo[] buffer;
121 | private int end;
122 | private int position;
123 | private boolean done;
124 | private String p1;
125 | private String p2;
126 | private String p3;
127 |
128 | private CharInfo[] tokenBuffer;
129 | private int tokenPosition;
130 |
/**
 * Creates a segmenter with the default buffer size and maximum token
 * size.
 *
 * @param reader the CodePointReader to read input from
 */
public TinySegmenter(CodePointReader reader)
{
    this(reader, DEFAULT_BUFFER_SIZE, DEFAULT_MAX_TOKEN_SIZE);
}

/**
 * Creates a segmenter.
 *
 * The size limits are enforced with exceptions rather than assertions,
 * because assertions are disabled by default and an undersized buffer
 * would otherwise only fail later, deep inside the tokenizing loop.
 *
 * @param reader       the CodePointReader to read input from
 * @param bufferSize   size of the character buffer; must be at least 7
 * @param maxTokenSize maximum token size; must be at least 2
 * @throws IllegalArgumentException if bufferSize is less than 7 or
 *                                  maxTokenSize is less than 2
 */
public TinySegmenter(CodePointReader reader, int bufferSize, int maxTokenSize)
{
    if (bufferSize <= 6) {
        throw new IllegalArgumentException(
            "bufferSize must be at least 7: " + bufferSize);
    }
    if (maxTokenSize <= 1) {
        throw new IllegalArgumentException(
            "maxTokenSize must be at least 2: " + maxTokenSize);
    }

    this.reader = reader;
    this.p1 = "U";
    this.p2 = "U";
    this.p3 = "U";
    this.buffer = new CharInfo[bufferSize];
    this.tokenBuffer = new CharInfo[maxTokenSize];
}
154 |
155 | public CharInfo readChar() throws IOException
156 | {
157 | CharInfo c = new CharInfo();
158 |
159 | if (eos) {
160 | c.start = -1;
161 | c.end = -1;
162 | c.cp = -1;
163 | } else {
164 | c.start = reader.getPosition();
165 | c.cp = reader.read();
166 | c.end = reader.getPosition();
167 | }
168 |
169 | if (c.cp < 0) {
170 | eos = true;
171 | switch (eosCount) {
172 | case 0:
173 | c.str = "E1";
174 | c.ctype = "O";
175 | ++eosCount;
176 | break;
177 | case 1:
178 | c.str = "E2";
179 | c.ctype = "O";
180 | ++eosCount;
181 | break;
182 | case 2:
183 | c.str = "E3";
184 | c.ctype = "O";
185 | ++eosCount;
186 | break;
187 | default:
188 | return null;
189 | }
190 | } else {
191 | c.str = new String(Character.toChars(c.cp));
192 | c.ctype = getCharType(c.cp);
193 | }
194 |
195 | return c;
196 | }
197 |
198 | private void initBuffer()
199 | {
200 | CharInfo c;
201 |
202 | {
203 | c = new CharInfo();
204 | c.cp = 0;
205 | c.str = "B3";
206 | c.ctype = "O";
207 | c.start = -1;
208 | c.end = -1;
209 | buffer[0] = c;
210 |
211 | c = new CharInfo();
212 | c.cp = 0;
213 | c.str = "B2";
214 | c.ctype = "O";
215 | c.start = -1;
216 | c.end = -1;
217 | buffer[1] = c;
218 |
219 | c = new CharInfo();
220 | c.cp = 0;
221 | c.str = "B1";
222 | c.ctype = "O";
223 | c.start = -1;
224 | c.end = -1;
225 | buffer[2] = c;
226 | }
227 | end = 3;
228 | position = 3;
229 | }
230 |
231 | private int fillBuffer()
232 | throws IOException
233 | {
234 | CharInfo c;
235 |
236 | // 末尾の6アイテムを、先頭にコピーする。
237 | int src = end - 6;
238 | int dst = 0;
239 | if (src < 0) {
240 | src = 0;
241 | }
242 | while (src < end) {
243 | buffer[dst++] = buffer[src++];
244 | }
245 |
246 | // end, positionをリセット
247 | int start = dst;
248 | end = dst;
249 | position = 3;
250 |
251 | // 残りの部分にデータを読み込む
252 | while (end < buffer.length) {
253 | c = readChar();
254 | if (c != null) {
255 | buffer[end++] = c;
256 | } else {
257 | break;
258 | }
259 | }
260 |
261 | return (end - start);
262 | }
263 |
264 | public Token next()
265 | throws IOException
266 | {
267 | if (done) {
268 | return null;
269 | }
270 |
271 | if (end <= 0) {
272 | // 初回呼び出し
273 | initBuffer();
274 | if (fillBuffer() > 3) {
275 | // 最初の文字をtokenBufferに格納
276 | tokenBuffer[tokenPosition++] = buffer[position++];
277 | } else {
278 | // 空のストリーム
279 | done = true;
280 | return null;
281 | }
282 | }
283 |
284 | Token token = null;
285 | do {
286 | while (position < end-3) {
287 | if (isBoundary()) {
288 | // トークン境界と判定
289 | if (tokenPosition > 0) {
290 | token = makeToken();
291 | tokenBuffer[tokenPosition++] = buffer[position++];
292 | break;
293 | }
294 | } else {
295 | // トークン境界ではない
296 | tokenBuffer[tokenPosition++] = buffer[position++];
297 | if (tokenPosition >= tokenBuffer.length) {
298 | // バッファがいっぱいになったので、一旦トークンとして切り出す
299 | token = makeToken();
300 | break;
301 | }
302 | }
303 | }
304 | } while (token == null && fillBuffer() > 0);
305 |
306 | if (token == null) {
307 | // 最後のトークンを切り出す
308 | done = true;
309 | token = makeToken();
310 | }
311 |
312 | return token;
313 | }
314 |
315 | private Token makeToken()
316 | {
317 | Token token = new Token();
318 | StringBuilder builder = new StringBuilder();
319 |
320 | token.start = tokenBuffer[0].start;
321 | for (int i=0; i 0) {
391 | p = "B";
392 | result = true;
393 | }
394 | p1 = p2;
395 | p2 = p3;
396 | p3 = p;
397 |
398 | return result;
399 | }
400 |
401 | private int getScore(Map m, String key)
402 | {
403 | Integer s = m.get(key);
404 | return (s != null ? s : 0);
405 | }
406 | }
407 |
--------------------------------------------------------------------------------
/src/main/java/net/moraleboost/tinysegmenter/TinySegmenterConstants.java:
--------------------------------------------------------------------------------
1 | // Automatically generated. Do not edit.
2 | package net.moraleboost.tinysegmenter;
3 |
4 | import java.util.Collections;
5 | import java.util.HashMap;
6 | import java.util.Map;
7 |
8 | public class TinySegmenterConstants
9 | {
10 | public static final int BIAS = -332;
11 | public static final String[] BC1_KEYS = {"OH","II","HH","KH"};
12 | public static final Integer[] BC1_VALS = {-1378,2461,6,406};
13 | public static final String[] BC2_KEYS = {"AN","MK","HH","IA","KI","KK","HM","AA","HN","HO","IH","II","IK","AI","IO","MH","OO"};
14 | public static final Integer[] BC2_VALS = {-878,3334,-4070,1327,3831,-8741,-1711,-3267,4012,3761,-1184,-1332,1721,2744,5492,-3132,-2920};
15 | public static final String[] BC3_KEYS = {"MK","MM","HH","HI","HK","OA","KK","HN","HO","IH","OH"};
16 | public static final Integer[] BC3_VALS = {1079,4034,996,626,-721,-1652,2762,-1307,-836,-301,266};
17 | public static final String[] BP1_KEYS = {"BB","UB","OB","OO"};
18 | public static final Integer[] BP1_VALS = {295,352,304,-125};
19 | public static final String[] BP2_KEYS = {"BO","OO"};
20 | public static final Integer[] BP2_VALS = {60,-1762};
21 | public static final String[] BQ1_KEYS = {"BHM","OHI","OKH","OKK","BII","BOH","OIH","BIM","BOO","BMH","OKA","OOO","BHH","BNH"};
22 | public static final Integer[] BQ1_VALS = {1521,451,-1020,904,-1158,-91,-296,886,-2597,1208,1851,2965,1150,449};
23 | public static final String[] BQ2_KEYS = {"BKK","OHH","BHM","BKO","BIH","OHM","OIH","UHI","BHH","BHI"};
24 | public static final Integer[] BQ2_VALS = {-1720,-1139,466,864,-919,-181,153,-1146,118,-1159};
25 | public static final String[] BQ3_KEYS = {"OHH","OKH","OKI","BNN","BII","OHM","BOH","OKO","OII","BMH","OMH","OOO","BMM","BHH","BHI","BKI"};
26 | public static final Integer[] BQ3_VALS = {2174,1798,-793,998,-299,439,775,-2242,280,937,-2402,11699,8335,-792,2664,419};
27 | public static final String[] BQ4_KEYS = {"BKK","OHH","OHK","BIH","BII","BIK","ONN","BOO","OAH","BMI","BHH"};
28 | public static final Integer[] BQ4_VALS = {-1806,266,-2036,3761,-4654,1348,-973,-12396,926,-3385,-3895};
29 | public static final String[] BW1_KEYS = {"引き","から","いう","を見","平方","B1同","てい","たち","大阪","B1あ","ませ","取り","には","てき","すで","毎日","どこ","なん","さら","こと","まで","の中","そこ","いっ","がら","とみ","さん","にも","った","ない","」と","つい","ため","した","うん","本当","でき","、と","やむ","よっ","まま","して","、同","に対","亡く","B1同","」と","です","大き","B1あ","をし","あっ","まる","京都","こん","なっ","とい","いる",",と","れた","など",",同","の一","目指","うし","れで","では","それ","こう","にし","日本"};
30 | public static final Integer[] BW1_VALS = {-1336,3472,1743,731,-2314,542,805,1122,1497,1404,2448,-2784,1498,1249,-3399,-2113,3887,-1113,-4143,2083,1711,741,1977,-2055,600,1922,4573,1671,3463,5713,1682,-802,601,2641,665,-2423,1127,660,-1947,-2565,2600,1104,727,-912,-1886,542,1682,3445,-2604,1404,1860,1505,-2155,2558,-1262,3015,-4915,672,660,2369,7379,727,-501,-724,-4817,-913,844,-871,-790,2468,-195};
31 | public static final String[] BW2_KEYS = {"――","れば","とこ","に対","11","んだ","はい","くな","一部","委員","ので","でも","いう","のに","はが","んな","新聞","とと","のの","会社","同党","との","もい","めて","しい","はず","一方","を通","少な","しか","上が","され","とみ","−−","とも","ない","本人","った","さん","に関","なが","って","っと","手権","した","かし","らか","曜日","年度","して","その","しな","もの","一人","東京","がい","らし","米国","一日","など","にお","うか","日米","たい","なの","らに","大阪","にし","府県","かも","りし","社会","から","まし","かれ","ばれ","てい","たた","にな","ただ","たち","第に","われ","てき","たと","てく","なん","同日", "","まで","きた","たは","こと","然と","この","がら","りま","でい","によ","11","でき","に従","ては","立て","でし","です","まれ","れた","ても","とい","分の","のか","ろう","出て","日本","れて","年間","日新","朝鮮","させ"};
32 | public static final Integer[] BW2_VALS = {-5730,4114,-1746,-14943,-669,728,1073,-1597,-1051,-1250,-7059,-4203,-1609,-6041,-1033,-4115,-4066,-2279,-6125,-1116,970,720,2230,-3153,-1819,-2532,-1375,-11877,-1050,-545,-4479,13168,5168,-13175,-3941,-2488,-2697,4589,-3977,-11388,-1313,1647,-2094,-1982,5078,-1350,-944,-601,-8669,972,-3744,939,-10713,602,-1543,853,-1611,-4268,970,-6509,-1615,2490,3372,-1253,2614,-1897,-2471,2748,-2363,-602,651,-1276,-7194,-1316,4612,1813,6144,-662,2454,-3857,-786,-1612,7901,3640,1224,2551,3099,-913,-11822,-6621,1941,-939,-8392,-1384,-4193,-3198,1620,2666,-7236,-669,-1528,-4688,-3110,-990,-3828,-4761,5409,4270,-3065,1890,-7758,2093,6067,2163,-7068,849,-1626,-722,-2355,4533};
33 | public static final String[] BW3_KEYS = {"でに","市","るる","では","れば","日、","た.","とし","が、","す.","んだ","に、","いい","んで","どう","いえ","新聞","た。","あり","ある","いく","れる","との","す。","そう","しい","だ.","ず,","カ月","いた","いっ","大会","とも","さを","ない","った","だ。","かけ","って","ず、","した","なく","れ,","して","しな","かっ","らし","けど","カ月","れ、","かに","がき","の,","など","がけ","いる","たい","しま","いわ","会議","にし","がっ","の、","うち","社会","から","かり","うと","の子","まし","てい","は,","ます","にな","い.","てお","われ","には","まっ","られ","まで","たの","きた","し,","こと","は、","べき","この","い。","がら","がり","か.","だっ","し、","たり","たる","さい","始め","ずに","する","です","か。","まれ","日,","ころ","あた","れた","えと","が,","ても","とう","れて","入り","に,"};
34 | public static final Integer[] BW3_VALS = {-1482,965,3818,2295,-3246,974,8875,2266,1816,-1310,606,-1021,5308,798,4664,2079,-5055,8875,719,3846,3029,1091,541,-1310,428,-3714,4098,3426,990,2056,1883,2217,-3543,976,1796,-4748,4098,-743,300,3426,3562,-903,854,1449,2608,-4098,1479,1374,990,854,-669,-4855,-724,2135,-1127,5600,-594,1200,1527,860,1771,-913,-724,1117,2024,6520,-2670,4798,-1000,1113,6240,1337,6943,1906,-1185,855,-605,2644,-1549,6820,6154,812,1645,1557,7397,1337,2181,1542,-1185,-4977,-2064,2857,1004,1557,-1183,-853,-714,1681,841,6521,1437,2857,-793,974,-2757,-2194,1850,1454,1816,302,-1387,1375,1232,-1021};
35 | public static final String[] TC1_KEYS = {"HOM","MMH","AAA","IHI","OOI","HHH","IOH","HHM","IOI","HII","HOH","IOM"};
36 | public static final Integer[] TC1_VALS = {-331,187,1093,1169,-1832,1029,-142,580,-1015,998,-390,467};
37 | public static final String[] TC2_KEYS = {"IHI","OII","HMM","KKH","HHO","HII"};
38 | public static final Integer[] TC2_VALS = {-1965,-2649,-1154,703,2088,-1023};
39 | public static final String[] TC3_KEYS = {"HHH","HHI","KOK","IOI","IIH","AAA","KKA","IIM","MHH","OHO","KKH","KHH","MHM","MHO","IHH","IHI","MMH","IHO","HOH","NNH","HII","HIK","NNO"};
40 | public static final Integer[] TC3_VALS = {346,-341,-1009,-542,-825,-294,491,-1035,-2694,-3393,-1217,-1216,-457,123,128,-3041,-471,-1935,-1486,-1689,-1088,731,662};
41 | public static final String[] TC4_KEYS = {"MOM","HHH","HHI","HHK","HHM","IIH","HHN","III","HHO","KKA","IOO","MHH","IIO","MHI","KKK","IHH","MMH","IHO","HOH","MMM","HIH","HII","KAK"};
42 | public static final Integer[] TC4_VALS = {841,-203,1344,365,-122,321,182,1497,669,3386,54,-405,656,201,3065,695,-241,-2324,446,661,804,679,4845};
43 | public static final String[] TQ1_KEYS = {"BHIH","OHHH","BOHH","OIIH","BNHH","OHIH","BIHH","BHHH","BHHI","OAKK","BIII","BOOO","OIHI"};
44 | public static final Integer[] TQ1_VALS = {-132,281,225,-68,-744,249,60,-227,316,482,1595,-908,200};
45 | public static final String[] TQ2_KEYS = {"BIHH","BKAK","BOOO","BIII"};
46 | public static final Integer[] TQ2_VALS = {-1401,-543,-5591,-1033};
47 | public static final String[] TQ3_KEYS = {"BHIH","BHII","OHII","OKAK","OOII","BHHH","OHHH","OHHI","BHHM","BIIH","BIII","OIIH","OOHH","OKKA","BMHI","BMHM","OHMH","OKHH","BOMH","OIHH"};
48 | public static final Integer[] TQ3_VALS = {222,-504,997,2792,-685,478,346,1729,-1073,-116,-105,1344,110,679,-863,-464,481,587,620,623};
49 | public static final String[] TQ4_KEYS = {"BHII","OHHH","OHHI","OKAK","OIIH","OIII","OHHO","OHIH","BHHH","OAKK","BIIH","BIII","OAAA","OIHH","BHHM","OIHI"};
50 | public static final Integer[] TQ4_VALS = {-966,-294,2446,-8156,626,-4007,480,-1573,-721,180,-607,-2181,-2763,1935,-3604,-493};
51 | public static final String[] TW1_KEYS = {"東京都","につい"};
52 | public static final Integer[] TW1_VALS = {2026,-4681};
53 | public static final String[] TW2_KEYS = {"だって","しょう","として","ある程","大きな","その後","ともに","ころが","対して","もので","社会党","ていた","一気に","いった","初めて","同時に"};
54 | public static final Integer[] TW2_VALS = {-1049,3873,-4657,-2049,-1255,-4430,-4517,-2434,-2721,1882,-3216,1833,-792,-1256,-1512,-8097};
55 | public static final String[] TW3_KEYS = {"ので、","として","のもの","にとっ","いただ","につい","してい","ので,","十二月","れから","に当た"};
56 | public static final Integer[] TW3_VALS = {-727,-4314,-600,-5989,-1734,-5483,1314,-727,-2287,-3752,-6247};
57 | public static final String[] TW4_KEYS = {"からな","ました","という","いう.","ようと","よると","たが,","ている","してい","いう。","ません","たが、"};
58 | public static final Integer[] TW4_VALS = {-2348,5543,1349,8576,-4258,5865,1516,1538,2958,8576,1097,1516};
59 | public static final String[] UC1_KEYS = {"M","O","K","A"};
60 | public static final Integer[] UC1_VALS = {645,-505,93,484};
61 | public static final String[] UC2_KEYS = {"M","N","O","H","I","A"};
62 | public static final Integer[] UC2_VALS = {3987,5775,646,1059,409,819};
63 | public static final String[] UC3_KEYS = {"A","I"};
64 | public static final Integer[] UC3_VALS = {-1370,2311};
65 | public static final String[] UC4_KEYS = {"M","N","O","H","I","K","A"};
66 | public static final Integer[] UC4_VALS = {3565,3876,6646,1809,-1032,-3450,-2643};
67 | public static final String[] UC5_KEYS = {"M","O","H","I","K"};
68 | public static final Integer[] UC5_VALS = {539,-831,313,-1238,-799};
69 | public static final String[] UC6_KEYS = {"M","O","H","I","K"};
70 | public static final Integer[] UC6_VALS = {247,-387,-506,-253,87};
71 | public static final String[] UP1_KEYS = {"O"};
72 | public static final Integer[] UP1_VALS = {-214};
73 | public static final String[] UP2_KEYS = {"B","O"};
74 | public static final Integer[] UP2_VALS = {69,935};
75 | public static final String[] UP3_KEYS = {"B"};
76 | public static final Integer[] UP3_VALS = {189};
77 | public static final String[] UQ1_KEYS = {"BH","BI","BK","BN","BO","OH","OI","OK","OO"};
78 | public static final Integer[] UQ1_VALS = {21,-12,-99,142,-56,-95,477,410,-2422};
79 | public static final String[] UQ2_KEYS = {"BH","BI","OK"};
80 | public static final Integer[] UQ2_VALS = {216,113,1759};
81 | public static final String[] UQ3_KEYS = {"BH","BI","BK","BM","BN","BO","OI","BA","ON"};
82 | public static final Integer[] UQ3_VALS = {42,1913,-7198,3160,6427,14761,-827,-479,-3212};
83 | public static final String[] UW1_KEYS = {"京","あ","委","う","が","き","「","こ","・","大","区","市","、","国","午","で","と","ど",",","に","「","の","は","日","生","理","都","も","や","よ","ら","県","り","主","れ","を","ん","・"};
84 | public static final Integer[] UW1_VALS = {-268,-941,729,-127,-553,121,-463,505,-135,561,-912,-411,156,-460,871,-201,-547,-123,156,-789,-463,-185,-847,-141,-408,361,-718,-466,-470,182,-292,-386,208,-402,169,-446,-137,-135};
85 | public static final String[] UW2_KEYS = {"揺","市","も","会","や","保","よ","最","り","初","る","れ","文","第","入","を","ん","自","ア","朝",",","カ","キ","事","本","西","新","「","」","、","見","ッ","ッ","北","〇","ア","小","子","「","カ","」","目","キ","開","相","間","副","大","学","天","太","理","人","区","県","日","立","次","三","年","不","強","東","込","世","あ","行","い","う","政","お","か","が","手","く","こ","中","さ","ざ","明","し","発","実","す","米","せ","そ","た","だ","民","主","つ","て","果","で","気","と","ど","な","議","に","の","は","ひ","調","べ","ま"};
86 | public static final Integer[] UW2_VALS = {-1033,-813,-1263,978,-402,362,1639,-630,-579,-3025,-694,571,-1355,810,548,-2516,2095,-1353,-587,-1843,-829,306,568,492,-1650,-744,-1682,-645,3145,-829,-3874,831,831,-3414,892,-587,-2009,-1519,-645,306,3145,-1584,568,1758,-242,-1257,-1566,-1769,760,-865,-483,752,-123,-422,-1165,-1815,-763,-2378,-758,-1060,-2150,1067,-931,3041,-302,-538,838,505,134,1522,-502,1454,-856,-1519,-412,1141,-968,878,540,-1462,1529,529,1023,-675,509,300,-1011,188,1837,-180,-861,-949,-291,-665,-268,-1740,-981,1273,1063,1198,-1764,130,-409,-1273,1010,1261,600};
87 | public static final String[] UW3_KEYS = {"1","低","前","関","何","作","李","村","費","口","込","立","、","学","総","々","副","〇","日","旧","右",",","」","線","平","年","〓","一","森","知","東","国","各","下","合","海","広","非","同","安","米","指","世","力","的","能","両","氏","民","府","実","思","中","あ","い","度","う","性","え","お","か","昨","が","生","主","く","け","げ","家","こ","ご","さ","用","し","元","す","通","せ","そ","第","グ","た","ち","っ","つ","て","時","で","と","町","ど","な","に","動","の","は","務","党","ひ","保","私","ふ","へ","ほ","ま","全","み","め","公","も","六","や","共","よ","ら","車","り","る","れ","軍","わ","を","金","ん","業","物","建","1","円","予","二","ア","決","再","直","和","型","特","英","小","化","少","北","系","グ","省","外","約","選","ス","者","県","税","ッ","ト","無","級","人","区","戸","千","核","今","午","ム","政","他","協","ル","ロ","」","・","当","ン","員","以","ッ","・","調","ア","教","州","法","曜","ス","−","駅","郎","ト","数","ム","分","市","自","郡","ル","最","統","ロ","ン","部","文","月","雨","初","得","長","別","電","期","見","場","開","新","妻","間","財"};
88 | public static final Integer[] UW3_VALS = {-800,811,2286,-1282,4265,-361,3094,364,1777,483,-1504,-960,4889,-1356,1163,-2311,4437,5827,2099,5792,1233,4889,2670,1255,-1804,2416,-3573,-1619,2438,-1528,-805,642,3588,-1759,-241,-495,-1030,2066,3906,-423,7767,-3973,-2087,365,7313,725,3815,2613,-1694,1605,-1008,-1291,653,-2696,1006,1452,2342,1822,1983,-4864,-1163,-661,3271,-273,-758,1004,388,401,1078,-3552,-3116,-1058,914,-395,4858,584,-1136,3685,-5228,1201,1319,842,-521,-1444,-1081,6167,-1248,2318,1691,1215,-899,-2788,2745,-949,4056,4555,-1872,3593,-2171,-2439,4231,-1798,1199,-5516,-4384,1574,-120,1205,-3030,2323,755,-788,-1880,-202,727,1835,649,5905,2773,1375,-1207,6620,2163,-518,484,461,-2352,-800,5807,-1193,974,551,-1073,3095,-1835,-837,1389,-3850,785,-513,1327,-3102,-1038,3066,1319,792,-241,3663,-681,874,6457,6293,401,-1350,521,979,1384,2742,4646,-488,-2309,5156,792,-783,1109,-2013,1889,-1006,1591,2201,2670,-3794,-3885,278,4513,-1368,-1350,-3794,-562,551,-1479,1155,1868,-951,874,-1723,1620,1026,521,3222,1109,457,3197,-2869,4404,1591,-937,-4229,2201,278,1200,-1489,4125,2009,2475,1905,421,1129,-1045,360,1044,1219,-1432,1764,2016,1302,-733};
89 | public static final String[] UW4_KEYS = {"般","前","体","子","作","回","込","立","、","。","学","総","副","〇","行","日","来","「",",","」",".","線","近","年","〓","島","一","国","賞","庁","合","警","米","署","園","議","力","的","能","率","定","氏","民","気","中","あ","い","う","性","え","地","お","か","が","き","生","ぎ","く","け","産","げ","こ","ご","さ","し","じ","す","ず","せ","そ","先","田","第","た","だ","ち","っ","つ","て","時","で","と","町","な","に","ぬ","動","ね","の","館","は","ば","務","党","ひ","び","ふ","へ","べ","ほ","ま","み","む","め","も","ゃ","や","士","共","ょ","よ","ら","車","り","―","る","れ","軍","ろ","わ","野","を","ん","業","道","物","寺","内","円","予","目","事","高","和","院","井","カ","小","化","系","球","省","済","コ","多","約","選","者","セ","県","大","ッ","校","ト","沢","人","区","支","改","首","領","際","所","メ","政","屋","ラ","輪","リ","協","ル","「","」","・","ン","谷","員","以","ッ","ー","川","・","教","ー","経","カ","器","コ","セ","側","山","郎","ト","題","メ","市","ラ","リ","ル","最","統","ン","文","後","空","月","会","初","長","都","感","電","銀","規","木","場","間","参","塁","方"};
90 | public static final Integer[] UW4_VALS = {-852,1623,-1286,-4802,530,1500,-3370,-2112,3930,3508,-1397,940,3879,4999,-792,1798,-442,1895,3930,3798,3508,-994,929,374,-5156,-2056,-2069,-619,730,-4556,-1834,-1184,2937,749,-1200,-244,-302,2586,-730,672,-1057,5388,-2716,-910,2210,4752,-3435,-640,553,-2514,866,2405,530,6006,-4482,-1286,-3821,-3788,-4376,-1101,-4734,2255,1979,2864,-843,-2506,-731,1251,181,4091,601,-2900,788,5034,5408,-3654,-5882,-1659,3994,1829,7410,4547,1826,5433,6499,1853,-740,1413,7396,-1984,8578,1940,-2715,-2006,4249,-4134,1345,6665,-744,1464,1051,-2082,-882,-5046,4169,-2666,2795,-1413,-1212,-1544,3351,-2922,-1481,-9726,-4841,-14896,-2613,1158,-4570,-1783,-1100,13150,-2352,-1043,-1291,-735,-809,584,788,782,922,-190,2120,-681,-2297,-1768,2145,1910,776,786,-1267,-3485,-543,1789,1067,2171,2596,2145,1287,2997,571,-724,-360,-403,-939,1036,4517,856,787,1749,-1659,-2604,-1566,-1635,2182,-1328,-881,-1433,-541,1013,-856,1895,3798,-4371,-3637,-1000,-910,544,-724,-11870,-2667,-4371,704,-11870,1146,2145,-851,1789,1287,4292,-1500,-4866,-403,-792,-1635,2771,-881,-541,-856,845,-1169,-3637,522,456,-867,-9066,950,1347,357,1192,916,-878,-2213,792,-485,-1410,-2344,1555,-2094,-856};
91 | public static final String[] UW5_KEYS = {"み","市","1","め","ゃ","会","党","ょ","務","り","る","E2","れ","嵐","田","わ","郎","月","を","ん","町","題","統","空","イ","席",",",".","館","新","「","長","、","。","査","イ","「","京","相","E2","間","]","大","学","省","社","区","県","ル","日","機","ル","者","年","ン","ン","選","あ","所","い","う","格","え","お","か","が","き","ぎ","く","員","げ","定","中","さ","し","語","す","挙","思","表","氏","だ","ち","的","っ","つ","て","1","で","と","ど","な","議","に","の","は","研","べ","告"};
92 | public static final Integer[] UW5_VALS = {502,-2991,-514,865,3350,-1153,-654,854,3519,-208,429,-32768,504,-1304,240,419,-368,-4353,-1264,327,-3912,2368,1955,-813,241,921,465,-299,-689,-1682,363,786,465,-299,932,241,363,722,1319,-32768,1191,-2762,-1296,-548,-1052,-278,-901,-4003,451,218,-1508,451,-2233,1763,-343,-343,-1018,1655,-814,331,-503,1356,1199,527,647,-421,1624,1971,312,2104,-983,1785,-871,-1537,-1371,-1073,-852,1618,872,663,-1347,-1186,1093,-3149,52,921,-18,-514,-850,-127,1682,-787,1219,-1224,-635,-578,-997,1001,848};
93 | public static final String[] UW6_KEYS = {"1","E1","あ","空","委","う","業","か","が","会","く","一","郎","こ","じ","区","す","学","E1","市","1","た","、","。","っ","連","て","で","と",",","な","に","後",".","の","は","福","相","中","広","も","社","員","ル","前","件","り","る","ン","ル","を","ン","者"};
94 | public static final Integer[] UW6_VALS = {-270,306,-307,-822,798,189,-697,241,-73,624,-121,-277,1082,-200,1782,1792,383,-960,306,887,-270,-428,227,808,573,463,-1014,101,-105,227,-253,-149,535,808,-417,-236,974,753,201,-695,-206,-507,-1212,-673,302,-800,187,-135,-496,-673,195,-496,1811};
95 | public static final Map BC1;
96 | public static final Map BC2;
97 | public static final Map BC3;
98 | public static final Map BP1;
99 | public static final Map BP2;
100 | public static final Map BQ1;
101 | public static final Map BQ2;
102 | public static final Map BQ3;
103 | public static final Map BQ4;
104 | public static final Map BW1;
105 | public static final Map BW2;
106 | public static final Map BW3;
107 | public static final Map TC1;
108 | public static final Map TC2;
109 | public static final Map TC3;
110 | public static final Map TC4;
111 | public static final Map TQ1;
112 | public static final Map TQ2;
113 | public static final Map TQ3;
114 | public static final Map TQ4;
115 | public static final Map TW1;
116 | public static final Map TW2;
117 | public static final Map TW3;
118 | public static final Map TW4;
119 | public static final Map UC1;
120 | public static final Map UC2;
121 | public static final Map UC3;
122 | public static final Map UC4;
123 | public static final Map UC5;
124 | public static final Map UC6;
125 | public static final Map UP1;
126 | public static final Map UP2;
127 | public static final Map UP3;
128 | public static final Map UQ1;
129 | public static final Map UQ2;
130 | public static final Map UQ3;
131 | public static final Map UW1;
132 | public static final Map UW2;
133 | public static final Map UW3;
134 | public static final Map UW4;
135 | public static final Map UW5;
136 | public static final Map UW6;
137 | static {
138 | int i;
139 | Map m;
140 | m = new HashMap();
141 | for (i=0; i();
146 | for (i=0; i();
151 | for (i=0; i();
156 | for (i=0; i();
161 | for (i=0; i();
166 | for (i=0; i();
171 | for (i=0; i();
176 | for (i=0; i();
181 | for (i=0; i();
186 | for (i=0; i();
191 | for (i=0; i();
196 | for (i=0; i();
201 | for (i=0; i();
206 | for (i=0; i();
211 | for (i=0; i();
216 | for (i=0; i();
221 | for (i=0; i();
226 | for (i=0; i();
231 | for (i=0; i();
236 | for (i=0; i();
241 | for (i=0; i();
246 | for (i=0; i();
251 | for (i=0; i();
256 | for (i=0; i();
261 | for (i=0; i();
266 | for (i=0; i();
271 | for (i=0; i();
276 | for (i=0; i();
281 | for (i=0; i();
286 | for (i=0; i();
291 | for (i=0; i();
296 | for (i=0; i();
301 | for (i=0; i();
306 | for (i=0; i();
311 | for (i=0; i();
316 | for (i=0; i();
321 | for (i=0; i();
326 | for (i=0; i();
331 | for (i=0; i();
336 | for (i=0; i();
341 | for (i=0; i();
346 | for (i=0; i ret = new ArrayList();
35 | CharacterIterator iter = new StringCharacterIterator(str);
36 | StringBuilder token;
37 | char c = DONE;
38 |
39 | for (c=iter.first(); c!=DONE; c=iter.next()) {
40 | // トークン先頭の空白を読み飛ばす
41 | while (c == ' ' || c == '\t') {
42 | c = iter.next();
43 | }
44 |
45 | token = new StringBuilder();
46 | if (c == '"') {
47 | // クォート文字列
48 | // 終わりまで読む
49 | while ((c = iter.next()) != DONE) {
50 | if (c == '"') {
51 | c = iter.next();
52 | // 2つ連続する「"」は、エスケープされた「"」
53 | if (c == '"') {
54 | // これはエスケープされた「"」
55 | token.append(c);
56 | } else {
57 | // クォート文字列の終わり
58 | break;
59 | }
60 | } else {
61 | token.append(c);
62 | }
63 | }
64 | // ","まで文字列を読み飛ばす
65 | while (c != DONE && c != ',') {
66 | c = iter.next();
67 | }
68 | } else {
69 | // 次の","まで、トークンを読み取る
70 | while (c != DONE && c != ',') {
71 | token.append(c);
72 | c = iter.next();
73 | }
74 | }
75 |
76 | --max;
77 | if (max <= 0) {
78 | // これ以上の文字列は、すべて最後のトークンにマージする
79 | while (c != DONE) {
80 | token.append(c);
81 | c = iter.next();
82 | }
83 | }
84 |
85 | ret.add(token.toString());
86 | }
87 |
88 | // 汚いハック: 文字列が","で終わる場合、最後に空白要素を追加する
89 | if (max > 0 && str.endsWith(",")) {
90 | ret.add("");
91 | }
92 |
93 | return ret.toArray(new String[ret.size()]);
94 | }
95 |
96 | /**
97 | * CSVの要素内で使用できない文字をエスケープする
98 | *
99 | * @param str エスケープする文字列
100 | * @return エスケープされた文字列
101 | */
102 | public static String escape(String str)
103 | {
104 | // 「 」「\t」「"」「,」のいずれかが含まれていれば、ダブルクォーテーションで囲む
105 | StringBuilder ret = null;
106 | char c;
107 | for (int i=0; i elements)
143 | {
144 | StringBuilder b = new StringBuilder();
145 |
146 | boolean first = true;
147 | for (String e: elements) {
148 | if (first) {
149 | first = false;
150 | } else {
151 | b.append(",");
152 | }
153 | b.append(escape(e));
154 | }
155 |
156 | return b.toString();
157 | }
158 |
 159 | /**
 160 | * Builds a single line of CSV data, using each entry of elements as one column.
 161 | *
 162 | * @param elements the value of each column
 163 | * @return one line of CSV data
 164 | */
 165 | public static String join(String[] elements)
 166 | {
 167 | return join(Arrays.asList(elements));
 168 | }
169 | }
170 |
--------------------------------------------------------------------------------
/src/test/java/net/moraleboost/io/BasicCodePointReaderTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | **
3 | ** Feb. 17, 2009
4 | **
5 | ** The author disclaims copyright to this source code.
6 | ** In place of a legal notice, here is a blessing:
7 | **
8 | ** May you do good and not evil.
9 | ** May you find forgiveness for yourself and forgive others.
10 | ** May you share freely, never taking more than you give.
11 | **
12 | ** Stolen from SQLite :-)
13 | ** Any feedback is welcome.
14 | ** Kohei TAKETA
15 | **
16 | */
17 | package net.moraleboost.io;
18 |
19 | import org.junit.Test;
20 |
21 | import java.io.CharArrayReader;
22 | import java.io.CharArrayWriter;
23 | import java.io.IOException;
24 | import java.io.StringReader;
25 |
26 | import static org.junit.Assert.assertTrue;
27 | import static org.junit.Assert.fail;
28 |
29 | public class BasicCodePointReaderTest
30 | {
31 | @Test
32 | public void testBasic() throws IOException
33 | {
34 | String str = "あaいbうcえdお";
35 | int[] answer = getCodePoints(str);
36 | long[] positions = getPositions(str);
37 | CodePointReader reader = new BasicCodePointReader(new StringReader(str));
38 | if (!match(reader, answer, positions)) {
39 | fail("コードポイントが一致しません。");
40 | }
41 | }
42 |
43 | @Test
44 | public void testSurrogatePair() throws IOException
45 | {
46 | int scp = 0x00010400;
47 | int[] answer = new int[] { scp, 'あ', 'a', 'い', scp, scp, 'd', 'お', scp };
48 | long[] positions = new long[] { 0, 2, 3, 4, 5, 7, 9, 10, 11, 13 };
49 |
50 | String str = new String(answer, 0, answer.length);
51 | CodePointReader reader = new BasicCodePointReader(new StringReader(str));
52 | if (!match(reader, answer, positions)) {
53 | fail("コードポイントが一致しません。");
54 | }
55 | }
56 |
57 | @Test
58 | public void testEndWithHighSurrogate() throws IOException
59 | {
60 | int scp = 0x00010400;
61 | String base = "本日は晴天なり";
62 | char highSurrogate = Character.toChars(scp)[0];
63 | assertTrue(Character.isHighSurrogate(highSurrogate));
64 |
65 | CharArrayWriter writer = new CharArrayWriter();
66 | writer.write(base);
67 | writer.write(highSurrogate);
68 |
69 | int[] answer = getCodePoints(base
70 | + (char)BasicCodePointReader.DEFAULT_ALTERNATION_CODEPOINT);
71 | long[] positions = getPositions(base
72 | + (char)BasicCodePointReader.DEFAULT_ALTERNATION_CODEPOINT);
73 | CodePointReader reader = new BasicCodePointReader(new CharArrayReader(
74 | writer.toCharArray()));
75 | if (!match(reader, answer, positions)) {
76 | fail("コードポイントが一致しません。");
77 | }
78 | }
79 |
80 | @Test
81 | public void testEndWithLowSurrogate() throws IOException
82 | {
83 | int scp = 0x00010400;
84 | String base = "本日は晴天なり";
85 | char lowSurrogate = Character.toChars(scp)[1];
86 | assertTrue(Character.isLowSurrogate(lowSurrogate));
87 |
88 | CharArrayWriter writer = new CharArrayWriter();
89 | writer.write(base);
90 | writer.write(lowSurrogate);
91 |
92 | int[] answer = getCodePoints(base
93 | + (char)BasicCodePointReader.DEFAULT_ALTERNATION_CODEPOINT);
94 | long[] positions = getPositions(base
95 | + (char)BasicCodePointReader.DEFAULT_ALTERNATION_CODEPOINT);
96 | CodePointReader reader = new BasicCodePointReader(new CharArrayReader(
97 | writer.toCharArray()));
98 | if (!match(reader, answer, positions)) {
99 | fail("コードポイントが一致しません。");
100 | }
101 | }
102 |
103 | @Test
104 | public void testStartWithHighSurrogate() throws IOException
105 | {
106 | int scp = 0x00010400;
107 | String base = "本日は晴天なり";
108 | char highSurrogate = Character.toChars(scp)[0];
109 | assertTrue(Character.isHighSurrogate(highSurrogate));
110 |
111 | CharArrayWriter writer = new CharArrayWriter();
112 | writer.write(highSurrogate);
113 | writer.write(base);
114 |
115 | int[] answer = getCodePoints((char)BasicCodePointReader.DEFAULT_ALTERNATION_CODEPOINT
116 | + base);
117 | long[] positions = getPositions((char)BasicCodePointReader.DEFAULT_ALTERNATION_CODEPOINT
118 | + base);
119 | CodePointReader reader = new BasicCodePointReader(new CharArrayReader(
120 | writer.toCharArray()));
121 | if (!match(reader, answer, positions)) {
122 | fail("コードポイントが一致しません。");
123 | }
124 | }
125 |
126 | @Test
127 | public void testStartWithLowSurrogate() throws IOException
128 | {
129 | int scp = 0x00010400;
130 | String base = "本日は晴天なり";
131 | char lowSurrogate = Character.toChars(scp)[1];
132 | assertTrue(Character.isLowSurrogate(lowSurrogate));
133 |
134 | CharArrayWriter writer = new CharArrayWriter();
135 | writer.write(lowSurrogate); // low surrogate
136 | writer.write(base);
137 |
138 | int[] answer = getCodePoints((char)BasicCodePointReader.DEFAULT_ALTERNATION_CODEPOINT
139 | + base);
140 | long[] positions = getPositions((char)BasicCodePointReader.DEFAULT_ALTERNATION_CODEPOINT
141 | + base);
142 | CodePointReader reader = new BasicCodePointReader(new CharArrayReader(
143 | writer.toCharArray()));
144 | if (!match(reader, answer, positions)) {
145 | fail("コードポイントが一致しません。");
146 | }
147 | }
148 |
149 | @Test
150 | public void testIllformedSurrogate() throws IOException
151 | {
152 | int scp = 0x00010400;
153 | int[] original = new int[] { 'あ', 'a', 'い', scp, scp, 'd', 'お' };
154 | // high surrogateを破壊した場合の正解
155 | int[] answer1 = new int[] { 'あ', 'a', 'い', 'a',
156 | BasicCodePointReader.DEFAULT_ALTERNATION_CODEPOINT, scp, 'd',
157 | 'お' };
158 | long[] positions1 = new long[] { 0, 1, 2, 3, 4, 5, 7, 8, 9 };
159 | // low surrogateを破壊した場合の正解
160 | int[] answer2 = new int[] { 'あ', 'a', 'い',
161 | BasicCodePointReader.DEFAULT_ALTERNATION_CODEPOINT, 'a', scp,
162 | 'd', 'お' };
163 | long[] positions2 = new long[] { 0, 1, 2, 3, 4, 5, 7, 8, 9 };
164 |
165 | // 不正なデータを作成
166 | char[] chars1 = new String(original, 0, original.length).toCharArray();
167 | char[] chars2 = new String(original, 0, original.length).toCharArray();
168 |
169 | // high surrogateを破壊
170 | chars1[3] = 'a';
171 | String ill1 = new String(chars1);
172 |
173 | // low surrogateを破壊
174 | chars2[4] = 'a';
175 | String ill2 = new String(chars2);
176 |
177 | if (!match(new BasicCodePointReader(new StringReader(ill1)), answer1,
178 | positions1)) {
179 | fail("Low surrogateが単独で存在する場合のコードポイントが一致しません。");
180 | }
181 | if (!match(new BasicCodePointReader(new StringReader(ill2)), answer2,
182 | positions2)) {
183 | fail("High surrogateが単独で存在する場合のコードポイントが一致しません。");
184 | }
185 | }
186 |
187 | private boolean match(CodePointReader reader, int[] answer, long[] positions)
188 | throws IOException
189 | {
190 | int i = 0;
191 | int cp;
192 | while ((cp = reader.read()) >= 0) {
193 | if (cp != answer[i]) {
194 | return false;
195 | }
196 | if (reader.getPosition() != positions[i + 1]) {
197 | return false;
198 | }
199 | ++i;
200 | }
201 |
202 | return (i == answer.length);
203 | }
204 |
205 | private int[] getCodePoints(String str)
206 | {
207 | int count = str.codePointCount(0, str.length());
208 | int[] result = new int[count];
209 | int cpIndex = 0, charIndex = 0;
210 | while (charIndex < str.length()) {
211 | int cp = str.codePointAt(charIndex);
212 | result[cpIndex++] = cp;
213 | charIndex += Character.charCount(cp);
214 | }
215 |
216 | return result;
217 | }
218 |
219 | private long[] getPositions(String str)
220 | {
221 | int count = str.codePointCount(0, str.length());
222 | long[] positions = new long[count + 1];
223 | int cpIndex = 0, charIndex = 0;
224 | while (charIndex < str.length()) {
225 | int cp = str.codePointAt(charIndex);
226 | positions[cpIndex++] = charIndex;
227 | charIndex += Character.charCount(cp);
228 | }
229 | positions[cpIndex] = charIndex;
230 |
231 | return positions;
232 | }
233 | }
234 |
--------------------------------------------------------------------------------
/src/test/java/net/moraleboost/io/PushbackCodePointReaderTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | **
3 | ** Feb. 17, 2009
4 | **
5 | ** The author disclaims copyright to this source code.
6 | ** In place of a legal notice, here is a blessing:
7 | **
8 | ** May you do good and not evil.
9 | ** May you find forgiveness for yourself and forgive others.
10 | ** May you share freely, never taking more than you give.
11 | **
12 | ** Stolen from SQLite :-)
13 | ** Any feedback is welcome.
14 | ** Kohei TAKETA
15 | **
16 | */
17 | package net.moraleboost.io;
18 |
19 | import org.junit.Test;
20 |
21 | import java.io.CharArrayReader;
22 | import java.io.IOException;
23 |
24 | import static org.junit.Assert.assertEquals;
25 | import static org.junit.Assert.fail;
26 |
27 | public class PushbackCodePointReaderTest
28 | {
29 | @Test
30 | public void testBasic() throws Exception
31 | {
32 | String str = "abc";
33 | int scp = 0x00010400;
34 |
35 | CharArrayReader car = new CharArrayReader(str.toCharArray());
36 | CodePointReader base = new BasicCodePointReader(car);
37 | PushbackCodePointReader reader = new PushbackCodePointReader(base, 2);
38 |
39 | assertEquals((int)'a', reader.read());
40 | assertEquals(1L, reader.getPosition());
41 | assertEquals((int)'b', reader.read());
42 | assertEquals(2L, reader.getPosition());
43 | assertEquals((int)'c', reader.read());
44 | assertEquals(3L, reader.getPosition());
45 | reader.unread((int)'あ', 1);
46 | assertEquals(2L, reader.getPosition());
47 | reader.unread(scp, 2);
48 | assertEquals(0L, reader.getPosition());
49 | assertEquals(scp, reader.read());
50 | assertEquals(2L, reader.getPosition());
51 | assertEquals((int)'あ', reader.read());
52 | assertEquals(3L, reader.getPosition());
53 | assertEquals(-1, reader.read());
54 | }
55 |
56 | @Test
57 | public void testMaxSize() throws Exception
58 | {
59 | String str = "abc";
60 |
61 | CharArrayReader car = new CharArrayReader(str.toCharArray());
62 | CodePointReader base = new BasicCodePointReader(car);
63 | PushbackCodePointReader reader = new PushbackCodePointReader(base, 2);
64 |
65 | for (int i = 0; i < 3; ++i) {
66 | reader.read();
67 | }
68 | for (int i = 0; i < 2; ++i) {
69 | reader.unread('a', 1);
70 | }
71 |
72 | try {
73 | reader.unread('a', 1);
74 | fail("スタックサイズ上限の指定が機能していません。");
75 | } catch (IOException e) {
76 | }
77 | }
78 | }
79 |
--------------------------------------------------------------------------------
/src/test/java/net/moraleboost/mecab/impl/StandardTaggerTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | **
3 | ** Mar. 1, 2008
4 | **
5 | ** The author disclaims copyright to this source code.
6 | ** In place of a legal notice, here is a blessing:
7 | **
8 | ** May you do good and not evil.
9 | ** May you find forgiveness for yourself and forgive others.
10 | ** May you share freely, never taking more than you give.
11 | **
12 | ** Stolen from SQLite :-)
13 | ** Any feedback is welcome.
14 | ** Kohei TAKETA
15 | **
16 | */
17 | package net.moraleboost.mecab.impl;
18 |
19 | import net.moraleboost.mecab.Lattice;
20 | import net.moraleboost.mecab.Node;
21 | import net.moraleboost.mecab.Tagger;
22 | import org.junit.Test;
23 |
24 | import static org.junit.Assert.fail;
25 |
26 | public class StandardTaggerTest
27 | {
28 | public static String[] TEXTS = {
29 | "メロスは激怒した。必ず、かの邪智暴虐(じゃちぼうぎゃく)の王を除かなければならぬと決意した。メロスには政治がわからぬ。メロスは、村の牧人である。笛を吹き、羊と遊んで暮して来た。けれども邪悪に対しては、人一倍に敏感であった。きょう未明メロスは村を出発し、野を越え山越え、十里はなれた此(こ)のシラクスの市にやって来た。メロスには父も、母も無い。女房も無い。十六の、内気な妹と二人暮しだ。この妹は、村の或る律気な一牧人を、近々、花婿(はなむこ)として迎える事になっていた。結婚式も間近かなのである。メロスは、それゆえ、花嫁の衣裳やら祝宴の御馳走やらを買いに、はるばる市にやって来たのだ。先ず、その品々を買い集め、それから都の大路をぶらぶら歩いた。メロスには竹馬の友があった。セリヌンティウスである。今は此のシラクスの市で、石工をしている。その友を、これから訪ねてみるつもりなのだ。久しく逢わなかったのだから、訪ねて行くのが楽しみである。歩いているうちにメロスは、まちの様子を怪しく思った。ひっそりしている。もう既に日も落ちて、まちの暗いのは当りまえだが、けれども、なんだか、夜のせいばかりでは無く、市全体が、やけに寂しい。のんきなメロスも、だんだん不安になって来た。路で逢った若い衆をつかまえて、何かあったのか、二年まえに此の市に来たときは、夜でも皆が歌をうたって、まちは賑やかであった筈(はず)だが、と質問した。若い衆は、首を振って答えなかった。しばらく歩いて老爺(ろうや)に逢い、こんどはもっと、語勢を強くして質問した。老爺は答えなかった。メロスは両手で老爺のからだをゆすぶって質問を重ねた。老爺は、あたりをはばかる低声で、わずか答えた。",
30 | "メロスは、単純な男であった。買い物を、背負ったままで、のそのそ王城にはいって行った。たちまち彼は、巡邏(じゅんら)の警吏に捕縛された。調べられて、メロスの懐中からは短剣が出て来たので、騒ぎが大きくなってしまった。メロスは、王の前に引き出された。「この短刀で何をするつもりであったか。言え!」暴君ディオニスは静かに、けれども威厳を以(もっ)て問いつめた。その王の顔は蒼白(そうはく)で、眉間(みけん)の皺(しわ)は、刻み込まれたように深かった。「市を暴君の手から救うのだ。」とメロスは悪びれずに答えた。「おまえがか?」王は、憫笑(びんしょう)した。「仕方の無いやつじゃ。おまえには、わしの孤独がわからぬ。」「言うな!」とメロスは、いきり立って反駁(はんばく)した。「人の心を疑うのは、最も恥ずべき悪徳だ。王は、民の忠誠をさえ疑って居られる。」「疑うのが、正当の心構えなのだと、わしに教えてくれたのは、おまえたちだ。人の心は、あてにならない。人間は、もともと私慾のかたまりさ。信じては、ならぬ。」暴君は落着いて呟(つぶや)き、ほっと溜息(ためいき)をついた。「わしだって、平和を望んでいるのだが。」「なんの為の平和だ。自分の地位を守る為か。」こんどはメロスが嘲笑した。「罪の無い人を殺して、何が平和だ。」「だまれ、下賤(げせん)の者。」王は、さっと顔を挙げて報いた。「口では、どんな清らかな事でも言える。わしには、人の腹綿の奥底が見え透いてならぬ。おまえだって、いまに、磔(はりつけ)になってから、泣いて詫(わ)びたって聞かぬぞ。」「ああ、王は悧巧(りこう)だ。自惚(うぬぼ)れているがよい。私は、ちゃんと死ぬる覚悟で居るのに。命乞いなど決してしない。ただ、――」と言いかけて、メロスは足もとに視線を落し瞬時ためらい、「ただ、私に情をかけたいつもりなら、処刑までに三日間の日限を与えて下さい。たった一人の妹に、亭主を持たせてやりたいのです。三日のうちに、私は村で結婚式を挙げさせ、必ず、ここへ帰って来ます。」「ばかな。」と暴君は、嗄(しわが)れた声で低く笑った。「とんでもない嘘(うそ)を言うわい。逃がした小鳥が帰って来るというのか。」「そうです。帰って来るのです。」メロスは必死で言い張った。「私は約束を守ります。私を、三日間だけ許して下さい。妹が、私の帰りを待っているのだ。そんなに私を信じられないならば、よろしい、この市にセリヌンティウスという石工がいます。私の無二の友人だ。あれを、人質としてここに置いて行こう。私が逃げてしまって、三日目の日暮まで、ここに帰って来なかったら、あの友人を絞め殺して下さい。たのむ、そうして下さい。」",
31 | "それを聞いて王は、残虐な気持で、そっと北叟笑(ほくそえ)んだ。生意気なことを言うわい。どうせ帰って来ないにきまっている。この嘘つきに騙(だま)された振りして、放してやるのも面白い。そうして身代りの男を、三日目に殺してやるのも気味がいい。人は、これだから信じられぬと、わしは悲しい顔して、その身代りの男を磔刑に処してやるのだ。世の中の、正直者とかいう奴輩(やつばら)にうんと見せつけてやりたいものさ。「願いを、聞いた。その身代りを呼ぶがよい。三日目には日没までに帰って来い。おくれたら、その身代りを、きっと殺すぞ。ちょっとおくれて来るがいい。おまえの罪は、永遠にゆるしてやろうぞ。」「なに、何をおっしゃる。」「はは。いのちが大事だったら、おくれて来い。おまえの心は、わかっているぞ。」"
32 | };
33 |
34 | @Test
35 | public void testParse()
36 | {
37 | try {
38 | Tagger tagger = new StandardTagger("");
39 | Lattice lattice = tagger.createLattice();
40 | lattice.setSentence("本日は晴天なり。");
41 | tagger.parse(lattice);
42 | Node node = lattice.bosNode().next();
43 |
44 | while (node != null && node.stat() != Node.TYPE_EOS_NODE) {
45 | System.out.println("Surface = " + node.surface());
46 | System.out.println("Feature = " + node.feature());
47 | node = node.next();
48 | }
49 | lattice.destroy();
50 | tagger.destroy();
51 | } catch (Exception e) {
52 | fail(e.toString());
53 | }
54 | }
55 |
56 | @Test
57 | public void testPerf()
58 | {
59 | try {
60 | Tagger tagger = new StandardTagger("");
61 | Lattice lattice = tagger.createLattice();
62 | String[] leadingSpaceAndSurface = new String[2];
63 |
64 | // warming up
65 | for (int i=0; i<100; ++i) {
66 | lattice.clear();
67 | lattice.setSentence(TEXTS[i % TEXTS.length]);
68 | tagger.parse(lattice);
69 | Node node = lattice.bosNode().next();
70 |
71 | while (node != null && node.stat() != Node.TYPE_EOS_NODE) {
72 | node.leadingSpaceAndSurface(leadingSpaceAndSurface);
73 | node.feature();
74 | node = node.next();
75 | }
76 | }
77 |
78 | long start = System.currentTimeMillis();
79 |
80 | for (int i=0; i<1000; ++i) {
81 | lattice.clear();
82 | lattice.setSentence(TEXTS[i % TEXTS.length]);
83 | tagger.parse(lattice);
84 | Node node = lattice.bosNode().next();
85 |
86 | while (node != null && node.stat() != Node.TYPE_EOS_NODE) {
87 | node.leadingSpaceAndSurface(leadingSpaceAndSurface);
88 | node.feature();
89 | node = node.next();
90 | }
91 | }
92 |
93 | long end = System.currentTimeMillis();
94 |
95 | System.out.println("Total: " + Long.toString(end-start) + " millis.");
96 |
97 | lattice.destroy();
98 | tagger.destroy();
99 | } catch (Exception e) {
100 | fail(e.toString());
101 | }
102 | }
103 | }
104 |
--------------------------------------------------------------------------------
/src/test/java/net/moraleboost/tinysegmenter/TinySegmenterTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | **
3 | ** Mar. 24, 2009
4 | **
5 | ** The author disclaims copyright to this source code.
6 | ** In place of a legal notice, here is a blessing:
7 | **
8 | ** May you do good and not evil.
9 | ** May you find forgiveness for yourself and forgive others.
10 | ** May you share freely, never taking more than you give.
11 | **
12 | ** Stolen from SQLite :-)
13 | ** Any feedback is welcome.
14 | ** Kohei TAKETA
15 | **
16 | */
17 | package net.moraleboost.tinysegmenter;
18 |
19 | import net.moraleboost.io.BasicCodePointReader;
20 | import org.junit.Test;
21 |
22 | import java.io.StringReader;
23 |
24 | import static org.junit.Assert.assertEquals;
25 | import static org.junit.Assert.assertNull;
26 |
27 | public class TinySegmenterTest
28 | {
29 | @Test
30 | public void testTokenize()
31 | throws Exception
32 | {
33 | String str = "本日は晴天なり。";
34 | StringReader reader = new StringReader(str);
35 | BasicCodePointReader cpreader = new BasicCodePointReader(reader);
36 |
37 | TinySegmenter segmenter = new TinySegmenter(cpreader);
38 |
39 | String[] terms = {
40 | "本日",
41 | "は",
42 | "晴天",
43 | "なり",
44 | "。"
45 | };
46 |
47 | int[][] offsets = {
48 | {0, 2},
49 | {2, 3},
50 | {3, 5},
51 | {5, 7},
52 | {7, 8}
53 | };
54 |
55 | TinySegmenter.Token token;
56 | int i = 0;
57 | while ((token = segmenter.next()) != null) {
58 | assertEquals(terms[i], token.str);
59 | assertEquals(offsets[i][0], token.start);
60 | assertEquals(offsets[i][1], token.end);
61 | ++i;
62 | }
63 |
64 | assertEquals(terms.length, i);
65 | }
66 |
67 | @Test
68 | public void testMinimalBufferSize()
69 | throws Exception
70 | {
71 | String str =
72 | "メロスは激怒した。" +
73 | "必ず、かの邪智暴虐の王を除かなければならぬと決意した。" +
74 | "メロスには政治がわからぬ。" +
75 | "メロスは、村の牧人である。" +
76 | "笛を吹き、羊と遊んで暮して来た。" +
77 | "けれども邪悪に対しては、人一倍に敏感であった。";
78 | StringReader reader = new StringReader(str);
79 | StringReader reader2 = new StringReader(str);
80 | BasicCodePointReader cpreader = new BasicCodePointReader(reader);
81 | BasicCodePointReader cpreader2 = new BasicCodePointReader(reader2);
82 |
83 | TinySegmenter segmenter =
84 | new TinySegmenter(cpreader, 7, TinySegmenter.DEFAULT_MAX_TOKEN_SIZE);
85 | TinySegmenter segmenter2 =
86 | new TinySegmenter(cpreader2, 1024, TinySegmenter.DEFAULT_MAX_TOKEN_SIZE);
87 |
88 | TinySegmenter.Token token;
89 | TinySegmenter.Token token2;
90 | while (true) {
91 | token = segmenter.next();
92 | token2 = segmenter2.next();
93 |
94 | assertEquals(token2, token);
95 |
96 | if (token == null || token2 == null) {
97 | break;
98 | }
99 | }
100 |
101 | assertNull(token);
102 | assertNull(token2);
103 | }
104 |
105 | @Test
106 | public void testEmptyStream()
107 | throws Exception
108 | {
109 | String str = "";
110 | StringReader reader = new StringReader(str);
111 | BasicCodePointReader cpreader = new BasicCodePointReader(reader);
112 |
113 | TinySegmenter segmenter = new TinySegmenter(cpreader);
114 |
115 | assertNull(segmenter.next());
116 | }
117 |
118 | @Test
119 | public void testMaxTokenSize()
120 | throws Exception
121 | {
122 | String str = "一日作さざれば、一日食わず。";
123 | StringReader reader = new StringReader(str);
124 | BasicCodePointReader cpreader = new BasicCodePointReader(reader);
125 |
126 | TinySegmenter segmenter = new TinySegmenter(cpreader, 1024, 2);
127 |
128 | String[] terms = {
129 | "一日",
130 | "作",
131 | "さざ",
132 | "れ",
133 | "ば",
134 | "、",
135 | "一",
136 | "日",
137 | "食わ",
138 | "ず",
139 | "。"
140 | };
141 |
142 | int[][] offsets = {
143 | {0, 2},
144 | {2, 3},
145 | {3, 5},
146 | {5, 6},
147 | {6, 7},
148 | {7, 8},
149 | {8, 9},
150 | {9, 10},
151 | {10, 12},
152 | {12, 13},
153 | {13, 14}
154 | };
155 |
156 | TinySegmenter.Token token;
157 | int i = 0;
158 | while ((token = segmenter.next()) != null) {
159 | assertEquals(terms[i], token.str);
160 | assertEquals(offsets[i][0], token.start);
161 | assertEquals(offsets[i][1], token.end);
162 | ++i;
163 | }
164 |
165 | assertEquals(terms.length, i);
166 | }
167 | }
168 |
--------------------------------------------------------------------------------
/src/test/java/net/moraleboost/util/CSVUtilTest.java:
--------------------------------------------------------------------------------
1 | package net.moraleboost.util;
2 |
3 | import org.junit.Test;
4 |
5 | import static org.junit.Assert.assertArrayEquals;
6 | import static org.junit.Assert.assertEquals;
7 |
8 | public class CSVUtilTest
9 | {
10 | @Test
11 | public void testEscape()
12 | {
13 | for (int i=0; i<65536; ++i) {
14 | String str = Character.toString((char)i);
15 | if (str.equals("\"")) {
16 | assertEquals("\"\"\"\"", CSVUtil.escape(str));
17 | } else if (str.equals(" ")) {
18 | assertEquals("\" \"", CSVUtil.escape(str));
19 | } else if (str.equals("\t")) {
20 | assertEquals("\"\t\"", CSVUtil.escape(str));
21 | } else if (str.equals(",")) {
22 | assertEquals("\",\"", CSVUtil.escape(str));
23 | } else {
24 | assertEquals(str, CSVUtil.escape(str));
25 | }
26 | }
27 |
28 | assertEquals("abc", CSVUtil.escape("abc"));
29 | assertEquals("あいうえお", CSVUtil.escape("あいうえお"));
30 | assertEquals("\" This \"\"is a pen., what?\t\"", CSVUtil.escape(" This \"is a pen., what?\t"));
31 | }
32 |
33 | @Test
34 | public void testTokenize()
35 | {
36 | // 空文字列は、サイズ0の配列になる
37 | String str = "";
38 | String[] answer = new String[] {};
39 | String[] tokens = CSVUtil.tokenize(str, 100);
40 | assertArrayEquals(answer, tokens);
41 |
42 | // 「""」は、空文字列一つからなる配列になる
43 | str = "\"\"";
44 | answer = new String[] {""};
45 | tokens = CSVUtil.tokenize(str, 100);
46 | assertArrayEquals(answer, tokens);
47 |
48 | // ","は、空文字列2つからなる配列になる
49 | str = ",";
50 | answer = new String[] {"", ""};
51 | tokens = CSVUtil.tokenize(str, 100);
52 | assertArrayEquals(answer, tokens);
53 |
54 | // 非クォート文字列(1列)
55 | str = "abc";
56 | answer = new String[] { "abc" };
57 | tokens = CSVUtil.tokenize(str, 100);
58 | assertArrayEquals(answer, tokens);
59 |
60 | // クォート文字列(1列)
61 | str = " \t\"ab\"\"c\" ";
62 | answer = new String[] {"ab\"c"};
63 | tokens = CSVUtil.tokenize(str, 100);
64 | assertArrayEquals(answer, tokens);
65 |
66 | // 非クォート文字列(複数列)
67 | str = "a,b , c";
68 | answer = new String[] {"a", "b ", "c"};
69 | tokens = CSVUtil.tokenize(str, 100);
70 | assertArrayEquals(answer, tokens);
71 |
72 | // クォート文字列(複数列)
73 | str = "\"a\" , \" bc\"\"d \", \"efg";
74 | answer = new String[] {"a", " bc\"d ", "efg"};
75 | tokens = CSVUtil.tokenize(str, 100);
76 | assertArrayEquals(answer, tokens);
77 |
78 | // 混在
79 | str = "a, \"bcd\" , efg ";
80 | answer = new String[] {"a", "bcd", "efg "};
81 | tokens = CSVUtil.tokenize(str, 100);
82 | assertArrayEquals(answer, tokens);
83 |
84 | // max指定
85 | str = "a , ";
86 | answer = new String[] {"a , "};
87 | tokens = CSVUtil.tokenize(str, 1);
88 | assertArrayEquals(answer, tokens);
89 |
90 | str = "a ,";
91 | answer = new String[] {"a ,"};
92 | tokens = CSVUtil.tokenize(str, 1);
93 | assertArrayEquals(answer, tokens);
94 |
95 | str = "a , \"bcd\"efg";
96 | answer = new String[] {"a , \"bcd\"efg"};
97 | tokens = CSVUtil.tokenize(str, 1);
98 | assertArrayEquals(answer, tokens);
99 |
100 | str = "a , bcd, \tefg,def";
101 | answer = new String[] {"a ", "bcd, \tefg,def"};
102 | tokens = CSVUtil.tokenize(str, 2);
103 | assertArrayEquals(answer, tokens);
104 | }
105 | }
106 |
--------------------------------------------------------------------------------