├── .gitignore ├── README.txt ├── build.gradle ├── cmecab-java.iml ├── cmecab-java.ipr ├── etc └── TinySegmenter │ ├── exporter.js │ └── tiny_segmenter-0.1.js ├── lib ├── js.jar └── license │ ├── LICENCE-TinySegmenter.txt │ ├── LICENSE-APACHE.txt │ ├── LICENSE-BridJ.txt │ └── cpl1.0.txt └── src ├── main └── java │ └── net │ └── moraleboost │ ├── io │ ├── BasicCodePointReader.java │ ├── CharsetUtil.java │ ├── CodePointReader.java │ └── PushbackCodePointReader.java │ ├── mecab │ ├── DictionaryInfo.java │ ├── Lattice.java │ ├── Model.java │ ├── Node.java │ ├── Path.java │ ├── Tagger.java │ └── impl │ │ ├── StandardDictionaryInfo.java │ │ ├── StandardLattice.java │ │ ├── StandardModel.java │ │ ├── StandardNode.java │ │ ├── StandardPath.java │ │ └── StandardTagger.java │ ├── tinysegmenter │ ├── ModelExporter.java │ ├── TinySegmenter.java │ └── TinySegmenterConstants.java │ └── util │ └── CSVUtil.java └── test └── java └── net └── moraleboost ├── io ├── BasicCodePointReaderTest.java └── PushbackCodePointReaderTest.java ├── mecab └── impl │ └── StandardTaggerTest.java ├── tinysegmenter └── TinySegmenterTest.java └── util └── CSVUtilTest.java /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.jar 3 | *.iws 4 | build/ 5 | .gradle/ 6 | -------------------------------------------------------------------------------- /README.txt: -------------------------------------------------------------------------------- 1 | cmecab-java 2 | 3 | 1. これは何? 4 | 5 | 日本語形態素解析エンジンMeCab (http://mecab.sourceforge.net/) の 6 | Javaバインディングです。SWIGを用いず、直接MeCabのライブラリを 7 | 呼び出しています。 8 | 9 | おまけとして、以下のものを含んでいます。 10 | * TinySegmenter (http://chasen.org/~taku/software/TinySegmenter/) の 11 | Java移植版、及びそのTokenizer、TokenizerFactory 12 | 13 | 14 | 2. 
パッケージ構成 15 | 16 | 配布パッケージは、以下のディレクトリ構造を持ちます。 17 | 18 | bin - ビルドされたJavaライブラリが書き出されるディレクトリ 19 | lib - ビルドおよびテストに必要なサードパーティライブラリが 20 | 格納されたディレクトリ 21 | src - Pure Javaライブラリのソースコードが格納されたディレクトリ 22 | test - テスト用データが格納されたディレクトリ 23 | etc - その他もろもろ(TinySegmenterのオリジナルソース等) 24 | 25 | 26 | 3. インストール方法 27 | 28 | 配布パッケージのルートディレクトリで、gradle buildを実行してください。 29 | 30 | % gradle build 31 | 32 | ビルドが終了すると、build/libsディレクトリに、cmecab-java-(バージョン番号).jar 33 | という名前のJARファイルが作成されます。これをお好きな場所にコピーして、 34 | Javaのクラスパスを通してください。 35 | 36 | 実行には、別途BridJ (https://github.com/nativelibs4java/BridJ) のjarを 37 | 入手し、クラスパスを通す必要があります。 38 | また、事前に、MeCabのライブラリ(libmecab.dll, libmecab.soなど)に、 39 | OSのパスを通しておく必要があります。 40 | 41 | 42 | 4. 利用方法 43 | http://code.google.com/p/cmecab-java/wiki/HowToUse 44 | をご覧ください。 45 | 46 | 47 | 5. ライセンス 48 | 49 | cmecab-java本体はパブリックドメインとします。 50 | 51 | ただし、TinySegmenter.java、TinySegmenterConstants.javaについては、 52 | TinySegmenter (http://chasen.org/~taku/software/TinySegmenter/)の 53 | 二次的著作物であるため、オリジナルと同じく修正BSDライセンスに 54 | 従います。 55 | 56 | なお、ビルドおよびテストのため、lib, etcディレクトリに以下のソフトウェアを 57 | 同梱しています。これらのソフトウェアは、それぞれのライセンスに従います。 58 | 59 | * TinySegmenter 60 | * 修正BSDライセンス 61 | * lib/license/LICENCE-TinySegmenter.txtをご覧ください 62 | 63 | 6. 
連絡先 64 | 65 | MeCab、TinySegmenterに関するご質問は、それぞれのソフトウェアの 66 | メーリングリスト等へどうぞ。 67 | 68 | cmecab-java自体に関するご質問等は、武田光平 k-tak@void.in までどうぞ。 69 | -------------------------------------------------------------------------------- /build.gradle: -------------------------------------------------------------------------------- 1 | /* Gradle build for the cmecab-java library: applies the java, idea and maven plugins. */ apply plugin: 'java' 2 | apply plugin: 'idea' 3 | apply plugin: 'maven' 4 | 5 | /* Coordinates used for the produced archive. */ group = 'net.moraleboost.cmecab-java' 6 | archivesBaseName = 'cmecab-java' 7 | version = '2.1.0' 8 | 9 | /* Source and target levels are both pinned to 1.6. */ sourceCompatibility = 1.6 10 | targetCompatibility = 1.6 11 | 12 | /* All sources and generated Javadoc are treated as UTF-8. */ compileJava.options.encoding = 'UTF-8' 13 | compileTestJava.options.encoding = 'UTF-8' 14 | javadoc.options.encoding = 'UTF-8' 15 | javadoc.options.charSet = 'UTF-8' 16 | 17 | /* Extra configuration that holds the Wagon jar handed to mavenDeployer in uploadArchives. */ configurations { 18 | deployerJars 19 | } 20 | 21 | /* NOTE(review): both custom repositories are addressed over plain http:// - prefer HTTPS endpoints if these hosts offer them; confirm availability before changing. */ repositories { 22 | maven { 23 | url 'http://maven.restlet.org/' 24 | } 25 | maven { 26 | url 'http://jcenter.bintray.com/' 27 | } 28 | mavenCentral() 29 | } 30 | 31 | dependencies { 32 | compile('com.nativelibs4java:bridj:0.7.0') 33 | /* Adds every jar located directly under lib/ to the compile classpath. */ compile fileTree(dir: 'lib', include: '*.jar') 34 | 35 | testCompile('junit:junit:4.12') 36 | 37 | deployerJars('org.apache.maven.wagon:wagon-http:2.2') 38 | } 39 | 40 | /* Publishing: the repository URL, user name and password are read from the JVM system properties cmecab_java.repositoryUrl, cmecab_java.repositoryUser and cmecab_java.repositoryPassword. */ uploadArchives { 41 | repositories.mavenDeployer { 42 | configuration = configurations.deployerJars 43 | repository(url: System.properties['cmecab_java.repositoryUrl']) { 44 | authentication( 45 | userName: System.properties['cmecab_java.repositoryUser'], 46 | password: System.properties['cmecab_java.repositoryPassword']) 47 | } 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /cmecab-java.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /cmecab-java.ipr: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | Android Lint 212 | 213 | 214 | Internationalization issues 215 | 216 | 217 | Internationalization issuesJava 218 | 219 | 220 | Java 221 | 222 | 223 | Python 224 | 225 | 226 | WebSocket issues 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | -------------------------------------------------------------------------------- /etc/TinySegmenter/exporter.js: -------------------------------------------------------------------------------- 1 | /* 2 | ** 3 | ** Mar. 
24, 2009 4 | ** 5 | ** The author disclaims copyright to this source code. 6 | ** In place of a legal notice, here is a blessing: 7 | ** 8 | ** May you do good and not evil. 9 | ** May you find forgiveness for yourself and forgive others. 10 | ** May you share freely, never taking more than you give. 11 | ** 12 | ** Stolen from SQLite :-) 13 | ** Any feedback is welcome. 14 | ** Kohei TAKETA 15 | ** 16 | */ 17 | /* Shared TinySegmenter instance whose model tables the helpers below read; TinySegmenter itself is defined in tiny_segmenter-0.1.js and must be loaded first. */ var segmenter = new TinySegmenter(); 18 | /* Base score of the model, exposed as a plain global. */ var BIAS = segmenter.BIAS__; 19 | 20 | /* getKeys(name): returns the property names of segmenter[name + "__"], in for-in order, as one string of the form {"k1","k2",...}. Each key is wrapped in double quotes verbatim - nothing is escaped. An empty or missing table yields {}. NOTE(review): the brace syntax is presumably meant for pasting into Java array initialisers (see ModelExporter.java) - confirm. */ function getKeys(name) { 21 | var ret = ""; 22 | var obj = segmenter[name + "__"]; 23 | 24 | for (var k in obj) { 25 | /* Separator goes in front of every entry except the first. */ if (ret != "") { 26 | ret += ","; 27 | } 28 | ret += ("\"" + k + "\""); 29 | } 30 | 31 | return "{" + ret + "}"; 32 | } 33 | 34 | /* getValues(name): returns the property values of segmenter[name + "__"], in the same for-in order getKeys uses, as one string of the form {v1,v2,...}. Values are appended through string concatenation without quoting. */ function getValues(name) { 35 | var ret = ""; 36 | var obj = segmenter[name + "__"]; 37 | 38 | for (var k in obj) { 39 | if (ret != "") { 40 | ret += ","; 41 | } 42 | ret += obj[k]; 43 | } 44 | 45 | return "{" + ret + "}"; 46 | } 47 | -------------------------------------------------------------------------------- /etc/TinySegmenter/tiny_segmenter-0.1.js: -------------------------------------------------------------------------------- 1 | // TinySegmenter 0.1 -- Super compact Japanese tokenizer in Javascript 2 | // (c) 2008 Taku Kudo 3 | // TinySegmenter is freely distributable under the terms of a new BSD licence. 
4 | // For details, see http://chasen.org/~taku/software/TinySegmenter/LICENCE.txt 5 | 6 | function TinySegmenter() { 7 | var patterns = { 8 | "[一二三四五六七八九十百千万億兆]":"M", 9 | "[一-龠々〆ヵヶ]":"H", 10 | "[ぁ-ん]":"I", 11 | "[ァ-ヴーア-ン゙ー]":"K", 12 | "[a-zA-Za-zA-Z]":"A", 13 | "[0-90-9]":"N" 14 | } 15 | this.chartype_ = []; 16 | for (var i in patterns) { 17 | var regexp = new RegExp; 18 | regexp.compile(i) 19 | this.chartype_.push([regexp, patterns[i]]); 20 | } 21 | 22 | this.BIAS__ = -332 23 | this.BC1__ = {"HH":6,"II":2461,"KH":406,"OH":-1378}; 24 | this.BC2__ = {"AA":-3267,"AI":2744,"AN":-878,"HH":-4070,"HM":-1711,"HN":4012,"HO":3761,"IA":1327,"IH":-1184,"II":-1332,"IK":1721,"IO":5492,"KI":3831,"KK":-8741,"MH":-3132,"MK":3334,"OO":-2920}; 25 | this.BC3__ = {"HH":996,"HI":626,"HK":-721,"HN":-1307,"HO":-836,"IH":-301,"KK":2762,"MK":1079,"MM":4034,"OA":-1652,"OH":266}; 26 | this.BP1__ = {"BB":295,"OB":304,"OO":-125,"UB":352}; 27 | this.BP2__ = {"BO":60,"OO":-1762}; 28 | this.BQ1__ = {"BHH":1150,"BHM":1521,"BII":-1158,"BIM":886,"BMH":1208,"BNH":449,"BOH":-91,"BOO":-2597,"OHI":451,"OIH":-296,"OKA":1851,"OKH":-1020,"OKK":904,"OOO":2965}; 29 | this.BQ2__ = {"BHH":118,"BHI":-1159,"BHM":466,"BIH":-919,"BKK":-1720,"BKO":864,"OHH":-1139,"OHM":-181,"OIH":153,"UHI":-1146}; 30 | this.BQ3__ = {"BHH":-792,"BHI":2664,"BII":-299,"BKI":419,"BMH":937,"BMM":8335,"BNN":998,"BOH":775,"OHH":2174,"OHM":439,"OII":280,"OKH":1798,"OKI":-793,"OKO":-2242,"OMH":-2402,"OOO":11699}; 31 | this.BQ4__ = {"BHH":-3895,"BIH":3761,"BII":-4654,"BIK":1348,"BKK":-1806,"BMI":-3385,"BOO":-12396,"OAH":926,"OHH":266,"OHK":-2036,"ONN":-973}; 32 | this.BW1__ = 
{",と":660,",同":727,"B1あ":1404,"B1同":542,"、と":660,"、同":727,"」と":1682,"あっ":1505,"いう":1743,"いっ":-2055,"いる":672,"うし":-4817,"うん":665,"から":3472,"がら":600,"こう":-790,"こと":2083,"こん":-1262,"さら":-4143,"さん":4573,"した":2641,"して":1104,"すで":-3399,"そこ":1977,"それ":-871,"たち":1122,"ため":601,"った":3463,"つい":-802,"てい":805,"てき":1249,"でき":1127,"です":3445,"では":844,"とい":-4915,"とみ":1922,"どこ":3887,"ない":5713,"なっ":3015,"など":7379,"なん":-1113,"にし":2468,"には":1498,"にも":1671,"に対":-912,"の一":-501,"の中":741,"ませ":2448,"まで":1711,"まま":2600,"まる":-2155,"やむ":-1947,"よっ":-2565,"れた":2369,"れで":-913,"をし":1860,"を見":731,"亡く":-1886,"京都":2558,"取り":-2784,"大き":-2604,"大阪":1497,"平方":-2314,"引き":-1336,"日本":-195,"本当":-2423,"毎日":-2113,"目指":-724,"B1あ":1404,"B1同":542,"」と":1682}; 33 | this.BW2__ = {"..":-11822,"11":-669,"――":-5730,"−−":-13175,"いう":-1609,"うか":2490,"かし":-1350,"かも":-602,"から":-7194,"かれ":4612,"がい":853,"がら":-3198,"きた":1941,"くな":-1597,"こと":-8392,"この":-4193,"させ":4533,"され":13168,"さん":-3977,"しい":-1819,"しか":-545,"した":5078,"して":972,"しな":939,"その":-3744,"たい":-1253,"たた":-662,"ただ":-3857,"たち":-786,"たと":1224,"たは":-939,"った":4589,"って":1647,"っと":-2094,"てい":6144,"てき":3640,"てく":2551,"ては":-3110,"ても":-3065,"でい":2666,"でき":-1528,"でし":-3828,"です":-4761,"でも":-4203,"とい":1890,"とこ":-1746,"とと":-2279,"との":720,"とみ":5168,"とも":-3941,"ない":-2488,"なが":-1313,"など":-6509,"なの":2614,"なん":3099,"にお":-1615,"にし":2748,"にな":2454,"によ":-7236,"に対":-14943,"に従":-4688,"に関":-11388,"のか":2093,"ので":-7059,"のに":-6041,"のの":-6125,"はい":1073,"はが":-1033,"はず":-2532,"ばれ":1813,"まし":-1316,"まで":-6621,"まれ":5409,"めて":-3153,"もい":2230,"もの":-10713,"らか":-944,"らし":-1611,"らに":-1897,"りし":651,"りま":1620,"れた":4270,"れて":849,"れば":4114,"ろう":6067,"われ":7901,"を通":-11877,"んだ":728,"んな":-4115,"一人":602,"一方":-1375,"一日":970,"一部":-1051,"上が":-4479,"会社":-1116,"出て":2163,"分の":-7758,"同党":970,"同日":-913,"大阪":-2471,"委員":-1250,"少な":-1050,"年度":-8669,"年間":-1626,"府県":-2363,"手権":-1982,"新聞":-4066,"日新":-722,"日本":-7068,"日米":3372,"曜日":-601,"朝鮮":-2355,"本人":-2697,"東京":-1543,"然と":-1384,"社会":-1276,"立て":-990,"第に":-1612,"米国":-4268,"11":-6
69}; 34 | this.BW3__ = {"あた":-2194,"あり":719,"ある":3846,"い.":-1185,"い。":-1185,"いい":5308,"いえ":2079,"いく":3029,"いた":2056,"いっ":1883,"いる":5600,"いわ":1527,"うち":1117,"うと":4798,"えと":1454,"か.":2857,"か。":2857,"かけ":-743,"かっ":-4098,"かに":-669,"から":6520,"かり":-2670,"が,":1816,"が、":1816,"がき":-4855,"がけ":-1127,"がっ":-913,"がら":-4977,"がり":-2064,"きた":1645,"けど":1374,"こと":7397,"この":1542,"ころ":-2757,"さい":-714,"さを":976,"し,":1557,"し、":1557,"しい":-3714,"した":3562,"して":1449,"しな":2608,"しま":1200,"す.":-1310,"す。":-1310,"する":6521,"ず,":3426,"ず、":3426,"ずに":841,"そう":428,"た.":8875,"た。":8875,"たい":-594,"たの":812,"たり":-1183,"たる":-853,"だ.":4098,"だ。":4098,"だっ":1004,"った":-4748,"って":300,"てい":6240,"てお":855,"ても":302,"です":1437,"でに":-1482,"では":2295,"とう":-1387,"とし":2266,"との":541,"とも":-3543,"どう":4664,"ない":1796,"なく":-903,"など":2135,"に,":-1021,"に、":-1021,"にし":1771,"にな":1906,"には":2644,"の,":-724,"の、":-724,"の子":-1000,"は,":1337,"は、":1337,"べき":2181,"まし":1113,"ます":6943,"まっ":-1549,"まで":6154,"まれ":-793,"らし":1479,"られ":6820,"るる":3818,"れ,":854,"れ、":854,"れた":1850,"れて":1375,"れば":-3246,"れる":1091,"われ":-605,"んだ":606,"んで":798,"カ月":990,"会議":860,"入り":1232,"大会":2217,"始め":1681,"市":965,"新聞":-5055,"日,":974,"日、":974,"社会":2024,"カ月":990}; 35 | this.TC1__ = {"AAA":1093,"HHH":1029,"HHM":580,"HII":998,"HOH":-390,"HOM":-331,"IHI":1169,"IOH":-142,"IOI":-1015,"IOM":467,"MMH":187,"OOI":-1832}; 36 | this.TC2__ = {"HHO":2088,"HII":-1023,"HMM":-1154,"IHI":-1965,"KKH":703,"OII":-2649}; 37 | this.TC3__ = {"AAA":-294,"HHH":346,"HHI":-341,"HII":-1088,"HIK":731,"HOH":-1486,"IHH":128,"IHI":-3041,"IHO":-1935,"IIH":-825,"IIM":-1035,"IOI":-542,"KHH":-1216,"KKA":491,"KKH":-1217,"KOK":-1009,"MHH":-2694,"MHM":-457,"MHO":123,"MMH":-471,"NNH":-1689,"NNO":662,"OHO":-3393}; 38 | this.TC4__ = {"HHH":-203,"HHI":1344,"HHK":365,"HHM":-122,"HHN":182,"HHO":669,"HIH":804,"HII":679,"HOH":446,"IHH":695,"IHO":-2324,"IIH":321,"III":1497,"IIO":656,"IOO":54,"KAK":4845,"KKA":3386,"KKK":3065,"MHH":-405,"MHI":201,"MMH":-241,"MMM":661,"MOM":841}; 39 | this.TQ1__ = 
{"BHHH":-227,"BHHI":316,"BHIH":-132,"BIHH":60,"BIII":1595,"BNHH":-744,"BOHH":225,"BOOO":-908,"OAKK":482,"OHHH":281,"OHIH":249,"OIHI":200,"OIIH":-68}; 40 | this.TQ2__ = {"BIHH":-1401,"BIII":-1033,"BKAK":-543,"BOOO":-5591}; 41 | this.TQ3__ = {"BHHH":478,"BHHM":-1073,"BHIH":222,"BHII":-504,"BIIH":-116,"BIII":-105,"BMHI":-863,"BMHM":-464,"BOMH":620,"OHHH":346,"OHHI":1729,"OHII":997,"OHMH":481,"OIHH":623,"OIIH":1344,"OKAK":2792,"OKHH":587,"OKKA":679,"OOHH":110,"OOII":-685}; 42 | this.TQ4__ = {"BHHH":-721,"BHHM":-3604,"BHII":-966,"BIIH":-607,"BIII":-2181,"OAAA":-2763,"OAKK":180,"OHHH":-294,"OHHI":2446,"OHHO":480,"OHIH":-1573,"OIHH":1935,"OIHI":-493,"OIIH":626,"OIII":-4007,"OKAK":-8156}; 43 | this.TW1__ = {"につい":-4681,"東京都":2026}; 44 | this.TW2__ = {"ある程":-2049,"いった":-1256,"ころが":-2434,"しょう":3873,"その後":-4430,"だって":-1049,"ていた":1833,"として":-4657,"ともに":-4517,"もので":1882,"一気に":-792,"初めて":-1512,"同時に":-8097,"大きな":-1255,"対して":-2721,"社会党":-3216}; 45 | this.TW3__ = {"いただ":-1734,"してい":1314,"として":-4314,"につい":-5483,"にとっ":-5989,"に当た":-6247,"ので,":-727,"ので、":-727,"のもの":-600,"れから":-3752,"十二月":-2287}; 46 | this.TW4__ = {"いう.":8576,"いう。":8576,"からな":-2348,"してい":2958,"たが,":1516,"たが、":1516,"ている":1538,"という":1349,"ました":5543,"ません":1097,"ようと":-4258,"よると":5865}; 47 | this.UC1__ = {"A":484,"K":93,"M":645,"O":-505}; 48 | this.UC2__ = {"A":819,"H":1059,"I":409,"M":3987,"N":5775,"O":646}; 49 | this.UC3__ = {"A":-1370,"I":2311}; 50 | this.UC4__ = {"A":-2643,"H":1809,"I":-1032,"K":-3450,"M":3565,"N":3876,"O":6646}; 51 | this.UC5__ = {"H":313,"I":-1238,"K":-799,"M":539,"O":-831}; 52 | this.UC6__ = {"H":-506,"I":-253,"K":87,"M":247,"O":-387}; 53 | this.UP1__ = {"O":-214}; 54 | this.UP2__ = {"B":69,"O":935}; 55 | this.UP3__ = {"B":189}; 56 | this.UQ1__ = {"BH":21,"BI":-12,"BK":-99,"BN":142,"BO":-56,"OH":-95,"OI":477,"OK":410,"OO":-2422}; 57 | this.UQ2__ = {"BH":216,"BI":113,"OK":1759}; 58 | this.UQ3__ = {"BA":-479,"BH":42,"BI":1913,"BK":-7198,"BM":3160,"BN":6427,"BO":14761,"OI":-827,"ON":-3212}; 59 | 
this.UW1__ = {",":156,"、":156,"「":-463,"あ":-941,"う":-127,"が":-553,"き":121,"こ":505,"で":-201,"と":-547,"ど":-123,"に":-789,"の":-185,"は":-847,"も":-466,"や":-470,"よ":182,"ら":-292,"り":208,"れ":169,"を":-446,"ん":-137,"・":-135,"主":-402,"京":-268,"区":-912,"午":871,"国":-460,"大":561,"委":729,"市":-411,"日":-141,"理":361,"生":-408,"県":-386,"都":-718,"「":-463,"・":-135}; 60 | this.UW2__ = {",":-829,"、":-829,"〇":892,"「":-645,"」":3145,"あ":-538,"い":505,"う":134,"お":-502,"か":1454,"が":-856,"く":-412,"こ":1141,"さ":878,"ざ":540,"し":1529,"す":-675,"せ":300,"そ":-1011,"た":188,"だ":1837,"つ":-949,"て":-291,"で":-268,"と":-981,"ど":1273,"な":1063,"に":-1764,"の":130,"は":-409,"ひ":-1273,"べ":1261,"ま":600,"も":-1263,"や":-402,"よ":1639,"り":-579,"る":-694,"れ":571,"を":-2516,"ん":2095,"ア":-587,"カ":306,"キ":568,"ッ":831,"三":-758,"不":-2150,"世":-302,"中":-968,"主":-861,"事":492,"人":-123,"会":978,"保":362,"入":548,"初":-3025,"副":-1566,"北":-3414,"区":-422,"大":-1769,"天":-865,"太":-483,"子":-1519,"学":760,"実":1023,"小":-2009,"市":-813,"年":-1060,"強":1067,"手":-1519,"揺":-1033,"政":1522,"文":-1355,"新":-1682,"日":-1815,"明":-1462,"最":-630,"朝":-1843,"本":-1650,"東":-931,"果":-665,"次":-2378,"民":-180,"気":-1740,"理":752,"発":529,"目":-1584,"相":-242,"県":-1165,"立":-763,"第":810,"米":509,"自":-1353,"行":838,"西":-744,"見":-3874,"調":1010,"議":1198,"込":3041,"開":1758,"間":-1257,"「":-645,"」":3145,"ッ":831,"ア":-587,"カ":306,"キ":568}; 61 | this.UW3__ = 
{",":4889,"1":-800,"−":-1723,"、":4889,"々":-2311,"〇":5827,"」":2670,"〓":-3573,"あ":-2696,"い":1006,"う":2342,"え":1983,"お":-4864,"か":-1163,"が":3271,"く":1004,"け":388,"げ":401,"こ":-3552,"ご":-3116,"さ":-1058,"し":-395,"す":584,"せ":3685,"そ":-5228,"た":842,"ち":-521,"っ":-1444,"つ":-1081,"て":6167,"で":2318,"と":1691,"ど":-899,"な":-2788,"に":2745,"の":4056,"は":4555,"ひ":-2171,"ふ":-1798,"へ":1199,"ほ":-5516,"ま":-4384,"み":-120,"め":1205,"も":2323,"や":-788,"よ":-202,"ら":727,"り":649,"る":5905,"れ":2773,"わ":-1207,"を":6620,"ん":-518,"ア":551,"グ":1319,"ス":874,"ッ":-1350,"ト":521,"ム":1109,"ル":1591,"ロ":2201,"ン":278,"・":-3794,"一":-1619,"下":-1759,"世":-2087,"両":3815,"中":653,"主":-758,"予":-1193,"二":974,"人":2742,"今":792,"他":1889,"以":-1368,"低":811,"何":4265,"作":-361,"保":-2439,"元":4858,"党":3593,"全":1574,"公":-3030,"六":755,"共":-1880,"円":5807,"再":3095,"分":457,"初":2475,"別":1129,"前":2286,"副":4437,"力":365,"動":-949,"務":-1872,"化":1327,"北":-1038,"区":4646,"千":-2309,"午":-783,"協":-1006,"口":483,"右":1233,"各":3588,"合":-241,"同":3906,"和":-837,"員":4513,"国":642,"型":1389,"場":1219,"外":-241,"妻":2016,"学":-1356,"安":-423,"実":-1008,"家":1078,"小":-513,"少":-3102,"州":1155,"市":3197,"平":-1804,"年":2416,"広":-1030,"府":1605,"度":1452,"建":-2352,"当":-3885,"得":1905,"思":-1291,"性":1822,"戸":-488,"指":-3973,"政":-2013,"教":-1479,"数":3222,"文":-1489,"新":1764,"日":2099,"旧":5792,"昨":-661,"時":-1248,"曜":-951,"最":-937,"月":4125,"期":360,"李":3094,"村":364,"東":-805,"核":5156,"森":2438,"業":484,"氏":2613,"民":-1694,"決":-1073,"法":1868,"海":-495,"無":979,"物":461,"特":-3850,"生":-273,"用":914,"町":1215,"的":7313,"直":-1835,"省":792,"県":6293,"知":-1528,"私":4231,"税":401,"立":-960,"第":1201,"米":7767,"系":3066,"約":3663,"級":1384,"統":-4229,"総":1163,"線":1255,"者":6457,"能":725,"自":-2869,"英":785,"見":1044,"調":-562,"財":-733,"費":1777,"車":1835,"軍":1375,"込":-1504,"通":-1136,"選":-681,"郎":1026,"郡":4404,"部":1200,"金":2163,"長":421,"開":-1432,"間":1302,"関":-1282,"雨":2009,"電":-1045,"非":2066,"駅":1620,"1":-800,"」":2670,"・":-3794,"ッ":-1350,"ア":551,"グ":1319,"ス":874,"ト":521,"ム":1109,"ル":1591,"ロ":2201,"ン":278}; 62 | this.UW4__ = 
{",":3930,".":3508,"―":-4841,"、":3930,"。":3508,"〇":4999,"「":1895,"」":3798,"〓":-5156,"あ":4752,"い":-3435,"う":-640,"え":-2514,"お":2405,"か":530,"が":6006,"き":-4482,"ぎ":-3821,"く":-3788,"け":-4376,"げ":-4734,"こ":2255,"ご":1979,"さ":2864,"し":-843,"じ":-2506,"す":-731,"ず":1251,"せ":181,"そ":4091,"た":5034,"だ":5408,"ち":-3654,"っ":-5882,"つ":-1659,"て":3994,"で":7410,"と":4547,"な":5433,"に":6499,"ぬ":1853,"ね":1413,"の":7396,"は":8578,"ば":1940,"ひ":4249,"び":-4134,"ふ":1345,"へ":6665,"べ":-744,"ほ":1464,"ま":1051,"み":-2082,"む":-882,"め":-5046,"も":4169,"ゃ":-2666,"や":2795,"ょ":-1544,"よ":3351,"ら":-2922,"り":-9726,"る":-14896,"れ":-2613,"ろ":-4570,"わ":-1783,"を":13150,"ん":-2352,"カ":2145,"コ":1789,"セ":1287,"ッ":-724,"ト":-403,"メ":-1635,"ラ":-881,"リ":-541,"ル":-856,"ン":-3637,"・":-4371,"ー":-11870,"一":-2069,"中":2210,"予":782,"事":-190,"井":-1768,"人":1036,"以":544,"会":950,"体":-1286,"作":530,"側":4292,"先":601,"党":-2006,"共":-1212,"内":584,"円":788,"初":1347,"前":1623,"副":3879,"力":-302,"動":-740,"務":-2715,"化":776,"区":4517,"協":1013,"参":1555,"合":-1834,"和":-681,"員":-910,"器":-851,"回":1500,"国":-619,"園":-1200,"地":866,"場":-1410,"塁":-2094,"士":-1413,"多":1067,"大":571,"子":-4802,"学":-1397,"定":-1057,"寺":-809,"小":1910,"屋":-1328,"山":-1500,"島":-2056,"川":-2667,"市":2771,"年":374,"庁":-4556,"後":456,"性":553,"感":916,"所":-1566,"支":856,"改":787,"政":2182,"教":704,"文":522,"方":-856,"日":1798,"時":1829,"最":845,"月":-9066,"木":-485,"来":-442,"校":-360,"業":-1043,"氏":5388,"民":-2716,"気":-910,"沢":-939,"済":-543,"物":-735,"率":672,"球":-1267,"生":-1286,"産":-1101,"田":-2900,"町":1826,"的":2586,"目":922,"省":-3485,"県":2997,"空":-867,"立":-2112,"第":788,"米":2937,"系":786,"約":2171,"経":1146,"統":-1169,"総":940,"線":-994,"署":749,"者":2145,"能":-730,"般":-852,"行":-792,"規":792,"警":-1184,"議":-244,"谷":-1000,"賞":730,"車":-1481,"軍":1158,"輪":-1433,"込":-3370,"近":929,"道":-1291,"選":2596,"郎":-4866,"都":1192,"野":-1100,"銀":-2213,"長":357,"間":-2344,"院":-2297,"際":-2604,"電":-878,"領":-1659,"題":-792,"館":-1984,"首":1749,"高":2120,"「":1895,"」":3798,"・":-4371,"ッ":-724,"ー":-11870,"カ":2145,"コ":1789,"セ":1287,"ト":-403,"メ":-1635,"ラ":-8
81,"リ":-541,"ル":-856,"ン":-3637}; 63 | this.UW5__ = {",":465,".":-299,"1":-514,"E2":-32768,"]":-2762,"、":465,"。":-299,"「":363,"あ":1655,"い":331,"う":-503,"え":1199,"お":527,"か":647,"が":-421,"き":1624,"ぎ":1971,"く":312,"げ":-983,"さ":-1537,"し":-1371,"す":-852,"だ":-1186,"ち":1093,"っ":52,"つ":921,"て":-18,"で":-850,"と":-127,"ど":1682,"な":-787,"に":-1224,"の":-635,"は":-578,"べ":1001,"み":502,"め":865,"ゃ":3350,"ょ":854,"り":-208,"る":429,"れ":504,"わ":419,"を":-1264,"ん":327,"イ":241,"ル":451,"ン":-343,"中":-871,"京":722,"会":-1153,"党":-654,"務":3519,"区":-901,"告":848,"員":2104,"大":-1296,"学":-548,"定":1785,"嵐":-1304,"市":-2991,"席":921,"年":1763,"思":872,"所":-814,"挙":1618,"新":-1682,"日":218,"月":-4353,"査":932,"格":1356,"機":-1508,"氏":-1347,"田":240,"町":-3912,"的":-3149,"相":1319,"省":-1052,"県":-4003,"研":-997,"社":-278,"空":-813,"統":1955,"者":-2233,"表":663,"語":-1073,"議":1219,"選":-1018,"郎":-368,"長":786,"間":1191,"題":2368,"館":-689,"1":-514,"E2":-32768,"「":363,"イ":241,"ル":451,"ン":-343}; 64 | this.UW6__ = {",":227,".":808,"1":-270,"E1":306,"、":227,"。":808,"あ":-307,"う":189,"か":241,"が":-73,"く":-121,"こ":-200,"じ":1782,"す":383,"た":-428,"っ":573,"て":-1014,"で":101,"と":-105,"な":-253,"に":-149,"の":-417,"は":-236,"も":-206,"り":187,"る":-135,"を":195,"ル":-673,"ン":-496,"一":-277,"中":201,"件":-800,"会":624,"前":302,"区":1792,"員":-1212,"委":798,"学":-960,"市":887,"広":-695,"後":535,"業":-697,"相":753,"社":-507,"福":974,"空":-822,"者":1811,"連":463,"郎":1082,"1":-270,"E1":306,"ル":-673,"ン":-496}; 65 | 66 | return this; 67 | } 68 | 69 | /* ctype_(str): returns the type code of the first chartype_ entry whose regexp matches str, or "O" when no entry matches. NOTE(review): for-in over an array also visits any enumerable property added to Array.prototype; an indexed loop would be more robust. */ TinySegmenter.prototype.ctype_ = function(str) { 70 | for (var i in this.chartype_) { 71 | if (str.match(this.chartype_[i][0])) { 72 | return this.chartype_[i][1]; 73 | } 74 | } 75 | return "O"; 76 | } 77 | 78 | /* ts_(v): normalises a table lookup - returns v when it is truthy and 0 otherwise, so a missing key (undefined) contributes nothing to the score. */ TinySegmenter.prototype.ts_ = function(v) { 79 | if (v) { return v; } 80 | return 0; 81 | } 82 | 83 | /* segment(input): splits input into an array of word strings. Returns an empty array for null, undefined or the empty string. */ TinySegmenter.prototype.segment = function(input) { 84 | if (input == null || input == undefined || input == "") { 85 | return []; 86 | } 87 | var result = []; 88 | /* seg holds the characters and ctype the parallel type codes; both start with three padding entries so the scoring window can look back from the first characters. */ var seg = ["B3","B2","B1"]; 89 | var ctype = ["O","O","O"]; 
90 | /* split("") yields UTF-16 code units, so a character outside the BMP arrives as two separate entries. */ var o = input.split(""); 91 | /* i is function-scoped here through the var i declared by the second loop (hoisting), not an implicit global. */ for (i = 0; i < o.length; ++i) { 92 | seg.push(o[i]); 93 | ctype.push(this.ctype_(o[i])) 94 | } 95 | /* Three end entries pad seg and ctype so that the look-ahead at i+1 and i+2 stays inside the arrays. */ seg.push("E1"); 96 | seg.push("E2"); 97 | seg.push("E3"); 98 | ctype.push("O"); 99 | ctype.push("O"); 100 | ctype.push("O"); 101 | /* seg[3] is the first input character and always opens the first word. p1..p3 hold the three most recent boundary decisions and start as "U". */ var word = seg[3]; 102 | var p1 = "U"; 103 | var p2 = "U"; 104 | var p3 = "U"; 105 | /* From the second character on, decide whether a word boundary falls between seg[i-1] and seg[i]; the bound seg.length - 3 stops before the end padding. */ for (var i = 4; i < seg.length - 3; ++i) { 106 | var score = this.BIAS__; 107 | /* Window: w1..w3 / c1..c3 are the three entries before the candidate boundary, w4..w6 / c4..c6 the three after it. */ var w1 = seg[i-3]; 108 | var w2 = seg[i-2]; 109 | var w3 = seg[i-1]; 110 | var w4 = seg[i]; 111 | var w5 = seg[i+1]; 112 | var w6 = seg[i+2]; 113 | var c1 = ctype[i-3]; 114 | var c2 = ctype[i-2]; 115 | var c3 = ctype[i-1]; 116 | var c4 = ctype[i]; 117 | var c5 = ctype[i+1]; 118 | var c6 = ctype[i+2]; 119 | /* Add the weight of every feature found in its table; ts_ turns a missing entry into 0. */ score += this.ts_(this.UP1__[p1]); 120 | score += this.ts_(this.UP2__[p2]); 121 | score += this.ts_(this.UP3__[p3]); 122 | score += this.ts_(this.BP1__[p1 + p2]); 123 | score += this.ts_(this.BP2__[p2 + p3]); 124 | score += this.ts_(this.UW1__[w1]); 125 | score += this.ts_(this.UW2__[w2]); 126 | score += this.ts_(this.UW3__[w3]); 127 | score += this.ts_(this.UW4__[w4]); 128 | score += this.ts_(this.UW5__[w5]); 129 | score += this.ts_(this.UW6__[w6]); 130 | score += this.ts_(this.BW1__[w2 + w3]); 131 | score += this.ts_(this.BW2__[w3 + w4]); 132 | score += this.ts_(this.BW3__[w4 + w5]); 133 | score += this.ts_(this.TW1__[w1 + w2 + w3]); 134 | score += this.ts_(this.TW2__[w2 + w3 + w4]); 135 | score += this.ts_(this.TW3__[w3 + w4 + w5]); 136 | score += this.ts_(this.TW4__[w4 + w5 + w6]); 137 | score += this.ts_(this.UC1__[c1]); 138 | score += this.ts_(this.UC2__[c2]); 139 | score += this.ts_(this.UC3__[c3]); 140 | score += this.ts_(this.UC4__[c4]); 141 | score += this.ts_(this.UC5__[c5]); 142 | score += this.ts_(this.UC6__[c6]); 143 | score += this.ts_(this.BC1__[c2 + c3]); 144 | score += this.ts_(this.BC2__[c3 + c4]); 145 | score += this.ts_(this.BC3__[c4 + c5]); 146 | score += this.ts_(this.TC1__[c1 + c2 + c3]); 147 | score += 
this.ts_(this.TC2__[c2 + c3 + c4]); 148 | score += this.ts_(this.TC3__[c3 + c4 + c5]); 149 | score += this.ts_(this.TC4__[c4 + c5 + c6]); 150 | // score += this.ts_(this.TC5__[c4 + c5 + c6]); 151 | score += this.ts_(this.UQ1__[p1 + c1]); 152 | score += this.ts_(this.UQ2__[p2 + c2]); 153 | /* NOTE(review): this looks up UQ1__ for the (p3, c3) feature, while the UQ3__ table set up by the constructor is never read in this file - possibly a typo for UQ3__. Changing it alters segmentation output, so confirm against the Java port and its tests first. */ score += this.ts_(this.UQ1__[p3 + c3]); 154 | score += this.ts_(this.BQ1__[p2 + c2 + c3]); 155 | score += this.ts_(this.BQ2__[p2 + c3 + c4]); 156 | score += this.ts_(this.BQ3__[p3 + c2 + c3]); 157 | score += this.ts_(this.BQ4__[p3 + c3 + c4]); 158 | score += this.ts_(this.TQ1__[p2 + c1 + c2 + c3]); 159 | score += this.ts_(this.TQ2__[p2 + c2 + c3 + c4]); 160 | score += this.ts_(this.TQ3__[p3 + c1 + c2 + c3]); 161 | score += this.ts_(this.TQ4__[p3 + c2 + c3 + c4]); 162 | /* A positive score means a boundary before seg[i]: the finished word is emitted and the decision recorded as "B"; otherwise the decision is "O" and the word keeps growing. */ var p = "O"; 163 | if (score > 0) { 164 | result.push(word); 165 | word = ""; 166 | p = "B"; 167 | } 168 | /* Shift the decision history by one position. */ p1 = p2; 169 | p2 = p3; 170 | p3 = p; 171 | word += seg[i]; 172 | } 173 | /* Emit the word still being built when the input ends. */ result.push(word); 174 | 175 | return result; 176 | } 177 | -------------------------------------------------------------------------------- /lib/js.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/takscape/cmecab-java/2d8c160bfa884d926387e8e7b9f11b908157511b/lib/js.jar -------------------------------------------------------------------------------- /lib/license/LICENCE-TinySegmenter.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2008, Taku Kudo 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, 9 | this list of conditions and the following disclaimer. 
10 | * Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | * Neither the name of the <ORGANIZATION> nor the names of its 14 | contributors may be used to endorse or promote products derived from this 15 | software without specific prior written permission. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 18 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 19 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 20 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 21 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 22 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 23 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 24 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 25 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 26 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 27 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /lib/license/LICENSE-APACHE.txt: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 
15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 
48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. 
Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 
123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. 
In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. 
We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /lib/license/LICENSE-BridJ.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2010-2012, Olivier Chafik 2 | All rights reserved. 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions are met: 5 | 6 | * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 7 | * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 8 | * Neither the name of Olivier Chafik nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
9 | 10 | THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY 11 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 12 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 13 | DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY 14 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 15 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 16 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 17 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 18 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 19 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 20 | -------------------------------------------------------------------------------- /lib/license/cpl1.0.txt: -------------------------------------------------------------------------------- 1 | THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS COMMON PUBLIC 2 | LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION OF THE PROGRAM 3 | CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. 4 | 5 | 1. DEFINITIONS 6 | 7 | "Contribution" means: 8 | 9 | a) in the case of the initial Contributor, the initial code and 10 | documentation distributed under this Agreement, and 11 | 12 | b) in the case of each subsequent Contributor: 13 | 14 | i) changes to the Program, and 15 | 16 | ii) additions to the Program; 17 | 18 | where such changes and/or additions to the Program originate from and are 19 | distributed by that particular Contributor. A Contribution 'originates' from a 20 | Contributor if it was added to the Program by such Contributor itself or anyone 21 | acting on such Contributor's behalf. 
Contributions do not include additions to 22 | the Program which: (i) are separate modules of software distributed in 23 | conjunction with the Program under their own license agreement, and (ii) are not 24 | derivative works of the Program. 25 | 26 | "Contributor" means any person or entity that distributes the Program. 27 | 28 | "Licensed Patents " mean patent claims licensable by a Contributor which are 29 | necessarily infringed by the use or sale of its Contribution alone or when 30 | combined with the Program. 31 | 32 | "Program" means the Contributions distributed in accordance with this Agreement. 33 | 34 | "Recipient" means anyone who receives the Program under this Agreement, 35 | including all Contributors. 36 | 37 | 2. GRANT OF RIGHTS 38 | 39 | a) Subject to the terms of this Agreement, each Contributor hereby grants 40 | Recipient a non-exclusive, worldwide, royalty-free copyright license to 41 | reproduce, prepare derivative works of, publicly display, publicly perform, 42 | distribute and sublicense the Contribution of such Contributor, if any, and such 43 | derivative works, in source code and object code form. 44 | 45 | b) Subject to the terms of this Agreement, each Contributor hereby grants 46 | Recipient a non-exclusive, worldwide, royalty-free patent license under Licensed 47 | Patents to make, use, sell, offer to sell, import and otherwise transfer the 48 | Contribution of such Contributor, if any, in source code and object code form. 49 | This patent license shall apply to the combination of the Contribution and the 50 | Program if, at the time the Contribution is added by the Contributor, such 51 | addition of the Contribution causes such combination to be covered by the 52 | Licensed Patents. The patent license shall not apply to any other combinations 53 | which include the Contribution. No hardware per se is licensed hereunder. 
54 | 55 | c) Recipient understands that although each Contributor grants the licenses 56 | to its Contributions set forth herein, no assurances are provided by any 57 | Contributor that the Program does not infringe the patent or other intellectual 58 | property rights of any other entity. Each Contributor disclaims any liability to 59 | Recipient for claims brought by any other entity based on infringement of 60 | intellectual property rights or otherwise. As a condition to exercising the 61 | rights and licenses granted hereunder, each Recipient hereby assumes sole 62 | responsibility to secure any other intellectual property rights needed, if any. 63 | For example, if a third party patent license is required to allow Recipient to 64 | distribute the Program, it is Recipient's responsibility to acquire that license 65 | before distributing the Program. 66 | 67 | d) Each Contributor represents that to its knowledge it has sufficient 68 | copyright rights in its Contribution, if any, to grant the copyright license set 69 | forth in this Agreement. 70 | 71 | 3. 
REQUIREMENTS 72 | 73 | A Contributor may choose to distribute the Program in object code form under its 74 | own license agreement, provided that: 75 | 76 | a) it complies with the terms and conditions of this Agreement; and 77 | 78 | b) its license agreement: 79 | 80 | i) effectively disclaims on behalf of all Contributors all warranties and 81 | conditions, express and implied, including warranties or conditions of title and 82 | non-infringement, and implied warranties or conditions of merchantability and 83 | fitness for a particular purpose; 84 | 85 | ii) effectively excludes on behalf of all Contributors all liability for 86 | damages, including direct, indirect, special, incidental and consequential 87 | damages, such as lost profits; 88 | 89 | iii) states that any provisions which differ from this Agreement are offered 90 | by that Contributor alone and not by any other party; and 91 | 92 | iv) states that source code for the Program is available from such 93 | Contributor, and informs licensees how to obtain it in a reasonable manner on or 94 | through a medium customarily used for software exchange. 95 | 96 | When the Program is made available in source code form: 97 | 98 | a) it must be made available under this Agreement; and 99 | 100 | b) a copy of this Agreement must be included with each copy of the Program. 101 | 102 | Contributors may not remove or alter any copyright notices contained within the 103 | Program. 104 | 105 | Each Contributor must identify itself as the originator of its Contribution, if 106 | any, in a manner that reasonably allows subsequent Recipients to identify the 107 | originator of the Contribution. 108 | 109 | 4. COMMERCIAL DISTRIBUTION 110 | 111 | Commercial distributors of software may accept certain responsibilities with 112 | respect to end users, business partners and the like. 
While this license is 113 | intended to facilitate the commercial use of the Program, the Contributor who 114 | includes the Program in a commercial product offering should do so in a manner 115 | which does not create potential liability for other Contributors. Therefore, if 116 | a Contributor includes the Program in a commercial product offering, such 117 | Contributor ("Commercial Contributor") hereby agrees to defend and indemnify 118 | every other Contributor ("Indemnified Contributor") against any losses, damages 119 | and costs (collectively "Losses") arising from claims, lawsuits and other legal 120 | actions brought by a third party against the Indemnified Contributor to the 121 | extent caused by the acts or omissions of such Commercial Contributor in 122 | connection with its distribution of the Program in a commercial product 123 | offering. The obligations in this section do not apply to any claims or Losses 124 | relating to any actual or alleged intellectual property infringement. In order 125 | to qualify, an Indemnified Contributor must: a) promptly notify the Commercial 126 | Contributor in writing of such claim, and b) allow the Commercial Contributor to 127 | control, and cooperate with the Commercial Contributor in, the defense and any 128 | related settlement negotiations. The Indemnified Contributor may participate in 129 | any such claim at its own expense. 130 | 131 | For example, a Contributor might include the Program in a commercial product 132 | offering, Product X. That Contributor is then a Commercial Contributor. If that 133 | Commercial Contributor then makes performance claims, or offers warranties 134 | related to Product X, those performance claims and warranties are such 135 | Commercial Contributor's responsibility alone. 
Under this section, the 136 | Commercial Contributor would have to defend claims against the other 137 | Contributors related to those performance claims and warranties, and if a court 138 | requires any other Contributor to pay any damages as a result, the Commercial 139 | Contributor must pay those damages. 140 | 141 | 5. NO WARRANTY 142 | 143 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, THE PROGRAM IS PROVIDED ON AN 144 | "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR 145 | IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF TITLE, 146 | NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. Each 147 | Recipient is solely responsible for determining the appropriateness of using and 148 | distributing the Program and assumes all risks associated with its exercise of 149 | rights under this Agreement, including but not limited to the risks and costs of 150 | program errors, compliance with applicable laws, damage to or loss of data, 151 | programs or equipment, and unavailability or interruption of operations. 152 | 153 | 6. DISCLAIMER OF LIABILITY 154 | 155 | EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, NEITHER RECIPIENT NOR ANY 156 | CONTRIBUTORS SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, 157 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST 158 | PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 159 | STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 160 | OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE EXERCISE OF ANY RIGHTS 161 | GRANTED HEREUNDER, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. 162 | 163 | 7. 
GENERAL 164 | 165 | If any provision of this Agreement is invalid or unenforceable under applicable 166 | law, it shall not affect the validity or enforceability of the remainder of the 167 | terms of this Agreement, and without further action by the parties hereto, such 168 | provision shall be reformed to the minimum extent necessary to make such 169 | provision valid and enforceable. 170 | 171 | If Recipient institutes patent litigation against a Contributor with respect to 172 | a patent applicable to software (including a cross-claim or counterclaim in a 173 | lawsuit), then any patent licenses granted by that Contributor to such Recipient 174 | under this Agreement shall terminate as of the date such litigation is filed. In 175 | addition, if Recipient institutes patent litigation against any entity 176 | (including a cross-claim or counterclaim in a lawsuit) alleging that the Program 177 | itself (excluding combinations of the Program with other software or hardware) 178 | infringes such Recipient's patent(s), then such Recipient's rights granted under 179 | Section 2(b) shall terminate as of the date such litigation is filed. 180 | 181 | All Recipient's rights under this Agreement shall terminate if it fails to 182 | comply with any of the material terms or conditions of this Agreement and does 183 | not cure such failure in a reasonable period of time after becoming aware of 184 | such noncompliance. If all Recipient's rights under this Agreement terminate, 185 | Recipient agrees to cease use and distribution of the Program as soon as 186 | reasonably practicable. However, Recipient's obligations under this Agreement 187 | and any licenses granted by Recipient relating to the Program shall continue and 188 | survive. 189 | 190 | Everyone is permitted to copy and distribute copies of this Agreement, but in 191 | order to avoid inconsistency the Agreement is copyrighted and may only be 192 | modified in the following manner. 
The Agreement Steward reserves the right to 193 | publish new versions (including revisions) of this Agreement from time to time. 194 | No one other than the Agreement Steward has the right to modify this Agreement. 195 | IBM is the initial Agreement Steward. IBM may assign the responsibility to serve 196 | as the Agreement Steward to a suitable separate entity. Each new version of the 197 | Agreement will be given a distinguishing version number. The Program (including 198 | Contributions) may always be distributed subject to the version of the Agreement 199 | under which it was received. In addition, after a new version of the Agreement 200 | is published, Contributor may elect to distribute the Program (including its 201 | Contributions) under the new version. Except as expressly stated in Sections 202 | 2(a) and 2(b) above, Recipient receives no rights or licenses to the 203 | intellectual property of any Contributor under this Agreement, whether 204 | expressly, by implication, estoppel or otherwise. All rights in the Program not 205 | expressly granted under this Agreement are reserved. 206 | 207 | This Agreement is governed by the laws of the State of New York and the 208 | intellectual property laws of the United States of America. No party to this 209 | Agreement will bring a legal action under this Agreement more than one year 210 | after the cause of action arose. Each party waives its rights to a jury trial in 211 | any resulting litigation. 212 | -------------------------------------------------------------------------------- /src/main/java/net/moraleboost/io/BasicCodePointReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | ** 3 | ** Feb. 1, 2009 4 | ** 5 | ** The author disclaims copyright to this source code. 6 | ** In place of a legal notice, here is a blessing: 7 | ** 8 | ** May you do good and not evil. 9 | ** May you find forgiveness for yourself and forgive others. 
10 | ** May you share freely, never taking more than you give. 11 | ** 12 | ** Stolen from SQLite :-) 13 | ** Any feedback is welcome. 14 | ** Kohei TAKETA 15 | ** 16 | */ 17 | package net.moraleboost.io; 18 | 19 | import java.io.IOException; 20 | import java.io.PushbackReader; 21 | import java.io.Reader; 22 | 23 | /** 24 | * Readerをラップして、ひとつずつUnicodeコードポイントを 25 | * 取得するためのデコレータクラス。サロゲートペアを正しく認識する。
26 | * 27 | * 不正なサロゲートペアは、{@link #getAlternationCodePoint()}で得られる 28 | * 代替コードポイントに置換される。 29 | */ 30 | public class BasicCodePointReader implements CodePointReader 31 | { 32 | /** 33 | * 不正なサロゲートペアを置換する文字の既定値。 34 | */ 35 | public static final int DEFAULT_ALTERNATION_CODEPOINT = '〓'; 36 | 37 | private PushbackReader reader; 38 | private long position; 39 | private int alternationCodePoint; 40 | private boolean eos; 41 | 42 | /** 43 | * コードポイントイテレータを構築する。 44 | * 45 | * @param reader 46 | * ソースとなるcharのシーケンス 47 | */ 48 | public BasicCodePointReader(Reader reader) 49 | { 50 | this.reader = new PushbackReader(reader, 1); 51 | this.alternationCodePoint = DEFAULT_ALTERNATION_CODEPOINT; 52 | } 53 | 54 | public void setAlternationCodePoint(int cp) 55 | { 56 | this.alternationCodePoint = cp; 57 | } 58 | 59 | public int getAlternationCodePoint() 60 | { 61 | return alternationCodePoint; 62 | } 63 | 64 | public long getPosition() 65 | { 66 | return position; 67 | } 68 | 69 | public int read() throws IOException 70 | { 71 | int ci; 72 | char c, c2; 73 | 74 | if (eos) { 75 | return -1; 76 | } 77 | 78 | ci = reader.read(); 79 | ++position; 80 | 81 | if (ci < 0) { 82 | // end of character stream 83 | eos = true; 84 | return -1; 85 | } else { 86 | c = (char)ci; 87 | } 88 | 89 | if (Character.isHighSurrogate(c)) { 90 | // 次の文字を検査 91 | ci = reader.read(); 92 | ++position; 93 | if (ci < 0) { 94 | // シーケンスがhigh surrogateで終わっている。 95 | // 代替文字を返すと共に、EOSフラグをONにする。 96 | eos = true; 97 | --position; 98 | return alternationCodePoint; 99 | } 100 | 101 | c2 = (char)ci; 102 | if (Character.isLowSurrogate(c2)) { 103 | // サロゲートペアをコードポイントに変換して返す。 104 | return Character.toCodePoint(c, c2); 105 | } else { 106 | // high surrogateに続くcharが、low surrogateでない。 107 | // c2をプッシュバックして代替文字を返す。 108 | reader.unread(c2); 109 | --position; 110 | return alternationCodePoint; 111 | } 112 | } else if (Character.isLowSurrogate(c)) { 113 | // 単独で存在するlow surrogateを発見。 114 | // 代替文字を返す。 115 | return alternationCodePoint; 
116 | } else { 117 | // 基本文字。そのまま返す。 118 | return c; 119 | } 120 | } 121 | 122 | public void reset() 123 | { 124 | position = 0; 125 | eos = false; 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /src/main/java/net/moraleboost/io/CharsetUtil.java: -------------------------------------------------------------------------------- 1 | /* 2 | ** 3 | ** Mar. 1, 2008 4 | ** 5 | ** The author disclaims copyright to this source code. 6 | ** In place of a legal notice, here is a blessing: 7 | ** 8 | ** May you do good and not evil. 9 | ** May you find forgiveness for yourself and forgive others. 10 | ** May you share freely, never taking more than you give. 11 | ** 12 | ** Stolen from SQLite :-) 13 | ** Any feedback is welcome. 14 | ** Kohei TAKETA 15 | ** 16 | */ 17 | package net.moraleboost.io; 18 | 19 | import java.nio.ByteBuffer; 20 | import java.nio.CharBuffer; 21 | import java.nio.charset.*; 22 | 23 | /** 24 | * 文字コード変換用のヘルパークラス。 25 | * 26 | * @author takedaku 27 | */ 28 | public abstract class CharsetUtil 29 | { 30 | /** 31 | * Unicodeから指定した文字コードへの変換器を作成する。 32 | * 33 | * @param charset 34 | * 文字コード 35 | * @param malformedInputAction 36 | * 不正な入力への対処方法 37 | * @param unmappableCharacterAction 38 | * 指定した文字コードに存在しない文字への対処方法 39 | * @return 作成した変換器 40 | * @throws java.nio.charset.IllegalCharsetNameException 41 | * 文字コード名が不正 42 | * @throws UnsupportedCharsetException 43 | * JVMが文字コードをサポートしていない 44 | * @throws IllegalArgumentException 45 | * malformedInputAction, unmappableCharacterActionに無効な値を指定。 46 | */ 47 | public static CharsetEncoder createEncoder(String charset, 48 | CodingErrorAction malformedInputAction, 49 | CodingErrorAction unmappableCharacterAction) 50 | { 51 | Charset cset = Charset.forName(charset); 52 | if (!cset.canEncode()) { 53 | throw new UnsupportedCharsetException(charset); 54 | } 55 | CharsetEncoder encoder = cset.newEncoder(); 56 | encoder.onMalformedInput(malformedInputAction); 57 | 
encoder.onUnmappableCharacter(unmappableCharacterAction); 58 | 59 | return encoder; 60 | } 61 | 62 | /** 63 | * 指定した文字コードからUnicodeへの変換器を作成する。 64 | * 65 | * @param charset 66 | * 文字コード 67 | * @param malformedInputAction 68 | * 不正な入力への対処方法 69 | * @param unmappableCharacterAction 70 | * 指定した文字コードに存在しない文字への対処方法 71 | * @return 作成した変換器 72 | * @throws java.nio.charset.IllegalCharsetNameException 73 | * 文字コード名が不正。 74 | * @throws UnsupportedCharsetException 75 | * JVMが文字コードをサポートしていない。 76 | * @throws IllegalArgumentException 77 | * malformedInputAction, unmappableCharacterActionに無効な値を指定。 78 | */ 79 | public static CharsetDecoder createDecoder(String charset, 80 | CodingErrorAction malformedInputAction, 81 | CodingErrorAction unmappableCharacterAction) 82 | { 83 | Charset cset = Charset.forName(charset); 84 | CharsetDecoder decoder = cset.newDecoder(); 85 | decoder.onMalformedInput(malformedInputAction); 86 | decoder.onUnmappableCharacter(unmappableCharacterAction); 87 | return decoder; 88 | } 89 | 90 | /** 91 | * 指定したエンコーダを用いて、Unicode文字列をバイト配列にエンコードする。 92 | * 93 | * @param encoder 94 | * エンコーダ 95 | * @param text 96 | * Unicode文字列 97 | * @param terminateWithNull 98 | * バイト配列の最後の要素としてヌル文字を詰めるかどうか 99 | * @return バイト配列 100 | * @throws CharacterCodingException 101 | * 変換エラーの発生 102 | */ 103 | public static byte[] encode(CharsetEncoder encoder, CharSequence text, 104 | boolean terminateWithNull) throws CharacterCodingException 105 | { 106 | encoder.reset(); 107 | ByteBuffer buf = encoder.encode(CharBuffer.wrap(text)); 108 | int size = buf.limit(); 109 | 110 | byte[] ret; 111 | if (terminateWithNull) { 112 | // \0を追加する。 113 | ret = new byte[size + 1]; 114 | buf.get(ret, 0, size); 115 | ret[size] = 0; 116 | } else { 117 | ret = new byte[size]; 118 | buf.get(ret, 0, size); 119 | } 120 | 121 | return ret; 122 | } 123 | 124 | /** 125 | * 指定したデコーダを用いて、バイト配列をUnicode文字列にデコードする。 126 | * 127 | * @param decoder 128 | * デコーダ 129 | * @param rawText 130 | * バイト配列 131 | * @return Unicode文字列 132 | 
* @throws CharacterCodingException 133 | * 変換エラーの発生 134 | */ 135 | public static String decode(CharsetDecoder decoder, byte[] rawText) 136 | throws CharacterCodingException 137 | { 138 | decoder.reset(); 139 | CharBuffer buf = decoder.decode(ByteBuffer.wrap(rawText)); 140 | return buf.toString(); 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /src/main/java/net/moraleboost/io/CodePointReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | ** 3 | ** Feb. 17, 2009 4 | ** 5 | ** The author disclaims copyright to this source code. 6 | ** In place of a legal notice, here is a blessing: 7 | ** 8 | ** May you do good and not evil. 9 | ** May you find forgiveness for yourself and forgive others. 10 | ** May you share freely, never taking more than you give. 11 | ** 12 | ** Stolen from SQLite :-) 13 | ** Any feedback is welcome. 14 | ** Kohei TAKETA 15 | ** 16 | */ 17 | package net.moraleboost.io; 18 | 19 | import java.io.IOException; 20 | 21 | /** 22 | * 一つずつUnicodeコードポイントを取得するためのクラス。 23 | * サロゲートペアを正しく認識する。 24 | */ 25 | public interface CodePointReader 26 | { 27 | /** 28 | * 不正なサロゲートペアを置換するための代替文字をセットする。 29 | * このメソッドを呼び出さない場合の既定値は、 30 | * 「{@value BasicCodePointReader#DEFAULT_ALTERNATION_CODEPOINT}」である。 31 | * 32 | * @param cp 33 | * 代替文字のコードポイント 34 | */ 35 | public abstract void setAlternationCodePoint(int cp); 36 | 37 | /** 38 | * 不正なサロゲートペアを置換するための代替文字を取得する。 39 | * 40 | * @return 代替文字のコードポイント 41 | */ 42 | public abstract int getAlternationCodePoint(); 43 | 44 | /** 45 | * キャラクタストリーム中の現在の位置を返す。 46 | * コードポイント単位でなくchar単位で数えるので、 47 | * サロゲートペアが出現すると、位置は2大きくなる。 48 | * 49 | * @return キャラクタストリーム中の位置。 50 | */ 51 | public abstract long getPosition(); 52 | 53 | /** 54 | * 次のコードポイントを取得する。 55 | * 56 | * @return Unicodeコードポイント。 57 | * @throws java.io.IOException 58 | */ 59 | public abstract int read() throws IOException; 60 | 61 | /** 62 | * 状態をリセットする。 63 | */ 64 | public 
abstract void reset(); 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/net/moraleboost/io/PushbackCodePointReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | ** 3 | ** Feb. 17, 2009 4 | ** 5 | ** The author disclaims copyright to this source code. 6 | ** In place of a legal notice, here is a blessing: 7 | ** 8 | ** May you do good and not evil. 9 | ** May you find forgiveness for yourself and forgive others. 10 | ** May you share freely, never taking more than you give. 11 | ** 12 | ** Stolen from SQLite :-) 13 | ** Any feedback is welcome. 14 | ** Kohei TAKETA 15 | ** 16 | */ 17 | package net.moraleboost.io; 18 | 19 | import java.io.IOException; 20 | 21 | public class PushbackCodePointReader implements CodePointReader 22 | { 23 | /** 24 | * ベースとなるCodePointReader 25 | */ 26 | private CodePointReader reader; 27 | /** 28 | * ストリームに戻された各コードポイントを保持するスタック 29 | */ 30 | private int codepoints[]; 31 | /** 32 | * ストリームに戻された各コードポイントの幅を保持するスタック 33 | */ 34 | private int lengths[]; 35 | /** 36 | * readerのpositionからのoffset 37 | */ 38 | private int offset; 39 | /** 40 | * スタックトップの位置 41 | */ 42 | private int stackpos; 43 | 44 | public PushbackCodePointReader(CodePointReader reader, int size) 45 | { 46 | this.reader = reader; 47 | this.codepoints = new int[size]; 48 | this.lengths = new int[size]; 49 | this.stackpos = -1; 50 | } 51 | 52 | public void setAlternationCodePoint(int cp) 53 | { 54 | reader.setAlternationCodePoint(cp); 55 | } 56 | 57 | public int getAlternationCodePoint() 58 | { 59 | return reader.getAlternationCodePoint(); 60 | } 61 | 62 | public long getPosition() 63 | { 64 | return reader.getPosition() - offset; 65 | } 66 | 67 | public int getStackSize() 68 | { 69 | return codepoints.length; 70 | } 71 | 72 | public int read() throws IOException 73 | { 74 | if (stackpos >= 0) { 75 | offset -= lengths[stackpos]; 76 | return codepoints[stackpos--]; 77 | } else 
{ 78 | return reader.read(); 79 | } 80 | } 81 | 82 | public void reset() 83 | { 84 | reader.reset(); 85 | stackpos = -1; 86 | } 87 | 88 | /** 89 | * コードポイントを一つストリームに戻す。 90 | * 91 | * @param cp 92 | * プッシュバックするコードポイント 93 | * @param length 94 | * cpの幅をchar数単位で指定 95 | * @throws java.io.IOException 96 | */ 97 | public void unread(int cp, int length) throws IOException 98 | { 99 | if (stackpos + 1 >= codepoints.length) { 100 | throw new IOException("Stack overflow."); 101 | } 102 | 103 | ++stackpos; 104 | codepoints[stackpos] = cp; 105 | lengths[stackpos] = length; 106 | offset += length; 107 | } 108 | } 109 | -------------------------------------------------------------------------------- /src/main/java/net/moraleboost/mecab/DictionaryInfo.java: -------------------------------------------------------------------------------- 1 | package net.moraleboost.mecab; 2 | 3 | public interface DictionaryInfo 4 | { 5 | int TYPE_SYS_DIC = 0; 6 | int TYPE_USR_DIC = 1; 7 | int TYPE_UNK_DIC = 2; 8 | 9 | String filename(); 10 | String charset(); 11 | long size(); 12 | int type(); 13 | long lsize(); 14 | long rsize(); 15 | int version(); 16 | DictionaryInfo next(); 17 | } 18 | -------------------------------------------------------------------------------- /src/main/java/net/moraleboost/mecab/Lattice.java: -------------------------------------------------------------------------------- 1 | package net.moraleboost.mecab; 2 | 3 | public interface Lattice 4 | { 5 | int REQ_TYPE_ONE_BEST = 1; 6 | int REQ_TYPE_NBEST = 2; 7 | int REQ_TYPE_PARTIAL = 4; 8 | int REQ_TYPE_MARGINAL_PROB = 8; 9 | int REQ_TYPE_ALTERNATIVE = 16; 10 | int REQ_TYPE_ALL_MORPHS = 32; 11 | int REQ_TYPE_ALLOCATE_SENTENCE = 64; 12 | 13 | void destroy(); 14 | void clear(); 15 | boolean isAvailable(); 16 | Node bosNode(); 17 | Node eosNode(); 18 | Node beginNodes(long pos); 19 | Node endNodes(long pos); 20 | String sentence(); 21 | void setSentence(String sentence); 22 | long size(); 23 | double Z(); 24 | void setZ(double 
/**
 * Read-only view of one node of a MeCab lattice.
 *
 * <p>Nodes are linked to each other through {@link #prev()}, {@link #next()},
 * {@link #enext()} and {@link #bnext()}, and to {@link Path} objects through
 * {@link #rpath()} and {@link #lpath()}. The navigation methods of the
 * implementation in this project return {@code null} when the corresponding
 * native link is null.</p>
 */
public interface Node
{
    // Node kind constants; presumably the values reported by stat() and
    // mirroring MeCab's MECAB_*_NODE enum — TODO confirm against mecab.h.
    int TYPE_NOR_NODE = 0;
    int TYPE_UNK_NODE = 1;
    int TYPE_BOS_NODE = 2;
    int TYPE_EOS_NODE = 3;
    int TYPE_EON_NODE = 4;

    /** Returns the previous node, or {@code null} if there is none. */
    Node prev();
    /** Returns the next node, or {@code null} if there is none. */
    Node next();
    /** Returns the node reached through the "enext" link, or {@code null}. */
    Node enext();
    /** Returns the node reached through the "bnext" link, or {@code null}. */
    Node bnext();
    /** Returns the first path of the "rpath" list, or {@code null}. */
    Path rpath();
    /** Returns the first path of the "lpath" list, or {@code null}. */
    Path lpath();
    /**
     * Returns the surface string of this node ({@link #length()} bytes decoded
     * with the dictionary charset), or {@code null} if no surface is available.
     */
    String surface();
    /**
     * Returns the surface string together with the bytes that precede it
     * ({@link #rlength()} bytes in total), or {@code null} if no surface is
     * available.
     */
    String rsurface();
    /**
     * Splits the text covered by this node into its leading space and its
     * surface.
     *
     * @param leadingSpaceAndSurface
     *            output array of length 2; on success index 0 receives the
     *            leading space and index 1 the surface
     * @return {@code true} if the array was filled, {@code false} if the node
     *         has no surface
     * @throws IllegalArgumentException
     *             if the array length is not 2 (in the standard implementation)
     */
    boolean leadingSpaceAndSurface(String[] leadingSpaceAndSurface);
    /** Returns the feature string, or {@code null} if none is available. */
    String feature();
    /** Returns the node id as an unsigned 32-bit value widened to {@code long}. */
    long id();
    /** Returns the length of the surface, in bytes. */
    int length();
    /** Returns the length of the surface including what precedes it, in bytes. */
    int rlength();
    /** Returns the "rcAttr" field as an unsigned 16-bit value. */
    int rcAttr();
    /** Returns the "lcAttr" field as an unsigned 16-bit value. */
    int lcAttr();
    /** Returns the "posid" field as an unsigned 16-bit value. */
    int posid();
    /** Returns the "charType" field as an unsigned 8-bit value. */
    int charType();
    /** Returns the "stat" field as an unsigned 8-bit value. */
    int stat();
    /** Returns {@code true} if the native "isbest" flag is non-zero. */
    boolean isbest();
    /** Returns the "alpha" field. */
    float alpha();
    /** Returns the "beta" field. */
    float beta();
    /** Returns the "prob" field. */
    float prob();
    /** Returns the "wcost" field. */
    short wcost();
    /** Returns the "cost" field. */
    long cost();
}
/**
 * A MeCab tagger: the object that analyses the sentence held by a
 * {@link Lattice}.
 */
public interface Tagger
{
    /**
     * Releases the native resources held by this tagger. The standard
     * implementation may be called more than once safely; the tagger must not
     * be used afterwards.
     */
    void destroy();
    /** Creates a new lattice suitable for use with {@link #parse(Lattice)}. */
    Lattice createLattice();
    /**
     * Analyses the sentence currently set on {@code lattice}.
     *
     * @param lattice
     *            the lattice to parse; the standard implementation only accepts
     *            its own lattice class
     * @return {@code true} on success; {@code false} if parsing failed or the
     *         lattice is {@code null} or of an unsupported type
     */
    boolean parse(Lattice lattice);
    /** Returns information about the first dictionary used by this tagger. */
    DictionaryInfo dictionaryInfo();
    /** Returns the tagger's error message, or {@code null} if none is available. */
    String what();
    /** Returns the version string of the MeCab library, or {@code null}. */
    String version();
}
null : p.getCString(); 49 | } 50 | 51 | @Field(2) 52 | public int _size() 53 | { 54 | return this.io.getIntField(this, 2); 55 | } 56 | 57 | public long size() 58 | { 59 | // convert to long 60 | return (_size() & 0xffffffffL); 61 | } 62 | 63 | @Field(3) 64 | public int type() 65 | { 66 | return this.io.getIntField(this, 3); 67 | } 68 | 69 | @Field(4) 70 | public int _lsize() 71 | { 72 | return this.io.getIntField(this, 4); 73 | } 74 | 75 | public long lsize() 76 | { 77 | return (_lsize() & 0xffffffffL); 78 | } 79 | 80 | @Field(5) 81 | public int _rsize() 82 | { 83 | return this.io.getIntField(this, 5); 84 | } 85 | 86 | public long rsize() 87 | { 88 | return (_rsize() & 0xffffffffL); 89 | } 90 | 91 | @Field(6) 92 | public short _version() 93 | { 94 | return this.io.getShortField(this, 6); 95 | } 96 | 97 | public int version() 98 | { 99 | return (_version() & 0xffff); 100 | } 101 | 102 | @Field(7) 103 | public Pointer _next() 104 | { 105 | return this.io.getPointerField(this, 7); 106 | } 107 | 108 | public StandardDictionaryInfo next() 109 | { 110 | Pointer p = _next(); 111 | return (p == null) ? 
null : new StandardDictionaryInfo(p); 112 | } 113 | } 114 | -------------------------------------------------------------------------------- /src/main/java/net/moraleboost/mecab/impl/StandardLattice.java: -------------------------------------------------------------------------------- 1 | package net.moraleboost.mecab.impl; 2 | 3 | import net.moraleboost.mecab.Lattice; 4 | import org.bridj.BridJ; 5 | import org.bridj.Platform; 6 | import org.bridj.Pointer; 7 | import org.bridj.SizeT; 8 | import org.bridj.ann.Library; 9 | 10 | import java.nio.charset.Charset; 11 | 12 | @Library("mecab") 13 | public class StandardLattice implements Lattice 14 | { 15 | static { 16 | if (Platform.isWindows()) { 17 | BridJ.setNativeLibraryActualName("mecab", "libmecab"); 18 | } 19 | BridJ.register(); 20 | } 21 | 22 | private static native Pointer mecab_lattice_new(); 23 | private static native void mecab_lattice_destroy(Pointer pLattice); 24 | private static native void mecab_lattice_clear(Pointer pLattice); 25 | private static native int mecab_lattice_is_available(Pointer pLattice); 26 | private static native Pointer mecab_lattice_get_bos_node(Pointer pLattice); 27 | private static native Pointer mecab_lattice_get_eos_node(Pointer pLattice); 28 | private static native Pointer mecab_lattice_get_begin_nodes(Pointer pLattice, SizeT pos); 29 | private static native Pointer mecab_lattice_get_end_nodes(Pointer pLattice, SizeT pos); 30 | private static native Pointer mecab_lattice_get_sentence(Pointer pLattice); 31 | private static native void mecab_lattice_set_sentence(Pointer pLattice, Pointer sentence); 32 | private static native SizeT mecab_lattice_get_size(Pointer pLattice); 33 | private static native double mecab_lattice_get_z(Pointer pLattice); 34 | private static native void mecab_lattice_set_z(Pointer pLattice, double Z); 35 | private static native double mecab_lattice_get_theta(Pointer pLattice); 36 | private static native void mecab_lattice_set_theta(Pointer pLattice, double 
theta); 37 | private static native int mecab_lattice_next(Pointer pLattice); 38 | private static native int mecab_lattice_get_request_type(Pointer pLattice); 39 | private static native int mecab_lattice_has_request_type(Pointer pLattice, int requestType); 40 | private static native void mecab_lattice_set_request_type(Pointer pLattice, int requestType); 41 | private static native void mecab_lattice_add_request_type(Pointer pLattice, int requestType); 42 | private static native void mecab_lattice_remove_request_type(Pointer pLattice, int requestType); 43 | private static native Pointer mecab_lattice_tostr(Pointer pLattice); 44 | private static native Pointer mecab_lattice_nbest_tostr(Pointer pLattice, SizeT N); 45 | private static native Pointer mecab_lattice_strerror(Pointer pLattice); 46 | 47 | private Pointer pLattice; 48 | private Pointer pSentence; 49 | private Charset charset; 50 | 51 | public StandardLattice(Charset charset) 52 | { 53 | pLattice = mecab_lattice_new(); 54 | if (pLattice == null) { 55 | throw new OutOfMemoryError("mecab_lattice_new() failed."); 56 | } 57 | this.charset = charset; 58 | } 59 | 60 | protected StandardLattice(Pointer p, Charset charset) 61 | { 62 | this.pLattice = p; 63 | this.charset = charset; 64 | } 65 | 66 | protected Pointer getPointer() 67 | { 68 | return pLattice; 69 | } 70 | 71 | protected void finalize() throws Throwable 72 | { 73 | try { 74 | destroy(); 75 | } finally { 76 | super.finalize(); 77 | } 78 | } 79 | 80 | public void destroy() 81 | { 82 | try { 83 | if (pLattice != null) { 84 | mecab_lattice_destroy(pLattice); 85 | } 86 | if (pSentence != null) { 87 | pSentence.release(); 88 | } 89 | } finally { 90 | pSentence = null; 91 | pLattice = null; 92 | } 93 | } 94 | 95 | public void clear() 96 | { 97 | try { 98 | mecab_lattice_clear(pLattice); 99 | if (pSentence != null) { 100 | pSentence.release(); 101 | } 102 | } finally { 103 | pSentence = null; 104 | } 105 | } 106 | 107 | public boolean isAvailable() 108 | { 109 | 
return (mecab_lattice_is_available(pLattice) != 0); 110 | } 111 | 112 | public StandardNode bosNode() 113 | { 114 | Pointer p = mecab_lattice_get_bos_node(pLattice); 115 | if (p == null) { 116 | return null; 117 | } else { 118 | return new StandardNode(p, charset); 119 | } 120 | } 121 | 122 | public StandardNode eosNode() 123 | { 124 | Pointer p = mecab_lattice_get_eos_node(pLattice); 125 | if (p == null) { 126 | return null; 127 | } else { 128 | return new StandardNode(p, charset); 129 | } 130 | } 131 | 132 | public StandardNode beginNodes(long pos) 133 | { 134 | Pointer p = mecab_lattice_get_begin_nodes(pLattice, SizeT.valueOf(pos)); 135 | if (p == null) { 136 | return null; 137 | } else { 138 | return new StandardNode(p, charset); 139 | } 140 | } 141 | 142 | public StandardNode endNodes(long pos) 143 | { 144 | Pointer p = mecab_lattice_get_end_nodes(pLattice, SizeT.valueOf(pos)); 145 | if (p == null) { 146 | return null; 147 | } else { 148 | return new StandardNode(p, charset); 149 | } 150 | } 151 | 152 | public String sentence() 153 | { 154 | Pointer p = mecab_lattice_get_sentence(pLattice); 155 | if (p == null) { 156 | return null; 157 | } 158 | 159 | return p.getString(Pointer.StringType.C, charset); 160 | } 161 | 162 | public void setSentence(String sentence) 163 | { 164 | try { 165 | if (pSentence != null) { 166 | pSentence.release(); 167 | } 168 | } finally { 169 | pSentence = null; 170 | } 171 | pSentence = Pointer.pointerToString(sentence, Pointer.StringType.C, charset).as(Byte.class); 172 | mecab_lattice_set_sentence(pLattice, pSentence); 173 | } 174 | 175 | public long size() 176 | { 177 | return mecab_lattice_get_size(pLattice).longValue(); 178 | } 179 | 180 | public double Z() 181 | { 182 | return mecab_lattice_get_z(pLattice); 183 | } 184 | 185 | public void setZ(double Z) 186 | { 187 | mecab_lattice_set_z(pLattice, Z); 188 | } 189 | 190 | public double theta() 191 | { 192 | return mecab_lattice_get_theta(pLattice); 193 | } 194 | 195 | public void 
setTheta(double theta) 196 | { 197 | mecab_lattice_set_theta(pLattice, theta); 198 | } 199 | 200 | public boolean next() 201 | { 202 | return (mecab_lattice_next(pLattice) != 0); 203 | } 204 | 205 | public int requestType() 206 | { 207 | return mecab_lattice_get_request_type(pLattice); 208 | } 209 | 210 | public boolean hasRequestType(int requestType) 211 | { 212 | return (mecab_lattice_has_request_type(pLattice, requestType) != 0); 213 | } 214 | 215 | public void setRequestType(int requestType) 216 | { 217 | mecab_lattice_set_request_type(pLattice, requestType); 218 | } 219 | 220 | public void addRequestType(int requestType) 221 | { 222 | mecab_lattice_add_request_type(pLattice, requestType); 223 | } 224 | 225 | public void removeRequestType(int requestType) 226 | { 227 | mecab_lattice_remove_request_type(pLattice, requestType); 228 | } 229 | 230 | @Override 231 | public String toString() 232 | { 233 | Pointer p = mecab_lattice_tostr(pLattice); 234 | if (p == null) { 235 | return null; 236 | } else { 237 | return p.getString(Pointer.StringType.C, charset); 238 | } 239 | } 240 | 241 | public String enumNBestAsString(long N) 242 | { 243 | Pointer p = mecab_lattice_nbest_tostr(pLattice, SizeT.valueOf(N)); 244 | if (p == null) { 245 | return null; 246 | } else { 247 | return p.getString(Pointer.StringType.C, charset); 248 | } 249 | } 250 | 251 | public String what() 252 | { 253 | Pointer p = mecab_lattice_strerror(pLattice); 254 | if (p == null) { 255 | return null; 256 | } else { 257 | return p.getString(Pointer.StringType.C, charset); 258 | } 259 | } 260 | } 261 | -------------------------------------------------------------------------------- /src/main/java/net/moraleboost/mecab/impl/StandardModel.java: -------------------------------------------------------------------------------- 1 | package net.moraleboost.mecab.impl; 2 | 3 | import net.moraleboost.mecab.Model; 4 | import org.bridj.BridJ; 5 | import org.bridj.Platform; 6 | import org.bridj.Pointer; 7 | import 
org.bridj.ann.Library; 8 | 9 | import java.nio.charset.Charset; 10 | 11 | @Library("mecab") 12 | public class StandardModel implements Model 13 | { 14 | static { 15 | if (Platform.isWindows()) { 16 | BridJ.setNativeLibraryActualName("mecab", "libmecab"); 17 | } 18 | BridJ.register(); 19 | } 20 | 21 | private static native Pointer mecab_model_new2(Pointer arg); 22 | private static native void mecab_model_destroy(Pointer pModel); 23 | private static native Pointer mecab_model_new_tagger(Pointer pModel); 24 | private static native Pointer mecab_model_new_lattice(Pointer pModel); 25 | private static native int mecab_model_swap(Pointer pModel, Pointer pNewModel); 26 | private static native Pointer mecab_model_dictionary_info(Pointer pModel); 27 | 28 | private Pointer pModel; 29 | private Charset charset; 30 | 31 | public StandardModel(String arg) 32 | { 33 | Pointer parg = Pointer.pointerToCString(arg); 34 | try { 35 | pModel = mecab_model_new2(parg); 36 | } finally { 37 | Pointer.release(parg); 38 | } 39 | 40 | if (pModel == null) { 41 | throw new OutOfMemoryError("mecab_model_new2() failed."); 42 | } 43 | 44 | StandardDictionaryInfo dictInfo = dictionaryInfo(); 45 | charset = Charset.forName(dictInfo.charset()); 46 | } 47 | 48 | public StandardModel(String arg, Charset charset) 49 | { 50 | Pointer parg = Pointer.pointerToCString(arg); 51 | try { 52 | pModel = mecab_model_new2(parg); 53 | } finally { 54 | Pointer.release(parg); 55 | } 56 | 57 | if (pModel == null) { 58 | throw new OutOfMemoryError("mecab_model_new2() failed."); 59 | } 60 | 61 | this.charset = charset; 62 | } 63 | 64 | protected Pointer getPointer() 65 | { 66 | return pModel; 67 | } 68 | 69 | protected void finalize() throws Throwable 70 | { 71 | try { 72 | destroy(); 73 | } finally { 74 | super.finalize(); 75 | } 76 | } 77 | 78 | public void destroy() 79 | { 80 | try { 81 | if (pModel != null) { 82 | mecab_model_destroy(pModel); 83 | } 84 | } finally { 85 | pModel = null; 86 | } 87 | } 88 | 89 | public 
StandardTagger createTagger() 90 | { 91 | Pointer p = mecab_model_new_tagger(pModel); 92 | if (p == null) { 93 | throw new OutOfMemoryError("mecab_model_new_tagger() failed."); 94 | } else { 95 | return new StandardTagger(p, charset); 96 | } 97 | } 98 | 99 | public StandardLattice createLattice() 100 | { 101 | Pointer p = mecab_model_new_lattice(pModel); 102 | if (p == null) { 103 | throw new OutOfMemoryError("mecab_model_new_lattice() failed."); 104 | } else { 105 | return new StandardLattice(p, charset); 106 | } 107 | } 108 | 109 | public boolean swap(Model model) 110 | { 111 | if (model != null && (model instanceof StandardModel)) { 112 | return (mecab_model_swap(pModel, ((StandardModel)model).getPointer()) != 0); 113 | } else { 114 | return false; 115 | } 116 | } 117 | 118 | public StandardDictionaryInfo dictionaryInfo() 119 | { 120 | Pointer p = mecab_model_dictionary_info(pModel); 121 | if (p == null) { 122 | throw new OutOfMemoryError("mecab_model_dictionary_info() failed."); 123 | } else { 124 | return new StandardDictionaryInfo(p); 125 | } 126 | } 127 | } 128 | -------------------------------------------------------------------------------- /src/main/java/net/moraleboost/mecab/impl/StandardNode.java: -------------------------------------------------------------------------------- 1 | package net.moraleboost.mecab.impl; 2 | 3 | import net.moraleboost.mecab.Node; 4 | import org.bridj.Pointer; 5 | import org.bridj.StructObject; 6 | import org.bridj.ann.CLong; 7 | import org.bridj.ann.Field; 8 | 9 | import java.nio.charset.Charset; 10 | 11 | public class StandardNode extends StructObject implements Node 12 | { 13 | private Charset charset; 14 | 15 | protected StandardNode(Pointer p, Charset charset) 16 | { 17 | super(p); 18 | this.charset = charset; 19 | } 20 | 21 | @Field(0) 22 | public Pointer _prev() 23 | { 24 | return this.io.getPointerField(this, 0); 25 | } 26 | 27 | public StandardNode prev() 28 | { 29 | Pointer p = _prev(); 30 | if (p == null) { 31 | 
return null; 32 | } else { 33 | return new StandardNode(p, charset); 34 | } 35 | } 36 | 37 | @Field(1) 38 | public Pointer _next() 39 | { 40 | return this.io.getPointerField(this, 1); 41 | } 42 | 43 | public StandardNode next() 44 | { 45 | Pointer p = _next(); 46 | if (p == null) { 47 | return null; 48 | } else { 49 | return new StandardNode(p, charset); 50 | } 51 | } 52 | 53 | @Field(2) 54 | public Pointer _enext() 55 | { 56 | return this.io.getPointerField(this, 2); 57 | } 58 | 59 | public StandardNode enext() 60 | { 61 | Pointer p = _enext(); 62 | if (p == null) { 63 | return null; 64 | } else { 65 | return new StandardNode(p, charset); 66 | } 67 | } 68 | 69 | @Field(3) 70 | public Pointer _bnext() 71 | { 72 | return this.io.getPointerField(this, 3); 73 | } 74 | 75 | public StandardNode bnext() 76 | { 77 | Pointer p = _bnext(); 78 | if (p == null) { 79 | return null; 80 | } else { 81 | return new StandardNode(p, charset); 82 | } 83 | } 84 | 85 | @Field(4) 86 | public Pointer _rpath() 87 | { 88 | return this.io.getPointerField(this, 4); 89 | } 90 | 91 | public StandardPath rpath() 92 | { 93 | Pointer p = _rpath(); 94 | if (p == null) { 95 | return null; 96 | } else { 97 | return new StandardPath(p, charset); 98 | } 99 | } 100 | 101 | @Field(5) 102 | public Pointer _lpath() 103 | { 104 | return this.io.getPointerField(this, 5); 105 | } 106 | 107 | public StandardPath lpath() 108 | { 109 | Pointer p = _lpath(); 110 | if (p == null) { 111 | return null; 112 | } else { 113 | return new StandardPath(p, charset); 114 | } 115 | } 116 | 117 | @Field(6) 118 | public Pointer _surface() 119 | { 120 | return this.io.getPointerField(this, 6); 121 | } 122 | 123 | public String surface() 124 | { 125 | Pointer p = _surface(); 126 | int len = length(); 127 | if (p == null) { 128 | return null; 129 | } else { 130 | return new String(p.getBytes(len), charset); 131 | } 132 | } 133 | 134 | public String rsurface() 135 | { 136 | Pointer p = _surface(); 137 | int rlen = rlength(); 138 
| int len = length(); 139 | if (p == null) { 140 | return null; 141 | } else { 142 | return new String(p.offset(len-rlen).getBytes(rlen), charset); 143 | } 144 | } 145 | 146 | public boolean leadingSpaceAndSurface(String[] leadingSpaceAndSurface) 147 | { 148 | if (leadingSpaceAndSurface.length != 2) { 149 | throw new IllegalArgumentException("leadingSpaceAndSurface.length must be 2."); 150 | } 151 | 152 | Pointer p = _surface(); 153 | int rlen = rlength(); 154 | int len = length(); 155 | int offset = rlen - len; 156 | 157 | if (p == null) { 158 | return false; 159 | } else { 160 | byte[] bytestr = p.offset(-offset).getBytes(rlen); 161 | leadingSpaceAndSurface[0] = new String(bytestr, 0, offset, charset); // leading space 162 | leadingSpaceAndSurface[1] = new String(bytestr, offset, len, charset); // surface 163 | return true; 164 | } 165 | } 166 | 167 | @Field(7) 168 | public Pointer _feature() 169 | { 170 | return this.io.getPointerField(this, 7); 171 | } 172 | 173 | public String feature() 174 | { 175 | Pointer p = _feature(); 176 | if (p == null) { 177 | return null; 178 | } else { 179 | return p.getString(Pointer.StringType.C, charset); 180 | } 181 | } 182 | 183 | @Field(8) 184 | public int _id() 185 | { 186 | return this.io.getIntField(this, 8); 187 | } 188 | 189 | public long id() 190 | { 191 | return (_id() & 0xffffffffL); 192 | } 193 | 194 | @Field(9) 195 | public short _length() 196 | { 197 | return this.io.getShortField(this, 9); 198 | } 199 | 200 | public int length() 201 | { 202 | return (_length() & 0xffff); 203 | } 204 | 205 | @Field(10) 206 | public short _rlength() 207 | { 208 | return this.io.getShortField(this, 10); 209 | } 210 | 211 | public int rlength() 212 | { 213 | return (_rlength() & 0xffff); 214 | } 215 | 216 | @Field(11) 217 | public short _rcAttr() 218 | { 219 | return this.io.getShortField(this, 11); 220 | } 221 | 222 | public int rcAttr() 223 | { 224 | return (_rcAttr() & 0xffff); 225 | } 226 | 227 | @Field(12) 228 | public short 
_lcAttr() 229 | { 230 | return this.io.getShortField(this, 12); 231 | } 232 | 233 | public int lcAttr() 234 | { 235 | return (_lcAttr() & 0xffff); 236 | } 237 | 238 | @Field(13) 239 | public short _posid() 240 | { 241 | return this.io.getShortField(this, 13); 242 | } 243 | 244 | public int posid() 245 | { 246 | return (_posid() & 0xffff); 247 | } 248 | 249 | @Field(14) 250 | public byte _charType() 251 | { 252 | return this.io.getByteField(this, 14); 253 | } 254 | 255 | public int charType() 256 | { 257 | return (_charType() & 0xff); 258 | } 259 | 260 | @Field(15) 261 | public byte _stat() 262 | { 263 | return this.io.getByteField(this, 15); 264 | } 265 | 266 | public int stat() 267 | { 268 | return (_stat() & 0xff); 269 | } 270 | 271 | @Field(16) 272 | public byte _isbest() 273 | { 274 | return this.io.getByteField(this, 16); 275 | } 276 | 277 | public boolean isbest() 278 | { 279 | return (_isbest() != 0); 280 | } 281 | 282 | @Field(17) 283 | public float alpha() 284 | { 285 | return this.io.getFloatField(this, 17); 286 | } 287 | 288 | @Field(18) 289 | public float beta() 290 | { 291 | return this.io.getFloatField(this, 18); 292 | } 293 | 294 | @Field(19) 295 | public float prob() 296 | { 297 | return this.io.getFloatField(this, 19); 298 | } 299 | 300 | @Field(20) 301 | public short wcost() 302 | { 303 | return this.io.getShortField(this, 20); 304 | } 305 | 306 | @Field(21) 307 | @CLong 308 | public long cost() 309 | { 310 | return this.io.getCLongField(this, 21); 311 | } 312 | } 313 | -------------------------------------------------------------------------------- /src/main/java/net/moraleboost/mecab/impl/StandardPath.java: -------------------------------------------------------------------------------- 1 | package net.moraleboost.mecab.impl; 2 | 3 | import net.moraleboost.mecab.Path; 4 | import org.bridj.Pointer; 5 | import org.bridj.StructObject; 6 | import org.bridj.ann.Field; 7 | 8 | import java.nio.charset.Charset; 9 | 10 | public class StandardPath extends 
StructObject implements Path 11 | { 12 | private Charset charset; 13 | 14 | protected StandardPath(Pointer p, Charset charset) 15 | { 16 | super(p); 17 | this.charset = charset; 18 | } 19 | 20 | @Field(0) 21 | public Pointer _rnode() 22 | { 23 | return this.io.getPointerField(this, 0); 24 | } 25 | 26 | public StandardNode rnode() 27 | { 28 | Pointer p = _rnode(); 29 | return (p == null) ? null : new StandardNode(p, charset); 30 | } 31 | 32 | @Field(1) 33 | public Pointer _rnext() 34 | { 35 | return this.io.getPointerField(this, 1); 36 | } 37 | 38 | public StandardPath rnext() 39 | { 40 | Pointer p = _rnext(); 41 | return (p == null) ? null : new StandardPath(p, charset); 42 | } 43 | 44 | @Field(2) 45 | public Pointer _lnode() 46 | { 47 | return this.io.getPointerField(this, 2); 48 | } 49 | 50 | public StandardNode lnode() 51 | { 52 | Pointer p = _lnode(); 53 | return (p == null) ? null : new StandardNode(p, charset); 54 | } 55 | 56 | @Field(3) 57 | public Pointer _lnext() 58 | { 59 | return this.io.getPointerField(this, 3); 60 | } 61 | 62 | public StandardPath lnext() 63 | { 64 | Pointer p = _lnext(); 65 | return (p == null) ? 
null : new StandardPath(p, charset); 66 | } 67 | 68 | @Field(4) 69 | public int cost() 70 | { 71 | return this.io.getIntField(this, 4); 72 | } 73 | 74 | @Field(5) 75 | public float prob() 76 | { 77 | return this.io.getFloatField(this, 5); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/java/net/moraleboost/mecab/impl/StandardTagger.java: -------------------------------------------------------------------------------- 1 | package net.moraleboost.mecab.impl; 2 | 3 | import net.moraleboost.mecab.Lattice; 4 | import net.moraleboost.mecab.Tagger; 5 | import org.bridj.BridJ; 6 | import org.bridj.Platform; 7 | import org.bridj.Pointer; 8 | import org.bridj.ann.Library; 9 | 10 | import java.nio.charset.Charset; 11 | 12 | @Library("mecab") 13 | public class StandardTagger implements Tagger 14 | { 15 | static { 16 | if (Platform.isWindows()) { 17 | BridJ.setNativeLibraryActualName("mecab", "libmecab"); 18 | } 19 | BridJ.register(); 20 | } 21 | 22 | private static native Pointer mecab_new2(Pointer arg); 23 | private static native Pointer mecab_version(); 24 | private static native Pointer mecab_strerror(Pointer pTagger); 25 | private static native void mecab_destroy(Pointer pTagger); 26 | private static native int mecab_parse_lattice(Pointer pTagger, Pointer pLattice); 27 | private static native Pointer mecab_dictionary_info(Pointer pTagger); 28 | 29 | private Pointer pTagger; 30 | private Charset charset; 31 | 32 | public StandardTagger(String arg) 33 | { 34 | Pointer parg = Pointer.pointerToCString(arg); 35 | try { 36 | pTagger = mecab_new2(parg); 37 | } finally { 38 | Pointer.release(parg); 39 | } 40 | 41 | if (pTagger == null) { 42 | throw new OutOfMemoryError("mecab_new2() failed."); 43 | } 44 | 45 | StandardDictionaryInfo dictInfo = dictionaryInfo(); 46 | charset = Charset.forName(dictInfo.charset()); 47 | } 48 | 49 | public StandardTagger(String arg, Charset charset) 50 | { 51 | Pointer parg = 
Pointer.pointerToCString(arg); 52 | try { 53 | pTagger = mecab_new2(parg); 54 | } finally { 55 | Pointer.release(parg); 56 | } 57 | 58 | if (pTagger == null) { 59 | throw new OutOfMemoryError("mecab_new2() failed."); 60 | } 61 | 62 | this.charset = charset; 63 | } 64 | 65 | protected StandardTagger(Pointer p, Charset charset) 66 | { 67 | this.pTagger = p; 68 | this.charset = charset; 69 | } 70 | 71 | protected void finalize() throws Throwable 72 | { 73 | try { 74 | destroy(); 75 | } finally { 76 | super.finalize(); 77 | } 78 | } 79 | 80 | public void destroy() 81 | { 82 | try { 83 | if (pTagger != null) { 84 | mecab_destroy(pTagger); 85 | } 86 | } finally { 87 | pTagger = null; 88 | } 89 | } 90 | 91 | public StandardLattice createLattice() 92 | { 93 | return new StandardLattice(charset); 94 | } 95 | 96 | public boolean parse(Lattice lattice) 97 | { 98 | if (lattice != null && (lattice instanceof StandardLattice)) { 99 | return (mecab_parse_lattice(pTagger, ((StandardLattice)lattice).getPointer()) != 0); 100 | } else { 101 | return false; 102 | } 103 | } 104 | 105 | public StandardDictionaryInfo dictionaryInfo() 106 | { 107 | Pointer p = mecab_dictionary_info(pTagger); 108 | if (p == null) { 109 | throw new OutOfMemoryError("mecab_dictionary_info() failed."); 110 | } else { 111 | return new StandardDictionaryInfo(p); 112 | } 113 | } 114 | 115 | public String what() 116 | { 117 | Pointer p = mecab_strerror(pTagger); 118 | if (p == null) { 119 | return null; 120 | } else { 121 | return p.getString(Pointer.StringType.C, charset); 122 | } 123 | } 124 | 125 | public String version() 126 | { 127 | Pointer p = mecab_version(); 128 | if (p == null) { 129 | return null; 130 | } else { 131 | return p.getCString(); 132 | } 133 | } 134 | 135 | public static void main(String[] args) 136 | { 137 | StringBuilder text = new StringBuilder(); 138 | for (String arg: args) { 139 | if (text.length() != 0) { 140 | text.append(" "); 141 | } 142 | text.append(arg); 143 | } 144 | 145 | 
StandardTagger tagger = new StandardTagger(""); 146 | Lattice lattice = tagger.createLattice(); 147 | lattice.setSentence(text.toString()); 148 | tagger.parse(lattice); 149 | 150 | System.out.println("MeCab version " + tagger.version()); 151 | System.out.println(); 152 | System.out.println("Original text: " + text.toString()); 153 | System.out.println(); 154 | System.out.println("Morphemes:"); 155 | System.out.println(lattice.toString()); 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /src/main/java/net/moraleboost/tinysegmenter/ModelExporter.java: -------------------------------------------------------------------------------- 1 | /* 2 | ** 3 | ** Mar. 24, 2009 4 | ** 5 | ** The author disclaims copyright to this source code. 6 | ** In place of a legal notice, here is a blessing: 7 | ** 8 | ** May you do good and not evil. 9 | ** May you find forgiveness for yourself and forgive others. 10 | ** May you share freely, never taking more than you give. 11 | ** 12 | ** Stolen from SQLite :-) 13 | ** Any feedback is welcome. 
14 | ** Kohei TAKETA 15 | ** 16 | */ 17 | package net.moraleboost.tinysegmenter; 18 | 19 | import org.mozilla.javascript.Context; 20 | import org.mozilla.javascript.ContextFactory; 21 | import org.mozilla.javascript.Function; 22 | import org.mozilla.javascript.Scriptable; 23 | 24 | import java.io.*; 25 | 26 | public class ModelExporter 27 | { 28 | // java ModelExporter tiny_segmenter_source.js exporter.js out.java 29 | public static void main(String[] args) 30 | throws Exception 31 | { 32 | try { 33 | Context ctx = ContextFactory.getGlobal().enterContext(); 34 | Scriptable scope = ctx.initStandardObjects(); 35 | 36 | evaluateSource(ctx, scope, args[0]); 37 | evaluateSource(ctx, scope, args[1]); 38 | 39 | emit(ctx, scope, args[2]); 40 | } finally { 41 | Context.exit(); 42 | } 43 | } 44 | 45 | private static void evaluateSource(Context ctx, Scriptable scope, String filename) 46 | throws Exception 47 | { 48 | FileInputStream fis = null; 49 | InputStreamReader isr = null; 50 | 51 | try { 52 | File f = new File(filename); 53 | fis = new FileInputStream(f); 54 | isr = new InputStreamReader(fis, "utf-8"); 55 | ctx.evaluateReader(scope, isr, f.getName(), 1, null); 56 | } finally { 57 | if (isr != null) { 58 | try { isr.close(); } catch (Exception ignored) {} 59 | } 60 | if (fis != null) { 61 | try { fis.close(); } catch (Exception ignored) {} 62 | } 63 | } 64 | } 65 | 66 | private static void emit(Context ctx, Scriptable scope, String filename) 67 | throws Exception 68 | { 69 | FileOutputStream fos = null; 70 | OutputStreamWriter osw = null; 71 | try { 72 | File f = new File(filename); 73 | fos = new FileOutputStream(f); 74 | osw = new OutputStreamWriter(fos, "utf-8"); 75 | emitToWriter(ctx, scope, osw); 76 | } finally { 77 | if (osw != null) { 78 | try { osw.close(); } catch (Exception ignored) {} 79 | } 80 | if (fos != null) { 81 | try { fos.close(); } catch (Exception ignored) {} 82 | } 83 | } 84 | } 85 | 86 | private static void emitToWriter(Context ctx, Scriptable 
scope, Writer w) 87 | throws Exception 88 | { 89 | emitPrologue(w); 90 | 91 | String[] names = { 92 | "BC1", "BC2", "BC3", 93 | "BP1", "BP2", 94 | "BQ1", "BQ2", "BQ3", "BQ4", 95 | "BW1", "BW2", "BW3", 96 | "TC1", "TC2", "TC3", "TC4", 97 | "TQ1", "TQ2", "TQ3", "TQ4", 98 | "TW1", "TW2", "TW3", "TW4", 99 | "UC1", "UC2", "UC3", "UC4", "UC5", "UC6", 100 | "UP1", "UP2", "UP3", 101 | "UQ1", "UQ2", "UQ3", 102 | "UW1", "UW2", "UW3", "UW4", "UW5", "UW6" 103 | }; 104 | 105 | Function getKeys = (Function)scope.get("getKeys", scope); 106 | Function getValues = (Function)scope.get("getValues", scope); 107 | Object bias = scope.get("BIAS", scope); 108 | 109 | w.write(" public static final int BIAS = " + Context.toString(bias) + ";\r\n"); 110 | for (String name: names) { 111 | Object keys = Context.toString(getKeys.call(ctx, scope, scope, new String[] {name})); 112 | Object vals = Context.toString(getValues.call(ctx, scope, scope, new String[] {name})); 113 | w.write(" public static final String[] " + name + "_KEYS" + " = " + keys + ";\r\n"); 114 | w.write(" public static final Integer[] " + name + "_VALS" + " = " + vals + ";\r\n"); 115 | } 116 | 117 | for (String name: names) { 118 | w.write(" public static final Map " + name + ";\r\n"); 119 | } 120 | 121 | emitStaticBlock(w, names); 122 | 123 | emitEpilogue(w); 124 | } 125 | 126 | private static void emitPrologue(Writer w) 127 | throws Exception 128 | { 129 | w.write("// Automatically generated. 
Do not edit.\r\n"); 130 | w.write("package net.moraleboost.tinysegmenter;\r\n\r\n"); 131 | w.write("import java.util.Collections;\r\n"); 132 | w.write("import java.util.Map;\r\n"); 133 | w.write("import java.util.HashMap;\r\n\r\n"); 134 | w.write("public class TinySegmenterConstants\r\n"); 135 | w.write("{\r\n"); 136 | } 137 | 138 | private static void emitEpilogue(Writer w) 139 | throws Exception 140 | { 141 | w.write("}\r\n"); 142 | } 143 | 144 | private static void emitStaticBlock(Writer w, String[] names) 145 | throws Exception 146 | { 147 | w.write(" static {\r\n"); 148 | w.write(" int i;\r\n"); 149 | w.write(" Map m;\r\n"); 150 | 151 | for (String name: names) { 152 | w.write(" "); 153 | w.write("m = new HashMap();\r\n"); 154 | w.write(" "); 155 | w.write("for (i=0; i<" + name + "_KEYS.length; ++i) {\r\n"); 156 | w.write(" "); 157 | w.write(" m.put(" + name + "_KEYS[i], " + name + "_VALS[i]);\r\n"); 158 | w.write(" "); 159 | w.write("}\r\n"); 160 | w.write(" "); 161 | w.write(name + " = Collections.unmodifiableMap(m);\r\n"); 162 | } 163 | 164 | w.write(" }\r\n"); 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /src/main/java/net/moraleboost/tinysegmenter/TinySegmenter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Based on TinySegmenter 0.1 -- Super compact Japanese tokenizer in Javascript 3 | * (c) 2008 Taku Kudo 4 | * TinySegmenter is freely distributable under the terms of a new BSD licence. 
5 | * For details, see http://chasen.org/~taku/software/TinySegmenter/LICENCE.txt 6 | * 7 | * Ported to Java by Kohei TAKETA 8 | */ 9 | package net.moraleboost.tinysegmenter; 10 | 11 | import net.moraleboost.io.CodePointReader; 12 | 13 | import java.io.IOException; 14 | import java.util.*; 15 | 16 | import static net.moraleboost.tinysegmenter.TinySegmenterConstants.*; 17 | 18 | /** 19 | * Java port of TinySegmenter. Characters are pulled from a CodePointReader and tokens are handed out one at a time by next(). 20 | * 21 | * @author taketa 22 | * 23 | */ 24 | public class TinySegmenter 25 | { /** One buffered character: its code point (cp), string form (str), character-type label (ctype) and the reader positions recorded before (start) and after (end) reading it; the synthetic begin/end markers use -1 positions. */ 26 | public static class CharInfo 27 | { 28 | public int cp; 29 | public String str; 30 | public String ctype; 31 | public long start; 32 | public long end; 33 | } 34 | /** A token: its text (str) and its start/end positions. */ 35 | public static class Token 36 | { 37 | public String str; 38 | public long start; 39 | public long end; 40 | /** Two tokens are equal when str, start and end all match. NOTE(review): hashCode() is not overridden together with equals() - confirm Token is never placed in a hash-based collection. */ 41 | public boolean equals(Object obj) 42 | { 43 | if (!(obj instanceof Token)) { 44 | return false; 45 | } 46 | 47 | Token another = (Token)obj; 48 | return ((str == null ? another.str == null : str.equals(another.str)) && 49 | (start == another.start) && (end == another.end)); 50 | } 51 | /** Renders the token as "(str,start,end)". */ 52 | public String toString() 53 | { 54 | return ("(" + 55 | str + "," + 56 | Long.toString(start) + "," + 57 | Long.toString(end) + ")"); 58 | } 59 | } 60 | /** Maps a code point to a one-letter character-type label: "M" (members of CHINESE_NUMBER_SET), "H", "I", "K", "A" (Latin letters), "N" (digits), otherwise "O". The bracketed comments in each branch show the character class the branch is meant to match. */ 61 | private static String getCharType(int cp) 62 | { 63 | if (CHINESE_NUMBER_SET.contains(cp)) { 64 | // [一二三四五六七八九十百千万億兆] 65 | return "M"; 66 | } else if ( 67 | (0x4E00 <= cp && cp <= 0x9fa0) || 68 | cp == '々' || cp == '〆' || cp == 'ヵ' || cp == 'ヶ') { 69 | // [一-龠々〆ヵヶ] 70 | return "H"; 71 | } else if (0x3041 <= cp && cp <= 0x3093) { 72 | // [ぁ-ん] 73 | return "I"; 74 | } else if ( 75 | (0x30a1 <= cp && cp <= 0x30f4) || cp == 0x30fc || 76 | (0xff71 <= cp && cp <= 0xff9e) || cp == 0xff70) { 77 | // [ァ-ヴーア-ン゙ー] 78 | return "K"; 79 | } else if ( 80 | ('a' <= cp && cp <= 'z') || ('A' <= cp && cp <= 'Z') || 81 | /* NOTE(review): the next row repeats the row above character for character, and the digit test that follows repeats its range too; presumably the second halves were meant to be the full-width ranges - confirm against the original file. */ ('a' <= cp && cp <= 'z') || ('A' <= cp && cp <= 'Z')) { 82 | // [a-zA-Za-zA-Z] 83 | return "A"; 84 | } else if ( 85 | ('0' <= cp && cp <= '9') || ('0' <= 
cp && cp <= '9')) { 86 | // [0-90-9] 87 | return "N"; 88 | } else { 89 | return "O"; 90 | } 91 | } 92 | /** Returns the code points of str in order; a surrogate pair counts as a single code point. */ 93 | private static List getCodePoints(String str) 94 | { 95 | int count = str.codePointCount(0, str.length()); 96 | List result = new ArrayList(count); 97 | int charIndex = 0; 98 | while (charIndex < str.length()) { 99 | int cp = str.codePointAt(charIndex); 100 | result.add(cp); 101 | charIndex += Character.charCount(cp); 102 | } 103 | 104 | return result; 105 | } 106 | /* Defaults used by the one-argument constructor. */ 107 | public static final int DEFAULT_BUFFER_SIZE = 4096; 108 | public static final int DEFAULT_MAX_TOKEN_SIZE = 255; 109 | /* Code points that getCharType() labels "M". */ 110 | @SuppressWarnings("serial") 111 | private static final Set CHINESE_NUMBER_SET = 112 | Collections.unmodifiableSet(new HashSet() {{ 113 | addAll(getCodePoints("一二三四五六七八九十百千万億兆")); 114 | }}); 115 | /* Input source and end-of-stream bookkeeping: eos is set once a negative code point has been read; eosCount counts the "E1".."E3" end markers handed out so far. */ 116 | private CodePointReader reader; 117 | private boolean eos; 118 | private int eosCount; 119 | /* Window of buffered characters (valid entries are buffer[0..end)), the cursor into it (position), the finished flag, and the three history labels p1..p3 that start as "U" and are shifted at the end of isBoundary(). */ 120 | private CharInfo[] buffer; 121 | private int end; 122 | private int position; 123 | private boolean done; 124 | private String p1; 125 | private String p2; 126 | private String p3; 127 | /* Characters collected for the token currently being built, and how many of them there are. */ 128 | private CharInfo[] tokenBuffer; 129 | private int tokenPosition; 130 | /** Creates a segmenter with DEFAULT_BUFFER_SIZE and DEFAULT_MAX_TOKEN_SIZE. */ 131 | public TinySegmenter(CodePointReader reader) 132 | { 133 | this(reader, DEFAULT_BUFFER_SIZE, DEFAULT_MAX_TOKEN_SIZE); 134 | } 135 | 136 | /** 137 | * Constructor. The two size constraints below are checked with assert only, so they are enforced only when assertions are enabled. 138 | * @param reader the CodePointReader to read from. 139 | * @param bufferSize size of the buffer; must be 7 or greater. 140 | * @param maxTokenSize maximum size of a token; must be 2 or greater. 141 | */ 142 | public TinySegmenter(CodePointReader reader, int bufferSize, int maxTokenSize) 143 | { 144 | assert (bufferSize > 6); 145 | assert (maxTokenSize > 1); 146 | 147 | this.reader = reader; 148 | this.p1 = "U"; 149 | this.p2 = "U"; 150 | this.p3 = "U"; 151 | this.buffer = new CharInfo[bufferSize]; 152 | this.tokenBuffer = new CharInfo[maxTokenSize]; 153 | } 154 | /** Reads one code point from the reader and wraps it in a CharInfo. Once a negative code point has been read, successive calls return the synthetic end markers "E1", "E2", "E3" (type "O") and then null. */ 155 | public CharInfo readChar() throws IOException 156 | { 157 | CharInfo c = new CharInfo(); 158 | 159 | if (eos) { 160 | c.start = -1; 
161 | c.end = -1; 162 | c.cp = -1; 163 | } else { 164 | c.start = reader.getPosition(); 165 | c.cp = reader.read(); 166 | c.end = reader.getPosition(); 167 | } 168 | 169 | if (c.cp < 0) { 170 | eos = true; 171 | switch (eosCount) { 172 | case 0: 173 | c.str = "E1"; 174 | c.ctype = "O"; 175 | ++eosCount; 176 | break; 177 | case 1: 178 | c.str = "E2"; 179 | c.ctype = "O"; 180 | ++eosCount; 181 | break; 182 | case 2: 183 | c.str = "E3"; 184 | c.ctype = "O"; 185 | ++eosCount; 186 | break; 187 | default: 188 | return null; 189 | } 190 | } else { 191 | c.str = new String(Character.toChars(c.cp)); 192 | c.ctype = getCharType(c.cp); 193 | } 194 | 195 | return c; 196 | } 197 | /** Seeds buffer[0..2] with the synthetic begin markers "B3", "B2", "B1" (cp 0, type "O", positions -1) and sets end and position to 3. */ 198 | private void initBuffer() 199 | { 200 | CharInfo c; 201 | 202 | { 203 | c = new CharInfo(); 204 | c.cp = 0; 205 | c.str = "B3"; 206 | c.ctype = "O"; 207 | c.start = -1; 208 | c.end = -1; 209 | buffer[0] = c; 210 | 211 | c = new CharInfo(); 212 | c.cp = 0; 213 | c.str = "B2"; 214 | c.ctype = "O"; 215 | c.start = -1; 216 | c.end = -1; 217 | buffer[1] = c; 218 | 219 | c = new CharInfo(); 220 | c.cp = 0; 221 | c.str = "B1"; 222 | c.ctype = "O"; 223 | c.start = -1; 224 | c.end = -1; 225 | buffer[2] = c; 226 | } 227 | end = 3; 228 | position = 3; 229 | } 230 | /** Slides the last (at most 6) buffered items to the front, resets position to 3, then reads characters until the buffer is full or readChar() returns null. Returns the number of items newly read. */ 231 | private int fillBuffer() 232 | throws IOException 233 | { 234 | CharInfo c; 235 | 236 | // Copy the last 6 items to the front. 237 | int src = end - 6; 238 | int dst = 0; 239 | if (src < 0) { 240 | src = 0; 241 | } 242 | while (src < end) { 243 | buffer[dst++] = buffer[src++]; 244 | } 245 | 246 | // Reset end and position. 247 | int start = dst; 248 | end = dst; 249 | position = 3; 250 | 251 | // Read data into the remaining part. 252 | while (end < buffer.length) { 253 | c = readChar(); 254 | if (c != null) { 255 | buffer[end++] = c; 256 | } else { 257 | break; 258 | } 259 | } 260 | 261 | return (end - start); 262 | } 263 | /** Returns the next token, or null when the stream is empty or the final token has already been returned. */ 264 | public Token next() 265 | throws IOException 266 | { 267 | if (done) { 268 | return null; 269 | } 270 | 271 | if (end <= 0) { 272 | // First call. 273 | 
initBuffer(); 274 | if (fillBuffer() > 3) { 275 | // Store the first character in tokenBuffer. 276 | tokenBuffer[tokenPosition++] = buffer[position++]; 277 | } else { 278 | // Empty stream. 279 | done = true; 280 | return null; 281 | } 282 | } 283 | 284 | Token token = null; 285 | do { 286 | while (position < end-3) { 287 | if (isBoundary()) { 288 | // Judged to be a token boundary. 289 | /* NOTE(review): when tokenPosition is 0 here nothing is consumed before the loop calls isBoundary() again - confirm this path always makes progress. */ if (tokenPosition > 0) { 290 | token = makeToken(); 291 | tokenBuffer[tokenPosition++] = buffer[position++]; 292 | break; 293 | } 294 | } else { 295 | // Not a token boundary. 296 | tokenBuffer[tokenPosition++] = buffer[position++]; 297 | if (tokenPosition >= tokenBuffer.length) { 298 | // The token buffer is full, so cut a token out here for now. 299 | token = makeToken(); 300 | break; 301 | } 302 | } 303 | } 304 | } while (token == null && fillBuffer() > 0); 305 | 306 | if (token == null) { 307 | // Cut out the final token. 308 | done = true; 309 | token = makeToken(); 310 | } 311 | 312 | return token; 313 | } 314 | /** Builds a Token from the characters held in tokenBuffer; the token's start comes from tokenBuffer[0]. NOTE(review): in this copy the text between "for (int i=0; i" and " 0) {" is missing, so the rest of makeToken() and almost all of isBoundary() cannot be reviewed here - restore it from the original source. The visible tail of isBoundary() sets the label to "B" and the result to true in its final branch, then shifts p1, p2, p3. */ 315 | private Token makeToken() 316 | { 317 | Token token = new Token(); 318 | StringBuilder builder = new StringBuilder(); 319 | 320 | token.start = tokenBuffer[0].start; 321 | for (int i=0; i 0) { 391 | p = "B"; 392 | result = true; 393 | } 394 | p1 = p2; 395 | p2 = p3; 396 | p3 = p; 397 | 398 | return result; 399 | } 400 | /** Looks up key in m and returns the stored score, or 0 when there is no entry. */ 401 | private int getScore(Map m, String key) 402 | { 403 | Integer s = m.get(key); 404 | return (s != null ? s : 0); 405 | } 406 | } 407 | -------------------------------------------------------------------------------- /src/main/java/net/moraleboost/tinysegmenter/TinySegmenterConstants.java: -------------------------------------------------------------------------------- 1 | // Automatically generated. Do not edit.
2 | package net.moraleboost.tinysegmenter; 3 | 4 | import java.util.Collections; 5 | import java.util.HashMap; 6 | import java.util.Map; 7 | 8 | public class TinySegmenterConstants 9 | { 10 | public static final int BIAS = -332; 11 | public static final String[] BC1_KEYS = {"OH","II","HH","KH"}; 12 | public static final Integer[] BC1_VALS = {-1378,2461,6,406}; 13 | public static final String[] BC2_KEYS = {"AN","MK","HH","IA","KI","KK","HM","AA","HN","HO","IH","II","IK","AI","IO","MH","OO"}; 14 | public static final Integer[] BC2_VALS = {-878,3334,-4070,1327,3831,-8741,-1711,-3267,4012,3761,-1184,-1332,1721,2744,5492,-3132,-2920}; 15 | public static final String[] BC3_KEYS = {"MK","MM","HH","HI","HK","OA","KK","HN","HO","IH","OH"}; 16 | public static final Integer[] BC3_VALS = {1079,4034,996,626,-721,-1652,2762,-1307,-836,-301,266}; 17 | public static final String[] BP1_KEYS = {"BB","UB","OB","OO"}; 18 | public static final Integer[] BP1_VALS = {295,352,304,-125}; 19 | public static final String[] BP2_KEYS = {"BO","OO"}; 20 | public static final Integer[] BP2_VALS = {60,-1762}; 21 | public static final String[] BQ1_KEYS = {"BHM","OHI","OKH","OKK","BII","BOH","OIH","BIM","BOO","BMH","OKA","OOO","BHH","BNH"}; 22 | public static final Integer[] BQ1_VALS = {1521,451,-1020,904,-1158,-91,-296,886,-2597,1208,1851,2965,1150,449}; 23 | public static final String[] BQ2_KEYS = {"BKK","OHH","BHM","BKO","BIH","OHM","OIH","UHI","BHH","BHI"}; 24 | public static final Integer[] BQ2_VALS = {-1720,-1139,466,864,-919,-181,153,-1146,118,-1159}; 25 | public static final String[] BQ3_KEYS = {"OHH","OKH","OKI","BNN","BII","OHM","BOH","OKO","OII","BMH","OMH","OOO","BMM","BHH","BHI","BKI"}; 26 | public static final Integer[] BQ3_VALS = {2174,1798,-793,998,-299,439,775,-2242,280,937,-2402,11699,8335,-792,2664,419}; 27 | public static final String[] BQ4_KEYS = {"BKK","OHH","OHK","BIH","BII","BIK","ONN","BOO","OAH","BMI","BHH"}; 28 | public static final Integer[] BQ4_VALS = 
{-1806,266,-2036,3761,-4654,1348,-973,-12396,926,-3385,-3895}; 29 | public static final String[] BW1_KEYS = {"引き","から","いう","を見","平方","B1同","てい","たち","大阪","B1あ","ませ","取り","には","てき","すで","毎日","どこ","なん","さら","こと","まで","の中","そこ","いっ","がら","とみ","さん","にも","った","ない","」と","つい","ため","した","うん","本当","でき","、と","やむ","よっ","まま","して","、同","に対","亡く","B1同","」と","です","大き","B1あ","をし","あっ","まる","京都","こん","なっ","とい","いる",",と","れた","など",",同","の一","目指","うし","れで","では","それ","こう","にし","日本"}; 30 | public static final Integer[] BW1_VALS = {-1336,3472,1743,731,-2314,542,805,1122,1497,1404,2448,-2784,1498,1249,-3399,-2113,3887,-1113,-4143,2083,1711,741,1977,-2055,600,1922,4573,1671,3463,5713,1682,-802,601,2641,665,-2423,1127,660,-1947,-2565,2600,1104,727,-912,-1886,542,1682,3445,-2604,1404,1860,1505,-2155,2558,-1262,3015,-4915,672,660,2369,7379,727,-501,-724,-4817,-913,844,-871,-790,2468,-195}; 31 | public static final String[] BW2_KEYS = {"――","れば","とこ","に対","11","んだ","はい","くな","一部","委員","ので","でも","いう","のに","はが","んな","新聞","とと","のの","会社","同党","との","もい","めて","しい","はず","一方","を通","少な","しか","上が","され","とみ","−−","とも","ない","本人","った","さん","に関","なが","って","っと","手権","した","かし","らか","曜日","年度","して","その","しな","もの","一人","東京","がい","らし","米国","一日","など","にお","うか","日米","たい","なの","らに","大阪","にし","府県","かも","りし","社会","から","まし","かれ","ばれ","てい","たた","にな","ただ","たち","第に","われ","てき","たと","てく","なん","同日", "","まで","きた","たは","こと","然と","この","がら","りま","でい","によ","11","でき","に従","ては","立て","でし","です","まれ","れた","ても","とい","分の","のか","ろう","出て","日本","れて","年間","日新","朝鮮","させ"}; 32 | public static final Integer[] BW2_VALS = 
{-5730,4114,-1746,-14943,-669,728,1073,-1597,-1051,-1250,-7059,-4203,-1609,-6041,-1033,-4115,-4066,-2279,-6125,-1116,970,720,2230,-3153,-1819,-2532,-1375,-11877,-1050,-545,-4479,13168,5168,-13175,-3941,-2488,-2697,4589,-3977,-11388,-1313,1647,-2094,-1982,5078,-1350,-944,-601,-8669,972,-3744,939,-10713,602,-1543,853,-1611,-4268,970,-6509,-1615,2490,3372,-1253,2614,-1897,-2471,2748,-2363,-602,651,-1276,-7194,-1316,4612,1813,6144,-662,2454,-3857,-786,-1612,7901,3640,1224,2551,3099,-913,-11822,-6621,1941,-939,-8392,-1384,-4193,-3198,1620,2666,-7236,-669,-1528,-4688,-3110,-990,-3828,-4761,5409,4270,-3065,1890,-7758,2093,6067,2163,-7068,849,-1626,-722,-2355,4533}; 33 | public static final String[] BW3_KEYS = {"でに","市","るる","では","れば","日、","た.","とし","が、","す.","んだ","に、","いい","んで","どう","いえ","新聞","た。","あり","ある","いく","れる","との","す。","そう","しい","だ.","ず,","カ月","いた","いっ","大会","とも","さを","ない","った","だ。","かけ","って","ず、","した","なく","れ,","して","しな","かっ","らし","けど","カ月","れ、","かに","がき","の,","など","がけ","いる","たい","しま","いわ","会議","にし","がっ","の、","うち","社会","から","かり","うと","の子","まし","てい","は,","ます","にな","い.","てお","われ","には","まっ","られ","まで","たの","きた","し,","こと","は、","べき","この","い。","がら","がり","か.","だっ","し、","たり","たる","さい","始め","ずに","する","です","か。","まれ","日,","ころ","あた","れた","えと","が,","ても","とう","れて","入り","に,"}; 34 | public static final Integer[] BW3_VALS = {-1482,965,3818,2295,-3246,974,8875,2266,1816,-1310,606,-1021,5308,798,4664,2079,-5055,8875,719,3846,3029,1091,541,-1310,428,-3714,4098,3426,990,2056,1883,2217,-3543,976,1796,-4748,4098,-743,300,3426,3562,-903,854,1449,2608,-4098,1479,1374,990,854,-669,-4855,-724,2135,-1127,5600,-594,1200,1527,860,1771,-913,-724,1117,2024,6520,-2670,4798,-1000,1113,6240,1337,6943,1906,-1185,855,-605,2644,-1549,6820,6154,812,1645,1557,7397,1337,2181,1542,-1185,-4977,-2064,2857,1004,1557,-1183,-853,-714,1681,841,6521,1437,2857,-793,974,-2757,-2194,1850,1454,1816,302,-1387,1375,1232,-1021}; 35 | public static final String[] TC1_KEYS = 
{"HOM","MMH","AAA","IHI","OOI","HHH","IOH","HHM","IOI","HII","HOH","IOM"}; 36 | public static final Integer[] TC1_VALS = {-331,187,1093,1169,-1832,1029,-142,580,-1015,998,-390,467}; 37 | public static final String[] TC2_KEYS = {"IHI","OII","HMM","KKH","HHO","HII"}; 38 | public static final Integer[] TC2_VALS = {-1965,-2649,-1154,703,2088,-1023}; 39 | public static final String[] TC3_KEYS = {"HHH","HHI","KOK","IOI","IIH","AAA","KKA","IIM","MHH","OHO","KKH","KHH","MHM","MHO","IHH","IHI","MMH","IHO","HOH","NNH","HII","HIK","NNO"}; 40 | public static final Integer[] TC3_VALS = {346,-341,-1009,-542,-825,-294,491,-1035,-2694,-3393,-1217,-1216,-457,123,128,-3041,-471,-1935,-1486,-1689,-1088,731,662}; 41 | public static final String[] TC4_KEYS = {"MOM","HHH","HHI","HHK","HHM","IIH","HHN","III","HHO","KKA","IOO","MHH","IIO","MHI","KKK","IHH","MMH","IHO","HOH","MMM","HIH","HII","KAK"}; 42 | public static final Integer[] TC4_VALS = {841,-203,1344,365,-122,321,182,1497,669,3386,54,-405,656,201,3065,695,-241,-2324,446,661,804,679,4845}; 43 | public static final String[] TQ1_KEYS = {"BHIH","OHHH","BOHH","OIIH","BNHH","OHIH","BIHH","BHHH","BHHI","OAKK","BIII","BOOO","OIHI"}; 44 | public static final Integer[] TQ1_VALS = {-132,281,225,-68,-744,249,60,-227,316,482,1595,-908,200}; 45 | public static final String[] TQ2_KEYS = {"BIHH","BKAK","BOOO","BIII"}; 46 | public static final Integer[] TQ2_VALS = {-1401,-543,-5591,-1033}; 47 | public static final String[] TQ3_KEYS = {"BHIH","BHII","OHII","OKAK","OOII","BHHH","OHHH","OHHI","BHHM","BIIH","BIII","OIIH","OOHH","OKKA","BMHI","BMHM","OHMH","OKHH","BOMH","OIHH"}; 48 | public static final Integer[] TQ3_VALS = {222,-504,997,2792,-685,478,346,1729,-1073,-116,-105,1344,110,679,-863,-464,481,587,620,623}; 49 | public static final String[] TQ4_KEYS = {"BHII","OHHH","OHHI","OKAK","OIIH","OIII","OHHO","OHIH","BHHH","OAKK","BIIH","BIII","OAAA","OIHH","BHHM","OIHI"}; 50 | public static final Integer[] TQ4_VALS = 
{-966,-294,2446,-8156,626,-4007,480,-1573,-721,180,-607,-2181,-2763,1935,-3604,-493}; 51 | public static final String[] TW1_KEYS = {"東京都","につい"}; 52 | public static final Integer[] TW1_VALS = {2026,-4681}; 53 | public static final String[] TW2_KEYS = {"だって","しょう","として","ある程","大きな","その後","ともに","ころが","対して","もので","社会党","ていた","一気に","いった","初めて","同時に"}; 54 | public static final Integer[] TW2_VALS = {-1049,3873,-4657,-2049,-1255,-4430,-4517,-2434,-2721,1882,-3216,1833,-792,-1256,-1512,-8097}; 55 | public static final String[] TW3_KEYS = {"ので、","として","のもの","にとっ","いただ","につい","してい","ので,","十二月","れから","に当た"}; 56 | public static final Integer[] TW3_VALS = {-727,-4314,-600,-5989,-1734,-5483,1314,-727,-2287,-3752,-6247}; 57 | public static final String[] TW4_KEYS = {"からな","ました","という","いう.","ようと","よると","たが,","ている","してい","いう。","ません","たが、"}; 58 | public static final Integer[] TW4_VALS = {-2348,5543,1349,8576,-4258,5865,1516,1538,2958,8576,1097,1516}; 59 | public static final String[] UC1_KEYS = {"M","O","K","A"}; 60 | public static final Integer[] UC1_VALS = {645,-505,93,484}; 61 | public static final String[] UC2_KEYS = {"M","N","O","H","I","A"}; 62 | public static final Integer[] UC2_VALS = {3987,5775,646,1059,409,819}; 63 | public static final String[] UC3_KEYS = {"A","I"}; 64 | public static final Integer[] UC3_VALS = {-1370,2311}; 65 | public static final String[] UC4_KEYS = {"M","N","O","H","I","K","A"}; 66 | public static final Integer[] UC4_VALS = {3565,3876,6646,1809,-1032,-3450,-2643}; 67 | public static final String[] UC5_KEYS = {"M","O","H","I","K"}; 68 | public static final Integer[] UC5_VALS = {539,-831,313,-1238,-799}; 69 | public static final String[] UC6_KEYS = {"M","O","H","I","K"}; 70 | public static final Integer[] UC6_VALS = {247,-387,-506,-253,87}; 71 | public static final String[] UP1_KEYS = {"O"}; 72 | public static final Integer[] UP1_VALS = {-214}; 73 | public static final String[] UP2_KEYS = {"B","O"}; 74 | public static final Integer[] UP2_VALS = 
{69,935}; 75 | public static final String[] UP3_KEYS = {"B"}; 76 | public static final Integer[] UP3_VALS = {189}; 77 | public static final String[] UQ1_KEYS = {"BH","BI","BK","BN","BO","OH","OI","OK","OO"}; 78 | public static final Integer[] UQ1_VALS = {21,-12,-99,142,-56,-95,477,410,-2422}; 79 | public static final String[] UQ2_KEYS = {"BH","BI","OK"}; 80 | public static final Integer[] UQ2_VALS = {216,113,1759}; 81 | public static final String[] UQ3_KEYS = {"BH","BI","BK","BM","BN","BO","OI","BA","ON"}; 82 | public static final Integer[] UQ3_VALS = {42,1913,-7198,3160,6427,14761,-827,-479,-3212}; 83 | public static final String[] UW1_KEYS = {"京","あ","委","う","が","き","「","こ","・","大","区","市","、","国","午","で","と","ど",",","に","「","の","は","日","生","理","都","も","や","よ","ら","県","り","主","れ","を","ん","・"}; 84 | public static final Integer[] UW1_VALS = {-268,-941,729,-127,-553,121,-463,505,-135,561,-912,-411,156,-460,871,-201,-547,-123,156,-789,-463,-185,-847,-141,-408,361,-718,-466,-470,182,-292,-386,208,-402,169,-446,-137,-135}; 85 | public static final String[] UW2_KEYS = {"揺","市","も","会","や","保","よ","最","り","初","る","れ","文","第","入","を","ん","自","ア","朝",",","カ","キ","事","本","西","新","「","」","、","見","ッ","ッ","北","〇","ア","小","子","「","カ","」","目","キ","開","相","間","副","大","学","天","太","理","人","区","県","日","立","次","三","年","不","強","東","込","世","あ","行","い","う","政","お","か","が","手","く","こ","中","さ","ざ","明","し","発","実","す","米","せ","そ","た","だ","民","主","つ","て","果","で","気","と","ど","な","議","に","の","は","ひ","調","べ","ま"}; 86 | public static final Integer[] UW2_VALS = 
{-1033,-813,-1263,978,-402,362,1639,-630,-579,-3025,-694,571,-1355,810,548,-2516,2095,-1353,-587,-1843,-829,306,568,492,-1650,-744,-1682,-645,3145,-829,-3874,831,831,-3414,892,-587,-2009,-1519,-645,306,3145,-1584,568,1758,-242,-1257,-1566,-1769,760,-865,-483,752,-123,-422,-1165,-1815,-763,-2378,-758,-1060,-2150,1067,-931,3041,-302,-538,838,505,134,1522,-502,1454,-856,-1519,-412,1141,-968,878,540,-1462,1529,529,1023,-675,509,300,-1011,188,1837,-180,-861,-949,-291,-665,-268,-1740,-981,1273,1063,1198,-1764,130,-409,-1273,1010,1261,600}; 87 | public static final String[] UW3_KEYS = {"1","低","前","関","何","作","李","村","費","口","込","立","、","学","総","々","副","〇","日","旧","右",",","」","線","平","年","〓","一","森","知","東","国","各","下","合","海","広","非","同","安","米","指","世","力","的","能","両","氏","民","府","実","思","中","あ","い","度","う","性","え","お","か","昨","が","生","主","く","け","げ","家","こ","ご","さ","用","し","元","す","通","せ","そ","第","グ","た","ち","っ","つ","て","時","で","と","町","ど","な","に","動","の","は","務","党","ひ","保","私","ふ","へ","ほ","ま","全","み","め","公","も","六","や","共","よ","ら","車","り","る","れ","軍","わ","を","金","ん","業","物","建","1","円","予","二","ア","決","再","直","和","型","特","英","小","化","少","北","系","グ","省","外","約","選","ス","者","県","税","ッ","ト","無","級","人","区","戸","千","核","今","午","ム","政","他","協","ル","ロ","」","・","当","ン","員","以","ッ","・","調","ア","教","州","法","曜","ス","−","駅","郎","ト","数","ム","分","市","自","郡","ル","最","統","ロ","ン","部","文","月","雨","初","得","長","別","電","期","見","場","開","新","妻","間","財"}; 88 | public static final Integer[] UW3_VALS = 
{-800,811,2286,-1282,4265,-361,3094,364,1777,483,-1504,-960,4889,-1356,1163,-2311,4437,5827,2099,5792,1233,4889,2670,1255,-1804,2416,-3573,-1619,2438,-1528,-805,642,3588,-1759,-241,-495,-1030,2066,3906,-423,7767,-3973,-2087,365,7313,725,3815,2613,-1694,1605,-1008,-1291,653,-2696,1006,1452,2342,1822,1983,-4864,-1163,-661,3271,-273,-758,1004,388,401,1078,-3552,-3116,-1058,914,-395,4858,584,-1136,3685,-5228,1201,1319,842,-521,-1444,-1081,6167,-1248,2318,1691,1215,-899,-2788,2745,-949,4056,4555,-1872,3593,-2171,-2439,4231,-1798,1199,-5516,-4384,1574,-120,1205,-3030,2323,755,-788,-1880,-202,727,1835,649,5905,2773,1375,-1207,6620,2163,-518,484,461,-2352,-800,5807,-1193,974,551,-1073,3095,-1835,-837,1389,-3850,785,-513,1327,-3102,-1038,3066,1319,792,-241,3663,-681,874,6457,6293,401,-1350,521,979,1384,2742,4646,-488,-2309,5156,792,-783,1109,-2013,1889,-1006,1591,2201,2670,-3794,-3885,278,4513,-1368,-1350,-3794,-562,551,-1479,1155,1868,-951,874,-1723,1620,1026,521,3222,1109,457,3197,-2869,4404,1591,-937,-4229,2201,278,1200,-1489,4125,2009,2475,1905,421,1129,-1045,360,1044,1219,-1432,1764,2016,1302,-733}; 89 | public static final String[] UW4_KEYS = 
{"般","前","体","子","作","回","込","立","、","。","学","総","副","〇","行","日","来","「",",","」",".","線","近","年","〓","島","一","国","賞","庁","合","警","米","署","園","議","力","的","能","率","定","氏","民","気","中","あ","い","う","性","え","地","お","か","が","き","生","ぎ","く","け","産","げ","こ","ご","さ","し","じ","す","ず","せ","そ","先","田","第","た","だ","ち","っ","つ","て","時","で","と","町","な","に","ぬ","動","ね","の","館","は","ば","務","党","ひ","び","ふ","へ","べ","ほ","ま","み","む","め","も","ゃ","や","士","共","ょ","よ","ら","車","り","―","る","れ","軍","ろ","わ","野","を","ん","業","道","物","寺","内","円","予","目","事","高","和","院","井","カ","小","化","系","球","省","済","コ","多","約","選","者","セ","県","大","ッ","校","ト","沢","人","区","支","改","首","領","際","所","メ","政","屋","ラ","輪","リ","協","ル","「","」","・","ン","谷","員","以","ッ","ー","川","・","教","ー","経","カ","器","コ","セ","側","山","郎","ト","題","メ","市","ラ","リ","ル","最","統","ン","文","後","空","月","会","初","長","都","感","電","銀","規","木","場","間","参","塁","方"}; 90 | public static final Integer[] UW4_VALS = {-852,1623,-1286,-4802,530,1500,-3370,-2112,3930,3508,-1397,940,3879,4999,-792,1798,-442,1895,3930,3798,3508,-994,929,374,-5156,-2056,-2069,-619,730,-4556,-1834,-1184,2937,749,-1200,-244,-302,2586,-730,672,-1057,5388,-2716,-910,2210,4752,-3435,-640,553,-2514,866,2405,530,6006,-4482,-1286,-3821,-3788,-4376,-1101,-4734,2255,1979,2864,-843,-2506,-731,1251,181,4091,601,-2900,788,5034,5408,-3654,-5882,-1659,3994,1829,7410,4547,1826,5433,6499,1853,-740,1413,7396,-1984,8578,1940,-2715,-2006,4249,-4134,1345,6665,-744,1464,1051,-2082,-882,-5046,4169,-2666,2795,-1413,-1212,-1544,3351,-2922,-1481,-9726,-4841,-14896,-2613,1158,-4570,-1783,-1100,13150,-2352,-1043,-1291,-735,-809,584,788,782,922,-190,2120,-681,-2297,-1768,2145,1910,776,786,-1267,-3485,-543,1789,1067,2171,2596,2145,1287,2997,571,-724,-360,-403,-939,1036,4517,856,787,1749,-1659,-2604,-1566,-1635,2182,-1328,-881,-1433,-541,1013,-856,1895,3798,-4371,-3637,-1000,-910,544,-724,-11870,-2667,-4371,704,-11870,1146,2145,-851,1789,1287,4292,-1500,-4866,-403,-792,-1635,2771,-881,-541,-856,845,-1169,-3637,522,456,-
867,-9066,950,1347,357,1192,916,-878,-2213,792,-485,-1410,-2344,1555,-2094,-856}; 91 | public static final String[] UW5_KEYS = {"み","市","1","め","ゃ","会","党","ょ","務","り","る","E2","れ","嵐","田","わ","郎","月","を","ん","町","題","統","空","イ","席",",",".","館","新","「","長","、","。","査","イ","「","京","相","E2","間","]","大","学","省","社","区","県","ル","日","機","ル","者","年","ン","ン","選","あ","所","い","う","格","え","お","か","が","き","ぎ","く","員","げ","定","中","さ","し","語","す","挙","思","表","氏","だ","ち","的","っ","つ","て","1","で","と","ど","な","議","に","の","は","研","べ","告"}; 92 | public static final Integer[] UW5_VALS = {502,-2991,-514,865,3350,-1153,-654,854,3519,-208,429,-32768,504,-1304,240,419,-368,-4353,-1264,327,-3912,2368,1955,-813,241,921,465,-299,-689,-1682,363,786,465,-299,932,241,363,722,1319,-32768,1191,-2762,-1296,-548,-1052,-278,-901,-4003,451,218,-1508,451,-2233,1763,-343,-343,-1018,1655,-814,331,-503,1356,1199,527,647,-421,1624,1971,312,2104,-983,1785,-871,-1537,-1371,-1073,-852,1618,872,663,-1347,-1186,1093,-3149,52,921,-18,-514,-850,-127,1682,-787,1219,-1224,-635,-578,-997,1001,848}; 93 | public static final String[] UW6_KEYS = {"1","E1","あ","空","委","う","業","か","が","会","く","一","郎","こ","じ","区","す","学","E1","市","1","た","、","。","っ","連","て","で","と",",","な","に","後",".","の","は","福","相","中","広","も","社","員","ル","前","件","り","る","ン","ル","を","ン","者"}; 94 | public static final Integer[] UW6_VALS = {-270,306,-307,-822,798,189,-697,241,-73,624,-121,-277,1082,-200,1782,1792,383,-960,306,887,-270,-428,227,808,573,463,-1014,101,-105,227,-253,-149,535,808,-417,-236,974,753,201,-695,-206,-507,-1212,-673,302,-800,187,-135,-496,-673,195,-496,1811}; 95 | public static final Map BC1; 96 | public static final Map BC2; 97 | public static final Map BC3; 98 | public static final Map BP1; 99 | public static final Map BP2; 100 | public static final Map BQ1; 101 | public static final Map BQ2; 102 | public static final Map BQ3; 103 | public static final Map BQ4; 104 | public static final Map BW1; 105 | public static final Map BW2; 
106 | public static final Map BW3; 107 | public static final Map TC1; 108 | public static final Map TC2; 109 | public static final Map TC3; 110 | public static final Map TC4; 111 | public static final Map TQ1; 112 | public static final Map TQ2; 113 | public static final Map TQ3; 114 | public static final Map TQ4; 115 | public static final Map TW1; 116 | public static final Map TW2; 117 | public static final Map TW3; 118 | public static final Map TW4; 119 | public static final Map UC1; 120 | public static final Map UC2; 121 | public static final Map UC3; 122 | public static final Map UC4; 123 | public static final Map UC5; 124 | public static final Map UC6; 125 | public static final Map UP1; 126 | public static final Map UP2; 127 | public static final Map UP3; 128 | public static final Map UQ1; 129 | public static final Map UQ2; 130 | public static final Map UQ3; 131 | public static final Map UW1; 132 | public static final Map UW2; 133 | public static final Map UW3; 134 | public static final Map UW4; 135 | public static final Map UW5; 136 | public static final Map UW6; 137 | static { 138 | int i; 139 | Map m; 140 | m = new HashMap(); 141 | for (i=0; i(); 146 | for (i=0; i(); 151 | for (i=0; i(); 156 | for (i=0; i(); 161 | for (i=0; i(); 166 | for (i=0; i(); 171 | for (i=0; i(); 176 | for (i=0; i(); 181 | for (i=0; i(); 186 | for (i=0; i(); 191 | for (i=0; i(); 196 | for (i=0; i(); 201 | for (i=0; i(); 206 | for (i=0; i(); 211 | for (i=0; i(); 216 | for (i=0; i(); 221 | for (i=0; i(); 226 | for (i=0; i(); 231 | for (i=0; i(); 236 | for (i=0; i(); 241 | for (i=0; i(); 246 | for (i=0; i(); 251 | for (i=0; i(); 256 | for (i=0; i(); 261 | for (i=0; i(); 266 | for (i=0; i(); 271 | for (i=0; i(); 276 | for (i=0; i(); 281 | for (i=0; i(); 286 | for (i=0; i(); 291 | for (i=0; i(); 296 | for (i=0; i(); 301 | for (i=0; i(); 306 | for (i=0; i(); 311 | for (i=0; i(); 316 | for (i=0; i(); 321 | for (i=0; i(); 326 | for (i=0; i(); 331 | for (i=0; i(); 336 | for (i=0; i(); 341 | 
for (i=0; i(); 346 | for (i=0; i ret = new ArrayList(); 35 | CharacterIterator iter = new StringCharacterIterator(str); 36 | StringBuilder token; 37 | char c = DONE; 38 | 39 | for (c=iter.first(); c!=DONE; c=iter.next()) { 40 | // トークン先頭の空白を読み飛ばす 41 | while (c == ' ' || c == '\t') { 42 | c = iter.next(); 43 | } 44 | 45 | token = new StringBuilder(); 46 | if (c == '"') { 47 | // クォート文字列 48 | // 終わりまで読む 49 | while ((c = iter.next()) != DONE) { 50 | if (c == '"') { 51 | c = iter.next(); 52 | // 2つ連続する「"」は、エスケープされた「"」 53 | if (c == '"') { 54 | // これはエスケープされた「"」 55 | token.append(c); 56 | } else { 57 | // クォート文字列の終わり 58 | break; 59 | } 60 | } else { 61 | token.append(c); 62 | } 63 | } 64 | // ","まで文字列を読み飛ばす 65 | while (c != DONE && c != ',') { 66 | c = iter.next(); 67 | } 68 | } else { 69 | // 次の","まで、トークンを読み取る 70 | while (c != DONE && c != ',') { 71 | token.append(c); 72 | c = iter.next(); 73 | } 74 | } 75 | 76 | --max; 77 | if (max <= 0) { 78 | // これ以上の文字列は、すべて最後のトークンにマージする 79 | while (c != DONE) { 80 | token.append(c); 81 | c = iter.next(); 82 | } 83 | } 84 | 85 | ret.add(token.toString()); 86 | } 87 | 88 | // 汚いハック: 文字列が","で終わる場合、最後に空白要素を追加する 89 | if (max > 0 && str.endsWith(",")) { 90 | ret.add(""); 91 | } 92 | 93 | return ret.toArray(new String[ret.size()]); 94 | } 95 | 96 | /** 97 | * CSVの要素内で使用できない文字をエスケープする 98 | * 99 | * @param str エスケープする文字列 100 | * @return エスケープされた文字列 101 | */ 102 | public static String escape(String str) 103 | { 104 | // 「 」「\t」「"」「,」のいずれかが含まれていれば、ダブルクォーテーションで囲む 105 | StringBuilder ret = null; 106 | char c; 107 | for (int i=0; i elements) 143 | { 144 | StringBuilder b = new StringBuilder(); 145 | 146 | boolean first = true; 147 | for (String e: elements) { 148 | if (first) { 149 | first = false; 150 | } else { 151 | b.append(","); 152 | } 153 | b.append(escape(e)); 154 | } 155 | 156 | return b.toString(); 157 | } 158 | 159 | /** 160 | * elementsの要素をカラムとして、一行のCSVデータを作成して返す。 161 | * 162 | * @param elements 各項目の値 163 | * @return 一行分のCSVデータ 164 
*/
    public static String join(String[] elements)
    {
        return join(Arrays.asList(elements));
    }
}
--------------------------------------------------------------------------------
/src/test/java/net/moraleboost/io/BasicCodePointReaderTest.java:
--------------------------------------------------------------------------------
/*
**
** Feb. 17, 2009
**
** The author disclaims copyright to this source code.
** In place of a legal notice, here is a blessing:
**
** May you do good and not evil.
** May you find forgiveness for yourself and forgive others.
** May you share freely, never taking more than you give.
**
** Stolen from SQLite :-)
** Any feedback is welcome.
** Kohei TAKETA
**
*/
package net.moraleboost.io;

import org.junit.Test;

import java.io.CharArrayReader;
import java.io.CharArrayWriter;
import java.io.IOException;
import java.io.StringReader;

import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;

/**
 * Tests for {@link BasicCodePointReader}.
 *
 * Each test feeds UTF-16 text to the reader and checks two things per code
 * point: the value returned by {@code read()} and the value of
 * {@code getPosition()} immediately afterwards, which is expected to be the
 * char offset just past the code point that was read.
 */
public class BasicCodePointReaderTest
{
    /** A code point outside the BMP; it occupies two chars (a surrogate pair). */
    private static final int SCP = 0x00010400;

    /** Well-formed, BMP-only text used around the lone surrogates. */
    private static final String BASE = "本日は晴天なり";

    /** Plain BMP text: every code point is exactly one char. */
    @Test
    public void testBasic() throws IOException
    {
        String str = "あaいbうcえdお";
        assertReadsAs(new BasicCodePointReader(new StringReader(str)), str);
    }

    /** Well-formed surrogate pairs at the start, in the middle and at the end. */
    @Test
    public void testSurrogatePair() throws IOException
    {
        int[] answer = new int[] { SCP, 'あ', 'a', 'い', SCP, SCP, 'd', 'お', SCP };
        long[] positions = new long[] { 0, 2, 3, 4, 5, 7, 9, 10, 11, 13 };

        String str = new String(answer, 0, answer.length);
        CodePointReader reader = new BasicCodePointReader(new StringReader(str));
        if (!match(reader, answer, positions)) {
            fail("コードポイントが一致しません。");
        }
    }

    /** Input ending in an unpaired high surrogate. */
    @Test
    public void testEndWithHighSurrogate() throws IOException
    {
        char highSurrogate = Character.toChars(SCP)[0];
        assertTrue(Character.isHighSurrogate(highSurrogate));

        CharArrayWriter writer = new CharArrayWriter();
        writer.write(BASE);
        writer.write(highSurrogate);

        assertReadsAs(readerOver(writer), BASE + alternation());
    }

    /** Input ending in an unpaired low surrogate. */
    @Test
    public void testEndWithLowSurrogate() throws IOException
    {
        char lowSurrogate = Character.toChars(SCP)[1];
        assertTrue(Character.isLowSurrogate(lowSurrogate));

        CharArrayWriter writer = new CharArrayWriter();
        writer.write(BASE);
        writer.write(lowSurrogate);

        assertReadsAs(readerOver(writer), BASE + alternation());
    }

    /** Input starting with an unpaired high surrogate. */
    @Test
    public void testStartWithHighSurrogate() throws IOException
    {
        char highSurrogate = Character.toChars(SCP)[0];
        assertTrue(Character.isHighSurrogate(highSurrogate));

        CharArrayWriter writer = new CharArrayWriter();
        writer.write(highSurrogate);
        writer.write(BASE);

        assertReadsAs(readerOver(writer), alternation() + BASE);
    }

    /** Input starting with an unpaired low surrogate. */
    @Test
    public void testStartWithLowSurrogate() throws IOException
    {
        char lowSurrogate = Character.toChars(SCP)[1];
        assertTrue(Character.isLowSurrogate(lowSurrogate));

        CharArrayWriter writer = new CharArrayWriter();
        writer.write(lowSurrogate); // low surrogate
        writer.write(BASE);

        assertReadsAs(readerOver(writer), alternation() + BASE);
    }

    /** A surrogate pair in the middle of the text with one half overwritten. */
    @Test
    public void testIllformedSurrogate() throws IOException
    {
        int[] original = new int[] { 'あ', 'a', 'い', SCP, SCP, 'd', 'お' };
        // Expected result when the high surrogate has been destroyed
        int[] answer1 = new int[] { 'あ', 'a', 'い', 'a',
                BasicCodePointReader.DEFAULT_ALTERNATION_CODEPOINT, SCP, 'd',
                'お' };
        long[] positions1 = new long[] { 0, 1, 2, 3, 4, 5, 7, 8, 9 };
        // Expected result when the low surrogate has been destroyed
        int[] answer2 = new int[] { 'あ', 'a', 'い',
                BasicCodePointReader.DEFAULT_ALTERNATION_CODEPOINT, 'a', SCP,
                'd', 'お' };
        long[] positions2 = new long[] { 0, 1, 2, 3, 4, 5, 7, 8, 9 };

        // Build the ill-formed data
        char[] chars1 = new String(original, 0, original.length).toCharArray();
        char[] chars2 = new String(original, 0, original.length).toCharArray();

        // Destroy the high surrogate (the low surrogate is left on its own)
        chars1[3] = 'a';
        String ill1 = new String(chars1);

        // Destroy the low surrogate (the high surrogate is left on its own)
        chars2[4] = 'a';
        String ill2 = new String(chars2);

        if (!match(new BasicCodePointReader(new StringReader(ill1)), answer1,
                positions1)) {
            fail("Low surrogateが単独で存在する場合のコードポイントが一致しません。");
        }
        if (!match(new BasicCodePointReader(new StringReader(ill2)), answer2,
                positions2)) {
            fail("High surrogateが単独で存在する場合のコードポイントが一致しません。");
        }
    }

    /**
     * Fails the test unless {@code reader} yields exactly the code points of
     * {@code expected}, with the positions derived from it.
     */
    private void assertReadsAs(CodePointReader reader, String expected)
        throws IOException
    {
        if (!match(reader, getCodePoints(expected), getPositions(expected))) {
            fail("コードポイントが一致しません。");
        }
    }

    /** Wraps the chars collected in {@code writer} in a reader under test. */
    private CodePointReader readerOver(CharArrayWriter writer)
    {
        return new BasicCodePointReader(new CharArrayReader(
                writer.toCharArray()));
    }

    /** The char the tests expect in place of an unpaired surrogate. */
    private char alternation()
    {
        return (char)BasicCodePointReader.DEFAULT_ALTERNATION_CODEPOINT;
    }

    /**
     * Drains {@code reader} and compares it with the expected data.
     *
     * @param reader reader to drain
     * @param answer expected code points, in order
     * @param positions expected positions; {@code positions[i + 1]} is the
     *        position expected after reading {@code answer[i]}, so the array
     *        is one element longer than {@code answer}
     * @return true if the reader produced exactly {@code answer} with the
     *         expected positions; false on any mismatch, including the reader
     *         producing more or fewer code points than expected
     */
    private boolean match(CodePointReader reader, int[] answer, long[] positions)
        throws IOException
    {
        int i = 0;
        int cp;
        while ((cp = reader.read()) >= 0) {
            // A surplus code point is a mismatch, not an index error.
            if (i >= answer.length || i + 1 >= positions.length) {
                return false;
            }
            if (cp != answer[i]) {
                return false;
            }
            if (reader.getPosition() != positions[i + 1]) {
                return false;
            }
            ++i;
        }

        return (i == answer.length);
    }

    /** Returns the code points of {@code str}, in order. */
    private int[] getCodePoints(String str)
    {
        int count = str.codePointCount(0, str.length());
        int[] result = new int[count];
        int cpIndex = 0, charIndex = 0;
        while (charIndex < str.length()) {
            int cp = str.codePointAt(charIndex);
            result[cpIndex++] = cp;
            charIndex += Character.charCount(cp);
        }

        return result;
    }

    /**
     * Returns the char offset at which each code point of {@code str} starts,
     * followed by one final entry holding {@code str.length()}.
     */
    private long[] getPositions(String str)
    {
        int count = str.codePointCount(0, str.length());
        long[] positions = new long[count + 1];
        int cpIndex = 0, charIndex = 0;
        while (charIndex < str.length()) {
            int cp = str.codePointAt(charIndex);
            positions[cpIndex++] = charIndex;
            charIndex += Character.charCount(cp);
        }
        positions[cpIndex] = charIndex;

        return positions;
    }
}
--------------------------------------------------------------------------------
/src/test/java/net/moraleboost/io/PushbackCodePointReaderTest.java:
-------------------------------------------------------------------------------- 1 | /* 2 | ** 3 | ** Feb. 17, 2009 4 | ** 5 | ** The author disclaims copyright to this source code. 6 | ** In place of a legal notice, here is a blessing: 7 | ** 8 | ** May you do good and not evil. 9 | ** May you find forgiveness for yourself and forgive others. 10 | ** May you share freely, never taking more than you give. 11 | ** 12 | ** Stolen from SQLite :-) 13 | ** Any feedback is welcome. 14 | ** Kohei TAKETA 15 | ** 16 | */ 17 | package net.moraleboost.io; 18 | 19 | import org.junit.Test; 20 | 21 | import java.io.CharArrayReader; 22 | import java.io.IOException; 23 | 24 | import static org.junit.Assert.assertEquals; 25 | import static org.junit.Assert.fail; 26 | 27 | public class PushbackCodePointReaderTest 28 | { 29 | @Test 30 | public void testBasic() throws Exception 31 | { 32 | String str = "abc"; 33 | int scp = 0x00010400; 34 | 35 | CharArrayReader car = new CharArrayReader(str.toCharArray()); 36 | CodePointReader base = new BasicCodePointReader(car); 37 | PushbackCodePointReader reader = new PushbackCodePointReader(base, 2); 38 | 39 | assertEquals((int)'a', reader.read()); 40 | assertEquals(1L, reader.getPosition()); 41 | assertEquals((int)'b', reader.read()); 42 | assertEquals(2L, reader.getPosition()); 43 | assertEquals((int)'c', reader.read()); 44 | assertEquals(3L, reader.getPosition()); 45 | reader.unread((int)'あ', 1); 46 | assertEquals(2L, reader.getPosition()); 47 | reader.unread(scp, 2); 48 | assertEquals(0L, reader.getPosition()); 49 | assertEquals(scp, reader.read()); 50 | assertEquals(2L, reader.getPosition()); 51 | assertEquals((int)'あ', reader.read()); 52 | assertEquals(3L, reader.getPosition()); 53 | assertEquals(-1, reader.read()); 54 | } 55 | 56 | @Test 57 | public void testMaxSize() throws Exception 58 | { 59 | String str = "abc"; 60 | 61 | CharArrayReader car = new CharArrayReader(str.toCharArray()); 62 | CodePointReader base = new 
BasicCodePointReader(car); 63 | PushbackCodePointReader reader = new PushbackCodePointReader(base, 2); 64 | 65 | for (int i = 0; i < 3; ++i) { 66 | reader.read(); 67 | } 68 | for (int i = 0; i < 2; ++i) { 69 | reader.unread('a', 1); 70 | } 71 | 72 | try { 73 | reader.unread('a', 1); 74 | fail("スタックサイズ上限の指定が機能していません。"); 75 | } catch (IOException e) { 76 | } 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /src/test/java/net/moraleboost/mecab/impl/StandardTaggerTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | ** 3 | ** Mar. 1, 2008 4 | ** 5 | ** The author disclaims copyright to this source code. 6 | ** In place of a legal notice, here is a blessing: 7 | ** 8 | ** May you do good and not evil. 9 | ** May you find forgiveness for yourself and forgive others. 10 | ** May you share freely, never taking more than you give. 11 | ** 12 | ** Stolen from SQLite :-) 13 | ** Any feedback is welcome. 
14 | ** Kohei TAKETA 15 | ** 16 | */ 17 | package net.moraleboost.mecab.impl; 18 | 19 | import net.moraleboost.mecab.Lattice; 20 | import net.moraleboost.mecab.Node; 21 | import net.moraleboost.mecab.Tagger; 22 | import org.junit.Test; 23 | 24 | import static org.junit.Assert.fail; 25 | 26 | public class StandardTaggerTest 27 | { 28 | public static String[] TEXTS = { 29 | "メロスは激怒した。必ず、かの邪智暴虐(じゃちぼうぎゃく)の王を除かなければならぬと決意した。メロスには政治がわからぬ。メロスは、村の牧人である。笛を吹き、羊と遊んで暮して来た。けれども邪悪に対しては、人一倍に敏感であった。きょう未明メロスは村を出発し、野を越え山越え、十里はなれた此(こ)のシラクスの市にやって来た。メロスには父も、母も無い。女房も無い。十六の、内気な妹と二人暮しだ。この妹は、村の或る律気な一牧人を、近々、花婿(はなむこ)として迎える事になっていた。結婚式も間近かなのである。メロスは、それゆえ、花嫁の衣裳やら祝宴の御馳走やらを買いに、はるばる市にやって来たのだ。先ず、その品々を買い集め、それから都の大路をぶらぶら歩いた。メロスには竹馬の友があった。セリヌンティウスである。今は此のシラクスの市で、石工をしている。その友を、これから訪ねてみるつもりなのだ。久しく逢わなかったのだから、訪ねて行くのが楽しみである。歩いているうちにメロスは、まちの様子を怪しく思った。ひっそりしている。もう既に日も落ちて、まちの暗いのは当りまえだが、けれども、なんだか、夜のせいばかりでは無く、市全体が、やけに寂しい。のんきなメロスも、だんだん不安になって来た。路で逢った若い衆をつかまえて、何かあったのか、二年まえに此の市に来たときは、夜でも皆が歌をうたって、まちは賑やかであった筈(はず)だが、と質問した。若い衆は、首を振って答えなかった。しばらく歩いて老爺(ろうや)に逢い、こんどはもっと、語勢を強くして質問した。老爺は答えなかった。メロスは両手で老爺のからだをゆすぶって質問を重ねた。老爺は、あたりをはばかる低声で、わずか答えた。", 30 | 
"メロスは、単純な男であった。買い物を、背負ったままで、のそのそ王城にはいって行った。たちまち彼は、巡邏(じゅんら)の警吏に捕縛された。調べられて、メロスの懐中からは短剣が出て来たので、騒ぎが大きくなってしまった。メロスは、王の前に引き出された。「この短刀で何をするつもりであったか。言え!」暴君ディオニスは静かに、けれども威厳を以(もっ)て問いつめた。その王の顔は蒼白(そうはく)で、眉間(みけん)の皺(しわ)は、刻み込まれたように深かった。「市を暴君の手から救うのだ。」とメロスは悪びれずに答えた。「おまえがか?」王は、憫笑(びんしょう)した。「仕方の無いやつじゃ。おまえには、わしの孤独がわからぬ。」「言うな!」とメロスは、いきり立って反駁(はんばく)した。「人の心を疑うのは、最も恥ずべき悪徳だ。王は、民の忠誠をさえ疑って居られる。」「疑うのが、正当の心構えなのだと、わしに教えてくれたのは、おまえたちだ。人の心は、あてにならない。人間は、もともと私慾のかたまりさ。信じては、ならぬ。」暴君は落着いて呟(つぶや)き、ほっと溜息(ためいき)をついた。「わしだって、平和を望んでいるのだが。」「なんの為の平和だ。自分の地位を守る為か。」こんどはメロスが嘲笑した。「罪の無い人を殺して、何が平和だ。」「だまれ、下賤(げせん)の者。」王は、さっと顔を挙げて報いた。「口では、どんな清らかな事でも言える。わしには、人の腹綿の奥底が見え透いてならぬ。おまえだって、いまに、磔(はりつけ)になってから、泣いて詫(わ)びたって聞かぬぞ。」「ああ、王は悧巧(りこう)だ。自惚(うぬぼ)れているがよい。私は、ちゃんと死ぬる覚悟で居るのに。命乞いなど決してしない。ただ、――」と言いかけて、メロスは足もとに視線を落し瞬時ためらい、「ただ、私に情をかけたいつもりなら、処刑までに三日間の日限を与えて下さい。たった一人の妹に、亭主を持たせてやりたいのです。三日のうちに、私は村で結婚式を挙げさせ、必ず、ここへ帰って来ます。」「ばかな。」と暴君は、嗄(しわが)れた声で低く笑った。「とんでもない嘘(うそ)を言うわい。逃がした小鳥が帰って来るというのか。」「そうです。帰って来るのです。」メロスは必死で言い張った。「私は約束を守ります。私を、三日間だけ許して下さい。妹が、私の帰りを待っているのだ。そんなに私を信じられないならば、よろしい、この市にセリヌンティウスという石工がいます。私の無二の友人だ。あれを、人質としてここに置いて行こう。私が逃げてしまって、三日目の日暮まで、ここに帰って来なかったら、あの友人を絞め殺して下さい。たのむ、そうして下さい。」", 31 | "それを聞いて王は、残虐な気持で、そっと北叟笑(ほくそえ)んだ。生意気なことを言うわい。どうせ帰って来ないにきまっている。この嘘つきに騙(だま)された振りして、放してやるのも面白い。そうして身代りの男を、三日目に殺してやるのも気味がいい。人は、これだから信じられぬと、わしは悲しい顔して、その身代りの男を磔刑に処してやるのだ。世の中の、正直者とかいう奴輩(やつばら)にうんと見せつけてやりたいものさ。「願いを、聞いた。その身代りを呼ぶがよい。三日目には日没までに帰って来い。おくれたら、その身代りを、きっと殺すぞ。ちょっとおくれて来るがいい。おまえの罪は、永遠にゆるしてやろうぞ。」「なに、何をおっしゃる。」「はは。いのちが大事だったら、おくれて来い。おまえの心は、わかっているぞ。」" 32 | }; 33 | 34 | @Test 35 | public void testParse() 36 | { 37 | try { 38 | Tagger tagger = new StandardTagger(""); 39 | Lattice lattice = tagger.createLattice(); 40 | lattice.setSentence("本日は晴天なり。"); 41 | tagger.parse(lattice); 42 | Node node = lattice.bosNode().next(); 43 | 44 | while (node != null && node.stat() != Node.TYPE_EOS_NODE) { 45 | System.out.println("Surface = " + node.surface()); 46 | System.out.println("Feature = " + node.feature()); 47 | node = node.next(); 48 | } 49 | lattice.destroy(); 50 | 
tagger.destroy(); 51 | } catch (Exception e) { 52 | fail(e.toString()); 53 | } 54 | } 55 | 56 | @Test 57 | public void testPerf() 58 | { 59 | try { 60 | Tagger tagger = new StandardTagger(""); 61 | Lattice lattice = tagger.createLattice(); 62 | String[] leadingSpaceAndSurface = new String[2]; 63 | 64 | // warming up 65 | for (int i=0; i<100; ++i) { 66 | lattice.clear(); 67 | lattice.setSentence(TEXTS[i % TEXTS.length]); 68 | tagger.parse(lattice); 69 | Node node = lattice.bosNode().next(); 70 | 71 | while (node != null && node.stat() != Node.TYPE_EOS_NODE) { 72 | node.leadingSpaceAndSurface(leadingSpaceAndSurface); 73 | node.feature(); 74 | node = node.next(); 75 | } 76 | } 77 | 78 | long start = System.currentTimeMillis(); 79 | 80 | for (int i=0; i<1000; ++i) { 81 | lattice.clear(); 82 | lattice.setSentence(TEXTS[i % TEXTS.length]); 83 | tagger.parse(lattice); 84 | Node node = lattice.bosNode().next(); 85 | 86 | while (node != null && node.stat() != Node.TYPE_EOS_NODE) { 87 | node.leadingSpaceAndSurface(leadingSpaceAndSurface); 88 | node.feature(); 89 | node = node.next(); 90 | } 91 | } 92 | 93 | long end = System.currentTimeMillis(); 94 | 95 | System.out.println("Total: " + Long.toString(end-start) + " millis."); 96 | 97 | lattice.destroy(); 98 | tagger.destroy(); 99 | } catch (Exception e) { 100 | fail(e.toString()); 101 | } 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /src/test/java/net/moraleboost/tinysegmenter/TinySegmenterTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | ** 3 | ** Mar. 24, 2009 4 | ** 5 | ** The author disclaims copyright to this source code. 6 | ** In place of a legal notice, here is a blessing: 7 | ** 8 | ** May you do good and not evil. 9 | ** May you find forgiveness for yourself and forgive others. 10 | ** May you share freely, never taking more than you give. 
**
** Stolen from SQLite :-)
** Any feedback is welcome.
** Kohei TAKETA
**
*/
package net.moraleboost.tinysegmenter;

import net.moraleboost.io.BasicCodePointReader;
import org.junit.Test;

import java.io.StringReader;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;

/**
 * Tests for {@link TinySegmenter}: token text and offsets, independence from
 * the buffer size, empty input, and the maximum token size.
 */
public class TinySegmenterTest
{
    /** A short sentence must be split into the expected terms and offsets. */
    @Test
    public void testTokenize()
        throws Exception
    {
        String[] expectedTerms = { "本日", "は", "晴天", "なり", "。" };
        int[][] expectedOffsets = { {0, 2}, {2, 3}, {3, 5}, {5, 7}, {7, 8} };

        BasicCodePointReader source =
            new BasicCodePointReader(new StringReader("本日は晴天なり。"));
        TinySegmenter segmenter = new TinySegmenter(source);

        verifyTokens(segmenter, expectedTerms, expectedOffsets);
    }

    /**
     * A segmenter with a buffer size of 7 must produce the same token stream
     * as one with a buffer size of 1024.
     */
    @Test
    public void testMinimalBufferSize()
        throws Exception
    {
        String text =
            "メロスは激怒した。" +
            "必ず、かの邪智暴虐の王を除かなければならぬと決意した。" +
            "メロスには政治がわからぬ。" +
            "メロスは、村の牧人である。" +
            "笛を吹き、羊と遊んで暮して来た。" +
            "けれども邪悪に対しては、人一倍に敏感であった。";
        StringReader smallInput = new StringReader(text);
        StringReader largeInput = new StringReader(text);
        BasicCodePointReader smallSource = new BasicCodePointReader(smallInput);
        BasicCodePointReader largeSource = new BasicCodePointReader(largeInput);

        TinySegmenter small =
            new TinySegmenter(smallSource, 7, TinySegmenter.DEFAULT_MAX_TOKEN_SIZE);
        TinySegmenter large =
            new TinySegmenter(largeSource, 1024, TinySegmenter.DEFAULT_MAX_TOKEN_SIZE);

        TinySegmenter.Token fromSmall;
        TinySegmenter.Token fromLarge;
        do {
            fromSmall = small.next();
            fromLarge = large.next();

            assertEquals(fromLarge, fromSmall);
        } while (fromSmall != null && fromLarge != null);

        // Both streams must have ended together.
        assertNull(fromSmall);
        assertNull(fromLarge);
    }

    /** Empty input yields no tokens at all. */
    @Test
    public void testEmptyStream()
        throws Exception
    {
        BasicCodePointReader source =
            new BasicCodePointReader(new StringReader(""));

        assertNull(new TinySegmenter(source).next());
    }

    /** With a maximum token size of 2, no expected term is longer than two chars. */
    @Test
    public void testMaxTokenSize()
        throws Exception
    {
        String[] expectedTerms = {
            "一日", "作", "さざ", "れ", "ば", "、", "一", "日", "食わ", "ず", "。"
        };
        int[][] expectedOffsets = {
            {0, 2}, {2, 3}, {3, 5}, {5, 6}, {6, 7}, {7, 8},
            {8, 9}, {9, 10}, {10, 12}, {12, 13}, {13, 14}
        };

        BasicCodePointReader source =
            new BasicCodePointReader(new StringReader("一日作さざれば、一日食わず。"));
        TinySegmenter segmenter = new TinySegmenter(source, 1024, 2);

        verifyTokens(segmenter, expectedTerms, expectedOffsets);
    }

    /**
     * Drains {@code segmenter} and checks every token against the expected
     * text and {start, end} offsets, then checks the number of tokens.
     */
    private static void verifyTokens(TinySegmenter segmenter, String[] terms,
            int[][] offsets) throws Exception
    {
        int seen = 0;
        for (TinySegmenter.Token token = segmenter.next();
                token != null;
                token = segmenter.next()) {
            assertEquals(terms[seen], token.str);
            assertEquals(offsets[seen][0], token.start);
            assertEquals(offsets[seen][1], token.end);
            ++seen;
        }

        assertEquals(terms.length, seen);
    }
}
--------------------------------------------------------------------------------
/src/test/java/net/moraleboost/util/CSVUtilTest.java:
--------------------------------------------------------------------------------
package net.moraleboost.util;

import org.junit.Test;

import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals; 7 | 8 | public class CSVUtilTest 9 | { 10 | @Test 11 | public void testEscape() 12 | { 13 | for (int i=0; i<65536; ++i) { 14 | String str = Character.toString((char)i); 15 | if (str.equals("\"")) { 16 | assertEquals("\"\"\"\"", CSVUtil.escape(str)); 17 | } else if (str.equals(" ")) { 18 | assertEquals("\" \"", CSVUtil.escape(str)); 19 | } else if (str.equals("\t")) { 20 | assertEquals("\"\t\"", CSVUtil.escape(str)); 21 | } else if (str.equals(",")) { 22 | assertEquals("\",\"", CSVUtil.escape(str)); 23 | } else { 24 | assertEquals(str, CSVUtil.escape(str)); 25 | } 26 | } 27 | 28 | assertEquals("abc", CSVUtil.escape("abc")); 29 | assertEquals("あいうえお", CSVUtil.escape("あいうえお")); 30 | assertEquals("\" This \"\"is a pen., what?\t\"", CSVUtil.escape(" This \"is a pen., what?\t")); 31 | } 32 | 33 | @Test 34 | public void testTokenize() 35 | { 36 | // 空文字列は、サイズ0の配列になる 37 | String str = ""; 38 | String[] answer = new String[] {}; 39 | String[] tokens = CSVUtil.tokenize(str, 100); 40 | assertArrayEquals(answer, tokens); 41 | 42 | // 「""」は、空文字列一つからなる配列になる 43 | str = "\"\""; 44 | answer = new String[] {""}; 45 | tokens = CSVUtil.tokenize(str, 100); 46 | assertArrayEquals(answer, tokens); 47 | 48 | // ","は、空文字列2つからなる配列になる 49 | str = ","; 50 | answer = new String[] {"", ""}; 51 | tokens = CSVUtil.tokenize(str, 100); 52 | assertArrayEquals(answer, tokens); 53 | 54 | // 非クォート文字列(1列) 55 | str = "abc"; 56 | answer = new String[] { "abc" }; 57 | tokens = CSVUtil.tokenize(str, 100); 58 | assertArrayEquals(answer, tokens); 59 | 60 | // クォート文字列(1列) 61 | str = " \t\"ab\"\"c\" "; 62 | answer = new String[] {"ab\"c"}; 63 | tokens = CSVUtil.tokenize(str, 100); 64 | assertArrayEquals(answer, tokens); 65 | 66 | // 非クォート文字列(複数列) 67 | str = "a,b , c"; 68 | answer = new String[] {"a", "b ", "c"}; 69 | tokens = CSVUtil.tokenize(str, 100); 70 | assertArrayEquals(answer, tokens); 71 | 72 | // クォート文字列(複数列) 73 | str = "\"a\" , \" bc\"\"d \", \"efg"; 74 | 
answer = new String[] {"a", " bc\"d ", "efg"}; 75 | tokens = CSVUtil.tokenize(str, 100); 76 | assertArrayEquals(answer, tokens); 77 | 78 | // 混在 79 | str = "a, \"bcd\" , efg "; 80 | answer = new String[] {"a", "bcd", "efg "}; 81 | tokens = CSVUtil.tokenize(str, 100); 82 | assertArrayEquals(answer, tokens); 83 | 84 | // max指定 85 | str = "a , "; 86 | answer = new String[] {"a , "}; 87 | tokens = CSVUtil.tokenize(str, 1); 88 | assertArrayEquals(answer, tokens); 89 | 90 | str = "a ,"; 91 | answer = new String[] {"a ,"}; 92 | tokens = CSVUtil.tokenize(str, 1); 93 | assertArrayEquals(answer, tokens); 94 | 95 | str = "a , \"bcd\"efg"; 96 | answer = new String[] {"a , \"bcd\"efg"}; 97 | tokens = CSVUtil.tokenize(str, 1); 98 | assertArrayEquals(answer, tokens); 99 | 100 | str = "a , bcd, \tefg,def"; 101 | answer = new String[] {"a ", "bcd, \tefg,def"}; 102 | tokens = CSVUtil.tokenize(str, 2); 103 | assertArrayEquals(answer, tokens); 104 | } 105 | } 106 | --------------------------------------------------------------------------------