├── benchmark ├── .gitignore ├── kyoto-leads-corpus.sh ├── jawikipedia.sh ├── commoncrawl.sh ├── benchmark_run.sh ├── benchmark_multithread.sh ├── process_warc.py ├── benchmark_setup.sh └── README.md ├── docs ├── _config.yml ├── Sudachi.png ├── favicon.ico ├── development.md ├── _layouts │ └── default.html └── Sudachi.svg ├── src ├── test │ ├── resources │ │ ├── sudachi_test_empty.json │ │ ├── rewrite_error_ignorelist.def │ │ ├── rewrite_error_dup.def │ │ ├── rewrite_error_replacelist.def │ │ ├── com │ │ │ └── worksap │ │ │ │ └── nlp │ │ │ │ └── sudachi │ │ │ │ └── dictionary │ │ │ │ └── build │ │ │ │ ├── one.csv │ │ │ │ ├── test.matrix │ │ │ │ └── sudachi_dic_build.json │ │ ├── rewrite.def │ │ ├── dict │ │ │ ├── user2.csv │ │ │ ├── user.csv │ │ │ ├── matrix.def │ │ │ └── lex.csv │ │ ├── logging.properties │ │ ├── sudachi_minimum.json │ │ ├── char.def │ │ ├── sudachi.json │ │ └── joinnumeric │ │ │ └── char.def │ ├── dict │ │ ├── user2.csv │ │ ├── user.csv │ │ ├── matrix.def │ │ └── lex.csv │ └── java │ │ └── com │ │ └── worksap │ │ └── nlp │ │ └── sudachi │ │ ├── MorphemeImplTest.kt │ │ ├── TestLoggingConfig.java │ │ ├── StringUtilTest.kt │ │ ├── dictionary │ │ ├── build │ │ │ ├── ParametersTest.kt │ │ │ ├── ConnectionMatrixTest.kt │ │ │ ├── GrammarTest.kt │ │ │ ├── DicBufferTest.kt │ │ │ └── MemChannel.kt │ │ ├── DictionaryReader.java │ │ └── DictionaryHeaderTest.java │ │ ├── WordIdTest.kt │ │ ├── Utils.java │ │ ├── InhibitConnectionPluginTest.java │ │ ├── WordMaskTest.kt │ │ ├── DictionaryFactoryTest.kt │ │ ├── TextNormalizerTest.kt │ │ ├── MMapTest.java │ │ ├── TestDictionary.kt │ │ ├── MockGrammar.java │ │ ├── JapaneseTokenizerMaskTest.kt │ │ ├── JoinKatakanaOovPluginTest.java │ │ ├── JapaneseDictionaryTest.java │ │ └── MockInputText.java ├── main │ ├── resources │ │ ├── sudachi.logging.properties │ │ ├── unk.def │ │ └── sudachi.json │ └── java │ │ └── com │ │ └── worksap │ │ └── nlp │ │ └── sudachi │ │ ├── Plugin.java │ │ ├── dictionary │ │ ├── build │ │ │ ├── 
WriteDictionary.java │ │ │ ├── WordIdResolver.java │ │ │ ├── InputFileException.java │ │ │ ├── BuildStats.java │ │ │ ├── TrackingInputStream.java │ │ │ ├── POSTable.java │ │ │ ├── Parameters.java │ │ │ ├── Progress.java │ │ │ └── Index.java │ │ ├── DictionaryAccess.java │ │ ├── DictionaryVersion.java │ │ ├── WordIdTable.java │ │ ├── WordParameterList.java │ │ ├── DictionaryHeaderPrinter.java │ │ ├── Connection.java │ │ ├── CategoryType.java │ │ ├── Lexicon.java │ │ ├── POS.java │ │ ├── DoubleArrayLookup.java │ │ ├── WordInfoList.java │ │ └── BinaryDictionary.java │ │ ├── InputTextPlugin.java │ │ ├── InhibitConnectionPlugin.java │ │ ├── WordMask.java │ │ ├── InputTextBuilder.java │ │ ├── SimpleMorphemeFormatter.java │ │ ├── WordSegmentationFormatter.java │ │ ├── EditConnectionCostPlugin.java │ │ ├── PartialPOS.java │ │ ├── WordId.java │ │ ├── SimpleOovProviderPlugin.java │ │ ├── MorphemeFormatterPlugin.java │ │ ├── StringUtil.java │ │ ├── SentenceSplittingAnalysis.java │ │ ├── ProlongedSoundMarkInputTextPlugin.java │ │ ├── IOTools.java │ │ ├── MorphemeImpl.java │ │ ├── Dictionary.java │ │ └── JoinKatakanaOovPlugin.java └── jmh │ └── java │ └── com │ └── worksap │ └── nlp │ └── sudachi │ └── dictionary │ ├── DictionaryBuilderPerformanceTest.java │ └── MemChannelJmh.kt ├── .github ├── FUNDING.yml └── workflows │ ├── publish.yml │ ├── release.yml │ └── build.yml ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── settings.gradle ├── gradle.properties ├── .formatter └── license-header ├── .gitattributes ├── CHANGELOG.md ├── gradlew.bat └── .gitignore /benchmark/.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-primer 2 | 
-------------------------------------------------------------------------------- /src/test/resources/sudachi_test_empty.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /docs/Sudachi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/Sudachi/HEAD/docs/Sudachi.png -------------------------------------------------------------------------------- /docs/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/Sudachi/HEAD/docs/favicon.ico -------------------------------------------------------------------------------- /src/test/resources/rewrite_error_ignorelist.def: -------------------------------------------------------------------------------- 1 | # there are two characters in ignore list 2 | 12 -------------------------------------------------------------------------------- /src/test/resources/rewrite_error_dup.def: -------------------------------------------------------------------------------- 1 | # there are ad uplicated replacement. 
2 | 12 21 3 | 12 31 4 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: WorksApplications 4 | -------------------------------------------------------------------------------- /src/test/resources/rewrite_error_replacelist.def: -------------------------------------------------------------------------------- 1 | # there are three columns in replace list 2 | 12 21 31 3 | -------------------------------------------------------------------------------- /src/test/resources/com/worksap/nlp/sudachi/dictionary/build/one.csv: -------------------------------------------------------------------------------- 1 | 東,1,1,4675,東,名詞,普通名詞,一般,*,*,*,ヒガシ,東,*,A,*,*,*,* 2 | -------------------------------------------------------------------------------- /src/test/resources/rewrite.def: -------------------------------------------------------------------------------- 1 | # ignore normalize list 2 | Ⅲ 3 | ⅲ 4 | ⼼ 5 | 6 | # replace char list 7 | ガ ガ 8 | ウ゛ ヴ 9 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/Sudachi/HEAD/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | /* 2 | * This file was generated by the Gradle 'init' task. 
3 | */ 4 | 5 | rootProject.name = 'sudachi' 6 | -------------------------------------------------------------------------------- /src/test/dict/user2.csv: -------------------------------------------------------------------------------- 1 | ぴさる,8,8,-32768,ぴさる,名詞,普通名詞,一般,*,*,*,ピサル,ぴさる,*,A,*,*,*,* 2 | かぼす,6,6,2816,かぼす,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,カボス,カボス,かぼす,*,A,*,*,*,* 3 | -------------------------------------------------------------------------------- /src/test/resources/dict/user2.csv: -------------------------------------------------------------------------------- 1 | ぴさる,8,8,-32768,ぴさる,名詞,普通名詞,一般,*,*,*,ピサル,ぴさる,*,A,*,*,*,* 2 | かぼす,6,6,2816,かぼす,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,カボス,カボス,かぼす,*,A,*,*,*,* 3 | -------------------------------------------------------------------------------- /src/test/resources/com/worksap/nlp/sudachi/dictionary/build/test.matrix: -------------------------------------------------------------------------------- 1 | 3 3 2 | 0 0 0 3 | 0 1 1 4 | 0 2 2 5 | 1 0 3 6 | 1 1 4 7 | 1 2 5 8 | 2 0 6 9 | 2 1 7 10 | 2 2 8 11 | 12 | -------------------------------------------------------------------------------- /src/test/resources/logging.properties: -------------------------------------------------------------------------------- 1 | java.util.logging.SimpleFormatter.format=%5$s%n 2 | 3 | com.worksap.nlp.sudachi.handlers=java.util.logging.ConsoleHandler 4 | com.worksap.nlp.sudachi.level=FINEST 5 | java.util.logging.ConsoleHandler.level=ALL -------------------------------------------------------------------------------- /src/main/resources/sudachi.logging.properties: -------------------------------------------------------------------------------- 1 | java.util.logging.SimpleFormatter.format=%5$s%n 2 | 3 | com.worksap.nlp.sudachi.handlers=java.util.logging.ConsoleHandler 4 | com.worksap.nlp.sudachi.level=INFO 5 | java.util.logging.ConsoleHandler.level=ALL -------------------------------------------------------------------------------- 
/gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.5.1-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /src/test/dict/user.csv: -------------------------------------------------------------------------------- 1 | ぴらる,8,8,-32768,ぴらる,名詞,普通名詞,一般,*,*,*,ピラル,ぴらる,*,A,*,*,*,* 2 | 府,8,8,2914,府,名詞,普通名詞,一般,*,*,*,フ,府,*,A,*,*,*,* 3 | 東京府,6,6,2816,東京府,名詞,固有名詞,地名,一般,*,*,トウキョウフ,東京府,*,B,5/U1,*,5/U1,1/3 4 | すだち,6,6,2816,すだち,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,スダチ,スダチ,すだち,*,A,*,*,*,* 5 | -------------------------------------------------------------------------------- /src/test/resources/dict/user.csv: -------------------------------------------------------------------------------- 1 | ぴらる,8,8,-32768,ぴらる,名詞,普通名詞,一般,*,*,*,ピラル,ぴらる,*,A,*,*,*,* 2 | 府,8,8,2914,府,名詞,普通名詞,一般,*,*,*,フ,府,*,A,*,*,*,* 3 | 東京府,6,6,2816,東京府,名詞,固有名詞,地名,一般,*,*,トウキョウフ,東京府,*,B,5/U1,*,5/U1,1/3 4 | すだち,6,6,2816,すだち,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,スダチ,スダチ,すだち,*,A,*,*,*,* 5 | -------------------------------------------------------------------------------- /src/test/resources/sudachi_minimum.json: -------------------------------------------------------------------------------- 1 | { 2 | "systemDict" : "system.dic", 3 | "inputTextPlugin" : [ 4 | { "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin" } 5 | ], 6 | "oovProviderPlugin" : [ 7 | { "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin", 8 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], 9 | "leftId" : 8, 10 | "rightId" : 8, 11 | "cost" : 6000 } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /gradle.properties: 
-------------------------------------------------------------------------------- 1 | kotlin.stdlib.default.dependency=false 2 | org.gradle.jvmargs=--add-exports jdk.compiler/com.sun.tools.javac.api=ALL-UNNAMED \ 3 | --add-exports jdk.compiler/com.sun.tools.javac.file=ALL-UNNAMED \ 4 | --add-exports jdk.compiler/com.sun.tools.javac.parser=ALL-UNNAMED \ 5 | --add-exports jdk.compiler/com.sun.tools.javac.tree=ALL-UNNAMED \ 6 | --add-exports jdk.compiler/com.sun.tools.javac.util=ALL-UNNAMED 7 | org.gradle.caching=true 8 | org.gradle.parallel=true 9 | -------------------------------------------------------------------------------- /benchmark/kyoto-leads-corpus.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Run benchmark with Kyoto Leads Corpus 3 | 4 | set -eux 5 | DIR=$(dirname "$(readlink -f "$0")") 6 | 7 | # Download Kyoto Leads corpus original texts 8 | DATA_DIR=$DIR/data 9 | mkdir -p "$DATA_DIR" 10 | 11 | CORPUS_FILE="$DATA_DIR/leads.txt" 12 | if [ ! -e "$CORPUS_FILE" ]; then 13 | curl -L https://github.com/ku-nlp/KWDLC/releases/download/release_1_0/leads.org.txt.gz | gzip -dc > $CORPUS_FILE 14 | fi 15 | 16 | # Setup & run 17 | $DIR/benchmark_setup.sh 18 | $DIR/benchmark_run.sh $CORPUS_FILE "kyoto-leads" 19 | -------------------------------------------------------------------------------- /.formatter/license-header: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) $YEAR Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | -------------------------------------------------------------------------------- /src/test/resources/char.def: -------------------------------------------------------------------------------- 1 | 0x0021..0x002F SYMBOL #!"#$%&'()*+,-./ 2 | 0x0030..0x0039 NUMERIC #0-9 3 | 0x0041..0x005A ALPHA #A-Z 4 | 0x0061..0x007A ALPHA #a-z 5 | 0x00C0..0x00FF ALPHA # Latin 1 #À->ÿ 6 | 0x3041..0x309F HIRAGANA 7 | 0x30A1..0x30FF KATAKANA 8 | 0x30A1 NOOOVBOW 9 | 0xFF66..0xFF9D KATAKANA 10 | 0xFF9E..0xFF9F KATAKANA 11 | 0x2E80..0x2EF3 KANJI # CJK Raidcals Supplement 12 | 0x2F00..0x2FD5 KANJI 13 | 0x3005 KANJI 14 | 0x3007 KANJI 15 | 0x3400..0x4DB5 KANJI # CJK Unified Ideographs Extention 16 | 0x4E00..0x9FA5 KANJI 17 | 0xF900..0xFA2D KANJI 18 | 0xFA30..0xFA6A KANJI 19 | 0xFF10..0xFF19 NUMERIC 20 | 0xFF21..0xFF3A ALPHA 21 | 0xFF41..0xFF5A ALPHA 22 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Handle line endings automatically for files detected as text 2 | # and leave all files detected as binary untouched. 
3 | * text=auto 4 | 5 | # Force the following filetypes to have unix eols, so Windows does not break them 6 | *.* text eol=lf 7 | 8 | # 9 | ## These files are binary and should be left untouched 10 | # 11 | 12 | # (binary is a macro for -text -diff) 13 | *.png binary 14 | *.jpg binary 15 | *.jpeg binary 16 | *.gif binary 17 | *.ico binary 18 | *.mov binary 19 | *.mp4 binary 20 | *.mp3 binary 21 | *.flv binary 22 | *.fla binary 23 | *.swf binary 24 | *.gz binary 25 | *.zip binary 26 | *.7z binary 27 | *.ttf binary 28 | *.eot binary 29 | *.woff binary 30 | *.pyc binary 31 | *.pdf binary 32 | *.ez binary 33 | *.bz2 binary 34 | *.swp binary -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/Plugin.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | abstract class Plugin { 20 | 21 | protected Settings settings; 22 | 23 | void setSettings(Settings settings) { 24 | this.settings = settings; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/dictionary/build/WriteDictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi.dictionary.build; 18 | 19 | import java.io.IOException; 20 | 21 | public interface WriteDictionary { 22 | void writeTo(ModelOutput output) throws IOException; 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordIdResolver.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi.dictionary.build; 18 | 19 | public interface WordIdResolver { 20 | int lookup(String headword, short posId, String reading); 21 | 22 | void validate(int wordId); 23 | 24 | boolean isUser(); 25 | } 26 | -------------------------------------------------------------------------------- /docs/development.md: -------------------------------------------------------------------------------- 1 | # How to develop Sudachi 2 | 3 | ## Requirements 4 | 5 | You need to install a JDK, for example from https://adoptium.net/ 6 | Both 11 and 17 will suffice. 7 | Sudachi keeps Java 8 source compatibility at the moment, but we use JDK 11 for CI. 8 | 9 | ## Build System 10 | 11 | Sudachi uses [Gradle](https://gradle.org/) for build. 12 | Basic build can be done with 13 | 14 | `./gradlew build` 15 | 16 | It will produce a jar file in the `build/libs` directory. 17 | 18 | Build enforces the code formatting, so during the development the recommended build command is 19 | 20 | `./gradlew spotlessApply test` 21 | 22 | ## Running development version 23 | 24 | Sometimes you would like to run a development version of Sudachi from a jar file. 
25 | Gradle allows you to make a development jar installation of Sudachi with all dependencies with 26 | 27 | `./gradlew installExecutableDist` 28 | 29 | ## List of Gradle tasks 30 | 31 | List of all Gradle tasks can be seen with `./gradlew tasks` -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/dictionary/build/InputFileException.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.worksap.nlp.sudachi.dictionary.build; 18 | 19 | public class InputFileException extends IllegalArgumentException { 20 | public InputFileException(int line, String s, Exception cause) { 21 | super(String.format("line:%d %s", line, s), cause); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/test/resources/com/worksap/nlp/sudachi/dictionary/build/sudachi_dic_build.json: -------------------------------------------------------------------------------- 1 | { 2 | "characterDefinitionFile" : "char.def", 3 | "inputTextPlugin" : [ 4 | { "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin" }, 5 | { "class" : "com.worksap.nlp.sudachi.IgnoreYomiganaPlugin", 6 | "leftBrackets": ["(", "("], 7 | "rightBrackets": [")", ")"], 8 | "maxYomiganaLength": 4} 9 | ], 10 | "oovProviderPlugin" : [ 11 | { "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin", 12 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], 13 | "leftId" : 8, 14 | "rightId" : 8, 15 | "cost" : 6000 } 16 | ], 17 | "formatterPlugin" : [ 18 | { "class" : "com.worksap.nlp.sudachi.SimpleMorphemeFormatter" }, 19 | { "class" : "com.worksap.nlp.sudachi.WordSegmentationFormatter", 20 | "eos" : "\n" }, 21 | { "class" : "com.worksap.nlp.sudachi.WordSegmentationFormatter", 22 | "eos" : " " } 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /docs/_layouts/default.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | 6 | 7 | 8 | {% seo %} 9 | 10 | 11 | 12 | 13 |26 | * {@link Dictionary} initialize this plugin with {@link Settings}. It can be 27 | * referred as {@link Plugin#settings}. 28 | * 29 | *
30 | * The following is an example of settings. 31 | * 32 | *
33 | * {@code
34 | * {
35 | * "class" : "com.worksap.nlp.sudachi.InputTextPlugin",
36 | * "example" : "example setting"
37 | * }
38 | * }
39 | *
40 | */
41 | public abstract class InputTextPlugin extends Plugin {
42 |
43 | /**
44 | * Set up the plugin. The default implementation does nothing.
45 | *
46 | * {@link Tokenizer} calls this method for setting up this plugin.
47 | *
48 | * @param grammar
49 | * the grammar of the system dictionary
50 | * @throws IOException
51 | * if an overriding implementation fails to read its data
52 | */
53 | public void setUp(Grammar grammar) throws IOException {
54 | }
55 |
56 | /**
57 | * Rewrite the input text held by the given builder.
58 | *
59 | * To rewrite the input text, you can use {@link InputTextBuilder#replace}.
60 | *
61 | * @param builder
62 | * the builder holding the input text to rewrite
63 | */
64 | public abstract void rewrite(InputTextBuilder builder);
65 | }
66 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/InhibitConnectionPlugin.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi;
18 |
19 | import java.util.List;
20 |
21 | import com.worksap.nlp.sudachi.dictionary.Grammar;
22 |
23 | /**
24 | * A Plugin for inhibiting the connections.
25 | *
26 | * 27 | * {@link Dictionary} initialize this plugin with {@link Settings}. It can be 28 | * referred as {@link Plugin#settings}. 29 | * 30 | *
31 | * The following is an example of settings. 32 | * 33 | *
34 | * {@code
35 | * {
36 | * "class" : "com.worksap.nlp.sudachi.InhibitConnectionPlugin",
37 | * "inhibitedPair" : [ [ 0, 233 ], [435, 332] ]
38 | * }
39 | * }
40 | *
41 | *
42 | * {@code inhibitedPair} is a list of lists of two numbers. In each pair, the
43 | * first number is right-ID of the left node and the second is left-ID of the
44 | * right node in a connection.
45 | */
46 | class InhibitConnectionPlugin extends EditConnectionCostPlugin {
47 |
48 | List26 | * The following is an example of settings. 27 | * 28 | *
29 | * {@code
30 | * {
31 | "class" : "com.worksap.nlp.sudachi.SimpleMorphemeFormatter",
32 | * "delimiter" : "\n",
33 | * "eos" : "\nEOS\n",
34 | * "columnDelimiter" : "\t"
35 | * }
36 | * }
37 | *
38 | *
39 | * {@code delimiter} is the delimiter of the morphemes. {@code eos} is printed
40 | * at the position of EOS. {@code columnDelimiter} is the delimiter of the
41 | * fields.
42 | */
43 | public class SimpleMorphemeFormatter extends MorphemeFormatterPlugin {
44 | /** Separator placed between the fields of one formatted morpheme; assigned in {@link #setUp()}. */
45 | protected String columnDelimiter;
46 | /** Runs the parent set-up, then reads the {@code columnDelimiter} setting, passing a tab as the second argument (presumably the fallback value; confirm against Settings). */
47 | @Override
48 | public void setUp() throws IOException {
49 | super.setUp();
50 | columnDelimiter = settings.getString("columnDelimiter", "\t");
51 | }
52 | /** Formats one morpheme as fields joined by {@code columnDelimiter}: surface, comma-joined part of speech, normalized form; more fields follow when {@code showDetails} is set. */
53 | @Override
54 | public String formatMorpheme(Morpheme morpheme) {
55 | String output = morpheme.surface() + columnDelimiter + String.join(",", morpheme.partOfSpeech())
56 | + columnDelimiter + morpheme.normalizedForm();
57 | if (showDetails) { // extended fields: dictionary form, reading form, dictionary id, synonym group ids, OOV marker
58 | output += columnDelimiter + morpheme.dictionaryForm() + columnDelimiter + morpheme.readingForm()
59 | + columnDelimiter + morpheme.getDictionaryId() + columnDelimiter
60 | + Arrays.toString(morpheme.getSynonymGroupIds()) + columnDelimiter
61 | + ((morpheme.isOOV()) ? "(OOV)" : ""); // the last field is left empty unless the morpheme is OOV
62 | }
63 | return output;
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/WordSegmentationFormatter.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi;
18 |
19 | import java.io.IOException;
20 | import java.io.PrintStream;
21 | import java.util.List;
22 |
23 | /**
24 | * Provides a formatter for {@link Morpheme}
25 | *
26 | * 27 | * The following is an example of settings. 28 | * 29 | *
30 | * {@code
31 | * {
32 | "class" : "com.worksap.nlp.sudachi.WordSegmentationFormatter",
33 | "delimiter" : " ",
34 | "eos" : "\n"
35 | * }
36 | * }
37 | *
38 | *
39 | * {@code delimiter} is the delimiter of the morphemes. {@code eos} is printed
40 | * at the position of EOS.
41 | */
42 | public class WordSegmentationFormatter extends MorphemeFormatterPlugin {
43 |
44 | @Override
45 | public void setUp() throws IOException {
46 | super.setUp();
47 | delimiter = settings.getString("delimiter", " ");
48 | eosString = settings.getString("eos", "\n");
49 | }
50 |
51 | @Override
52 | public String formatMorpheme(Morpheme morpheme) {
53 | return morpheme.surface();
54 | }
55 |
56 | @Override
57 | void printSentence(List27 | * {@link Dictionary} initialize this plugin with {@link Settings}. It can be 28 | * referred as {@link Plugin#settings}. 29 | * 30 | *
31 | * The following is an example of settings. 32 | * 33 | *
34 | * {@code
35 | * {
36 | * "class" : "com.worksap.nlp.sudachi.SampleEditConnectionPlugin",
37 | * "example" : "example setting"
38 | * }
39 | * }
40 | *
41 | */
42 | public abstract class EditConnectionCostPlugin extends Plugin {
43 |
44 | /**
45 | * Set up the plugin. The default implementation does nothing.
46 | *
47 | * {@link Tokenizer} calls this method for setting up this plugin.
48 | *
49 | * @param grammar
50 | * the grammar of the system dictionary
51 | * @throws IOException
52 | * if an overriding implementation fails to read its data
53 | */
54 | public void setUp(Grammar grammar) throws IOException {
55 | }
56 |
57 | /**
58 | * Edit the connection costs.
59 | *
60 | * To edit connection costs, you can use {@link Grammar#getConnectCost},
61 | * {@link Grammar#setConnectCost}, and {@link #inhibitConnection}.
62 | *
63 | * @param grammar
64 | * the grammar of the system dictionary
65 | */
66 | public abstract void edit(Grammar grammar);
67 |
68 | /**
69 | * Inhibit a connection by setting its cost to {@link Grammar#INHIBITED_CONNECTION}.
70 | *
71 | * @param grammar
72 | * the grammar of the system dictionary
73 | * @param left
74 | * the right-ID of the left node
75 | * @param right
76 | * the left-ID of the right node
77 | */
78 | public void inhibitConnection(Grammar grammar, short left, short right) {
79 | grammar.setConnectCost(left, right, Grammar.INHIBITED_CONNECTION);
80 | }
81 | }
82 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/PartialPOS.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2022 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi;
18 |
19 | import com.worksap.nlp.sudachi.dictionary.POS;
20 |
21 | import java.util.AbstractList;
22 | import java.util.Arrays;
23 | import java.util.List;
24 |
25 | public class PartialPOS extends AbstractList31 | * The following is an example of settings. 32 | * 33 | *
34 | * {@code
35 | * {
36 | * "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin",
37 | * "oovPOS" : [ "補助記号", "一般", "*", "*", "*", "*" ],
38 | * "leftId" : 5968,
39 | "rightId" : 5968,
40 | * "cost" : 3857
41 | * }
42 | * }
43 | *
44 | *
45 | * {@code oovPOS} is the part of speech of the OOVs. {@code leftId} is the
46 | * left-ID of the OOVs. {@code rightId} is the right-ID of the OOVs.
47 | * {@code cost} is the cost of the OOVs.
48 | */
49 | class SimpleOovProviderPlugin extends OovProviderPlugin {
50 |
51 | short oovPOSId;
52 | short leftId;
53 | short rightId;
54 | short cost;
55 |
56 | @Override
57 | public void setUp(Grammar grammar) {
58 | POS pos = new POS(settings.getStringList("oovPOS"));
59 | leftId = (short) settings.getInt("leftId");
60 | rightId = (short) settings.getInt("rightId");
61 | cost = (short) settings.getInt("cost");
62 | String userPosMode = settings.getString(USER_POS, USER_POS_FORBID);
63 | oovPOSId = posIdOf(grammar, pos, userPosMode);
64 | }
65 |
66 | @Override
67 | public int provideOOV(InputText inputText, int offset, long otherWords, List27 | * You can defined the range of each category in the file which specified 28 | * "characterDefinitionFile" of the settings. 29 | */ 30 | public enum CategoryType { 31 | /** The fall back category. */ 32 | DEFAULT(1), 33 | /** White spaces. */ 34 | SPACE(1 << 1), 35 | /** CJKV ideographic characters. */ 36 | KANJI(1 << 2), 37 | /** Symbols. */ 38 | SYMBOL(1 << 3), 39 | /** Numerical characters. */ 40 | NUMERIC(1 << 4), 41 | /** Latin alphabets. */ 42 | ALPHA(1 << 5), 43 | /** Hiragana characters. */ 44 | HIRAGANA(1 << 6), 45 | /** Katakana characters. */ 46 | KATAKANA(1 << 7), 47 | /** Kanji numeric characters. */ 48 | KANJINUMERIC(1 << 8), 49 | /** Greek alphabets. */ 50 | GREEK(1 << 9), 51 | /** Cyrillic alphabets. */ 52 | CYRILLIC(1 << 10), 53 | /** User defined category. */ 54 | USER1(1 << 11), 55 | /** User defined category. */ 56 | USER2(1 << 12), 57 | /** User defined category. */ 58 | USER3(1 << 13), 59 | /** User defined category. */ 60 | USER4(1 << 14), 61 | /** Characters that cannot be the beginning of word */ 62 | NOOOVBOW(1 << 15); 63 | 64 | private final int id; 65 | 66 | private CategoryType(int id) { 67 | this.id = id; 68 | } 69 | 70 | /** 71 | * Returns the integer ID number of the category. 72 | * 73 | * @return the ID number of the category 74 | */ 75 | public int getId() { 76 | return id; 77 | } 78 | 79 | /** 80 | * Returns the category to which the specified ID is mapped, or {@code null} if 81 | * there is no associated category. 82 | * 83 | * @param id 84 | * the ID number of category 85 | * @return the category to which the specified ID is mapped, or {@code null} if 86 | * there is no associated category. 
87 | */ 88 | public static CategoryType getType(int id) { 89 | for (CategoryType type : CategoryType.values()) { 90 | if (type.getId() == id) { 91 | return type; 92 | } 93 | } 94 | return null; 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/MorphemeFormatterPlugin.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | import java.io.IOException; 20 | import java.io.PrintStream; 21 | import java.util.List; 22 | 23 | /** 24 | * Provides a formatter for {@link Morpheme} 25 | * 26 | *
27 | * The following is an example of settings. 28 | * 29 | *
30 | * {@code
31 | * {
32 | * "class" : "com.worksap.nlp.sudachi.MorphemeFormatterPlugin",
33 | * "delimiter" : "\n",
34 | * "eos" : "\nEOS\n"
35 | * }
36 | * }
37 | *
38 | *
39 | * {@code delimiter} is the delimiter of the morphemes. {@code eos} is printed
40 | * at the position of EOS.
41 | */
42 | public abstract class MorphemeFormatterPlugin extends Plugin {
43 |
44 | protected String delimiter = "\n"; // delimiter of the morphemes; defaults to a newline
45 | protected String eosString = "\nEOS\n"; // presumably the string printed at the position of EOS ("eos" setting) - TODO confirm
46 | protected boolean showDetails; // cleared by setUp(), set to true by showDetails()
47 |
48 | /**
49 | * Sets up the plugin; this base implementation resets {@code showDetails} to {@code false}.
50 | *
51 | * {@link SudachiCommandLine} calls this method for setting up this plugin.
52 | */
53 | public void setUp() throws IOException {
54 | showDetails = false;
55 | }
56 |
57 | /**
58 | * Returns a string representation of the given morpheme.
59 | *
60 | * @param morpheme
61 | * the morpheme to format
62 | *
63 | * @return a string representation of the morpheme
64 | */
65 | public abstract String formatMorpheme(Morpheme morpheme);
66 |
67 | /**
68 | * Enables detailed output by setting {@code showDetails} to {@code true}.
69 | *
70 | * This method is called when the {@code -a} option is specified.
71 | */
72 | public void showDetails() {
73 | showDetails = true;
74 | }
75 |
76 | void printSentence(List36 | * when the word ID is out of range, the behavior is undefined. 37 | * 38 | * @param wordId 39 | * the word ID of the morpheme 40 | * @return the left-ID of the morpheme 41 | */ 42 | short getLeftId(int wordId); 43 | 44 | /** 45 | * Returns the right-ID of the morpheme specified by the word ID. 46 | * 47 | *
48 | * When the word ID is out of range, the behavior is undefined. 49 | * 50 | * @param wordId 51 | * the word ID of the morpheme 52 | * @return the right-ID of the morpheme. 53 | */ 54 | short getRightId(int wordId); 55 | 56 | /** 57 | * Returns the word occurrence cost of the morpheme specified by the word ID. 58 | * 59 | *
60 | * When the word ID is out of range, the behavior is undefined. 61 | * 62 | * @param wordId 63 | * the word ID of the morpheme 64 | * @return the word occurrence cost 65 | */ 66 | short getCost(int wordId); 67 | 68 | /** 69 | * Returns the information of the morpheme specified by the word ID. 70 | * 71 | *
72 | * when the word ID is out of range, the behavior is undefined. 73 | * 74 | * @param wordId 75 | * the word ID of the morpheme 76 | * @return the informations of the morpheme 77 | * @see WordInfo 78 | */ 79 | WordInfo getWordInfo(int wordId); 80 | 81 | /** 82 | * Returns the ID of the dictionary containing the morpheme specified by the 83 | * word ID. 84 | * 85 | * If the morpheme is in the system dictionary, it returns {@code 0}. 86 | * 87 | * @param wordId 88 | * the word ID of the morpheme 89 | * @return the dictionary ID 90 | * @deprecated use {@link WordId#dic(int)} 91 | */ 92 | @Deprecated 93 | default int getDictionaryId(int wordId) { 94 | return WordId.dic(wordId); 95 | } 96 | 97 | /** 98 | * Returns the number of morphemes in the dictionary. 99 | * 100 | * @return the number of morphemes 101 | */ 102 | int size(); 103 | } 104 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/StringUtil.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | import java.io.IOException; 20 | import java.io.InputStream; 21 | import java.io.InputStreamReader; 22 | import java.net.URL; 23 | import java.nio.ByteBuffer; 24 | import java.nio.ByteOrder; 25 | import java.nio.CharBuffer; 26 | import java.nio.charset.StandardCharsets; 27 | import java.nio.file.Files; 28 | import java.nio.file.Path; 29 | import java.util.Arrays; 30 | 31 | public class StringUtil { 32 | private StringUtil() { 33 | } 34 | 35 | public static String readFully(URL url) throws IOException { 36 | try (InputStream inputStream = url.openStream()) { 37 | return readFully(inputStream); 38 | } 39 | } 40 | 41 | public static String readFully(Path path) throws IOException { 42 | try (InputStream is = Files.newInputStream(path)) { 43 | return readFully(is); 44 | } 45 | } 46 | 47 | public static String readFully(InputStream stream) throws IOException { 48 | InputStreamReader isr = new InputStreamReader(stream, StandardCharsets.UTF_8); 49 | StringBuilder sb = new StringBuilder(); 50 | CharBuffer cb = CharBuffer.allocate(1024); 51 | while (isr.read(cb) != -1) { 52 | cb.flip(); 53 | sb.append(cb); 54 | cb.clear(); 55 | } 56 | return sb.toString(); 57 | } 58 | 59 | public static ByteBuffer readAllBytes(URL url) throws IOException { 60 | return readAllBytes(url, ByteOrder.LITTLE_ENDIAN); 61 | } 62 | 63 | public static ByteBuffer readAllBytes(URL url, ByteOrder order) throws IOException { 64 | try (InputStream is = url.openStream()) { 65 | return readAllBytes(is, order); 66 | } 67 | } 68 | 69 | public static ByteBuffer readAllBytes(InputStream inputStream) throws IOException { 70 | return readAllBytes(inputStream, ByteOrder.LITTLE_ENDIAN); 71 | } 72 | 73 | public static ByteBuffer readAllBytes(InputStream inputStream, ByteOrder order) throws IOException { 74 | byte[] buffer = new byte[inputStream.available() + 1024]; 75 | int offset = 0; 76 | 77 | while (true) { 78 | int nread = inputStream.read(buffer, 
offset, buffer.length - offset); 79 | if (nread >= 0) { 80 | offset += nread; 81 | if (offset == buffer.length) { 82 | buffer = Arrays.copyOf(buffer, buffer.length * 2); 83 | } 84 | } else { 85 | break; 86 | } 87 | } 88 | ByteBuffer bbuf = ByteBuffer.wrap(buffer); 89 | bbuf.limit(offset); 90 | bbuf.order(order); 91 | return bbuf; 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/test/dict/lex.csv: -------------------------------------------------------------------------------- 1 | た,1,1,8729,た,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,た,*,A,*,*,*,* 2 | に,2,2,11406,に,助詞,接続助詞,*,*,*,*,ニ,に,*,A,*,*,*,* 3 | に,3,3,4481,に,助詞,格助詞,*,*,*,*,ニ,に,*,A,*,*,*,* 4 | 京都,6,6,5293,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*,1/5 5 | 東,7,7,4675,東,名詞,普通名詞,一般,*,*,*,ヒガシ,東,*,A,*,*,*,* 6 | 東京,6,6,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* 7 | 東京都,6,8,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,5/9,*,5/9,* 8 | 行く,4,4,5105,行く,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,*,A,*,*,*,* 9 | 行っ,5,5,5122,行っ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,7,A,*,*,*,* 10 | 都,8,8,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,* 11 | アイ,7,7,4675,アイ,名詞,普通名詞,一般,*,*,*,アイ,アイ,*,A,*,*,*,* 12 | アイウ,7,7,4675,アイウ,名詞,普通名詞,一般,*,*,*,アイウ,アイウ,*,A,*,*,*,* 13 | アイアイウ,6,6,32766,アイウ,名詞,固有名詞,地名,一般,*,*,アイアイウ,アイアイウ,*,A,*,*,*,* 14 | 0,9,9,2478,0,名詞,数詞,*,*,*,*,ゼロ,0,*,A,*,*,*,* 15 | 1,9,9,2478,1,名詞,数詞,*,*,*,*,イチ,1,*,A,*,*,*,* 16 | 2,9,9,2478,2,名詞,数詞,*,*,*,*,ニ,2,*,A,*,*,*,* 17 | 3,9,9,2478,3,名詞,数詞,*,*,*,*,サン,3,*,A,*,*,*,* 18 | 4,9,9,2478,4,名詞,数詞,*,*,*,*,ヨン,4,*,A,*,*,*,* 19 | 5,9,9,2478,5,名詞,数詞,*,*,*,*,ゴ,5,*,A,*,*,*,* 20 | 6,9,9,2478,6,名詞,数詞,*,*,*,*,ロク,6,*,A,*,*,*,* 21 | 7,9,9,2478,7,名詞,数詞,*,*,*,*,ナナ,7,*,A,*,*,*,* 22 | 8,9,9,2478,8,名詞,数詞,*,*,*,*,ハチ,8,*,A,*,*,*,* 23 | 9,9,9,2478,9,名詞,数詞,*,*,*,*,キュウ,9,*,A,*,*,*,* 24 | 〇,9,9,2478,〇,名詞,数詞,*,*,*,*,ゼロ,〇,*,A,*,*,*,* 25 | 一,9,9,2478,一,名詞,数詞,*,*,*,*,イチ,一,*,A,*,*,*,* 26 | 二,9,9,2478,二,名詞,数詞,*,*,*,*,ニ,二,*,A,*,*,*,* 27 | 三,9,9,2478,三,名詞,数詞,*,*,*,*,サン,三,*,A,*,*,*,* 28 | 
四,9,9,2478,四,名詞,数詞,*,*,*,*,ヨン,四,*,A,*,*,*,* 29 | 五,9,9,2478,五,名詞,数詞,*,*,*,*,ゴ,五,*,A,*,*,*,* 30 | 六,9,9,2478,六,名詞,数詞,*,*,*,*,ロク,六,*,A,*,*,*,* 31 | 七,9,9,2478,七,名詞,数詞,*,*,*,*,ナナ,七,*,A,*,*,*,* 32 | 八,9,9,2478,八,名詞,数詞,*,*,*,*,ハチ,八,*,A,*,*,*,* 33 | 九,9,9,2478,九,名詞,数詞,*,*,*,*,キュウ,九,*,A,*,*,*,* 34 | 六三四,6,6,0,六三四,名詞,固有名詞,地名,一般,*,*,ムサシ,六三四,*,A,*,*,*,* 35 | いく,4,4,5105,いく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,*,A,*,*,*,* 36 | いっ,5,5,5122,いっ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,34,A,*,*,*,* 37 | 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,2478,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,*,A,*,*,*,* 38 | 
特a,8,8,2914,特A,名詞,普通名詞,一般,*,*,*,トクエー,特A,*,A,*,*,*,* 39 | な。な,8,8,2914,な。な,名詞,普通名詞,一般,*,*,*,ナナ,な。な,*,A,*,*,*,* -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/dictionary/POS.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi.dictionary; 18 | 19 | import java.util.AbstractList; 20 | import java.util.Arrays; 21 | import java.util.List; 22 | 23 | /** 24 | * Part-of-Speech 25 | *
26 | * A Sudachi POS has 6 components: 4 layers of POS tags, a conjugation type,
27 | * and a conjugation form.
28 | */
29 | public final class POS extends AbstractList
31 | * This plugin combines a continuous sequence of prolonged sound marks into a
32 | * single character.
33 | *
34 | *
35 | * {@link Dictionary} initializes this plugin with {@link Settings}. It can be
36 | * referred to as {@link Plugin#settings}.
37 | *
38 | *
39 | * The following is an example of settings.
40 | *
41 | *
56 | * With the above example settings, the plugin rewrites the input "エーービ〜〜〜シ〰〰〰〰" to
57 | * "エービーシー".
58 | */
59 | class ProlongedSoundMarkInputTextPlugin extends InputTextPlugin {
60 |
61 | private Set
30 | * The concatenated morpheme is OOV, and its part of speech must be specified in
31 | * the settings.
32 | *
33 | *
34 | * The following is an example of settings.
35 | *
36 | *
42 | * {@code
43 | * {
44 | * "class" : "com.worksap.nlp.sudachi.ProlongedSoundMarkInputTextPlugin",
45 | * "prolongedSoundMarks": ["ー", "〜", "〰"],
46 | * "replacementSymbol": "ー"
47 | * }
48 | * }
49 | *
50 | *
51 | * {@code prolongedSoundMarks} is the list of symbols to be combined.
52 | * {@code replacementSymbol} is the symbol for replacement, after combining
53 | * prolonged sound mark sequences.
54 | *
55 | *
37 | * {@code
38 | * {
39 | * "class" : "com.worksap.nlp.sudachi.JoinKatakanaOovPlugin",
40 | * "oovPOS" : [ "POS1", "POS2", ... ],
41 | * "minLength" : 3
42 | * }
43 | * }
44 | *
45 | */
46 | class JoinKatakanaOovPlugin extends PathRewritePlugin {
47 |
48 | short oovPosId;
49 | int minLength;
50 |
51 | @Override
52 | public void setUp(Grammar grammar) {
53 | List