data) {
29 | if (data.size() == 0) {
30 | throw new IllegalArgumentException("Partial POS must have at least 1 component");
31 | }
32 | if (data.size() > POS.DEPTH) {
33 | throw new IllegalArgumentException("Partial POS can have at most 6 components, was " + data);
34 | }
35 | for (String component : data) {
36 | if (component != null && component.length() > POS.MAX_COMPONENT_LENGTH) {
37 | throw new IllegalArgumentException("Component length can't be more than " + POS.MAX_COMPONENT_LENGTH
38 | + ", was " + component.length() + ":" + component);
39 | }
40 | }
41 | this.data = data;
42 | }
43 |
/**
 * Creates a partial POS from individual components.
 *
 * The components are validated by the list-based constructor this one
 * delegates to.
 *
 * @param data
 *            POS components, outermost level first; a {@code null} component
 *            is accepted by the validation
 * @throws IllegalArgumentException
 *             if the delegated validation rejects the components
 */
public PartialPOS(String... data) {
    // Arrays.asList returns a view backed by the array it is given. Cloning
    // first keeps a caller that retains the array from changing this
    // object's components after they have been validated.
    this(Arrays.asList(data.clone()));
}
47 |
48 | @Override
49 | public String get(int index) {
50 | return data.get(index);
51 | }
52 |
53 | @Override
54 | public int size() {
55 | return data.size();
56 | }
57 |
58 | boolean matches(POS pos) {
59 | for (int level = 0; level < data.size(); ++level) {
60 | String s = data.get(level);
61 | if (s == null) {
62 | continue;
63 | }
64 | if (!s.equals(pos.get(level))) {
65 | return false;
66 | }
67 | }
68 | return true;
69 | }
70 |
71 | @Override
72 | public String toString() {
73 | return String.join(",", data);
74 | }
75 |
76 | public static PartialPOS of(String... parts) {
77 | return new PartialPOS(parts);
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/Plugin.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi;
18 |
19 | abstract class Plugin {
20 |
21 | protected Settings settings;
22 |
23 | void setSettings(Settings settings) {
24 | this.settings = settings;
25 | }
26 | }
27 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/ProlongedSoundMarkInputTextPlugin.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi;
18 |
19 | import java.io.IOException;
20 | import java.util.HashSet;
21 | import java.util.Set;
22 | import java.util.List;
23 |
24 | import com.worksap.nlp.sudachi.dictionary.Grammar;
25 |
26 | /**
27 | * A plugin that rewrites the Katakana-Hiragana Prolonged Sound Mark (Chōonpu)
28 | * and similar symbols.
29 | *
30 | *
31 | * This plugin combines the continuous sequence of prolonged sound marks to 1
32 | * character.
33 | *
34 | *
35 | * {@link Dictionary} initialize this plugin with {@link Settings}. It can be
36 | * referred as {@link Plugin#settings}.
37 | *
38 | *
39 | * The following is an example of settings.
40 | *
41 | *
42 | * {@code
43 | * {
44 | * "class" : "com.worksap.nlp.sudachi.ProlongedSoundMarkInputTextPlugin",
45 | "prolongedSoundMarks": ["ー", "〜", "〰"],
46 | "replacementSymbol": "ー"
47 | * }
48 | * }
49 | *
50 | *
51 | * {@code prolongedSoundMarks} is the list of symbols to be combined.
52 | * {@code replacementSymbol} is the symbol for replacement, after combining
53 | * prolonged sound mark sequences.
54 | *
55 | *
56 | * With above setting example, the plugin rewrites input "エーービ〜〜〜シ〰〰〰〰" to
57 | * "エービーシー".
58 | */
59 | class ProlongedSoundMarkInputTextPlugin extends InputTextPlugin {
60 |
61 | private Set prolongedSoundMarkSet = new HashSet<>();
62 | private String replacementSymbol;
63 |
64 | @Override
65 | public void setUp(Grammar Grammar) throws IOException {
66 | List prolongedSoundMarkStrings = settings.getStringList("prolongedSoundMarks");
67 | for (String s : prolongedSoundMarkStrings) {
68 | prolongedSoundMarkSet.add(s.codePointAt(0));
69 | }
70 | replacementSymbol = settings.getString("replacementSymbol");
71 | }
72 |
73 | @Override
74 | public void rewrite(InputTextBuilder builder) {
75 | String text = builder.getText();
76 |
77 | int n = text.length();
78 | int offset = 0;
79 | int markStartIndex = n;
80 | boolean isProlongedSoundMark = false;
81 | for (int i = 0; i < n; i++) {
82 | int cp = text.codePointAt(i);
83 | if (!isProlongedSoundMark && prolongedSoundMarkSet.contains(cp)) {
84 | isProlongedSoundMark = true;
85 | markStartIndex = i;
86 | } else if (isProlongedSoundMark && !prolongedSoundMarkSet.contains(cp)) {
87 | if ((i - markStartIndex) > 1) {
88 | builder.replace(markStartIndex - offset, i - offset, replacementSymbol);
89 | offset += i - markStartIndex - 1;
90 | }
91 | isProlongedSoundMark = false;
92 | }
93 | }
94 | if (isProlongedSoundMark && (n - markStartIndex) > 1) {
95 | builder.replace(markStartIndex - offset, n - offset, replacementSymbol);
96 | }
97 | }
98 | }
99 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/SentenceSplittingAnalysis.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2023 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi;
18 |
19 | import com.worksap.nlp.sudachi.dictionary.LexiconSet;
20 | import com.worksap.nlp.sudachi.sentdetect.SentenceDetector;
21 |
22 | import java.util.ArrayList;
23 | import java.util.Iterator;
24 |
25 | /*internal*/ class SentenceSplittingAnalysis implements SentenceDetector.NonBreakCheker {
26 | private final SentenceDetector detector = new SentenceDetector();
27 |
28 | private final Tokenizer.SplitMode mode;
29 | private final JapaneseTokenizer tokenizer;
30 | final ArrayList result = new ArrayList<>();
31 |
32 | SentenceSplittingAnalysis(Tokenizer.SplitMode mode, JapaneseTokenizer tokenizer) {
33 | this.mode = mode;
34 | this.tokenizer = tokenizer;
35 | }
36 |
37 | UTF8InputText input;
38 | int bos;
39 |
40 | int tokenizeBuffer(CharSequence buffer) {
41 | UTF8InputText input = tokenizer.buildInputText(buffer);
42 | String normalized = input.getText();
43 | this.input = input;
44 |
45 | int bos = 0;
46 | int length;
47 |
48 | this.bos = bos;
49 | while ((length = detector.getEos(normalized, this)) > 0) {
50 | int eos = bos + length;
51 | if (eos < normalized.length()) {
52 | eos = input.getNextInOriginal(eos - 1);
53 | length = eos - bos;
54 | }
55 | UTF8InputText sentence = input.slice(bos, eos);
56 | result.add(tokenizer.tokenizeSentence(mode, sentence));
57 | normalized = normalized.substring(length);
58 | bos = eos;
59 | this.bos = bos;
60 | }
61 |
62 | // buffer is full, need to clean it up
63 | if (length < 0 && buffer.length() == -length) {
64 | result.add(tokenizer.tokenizeSentence(mode, input));
65 | return -length;
66 | }
67 |
68 | return length;
69 | }
70 |
71 | int bosPosition() {
72 | return input.textIndexToOriginalTextIndex(bos);
73 | }
74 |
75 | @Override
76 | public boolean hasNonBreakWord(int length) {
77 | UTF8InputText inp = input;
78 | int byteEOS = inp.getCodePointsOffsetLength(0, bos + length);
79 | byte[] bytes = inp.getByteText();
80 | LexiconSet lexicon = tokenizer.lexicon;
81 | for (int i = Math.max(0, byteEOS - 64); i < byteEOS; i++) {
82 | Iterator iterator = lexicon.lookup(bytes, i);
83 | while (iterator.hasNext()) {
84 | int[] r = iterator.next();
85 | int l = r[1];
86 | if (l > byteEOS || (l == byteEOS && bos + length - inp.modifiedOffset(i) > 1)) {
87 | return true;
88 | }
89 | }
90 | }
91 | return false;
92 | }
93 | }
94 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/SimpleMorphemeFormatter.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi;
18 |
19 | import java.io.IOException;
20 | import java.util.Arrays;
21 |
22 | /**
23 | * Provides a formatter for {@link Morpheme}
24 | *
25 | *
26 | * The following is an example of settings.
27 | *
28 | *
29 | * {@code
30 | * {
31 | * "class" : "com.worksap.nlp.sudachi.SimpleFormatter",
32 | * "delimiter" : "\n",
33 | * "eos" : "\nEOS\n",
34 | * "columnDelimiter" : "\t"
35 | * }
36 | * }
37 | *
38 | *
39 | * {@code delimiter} is the delimiter of the morphemes. {@code eos} is printed
40 | * at the position of EOS. {@code columnDelimiter} is the delimiter of the
41 | * fields.
42 | */
43 | public class SimpleMorphemeFormatter extends MorphemeFormatterPlugin {
44 |
45 | protected String columnDelimiter;
46 |
47 | @Override
48 | public void setUp() throws IOException {
49 | super.setUp();
50 | columnDelimiter = settings.getString("columnDelimiter", "\t");
51 | }
52 |
53 | @Override
54 | public String formatMorpheme(Morpheme morpheme) {
55 | String output = morpheme.surface() + columnDelimiter + String.join(",", morpheme.partOfSpeech())
56 | + columnDelimiter + morpheme.normalizedForm();
57 | if (showDetails) {
58 | output += columnDelimiter + morpheme.dictionaryForm() + columnDelimiter + morpheme.readingForm()
59 | + columnDelimiter + morpheme.getDictionaryId() + columnDelimiter
60 | + Arrays.toString(morpheme.getSynonymGroupIds()) + columnDelimiter
61 | + ((morpheme.isOOV()) ? "(OOV)" : "");
62 | }
63 | return output;
64 | }
65 | }
66 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/SimpleOovProviderPlugin.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi;
18 |
19 | import com.worksap.nlp.sudachi.dictionary.Grammar;
20 | import com.worksap.nlp.sudachi.dictionary.POS;
21 | import com.worksap.nlp.sudachi.dictionary.WordInfo;
22 |
23 | import java.util.List;
24 |
25 | /**
26 | * Provides the OOVs which consists of a maximum run of characters of a single
27 | * character class. Does not produce OOVs if there was any other word at the
28 | * boundary.
29 | *
30 | *
31 | * The following is an example of settings.
32 | *
33 | *
34 | * {@code
35 | * {
36 | * "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin",
37 | * "oovPOS" : [ "補助記号", "一般", "*", "*", "*", "*" ],
38 | * "leftId" : 5968,
39 | * "rigthId" : 5968,
40 | * "cost" : 3857
41 | * }
42 | * }
43 | *
44 | *
45 | * {@code oovPOS} is the part of speech of the OOVs. {@code leftId} is the
46 | * left-ID of the OOVs. {@code rightId} is the right-ID of the OOVs.
47 | * {@code cost} is the cost of the OOVs.
48 | */
49 | class SimpleOovProviderPlugin extends OovProviderPlugin {
50 |
51 | short oovPOSId;
52 | short leftId;
53 | short rightId;
54 | short cost;
55 |
56 | @Override
57 | public void setUp(Grammar grammar) {
58 | POS pos = new POS(settings.getStringList("oovPOS"));
59 | leftId = (short) settings.getInt("leftId");
60 | rightId = (short) settings.getInt("rightId");
61 | cost = (short) settings.getInt("cost");
62 | String userPosMode = settings.getString(USER_POS, USER_POS_FORBID);
63 | oovPOSId = posIdOf(grammar, pos, userPosMode);
64 | }
65 |
66 | @Override
67 | public int provideOOV(InputText inputText, int offset, long otherWords, List nodes) {
68 | if (otherWords == 0) {
69 | LatticeNodeImpl node = createNode();
70 | node.setParameter(leftId, rightId, cost);
71 | int length = inputText.getWordCandidateLength(offset);
72 | String s = inputText.getSubstring(offset, offset + length);
73 | WordInfo info = new WordInfo(s, (short) length, oovPOSId, s, s, "");
74 | node.setWordInfo(info);
75 | nodes.add(node);
76 | return 1;
77 | } else {
78 | return 0;
79 | }
80 | }
81 | }
82 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/StringUtil.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi;
18 |
19 | import java.io.IOException;
20 | import java.io.InputStream;
21 | import java.io.InputStreamReader;
22 | import java.net.URL;
23 | import java.nio.ByteBuffer;
24 | import java.nio.ByteOrder;
25 | import java.nio.CharBuffer;
26 | import java.nio.charset.StandardCharsets;
27 | import java.nio.file.Files;
28 | import java.nio.file.Path;
29 | import java.util.Arrays;
30 |
/**
 * Static helpers that read an entire URL, file or stream into a
 * {@link String} (decoded as UTF-8) or into a {@link ByteBuffer}.
 */
public class StringUtil {
    private StringUtil() {
    }

    /**
     * Reads everything behind the URL as UTF-8 text.
     *
     * @param url
     *            location to read; its stream is closed before returning
     * @return the decoded content
     * @throws IOException
     *             if opening or reading the stream fails
     */
    public static String readFully(URL url) throws IOException {
        try (InputStream in = url.openStream()) {
            return readFully(in);
        }
    }

    /**
     * Reads a whole file as UTF-8 text.
     *
     * @param path
     *            file to read
     * @return the decoded content
     * @throws IOException
     *             if opening or reading the file fails
     */
    public static String readFully(Path path) throws IOException {
        try (InputStream in = Files.newInputStream(path)) {
            return readFully(in);
        }
    }

    /**
     * Reads the stream to its end and decodes it as UTF-8. The stream is not
     * closed by this method.
     *
     * @param stream
     *            stream to drain
     * @return the decoded content
     * @throws IOException
     *             if reading fails
     */
    public static String readFully(InputStream stream) throws IOException {
        InputStreamReader reader = new InputStreamReader(stream, StandardCharsets.UTF_8);
        StringBuilder text = new StringBuilder();
        char[] chunk = new char[1024];
        for (int count = reader.read(chunk); count != -1; count = reader.read(chunk)) {
            text.append(chunk, 0, count);
        }
        return text.toString();
    }

    /**
     * Reads everything behind the URL into a little-endian buffer.
     *
     * @param url
     *            location to read
     * @return buffer positioned at 0 whose limit is the number of bytes read
     * @throws IOException
     *             if opening or reading the stream fails
     */
    public static ByteBuffer readAllBytes(URL url) throws IOException {
        return readAllBytes(url, ByteOrder.LITTLE_ENDIAN);
    }

    /**
     * Reads everything behind the URL into a buffer with the given byte order.
     *
     * @param url
     *            location to read; its stream is closed before returning
     * @param order
     *            byte order to set on the returned buffer
     * @return buffer positioned at 0 whose limit is the number of bytes read
     * @throws IOException
     *             if opening or reading the stream fails
     */
    public static ByteBuffer readAllBytes(URL url, ByteOrder order) throws IOException {
        try (InputStream in = url.openStream()) {
            return readAllBytes(in, order);
        }
    }

    /**
     * Drains the stream into a little-endian buffer.
     *
     * @param inputStream
     *            stream to drain; not closed by this method
     * @return buffer positioned at 0 whose limit is the number of bytes read
     * @throws IOException
     *             if reading fails
     */
    public static ByteBuffer readAllBytes(InputStream inputStream) throws IOException {
        return readAllBytes(inputStream, ByteOrder.LITTLE_ENDIAN);
    }

    /**
     * Drains the stream into a heap buffer with the given byte order.
     *
     * @param inputStream
     *            stream to drain; not closed by this method
     * @param order
     *            byte order to set on the returned buffer
     * @return buffer positioned at 0 whose limit is the number of bytes read
     * @throws IOException
     *             if reading fails
     */
    public static ByteBuffer readAllBytes(InputStream inputStream, ByteOrder order) throws IOException {
        // available() is only a sizing hint; the array doubles whenever it fills.
        byte[] storage = new byte[inputStream.available() + 1024];
        int filled = 0;

        int count;
        while ((count = inputStream.read(storage, filled, storage.length - filled)) >= 0) {
            filled += count;
            if (filled == storage.length) {
                storage = Arrays.copyOf(storage, storage.length * 2);
            }
        }
        // The backing array may be larger than the data; the limit marks its end.
        ByteBuffer wrapped = ByteBuffer.wrap(storage, 0, filled);
        wrapped.order(order);
        return wrapped;
    }
}
94 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/WordId.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi;
18 |
/**
 * Packs a dictionary id and a word id into one {@code int}: the dictionary id
 * occupies the top 4 bits and the word id the low 28 bits.
 */
public class WordId {
    private WordId() {
    }

    /** Number of low bits reserved for the word id. */
    private static final int DIC_SHIFT = 28;

    /**
     * Internal word ids can't be larger than this number
     */
    public static final int MAX_WORD_ID = 0x0fffffff;

    /**
     * Dictionary ids can't be larger than this number
     */
    public static final int MAX_DIC_ID = 0xe;

    /**
     * Combines both parts without validating either of them.
     *
     * @param dic
     *            dictionary id
     * @param word
     *            word id inside the dictionary
     * @return {@code dic} shifted into the top bits, OR-ed with {@code word}
     */
    public static int makeUnchecked(int dic, int word) {
        return dicIdMask(dic) | word;
    }

    /**
     * Make combined WordId from dictionary and internal parts. Only the upper
     * bounds are checked; negative arguments are not rejected.
     *
     * @param dic
     *            dictionary id. 0 is system, 1 and above are user.
     * @param word
     *            word id inside the dictionary.
     * @return combined word id.
     * @throws IndexOutOfBoundsException
     *             if {@code word} exceeds {@link #MAX_WORD_ID} or {@code dic}
     *             exceeds {@link #MAX_DIC_ID}
     */
    public static int make(int dic, int word) {
        boolean wordFits = word <= MAX_WORD_ID;
        if (!wordFits) {
            throw new IndexOutOfBoundsException("wordId is too large: " + word);
        }
        boolean dicFits = dic <= MAX_DIC_ID;
        if (!dicFits) {
            throw new IndexOutOfBoundsException("dictionaryId is too large: " + dic);
        }
        return makeUnchecked(dic, word);
    }

    /**
     * Extract dictionary number from the combined word id
     *
     * @param wordId
     *            combined word id
     * @return dictionary number
     */
    public static int dic(int wordId) {
        return wordId >>> DIC_SHIFT;
    }

    /**
     * Extract internal word id from the combined word id
     *
     * @param wordId
     *            combined word id
     * @return internal word id
     */
    public static int word(int wordId) {
        return wordId & MAX_WORD_ID;
    }

    /**
     * Moves a dictionary id into the position it has inside a combined id.
     *
     * @param dicId
     *            dictionary id
     * @return {@code dicId} shifted left by 28 bits
     */
    public static int dicIdMask(int dicId) {
        return dicId << DIC_SHIFT;
    }

    /**
     * Replaces the dictionary part of a combined id.
     *
     * @param wordId
     *            combined word id whose word part is kept
     * @param dicIdMask
     *            already shifted dictionary part, see {@link #dicIdMask(int)}
     * @return the word part of {@code wordId} OR-ed with {@code dicIdMask}
     */
    public static int applyMask(int wordId, int dicIdMask) {
        int wordPart = wordId & MAX_WORD_ID;
        return wordPart | dicIdMask;
    }
}
88 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/WordMask.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2022 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi;
18 |
/**
 * Helpers for a set of 1-based word positions stored as bits of a
 * {@code long}: position {@code n} is kept in bit {@code n - 1}, and every
 * position above 64 shares the highest bit.
 */
public class WordMask {
    public static final int MAX_LENGTH = 63;

    // instance creation is forbidden
    private WordMask() {

    }

    /**
     * Add n-th element to wordMask
     *
     * @param positions
     *            current mask of word positions
     * @param position
     *            new position to add
     * @return position mask with the new element added
     */
    public static long addNth(long positions, int position) {
        long bit = nth(position);
        return positions | bit;
    }

    /**
     * Create a word mask with nth position set
     *
     * @param position
     *            number of set position; expected to be positive, which is
     *            checked only by an {@code assert}
     * @return a word mask bitset
     */
    public static long nth(int position) {
        assert position > 0;
        int bitIndex = position - 1;
        if (bitIndex > MAX_LENGTH) {
            // positions beyond the capacity of a long collapse onto the top bit
            bitIndex = MAX_LENGTH;
        }
        return 1L << bitIndex;
    }

    /**
     * Checks that a word mask has nth position set
     *
     * @param positions
     *            word mask of positions
     * @param position
     *            position to check
     * @return whether the checked position was included in the set
     */
    public static boolean hasNth(long positions, int position) {
        long bit = nth(position);
        return (positions & bit) != 0;
    }
}
66 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/WordSegmentationFormatter.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi;
18 |
19 | import java.io.IOException;
20 | import java.io.PrintStream;
21 | import java.util.List;
22 |
23 | /**
24 | * Provides a formatter for {@link Morpheme}
25 | *
26 | *
27 | * The following is an example of settings.
28 | *
29 | *
30 | * {@code
31 | * {
32 | * "class" : "com.worksap.nlp.sudachi.SurfaceFormatter",
33 | * "delimiter" : " ",
34 | * "eos" : "\n",
35 | * }
36 | * }
37 | *
38 | *
39 | * {@code delimiter} is the delimiter of the morphemes. {@code eos} is printed
40 | * at the position of EOS.
41 | */
42 | public class WordSegmentationFormatter extends MorphemeFormatterPlugin {
43 |
44 | @Override
45 | public void setUp() throws IOException {
46 | super.setUp();
47 | delimiter = settings.getString("delimiter", " ");
48 | eosString = settings.getString("eos", "\n");
49 | }
50 |
51 | @Override
52 | public String formatMorpheme(Morpheme morpheme) {
53 | return morpheme.surface();
54 | }
55 |
56 | @Override
57 | void printSentence(List sentence, PrintStream output) {
58 | boolean isFirst = true;
59 | for (Morpheme m : sentence) {
60 | String morpheme = formatMorpheme(m);
61 | if (morpheme.equals("")) {
62 | continue;
63 | }
64 | if (morpheme.equals(delimiter)) {
65 | continue;
66 | }
67 | if (isFirst) {
68 | isFirst = false;
69 | } else {
70 | output.print(delimiter);
71 | }
72 | output.print(morpheme);
73 | }
74 | output.print(eosString);
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/BinaryDictionary.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary;
18 |
19 | import java.io.Closeable;
20 | import java.io.IOException;
21 | import java.nio.ByteBuffer;
22 |
23 | import com.worksap.nlp.sudachi.Config;
24 | import com.worksap.nlp.sudachi.MMap;
25 |
26 | public class BinaryDictionary implements Closeable, DictionaryAccess {
27 |
28 | private final ByteBuffer bytes;
29 | private final DictionaryHeader header;
30 | private final GrammarImpl grammar;
31 | private final DoubleArrayLexicon lexicon;
32 |
33 | public BinaryDictionary(String fileName) throws IOException {
34 | this(MMap.map(fileName));
35 | }
36 |
37 | public BinaryDictionary(ByteBuffer dictionary) throws IOException {
38 | int offset = 0;
39 | bytes = dictionary;
40 |
41 | header = new DictionaryHeader(bytes, offset);
42 | offset += header.storageSize();
43 |
44 | long version = header.getVersion();
45 | if (DictionaryVersion.hasGrammar(version)) {
46 | grammar = new GrammarImpl(bytes, offset);
47 | offset += grammar.storageSize();
48 | } else if (header.isUserDictionary()) {
49 | grammar = new GrammarImpl();
50 | } else {
51 | MMap.unmap(bytes);
52 | throw new IOException("invalid dictionary");
53 | }
54 |
55 | lexicon = new DoubleArrayLexicon(bytes, offset, DictionaryVersion.hasSynonymGroupIds(version));
56 | }
57 |
58 | public static BinaryDictionary loadSystem(String fileName) throws IOException {
59 | return loadSystem(MMap.map(fileName));
60 | }
61 |
62 | public static BinaryDictionary loadUser(String fileName) throws IOException {
63 | return loadUser(MMap.map(fileName));
64 | }
65 |
66 | public static BinaryDictionary loadSystem(ByteBuffer buffer) throws IOException {
67 | BinaryDictionary dict = new BinaryDictionary(buffer);
68 | if (!dict.getDictionaryHeader().isSystemDictionary()) {
69 | dict.close();
70 | throw new IOException("invalid system dictionary");
71 | }
72 | return dict;
73 | }
74 |
75 | public static BinaryDictionary loadUser(ByteBuffer buffer) throws IOException {
76 | BinaryDictionary dict = new BinaryDictionary(buffer);
77 | if (!dict.getDictionaryHeader().isUserDictionary()) {
78 | dict.close();
79 | throw new IOException("invalid user dictionary");
80 | }
81 | return dict;
82 | }
83 |
84 | public static BinaryDictionary loadSystem(Config.Resource resource) throws IOException {
85 | return resource.consume(res -> loadSystem(res.asByteBuffer()));
86 | }
87 |
88 | public static BinaryDictionary loadUser(Config.Resource resource) throws IOException {
89 | return resource.consume(res -> loadUser(res.asByteBuffer()));
90 | }
91 |
92 | @Override
93 | public void close() throws IOException {
94 | MMap.unmap(bytes);
95 | }
96 |
97 | public DictionaryHeader getDictionaryHeader() {
98 | return header;
99 | }
100 |
101 | public GrammarImpl getGrammar() {
102 | return grammar;
103 | }
104 |
105 | public DoubleArrayLexicon getLexicon() {
106 | return lexicon;
107 | }
108 | }
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/CategoryType.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary;
18 |
/**
 * Categories of characters.
 *
 * These categories are used in the
 * {@link com.worksap.nlp.sudachi.OovProviderPlugin} and
 * {@link com.worksap.nlp.sudachi.PathRewritePlugin}.
 *
 * <p>
 * You can define the range of each category in the file specified by
 * "characterDefinitionFile" in the settings.
 *
 * <p>
 * Each category's ID is a distinct single bit.
 */
public enum CategoryType {
    /** The fall back category. */
    DEFAULT(1),
    /** White spaces. */
    SPACE(1 << 1),
    /** CJKV ideographic characters. */
    KANJI(1 << 2),
    /** Symbols. */
    SYMBOL(1 << 3),
    /** Numerical characters. */
    NUMERIC(1 << 4),
    /** Latin alphabets. */
    ALPHA(1 << 5),
    /** Hiragana characters. */
    HIRAGANA(1 << 6),
    /** Katakana characters. */
    KATAKANA(1 << 7),
    /** Kanji numeric characters. */
    KANJINUMERIC(1 << 8),
    /** Greek alphabets. */
    GREEK(1 << 9),
    /** Cyrillic alphabets. */
    CYRILLIC(1 << 10),
    /** User defined category. */
    USER1(1 << 11),
    /** User defined category. */
    USER2(1 << 12),
    /** User defined category. */
    USER3(1 << 13),
    /** User defined category. */
    USER4(1 << 14),
    /** Characters that cannot be the beginning of word */
    NOOOVBOW(1 << 15);

    private final int id;

    CategoryType(int id) {
        this.id = id;
    }

    /**
     * Returns the integer ID number of the category.
     *
     * @return the ID number of the category
     */
    public int getId() {
        return id;
    }

    /**
     * Looks up the category that carries exactly the given ID.
     *
     * @param id
     *            the ID number of category
     * @return the category whose ID equals {@code id}, or {@code null} if no
     *         category has that ID
     */
    public static CategoryType getType(int id) {
        CategoryType found = null;
        for (CategoryType candidate : values()) {
            if (candidate.id == id) {
                found = candidate;
                break;
            }
        }
        return found;
    }
}
97 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/Connection.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary;
18 |
19 | import java.nio.ShortBuffer;
20 |
/**
 * CRF weights compressed into 2D u16 matrix in MeCab manner.
 *
 * The weight for a {@code (left, right)} pair lives at index
 * {@code right * leftSize + left} of the backing buffer. All accesses use
 * absolute indices, so the buffer's position is never read or changed by this
 * class.
 */
public final class Connection {
    private final ShortBuffer matrix;
    private final int leftSize;
    private final int rightSize;

    /**
     * @param matrix
     *            backing storage of the weights; it is used as is, not copied
     * @param leftSize
     *            number of left connection IDs
     * @param rightSize
     *            number of right connection IDs
     */
    public Connection(ShortBuffer matrix, int leftSize, int rightSize) {
        this.matrix = matrix;
        this.leftSize = leftSize;
        this.rightSize = rightSize;
    }

    /** Translates a (left, right) pair into an absolute index of the buffer. */
    private int ix(int left, int right) {
        assert left < leftSize;
        assert right < rightSize;
        return right * leftSize + left;
    }

    /**
     *
     * @param left
     *            left connection index
     * @param right
     *            right connection index
     * @return connection weight in the matrix
     */
    public short cost(int left, int right) {
        return matrix.get(ix(left, right));
    }

    /**
     * @return number of left connection IDs
     */
    public int getLeftSize() {
        return leftSize;
    }

    /**
     * @return number of right connection IDs
     */
    public int getRightSize() {
        return rightSize;
    }

    /**
     * Overwrites the weight of a single (left, right) pair.
     *
     * @param left
     *            left connection index
     * @param right
     *            right connection index
     * @param cost
     *            new connection weight
     */
    public void setCost(int left, int right, short cost) {
        matrix.put(ix(left, right), cost);
    }

    /**
     * @return a copy of itself with the buffer owned, instead of slice
     */
    public Connection ownedCopy() {
        int length = matrix.limit();
        ShortBuffer copy = ShortBuffer.allocate(length);
        // Copy with absolute get/put only. A relative bulk put would advance the
        // position of the shared source buffer, so a second ownedCopy() would
        // see zero remaining elements and silently return an all-zero matrix.
        for (int i = 0; i < length; i++) {
            copy.put(i, matrix.get(i));
        }
        return new Connection(copy, leftSize, rightSize);
    }

    /**
     * Checks preconditions for looking up weights with the given left ID.
     *
     * @param leftId
     *            left connection index to check
     * @throws NullPointerException
     *             if the matrix buffer is {@code null}
     * @throws IllegalArgumentException
     *             if {@code leftId} is not smaller than the left size
     */
    public void validate(int leftId) {
        if (matrix == null) {
            // should never happen, but elides compiler checks
            throw new NullPointerException("matrix");
        }

        if (leftId >= leftSize) {
            // should never happen, but adds a compiler precondition to the inlined method
            throw new IllegalArgumentException(String.format("leftId < leftSize: (%d, %d)", leftId, leftSize));
        }
    }
}
87 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryAccess.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary;
18 |
19 | /**
20 | * Marks access into dictionary internals
21 | */
22 | public interface DictionaryAccess {
23 | /**
24 | * Gets current Lexicon.
25 | *
26 | * @return Lexicon implementation
27 | */
28 | Lexicon getLexicon();
29 |
30 | /**
31 | * Gets current grammar.
32 | *
33 | * @return current Grammar
34 | */
35 | GrammarImpl getGrammar();
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderPrinter.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary;
18 |
19 | import java.io.FileInputStream;
20 | import java.io.IOException;
21 | import java.io.PrintStream;
22 | import java.nio.ByteBuffer;
23 | import java.nio.ByteOrder;
24 | import java.nio.channels.FileChannel;
25 | import java.time.Instant;
26 | import java.time.ZoneId;
27 |
28 | /**
29 | * A dictionary header printing tool.
30 | */
31 | public class DictionaryHeaderPrinter {
32 |
33 | private DictionaryHeaderPrinter() {
34 | }
35 |
36 | static void printHeader(String filename, PrintStream output) throws IOException {
37 | ByteBuffer bytes;
38 | try (FileInputStream input = new FileInputStream(filename); FileChannel inputFile = input.getChannel()) {
39 | bytes = inputFile.map(FileChannel.MapMode.READ_ONLY, 0, inputFile.size());
40 | bytes.order(ByteOrder.LITTLE_ENDIAN);
41 | }
42 | DictionaryHeader header = new DictionaryHeader(bytes, 0);
43 |
44 | output.println("filename: " + filename);
45 |
46 | if (header.isSystemDictionary()) {
47 | output.println("type: system dictionary");
48 | } else if (header.isUserDictionary()) {
49 | output.println("type: user dictionary");
50 | } else {
51 | output.println("invalid file");
52 | return;
53 | }
54 |
55 | output.println("createTime: "
56 | + Instant.ofEpochSecond(header.getCreateTime()).atZone(ZoneId.systemDefault()).toString());
57 | output.println("description: " + header.getDescription());
58 | }
59 |
60 | /**
61 | * Prints the contents of dictionary header.
62 | *
63 | * This tool requires filenames of dictionaries.
64 | *
65 | * @param args
66 | * the input filenames
67 | * @throws IOException
68 | * if IO
69 | */
70 | public static void main(String[] args) throws IOException {
71 | for (String filename : args) {
72 | printHeader(filename, System.out);
73 | }
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryVersion.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary;
18 |
/**
 * Version identifiers of dictionary files and predicates over them.
 */
public class DictionaryVersion {

    /** the first version of system dictionaries */
    public static final long SYSTEM_DICT_VERSION_1 = 0x7366d3f18bd111e7L;

    /** the second version of system dictionaries */
    public static final long SYSTEM_DICT_VERSION_2 = 0xce9f011a92394434L;

    /** the first version of user dictionaries */
    public static final long USER_DICT_VERSION_1 = 0xa50f31188bd211e7L;

    /** the second version of user dictionaries */
    public static final long USER_DICT_VERSION_2 = 0x9fdeb5a90168d868L;

    /** the third version of user dictionaries */
    public static final long USER_DICT_VERSION_3 = 0xca9811756ff64fb0L;

    private DictionaryVersion() {
    }

    /**
     * @param version
     *            version identifier to test
     * @return {@code true} if it is one of the system dictionary versions
     */
    public static boolean isSystemDictionary(long version) {
        if (version == SYSTEM_DICT_VERSION_1) {
            return true;
        }
        return version == SYSTEM_DICT_VERSION_2;
    }

    /**
     * @param version
     *            version identifier to test
     * @return {@code true} if it is one of the user dictionary versions
     */
    public static boolean isUserDictionary(long version) {
        if (version == USER_DICT_VERSION_1 || version == USER_DICT_VERSION_2) {
            return true;
        }
        return version == USER_DICT_VERSION_3;
    }

    /**
     * True for every system dictionary version and for user dictionary versions
     * 2 and 3.
     */
    static boolean hasGrammar(long version) {
        boolean newerUserDictionary = version == USER_DICT_VERSION_2 || version == USER_DICT_VERSION_3;
        return isSystemDictionary(version) || newerUserDictionary;
    }

    /**
     * True for system dictionary version 2 and user dictionary version 3 only.
     */
    static boolean hasSynonymGroupIds(long version) {
        if (version == SYSTEM_DICT_VERSION_2) {
            return true;
        }
        return version == USER_DICT_VERSION_3;
    }
}
58 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLookup.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2022 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary;
18 |
19 | import java.nio.IntBuffer;
20 |
/**
 * Common prefix lookup in a double array, exposed as a cursor-like API.
 *
 * Instead of allocating result objects, each successful {@link #next()} stores
 * the end offset and the value found in the trie into fields, which reduces GC
 * pressure. The hot loop works on local copies of the fields to reduce the
 * number of non-elidable field writes.
 */
public final class DoubleArrayLookup {
    private IntBuffer array;
    private byte[] key;
    private int limit;
    private int startOffset;
    private int offset;
    private int nodePos;
    private int nodeValue;

    /** Creates a lookup without an array; one must be set before use. */
    public DoubleArrayLookup() {
        this(null);
    }

    /**
     * @param array
     *            units of the double array
     */
    public DoubleArrayLookup(IntBuffer array) {
        this.array = array;
    }

    /**
     * Creates a lookup that is immediately positioned on the given key range.
     *
     * @param array
     *            units of the double array
     * @param key
     *            bytes to look up
     * @param offset
     *            index of the first key byte to consume
     * @param limit
     *            index one past the last key byte to consume
     */
    public DoubleArrayLookup(IntBuffer array, byte[] key, int offset, int limit) {
        this(array);
        reset(key, offset, limit);
    }

    /** Bit 8 of a unit marks that the node has a leaf. */
    private static boolean hasLeaf(int unit) {
        return (unit & 0x100) != 0;
    }

    /** The value of a unit is everything except its highest bit. */
    private static int value(int unit) {
        return unit & Integer.MAX_VALUE;
    }

    /** The label of a unit is its lowest byte combined with its highest bit. */
    private static int label(int unit) {
        return unit & 0x800000FF;
    }

    /** Bits 10 and up, shifted left by 8 more bits when bit 9 is set. */
    private static int offset(int unit) {
        int shift = (unit & 0x200) != 0 ? 8 : 0;
        return (unit >>> 10) << shift;
    }

    /**
     * Replaces the double array and restarts the lookup with the key range that
     * was set last.
     *
     * @param array
     *            units of the double array
     */
    public void setArray(IntBuffer array) {
        this.array = array;
        reset(key, startOffset, limit);
    }

    /**
     * Points the lookup at a new key range and moves back to the root node.
     *
     * @param key
     *            bytes to look up
     * @param offset
     *            index of the first key byte to consume
     * @param limit
     *            index one past the last key byte to consume
     */
    public void reset(byte[] key, int offset, int limit) {
        this.key = key;
        this.limit = limit;
        this.startOffset = offset;
        this.offset = offset;
        this.nodePos = 0;
        int rootUnit = array.get(0);
        this.nodePos = offset(rootUnit);
    }

    /**
     * Advances to the next prefix of the key that has a value in the trie.
     *
     * @return {@code true} if a prefix was found, in which case
     *         {@link #getValue()} and {@link #getOffset()} describe it;
     *         {@code false} if there are no more matching prefixes
     */
    public boolean next() {
        // work on locals, fields are written only right before returning
        final IntBuffer units = this.array;
        final byte[] bytes = this.key;
        final int end = this.limit;
        int node = this.nodePos;
        int cursor = this.offset;

        while (cursor < end) {
            final int keyByte = Byte.toUnsignedInt(bytes[cursor]);
            node ^= keyByte;
            final int unit = units.get(node);
            if (label(unit) != keyByte) {
                // mismatch: make every following call fall through the loop
                this.offset = end;
                this.nodePos = node;
                return false;
            }

            node ^= offset(unit);
            cursor++;
            if (hasLeaf(unit)) {
                nodeValue = value(units.get(node));
                this.offset = cursor;
                this.nodePos = node;
                return true;
            }
        }
        return false;
    }

    /**
     * @return value stored for the prefix found by the last successful
     *         {@link #next()}
     */
    public int getValue() {
        return nodeValue;
    }

    /**
     * @return index one past the last key byte consumed by the last successful
     *         {@link #next()}
     */
    public int getOffset() {
        return offset;
    }
}
115 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/Lexicon.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary;
18 |
19 | import com.worksap.nlp.sudachi.WordId;
20 |
21 | import java.util.Iterator;
22 |
23 | /**
24 | * The lexicon of morphemes.
25 | */
26 | public interface Lexicon {
27 |
28 | Iterator lookup(byte[] text, int offset);
29 |
30 | int getWordId(String headword, short posId, String readingForm);
31 |
32 | /**
33 | * Returns the left-ID of the morpheme specified by the word ID.
34 | *
35 | *
36 | * when the word ID is out of range, the behavior is undefined.
37 | *
38 | * @param wordId
39 | * the word ID of the morpheme
40 | * @return the left-ID of the morpheme
41 | */
42 | short getLeftId(int wordId);
43 |
44 | /**
45 | * Returns the right-ID of the morpheme specified by the word ID.
46 | *
47 | *
48 | * when the word ID is out of range, the behavior is undefined.
49 | *
50 | * @param wordId
51 | * the word ID of the morpheme
52 | * @return the right-ID of the morpheme.
53 | */
54 | short getRightId(int wordId);
55 |
56 | /**
57 | * Returns the word occurrence cost of the morpheme specified by the word ID.
58 | *
59 | *
60 | * when the word ID is out of range, the behavior is undefined.
61 | *
62 | * @param wordId
63 | * the word ID of the morpheme
64 | * @return the word occurrence cost
65 | */
66 | short getCost(int wordId);
67 |
68 | /**
69 | * Returns the informations of the morpheme specified by the word ID.
70 | *
71 | *
72 | * when the word ID is out of range, the behavior is undefined.
73 | *
74 | * @param wordId
75 | * the word ID of the morpheme
76 | * @return the informations of the morpheme
77 | * @see WordInfo
78 | */
79 | WordInfo getWordInfo(int wordId);
80 |
81 | /**
82 | * Returns the ID of the dictionary containing the morpheme specified by the
83 | * word ID.
84 | *
85 | * If the morpheme is in the system dictionary, it returns {@code 0}.
86 | *
87 | * @param wordId
88 | * the word ID of the morpheme
89 | * @return the dictionary ID
90 | * @deprecated use {@link WordId#dic(int)}
91 | */
92 | @Deprecated
93 | default int getDictionaryId(int wordId) {
94 | return WordId.dic(wordId);
95 | }
96 |
97 | /**
98 | * Returns the number of morphemes in the dictionary.
99 | *
100 | * @return the number of morphemes
101 | */
102 | int size();
103 | }
104 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/POS.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary;
18 |
19 | import java.util.AbstractList;
20 | import java.util.Arrays;
21 | import java.util.List;
22 |
/**
 * Part-of-Speech
 *
 * Sudachi POS are 6-component and consist of: 4 layers of POS tags, conjugation
 * type, conjugation form.
 *
 * Instances are immutable: the components are copied on construction and the
 * list exposes no mutators.
 */
public final class POS extends AbstractList<String> {
    public static final int DEPTH = 6;
    public static final int MAX_COMPONENT_LENGTH = 127;
    private final String[] elems;

    /**
     * @param elems
     *            non-null string array of exactly six elements
     * @throws IllegalArgumentException
     *             if the array is {@code null}, does not have exactly
     *             {@link #DEPTH} elements, contains {@code null}, or contains a
     *             component longer than {@link #MAX_COMPONENT_LENGTH}
     */
    public POS(String... elems) {
        if (elems == null) {
            throw new IllegalArgumentException("pos must not be null");
        }
        // Copy before validating: the caller keeps its array, so later writes to
        // it can neither bypass the checks nor change this instance (and its
        // hash code) after construction.
        String[] copy = elems.clone();
        if (copy.length != DEPTH) {
            throw new IllegalArgumentException(String.format("pos must have exactly 6 elements, was %s: %s",
                    copy.length, String.join(",", copy)));
        }
        for (String e : copy) {
            if (e == null) {
                throw new IllegalArgumentException("POS components can't be null");
            }

            if (e.length() > MAX_COMPONENT_LENGTH) {
                throw new IllegalArgumentException(
                        String.format("POS component had length (%d) > %d: %s", e.length(), MAX_COMPONENT_LENGTH, e));
            }
        }
        this.elems = copy;
    }

    /**
     * Creates new POS instance from elements. Elements must be 6-length string
     * list.
     *
     * @param elems
     *            POS object elements
     * @throws IllegalArgumentException
     *             under the same conditions as {@link #POS(String...)}
     */
    public POS(List<String> elems) {
        this(elems.toArray(new String[0]));
    }

    @Override
    public String get(int i) {
        return elems[i];
    }

    @Override
    public int size() {
        return DEPTH;
    }

    /**
     * Compares component-wise. Another {@code POS} is compared directly on the
     * arrays; any other object falls back to the general list comparison.
     */
    @Override
    public boolean equals(Object o) {
        if (this == o)
            return true;
        if (o instanceof POS) {
            POS strings = (POS) o;
            return Arrays.equals(elems, strings.elems);
        }
        return super.equals(o);
    }

    /**
     * Uses the hash defined by the {@link List#hashCode()} contract. Since
     * {@link #equals(Object)} accepts any list with the same components, the
     * hash code has to match that of such a list as well.
     */
    @Override
    public int hashCode() {
        return Arrays.hashCode(elems);
    }

    @Override
    public String toString() {
        return String.join(",", elems);
    }
}
103 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary;
18 |
19 | import com.worksap.nlp.sudachi.WordId;
20 |
21 | import java.nio.ByteBuffer;
22 |
23 | class WordIdTable {
24 | private final ByteBuffer bytes;
25 | private final int size;
26 | private final int offset;
27 | private int dicIdMask = 0;
28 |
29 | WordIdTable(ByteBuffer bytes, int offset) {
30 | this.bytes = bytes;
31 | size = bytes.getInt(offset);
32 | this.offset = offset + 4;
33 | }
34 |
35 | int storageSize() {
36 | return 4 + size;
37 | }
38 |
39 | Integer[] get(int index) {
40 | int length = Byte.toUnsignedInt(bytes.get(offset + index++));
41 | Integer[] result = new Integer[length];
42 | for (int i = 0; i < length; i++) {
43 | result[i] = bytes.getInt(offset + index);
44 | index += 4;
45 | }
46 | return result;
47 | }
48 |
49 | /**
50 | * Reads the word IDs to the passed WordLookup object
51 | *
52 | * @param index
53 | * index in the word array
54 | * @param lookup
55 | * object to read word IDs into
56 | * @return number of read IDs
57 | */
58 | int readWordIds(int index, WordLookup lookup) {
59 | int offset = this.offset + index;
60 | ByteBuffer bytes = this.bytes;
61 | int length = Byte.toUnsignedInt(bytes.get(offset));
62 | offset += 1;
63 | int[] result = lookup.outputBuffer(length);
64 | int dicIdMask = this.dicIdMask;
65 | for (int i = 0; i < length; i++) {
66 | int wordId = bytes.getInt(offset);
67 | result[i] = WordId.applyMask(wordId, dicIdMask);
68 | offset += 4;
69 | }
70 | return length;
71 | }
72 |
73 | void setDictionaryId(int id) {
74 | dicIdMask = WordId.dicIdMask(id);
75 | }
76 | }
77 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfoList.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary;
18 |
19 | import java.nio.Buffer;
20 | import java.nio.ByteBuffer;
21 |
22 | class WordInfoList {
23 |
24 | private final ByteBuffer bytes;
25 | private final int offset;
26 | private final int wordSize;
27 | private final boolean hasSynonymGid;
28 |
29 | WordInfoList(ByteBuffer bytes, int offset, int wordSize, boolean hasSysnoymGid) {
30 | this.bytes = bytes;
31 | this.offset = offset;
32 | this.wordSize = wordSize;
33 | this.hasSynonymGid = hasSysnoymGid;
34 | }
35 |
36 | WordInfo getWordInfo(int wordId) {
37 | ByteBuffer buf = bytes.asReadOnlyBuffer();
38 | buf.order(bytes.order());
39 | ((Buffer) buf).position(wordIdToOffset(wordId)); // a kludge for Java 9
40 |
41 | String surface = bufferToString(buf);
42 | short headwordLength = (short) bufferToStringLength(buf);
43 | short posId = buf.getShort();
44 | String normalizedForm = bufferToString(buf);
45 | if (normalizedForm.isEmpty()) {
46 | normalizedForm = surface;
47 | }
48 | int dictionaryFormWordId = buf.getInt();
49 | String readingForm = bufferToString(buf);
50 | if (readingForm.isEmpty()) {
51 | readingForm = surface;
52 | }
53 | int[] aUnitSplit = bufferToIntArray(buf);
54 | int[] bUnitSplit = bufferToIntArray(buf);
55 | int[] wordStructure = bufferToIntArray(buf);
56 |
57 | int[] synonymGids = new int[0];
58 | if (hasSynonymGid) {
59 | synonymGids = bufferToIntArray(buf);
60 | }
61 |
62 | String dictionaryForm = surface;
63 | if (dictionaryFormWordId >= 0 && dictionaryFormWordId != wordId) {
64 | WordInfo wi = getWordInfo(dictionaryFormWordId);
65 | dictionaryForm = wi.getSurface();
66 | }
67 |
68 | return new WordInfo(surface, headwordLength, posId, normalizedForm, dictionaryFormWordId, dictionaryForm,
69 | readingForm, aUnitSplit, bUnitSplit, wordStructure, synonymGids);
70 | }
71 |
72 | int size() {
73 | return wordSize;
74 | }
75 |
76 | private int wordIdToOffset(int wordId) {
77 | return bytes.getInt(offset + 4 * wordId);
78 | }
79 |
80 | private int bufferToStringLength(ByteBuffer buffer) {
81 | byte length = buffer.get();
82 | if (length < 0) {
83 | int high = Byte.toUnsignedInt(length);
84 | int low = Byte.toUnsignedInt(buffer.get());
85 | return ((high & 0x7F) << 8) | low;
86 | }
87 | return length;
88 | }
89 |
90 | private String bufferToString(ByteBuffer buffer) {
91 | int length = bufferToStringLength(buffer);
92 | char[] str = new char[length];
93 | for (int i = 0; i < length; i++) {
94 | str[i] = buffer.getChar();
95 | }
96 | return new String(str);
97 | }
98 |
99 | private int[] bufferToIntArray(ByteBuffer buffer) {
100 | int length = Byte.toUnsignedInt(buffer.get());
101 | int[] array = new int[length];
102 | for (int i = 0; i < length; i++) {
103 | array[i] = buffer.getInt();
104 | }
105 | return array;
106 | }
107 | }
108 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameterList.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary;
18 |
19 | import java.nio.Buffer;
20 | import java.nio.ByteBuffer;
21 | import java.nio.ByteOrder;
22 |
/**
 * Table of word parameters stored in a byte buffer.
 *
 * The table starts with a 4-byte element count. Every element is three 2-byte
 * values: left ID, right ID and cost. The source buffer is never written to;
 * the first cost update switches to a private copy of the elements.
 */
class WordParameterList {

    private static final int ELEMENT_SIZE = 2 * 3;

    private ByteBuffer bytes;
    private final int size;
    private int offset;
    private boolean isCopied;

    /**
     * @param bytes
     *            buffer that contains the table
     * @param offset
     *            position of the 4-byte element count
     */
    WordParameterList(ByteBuffer bytes, int offset) {
        this.bytes = bytes;
        size = bytes.getInt(offset);
        this.offset = offset + 4;
        isCopied = false;
    }

    /**
     * @return number of bytes of the count field and all elements
     */
    int storageSize() {
        return 4 + ELEMENT_SIZE * size;
    }

    /**
     * @return number of elements
     */
    int size() {
        return size;
    }

    /**
     * @param wordId
     *            index of the element
     * @return left ID of the element
     */
    short getLeftId(int wordId) {
        return bytes.getShort(offset + ELEMENT_SIZE * wordId);
    }

    /**
     * @param wordId
     *            index of the element
     * @return right ID of the element
     */
    short getRightId(int wordId) {
        return bytes.getShort(offset + ELEMENT_SIZE * wordId + 2);
    }

    /**
     * @param wordId
     *            index of the element
     * @return cost of the element
     */
    short getCost(int wordId) {
        return bytes.getShort(offset + ELEMENT_SIZE * wordId + 4);
    }

    /**
     * Changes the cost of an element. The first call copies the elements into a
     * private buffer so that the source buffer stays untouched.
     *
     * @param wordId
     *            index of the element
     * @param cost
     *            new cost
     */
    void setCost(int wordId, short cost) {
        if (!isCopied) {
            copyBuffer();
        }
        bytes.putShort(offset + ELEMENT_SIZE * wordId + 4, cost);
    }

    /**
     * NOTE(review): {@code offset} already points past the 4-byte count field
     * (and is 0 after {@link #copyBuffer()}), so this adds 4 a second time;
     * confirm against the callers whether that is intended.
     *
     * @return {@code offset + 4 + ELEMENT_SIZE * size} for the current offset
     */
    int endOffset() {
        return offset + 4 + ELEMENT_SIZE * size;
    }

    /**
     * Replaces the backing buffer by a private heap copy of the elements. Calls
     * after the first one do nothing.
     */
    synchronized void copyBuffer() {
        if (isCopied) {
            // A second copy (e.g. two threads racing through setCost) would be
            // redundant and would swap the buffer under concurrent writers.
            return;
        }
        int byteLength = ELEMENT_SIZE * size;
        ByteBuffer newBuffer = ByteBuffer.allocate(byteLength);
        // The elements are copied byte by byte, so the copy has to be read with
        // the byte order of the source, otherwise every value would change.
        newBuffer.order(bytes.order());
        ByteBuffer srcBuffer = bytes.duplicate();
        Buffer buffer = srcBuffer; // a kludge for Java 9
        buffer.position(offset);
        buffer.limit(offset + byteLength);
        newBuffer.put(srcBuffer);
        bytes = newBuffer;
        offset = 0;
        isCopied = true;
    }
}
83 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BuildStats.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary.build;
18 |
19 | import java.util.List;
20 |
/**
 * Holder for the results of a dictionary build: the inputs and the parts.
 * Both lists are stored and returned as given, without copying.
 */
public class BuildStats {
    private final List inputs;
    private final List parts;

    /**
     * @param inputs
     *            list describing the inputs of the build
     * @param parts
     *            list describing the parts of the build
     */
    public BuildStats(List inputs, List parts) {
        this.parts = parts;
        this.inputs = inputs;
    }

    /**
     * @return the list of inputs passed to the constructor
     */
    public List getInputs() {
        return this.inputs;
    }

    /**
     * @return the list of parts passed to the constructor
     */
    public List getParts() {
        return this.parts;
    }
}
38 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary.build;
18 |
19 | import com.worksap.nlp.dartsclone.DoubleArray;
20 |
21 | import java.io.IOException;
22 | import java.nio.ByteBuffer;
23 | import java.nio.ByteOrder;
24 | import java.nio.charset.StandardCharsets;
25 | import java.util.*;
26 |
27 | /**
28 | * Dictionary Parts: Trie index and entry offsets
29 | */
30 | public class Index implements WriteDictionary {
31 | private final SortedMap> elements = new TreeMap<>((byte[] l, byte[] r) -> {
32 | int llen = l.length;
33 | int rlen = r.length;
34 | for (int i = 0; i < Math.min(llen, rlen); i++) {
35 | if (l[i] != r[i]) {
36 | return (l[i] & 0xff) - (r[i] & 0xff);
37 | }
38 | }
39 | return l.length - r.length;
40 | });
41 |
42 | private int count = 0;
43 |
44 | public int add(String key, int wordId) {
45 | byte[] bytes = key.getBytes(StandardCharsets.UTF_8);
46 | List entries = elements.computeIfAbsent(bytes, k -> new ArrayList<>());
47 | if (entries.size() >= 255) {
48 | throw new IllegalArgumentException(String.format("key %s has >= 255 entries in the dictionary", key));
49 | }
50 | entries.add(wordId);
51 | count += 1;
52 | return bytes.length;
53 | }
54 |
55 | public void writeTo(ModelOutput output) throws IOException {
56 | DoubleArray trie = new DoubleArray();
57 |
58 | int size = this.elements.size();
59 |
60 | byte[][] keys = new byte[size][];
61 | int[] values = new int[size];
62 | ByteBuffer wordIdTable = ByteBuffer.allocate(count * (4 + 2));
63 | wordIdTable.order(ByteOrder.LITTLE_ENDIAN);
64 |
65 | output.withSizedPart("WordId table", () -> {
66 | int i = 0;
67 | int numEntries = this.elements.entrySet().size();
68 | for (Map.Entry> entry : this.elements.entrySet()) {
69 | keys[i] = entry.getKey();
70 | values[i] = wordIdTable.position();
71 | i++;
72 | List wordIds = entry.getValue();
73 | wordIdTable.put((byte) wordIds.size());
74 | for (int wid : wordIds) {
75 | wordIdTable.putInt(wid);
76 | }
77 | output.progress(i, numEntries);
78 | }
79 | return wordIdTable.position() + 4;
80 | });
81 |
82 | DicBuffer buffer = new DicBuffer(4);
83 | output.withPart("double array Trie", () -> {
84 | trie.build(keys, values, output::progress);
85 | buffer.putInt(trie.size());
86 | buffer.consume(output::write);
87 | output.write(trie.byteArray());
88 | });
89 |
90 | buffer.putInt(wordIdTable.position());
91 | buffer.consume(output::write);
92 |
93 | wordIdTable.flip();
94 | output.write(wordIdTable);
95 | }
96 | }
97 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/build/InputFileException.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary.build;
18 |
/**
 * {@link IllegalArgumentException} whose message is prefixed with a line
 * number.
 */
public class InputFileException extends IllegalArgumentException {
    /**
     * @param line
     *            line number to report in the message
     * @param s
     *            description appended after the line number
     * @param cause
     *            underlying exception, kept as the cause
     */
    public InputFileException(int line, String s, Exception cause) {
        super(describe(line, s), cause);
    }

    /** Builds the message in the form {@code line:<number> <description>}. */
    private static String describe(int lineNumber, String description) {
        return String.format("line:%d %s", lineNumber, description);
    }
}
24 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/build/POSTable.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary.build;
18 |
19 | import com.worksap.nlp.sudachi.dictionary.Grammar;
20 | import com.worksap.nlp.sudachi.dictionary.POS;
21 |
22 | import java.io.IOException;
23 | import java.util.ArrayList;
24 | import java.util.HashMap;
25 | import java.util.List;
26 |
27 | public class POSTable implements WriteDictionary {
28 | private final List table = new ArrayList<>();
29 | private final HashMap lookup = new HashMap<>();
30 | private int builtin = 0;
31 |
32 | short getId(POS s) {
33 | return lookup.computeIfAbsent(s, p -> {
34 | int next = table.size();
35 | if (next >= Short.MAX_VALUE) {
36 | throw new IllegalArgumentException("maximum POS number exceeded by " + s);
37 | }
38 | table.add(s);
39 | return (short) next;
40 | });
41 | }
42 |
43 | public void preloadFrom(Grammar grammar) {
44 | int partOfSpeechSize = grammar.getPartOfSpeechSize();
45 | for (short i = 0; i < partOfSpeechSize; ++i) {
46 | POS pos = grammar.getPartOfSpeechString(i);
47 | table.add(pos);
48 | lookup.put(pos, i);
49 | }
50 | builtin += partOfSpeechSize;
51 | }
52 |
53 | List getList() {
54 | return table;
55 | }
56 |
57 | @Override
58 | public void writeTo(ModelOutput output) throws IOException {
59 | output.withPart("POS table", () -> {
60 | DicBuffer buffer = new DicBuffer(128 * 1024);
61 | buffer.putShort((short) ownedLength());
62 | for (int i = builtin; i < table.size(); ++i) {
63 | for (String s : table.get(i)) {
64 | if (!buffer.put(s)) {
65 | // handle buffer overflow, this should be extremely rare
66 | buffer.consume(output::write);
67 | buffer.put(s);
68 | }
69 | }
70 | }
71 | buffer.consume(output::write);
72 | });
73 | }
74 |
75 | public int ownedLength() {
76 | return table.size() - builtin;
77 | }
78 |
79 | }
80 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Parameters.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary.build;
18 |
19 | import java.io.IOException;
20 | import java.nio.ByteBuffer;
21 | import java.nio.ByteOrder;
22 | import java.nio.ShortBuffer;
23 |
24 | /**
25 | * Compiles model parameters into the binary format
26 | */
27 | public class Parameters implements WriteDictionary {
28 | private ByteBuffer data;
29 | private ShortBuffer params;
30 | private int maxLeft = Integer.MAX_VALUE;
31 | private int maxRight = Integer.MAX_VALUE;
32 |
33 | public Parameters(int initialSize) {
34 | data = ByteBuffer.allocate(initialSize);
35 | data.order(ByteOrder.LITTLE_ENDIAN);
36 | params = data.asShortBuffer();
37 | }
38 |
39 | public Parameters() {
40 | this(1024 * 1024); // default 1M
41 | }
42 |
43 | public void add(short left, short right, short cost) {
44 | maybeResize();
45 | if (left >= maxLeft) {
46 | throw new IllegalArgumentException(String.format("left %d is larger than max value %d", left, maxLeft));
47 | }
48 | if (right >= maxRight) {
49 | throw new IllegalArgumentException(String.format("right %d is larger than max value %d", right, maxRight));
50 | }
51 | params.put(left);
52 | params.put(right);
53 | params.put(cost);
54 | }
55 |
56 | public void setLimits(int left, int right) {
57 | this.maxLeft = left;
58 | this.maxRight = right;
59 | }
60 |
61 | private void maybeResize() {
62 | if (params.remaining() < 3) {
63 | ByteBuffer newData = ByteBuffer.allocate(data.capacity() * 2);
64 | newData.order(ByteOrder.LITTLE_ENDIAN);
65 | int position = params.position();
66 | data.position(0);
67 | data.limit(position * 2);
68 | newData.put(data);
69 | newData.clear();
70 | data = newData;
71 | params = newData.asShortBuffer();
72 | params.position(position);
73 | assert params.remaining() > 3;
74 | }
75 | }
76 |
77 | @Override
78 | public void writeTo(ModelOutput output) throws IOException {
79 | output.withPart("word parameters", () -> {
80 | data.limit(params.position() * 2);
81 | output.write(data);
82 | });
83 | }
84 | }
85 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Progress.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary.build;
18 |
19 | import java.time.Duration;
20 |
/**
 * Forwards progress notifications to a {@link Callback} while limiting how
 * often the callback is invoked.
 */
public class Progress {
    private static final long MS_100 = 100_000_000L; // 100ms in nanos
    private final int maxUpdates;
    private final Callback callback;
    /** Ratio that has to be exceeded before the next notification is considered. */
    private float currentProgress;
    /** {@link System#nanoTime()} value of the last intermediate notification. */
    private long lastUpdate;

    /**
     * @param maxUpdates
     *            number of steps the range 0..1 is divided into
     * @param callback
     *            receiver of the notifications
     */
    public Progress(int maxUpdates, Callback callback) {
        this.maxUpdates = maxUpdates;
        this.callback = callback;
    }

    /**
     * Starts a new block: notifies the callback and resets the throttling state.
     *
     * @param name
     *            block name passed to the callback
     * @param start
     *            start time, on the {@link System#nanoTime()} time line
     * @param kind
     *            block kind passed to the callback
     */
    public void startBlock(String name, long start, Kind kind) {
        lastUpdate = start;
        callback.start(name, kind);
        currentProgress = step();
    }

    /** Width of one step, slightly below {@code 1 / maxUpdates}. */
    private float step() {
        return 1.0f / maxUpdates - 1e-6f;
    }

    /**
     * This function limits calls to the progress function.
     *
     * Completion (ratio of at least 1) is reported exactly once per block.
     * Intermediate values are reported only when the ratio passed the next
     * step boundary and more than 100ms elapsed since the previous report.
     *
     * @param cur
     *            current state
     * @param max
     *            maximum state
     */
    public void progress(long cur, long max) {
        double ratio = cur / (double) max;
        if (ratio > currentProgress) {
            if (ratio >= 1.0) {
                callback.progress(1.0f);
                currentProgress = Float.MAX_VALUE;
                // completion is final, do not report it a second time below
                return;
            }

            long curTime = System.nanoTime();
            if (curTime - lastUpdate > MS_100) {
                callback.progress((float) ratio);
                float step = step();
                // Next threshold is the first step boundary above the reported
                // ratio. It is computed from scratch: accumulating would skip
                // boundaries and could move the threshold beyond 1.0, which
                // would suppress the completion report.
                float next = (float) ((Math.floor(ratio / step) + 1) * step);
                // guard against float rounding placing the boundary at or below ratio
                currentProgress = Math.max(next, Math.nextUp((float) ratio));
                assert ratio < currentProgress;
                lastUpdate = curTime;
            }
        }
    }

    /**
     * Notifies the callback that the current block is finished.
     *
     * @param size
     *            size value passed to the callback
     * @param time
     *            duration in nanoseconds
     */
    public void endBlock(long size, long time) {
        callback.end(size, Duration.ofNanos(time));
    }

    public enum Kind {
        INPUT, OUTPUT
    }

    /**
     * Progress callback
     */
    public interface Callback {
        /**
         * This function will be called for each step at the beginning
         *
         * @param name
         *            step name
         * @param kind
         *            step kind
         */
        default void start(String name, Kind kind) {
        }

        /**
         * This function will be called as progress is happening
         *
         * @param progress
         *            ratio of the progress
         */
        void progress(float progress);

        /**
         * This function will be called when a step ends
         *
         * @param size
         *            size value reported for the step
         * @param time
         *            duration of the step
         */
        default void end(long size, Duration time) {
        }
    }
}
104 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/build/TrackingInputStream.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary.build;
18 |
19 | import java.io.IOException;
20 | import java.io.InputStream;
21 |
/**
 * Input stream wrapper that counts how many bytes were consumed from the
 * wrapped stream, both by reading and by skipping.
 *
 * {@code close()} is not overridden, so closing this wrapper does not close
 * the wrapped stream.
 */
public class TrackingInputStream extends InputStream {
    private final InputStream inner;
    /** Number of bytes read or skipped so far. */
    private long position;

    /**
     * @param inner
     *            stream to read from
     */
    public TrackingInputStream(InputStream inner) {
        this.inner = inner;
    }

    @Override
    public int read() throws IOException {
        int value = inner.read();
        if (value != -1) {
            // a single byte was consumed
            position += 1;
        }
        return value;
    }

    @Override
    public int read(byte[] b) throws IOException {
        return advance(inner.read(b));
    }

    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        return advance(inner.read(b, off, len));
    }

    /**
     * Skips bytes of the wrapped stream and counts only the bytes that were
     * actually skipped.
     */
    @Override
    public long skip(long n) throws IOException {
        long skipped = inner.skip(n);
        if (skipped > 0) {
            position += skipped;
        }
        return skipped;
    }

    /** Returns the number of bytes consumed from the wrapped stream so far. */
    public long getPosition() {
        return position;
    }

    /** Adds the result of a bulk read to the position unless it signals end of stream. */
    private int advance(int read) {
        if (read != -1) {
            position += read;
        }
        return read;
    }
}
63 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WordIdResolver.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary.build;
18 |
/**
 * Resolves and validates word ids.
 *
 * NOTE(review): the exact contract of each method is defined by the
 * implementations, which are not part of this file - confirm there.
 */
public interface WordIdResolver {
    /**
     * @return flag of the resolver; presumably whether it works on a user
     *         dictionary - confirm against implementations
     */
    boolean isUser();

    /**
     * Looks up a word id.
     *
     * @param headword
     *            headword of the word
     * @param posId
     *            part-of-speech id of the word
     * @param reading
     *            reading of the word
     * @return the resolved word id; the value used for "not found" is not
     *         visible here
     */
    int lookup(String headword, short posId, String reading);

    /**
     * Checks a word id; how an invalid id is reported is up to the
     * implementation.
     *
     * @param wordId
     *            word id to check
     */
    void validate(int wordId);
}
26 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/build/WriteDictionary.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary.build;
18 |
19 | import java.io.IOException;
20 |
21 | public interface WriteDictionary {
22 | void writeTo(ModelOutput output) throws IOException;
23 | }
24 |
--------------------------------------------------------------------------------
/src/main/resources/sudachi.json:
--------------------------------------------------------------------------------
1 | {
2 | "systemDict" : "system_core.dic",
3 | "characterDefinitionFile": "char.def",
4 | "inputTextPlugin" : [
5 | { "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin" },
6 | { "class" : "com.worksap.nlp.sudachi.ProlongedSoundMarkInputTextPlugin",
7 | "prolongedSoundMarks": ["ー", "-", "⁓", "〜", "〰"],
8 | "replacementSymbol": "ー"},
9 | { "class": "com.worksap.nlp.sudachi.IgnoreYomiganaPlugin",
10 | "leftBrackets": ["(", "("],
11 | "rightBrackets": [")", ")"],
12 | "maxYomiganaLength": 4}
13 | ],
14 | "oovProviderPlugin" : [
15 | { "class" : "com.worksap.nlp.sudachi.MeCabOovProviderPlugin" },
16 | { "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin",
17 | "oovPOS" : [ "補助記号", "一般", "*", "*", "*", "*" ],
18 | "leftId" : 5968,
19 | "rightId" : 5968,
20 | "cost" : 3857 }
21 | ],
22 | "pathRewritePlugin" : [
23 | { "class" : "com.worksap.nlp.sudachi.JoinNumericPlugin",
24 | "enableNormalize" : true },
25 | { "class" : "com.worksap.nlp.sudachi.JoinKatakanaOovPlugin",
26 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ],
27 | "minLength" : 3
28 | }
29 | ],
30 | "formatterPlugin" : [
31 | { "class" : "com.worksap.nlp.sudachi.SimpleMorphemeFormatter" },
32 | { "class" : "com.worksap.nlp.sudachi.WordSegmentationFormatter",
33 | "eos" : "\n" },
34 | { "class" : "com.worksap.nlp.sudachi.WordSegmentationFormatter",
35 | "eos" : " " }
36 | ]
37 | }
38 |
--------------------------------------------------------------------------------
/src/main/resources/sudachi.logging.properties:
--------------------------------------------------------------------------------
1 | java.util.logging.SimpleFormatter.format=%5$s%n
2 |
3 | com.worksap.nlp.sudachi.handlers=java.util.logging.ConsoleHandler
4 | com.worksap.nlp.sudachi.level=INFO
5 | java.util.logging.ConsoleHandler.level=ALL
--------------------------------------------------------------------------------
/src/main/resources/unk.def:
--------------------------------------------------------------------------------
1 | DEFAULT,5968,5968,3857,補助記号,一般,*,*,*,*
2 | SPACE,5966,5966,6056,空白,*,*,*,*,*
3 | KANJI,5139,5139,14657,名詞,普通名詞,一般,*,*,*
4 | KANJI,5129,5129,17308,名詞,普通名詞,サ変可能,*,*,*
5 | KANJI,4785,4785,18181,名詞,固有名詞,一般,*,*,*
6 | KANJI,4787,4787,18086,名詞,固有名詞,人名,一般,*,*
7 | KANJI,4791,4791,19198,名詞,固有名詞,地名,一般,*,*
8 | SYMBOL,5129,5129,17094,名詞,普通名詞,サ変可能,*,*,*
9 | NUMERIC,4794,4794,12450,名詞,数詞,*,*,*,*
10 | ALPHA,5139,5139,11633,名詞,普通名詞,一般,*,*,*
11 | ALPHA,4785,4785,13620,名詞,固有名詞,一般,*,*,*
12 | ALPHA,4787,4787,14228,名詞,固有名詞,人名,一般,*,*
13 | ALPHA,4791,4791,15793,名詞,固有名詞,地名,一般,*,*
14 | ALPHA,5687,5687,15246,感動詞,一般,*,*,*,*
15 | HIRAGANA,5139,5139,16012,名詞,普通名詞,一般,*,*,*
16 | HIRAGANA,5129,5129,20012,名詞,普通名詞,サ変可能,*,*,*
17 | HIRAGANA,4785,4785,18282,名詞,固有名詞,一般,*,*,*
18 | HIRAGANA,4787,4787,18269,名詞,固有名詞,人名,一般,*,*
19 | HIRAGANA,4791,4791,20474,名詞,固有名詞,地名,一般,*,*
20 | HIRAGANA,5687,5687,17786,感動詞,一般,*,*,*,*
21 | KATAKANA,5139,5139,10980,名詞,普通名詞,一般,*,*,*
22 | KATAKANA,5129,5129,14802,名詞,普通名詞,サ変可能,*,*,*
23 | KATAKANA,4785,4785,13451,名詞,固有名詞,一般,*,*,*
24 | KATAKANA,4787,4787,13759,名詞,固有名詞,人名,一般,*,*
25 | KATAKANA,4791,4791,14554,名詞,固有名詞,地名,一般,*,*
26 | KATAKANA,5687,5687,15272,感動詞,一般,*,*,*,*
27 | KANJINUMERIC,4794,4794,14170,名詞,数詞,*,*,*,*
28 | GREEK,5139,5139,11051,名詞,普通名詞,一般,*,*,*
29 | GREEK,4785,4785,13353,名詞,固有名詞,一般,*,*,*
30 | GREEK,4787,4787,13671,名詞,固有名詞,人名,一般,*,*
31 | GREEK,4791,4791,14862,名詞,固有名詞,地名,一般,*,*
32 | CYRILLIC,5139,5139,11140,名詞,普通名詞,一般,*,*,*
33 | CYRILLIC,4785,4785,13174,名詞,固有名詞,一般,*,*,*
34 | CYRILLIC,4787,4787,13495,名詞,固有名詞,人名,一般,*,*
35 | CYRILLIC,4791,4791,14700,名詞,固有名詞,地名,一般,*,*
36 |
--------------------------------------------------------------------------------
/src/test/dict/lex.csv:
--------------------------------------------------------------------------------
1 | た,1,1,8729,た,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,た,*,A,*,*,*,*
2 | に,2,2,11406,に,助詞,接続助詞,*,*,*,*,ニ,に,*,A,*,*,*,*
3 | に,3,3,4481,に,助詞,格助詞,*,*,*,*,ニ,に,*,A,*,*,*,*
4 | 京都,6,6,5293,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*,1/5
5 | 東,7,7,4675,東,名詞,普通名詞,一般,*,*,*,ヒガシ,東,*,A,*,*,*,*
6 | 東京,6,6,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,*
7 | 東京都,6,8,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,5/9,*,5/9,*
8 | 行く,4,4,5105,行く,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,*,A,*,*,*,*
9 | 行っ,5,5,5122,行っ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,7,A,*,*,*,*
10 | 都,8,8,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*
11 | アイ,7,7,4675,アイ,名詞,普通名詞,一般,*,*,*,アイ,アイ,*,A,*,*,*,*
12 | アイウ,7,7,4675,アイウ,名詞,普通名詞,一般,*,*,*,アイウ,アイウ,*,A,*,*,*,*
13 | アイアイウ,6,6,32766,アイウ,名詞,固有名詞,地名,一般,*,*,アイアイウ,アイアイウ,*,A,*,*,*,*
14 | 0,9,9,2478,0,名詞,数詞,*,*,*,*,ゼロ,0,*,A,*,*,*,*
15 | 1,9,9,2478,1,名詞,数詞,*,*,*,*,イチ,1,*,A,*,*,*,*
16 | 2,9,9,2478,2,名詞,数詞,*,*,*,*,ニ,2,*,A,*,*,*,*
17 | 3,9,9,2478,3,名詞,数詞,*,*,*,*,サン,3,*,A,*,*,*,*
18 | 4,9,9,2478,4,名詞,数詞,*,*,*,*,ヨン,4,*,A,*,*,*,*
19 | 5,9,9,2478,5,名詞,数詞,*,*,*,*,ゴ,5,*,A,*,*,*,*
20 | 6,9,9,2478,6,名詞,数詞,*,*,*,*,ロク,6,*,A,*,*,*,*
21 | 7,9,9,2478,7,名詞,数詞,*,*,*,*,ナナ,7,*,A,*,*,*,*
22 | 8,9,9,2478,8,名詞,数詞,*,*,*,*,ハチ,8,*,A,*,*,*,*
23 | 9,9,9,2478,9,名詞,数詞,*,*,*,*,キュウ,9,*,A,*,*,*,*
24 | 〇,9,9,2478,〇,名詞,数詞,*,*,*,*,ゼロ,〇,*,A,*,*,*,*
25 | 一,9,9,2478,一,名詞,数詞,*,*,*,*,イチ,一,*,A,*,*,*,*
26 | 二,9,9,2478,二,名詞,数詞,*,*,*,*,ニ,二,*,A,*,*,*,*
27 | 三,9,9,2478,三,名詞,数詞,*,*,*,*,サン,三,*,A,*,*,*,*
28 | 四,9,9,2478,四,名詞,数詞,*,*,*,*,ヨン,四,*,A,*,*,*,*
29 | 五,9,9,2478,五,名詞,数詞,*,*,*,*,ゴ,五,*,A,*,*,*,*
30 | 六,9,9,2478,六,名詞,数詞,*,*,*,*,ロク,六,*,A,*,*,*,*
31 | 七,9,9,2478,七,名詞,数詞,*,*,*,*,ナナ,七,*,A,*,*,*,*
32 | 八,9,9,2478,八,名詞,数詞,*,*,*,*,ハチ,八,*,A,*,*,*,*
33 | 九,9,9,2478,九,名詞,数詞,*,*,*,*,キュウ,九,*,A,*,*,*,*
34 | 六三四,6,6,0,六三四,名詞,固有名詞,地名,一般,*,*,ムサシ,六三四,*,A,*,*,*,*
35 | いく,4,4,5105,いく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,*,A,*,*,*,*
36 | いっ,5,5,5122,いっ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,34,A,*,*,*,*
37 | 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,2478,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,*,A,*,*,*,*
38 | 特a,8,8,2914,特A,名詞,普通名詞,一般,*,*,*,トクエー,特A,*,A,*,*,*,*
39 | な。な,8,8,2914,な。な,名詞,普通名詞,一般,*,*,*,ナナ,な。な,*,A,*,*,*,*
--------------------------------------------------------------------------------
/src/test/dict/matrix.def:
--------------------------------------------------------------------------------
1 | 10 10
2 | 0 0 0
3 | 0 1 863
4 | 0 2 2124
5 | 0 3 1032
6 | 0 4 591
7 | 0 5 -162
8 | 0 6 -79
9 | 0 7 887
10 | 0 8 447
11 | 0 9 -535
12 | 1 0 -3689
13 | 1 1 -3361
14 | 1 2 -7643
15 | 1 3 -3267
16 | 1 4 809
17 | 1 5 -1098
18 | 1 6 4606
19 | 1 7 4269
20 | 1 8 4567
21 | 1 9 1635
22 | 2 0 -1959
23 | 2 1 2457
24 | 2 2 811
25 | 2 3 840
26 | 2 4 903
27 | 2 5 -958
28 | 2 6 517
29 | 2 7 2037
30 | 2 8 1392
31 | 2 9 -193
32 | 3 0 -2288
33 | 3 1 1741
34 | 3 2 487
35 | 3 3 792
36 | 3 4 -1474
37 | 3 5 -3429
38 | 3 6 126
39 | 3 7 437
40 | 3 8 605
41 | 3 9 -547
42 | 4 0 -2809
43 | 4 1 -3584
44 | 4 2 -6743
45 | 4 3 -2869
46 | 4 4 -2805
47 | 4 5 -407
48 | 4 6 3422
49 | 4 7 5642
50 | 4 8 6382
51 | 4 9 2165
52 | 5 0 -509
53 | 5 1 -3665
54 | 5 2 -3882
55 | 5 3 -572
56 | 5 4 -1036
57 | 5 5 -54
58 | 5 6 2570
59 | 5 7 3319
60 | 5 8 4059
61 | 5 9 882
62 | 6 0 101
63 | 6 1 2933
64 | 6 2 2198
65 | 6 3 -2004
66 | 6 4 4392
67 | 6 5 4017
68 | 6 6 569
69 | 6 7 475
70 | 6 8 -390
71 | 6 9 852
72 | 7 0 -852
73 | 7 1 2079
74 | 7 2 1180
75 | 7 3 -3084
76 | 7 4 2010
77 | 7 5 1570
78 | 7 6 746
79 | 7 7 2341
80 | 7 8 2051
81 | 7 9 1393
82 | 8 0 -522
83 | 8 1 3354
84 | 8 2 2037
85 | 8 3 -2542
86 | 8 4 3071
87 | 8 5 2631
88 | 8 6 -352
89 | 8 7 2847
90 | 8 8 1134
91 | 8 9 1256
92 | 9 0 -975
93 | 9 1 2498
94 | 9 2 1690
95 | 9 3 -1523
96 | 9 4 3023
97 | 9 5 3139
98 | 9 6 2562
99 | 9 7 3962
100 | 9 8 418
101 | 9 9 -2490
102 |
--------------------------------------------------------------------------------
/src/test/dict/user.csv:
--------------------------------------------------------------------------------
1 | ぴらる,8,8,-32768,ぴらる,名詞,普通名詞,一般,*,*,*,ピラル,ぴらる,*,A,*,*,*,*
2 | 府,8,8,2914,府,名詞,普通名詞,一般,*,*,*,フ,府,*,A,*,*,*,*
3 | 東京府,6,6,2816,東京府,名詞,固有名詞,地名,一般,*,*,トウキョウフ,東京府,*,B,5/U1,*,5/U1,1/3
4 | すだち,6,6,2816,すだち,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,スダチ,スダチ,すだち,*,A,*,*,*,*
5 |
--------------------------------------------------------------------------------
/src/test/dict/user2.csv:
--------------------------------------------------------------------------------
1 | ぴさる,8,8,-32768,ぴさる,名詞,普通名詞,一般,*,*,*,ピサル,ぴさる,*,A,*,*,*,*
2 | かぼす,6,6,2816,かぼす,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,カボス,カボス,かぼす,*,A,*,*,*,*
3 |
--------------------------------------------------------------------------------
/src/test/java/com/worksap/nlp/sudachi/DictionaryFactoryTest.kt:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2022 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi
18 |
19 | import kotlin.test.assertFails
20 | import org.junit.Test
21 |
class DictionaryFactoryTest {
    // Runs [block], which is expected to throw, and checks that the message of
    // the failure contains [fragment].
    private fun assertFailureMentions(fragment: String, block: () -> Unit) {
        val failure = assertFails(block)
        val message = failure.message!!
        assert(message.contains(fragment))
    }

    @Test
    @Deprecated(
        "testing deprecated methods",
        ReplaceWith("DictionaryFactory().create()", "com.worksap.nlp.sudachi.DictionaryFactory"))
    fun everythingNull() {
        assertFailureMentions("Failed to resolve file: system.dic") {
            DictionaryFactory().create(null, null, false)
        }
    }

    @Test
    @Deprecated(
        "testing deprecated methods",
        ReplaceWith("DictionaryFactory().create()", "com.worksap.nlp.sudachi.DictionaryFactory"))
    fun notNullPath() {
        assertFailureMentions("base=does-not-exist") {
            DictionaryFactory().create("does-not-exist", null, false)
        }
    }

    @Test
    @Deprecated(
        "testing deprecated methods",
        ReplaceWith("DictionaryFactory().create()", "com.worksap.nlp.sudachi.DictionaryFactory"))
    fun notNullPathSettings() {
        assertFailureMentions("Failed to resolve file: test.dic") {
            DictionaryFactory().create("", """{"systemDict": "test.dic"}""", true)
        }
    }
}
52 |
--------------------------------------------------------------------------------
/src/test/java/com/worksap/nlp/sudachi/InhibitConnectionPluginTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi;
18 |
19 | import static org.hamcrest.CoreMatchers.is;
20 | import static org.hamcrest.MatcherAssert.assertThat;
21 |
22 | import java.util.Arrays;
23 | import java.util.Collections;
24 |
25 | import org.junit.Test;
26 |
27 | import com.worksap.nlp.sudachi.dictionary.Grammar;
28 |
29 | public class InhibitConnectionPluginTest {
30 |
31 | @Test
32 | public void edit() {
33 | short left = 0;
34 | short right = 0;
35 | MockGrammar grammar = new MockGrammar();
36 | InhibitConnectionPlugin plugin = new InhibitConnectionPlugin();
37 | plugin.inhibitedPairs = Collections.singletonList(Arrays.asList((int) left, (int) right));
38 | plugin.edit(grammar);
39 | assertThat(grammar.getConnectCost(left, right), is(Grammar.INHIBITED_CONNECTION));
40 | }
41 |
42 | }
--------------------------------------------------------------------------------
/src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi;
18 |
19 | import static org.hamcrest.CoreMatchers.is;
20 | import static org.hamcrest.CoreMatchers.isA;
21 | import static org.hamcrest.CoreMatchers.notNullValue;
22 | import static org.hamcrest.MatcherAssert.assertThat;
23 |
24 | import java.io.IOException;
25 | import java.util.List;
26 |
27 | import org.junit.After;
28 | import org.junit.Before;
29 | import org.junit.Test;
30 |
31 | public class JapaneseDictionaryTest {
32 | Dictionary dict;
33 |
34 | @Before
35 | public void setUp() throws IOException {
36 | dict = TestDictionary.INSTANCE.user0();
37 | }
38 |
39 | @After
40 | public void tearDown() throws IOException {
41 | dict.close();
42 | }
43 |
44 | @Test
45 | public void create() {
46 | assertThat(dict.create(), isA(Tokenizer.class));
47 | }
48 |
49 | @Test
50 | public void getPartOfSpeechSize() {
51 | assertThat(dict.getPartOfSpeechSize(), is(8));
52 | }
53 |
54 | @Test
55 | public void getPartOfSpeechString() {
56 | List pos = dict.getPartOfSpeechString((short) 0);
57 | assertThat(pos, notNullValue());
58 | assertThat(pos.get(0), is("助動詞"));
59 | }
60 |
61 | @Test
62 | public void instantiateConfigWithoutCharDef() throws IOException {
63 | Config cfg = Config.fromClasspath("sudachi_minimum.json");
64 | cfg.systemDictionary(TestDictionary.INSTANCE.getSystemDict());
65 | try (JapaneseDictionary jd = (JapaneseDictionary) new DictionaryFactory().create(cfg)) {
66 | assertThat(jd, notNullValue());
67 | assertThat(jd.create(), notNullValue());
68 | }
69 | }
70 |
71 | private JapaneseDictionary makeDictionaryIncorrectly() throws IOException {
72 | Config cfg = Config.fromClasspath("sudachi_minimum.json");
73 | cfg.systemDictionary(TestDictionary.INSTANCE.getSystemDict());
74 | try (JapaneseDictionary jd = (JapaneseDictionary) new DictionaryFactory().create(cfg)) {
75 | return jd;
76 | }
77 | }
78 |
79 | @Test(expected = IllegalStateException.class)
80 | public void throwExceptionOnDictionaryUsageAfterClose() throws IOException {
81 | JapaneseDictionary dic = makeDictionaryIncorrectly();
82 | Tokenizer ignored = dic.create();
83 | }
84 |
85 | private Tokenizer makeTokenizerIncorrectly() throws IOException {
86 | Config cfg = Config.fromClasspath("sudachi_minimum.json");
87 | cfg.systemDictionary(TestDictionary.INSTANCE.getSystemDict());
88 | try (JapaneseDictionary jd = (JapaneseDictionary) new DictionaryFactory().create(cfg)) {
89 | return jd.create();
90 | }
91 | }
92 |
93 | @Test(expected = IllegalStateException.class)
94 | public void throwExceptionOnTokenizerUsageAfterClose() throws IOException {
95 | Tokenizer tok = makeTokenizerIncorrectly();
96 | tok.tokenize("a");
97 | }
98 |
99 | }
100 |
--------------------------------------------------------------------------------
/src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerMaskTest.kt:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2022 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi
18 |
19 | import kotlin.test.Test
20 | import kotlin.test.assertEquals
21 | import kotlin.test.assertIs
22 |
/**
 * Verifies the `otherWords` value handed to an OOV provider plugin, depending on
 * the position of the capturing plugin in the dictionary's provider list.
 */
23 | class JapaneseTokenizerMaskTest {
/** Test plugin that records every (offset, otherWords) pair it is called with and always returns 0. */
24 | private class CaptureOtherWords : OovProviderPlugin() {
// NOTE(review): the generic type arguments on this line look lost in extraction;
// presumably ArrayList<Pair<Int, Long>>, judging by the `offset to otherWords` values added below - confirm.
25 | val otherWords = ArrayList>()
26 | override fun provideOOV(
27 | inputText: InputText?,
28 | offset: Int,
29 | otherWords: Long,
30 | result: MutableList?
31 | ): Int {
32 | this.otherWords.add(offset to otherWords) // record the call, add nothing to result
33 | return 0
34 | }
35 | }
36 |
/** The capturing plugin is registered before SimpleOovProviderPlugin; every recorded mask must be 0. */
37 | @Test
38 | fun correctMasksWithFirstProvider() {
39 | val cfg0 = Config.empty()
40 | cfg0.addOovProviderPlugin(CaptureOtherWords::class.java)
41 | cfg0.addOovProviderPlugin(SimpleOovProviderPlugin::class.java)
42 | val cfg = cfg0.withFallback(TestDictionary.user0Cfg())
43 | val dic = DictionaryFactory().create(cfg) as JapaneseDictionary
44 | val tokenizer = dic.create()
45 |
46 | assertEquals(2, dic.oovProviderPlugins.size)
// NOTE(review): assertIs normally takes a type argument; it appears to have been stripped here - confirm.
47 | assertIs(dic.oovProviderPlugins[0])
48 | assertIs(dic.oovProviderPlugins[1])
49 |
50 | tokenizer.tokenize("かaiueoか")
51 | val provider = dic.oovProviderPlugins.first { it is CaptureOtherWords } as CaptureOtherWords
52 | val otherWords = provider.otherWords
// NOTE(review): offsets 0, 3 and 8 are presumably UTF-8 byte offsets into the input - confirm.
53 | assertEquals(3, otherWords.size)
54 | // in this order word masks are empty
55 | assertEquals(0 to 0L, otherWords[0])
56 | assertEquals(3 to 0L, otherWords[1])
57 | assertEquals(8 to 0L, otherWords[2])
58 | }
59 |
/**
 * The capturing plugin is appended to the system-only test config; the recorded
 * masks must equal WordMask.nth(3), nth(5) and nth(3) at offsets 0, 3 and 8.
 */
60 | @Test
61 | fun correctMasksWithSecondProvider() {
62 | val cfg = TestDictionary.user0Cfg()
// NOTE(review): presumably the default config already contributes one OOV provider, making this the second - confirm.
63 | cfg.addOovProviderPlugin(CaptureOtherWords::class.java)
64 | val dic = DictionaryFactory().create(cfg) as JapaneseDictionary
65 | val tokenizer = dic.create()
66 |
67 | assertIs(dic.oovProviderPlugins[0])
68 | assertIs(dic.oovProviderPlugins[1])
69 |
70 | tokenizer.tokenize("かaiueoか")
71 | val provider = dic.oovProviderPlugins.first { it is CaptureOtherWords } as CaptureOtherWords
72 | val otherWords = provider.otherWords
73 | assertEquals(3, otherWords.size)
74 | // in this order word masks are not empty
75 | assertEquals(0 to WordMask.nth(3), otherWords[0])
76 | assertEquals(3 to WordMask.nth(5), otherWords[1])
77 | assertEquals(8 to WordMask.nth(3), otherWords[2])
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/src/test/java/com/worksap/nlp/sudachi/JoinKatakanaOovPluginTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi;
18 |
19 | import static org.junit.Assert.assertEquals;
20 | import static org.junit.Assert.assertFalse;
21 |
22 | import java.io.IOException;
23 | import java.util.List;
24 |
25 | import org.junit.Before;
26 | import org.junit.Test;
27 |
/**
 * Tests {@code JoinKatakanaOovPlugin} by rewriting the best lattice path of
 * katakana inputs and checking how many nodes remain.
 *
 * NOTE(review): every {@code List path} below has lost its element type in
 * extraction (raw {@code List} would not allow {@code path.get(0).isOOV()});
 * restore it from the repository before compiling.
 */
28 | public class JoinKatakanaOovPluginTest {
29 | JapaneseTokenizer tokenizer;
30 | JoinKatakanaOovPlugin plugin;
31 |
// Fresh tokenizer (system + one user dictionary) and a fresh plugin for every test.
32 | @Before
33 | public void setUp() throws IOException {
34 | Dictionary dict = TestDictionary.INSTANCE.user1();
35 | tokenizer = (JapaneseTokenizer) dict.create();
36 | plugin = new JoinKatakanaOovPlugin();
37 | }
38 |
/** With minLength 0..2 the path keeps two nodes; with minLength 3 it collapses to one. */
39 | @Test
40 | public void testKatakanaLength() {
41 | // アイ, アイウ in the dictionary
42 |
43 | plugin.minLength = 0;
44 | List path = getPath("アイアイウ");
45 | assertEquals(2, path.size());
46 |
47 | plugin.minLength = 1;
48 | path = getPath("アイアイウ");
49 | assertEquals(2, path.size());
50 |
51 | plugin.minLength = 2;
52 | path = getPath("アイアイウ");
53 | assertEquals(2, path.size());
54 |
55 | plugin.minLength = 3;
56 | path = getPath("アイアイウ");
57 | assertEquals(1, path.size());
58 | }
59 |
/** The single resulting node must not be flagged as OOV. */
60 | @Test
61 | public void testPOS() {
62 | // アイアイウ is 名詞-固有名詞-地名-一般 in the dictionary
63 | plugin.minLength = 3;
64 | List path = getPath("アイアイウ");
65 | assertEquals(1, path.size());
66 | assertFalse(path.get(0).isOOV()); // use the word in dictionary
67 | }
68 |
/** A short word in the middle of the input: the whole input becomes one node. */
69 | @Test
70 | public void testStartWithMiddle() {
71 | plugin.minLength = 3;
72 | List path = getPath("アイウアイアイウ");
73 | assertEquals(1, path.size());
74 | }
75 |
/** A short word at the tail of the input: the whole input becomes one node. */
76 | @Test
77 | public void testStartWithTail() {
78 | plugin.minLength = 3;
79 | List path = getPath("アイウアイウアイ");
80 | assertEquals(1, path.size());
81 | }
82 |
/**
 * A leading small ァ stays a separate first node, while the same character in
 * the middle of the input is joined with its neighbours.
 */
83 | @Test
84 | public void testWithNOOOVBOW() {
85 | plugin.minLength = 3;
86 | List path = getPath("ァアイアイウ");
87 | assertEquals(2, path.size());
88 | assertEquals("ァ", path.get(0).getWordInfo().getSurface());
89 |
90 | path = getPath("アイウァアイウ");
91 | assertEquals(1, path.size());
92 | }
93 |
/**
 * Builds the lattice for {@code text}, takes its best path and lets the plugin
 * rewrite that path in place.
 *
 * @param text input to analyse
 * @return the best path after {@code plugin.rewrite} has been applied
 */
94 | private List getPath(String text) {
95 | UTF8InputText input = new UTF8InputTextBuilder(text, tokenizer.grammar).build();
96 | LatticeImpl lattice = tokenizer.buildLattice(input);
97 | List path = lattice.getBestPath();
98 | plugin.rewrite(input, path, lattice);
99 | lattice.clear(); // the lattice is cleared before the path is handed back
100 | return path;
101 | }
102 |
103 | }
104 |
--------------------------------------------------------------------------------
/src/test/java/com/worksap/nlp/sudachi/MMapTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi;
18 |
19 | import org.junit.Before;
20 | import org.junit.Rule;
21 | import org.junit.Test;
22 | import org.junit.rules.TemporaryFolder;
23 |
24 | import java.io.IOException;
25 | import java.nio.ByteBuffer;
26 | import java.nio.file.NoSuchFileException;
27 | import java.nio.file.Path;
28 |
29 | import static org.hamcrest.CoreMatchers.isA;
30 | import static org.hamcrest.MatcherAssert.assertThat;
31 |
/** Tests for {@code MMap}: mapping a file into a {@link ByteBuffer} and unmapping buffers. */
32 | public class MMapTest {
33 |
34 | @Rule
35 | public TemporaryFolder temporaryFolder = new TemporaryFolder();
36 |
37 | Path path;
38 |
// Writes the in-memory test system dictionary to system.dic inside the temporary folder.
39 | @Before
40 | public void setUp() throws IOException {
41 | path = temporaryFolder.getRoot().toPath();
42 | TestDictionary.INSTANCE.getSystemDictData().writeData(path.resolve("system.dic"));
43 | }
44 |
/** Mapping an existing file yields a {@link ByteBuffer}. */
45 | @Test
46 | public void map() throws IOException {
47 | String filename = path.resolve("system.dic").toString();
48 | assertThat(MMap.map(filename), isA(ByteBuffer.class));
49 | }
50 |
/** Mapping a path that does not exist must raise {@link NoSuchFileException}. */
51 | @Test(expected = NoSuchFileException.class)
52 | public void mapWithNotExist() throws IOException {
53 | String filename = path.resolve("does_not_exist").toString();
54 | MMap.map(filename);
55 | }
56 |
/** Unmapping a buffer obtained from {@code MMap.map} must complete without throwing. */
57 | @Test
58 | public void unmap() throws IOException {
59 | String filename = path.resolve("system.dic").toString();
60 | ByteBuffer buffer = MMap.map(filename);
61 | assertThat(buffer, isA(ByteBuffer.class));
62 | MMap.unmap(buffer);
63 | }
64 |
/** Unmapping an array-backed buffer (not produced by {@code MMap.map}) must not throw either. */
65 | @Test
66 | public void unmapWithoutMappedByteBuffer() throws IOException {
67 | ByteBuffer buffer = ByteBuffer.wrap(new byte[] { 0x00, 0x00 });
68 | MMap.unmap(buffer); // no assertion: the test passes when no exception escapes
69 | }
70 |
71 | }
72 |
--------------------------------------------------------------------------------
/src/test/java/com/worksap/nlp/sudachi/MockGrammar.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi;
18 |
19 | import java.io.IOException;
20 | import java.util.Collections;
21 | import java.util.HashMap;
22 | import java.util.List;
23 | import java.util.Map;
24 |
25 | import com.worksap.nlp.sudachi.dictionary.CharacterCategory;
26 | import com.worksap.nlp.sudachi.dictionary.Grammar;
27 | import com.worksap.nlp.sudachi.dictionary.POS;
28 |
/**
 * Minimal {@link Grammar} stand-in for tests. Only connection costs and the
 * character category hold real data; the remaining methods return fixed
 * placeholder values.
 */
29 | public class MockGrammar implements Grammar {
30 |
// NOTE(review): the type arguments were lost in extraction; presumably
// Map<Short, Map<Short, Short>> (left id -> right id -> cost) - confirm.
31 | Map> matrix = new HashMap<>();
// Loaded once per instance from char.def on the classpath.
32 | private final CharacterCategory category = defaultCharCategory();
33 |
/** Stub: always 0. */
34 | @Override
35 | public int getPartOfSpeechSize() {
36 | return 0;
37 | }
38 |
/** Stub: always {@code null}. */
39 | @Override
40 | public POS getPartOfSpeechString(short posId) {
41 | return null;
42 | }
43 |
/** Stub: always 0, whatever {@code pos} is. */
44 | @Override
45 | public short getPartOfSpeechId(List pos) {
46 | return 0;
47 | }
48 |
/** Returns the cost stored for (left, right), or 0 when none has been set. */
49 | @Override
50 | public short getConnectCost(short left, short right) {
51 | return matrix.getOrDefault(left, Collections.emptyMap()).getOrDefault(right, (short) 0);
52 | }
53 |
/** Stores {@code cost} for (left, right), creating the row for {@code left} on first use. */
54 | @Override
55 | public void setConnectCost(short left, short right, short cost) {
56 | matrix.computeIfAbsent(left, k -> new HashMap<>()).put(right, cost);
57 | }
58 |
/** Stub: always {@code null}. */
59 | @Override
60 | public short[] getBOSParameter() {
61 | return null;
62 | }
63 |
/** Stub: always {@code null}. */
64 | @Override
65 | public short[] getEOSParameter() {
66 | return null;
67 | }
68 |
/** Returns the category loaded at construction time. */
69 | @Override
70 | public CharacterCategory getCharacterCategory() {
71 | return category;
72 | }
73 |
/**
 * Loads the character category definition from the classpath resource
 * {@code char.def}.
 *
 * @return the loaded category
 * @throws RuntimeException wrapping the {@link IOException} if loading fails
 */
74 | public static CharacterCategory defaultCharCategory() {
75 | try {
76 | return CharacterCategory.load(PathAnchor.classpath().resource("char.def"));
77 | } catch (IOException e) {
78 | throw new RuntimeException(e);
79 | }
80 | }
81 |
/** No-op: the argument is ignored and the category from construction is kept. */
82 | @Override
83 | public void setCharacterCategory(CharacterCategory charCategory) {
84 | }
85 |
/** Always {@code true}. */
86 | @Override
87 | public boolean isValid() {
88 | return true;
89 | }
90 | }
91 |
--------------------------------------------------------------------------------
/src/test/java/com/worksap/nlp/sudachi/MockInputText.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi;
18 |
19 | import java.util.EnumSet;
20 | import java.util.Set;
21 |
22 | import com.worksap.nlp.sudachi.dictionary.CategoryType;
23 |
/**
 * {@link InputText} stand-in for tests, backed by a plain {@link String} and a
 * per-index array of category sets that the test fills in through
 * {@link #setCategoryType}. Indices are {@code String} (UTF-16) indices.
 *
 * NOTE(review): generic type arguments ({@code EnumSet[]}, {@code Set}) look
 * stripped by extraction; presumably parameterised with {@code CategoryType} - confirm.
 */
24 | class MockInputText implements InputText {
25 |
26 | String text;
27 | EnumSet[] types;
28 |
/** Creates the mock with an empty category set for every index of {@code text}. */
29 | @SuppressWarnings("unchecked")
30 | MockInputText(String text) {
31 | this.text = text;
32 | types = new EnumSet[text.length()];
33 | for (int i = 0; i < text.length(); i++) {
34 | types[i] = EnumSet.noneOf(CategoryType.class);
35 | }
36 | }
37 |
/** Adds all given {@code types} to every index in {@code [begin, end)}. */
38 | void setCategoryType(int begin, int end, CategoryType... types) {
39 | for (int i = begin; i < end; i++) {
40 | for (CategoryType type : types) {
41 | this.types[i].add(type);
42 | }
43 | }
44 | }
45 |
46 | @Override
47 | public String getText() {
48 | return text;
49 | }
50 |
/** Same string as {@link #getText()}; the mock keeps no separate original text. */
51 | @Override
52 | public String getOriginalText() {
53 | return text;
54 | }
55 |
56 | @Override
57 | public String getSubstring(int begin, int end) {
58 | return text.substring(begin, end);
59 | }
60 |
/** Stub: always {@code null}. */
61 | @Override
62 | public InputText slice(int begin, int end) {
63 | return null;
64 | }
65 |
/** Identity mapping: returns {@code index} unchanged. */
66 | @Override
67 | public int getOriginalIndex(int index) {
68 | return index;
69 | }
70 |
/** Returns the stored set itself, not a copy. */
71 | @Override
72 | public Set getCharCategoryTypes(int index) {
73 | return types[index];
74 | }
75 |
/** Returns the intersection of the category sets of every code point starting in {@code [begin, end)}. */
76 | @Override
77 | public Set getCharCategoryTypes(int begin, int end) {
78 | Set continuousCategory = types[begin].clone(); // clone so retainAll does not alter the stored set
79 | for (int i = text.offsetByCodePoints(begin, 1); i < end; i = text.offsetByCodePoints(i, 1)) {
80 | continuousCategory.retainAll(types[i]);
81 | }
82 | return continuousCategory;
83 | }
84 |
/**
 * Returns the distance from {@code index} to the first following code point
 * that shares no category with all preceding ones, or to the end of the text.
 */
85 | @Override
86 | public int getCharCategoryContinuousLength(int index) {
87 | Set continuousCategory = types[index].clone();
88 | for (int i = text.offsetByCodePoints(index, 1); i < text.length(); i = text.offsetByCodePoints(i, 1)) {
89 | continuousCategory.retainAll(types[i]);
90 | if (continuousCategory.isEmpty()) {
91 | return i - index;
92 | }
93 | }
94 | return text.length() - index;
95 | }
96 |
/** Length in {@code char}s spanned by {@code codePointOffset} code points starting at {@code index}. */
97 | @Override
98 | public int getCodePointsOffsetLength(int index, int codePointOffset) {
99 | return text.offsetByCodePoints(index, codePointOffset) - index;
100 | }
101 |
102 | @Override
103 | public int codePointCount(int begin, int end) {
104 | return Character.codePointCount(text, begin, end);
105 | }
106 |
/** Stub: always {@code true}. */
107 | @Override
108 | public boolean canBow(int index) {
109 | return true;
110 | }
111 |
/** Stub: always 1. */
112 | @Override
113 | public int getWordCandidateLength(int index) {
114 | return 1;
115 | }
116 |
/** Stub: always {@code index + 1}. */
117 | @Override
118 | public int getNextInOriginal(int index) {
119 | return index + 1;
120 | }
121 |
/** Stub: always 0. */
122 | @Override
123 | public int modifiedOffset(int index) {
124 | return 0;
125 | }
126 |
/** Stub: always an empty array, regardless of {@code text}. */
127 | @Override
128 | public byte[] getByteText() {
129 | return new byte[0];
130 | }
131 | }
132 |
--------------------------------------------------------------------------------
/src/test/java/com/worksap/nlp/sudachi/MorphemeImplTest.kt:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2022 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi
18 |
19 | import kotlin.test.Test
20 | import kotlin.test.assertEquals
21 |
/** Pins the exact `toString()` output of a morpheme. */
22 | class MorphemeImplTest {
/** Tokenizes "すだち" with the system-only test dictionary and checks the first morpheme's string form. */
23 | @Test
24 | fun useToString() {
25 | val dic = TestDictionary.user0()
26 | val sudachi = dic.create().tokenize("すだち")
27 | assertEquals(
28 | "MorphemeImpl{begin=0, end=1, surface=す, pos=4/名詞,普通名詞,一般,*,*,*, wid=(0,0)}",
29 | sudachi[0].toString())
30 | }
31 | }
32 |
--------------------------------------------------------------------------------
/src/test/java/com/worksap/nlp/sudachi/StringUtilTest.kt:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi
18 |
19 | import kotlin.test.Test
20 | import kotlin.test.assertContentEquals
21 | import kotlin.test.assertEquals
22 |
/** Cross-checks the two `StringUtil` resource readers against each other. */
23 | class StringUtilTest {
/**
 * Reads `/char.def` once as bytes and once as a string, then checks that the
 * byte form has the same length and content as the encoded string.
 */
24 | @Test
25 | fun readAllBytes() {
26 | val resource = javaClass.getResource("/char.def")
27 | val buf = StringUtil.readAllBytes(resource)
28 | val str = StringUtil.readFully(resource)
29 | val bytes = str.encodeToByteArray() // Kotlin encodes as UTF-8 here
30 | assertEquals(bytes.size, buf.remaining())
31 | val arr2 = ByteArray(bytes.size)
32 | buf.get(arr2) // drain the buffer into an array for content comparison
33 | assertContentEquals(bytes, arr2)
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/src/test/java/com/worksap/nlp/sudachi/TestDictionary.kt:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi
18 |
19 | import com.worksap.nlp.sudachi.dictionary.BinaryDictionary
20 | import com.worksap.nlp.sudachi.dictionary.build.DicBuilder
21 | import com.worksap.nlp.sudachi.dictionary.build.MemChannel
22 | import com.worksap.nlp.sudachi.dictionary.build.res
23 |
24 | /** Utility for lazily creating binary dictionaries for test */
25 | object TestDictionary {
// Binary image of the system dictionary, built on first access from
// /dict/matrix.def and /dict/lex.csv and then reused.
26 | val systemDictData: MemChannel by lazy {
27 | val result = MemChannel()
28 | DicBuilder.system()
29 | .matrix(res("/dict/matrix.def"))
30 | .lexicon(res("/dict/lex.csv"))
31 | .description("the system dictionary for the unit tests")
32 | .build(result)
33 | result
34 | }
35 |
// Binary image of the first user dictionary (/dict/user.csv), built on first
// access against the system dictionary.
36 | val userDict1Data: MemChannel by lazy {
37 | val chan = MemChannel()
38 | DicBuilder.user(systemDict).lexicon(res("/dict/user.csv")).build(chan)
39 | chan
40 | }
41 |
// Getter without caching: each access calls loadSystem on the cached binary image.
42 | val systemDict: BinaryDictionary
43 | get() = BinaryDictionary.loadSystem(systemDictData.buffer())
44 |
// Getter without caching: each access calls loadUser on the cached binary image.
45 | val userDict1: BinaryDictionary
46 | get() = BinaryDictionary.loadUser(userDict1Data.buffer())
47 |
// Unlike userDict1, the loaded dictionary itself is cached: built from
// /dict/user2.csv and loaded once on first access.
48 | val userDict2: BinaryDictionary by lazy {
49 | val chan = MemChannel()
50 | DicBuilder.user(systemDict).lexicon(res("/dict/user2.csv")).build(chan)
51 | BinaryDictionary.loadUser(chan.buffer())
52 | }
53 |
/** Default config with user dictionaries cleared and the test system dictionary set. */
54 | fun user0Cfg(): Config {
55 | return Config.defaultConfig().clearUserDictionaries().systemDictionary(systemDict)
56 | }
57 |
/** [user0Cfg] plus the first user dictionary. */
58 | fun user1Cfg(): Config {
59 | return user0Cfg().addUserDictionary(userDict1)
60 | }
61 |
/** [user1Cfg] plus the second user dictionary. */
62 | fun user2Cfg(): Config {
63 | return user1Cfg().addUserDictionary(userDict2)
64 | }
65 |
66 | /** System only */
67 | fun user0(): JapaneseDictionary {
68 | return DictionaryFactory().create(user0Cfg()) as JapaneseDictionary
69 | }
70 |
71 | /** System + One User dictionary */
72 | fun user1(): JapaneseDictionary {
73 | return DictionaryFactory().create(user1Cfg()) as JapaneseDictionary
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
/src/test/java/com/worksap/nlp/sudachi/TestLoggingConfig.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi;
18 |
19 | import java.io.IOException;
20 | import java.io.InputStream;
21 | import java.util.logging.LogManager;
22 |
/**
 * Applies the {@code logging.properties} classpath resource to
 * {@code java.util.logging} when instantiated.
 *
 * NOTE(review): presumably instantiated by the logging framework through the
 * {@code java.util.logging.config.class} system property - confirm in the build setup.
 */
23 | public class TestLoggingConfig {
/**
 * Reads {@code logging.properties} via this class's class loader and passes it
 * to {@link LogManager#readConfiguration(InputStream)}.
 *
 * @throws IOException if reading the configuration fails
 */
24 | public TestLoggingConfig() throws IOException {
25 | try (InputStream is = TestLoggingConfig.class.getClassLoader().getResourceAsStream("logging.properties")) {
26 | LogManager.getLogManager().readConfiguration(is);
27 | }
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/src/test/java/com/worksap/nlp/sudachi/TextNormalizerTest.kt:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2022 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi
18 |
19 | import com.worksap.nlp.sudachi.dictionary.CharacterCategory
20 | import com.worksap.nlp.sudachi.dictionary.GrammarImpl
21 | import kotlin.test.*
22 |
/** Tests construction of `TextNormalizer` and the text it produces. */
23 | class TextNormalizerTest {
24 |
// Dictionary shared by the tests: system + two user dictionaries, with the
// default character definition set explicitly.
25 | private val dic =
26 | DictionaryFactory()
27 | .create(TestDictionary.user2Cfg().characterDefinition(CharacterCategory.loadDefault()))
28 | as JapaneseDictionary
29 |
/** Every supported way of obtaining a normalizer must succeed without throwing. */
30 | @Test
31 | fun instantiation() {
32 | TextNormalizer.fromDictionary(dic)
33 | TextNormalizer(dic.getGrammar())
34 | TextNormalizer(dic.getGrammar(), dic.inputTextPlugins)
35 | TextNormalizer.defaultTextNormalizer()
36 | }
37 |
/** Constructing a normalizer from a freshly created `GrammarImpl` must fail. */
38 | @Test
39 | fun failToInstantiateWithoutCharCategory() {
40 | val grammar = GrammarImpl()
41 | assertFails { TextNormalizer(grammar) }
42 | }
43 |
/** Checks the default normalizer's output for one mixed-script input. */
44 | @Test
45 | fun normalizeText() {
46 | val tn = TextNormalizer.defaultTextNormalizer()
47 |
48 | // from DefaultInputTextPlugin test
49 | assertEquals("âbγд(株)ガヴ⼼ⅲ", tn.normalize("ÂBΓД㈱ガウ゛⼼Ⅲ"))
50 | }
51 |
/** Checks a normalizer built from the dictionary, one assertion per expected input text plugin. */
52 | @Test
53 | fun normalizeTextWithDefaultConfig() {
54 | // will use default config, which has InputTextPlugins of
55 | // [Default, ProlongedSoundMark, IgnoreYomigana]
56 | val tn = TextNormalizer.fromDictionary(dic)
57 | print(dic.inputTextPlugins) // diagnostic output only; not asserted on
58 |
59 | assertEquals("âbγд(株)ガヴ⼼ⅲ", tn.normalize("ÂBΓД㈱ガウ゛⼼Ⅲ")) // default
60 | assertEquals("うわーい", tn.normalize("うわーーーい")) // prolonged sound mark
61 | assertEquals("小鳥遊", tn.normalize("小鳥遊(タカナシ)")) // ignore yomigana
62 | }
63 | }
64 |
--------------------------------------------------------------------------------
/src/test/java/com/worksap/nlp/sudachi/Utils.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi;
18 |
19 | import java.io.IOException;
20 | import java.net.URISyntaxException;
21 | import java.net.URL;
22 | import java.nio.file.Files;
23 | import java.nio.file.Path;
24 | import java.nio.file.Paths;
25 |
/** File helpers shared by the tests. */
public class Utils {
    /**
     * Copies classpath resources into {@code folder}, keeping each resource's
     * file name.
     *
     * @param folder
     *            destination directory; it must already exist
     * @param files
     *            resource names, resolved with {@link Class#getResource(String)}
     *            relative to this class
     * @throws IOException
     *             if a resource cannot be found, its URL is not a valid URI, or
     *             the copy itself fails (for example because the target exists)
     */
    public static void copyResource(Path folder, String... files) throws IOException {
        for (String file : files) {
            URL src = Utils.class.getResource(file);
            if (src == null) {
                // Report which resource is missing instead of failing with a bare NPE.
                throw new IOException("resource not found: " + file);
            }
            try {
                Path dest = Paths.get(src.toURI()).getFileName();
                // Files.copy does not close its input, so the stream is closed here.
                try (java.io.InputStream in = src.openStream()) {
                    Files.copy(in, folder.resolve(dest));
                }
            } catch (URISyntaxException e) {
                throw new IOException(e);
            }
        }
    }
}
39 |
--------------------------------------------------------------------------------
/src/test/java/com/worksap/nlp/sudachi/WordIdTest.kt:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2022 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi
18 |
19 | import kotlin.test.assertEquals
20 | import kotlin.test.assertFails
21 | import kotlin.test.assertNotEquals
22 | import org.junit.Test
23 |
/** Tests packing and unpacking of word ids with `WordId`. */
24 | class WordIdTest {
/** With dictionary id 0 the packed id equals the word id; with dictionary id 1 it does not. */
25 | @Test
26 | fun valid() {
// Note: kotlin.test's assertEquals takes (expected, actual); these calls pass the computed value first.
27 | assertEquals(WordId.make(0, 0), 0)
28 | assertEquals(WordId.make(0, 5), 5)
29 | assertNotEquals(WordId.make(1, 5), 5)
30 | }
31 |
/** `dic` and `word` must recover the two components that `make` combined. */
32 | @Test
33 | fun deconstruct() {
34 | val wid = WordId.make(12, 51612312)
35 | assertEquals(12, WordId.dic(wid))
36 | assertEquals(51612312, WordId.word(wid))
37 | }
38 |
/** `make` must fail when either component is one above its maximum. */
39 | @Test
40 | fun invalid() {
41 | assertFails { WordId.make(0, WordId.MAX_WORD_ID + 1) }
42 | assertFails { WordId.make(WordId.MAX_DIC_ID + 1, 0) }
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/src/test/java/com/worksap/nlp/sudachi/WordMaskTest.kt:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2022 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi
18 |
19 | import kotlin.test.Test
20 | import kotlin.test.assertFalse
21 | import kotlin.test.assertTrue
22 |
/** Tests the `WordMask` helpers `nth`, `addNth` and `hasNth`. */
23 | class WordMaskTest {
24 |
/** For every i in 1..65, the mask produced by `nth(i)` must report i as present. */
25 | @Test
26 | fun works() {
27 | (1..65).forEach { i ->
28 | val mask = WordMask.nth(i)
29 | assertTrue(WordMask.hasNth(mask, i))
30 | }
31 | }
32 |
/** Adds 1, 3 and 64 to an empty mask and checks which positions are reported as present. */
33 | @Test
34 | fun addNth() {
35 | val mask1 = WordMask.addNth(0, 1)
36 | val mask2 = WordMask.addNth(mask1, 3)
37 | val mask3 = WordMask.addNth(mask2, 64)
38 | assertTrue(WordMask.hasNth(mask3, 1))
39 | assertFalse(WordMask.hasNth(mask3, 2))
40 | assertTrue(WordMask.hasNth(mask3, 3))
41 | assertFalse(WordMask.hasNth(mask3, 4))
42 | assertFalse(WordMask.hasNth(mask3, 63))
43 | assertTrue(WordMask.hasNth(mask3, 64))
// NOTE(review): 65 was never added; presumably positions of 64 and above share one bit - confirm in WordMask.
44 | assertTrue(WordMask.hasNth(mask3, 65))
45 | }
46 | }
47 |
--------------------------------------------------------------------------------
/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderTest.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary;
18 |
19 | import static org.junit.Assert.assertEquals;
20 | import static org.junit.Assert.assertTrue;
21 |
22 | import java.io.IOException;
23 |
24 | import com.worksap.nlp.sudachi.TestDictionary;
25 | import org.junit.Before;
26 | import org.junit.Test;
27 |
/** Tests {@code DictionaryHeader} parsing against the in-memory test system dictionary. */
28 | public class DictionaryHeaderTest {
29 | DictionaryHeader header;
30 |
// Parses the header from the start (offset 0) of the test system dictionary buffer.
31 | @Before
32 | public void setUp() throws IOException {
33 | header = new DictionaryHeader(TestDictionary.INSTANCE.getSystemDictData().buffer(), 0);
34 | }
35 |
/** The version must be {@code SYSTEM_DICT_VERSION_2}. */
36 | @Test
37 | public void getVersion() {
38 | assertEquals(DictionaryVersion.SYSTEM_DICT_VERSION_2, header.getVersion());
39 | }
40 |
/** The creation time must be a positive value. */
41 | @Test
42 | public void getCreateTime() {
43 | assertTrue(header.getCreateTime() > 0);
44 | }
45 |
/** The description must match the text given to the dictionary builder in {@code TestDictionary}. */
46 | @Test
47 | public void getDescription() {
48 | assertEquals("the system dictionary for the unit tests", header.getDescription());
49 | }
50 | }
51 |
--------------------------------------------------------------------------------
/src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryReader.java:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary;
18 |
19 | import java.io.IOException;
20 | import java.io.InputStream;
21 | import java.nio.ByteBuffer;
22 | import java.nio.ByteOrder;
23 | import java.util.ArrayList;
24 |
/** Test helper that loads a classpath resource completely into memory. */
class DictionaryReader {

    /**
     * Reads the whole classpath resource into a heap {@link ByteBuffer}.
     *
     * @param filename
     *            resource name, resolved with
     *            {@link Class#getResourceAsStream(String)} relative to this class
     * @return a little-endian buffer positioned at 0 whose limit and capacity
     *         equal the resource size
     * @throws IOException
     *             if the resource does not exist or cannot be read
     */
    static ByteBuffer read(String filename) throws IOException {
        // try-with-resources guarantees the stream is closed on every path.
        try (InputStream input = DictionaryReader.class.getResourceAsStream(filename)) {
            if (input == null) {
                // Name the missing resource instead of failing with a bare NPE.
                throw new IOException("resource not found: " + filename);
            }
            // Read in chunks; avoids one boxed Byte per input byte.
            java.io.ByteArrayOutputStream collected = new java.io.ByteArrayOutputStream();
            byte[] chunk = new byte[8192];
            for (int n = input.read(chunk); n >= 0; n = input.read(chunk)) {
                collected.write(chunk, 0, n);
            }
            ByteBuffer bytes = ByteBuffer.wrap(collected.toByteArray());
            bytes.order(ByteOrder.LITTLE_ENDIAN);
            return bytes;
        }
    }
}
43 |
--------------------------------------------------------------------------------
/src/test/java/com/worksap/nlp/sudachi/dictionary/build/ConnectionMatrixTest.kt:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2022 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary.build
18 |
19 | import com.worksap.nlp.sudachi.dictionary.Connection
20 | import java.io.InputStream
21 | import kotlin.test.assertEquals
22 | import kotlin.test.assertFailsWith
23 | import kotlin.test.assertNotNull
24 | import org.junit.Test
25 |
/** Test helper that gives scoped access to classpath resources. */
object Res {
  /**
   * Opens the classpath resource [name], hands the stream to [fn] and returns its result.
   *
   * The resource is looked up through this object's class, so a [name] without a leading slash is
   * resolved relative to this object's package. The stream is closed once [fn] has returned or
   * thrown.
   *
   * @param name resource name to open
   * @param fn consumer of the opened stream
   * @return whatever [fn] returns
   * @throws AssertionError if the resource does not exist
   */
  operator fun <R> invoke(name: String, fn: (InputStream) -> R): R {
    val stream = Res.javaClass.getResourceAsStream(name)
    assertNotNull(stream, "resource '$name' did not exist")
    return stream.use { fn(it) }
  }
}
34 |
/** Tests for reading textual connection matrix definitions with [ConnectionMatrix]. */
class ConnectionMatrixTest {
  /** A well-formed 3x3 matrix yields 9 entries whose costs can be read back via [Connection]. */
  @Test
  fun parse3x3() {
    val cm = ConnectionMatrix()
    assertEquals(9, Res("test.matrix") { cm.readEntries(it) })
    val conn = Connection(cm.compiledNoHeader.asShortBuffer(), 3, 3)
    // Expected value goes first so that failure messages label both sides correctly.
    assertEquals(0, conn.cost(0, 0))
    assertEquals(4, conn.cost(1, 1))
    assertEquals(7, conn.cost(2, 1))
  }

  // NOTE(review): assertFailsWith needs an explicit exception type to compile. The precise type
  // thrown by readEntries is not visible from this file, so the broad Exception is used below;
  // narrow it to the concrete type once confirmed.

  /** A header line with a single number is rejected. */
  @Test
  fun invalidHeader() {
    val cm = ConnectionMatrix()
    assertFailsWith<Exception> { cm.readEntries("1".byteInputStream()) }
  }

  /** Empty input, i.e. no header line at all, is rejected. */
  @Test
  fun emptyHeader() {
    val cm = ConnectionMatrix()
    assertFailsWith<Exception> { cm.readEntries("".byteInputStream()) }
  }

  /** A header whose second field is not a number is rejected. */
  @Test
  fun badHeader() {
    val cm = ConnectionMatrix()
    assertFailsWith<Exception> { cm.readEntries("5 a".byteInputStream()) }
  }
}
64 |
--------------------------------------------------------------------------------
/src/test/java/com/worksap/nlp/sudachi/dictionary/build/DicBufferTest.kt:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2022 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary.build
18 |
19 | import java.nio.ByteOrder
20 | import kotlin.test.*
21 |
/** Tests for the length-prefixed int-array and string encodings written by [DicBuffer]. */
class DicBufferTest {
  /** Returns a duplicate of the buffer that [dic] hands to its consumer. */
  private fun written(dic: DicBuffer) = dic.consume { it.duplicate() }

  /** An empty int array is written as a single zero byte. */
  @Test
  fun writeEmptyIntArray() {
    val dic = DicBuffer(1024)
    dic.putInts(intArrayOf())
    val bytes = written(dic)
    assertEquals(bytes.remaining(), 1)
    assertEquals(bytes.get(), 0)
    assertEquals(bytes.remaining(), 0)
  }

  /** An int array is written as a one-byte element count followed by 4 bytes per element. */
  @Test
  fun writeIntArray() {
    val dic = DicBuffer(1024)
    dic.putInts(intArrayOf(1, 2, 3))
    val bytes = written(dic)
    bytes.order(ByteOrder.LITTLE_ENDIAN)
    assertEquals(bytes.remaining(), 4 * 3 + 1)
    assertEquals(bytes.get(), 3)
    for (expected in 1..3) {
      assertEquals(bytes.getInt(), expected)
    }
    assertEquals(bytes.remaining(), 0)
  }

  /** An empty string is written as a single zero byte. */
  @Test
  fun writeEmptyString() {
    val dic = DicBuffer(1024)
    dic.put("")
    val bytes = written(dic)
    assertEquals(bytes.remaining(), 1)
    assertEquals(bytes.get(), 0)
    assertEquals(bytes.remaining(), 0)
  }

  /**
   * A short string is written as a one-byte count of UTF-16 code units followed by the code
   * units themselves; the supplementary character contributes a surrogate pair.
   */
  @Test
  fun writeSmallString() {
    val dic = DicBuffer(1024)
    dic.put("あ𠮟")
    val bytes = written(dic)
    bytes.order(ByteOrder.LITTLE_ENDIAN)
    assertEquals(bytes.remaining(), 1 + 2 * 3)
    assertEquals(bytes.get(), 3)
    assertEquals(bytes.getChar(), 'あ')
    assertEquals(bytes.getChar(), '\uD842')
    assertEquals(bytes.getChar(), '\uDF9F')
    assertEquals(bytes.remaining(), 0)
  }

  /**
   * A 200-character string gets a two-byte length prefix: the high bits with 0x80 set, then the
   * low 8 bits.
   */
  @Test
  fun writeLargeString() {
    val dic = DicBuffer(1024)
    val text = "0123456789".repeat(20)
    dic.put(text)
    val bytes = written(dic)
    bytes.order(ByteOrder.LITTLE_ENDIAN)
    val length = text.length
    assertEquals(bytes.remaining(), 2 + length * 2)
    assertEquals(bytes.get(), ((length shr 8) or 0x80).toByte())
    assertEquals(bytes.get(), (length and 0xff).toByte())
  }

  /** A string longer than [DicBuffer.MAX_STRING] characters can not be written. */
  @Test
  fun failWriteHugeString() {
    val dic = DicBuffer(1024)
    val text = "0123456789".repeat(DicBuffer.MAX_STRING / 10 + 1)
    assertFails { dic.put(text) }
  }

  /** With a 10-byte buffer the first "asdf" is accepted and the second one is refused. */
  @Test
  fun checkedPut() {
    val dic = DicBuffer(10)
    assertTrue { dic.put("asdf") }
    assertFalse { dic.put("asdf") }
  }
}
98 |
--------------------------------------------------------------------------------
/src/test/java/com/worksap/nlp/sudachi/dictionary/build/GrammarTest.kt:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary.build
18 |
19 | import com.worksap.nlp.sudachi.dictionary.GrammarImpl
20 | import com.worksap.nlp.sudachi.dictionary.POS
21 | import kotlin.test.assertEquals
22 | import kotlin.test.assertFails
23 | import org.junit.Test
24 |
/** Tests for [POS] validation and for writing a [POSTable] that [GrammarImpl] can read back. */
class GrammarTest {
  /** Loads the 3x3 connection matrix from the `test.matrix` resource. */
  private fun testMatrix(): ConnectionMatrix {
    val matrix = ConnectionMatrix()
    Res("test.matrix") { matrix.readEntries(it) }
    return matrix
  }

  /** Writes [table] and then [matrix] into memory and parses the result as a grammar. */
  private fun buildGrammar(table: POSTable, matrix: ConnectionMatrix): GrammarImpl {
    val channel = MemChannel()
    val output = ModelOutput(channel)
    table.writeTo(output)
    matrix.writeTo(output)
    return GrammarImpl(channel.buffer(), 0)
  }

  /** A table with one POS gets id 0 for it and the POS can be read back from the grammar. */
  @Test
  fun singlePos() {
    val matrix = testMatrix()
    val table = POSTable()
    assertEquals(0, table.getId(POS("a", "b", "c", "d", "e", "f")))
    val grammar = buildGrammar(table, matrix)
    assertEquals(grammar.getPartOfSpeechString(0), POS("a", "b", "c", "d", "e", "f"))
  }

  /** Ids are handed out sequentially; one more POS after Short.MAX_VALUE entries fails. */
  @Test
  fun failPosData() {
    val table = POSTable()
    for (id in 0 until Short.MAX_VALUE.toInt()) {
      val pos = POS("a", "b", "c", "d", "e", id.toString())
      assertEquals(table.getId(pos), id.toShort())
    }
    assertFails { table.getId(POS("a", "a", "a", "a", "a", "a")) }
  }

  /** A POS needs exactly six non-null components of bounded length. */
  @Test
  fun invalidPos() {
    // Too few components: POS(), POS("1"), ..., POS("1", "2", "3", "4", "5").
    for (count in 0..5) {
      assertFails { POS(*Array(count) { (it + 1).toString() }) }
    }
    assertFails { POS("1", "2", "3", "4", "5", null) }
    assertFails { POS("1", "2", "3", "4", "5", "6", "7") }
    assertFails { POS("1", "2", "3", "4", "5", "6".repeat(POS.MAX_COMPONENT_LENGTH + 1)) }
  }

  /** 1024 POS entries with five 127-character components each survive a write/read round trip. */
  @Test
  fun worksWithEnormousPos() {
    val table = POSTable()
    val longPart = "あ".repeat(127)
    for (id in 0 until 1024) {
      val pos = POS(longPart, longPart, longPart, longPart, longPart, id.toString())
      assertEquals(table.getId(pos), id.toShort())
    }
    val grammar = buildGrammar(table, testMatrix())
    assertEquals(grammar.partOfSpeechSize, 1024)
    for (id in 0 until 1024) {
      val pos = POS(longPart, longPart, longPart, longPart, longPart, id.toString())
      assertEquals(pos, grammar.getPartOfSpeechString(id.toShort()))
    }
  }
}
85 |
--------------------------------------------------------------------------------
/src/test/java/com/worksap/nlp/sudachi/dictionary/build/MemChannel.kt:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary.build
18 |
19 | import java.nio.ByteBuffer
20 | import java.nio.ByteOrder
21 | import java.nio.channels.SeekableByteChannel
22 | import java.nio.file.Files
23 | import java.nio.file.Path
24 | import java.nio.file.StandardOpenOption
25 |
/**
 * In-memory, write-only [SeekableByteChannel] used by tests as an output target.
 *
 * Written bytes are kept in a little-endian heap [ByteBuffer] that starts at 1 MiB and grows on
 * demand. Reading through the channel and truncation are not supported.
 */
class MemChannel : SeekableByteChannel {
  private var buffer: ByteBuffer = ByteBuffer.allocate(1024 * 1024)
  // Highest position reached by any write so far.
  private var size = 0L

  init {
    buffer.order(ByteOrder.LITTLE_ENDIAN)
  }

  /** Does nothing; the channel stays usable. */
  override fun close() {}

  /** Always reports the channel as open. */
  override fun isOpen(): Boolean {
    return true
  }

  /** Not supported; always throws [UnsupportedOperationException]. */
  override fun read(p0: ByteBuffer?): Int {
    throw UnsupportedOperationException()
  }

  /**
   * Copies all remaining bytes of [p0] to the current position, growing the storage if needed.
   *
   * @return the number of bytes written, i.e. what [p0] had remaining
   * @throws NullPointerException if [p0] is null
   */
  override fun write(p0: ByteBuffer?): Int {
    val remaining = p0!!.remaining()
    reserve(remaining)
    buffer.put(p0)
    val pos = buffer.position().toLong()
    if (pos > size) {
      size = pos
    }
    return remaining
  }

  /**
   * Makes sure [additional] bytes fit after the current position.
   *
   * The capacity is doubled until the write fits. Doubling just once is not enough when a single
   * write is larger than the current capacity, and the following put would overflow.
   */
  private fun reserve(additional: Int) {
    if (additional <= buffer.remaining()) {
      return
    }
    val required = buffer.position().toLong() + additional
    var newSize = buffer.capacity().toLong()
    while (newSize < required) {
      newSize *= 2
    }
    check(newSize <= Int.MAX_VALUE) { "MemChannel can not grow to $newSize bytes" }
    val newBuf = ByteBuffer.allocate(newSize.toInt())
    newBuf.order(ByteOrder.LITTLE_ENDIAN)
    // Copy everything before the current position; the new buffer ends up at the same position.
    buffer.flip()
    newBuf.put(buffer)
    buffer = newBuf
  }

  /** Current write position in bytes. */
  override fun position(): Long {
    return buffer.position().toLong()
  }

  /** Moves the write position to [p0]; it must lie within the currently allocated storage. */
  override fun position(p0: Long): SeekableByteChannel {
    buffer.position(p0.toInt())
    return this
  }

  /** Highest position reached by a write so far. */
  override fun size(): Long {
    return this.size
  }

  /** Not supported; always throws [UnsupportedOperationException]. */
  override fun truncate(p0: Long): SeekableByteChannel {
    throw UnsupportedOperationException()
  }

  /**
   * Returns a little-endian view of the bytes from 0 up to the current position. The view shares
   * storage with the channel.
   */
  fun buffer(): ByteBuffer {
    val dup = buffer.duplicate()
    dup.position(0)
    dup.limit(buffer.position())
    dup.order(ByteOrder.LITTLE_ENDIAN)
    return dup
  }

  /** Writes the bytes returned by [buffer] to [path], creating or truncating the file. */
  fun writeData(path: Path) {
    Files.newByteChannel(
            path,
            StandardOpenOption.WRITE,
            StandardOpenOption.CREATE,
            StandardOpenOption.TRUNCATE_EXISTING)
        .use { it.write(buffer()) }
  }
}
102 |
--------------------------------------------------------------------------------
/src/test/java/com/worksap/nlp/sudachi/dictionary/build/ParametersTest.kt:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd.
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package com.worksap.nlp.sudachi.dictionary.build
18 |
19 | import kotlin.test.Test
20 | import kotlin.test.assertEquals
21 |
/** Tests for writing [Parameters] to a model output. */
class ParametersTest {
  /**
   * Two entries added to a `Parameters(4)` instance are written as six consecutive shorts
   * (12 bytes) in insertion order.
   */
  // NOTE(review): presumably the constructor argument is an initial capacity that the second
  // add has to grow past, hence the test name; confirm against Parameters.
  @Test
  fun resizeWorks() {
    val params =
        Parameters(4).apply {
          add(1, 1, 1)
          add(2, 2, 2)
        }
    val channel = MemChannel()
    params.writeTo(ModelOutput(channel))
    assertEquals(channel.position(), 12)
    val written = channel.buffer()
    for (expected in listOf<Short>(1, 1, 1, 2, 2, 2)) {
      assertEquals(written.short, expected)
    }
    assertEquals(written.remaining(), 0)
  }
}
42 |
--------------------------------------------------------------------------------
/src/test/resources/char.def:
--------------------------------------------------------------------------------
1 | 0x0021..0x002F SYMBOL #!"#$%&'()*+,-./
2 | 0x0030..0x0039 NUMERIC #0-9
3 | 0x0041..0x005A ALPHA #A-Z
4 | 0x0061..0x007A ALPHA #a-z
5 | 0x00C0..0x00FF ALPHA # Latin 1 #À->ÿ
6 | 0x3041..0x309F HIRAGANA
7 | 0x30A1..0x30FF KATAKANA
8 | 0x30A1 NOOOVBOW
9 | 0xFF66..0xFF9D KATAKANA
10 | 0xFF9E..0xFF9F KATAKANA
11 | 0x2E80..0x2EF3 KANJI # CJK Radicals Supplement
12 | 0x2F00..0x2FD5 KANJI
13 | 0x3005 KANJI
14 | 0x3007 KANJI
15 | 0x3400..0x4DB5 KANJI # CJK Unified Ideographs Extension
16 | 0x4E00..0x9FA5 KANJI
17 | 0xF900..0xFA2D KANJI
18 | 0xFA30..0xFA6A KANJI
19 | 0xFF10..0xFF19 NUMERIC
20 | 0xFF21..0xFF3A ALPHA
21 | 0xFF41..0xFF5A ALPHA
22 |
--------------------------------------------------------------------------------
/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/one.csv:
--------------------------------------------------------------------------------
1 | 東,1,1,4675,東,名詞,普通名詞,一般,*,*,*,ヒガシ,東,*,A,*,*,*,*
2 |
--------------------------------------------------------------------------------
/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/sudachi_dic_build.json:
--------------------------------------------------------------------------------
1 | {
2 | "characterDefinitionFile" : "char.def",
3 | "inputTextPlugin" : [
4 | { "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin" },
5 | { "class" : "com.worksap.nlp.sudachi.IgnoreYomiganaPlugin",
6 | "leftBrackets": ["(", "("],
7 | "rightBrackets": [")", ")"],
8 | "maxYomiganaLength": 4}
9 | ],
10 | "oovProviderPlugin" : [
11 | { "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin",
12 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ],
13 | "leftId" : 8,
14 | "rightId" : 8,
15 | "cost" : 6000 }
16 | ],
17 | "formatterPlugin" : [
18 | { "class" : "com.worksap.nlp.sudachi.SimpleMorphemeFormatter" },
19 | { "class" : "com.worksap.nlp.sudachi.WordSegmentationFormatter",
20 | "eos" : "\n" },
21 | { "class" : "com.worksap.nlp.sudachi.WordSegmentationFormatter",
22 | "eos" : " " }
23 | ]
24 | }
25 |
--------------------------------------------------------------------------------
/src/test/resources/com/worksap/nlp/sudachi/dictionary/build/test.matrix:
--------------------------------------------------------------------------------
1 | 3 3
2 | 0 0 0
3 | 0 1 1
4 | 0 2 2
5 | 1 0 3
6 | 1 1 4
7 | 1 2 5
8 | 2 0 6
9 | 2 1 7
10 | 2 2 8
11 |
12 |
--------------------------------------------------------------------------------
/src/test/resources/dict/lex.csv:
--------------------------------------------------------------------------------
1 | た,1,1,8729,た,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,た,*,A,*,*,*,*
2 | に,2,2,11406,に,助詞,接続助詞,*,*,*,*,ニ,に,*,A,*,*,*,*
3 | に,3,3,4481,に,助詞,格助詞,*,*,*,*,ニ,に,*,A,*,*,*,*
4 | 京都,6,6,5293,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*,1/5
5 | 東,7,7,4675,東,名詞,普通名詞,一般,*,*,*,ヒガシ,東,*,A,*,*,*,*
6 | 東京,6,6,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,*
7 | 東京都,6,8,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,5/9,*,5/9,*
8 | 行く,4,4,5105,行く,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,*,A,*,*,*,*
9 | 行っ,5,5,5122,行っ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,7,A,*,*,*,*
10 | 都,8,8,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,*
11 | アイ,7,7,4675,アイ,名詞,普通名詞,一般,*,*,*,アイ,アイ,*,A,*,*,*,*
12 | アイウ,7,7,4675,アイウ,名詞,普通名詞,一般,*,*,*,アイウ,アイウ,*,A,*,*,*,*
13 | アイアイウ,6,6,32766,アイアイウ,名詞,固有名詞,地名,一般,*,*,アイアイウ,アイアイウ,*,A,*,*,*,*
14 | 0,9,9,2478,0,名詞,数詞,*,*,*,*,ゼロ,0,*,A,*,*,*,*
15 | 1,9,9,2478,1,名詞,数詞,*,*,*,*,イチ,1,*,A,*,*,*,*
16 | 2,9,9,2478,2,名詞,数詞,*,*,*,*,ニ,2,*,A,*,*,*,*
17 | 3,9,9,2478,3,名詞,数詞,*,*,*,*,サン,3,*,A,*,*,*,*
18 | 4,9,9,2478,4,名詞,数詞,*,*,*,*,ヨン,4,*,A,*,*,*,*
19 | 5,9,9,2478,5,名詞,数詞,*,*,*,*,ゴ,5,*,A,*,*,*,*
20 | 6,9,9,2478,6,名詞,数詞,*,*,*,*,ロク,6,*,A,*,*,*,*
21 | 7,9,9,2478,7,名詞,数詞,*,*,*,*,ナナ,7,*,A,*,*,*,*
22 | 8,9,9,2478,8,名詞,数詞,*,*,*,*,ハチ,8,*,A,*,*,*,*
23 | 9,9,9,2478,9,名詞,数詞,*,*,*,*,キュウ,9,*,A,*,*,*,*
24 | 〇,9,9,2478,〇,名詞,数詞,*,*,*,*,ゼロ,〇,*,A,*,*,*,*
25 | 一,9,9,2478,一,名詞,数詞,*,*,*,*,イチ,一,*,A,*,*,*,*
26 | 二,9,9,2478,二,名詞,数詞,*,*,*,*,ニ,二,*,A,*,*,*,*
27 | 三,9,9,2478,三,名詞,数詞,*,*,*,*,サン,三,*,A,*,*,*,*
28 | 四,9,9,2478,四,名詞,数詞,*,*,*,*,ヨン,四,*,A,*,*,*,*
29 | 五,9,9,2478,五,名詞,数詞,*,*,*,*,ゴ,五,*,A,*,*,*,*
30 | 六,9,9,2478,六,名詞,数詞,*,*,*,*,ロク,六,*,A,*,*,*,*
31 | 七,9,9,2478,七,名詞,数詞,*,*,*,*,ナナ,七,*,A,*,*,*,*
32 | 八,9,9,2478,八,名詞,数詞,*,*,*,*,ハチ,八,*,A,*,*,*,*
33 | 九,9,9,2478,九,名詞,数詞,*,*,*,*,キュウ,九,*,A,*,*,*,*
34 | 六三四,6,6,0,六三四,名詞,固有名詞,地名,一般,*,*,ムサシ,六三四,*,A,*,*,*,*
35 | いく,4,4,5105,いく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,*,A,*,*,*,*
36 | いっ,5,5,5122,いっ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,34,A,*,*,*,*
37 | 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,-9000,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,*,A,*,*,*,*
38 | 特a,8,8,2914,特A,名詞,普通名詞,一般,*,*,*,トクエー,特A,*,A,*,*,*,*
39 | 隠し,-1,-1,0,隠し,名詞,普通名詞,一般,*,*,*,カクシ,隠し,*,A,*,*,*,*
40 | な。な,8,8,2914,な。な,名詞,普通名詞,一般,*,*,*,ナナ,な。な,*,C,11,11,*,*
41 |
--------------------------------------------------------------------------------
/src/test/resources/dict/matrix.def:
--------------------------------------------------------------------------------
1 | 10 10
2 | 0 0 0
3 | 0 1 863
4 | 0 2 2124
5 | 0 3 1032
6 | 0 4 591
7 | 0 5 -162
8 | 0 6 -79
9 | 0 7 887
10 | 0 8 447
11 | 0 9 -535
12 | 1 0 -3689
13 | 1 1 -3361
14 | 1 2 -7643
15 | 1 3 -3267
16 | 1 4 809
17 | 1 5 -1098
18 | 1 6 4606
19 | 1 7 4269
20 | 1 8 4567
21 | 1 9 1635
22 | 2 0 -1959
23 | 2 1 2457
24 | 2 2 811
25 | 2 3 840
26 | 2 4 903
27 | 2 5 -958
28 | 2 6 517
29 | 2 7 2037
30 | 2 8 1392
31 | 2 9 -193
32 | 3 0 -2288
33 | 3 1 1741
34 | 3 2 487
35 | 3 3 792
36 | 3 4 -1474
37 | 3 5 -3429
38 | 3 6 126
39 | 3 7 437
40 | 3 8 605
41 | 3 9 -547
42 | 4 0 -2809
43 | 4 1 -3584
44 | 4 2 -6743
45 | 4 3 -2869
46 | 4 4 -2805
47 | 4 5 -407
48 | 4 6 3422
49 | 4 7 5642
50 | 4 8 6382
51 | 4 9 2165
52 | 5 0 -509
53 | 5 1 -3665
54 | 5 2 -3882
55 | 5 3 -572
56 | 5 4 -1036
57 | 5 5 -54
58 | 5 6 2570
59 | 5 7 3319
60 | 5 8 4059
61 | 5 9 882
62 | 6 0 101
63 | 6 1 2933
64 | 6 2 2198
65 | 6 3 -2004
66 | 6 4 4392
67 | 6 5 4017
68 | 6 6 569
69 | 6 7 475
70 | 6 8 -390
71 | 6 9 852
72 | 7 0 -852
73 | 7 1 2079
74 | 7 2 1180
75 | 7 3 -3084
76 | 7 4 2010
77 | 7 5 1570
78 | 7 6 746
79 | 7 7 2341
80 | 7 8 2051
81 | 7 9 1393
82 | 8 0 -522
83 | 8 1 3354
84 | 8 2 2037
85 | 8 3 -2542
86 | 8 4 3071
87 | 8 5 2631
88 | 8 6 -352
89 | 8 7 2847
90 | 8 8 1134
91 | 8 9 1256
92 | 9 0 -975
93 | 9 1 2498
94 | 9 2 1690
95 | 9 3 -1523
96 | 9 4 3023
97 | 9 5 3139
98 | 9 6 2562
99 | 9 7 3962
100 | 9 8 418
101 | 9 9 -2490
102 |
--------------------------------------------------------------------------------
/src/test/resources/dict/user.csv:
--------------------------------------------------------------------------------
1 | ぴらる,8,8,-32768,ぴらる,名詞,普通名詞,一般,*,*,*,ピラル,ぴらる,*,A,*,*,*,*
2 | 府,8,8,2914,府,名詞,普通名詞,一般,*,*,*,フ,府,*,A,*,*,*,*
3 | 東京府,6,6,2816,東京府,名詞,固有名詞,地名,一般,*,*,トウキョウフ,東京府,*,B,5/U1,*,5/U1,1/3
4 | すだち,6,6,2816,すだち,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,スダチ,スダチ,すだち,*,A,*,*,*,*
5 |
--------------------------------------------------------------------------------
/src/test/resources/dict/user2.csv:
--------------------------------------------------------------------------------
1 | ぴさる,8,8,-32768,ぴさる,名詞,普通名詞,一般,*,*,*,ピサル,ぴさる,*,A,*,*,*,*
2 | かぼす,6,6,2816,かぼす,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,カボス,カボス,かぼす,*,A,*,*,*,*
3 |
--------------------------------------------------------------------------------
/src/test/resources/joinnumeric/char.def:
--------------------------------------------------------------------------------
1 | #
2 | # Japanese character category map
3 | #
4 | # $Id: char.def 9 2012-12-12 04:13:15Z togiso $;
5 | #
6 |
7 | ###################################################################################
8 | #
9 | # CHARACTER CATEGORY DEFINITION
10 | #
11 | # CATEGORY_NAME INVOKE GROUP LENGTH
12 | #
13 | # - CATEGORY_NAME: Name of category. you have to define DEFAULT class.
14 | # - INVOKE: 1/0: always invoke unknown word processing, even when the word can be found in the lexicon
15 | # - GROUP: 1/0: make a new word by grouping the same character category
16 | # - LENGTH: n: 1 to n length new words are added
17 | #
18 | DEFAULT 0 1 0 # DEFAULT is a mandatory category!
19 | SPACE 0 1 0
20 | KANJI 0 0 2
21 | SYMBOL 1 1 0
22 | NUMERIC 1 1 0
23 | ALPHA 1 1 0
24 | HIRAGANA 0 1 2
25 | KATAKANA 1 1 2
26 | KANJINUMERIC 0 1 0 #change INVOKE 1->0
27 | GREEK 1 1 0
28 | CYRILLIC 1 1 0
29 |
30 | ###################################################################################
31 | #
32 | # CODE(UCS2) TO CATEGORY MAPPING
33 | #
34 |
35 | # SPACE
36 | 0x0020 SPACE # DO NOT REMOVE THIS LINE, 0x0020 is reserved for SPACE
37 |
38 | # ASCII
39 | 0x0030..0x0039 NUMERIC #0-9
40 |
41 | # KANJI-NUMERIC (〇 一 二 三 四 五 六 七 八 九 十 百 千 万 億 兆)
42 | 0x3007 KANJINUMERIC KANJI
43 | 0x4E00 KANJINUMERIC KANJI
44 | 0x4E8C KANJINUMERIC KANJI
45 | 0x4E09 KANJINUMERIC KANJI
46 | 0x56DB KANJINUMERIC KANJI
47 | 0x4E94 KANJINUMERIC KANJI
48 | 0x516D KANJINUMERIC KANJI
49 | 0x4E03 KANJINUMERIC KANJI
50 | 0x516B KANJINUMERIC KANJI
51 | 0x4E5D KANJINUMERIC KANJI
52 | 0x5341 KANJINUMERIC KANJI
53 | 0x767E KANJINUMERIC KANJI
54 | 0x5343 KANJINUMERIC KANJI
55 | 0x4E07 KANJINUMERIC KANJI
56 | 0x5104 KANJINUMERIC KANJI
57 | 0x5146 KANJINUMERIC KANJI
58 |
--------------------------------------------------------------------------------
/src/test/resources/logging.properties:
--------------------------------------------------------------------------------
1 | java.util.logging.SimpleFormatter.format=%5$s%n
2 |
3 | com.worksap.nlp.sudachi.handlers=java.util.logging.ConsoleHandler
4 | com.worksap.nlp.sudachi.level=FINEST
5 | java.util.logging.ConsoleHandler.level=ALL
--------------------------------------------------------------------------------
/src/test/resources/rewrite.def:
--------------------------------------------------------------------------------
1 | # ignore normalize list
2 | Ⅲ
3 | ⅲ
4 | ⼼
5 |
6 | # replace char list
7 | ガ ガ
8 | ウ゛ ヴ
9 |
--------------------------------------------------------------------------------
/src/test/resources/rewrite_error_dup.def:
--------------------------------------------------------------------------------
1 | # there is a duplicated replacement.
2 | 12 21
3 | 12 31
4 |
--------------------------------------------------------------------------------
/src/test/resources/rewrite_error_ignorelist.def:
--------------------------------------------------------------------------------
1 | # there are two characters in ignore list
2 | 12
--------------------------------------------------------------------------------
/src/test/resources/rewrite_error_replacelist.def:
--------------------------------------------------------------------------------
1 | # there are three columns in replace list
2 | 12 21 31
3 |
--------------------------------------------------------------------------------
/src/test/resources/sudachi.json:
--------------------------------------------------------------------------------
1 | {
2 | "systemDict" : "system.dic",
3 | "userDict" : [ "user.dic" ],
4 | "characterDefinitionFile" : "char.def",
5 | "inputTextPlugin" : [
6 | { "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin" },
7 | { "class" : "com.worksap.nlp.sudachi.ProlongedSoundMarkInputTextPlugin",
8 | "prolongedSoundMarks": ["ー", "〜", "〰"],
9 | "replacementSymbol": "ー"},
10 | { "class" : "com.worksap.nlp.sudachi.IgnoreYomiganaPlugin",
11 | "leftBrackets": ["(", "("],
12 | "rightBrackets": [")", ")"],
13 | "maxYomiganaLength": 4}
14 | ],
15 | "oovProviderPlugin" : [
16 | { "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin",
17 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ],
18 | "leftId" : 8,
19 | "rightId" : 8,
20 | "cost" : 6000 }
21 | ],
22 | "formatterPlugin" : [
23 | { "class" : "com.worksap.nlp.sudachi.SimpleMorphemeFormatter" },
24 | { "class" : "com.worksap.nlp.sudachi.WordSegmentationFormatter",
25 | "eos" : "\n" },
26 | { "class" : "com.worksap.nlp.sudachi.WordSegmentationFormatter",
27 | "eos" : " " }
28 | ]
29 | }
30 |
--------------------------------------------------------------------------------
/src/test/resources/sudachi_minimum.json:
--------------------------------------------------------------------------------
1 | {
2 | "systemDict" : "system.dic",
3 | "inputTextPlugin" : [
4 | { "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin" }
5 | ],
6 | "oovProviderPlugin" : [
7 | { "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin",
8 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ],
9 | "leftId" : 8,
10 | "rightId" : 8,
11 | "cost" : 6000 }
12 | ]
13 | }
14 |
--------------------------------------------------------------------------------
/src/test/resources/sudachi_test_empty.json:
--------------------------------------------------------------------------------
1 | {}
--------------------------------------------------------------------------------