├── .formatter ├── eclipse-formatter.xml └── license-header ├── .gitattributes ├── .github ├── FUNDING.yml └── workflows │ ├── build.yml │ ├── publish.yml │ └── release.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE-2.0.txt ├── README.md ├── benchmark ├── .gitignore ├── README.md ├── benchmark_multithread.sh ├── benchmark_run.sh ├── benchmark_setup.sh ├── commoncrawl.sh ├── jawikipedia.sh ├── kyoto-leads-corpus.sh ├── process_warc.py └── src │ └── com │ └── worksap │ └── nlp │ └── sudachi │ └── benchmark │ └── TokenizeMultiThread.java ├── build.gradle ├── docs ├── Sudachi.png ├── Sudachi.svg ├── _config.yml ├── _layouts │ └── default.html ├── development.md ├── favicon.ico ├── index.en.md ├── index.md ├── oov_handlers.md ├── tutorial.md ├── tutorial_plugin.md └── user_dict.md ├── gradle.properties ├── gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── licenses └── javax.json-1.1.txt ├── settings.gradle └── src ├── jmh └── java │ └── com │ └── worksap │ └── nlp │ └── sudachi │ └── dictionary │ ├── DictionaryBuilderPerformanceTest.java │ ├── DoubleArrayLookupBench.java │ └── MemChannelJmh.kt ├── main ├── java │ └── com │ │ └── worksap │ │ └── nlp │ │ └── sudachi │ │ ├── Config.java │ │ ├── DefaultInputTextPlugin.java │ │ ├── Dictionary.java │ │ ├── DictionaryFactory.java │ │ ├── EditConnectionCostPlugin.java │ │ ├── IOTools.java │ │ ├── IgnoreYomiganaPlugin.java │ │ ├── InhibitConnectionPlugin.java │ │ ├── InputText.java │ │ ├── InputTextBuilder.java │ │ ├── InputTextPlugin.java │ │ ├── JapaneseDictionary.java │ │ ├── JapaneseTokenizer.java │ │ ├── JoinKatakanaOovPlugin.java │ │ ├── JoinNumericPlugin.java │ │ ├── Lattice.java │ │ ├── LatticeImpl.java │ │ ├── LatticeNode.java │ │ ├── LatticeNodeImpl.java │ │ ├── MMap.java │ │ ├── MeCabOovProviderPlugin.java │ │ ├── Morpheme.java │ │ ├── MorphemeFormatterPlugin.java │ │ ├── MorphemeImpl.java │ │ ├── MorphemeList.java │ │ ├── NumericParser.java │ │ ├── 
OovProviderPlugin.java │ │ ├── PartialPOS.java │ │ ├── PathAnchor.java │ │ ├── PathRewritePlugin.java │ │ ├── Plugin.java │ │ ├── PosMatcher.java │ │ ├── ProlongedSoundMarkInputTextPlugin.java │ │ ├── RegexOovProvider.java │ │ ├── SentenceSplittingAnalysis.java │ │ ├── SentenceSplittingLazyAnalysis.java │ │ ├── Settings.java │ │ ├── SimpleMorphemeFormatter.java │ │ ├── SimpleOovProviderPlugin.java │ │ ├── StringUtil.java │ │ ├── SudachiCommandLine.java │ │ ├── TextNormalizer.java │ │ ├── Tokenizer.java │ │ ├── UTF8InputText.java │ │ ├── UTF8InputTextBuilder.java │ │ ├── WordId.java │ │ ├── WordMask.java │ │ ├── WordSegmentationFormatter.java │ │ ├── dictionary │ │ ├── BinaryDictionary.java │ │ ├── CSVParser.java │ │ ├── CategoryType.java │ │ ├── CharacterCategory.java │ │ ├── Connection.java │ │ ├── DictionaryAccess.java │ │ ├── DictionaryBuilder.java │ │ ├── DictionaryHeader.java │ │ ├── DictionaryHeaderPrinter.java │ │ ├── DictionaryPrinter.java │ │ ├── DictionaryVersion.java │ │ ├── DoubleArrayLexicon.java │ │ ├── DoubleArrayLookup.java │ │ ├── Grammar.java │ │ ├── GrammarImpl.java │ │ ├── Lexicon.java │ │ ├── LexiconSet.java │ │ ├── POS.java │ │ ├── UserDictionaryBuilder.java │ │ ├── WordIdTable.java │ │ ├── WordInfo.java │ │ ├── WordInfoList.java │ │ ├── WordLookup.java │ │ ├── WordParameterList.java │ │ └── build │ │ │ ├── BuildStats.java │ │ │ ├── ConnectionMatrix.java │ │ │ ├── CsvLexicon.java │ │ │ ├── DicBuffer.java │ │ │ ├── DicBuilder.java │ │ │ ├── Index.java │ │ │ ├── InputFileException.java │ │ │ ├── ModelOutput.java │ │ │ ├── POSTable.java │ │ │ ├── Parameters.java │ │ │ ├── Progress.java │ │ │ ├── TrackingInputStream.java │ │ │ ├── WordIdResolver.java │ │ │ ├── WordLookup.java │ │ │ └── WriteDictionary.java │ │ └── sentdetect │ │ └── SentenceDetector.java └── resources │ ├── char.def │ ├── rewrite.def │ ├── sudachi.json │ ├── sudachi.logging.properties │ └── unk.def └── test ├── dict ├── lex.csv ├── matrix.def ├── user.csv └── user2.csv ├── java 
└── com │ └── worksap │ └── nlp │ └── sudachi │ ├── ConfigTest.kt │ ├── DefaultInputTextPluginTest.java │ ├── DictionaryFactoryTest.kt │ ├── IgnoreYomiganaPluginTest.java │ ├── InhibitConnectionPluginTest.java │ ├── JapaneseDictionaryTest.java │ ├── JapaneseTokenizerMaskTest.kt │ ├── JapaneseTokenizerStreamingTest.kt │ ├── JapaneseTokenizerTest.java │ ├── JoinKatakanaOovPluginTest.java │ ├── JoinNumericPluginTest.java │ ├── MMapTest.java │ ├── MeCabOovProviderPluginTest.java │ ├── MockGrammar.java │ ├── MockInputText.java │ ├── MorphemeImplTest.kt │ ├── NumericParserTest.java │ ├── OovProviderPluginTest.kt │ ├── PathAnchorTest.kt │ ├── PosMatcherTest.kt │ ├── ProlongedSoundMarkInputTextPluginTest.java │ ├── RegexOovProviderTest.kt │ ├── SettingsTest.java │ ├── StringUtilTest.kt │ ├── SudachiCommandLineTest.java │ ├── TestDictionary.kt │ ├── TestLoggingConfig.java │ ├── TextNormalizerTest.kt │ ├── UTF8InputTextTest.java │ ├── UserDictionaryTest.java │ ├── Utils.java │ ├── WordIdTest.kt │ ├── WordMaskTest.kt │ ├── dictionary │ ├── CSVParserTest.java │ ├── CharacterCategoryTest.java │ ├── DictionaryBuilderTest.java │ ├── DictionaryHeaderPrinterTest.java │ ├── DictionaryHeaderTest.java │ ├── DictionaryPrinterTest.java │ ├── DictionaryReader.java │ ├── DoubleArrayLexiconTest.java │ ├── GrammarImplTest.java │ ├── UserDictionaryBuilderTest.java │ └── build │ │ ├── ConnectionMatrixTest.kt │ │ ├── CsvLexiconTest.kt │ │ ├── DicBufferTest.kt │ │ ├── GrammarTest.kt │ │ ├── MemChannel.kt │ │ ├── ParametersTest.kt │ │ ├── SystemDicTest.kt │ │ └── UserDicTest.kt │ └── sentdetect │ └── SentenceDetectorTest.java └── resources ├── char.def ├── com └── worksap │ └── nlp │ └── sudachi │ └── dictionary │ └── build │ ├── one.csv │ ├── sudachi_dic_build.json │ └── test.matrix ├── dict ├── lex.csv ├── matrix.def ├── user.csv └── user2.csv ├── joinnumeric └── char.def ├── logging.properties ├── rewrite.def ├── rewrite_error_dup.def ├── rewrite_error_ignorelist.def ├── 
rewrite_error_replacelist.def ├── sudachi.json ├── sudachi_minimum.json └── sudachi_test_empty.json /.formatter/license-header: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) $YEAR Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Handle line endings automatically for files detected as text 2 | # and leave all files detected as binary untouched. 
3 | * text=auto 4 | 5 | # Force the following filetypes to have unix eols, so Windows does not break them 6 | *.* text eol=lf 7 | 8 | # 9 | ## These files are binary and should be left untouched 10 | # 11 | 12 | # (binary is a macro for -text -diff) 13 | *.png binary 14 | *.jpg binary 15 | *.jpeg binary 16 | *.gif binary 17 | *.ico binary 18 | *.mov binary 19 | *.mp4 binary 20 | *.mp3 binary 21 | *.flv binary 22 | *.fla binary 23 | *.swf binary 24 | *.gz binary 25 | *.zip binary 26 | *.7z binary 27 | *.ttf binary 28 | *.eot binary 29 | *.woff binary 30 | *.pyc binary 31 | *.pdf binary 32 | *.ez binary 33 | *.bz2 binary 34 | *.swp binary -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: WorksApplications 4 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | on: 3 | push: 4 | branches: 5 | - develop 6 | pull_request: 7 | types: [opened, synchronize, reopened] 8 | 9 | permissions: 10 | checks: write # for SonarQube 11 | contents: read # for SonarQube 12 | statuses: read # for SonarQube 13 | pull-requests: read # for SonarQube 14 | 15 | jobs: 16 | build: 17 | name: Build 18 | runs-on: ubuntu-latest 19 | strategy: 20 | matrix: 21 | java-version: [11, 17] 22 | steps: 23 | - uses: actions/checkout@v3 24 | with: 25 | fetch-depth: 0 # Shallow clones should be disabled for a better relevancy of analysis 26 | - name: Set up JDK ${{ matrix.java-version }} 27 | uses: actions/setup-java@v3 28 | with: 29 | java-version: ${{ matrix.java-version }} 30 | distribution: temurin 31 | cache: gradle 32 | - name: Cache SonarCloud packages 33 | if: matrix.java-version >= 17 34 | uses: actions/cache@v3 35 | with: 36 | path: 
~/.sonar/cache 37 | key: ${{ runner.os }}-sonar 38 | - name: Build and check 39 | run: ./gradlew --no-daemon --info --stacktrace build jacocoTestReport 40 | - name: Analyze with sonarqube 41 | env: 42 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # Needed to get PR information, if any 43 | SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }} 44 | if: matrix.java-version >= 17 && env.SONAR_TOKEN != '' 45 | run: ./gradlew --no-daemon --info --stacktrace sonarqube -Dsonar.verbose=true -Dsonar.login=$SONAR_TOKEN 46 | - uses: actions/upload-artifact@v3 47 | if: failure() 48 | with: 49 | name: reports (${{ matrix.java-version }}) 50 | path: build/reports 51 | - name: Build javadoc 52 | run: ./gradlew --no-daemon --info javadoc 53 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish to Maven Central 2 | on: 3 | push: 4 | branches: 5 | - develop 6 | # pull_request: # uncomment for debug 7 | # types: [ opened, synchronize, reopened ] 8 | jobs: 9 | publish: 10 | name: Publish 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v3 14 | with: 15 | fetch-depth: 0 # Shallow clones should be disabled for a better relevancy of analysis 16 | - name: Set up JDK 11 17 | uses: actions/setup-java@v3 18 | with: 19 | java-version: 11 20 | distribution: temurin 21 | cache: gradle 22 | - name: Publish Package 23 | env: 24 | GITHUB_USERNAME: GITHUB_ACTOR 25 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 26 | MAVEN_USERNAME: ${{ secrets.MAVEN_USERNAME }} 27 | MAVEN_USER_PASSWORD: ${{ secrets.MAVEN_USER_PASSWORD }} 28 | MAVEN_GPG_PASSPHRASE: ${{ secrets.MAVEN_GPG_PASSPHRASE }} 29 | MAVEN_GPG_PRIVATE_KEY: ${{ secrets.MAVEN_GPG_PRIVATE_KEY }} 30 | run: ./gradlew --no-daemon --info --stacktrace publish publishToSonatype closeAndReleaseSonatypeStagingRepository 31 | if: env.MAVEN_USERNAME != '' 32 | 
-------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | # This workflow will build a Java project with Maven 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/building-and-testing-java-with-maven 3 | 4 | name: Create release draft 5 | 6 | on: 7 | push: 8 | tags: 9 | - 'v*' 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v3 17 | - name: Set up JDK 11 18 | uses: actions/setup-java@v3 19 | with: 20 | java-version: 11 21 | distribution: temurin 22 | cache: gradle 23 | - name: Extract tag version 24 | id: tag 25 | run: echo "::set-output name=version::$(echo ${{ github.ref }} | sed -e 's/refs\/tags\/v//')" 26 | - name: Build Distribution Archive 27 | run: ./gradlew --no-daemon --info --stacktrace assembleExecutableDist 28 | - name: Create release 29 | id: create_release 30 | uses: actions/create-release@v1 31 | env: 32 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 33 | with: 34 | tag_name: ${{ github.ref }} 35 | release_name: Sudachi version ${{ steps.tag.outputs.version }} 36 | draft: true 37 | prerelease: false 38 | - name: Upload release asset 39 | id: upload_release_asset 40 | uses: actions/upload-release-asset@v1 41 | env: 42 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 43 | with: 44 | upload_url: ${{ steps.create_release.outputs.upload_url }} 45 | asset_path: build/distributions/sudachi-executable-${{ steps.tag.outputs.version }}.zip 46 | asset_name: sudachi-${{ steps.tag.outputs.version }}-executable.zip 47 | asset_content_type: application/zip 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/osx,java,linux,maven,windows,eclipse 3 | 4 | ### Eclipse ### 
5 | 6 | .metadata 7 | bin/ 8 | tmp/ 9 | *.tmp 10 | *.bak 11 | *.swp 12 | *~.nib 13 | local.properties 14 | .settings/ 15 | .loadpath 16 | .recommenders 17 | 18 | # External tool builders 19 | .externalToolBuilders/ 20 | 21 | # Locally stored "Eclipse launch configurations" 22 | *.launch 23 | 24 | # PyDev specific (Python IDE for Eclipse) 25 | *.pydevproject 26 | 27 | # CDT-specific (C/C++ Development Tooling) 28 | .cproject 29 | 30 | # Java annotation processor (APT) 31 | .factorypath 32 | 33 | # PDT-specific (PHP Development Tools) 34 | .buildpath 35 | 36 | # sbteclipse plugin 37 | .target 38 | 39 | # Tern plugin 40 | .tern-project 41 | 42 | # TeXlipse plugin 43 | .texlipse 44 | 45 | # STS (Spring Tool Suite) 46 | .springBeans 47 | 48 | # Code Recommenders 49 | .recommenders/ 50 | 51 | # Scala IDE specific (Scala & Java development for Eclipse) 52 | .cache-main 53 | .scala_dependencies 54 | .worksheet 55 | 56 | ### Eclipse Patch ### 57 | # Eclipse Core 58 | .project 59 | 60 | # JDT-specific (Eclipse Java Development Tools) 61 | .classpath 62 | 63 | ### IntelliJ-based IDEs 64 | 65 | .idea/ 66 | 67 | ### Java ### 68 | # Compiled class file 69 | *.class 70 | 71 | # Log file 72 | *.log 73 | 74 | # BlueJ files 75 | *.ctxt 76 | 77 | # Mobile Tools for Java (J2ME) 78 | .mtj.tmp/ 79 | 80 | # Package Files # 81 | *.jar 82 | *.war 83 | *.ear 84 | *.zip 85 | *.tar.gz 86 | *.rar 87 | 88 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml 89 | hs_err_pid* 90 | 91 | ### Linux ### 92 | *~ 93 | 94 | # temporary files which can be created if a process still has a handle open of a deleted file 95 | .fuse_hidden* 96 | 97 | # KDE directory preferences 98 | .directory 99 | 100 | # Linux trash folder which might appear on any partition or disk 101 | .Trash-* 102 | 103 | # .nfs files are created when an open file is removed but is still being accessed 104 | .nfs* 105 | 106 | ### Maven ### 107 | target/ 108 | pom.xml.tag 109 | pom.xml.releaseBackup 
110 | pom.xml.versionsBackup 111 | pom.xml.next 112 | release.properties 113 | dependency-reduced-pom.xml 114 | buildNumber.properties 115 | .mvn/timing.properties 116 | settings.xml 117 | 118 | # Avoid ignoring Maven wrapper jar file (.jar files are usually ignored) 119 | !/.mvn/wrapper/maven-wrapper.jar 120 | 121 | ### GRADLE ### 122 | 123 | build/ 124 | .gradle/ 125 | !gradle/wrapper/gradle-wrapper.jar 126 | out/ 127 | 128 | ### OSX ### 129 | *.DS_Store 130 | .AppleDouble 131 | .LSOverride 132 | 133 | # Icon must end with two \r 134 | Icon 135 | 136 | # Thumbnails 137 | ._* 138 | 139 | # Files that might appear in the root of a volume 140 | .DocumentRevisions-V100 141 | .fseventsd 142 | .Spotlight-V100 143 | .TemporaryItems 144 | .Trashes 145 | .VolumeIcon.icns 146 | .com.apple.timemachine.donotpresent 147 | 148 | # Directories potentially created on remote AFP share 149 | .AppleDB 150 | .AppleDesktop 151 | Network Trash Folder 152 | Temporary Items 153 | .apdisk 154 | 155 | ### Windows ### 156 | # Windows thumbnail cache files 157 | Thumbs.db 158 | ehthumbs.db 159 | ehthumbs_vista.db 160 | 161 | # Folder config file 162 | Desktop.ini 163 | 164 | # Recycle Bin used on file shares 165 | $RECYCLE.BIN/ 166 | 167 | # Windows Installer files 168 | *.cab 169 | *.msi 170 | *.msm 171 | *.msp 172 | 173 | # Windows shortcuts 174 | *.lnk 175 | 176 | # End of https://www.gitignore.io/api/osx,java,linux,maven,windows,eclipse 177 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## [Unreleased](https://github.com/WorksApplications/Sudachi/releases/tag/v) 4 | 5 | - 6 | 7 | ## [v0.7.5](https://github.com/WorksApplications/Sudachi/releases/tag/v0.7.5) 8 | 9 | ### Added 10 | 11 | - Some benchmark scripts are added under `benchmark/` (#235) 12 | 13 | ### Changed 14 | 15 | - Behavior of the dictionary printer and builder are 
changed (#234) 16 | - DictionaryPrinter now prints word reference as (surface, pos, reading)-triple format. 17 | - DictionaryBuilder now allows dictionary-form to be triple format. 18 | 19 | ### Fixed 20 | 21 | - [Tutorial](./docs/tutorial.md) is updated (#237) 22 | - The byte order of a ByteBuffer returned by `Config.Resource.asByteBuffer` is now always little endian (#239) 23 | - Also, the byte order of `StringUtil.readAllBytes` is now little endian. 24 | 25 | ## [v0.7.4](https://github.com/WorksApplications/Sudachi/releases/tag/v0.7.4) 26 | 27 | ### Added 28 | 29 | - Update tutorial.md (#226) 30 | - Lazy sentence split and tokenization (#231) 31 | - Add `Tokenizer.lazyTokenizeSentences(SplitMode mode, Readable input)`, that performs analysis lazily and saves memory usage. 32 | 33 | ### Fixed 34 | 35 | - Do not segfault on tokenizing with closed dictionary (#217) 36 | - The default config sudachi.json sets non-existent property joinKanjiNumeric in JoinNumericPlugin (#221) 37 | - fix incorrect size calculation when expand (#227) 38 | 39 | ### Deprecated 40 | 41 | - `Tokenizer.tokenizeSentences(SplitMode mode, Reader input)` is marked as deprecated (#231) 42 | -------------------------------------------------------------------------------- /benchmark/.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | -------------------------------------------------------------------------------- /benchmark/README.md: -------------------------------------------------------------------------------- 1 | # Sudachi Benchmark 2 | 3 | Sudachi に大規模なテキストを解析させ、実行速度の計測やバグの検出を行う。 4 | 5 | ## Base Scripts 6 | 7 | ### benchmark_setup.sh 8 | 9 | Sudachi のビルドおよび Sudachi 辞書のビルドを行う。 10 | 11 | - ビルドした `sudachi-executable-[VERSION].zip` を `../build/distributions/sudachi/` 以下に展開する 12 | - `data/` 以下に `system_small.dic`, `system_core.dic`, `system_full.dic` をビルドする 13 | - `data/dictdata/` 以下にダウンロードした Sudachi 辞書データを格納する 14 | 15 | command: 
`benchmark_setup.sh [dict_version]` 16 | 17 | - `dict_version`: Sudachi 辞書バージョン (default "20240716") 18 | 19 | ### benchmark_run.sh 20 | 21 | 指定のテキストファイルを各辞書タイプ・分割単位で解析する。 22 | 解析結果は `/dev/null` に出力、対象ファイルや開始/終了時刻情報を `data/benchmark.log` に追記する。 23 | 24 | command: `benchmark_run.sh corpus_file` 25 | 26 | - `corpus_file`: 解析対象とするテキストファイル 27 | 28 | ### benchmark_multithread.sh 29 | 30 | 指定のテキストファイルを解析するスレッドを指定数同時に実行する。 31 | 各スレッドは一つの辞書インスタンスから生成した個別のトークナイザーインスタンスを持たせる。 32 | 解析結果は `/dev/null` に出力、対象ファイルや開始/終了時刻情報を `data/benchmark.log` に追記する。 33 | 34 | command: `benchmark_multithread.sh corpus_file [num_thread [dict_type]]` 35 | 36 | - `corpus_file`: 解析対象とするテキストファイル 37 | - `num_thread`: 作成するスレッド数 (default 3) 38 | - `dict_type`: 使用する辞書タイプ (default "small") 39 | 40 | ## Corpus scripts 41 | 42 | ### kyoto-leads-corpus.sh 43 | 44 | [Kyoto University Web Document Leads Corpus](https://github.com/ku-nlp/KWDLC) を取得し、setup および run を実行する。 45 | 46 | command: `kyoto-leads-corpus.sh` 47 | 48 | - 引数なし 49 | 50 | ### jawikipedia.sh 51 | 52 | [Wikipedia 日本語版ダンプデータ](https://ja.wikipedia.org/wiki/Wikipedia:%E3%83%87%E3%83%BC%E3%82%BF%E3%83%99%E3%83%BC%E3%82%B9%E3%83%80%E3%82%A6%E3%83%B3%E3%83%AD%E3%83%BC%E3%83%89)を取得し、setup および run を実行する。 53 | サイズが非常に大きいため、先頭から指定サイズのみを対象とする。 54 | 55 | - 事前に [wikiextractor](https://github.com/attardi/wikiextractor) のインストールが必要 56 | - `data/jawiki_[DUMP_DATE]/` 以下にデータを格納する。 57 | 58 | command: `jawikipedia.sh [dump_date [size]]` 59 | 60 | - `dump_date`: ダンプデータの生成日時 (default "20240801") 61 | - `size`: 使用するテキストのサイズ (default 100M) 62 | 63 | ### commoncrawl.sh 64 | 65 | [CommonCrawl](https://commoncrawl.org/get-started) データを取得し、setup および run を実行する。 66 | サイズが非常に大きいため、指定数のページのみを対象とする。 67 | 68 | 非日本語のサンプルとして利用するため、言語判別は行わず、また HTML を抽出して使用する。 69 | 70 | - 事前に python および [warcio](https://pypi.org/project/warcio/)、[tqdm](https://pypi.org/project/tqdm/) のインストールが必要 71 | - `data/cc[CRAWL_DATE]/` 以下にデータを格納する。 72 | 73 | command: `commoncrawl.sh [crawl_date [file_index [num_records]]]` 74 | 75 | - `crawl_date`: 
クロールデータの生成日時 (CC-MAIN-\*, default "2024-33") 76 | - `file_index`: 使用する WARC ファイルの warc.paths ファイル中の行数 (default 1) 77 | - `num_records`: 使用するレコード数(対象 WARC の先頭から取得) (default 1000) 78 | - 目安として、2024-33 では 1000 レコードでおよそ 50M 79 | -------------------------------------------------------------------------------- /benchmark/benchmark_multithread.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Analyze given file n-times in multithread. 3 | # assume `benchmark_setup.sh` is called beforehand. 4 | 5 | set -eux 6 | DIR=$(dirname "$(readlink -f "$0")") 7 | cd "${DIR}/.." 8 | 9 | SUDACHI_VERSION=$(./gradlew properties --console=plain -q | grep "^version:" | awk '{printf $2}') 10 | 11 | CORPUS_FILE=$1 12 | NUM_THREAD=${2:-3} 13 | DICT_TYPE=${3:-"small"} 14 | 15 | # Build code 16 | BUILD_DIR="$DIR/../build/distributions" 17 | JAR_FILE="$BUILD_DIR/sudachi/sudachi-${SUDACHI_VERSION}.jar" 18 | SRC_ROOT="${DIR}/src" 19 | SRC_DIR="${SRC_ROOT}/com/worksap/nlp/sudachi/benchmark" 20 | SRC_NAME="TokenizeMultiThread" 21 | 22 | if [ ! 
-e "${SRC_DIR}/${SRC_NAME}.class" ]; then 23 | javac -cp ${JAR_FILE} ${SRC_DIR}/${SRC_NAME}.java 24 | fi 25 | 26 | # Run 27 | cd ${DIR} 28 | DATA_DIR=$DIR/data 29 | LOGFILE="$DATA_DIR/benchmark.log" 30 | 31 | echo "$(date), $SUDACHI_VERSION, multithread ${NUM_THREAD}, ${DICT_TYPE}, begin" >> $LOGFILE 32 | echo $(ls -l $CORPUS_FILE) >> $LOGFILE 33 | 34 | java -Dfile.encoding=UTF-8 -cp ${SRC_ROOT}:${JAR_FILE} \ 35 | com.worksap.nlp.sudachi.benchmark.${SRC_NAME} \ 36 | --systemDict ${DIR}/data/system_${DICT_TYPE}.dic \ 37 | -p "$NUM_THREAD" "$CORPUS_FILE" > /dev/null 38 | 39 | echo "$(date), $SUDACHI_VERSION, multithread ${NUM_THREAD}, ${DICT_TYPE}, end" >> $LOGFILE 40 | -------------------------------------------------------------------------------- /benchmark/benchmark_run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Tokenize given file, with each of small/core/full dict and A/B/C mode. 3 | # assume `benchmark_setup.sh` is called beforehand. 4 | 5 | set -eux 6 | DIR=$(dirname "$(readlink -f "$0")") 7 | cd "${DIR}/.." 
8 | 9 | CORPUS_FILE=$1 10 | TASK=${2:-"benchmark"} 11 | 12 | SUDACHI_VERSION=$(./gradlew properties --console=plain -q | grep "^version:" | awk '{printf $2}') 13 | 14 | # Run benchmark 15 | DATA_DIR=$DIR/data 16 | JAR_DIR="$DIR/../build/distributions/sudachi" 17 | LOGFILE="$DATA_DIR/benchmark.log" 18 | 19 | DICT_TYPES=("small" "core" "full") 20 | SPLIT_MODES=("A" "B" "C") 21 | 22 | echo "" >> $LOGFILE 23 | echo "$(date), $SUDACHI_VERSION, $TASK, begin" >> $LOGFILE 24 | echo $(ls -l $CORPUS_FILE) >> $LOGFILE 25 | for TYPE in ${DICT_TYPES[@]}; do 26 | DICT_FILE="$DATA_DIR/system_${TYPE}.dic" 27 | for MODE in ${SPLIT_MODES[@]}; do 28 | echo "$(date), $TYPE, $MODE, begin" >> $LOGFILE 29 | java -Dfile.encoding=UTF-8 -jar "$JAR_DIR/sudachi-${SUDACHI_VERSION}.jar" \ 30 | --systemDict "$DICT_FILE" -m ${MODE} -a \ 31 | "$CORPUS_FILE" > /dev/null 32 | echo "$(date), $TYPE, $MODE, end" >> $LOGFILE 33 | done 34 | done 35 | echo "$(date), $SUDACHI_VERSION, $TASK, end" >> $LOGFILE 36 | -------------------------------------------------------------------------------- /benchmark/benchmark_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Build Sudachi and build small/core/full dictionary with it. 3 | 4 | set -eux 5 | DIR=$(dirname "$(readlink -f "$0")") 6 | cd "${DIR}/.." 
7 | 8 | SUDACHI_VERSION=$(./gradlew properties --console=plain -q | grep "^version:" | awk '{printf $2}') 9 | 10 | DICT_VERSION=${1:-"20240716"} 11 | 12 | # Build Sudachi 13 | ./gradlew build 14 | BUILD_DIR="$DIR/../build/distributions" 15 | JAR_DIR="$BUILD_DIR/sudachi" 16 | if [ -e "$JAR_DIR" ]; then 17 | rm -r "$JAR_DIR" 18 | fi 19 | unzip -d "$JAR_DIR" "$BUILD_DIR/sudachi-executable-$SUDACHI_VERSION.zip" 20 | 21 | # Get dictionary data 22 | DATA_DIR=$DIR/data 23 | DICT_DIR=$DIR/data/dictdata 24 | mkdir -p "$DICT_DIR" 25 | 26 | RAW_DICT_BASEURL="http://sudachi.s3-website-ap-northeast-1.amazonaws.com/sudachidict-raw" 27 | 28 | DICT_FILES=("small_lex" "core_lex" "notcore_lex") 29 | for TYPE in ${DICT_FILES[@]}; do 30 | if [ ! -e "$DICT_DIR/${TYPE}.csv" ]; then 31 | ZIPFILE=${TYPE}.zip 32 | if [ ! -e "$DICT_DIR/$ZIPFILE" ]; then 33 | wget "$RAW_DICT_BASEURL/$DICT_VERSION/$ZIPFILE" -P $DICT_DIR 34 | fi 35 | unzip -d $DICT_DIR $DICT_DIR/$ZIPFILE 36 | fi 37 | done 38 | 39 | MATRIX_FILE="matrix.def" 40 | if [ ! -e "$DICT_DIR/$MATRIX_FILE" ]; then 41 | ZIPFILE=${MATRIX_FILE}.zip 42 | if [ ! -e "$ZIPFILE" ]; then 43 | wget "$RAW_DICT_BASEURL/$ZIPFILE" -P $DICT_DIR 44 | fi 45 | unzip -d $DICT_DIR $DICT_DIR/$ZIPFILE 46 | fi 47 | 48 | # Build dictionary 49 | DICT_TYPES=("small" "core" "full") 50 | 51 | for i in $(seq 0 2); do 52 | TYPE=${DICT_TYPES[$i]} 53 | DICT_FILE="$DATA_DIR/system_${TYPE}.dic" 54 | if [ ! 
-e "$DICT_FILE" ]; then 55 | FILES=$(for v in ${DICT_FILES[@]:0:$(expr $i+1)}; do echo "$DICT_DIR/${v}.csv"; done) 56 | java -Dfile.encoding=UTF-8 -cp "$JAR_DIR/sudachi-${SUDACHI_VERSION}.jar" \ 57 | com.worksap.nlp.sudachi.dictionary.DictionaryBuilder \ 58 | -o "$DICT_FILE" \ 59 | -m "$DICT_DIR/$MATRIX_FILE" \ 60 | $FILES 61 | fi 62 | done 63 | -------------------------------------------------------------------------------- /benchmark/commoncrawl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Run benchmark with CommonCrawl (raw HTML) 3 | 4 | set -eux 5 | DIR=$(dirname "$(readlink -f "$0")") 6 | 7 | CRAWL_DATE=${1:-"2024-33"} 8 | LINE=${2:-"1"} # use n-th file in path file 9 | NUM_RECORDS=${3:-"1000"} # take first n records 10 | 11 | # Download CommonCrawl 12 | DATA_DIR="$DIR/data/cc${CRAWL_DATE}" 13 | mkdir -p "$DATA_DIR" 14 | 15 | CCURL="https://data.commoncrawl.org" 16 | BASEURL="${CCURL}/crawl-data/CC-MAIN-${CRAWL_DATE}" 17 | 18 | PATHFILE="${DATA_DIR}/warc.paths" 19 | if [ ! -e "${PATHFILE}" ]; then 20 | curl -L "${BASEURL}/warc.paths.gz" | gzip -dc > $PATHFILE 21 | fi 22 | 23 | CORPUS_WARC="$DATA_DIR/${LINE}.warc" 24 | FILEURL="${CCURL}/$(head ${PATHFILE} -n ${LINE} | tail -n 1)" 25 | if [ ! 
-e "${CORPUS_WARC}" ]; then 26 | curl -L "$FILEURL" | gzip -dc > $CORPUS_WARC 27 | fi 28 | 29 | # extract HTML 30 | CORPUS_WARC="$DATA_DIR/${LINE}.warc" 31 | CORPUS_FILE="$DATA_DIR/${LINE}.txt" 32 | python process_warc.py -i ${CORPUS_WARC} -o ${CORPUS_FILE} -n ${NUM_RECORDS} 33 | 34 | # setup & run 35 | $DIR/benchmark_setup.sh 36 | $DIR/benchmark_run.sh $CORPUS_FILE "commoncrawl_${CRAWL_DATE}_${LINE}_${NUM_RECORDS}" 37 | -------------------------------------------------------------------------------- /benchmark/jawikipedia.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Run benchmark with Japanese Wikipedia (first 100M of extracted text by default) 3 | 4 | set -eux 5 | DIR=$(dirname "$(readlink -f "$0")") 6 | 7 | DUMP_DATE=${1:-"20240801"} 8 | SIZE=${2:-"100M"} 9 | 10 | # Download Wikipedia dump (ja) 11 | DATA_DIR=$DIR/data/jawiki_${DUMP_DATE} 12 | mkdir -p "$DATA_DIR" 13 | 14 | ## full dump is too large (>15GB), take first split. 15 | BASEURL="https://dumps.wikimedia.org/jawiki/${DUMP_DATE}" 16 | FILEURL="${BASEURL}/jawiki-${DUMP_DATE}-pages-articles1.xml-p1p114794.bz2" 17 | CORPUS_XML="$DATA_DIR/jawiki_${DUMP_DATE}_1.xml" 18 | 19 | if [ ! -e "$CORPUS_XML" ]; then 20 | curl -L $FILEURL | bzip2 -dc > $CORPUS_XML 21 | fi 22 | 23 | # extract 24 | CORPUS_FILE="$DATA_DIR/wiki_00" 25 | 26 | ## assume wikiextractor is installed (https://github.com/attardi/wikiextractor) 27 | if [ ! 
-e "$CORPUS_FILE" ]; then 28 | python -m wikiextractor.WikiExtractor $CORPUS_XML -o $DATA_DIR -b ${SIZE} 29 | mv $DATA_DIR/AA/* $DATA_DIR 30 | rm -r "$DATA_DIR/AA" 31 | fi 32 | 33 | # setup & run 34 | $DIR/benchmark_setup.sh 35 | $DIR/benchmark_run.sh $CORPUS_FILE "jawiki_${DUMP_DATE}_${SIZE}" 36 | 37 | -------------------------------------------------------------------------------- /benchmark/kyoto-leads-corpus.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Run benchmark with Kyoto Leads Corpus 3 | 4 | set -eux 5 | DIR=$(dirname "$(readlink -f "$0")") 6 | 7 | # Download Kyoto Leads corpus original texts 8 | DATA_DIR=$DIR/data 9 | mkdir -p "$DATA_DIR" 10 | 11 | CORPUS_FILE="$DATA_DIR/leads.txt" 12 | if [ ! -e "$CORPUS_FILE" ]; then 13 | curl -L https://github.com/ku-nlp/KWDLC/releases/download/release_1_0/leads.org.txt.gz | gzip -dc > $CORPUS_FILE 14 | fi 15 | 16 | # Setup & run 17 | $DIR/benchmark_setup.sh 18 | $DIR/benchmark_run.sh $CORPUS_FILE "kyoto-leads" 19 | -------------------------------------------------------------------------------- /benchmark/process_warc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Extract HTML from .warc file. 3 | """ 4 | 5 | import argparse as ap 6 | from pathlib import Path 7 | from tqdm import tqdm 8 | 9 | from warcio.archiveiterator import ArchiveIterator 10 | 11 | 12 | def parse_args() -> ap.Namespace: 13 | parser = ap.ArgumentParser() 14 | parser.add_argument("-i", "--input", type=Path, 15 | help="input warc file") 16 | parser.add_argument("-o", "--output", type=Path, default="output.txt", 17 | help="output text file") 18 | parser.add_argument("-n", "--num-records", type=int, default=None, 19 | help="how many records to dump. 
dump all if not set.") 20 | 21 | args = parser.parse_args() 22 | return args 23 | 24 | 25 | def main(): 26 | args = parse_args() 27 | 28 | with args.input.open("rb") as fi, args.output.open("wb") as fo: 29 | count = 0 30 | for record in tqdm(ArchiveIterator(fi)): 31 | if (args.num_records is not None) and (count >= args.num_records): 32 | break 33 | 34 | try: 35 | if record.rec_type != "response": 36 | continue 37 | contenttype = record.http_headers.get_header("Content-Type") 38 | if not contenttype.startswith("text/html"): 39 | continue 40 | 41 | # dump raw html 42 | content = record.content_stream().read() 43 | fo.write(content) 44 | 45 | count += 1 46 | except: 47 | continue 48 | print(f"count: {count}") 49 | return 50 | 51 | 52 | if __name__ == "__main__": 53 | main() 54 | -------------------------------------------------------------------------------- /docs/Sudachi.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/Sudachi/e689869967c2944bbf2dca31dc2543aa474f1629/docs/Sudachi.png -------------------------------------------------------------------------------- /docs/Sudachi.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-primer 2 | -------------------------------------------------------------------------------- /docs/_layouts/default.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | {% seo %} 9 | 10 | 11 | 12 | 13 |
14 | 15 | {{ content }} 16 | 17 | 20 |
21 | 22 | 23 | 24 | 25 | -------------------------------------------------------------------------------- /docs/development.md: -------------------------------------------------------------------------------- 1 | # How to develop Sudachi 2 | 3 | ## Requirements 4 | 5 | You need to install a JDK, for example from https://adoptium.net/ 6 | Both 11 and 17 will suffice. 7 | Sudachi keeps Java 8 source compatibility at the moment, but we use JDK 11 for CI. 8 | 9 | ## Build System 10 | 11 | Sudachi uses [Gradle](https://gradle.org/) for build. 12 | Basic build can be done with 13 | 14 | `./gradlew build` 15 | 16 | It will produce a jar file in the `build/libs` directory. 17 | 18 | Build enforces the code formatting, so during the development the recommended build command is 19 | 20 | `./gradlew spotlessApply test` 21 | 22 | ## Running development version 23 | 24 | Sometimes you would like to run a development version of Sudachi from a jar file. 25 | Gradle allows you to make a development jar installation of Sudachi with all dependencies with 26 | 27 | `./gradlew installExecutableDist` 28 | 29 | ## List of Gradle tasks 30 | 31 | List of all Gradle tasks can be seen with `./gradlew tasks` -------------------------------------------------------------------------------- /docs/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/Sudachi/e689869967c2944bbf2dca31dc2543aa474f1629/docs/favicon.ico -------------------------------------------------------------------------------- /gradle.properties: -------------------------------------------------------------------------------- 1 | kotlin.stdlib.default.dependency=false 2 | org.gradle.jvmargs=--add-exports jdk.compiler/com.sun.tools.javac.api=ALL-UNNAMED \ 3 | --add-exports jdk.compiler/com.sun.tools.javac.file=ALL-UNNAMED \ 4 | --add-exports jdk.compiler/com.sun.tools.javac.parser=ALL-UNNAMED \ 5 | --add-exports 
jdk.compiler/com.sun.tools.javac.tree=ALL-UNNAMED \ 6 | --add-exports jdk.compiler/com.sun.tools.javac.util=ALL-UNNAMED 7 | org.gradle.caching=true 8 | org.gradle.parallel=true 9 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WorksApplications/Sudachi/e689869967c2944bbf2dca31dc2543aa474f1629/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | distributionBase=GRADLE_USER_HOME 2 | distributionPath=wrapper/dists 3 | distributionUrl=https\://services.gradle.org/distributions/gradle-7.5.1-bin.zip 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @rem 2 | @rem Copyright 2015 the original author or authors. 3 | @rem 4 | @rem Licensed under the Apache License, Version 2.0 (the "License"); 5 | @rem you may not use this file except in compliance with the License. 6 | @rem You may obtain a copy of the License at 7 | @rem 8 | @rem https://www.apache.org/licenses/LICENSE-2.0 9 | @rem 10 | @rem Unless required by applicable law or agreed to in writing, software 11 | @rem distributed under the License is distributed on an "AS IS" BASIS, 12 | @rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @rem See the License for the specific language governing permissions and 14 | @rem limitations under the License. 
15 | @rem 16 | 17 | @if "%DEBUG%" == "" @echo off 18 | @rem ########################################################################## 19 | @rem 20 | @rem Gradle startup script for Windows 21 | @rem 22 | @rem ########################################################################## 23 | 24 | @rem Set local scope for the variables with windows NT shell 25 | if "%OS%"=="Windows_NT" setlocal 26 | 27 | set DIRNAME=%~dp0 28 | if "%DIRNAME%" == "" set DIRNAME=. 29 | set APP_BASE_NAME=%~n0 30 | set APP_HOME=%DIRNAME% 31 | 32 | @rem Resolve any "." and ".." in APP_HOME to make it shorter. 33 | for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi 34 | 35 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 36 | set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" 37 | 38 | @rem Find java.exe 39 | if defined JAVA_HOME goto findJavaFromJavaHome 40 | 41 | set JAVA_EXE=java.exe 42 | %JAVA_EXE% -version >NUL 2>&1 43 | if "%ERRORLEVEL%" == "0" goto execute 44 | 45 | echo. 46 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 47 | echo. 48 | echo Please set the JAVA_HOME variable in your environment to match the 49 | echo location of your Java installation. 50 | 51 | goto fail 52 | 53 | :findJavaFromJavaHome 54 | set JAVA_HOME=%JAVA_HOME:"=% 55 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 56 | 57 | if exist "%JAVA_EXE%" goto execute 58 | 59 | echo. 60 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 61 | echo. 62 | echo Please set the JAVA_HOME variable in your environment to match the 63 | echo location of your Java installation. 
64 | 65 | goto fail 66 | 67 | :execute 68 | @rem Setup the command line 69 | 70 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 71 | 72 | 73 | @rem Execute Gradle 74 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* 75 | 76 | :end 77 | @rem End local scope for the variables with windows NT shell 78 | if "%ERRORLEVEL%"=="0" goto mainEnd 79 | 80 | :fail 81 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 82 | rem the _cmd.exe /c_ return code! 83 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 84 | exit /b 1 85 | 86 | :mainEnd 87 | if "%OS%"=="Windows_NT" endlocal 88 | 89 | :omega 90 | -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | /* 2 | * This file was generated by the Gradle 'init' task. 3 | */ 4 | 5 | rootProject.name = 'sudachi' 6 | -------------------------------------------------------------------------------- /src/jmh/java/com/worksap/nlp/sudachi/dictionary/DictionaryBuilderPerformanceTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.worksap.nlp.sudachi.dictionary; 18 | 19 | import com.worksap.nlp.sudachi.dictionary.build.DicBuilder; 20 | import org.openjdk.jmh.annotations.*; 21 | 22 | import java.io.IOException; 23 | import java.nio.file.Path; 24 | import java.nio.file.Paths; 25 | import java.util.concurrent.TimeUnit; 26 | 27 | @BenchmarkMode(Mode.Throughput) 28 | @OutputTimeUnit(TimeUnit.SECONDS) 29 | @Threads(1) 30 | @Fork(jvmArgs = "-Xmx1g") 31 | public class DictionaryBuilderPerformanceTest { 32 | public static void main(String[] args) throws IOException { 33 | org.openjdk.jmh.Main.main(args); 34 | } 35 | 36 | private final static Path ROOT = Paths.get("src/test/resources/dict"); 37 | 38 | @Benchmark 39 | public long smallCase() throws IOException { 40 | MemChannelJmh mc = new MemChannelJmh(); 41 | DicBuilder.system().matrix(ROOT.resolve("matrix.def")).lexicon(ROOT.resolve("lex.csv")).build(mc); 42 | return mc.size(); 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/jmh/java/com/worksap/nlp/sudachi/dictionary/MemChannelJmh.kt: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.worksap.nlp.sudachi.dictionary 18 | 19 | import java.nio.ByteBuffer 20 | import java.nio.ByteOrder 21 | import java.nio.channels.SeekableByteChannel 22 | 23 | class MemChannelJmh : SeekableByteChannel { 24 | private var buffer: ByteBuffer = ByteBuffer.allocate(1024 * 1024) 25 | private var size = 0L 26 | 27 | init { 28 | buffer.order(ByteOrder.LITTLE_ENDIAN) 29 | } 30 | 31 | override fun close() {} 32 | 33 | override fun isOpen(): Boolean { 34 | return true 35 | } 36 | 37 | override fun read(p0: ByteBuffer?): Int { 38 | throw UnsupportedOperationException() 39 | } 40 | 41 | override fun write(p0: ByteBuffer?): Int { 42 | val remaining = p0!!.remaining() 43 | reserve(remaining) 44 | buffer.put(p0) 45 | val pos = buffer.position().toLong() 46 | if (pos > size) { 47 | size = pos 48 | } 49 | return remaining 50 | } 51 | 52 | private fun reserve(additional: Int) { 53 | val remaining = buffer.remaining() 54 | if (additional <= remaining) { 55 | return 56 | } 57 | val newSize = buffer.capacity() * 2 58 | val newBuf = ByteBuffer.allocate(newSize) 59 | newBuf.order(ByteOrder.LITTLE_ENDIAN) 60 | buffer.flip() 61 | newBuf.put(buffer) 62 | buffer = newBuf 63 | } 64 | 65 | override fun position(): Long { 66 | return buffer.position().toLong() 67 | } 68 | 69 | override fun position(p0: Long): SeekableByteChannel { 70 | buffer.position(p0.toInt()) 71 | return this 72 | } 73 | 74 | override fun size(): Long { 75 | return this.size 76 | } 77 | 78 | override fun truncate(p0: Long): SeekableByteChannel { 79 | throw UnsupportedOperationException() 80 | } 81 | 82 | fun buffer(): ByteBuffer { 83 | val dup = buffer.duplicate() 84 | dup.position(0) 85 | dup.limit(buffer.position()) 86 | dup.order(ByteOrder.LITTLE_ENDIAN) 87 | return dup 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/Dictionary.java: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | import com.worksap.nlp.sudachi.dictionary.POS; 20 | 21 | import java.io.IOException; 22 | import java.util.Arrays; 23 | import java.util.List; 24 | import java.util.function.Predicate; 25 | 26 | /** 27 | * A lexicon and a grammar for morphological analysis. 28 | * 29 | * This class requires a lot of memory. When using multiple analyzers, it is 30 | * recommended to generate only one instance of this class, and generate 31 | * multiple tokenizers. 32 | * 33 | * @see DictionaryFactory 34 | * @see Tokenizer 35 | * @see AutoCloseable 36 | */ 37 | public interface Dictionary extends AutoCloseable { 38 | 39 | /** 40 | * Creates a tokenizer instance. 41 | * 42 | * @return a tokenizer 43 | */ 44 | public Tokenizer create(); 45 | 46 | @Override 47 | public void close() throws IOException; 48 | 49 | /** 50 | * Returns the number of types of part-of-speech. 51 | * 52 | * The IDs of part-of-speech are within the range of 0 to 53 | * {@code getPartOfSpeechSize() - 1}. 54 | * 55 | * @return the number of types of part-of-speech 56 | */ 57 | public int getPartOfSpeechSize(); 58 | 59 | /** 60 | * Returns the array of strings of part-of-speech name. 61 | * 62 | * The name is divided into layers. 
63 | * 64 | * @param posId 65 | * the ID of the part-of-speech 66 | * @return the list of strings of part-of-speech name 67 | * @throws IndexOutOfBoundsException 68 | * if {@code posId} is out of the range 69 | */ 70 | public List getPartOfSpeechString(short posId); 71 | 72 | /** 73 | * Create a POS matcher that will match any of POS for which the passed 74 | * predicate returns true. PosMatcher will be much faster than doing string 75 | * comparison on POS objects. 76 | * 77 | * @param predicate 78 | * returns true if the POS is needed 79 | * @return PosMatcher object that mirrors behavior of the predicate 80 | */ 81 | PosMatcher posMatcher(Predicate predicate); 82 | 83 | /** 84 | * Create a POS matcher that will mirror matching behavior of passed list of 85 | * partially-defined POS. 86 | * 87 | * @param posList 88 | * list of partially defined part-of-speech objects 89 | * @return mirroring PosMatcher object 90 | * @see PartialPOS 91 | */ 92 | default PosMatcher posMatcher(Iterable posList) { 93 | return posMatcher(posRepr -> { 94 | for (PartialPOS p : posList) { 95 | if (p.matches(posRepr)) { 96 | return true; 97 | } 98 | } 99 | return false; 100 | }); 101 | } 102 | 103 | /** 104 | * Create a POS matcher that will mirror matching behavior of passed list of 105 | * partially-defined POS. 106 | * 107 | * @param posList 108 | * list of partially defined part-of-speech objects 109 | * @return mirroring PosMatcher object 110 | * @see PartialPOS 111 | */ 112 | default PosMatcher posMatcher(PartialPOS... posList) { 113 | return posMatcher(Arrays.asList(posList)); 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/EditConnectionCostPlugin.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | import java.io.IOException; 20 | 21 | import com.worksap.nlp.sudachi.dictionary.Grammar; 22 | 23 | /** 24 | * A plugin for editing the connection costs. 25 | * 26 | *

27 | * {@link Dictionary} initializes this plugin with {@link Settings}. It can be 28 | * referred to as {@link Plugin#settings}. 29 | * 30 | *

31 | * The following is an example of settings. 32 | * 33 | *

34 |  * {@code
35 |  *   {
36 |  *     "class" : "com.worksap.nlp.sudachi.SampleEditConnectionPlugin",
37 |  *     "example" : "example setting"
38 |  *   }
39 |  * }
40 |  * 
41 | */ 42 | public abstract class EditConnectionCostPlugin extends Plugin { 43 | 44 | /** 45 | * Set up the plugin. 46 | * 47 | * {@link Tokenizer} calls this method for setting up this plugin. 48 | * 49 | * @param grammar 50 | * the grammar of the system dictionary 51 | * @throws IOException 52 | * if reading something is failed 53 | */ 54 | public void setUp(Grammar grammar) throws IOException { 55 | } 56 | 57 | /** 58 | * Edit the connection costs. 59 | * 60 | * To edit connection costs, you can use {@link Grammar#getConnectCost}, 61 | * {@link Grammar#setConnectCost}, and {@link #inhibitConnection}. 62 | * 63 | * @param grammar 64 | * the grammar of the system dictionary 65 | */ 66 | public abstract void edit(Grammar grammar); 67 | 68 | /** 69 | * Inhibit a connection. 70 | * 71 | * @param grammar 72 | * the grammar of the system dictionary 73 | * @param left 74 | * the right-ID of the left node 75 | * @param right 76 | * the left-ID of the right node 77 | */ 78 | public void inhibitConnection(Grammar grammar, short left, short right) { 79 | grammar.setConnectCost(left, right, Grammar.INHIBITED_CONNECTION); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/IOTools.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023-2024 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | import java.io.IOException; 20 | import java.nio.CharBuffer; 21 | 22 | public class IOTools { 23 | private IOTools() { 24 | // forbid instantiation 25 | } 26 | 27 | /** 28 | * Read as much as possible from the readable to the result buffer. Use this to 29 | * make sure that the buffer is fulfilled or no text left unread. 30 | * 31 | * @param readable 32 | * input readable 33 | * @param result 34 | * buffer to read into 35 | * @return number of read characters 36 | * @throws IOException 37 | * when read operation fails 38 | */ 39 | public static int readAsMuchAsCan(Readable readable, CharBuffer result) throws IOException { 40 | int totalRead = 0; 41 | while (result.hasRemaining()) { 42 | int read = readable.read(result); 43 | if (read < 0) { 44 | if (totalRead == 0) { 45 | return -1; 46 | } else { 47 | return totalRead; 48 | } 49 | } 50 | totalRead += read; 51 | } 52 | return totalRead; 53 | } 54 | 55 | /** 56 | * Wrapper class for Readable, that uses {@link #readAsMuchAsCan} to read and 57 | * guarantees that the last character read is not a high surrogate unless it is 58 | * the last one in the readable. 
59 | */ 60 | public static class SurrogateAwareReadable implements Readable { 61 | private Readable readable; 62 | char lastTrailingHighSurrogate; 63 | 64 | SurrogateAwareReadable(Readable input) { 65 | this.readable = input; 66 | } 67 | 68 | @Override 69 | public int read(CharBuffer cb) throws IOException { 70 | boolean trailingKept = false; 71 | if (lastTrailingHighSurrogate != 0) { 72 | cb.append(lastTrailingHighSurrogate); 73 | lastTrailingHighSurrogate = 0; 74 | trailingKept = true; 75 | } 76 | 77 | int nread = IOTools.readAsMuchAsCan(readable, cb); 78 | if (nread < 0) { 79 | if (!trailingKept) { 80 | return -1; 81 | } 82 | // the last char in the readable is a high surrogate and there is nothing we can 83 | // do. 84 | return 1; 85 | } 86 | if (trailingKept) { 87 | nread += 1; 88 | } 89 | 90 | char lastChar = cb.get(cb.position() - 1); 91 | if (Character.isHighSurrogate(lastChar)) { 92 | lastTrailingHighSurrogate = lastChar; 93 | cb.position(cb.position() - 1); 94 | nread -= 1; 95 | } 96 | return nread; 97 | } 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/InhibitConnectionPlugin.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | import java.util.List; 20 | 21 | import com.worksap.nlp.sudachi.dictionary.Grammar; 22 | 23 | /** 24 | * A Plugin for inhibiting the connections. 25 | * 26 | *

27 | * {@link Dictionary} initializes this plugin with {@link Settings}. It can be 28 | * referred to as {@link Plugin#settings}. 29 | * 30 | *

31 | * The following is an example of settings. 32 | * 33 | *

34 |  * {@code
35 |  *   {
36 |  *     "class" : "com.worksap.nlp.sudachi.InhibitConnectionPlugin",
37 |  *     "inhibitedPair" : [ [ 0, 233 ], [435, 332] ]
38 |  *   }
39 |  * }
40 |  * 
41 | * 42 | * {@code inhibitedPair} is a list of lists of two numbers. In each pair, the 43 | * first number is right-ID of the left node and the second is left-ID of the 44 | * right node in a connection. 45 | */ 46 | class InhibitConnectionPlugin extends EditConnectionCostPlugin { 47 | 48 | List> inhibitedPairs; 49 | 50 | @Override 51 | public void setUp(Grammar grammar) { 52 | inhibitedPairs = settings.getIntListList("inhibitedPair"); 53 | } 54 | 55 | @Override 56 | public void edit(Grammar grammar) { 57 | for (List pair : inhibitedPairs) { 58 | if (pair.size() < 2) { 59 | continue; 60 | } 61 | inhibitConnection(grammar, pair.get(0).shortValue(), pair.get(1).shortValue()); 62 | } 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/InputTextBuilder.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | /** 20 | * A mutable sequence of characters. This class is used in 21 | * {@link InputTextPlugin#rewrite} to modify the input text. 22 | */ 23 | public interface InputTextBuilder { 24 | 25 | /** 26 | * Replaces the characters in a substring of this sequence with characters in 27 | * the specified {@code String}.
The substring begins at the specified 28 | * {@code begin} and extends to the character at index {@code end - 1} or to the 29 | * end of sequence if no such character exists. 30 | * 31 | * @param begin 32 | * the beginning index 33 | * @param end 34 | * the ending index 35 | * @param str 36 | * the replacement string 37 | * @throws StringIndexOutOfBoundsException 38 | * if {@code begin} is negative, greater than the length of the 39 | * sequence, or greater than {@code end}. 40 | */ 41 | public void replace(int begin, int end, String str); 42 | 43 | /** 44 | * Returns the sequence before all of the replacements. 45 | * 46 | * @return the sequence before all of the replacements 47 | */ 48 | public String getOriginalText(); 49 | 50 | /** 51 | * Returns the sequence as {@link String}. 52 | * 53 | * @return the sequence as {@link String} 54 | */ 55 | public String getText(); 56 | 57 | /** 58 | * Returns the immutable sequence of characters. 59 | * 60 | * @return the immutable sequence of characters 61 | */ 62 | public InputText build(); 63 | } 64 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/InputTextPlugin.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | import java.io.IOException; 20 | import com.worksap.nlp.sudachi.dictionary.Grammar; 21 | 22 | /** 23 | * A plugin that rewrites the characters of input texts. 24 | * 25 | *

26 | * {@link Dictionary} initializes this plugin with {@link Settings}. It can be 27 | * referred to as {@link Plugin#settings}. 28 | * 29 | *

30 | * The following is an example of settings. 31 | * 32 | *

33 |  * {@code
34 |  *   {
35 |  *     "class" : "com.worksap.nlp.sudachi.InputTextPlugin",
36 |  *     "example" : "example setting"
37 |  *   }
38 |  * }
39 |  * 
40 | */ 41 | public abstract class InputTextPlugin extends Plugin { 42 | 43 | /** 44 | * Set up the plugin. 45 | * 46 | * {@link Tokenizer} calls this method for setting up this plugin. 47 | * 48 | * @param grammar 49 | * the grammar of the system dictionary 50 | * @throws IOException 51 | * if reading something is failed 52 | */ 53 | public void setUp(Grammar grammar) throws IOException { 54 | } 55 | 56 | /** 57 | * Rewrite the input text. 58 | * 59 | * To rewrite the input text, you can use {@link InputTextBuilder#replace}. 60 | * 61 | * @param builder 62 | * the input text 63 | */ 64 | public abstract void rewrite(InputTextBuilder builder); 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/JoinKatakanaOovPlugin.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | import java.util.List; 20 | 21 | import com.worksap.nlp.sudachi.dictionary.CategoryType; 22 | import com.worksap.nlp.sudachi.dictionary.Grammar; 23 | 24 | /** 25 | * A plugin for concatenation of Katakana OOVs. 26 | * 27 | * This plugin concatenate the Katakana OOV and the adjacent Katakana morphemes. 28 | * 29 | *

30 | * The concatenated morpheme is OOV, and its part of speech must be specified in 31 | * the settings. 32 | * 33 | *

34 | * The following is an example of settings. 35 | * 36 | *

 37 |  * {@code
 38 |  *   {
 39 |  *     "class" : "com.worksap.nlp.sudachi.JoinKatakanaOovPlugin",
 40 |  *     "oovPOS" : [ "POS1", "POS2", ... ],
 41 |  *     "minLength" : 3
 42 |  *   }
 43 |  * }
 44 |  * 
45 | */ 46 | class JoinKatakanaOovPlugin extends PathRewritePlugin { 47 | 48 | short oovPosId; 49 | int minLength; 50 | 51 | @Override 52 | public void setUp(Grammar grammar) { 53 | List pos = settings.getStringList("oovPOS"); 54 | if (pos.isEmpty()) { 55 | throw new IllegalArgumentException("oovPOS is undefined"); 56 | } 57 | oovPosId = grammar.getPartOfSpeechId(pos); 58 | if (oovPosId < 0) { 59 | throw new IllegalArgumentException("oovPOS is invalid"); 60 | } 61 | minLength = settings.getInt("minLength", 1); 62 | if (minLength < 0) { 63 | throw new IllegalArgumentException("minLength is negative"); 64 | } 65 | } 66 | 67 | @Override 68 | public void rewrite(InputText text, List path, Lattice lattice) { 69 | for (int i = 0; i < path.size(); i++) { 70 | LatticeNode node = path.get(i); 71 | if ((node.isOOV() || isShorter(minLength, text, node)) && isKatakanaNode(text, node)) { 72 | int begin = i - 1; 73 | for (; begin >= 0; begin--) { 74 | if (!isKatakanaNode(text, path.get(begin))) { 75 | begin++; 76 | break; 77 | } 78 | } 79 | if (begin < 0) { 80 | begin = 0; 81 | } 82 | int end = i + 1; 83 | for (; end < path.size(); end++) { 84 | if (!isKatakanaNode(text, path.get(end))) { 85 | break; 86 | } 87 | } 88 | while (begin != end && !canOovBowNode(text, path.get(begin))) { 89 | begin++; 90 | } 91 | if (end - begin > 1) { 92 | concatenateOov(path, begin, end, oovPosId, lattice); 93 | i = begin + 1; 94 | } 95 | } 96 | } 97 | } 98 | 99 | boolean isKatakanaNode(InputText text, LatticeNode node) { 100 | return getCharCategoryTypes(text, node).contains(CategoryType.KATAKANA); 101 | } 102 | 103 | boolean isShorter(int length, InputText text, LatticeNode node) { 104 | return text.codePointCount(node.getBegin(), node.getEnd()) < length; 105 | } 106 | 107 | boolean canOovBowNode(InputText text, LatticeNode node) { 108 | return !text.getCharCategoryTypes(node.getBegin()).contains(CategoryType.NOOOVBOW); 109 | } 110 | } 111 | 
-------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/MorphemeFormatterPlugin.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | import java.io.IOException; 20 | import java.io.PrintStream; 21 | import java.util.List; 22 | 23 | /** 24 | * Provides a formatter for {@link Morpheme} 25 | * 26 | *

27 | * The following is an example of settings. 28 | * 29 | *

 30 |  * {@code
 31 |  *   {
 32 |  *     "class"   : "com.worksap.nlp.sudachi.MorphemeFormatterPlugin",
 33 |  *     "delimiter"  : "\n",
 34 |  *     "eos" : "\nEOS\n"
 35 |  *   }
 36 |  * }
 37 |  * 
38 | * 39 | * {@code delimiter} is the delimiter of the morphemes. {@code eos} is printed 40 | * at the position of EOS. 41 | */ 42 | public abstract class MorphemeFormatterPlugin extends Plugin { 43 | /* Defaults; they can be changed through setDelimiter and setEosString. */ 44 | protected String delimiter = "\n"; 45 | protected String eosString = "\nEOS\n"; 46 | protected boolean showDetails; 47 | 48 | /** 49 | * Set up the plugin. This base implementation only resets {@code showDetails} to {@code false}. 50 | * 51 | * {@link SudachiCommandLine} calls this method for setting up this plugin. 52 | */ 53 | public void setUp() throws IOException { 54 | showDetails = false; 55 | } 56 | 57 | /** 58 | * Provides a string representation of a morpheme. 59 | * 60 | * @param morpheme 61 | * the input 62 | * 63 | * @return a string representation of a morpheme. 64 | */ 65 | public abstract String formatMorpheme(Morpheme morpheme); 66 | 67 | /** 68 | * Show details. 69 | * 70 | * This method is called when the {@code -a} option is specified. 71 | */ 72 | public void showDetails() { 73 | showDetails = true; 74 | } 75 | /** Prints each morpheme via formatMorpheme, with {@code delimiter} between consecutive morphemes, followed by {@code eosString}. */ 76 | void printSentence(List sentence, PrintStream output) { 77 | boolean isFirst = true; 78 | for (Morpheme m : sentence) { 79 | if (isFirst) { 80 | isFirst = false; 81 | } else { 82 | output.print(delimiter); 83 | } 84 | output.print(formatMorpheme(m)); 85 | } 86 | output.print(eosString); 87 | } 88 | /** Sets the string printed between morphemes. */ 89 | public void setDelimiter(String delimiter) { 90 | this.delimiter = delimiter; 91 | } 92 | /** Sets the string printed after the last morpheme of a sentence. */ 93 | public void setEosString(String eosString) { 94 | this.eosString = eosString; 95 | } 96 | /** Sets the {@code showDetails} flag directly. */ 97 | public void setShowDetails(boolean showDetails) { 98 | this.showDetails = showDetails; 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/MorphemeImpl.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd. 
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | import java.util.List; 20 | 21 | import com.worksap.nlp.sudachi.dictionary.WordInfo; 22 | /** Morpheme implementation that delegates to one entry ({@code index}) of a MorphemeList and caches that entry's WordInfo on first use. */ 23 | class MorphemeImpl implements Morpheme { 24 | 25 | final MorphemeList list; 26 | final int index; 27 | WordInfo wordInfo; 28 | /** Binds this morpheme to entry {@code index} of {@code list}; no WordInfo is loaded here. */ 29 | MorphemeImpl(MorphemeList list, int index) { 30 | this.list = list; 31 | this.index = index; 32 | } 33 | 34 | @Override 35 | public int begin() { 36 | return list.getBegin(index); 37 | } 38 | 39 | @Override 40 | public int end() { 41 | return list.getEnd(index); 42 | } 43 | 44 | @Override 45 | public String surface() { 46 | return list.getSurface(index); 47 | } 48 | 49 | @Override 50 | public List partOfSpeech() { 51 | WordInfo wi = getWordInfo(); 52 | return list.grammar.getPartOfSpeechString(wi.getPOSId()); 53 | } 54 | 55 | @Override 56 | public short partOfSpeechId() { 57 | WordInfo wi = getWordInfo(); 58 | return wi.getPOSId(); 59 | } 60 | 61 | @Override 62 | public String dictionaryForm() { 63 | WordInfo wi = getWordInfo(); 64 | return wi.getDictionaryForm(); 65 | } 66 | 67 | @Override 68 | public String normalizedForm() { 69 | WordInfo wi = getWordInfo(); 70 | return wi.getNormalizedForm(); 71 | } 72 | 73 | @Override 74 | public String readingForm() { 75 | WordInfo wi = getWordInfo(); 76 | return wi.getReadingForm(); 77 | } 78 | 79 | @Override 80 | public List split(Tokenizer.SplitMode mode) { 
81 | return list.split(mode, index); 82 | } 83 | 84 | @Override 85 | public boolean isOOV() { 86 | return list.isOOV(index); 87 | } 88 | 89 | @Override 90 | public int getWordId() { 91 | return list.getWordId(index); 92 | } 93 | 94 | @Override 95 | public int getDictionaryId() { 96 | return list.getDictionaryId(index); 97 | } 98 | 99 | @Override 100 | public int[] getSynonymGroupIds() { 101 | WordInfo wi = getWordInfo(); 102 | return wi.getSynonymGoupIds(); 103 | } 104 | /** Returns the WordInfo of this entry, fetching it from the list on the first call and reusing the cached value afterwards. */ 105 | WordInfo getWordInfo() { 106 | if (wordInfo == null) { 107 | wordInfo = list.getWordInfo(index); 108 | } 109 | return wordInfo; 110 | } 111 | /** Debug representation: begin, end, surface, POS id/strings and the (dictionary, word) parts of the word id. */ 112 | @Override 113 | public String toString() { 114 | final StringBuilder sb = new StringBuilder("MorphemeImpl{"); 115 | sb.append("begin=").append(begin()); 116 | sb.append(", end=").append(end()); 117 | sb.append(", surface=").append(surface()); 118 | sb.append(", pos=").append(partOfSpeechId()).append('/').append(partOfSpeech()); 119 | int wordId = getWordId(); 120 | sb.append(", wid=(").append(WordId.dic(wordId)).append(',').append(WordId.word(wordId)); 121 | sb.append(")}"); 122 | return sb.toString(); 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/PartialPOS.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | import com.worksap.nlp.sudachi.dictionary.POS; 20 | 21 | import java.util.AbstractList; 22 | import java.util.Arrays; 23 | import java.util.List; 24 | /** A POS pattern of 1 to POS.DEPTH components, exposed as a read-only list view over the given data; a null component is skipped by matches(). */ 25 | public class PartialPOS extends AbstractList { 26 | private final List data; 27 | /** Validates and stores {@code data} (by reference, no copy). Throws IllegalArgumentException when it is empty, has more than POS.DEPTH components, or a non-null component is longer than POS.MAX_COMPONENT_LENGTH. */ 28 | public PartialPOS(List data) { 29 | if (data.size() == 0) { 30 | throw new IllegalArgumentException("Partial POS must have at least 1 component"); 31 | } 32 | if (data.size() > POS.DEPTH) { 33 | throw new IllegalArgumentException("Partial POS can have at most " + POS.DEPTH + " components, was " + data); 34 | } 35 | for (String component : data) { 36 | if (component != null && component.length() > POS.MAX_COMPONENT_LENGTH) { 37 | throw new IllegalArgumentException("Component length can't be more than " + POS.MAX_COMPONENT_LENGTH 38 | + ", was " + component.length() + ":" + component); 39 | } 40 | } 41 | this.data = data; 42 | } 43 | /** Varargs convenience constructor; wraps the array with Arrays.asList and applies the same validation. */ 44 | public PartialPOS(String... data) { 45 | this(Arrays.asList(data)); 46 | } 47 | 48 | @Override 49 | public String get(int index) { 50 | return data.get(index); 51 | } 52 | 53 | @Override 54 | public int size() { 55 | return data.size(); 56 | } 57 | /** Returns true when every non-null component equals the component of {@code pos} at the same level. */ 58 | boolean matches(POS pos) { 59 | for (int level = 0; level < data.size(); ++level) { 60 | String s = data.get(level); 61 | if (s == null) { 62 | continue; 63 | } 64 | if (!s.equals(pos.get(level))) { 65 | return false; 66 | } 67 | } 68 | return true; 69 | } 70 | /** Joins the components with commas. */ 71 | @Override 72 | public String toString() { 73 | return String.join(",", data); 74 | } 75 | /** Static factory equivalent to the varargs constructor. */ 76 | public static PartialPOS of(String... 
parts) { 77 | return new PartialPOS(parts); 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/Plugin.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | /** Base class of plugins; holds the Settings object that subclasses read their configuration from. */ 19 | abstract class Plugin { 20 | 21 | protected Settings settings; 22 | /** Stores the settings for later use by the plugin. */ 23 | void setSettings(Settings settings) { 24 | this.settings = settings; 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/ProlongedSoundMarkInputTextPlugin.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | import java.io.IOException; 20 | import java.util.HashSet; 21 | import java.util.Set; 22 | import java.util.List; 23 | 24 | import com.worksap.nlp.sudachi.dictionary.Grammar; 25 | 26 | /** 27 | * A plugin that rewrites the Katakana-Hiragana Prolonged Sound Mark (Chōonpu) 28 | * and similar symbols. 29 | * 30 | *

31 | * This plugin combines the continuous sequence of prolonged sound marks to 1 32 | * character. 33 | * 34 | *

35 | * {@link Dictionary} initializes this plugin with {@link Settings}, which can be 36 | * referred to as {@link Plugin#settings}. 37 | * 38 | *

39 | * The following is an example of settings. 40 | * 41 | *

42 |  * {@code
43 |  *   {
44 |  *     "class" : "com.worksap.nlp.sudachi.ProlongedSoundMarkInputTextPlugin",
45 |  *     "prolongedSoundMarks": ["ー", "〜", "〰"],
46 |  *     "replacementSymbol": "ー"
47 |  *   }
48 |  * }
49 |  * 
50 | * 51 | * {@code prolongedSoundMarks} is the list of symbols to be combined. 52 | * {@code replacementSymbol} is the symbol for replacement, after combining 53 | * prolonged sound mark sequences. 54 | * 55 | *

56 | * With the above example settings, the plugin rewrites input "エーービ〜〜〜シ〰〰〰〰" to 57 | * "エービーシー". 58 | */ 59 | class ProlongedSoundMarkInputTextPlugin extends InputTextPlugin { 60 | /* Code points treated as prolonged sound marks; filled in setUp. */ 61 | private Set prolongedSoundMarkSet = new HashSet<>(); 62 | private String replacementSymbol; 63 | /** Reads "prolongedSoundMarks" (the first code point of each string is used) and "replacementSymbol" from the settings. */ 64 | @Override 65 | public void setUp(Grammar Grammar) throws IOException { 66 | List prolongedSoundMarkStrings = settings.getStringList("prolongedSoundMarks"); 67 | for (String s : prolongedSoundMarkStrings) { 68 | prolongedSoundMarkSet.add(s.codePointAt(0)); 69 | } 70 | replacementSymbol = settings.getString("replacementSymbol"); 71 | } 72 | /** Replaces every run of two or more consecutive prolonged sound marks with replacementSymbol; single marks are left untouched. */ 73 | @Override 74 | public void rewrite(InputTextBuilder builder) { 75 | String text = builder.getText(); 76 | /* offset = number of chars by which earlier replacements shortened the text; assumes builder.replace takes indexes into the builder's current (already modified) text, as the "- offset" arithmetic below implies. */ 77 | int n = text.length(); 78 | int offset = 0; 79 | int markStartIndex = n; 80 | boolean isProlongedSoundMark = false; 81 | for (int i = 0; i < n; i++) { 82 | int cp = text.codePointAt(i); 83 | if (!isProlongedSoundMark && prolongedSoundMarkSet.contains(cp)) { 84 | isProlongedSoundMark = true; 85 | markStartIndex = i; 86 | } else if (isProlongedSoundMark && !prolongedSoundMarkSet.contains(cp)) { 87 | if ((i - markStartIndex) > 1) { 88 | builder.replace(markStartIndex - offset, i - offset, replacementSymbol); 89 | offset += i - markStartIndex - replacementSymbol.length(); 90 | } 91 | isProlongedSoundMark = false; 92 | } 93 | } 94 | if (isProlongedSoundMark && (n - markStartIndex) > 1) { 95 | builder.replace(markStartIndex - offset, n - offset, replacementSymbol); 96 | } 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/SentenceSplittingAnalysis.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2023 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | import com.worksap.nlp.sudachi.dictionary.LexiconSet; 20 | import com.worksap.nlp.sudachi.sentdetect.SentenceDetector; 21 | 22 | import java.util.ArrayList; 23 | import java.util.Iterator; 24 | /** Splits a buffer into sentences with a SentenceDetector and tokenizes each sentence, appending the results to {@code result}; also serves as the detector's non-break checker. */ 25 | /*internal*/ class SentenceSplittingAnalysis implements SentenceDetector.NonBreakCheker { 26 | private final SentenceDetector detector = new SentenceDetector(); 27 | 28 | private final Tokenizer.SplitMode mode; 29 | private final JapaneseTokenizer tokenizer; 30 | final ArrayList result = new ArrayList<>(); 31 | 32 | SentenceSplittingAnalysis(Tokenizer.SplitMode mode, JapaneseTokenizer tokenizer) { 33 | this.mode = mode; 34 | this.tokenizer = tokenizer; 35 | } 36 | /* State of the current tokenizeBuffer call; read by bosPosition and hasNonBreakWord. */ 37 | UTF8InputText input; 38 | int bos; 39 | /** Builds the input text for {@code buffer}, then repeatedly asks the detector for the next sentence end, tokenizes each sentence into {@code result} and advances {@code bos}. Returns the last value reported by the detector, or its negation when that value is negative and its magnitude equals the buffer length (the whole input is then tokenized as one sentence). */ 40 | int tokenizeBuffer(CharSequence buffer) { 41 | UTF8InputText input = tokenizer.buildInputText(buffer); 42 | String normalized = input.getText(); 43 | this.input = input; 44 | 45 | int bos = 0; 46 | int length; 47 | 48 | this.bos = bos; 49 | while ((length = detector.getEos(normalized, this)) > 0) { 50 | int eos = bos + length; 51 | if (eos < normalized.length()) { 52 | eos = input.getNextInOriginal(eos - 1); 53 | length = eos - bos; 54 | } 55 | UTF8InputText sentence = input.slice(bos, eos); 56 | result.add(tokenizer.tokenizeSentence(mode, sentence)); 57 | normalized = normalized.substring(length); 58 | bos = eos; 59 | this.bos = bos; 60 | } 61 | 62 | // buffer is full, need to clean it up 63 | if (length < 0 && buffer.length() == -length) { 64 | 
result.add(tokenizer.tokenizeSentence(mode, input)); 65 | return -length; 66 | } 67 | 68 | return length; 69 | } 70 | /** Converts the current sentence start ({@code bos}) to an index in the original text. */ 71 | int bosPosition() { 72 | return input.textIndexToOriginalTextIndex(bos); 73 | } 74 | /** Looks up lexicon entries starting at each of the up-to-64 byte positions before the candidate sentence end; returns true when an entry ends beyond that end, or ends exactly on it and passes the offset check below. */ 75 | @Override 76 | public boolean hasNonBreakWord(int length) { 77 | UTF8InputText inp = input; 78 | int byteEOS = inp.getCodePointsOffsetLength(0, bos + length); 79 | byte[] bytes = inp.getByteText(); 80 | LexiconSet lexicon = tokenizer.lexicon; 81 | for (int i = Math.max(0, byteEOS - 64); i < byteEOS; i++) { 82 | Iterator iterator = lexicon.lookup(bytes, i); 83 | while (iterator.hasNext()) { 84 | int[] r = iterator.next(); 85 | int l = r[1]; 86 | if (l > byteEOS || (l == byteEOS && bos + length - inp.modifiedOffset(i) > 1)) { 87 | return true; 88 | } 89 | } 90 | } 91 | return false; 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/SimpleMorphemeFormatter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | import java.io.IOException; 20 | import java.util.Arrays; 21 | 22 | /** 23 | * Provides a formatter for {@link Morpheme} 24 | * 25 | *

26 | * The following is an example of settings. 27 | * 28 | *

29 |  * {@code
30 |  *   {
31 |  *     "class"   : "com.worksap.nlp.sudachi.SimpleMorphemeFormatter",
32 |  *     "delimiter"  : "\n",
33 |  *     "eos" : "\nEOS\n",
34 |  *     "columnDelimiter" : "\t"
35 |  *   }
36 |  * }
37 |  * 
38 | * 39 | * {@code delimiter} is the delimiter of the morphemes. {@code eos} is printed 40 | * at the position of EOS. {@code columnDelimiter} is the delimiter of the 41 | * fields. 42 | */ 43 | public class SimpleMorphemeFormatter extends MorphemeFormatterPlugin { 44 | /* Separator between output fields; read from the "columnDelimiter" setting in setUp. */ 45 | protected String columnDelimiter; 46 | /** Runs the base set-up, then reads "columnDelimiter" from the settings (default: tab). */ 47 | @Override 48 | public void setUp() throws IOException { 49 | super.setUp(); 50 | columnDelimiter = settings.getString("columnDelimiter", "\t"); 51 | } 52 | /** Formats surface, comma-joined POS and normalized form; with showDetails also appends dictionary form, reading form, dictionary id, synonym group ids and "(OOV)" for OOV morphemes, all separated by columnDelimiter. */ 53 | @Override 54 | public String formatMorpheme(Morpheme morpheme) { 55 | String output = morpheme.surface() + columnDelimiter + String.join(",", morpheme.partOfSpeech()) 56 | + columnDelimiter + morpheme.normalizedForm(); 57 | if (showDetails) { 58 | output += columnDelimiter + morpheme.dictionaryForm() + columnDelimiter + morpheme.readingForm() 59 | + columnDelimiter + morpheme.getDictionaryId() + columnDelimiter 60 | + Arrays.toString(morpheme.getSynonymGroupIds()) + columnDelimiter 61 | + ((morpheme.isOOV()) ? "(OOV)" : ""); 62 | } 63 | return output; 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/SimpleOovProviderPlugin.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | import com.worksap.nlp.sudachi.dictionary.Grammar; 20 | import com.worksap.nlp.sudachi.dictionary.POS; 21 | import com.worksap.nlp.sudachi.dictionary.WordInfo; 22 | 23 | import java.util.List; 24 | 25 | /** 26 | * Provides the OOVs which consists of a maximum run of characters of a single 27 | * character class. Does not produce OOVs if there was any other word at the 28 | * boundary. 29 | * 30 | *

31 | * The following is an example of settings. 32 | * 33 | *

34 |  * {@code
35 |  *   {
36 |  *     "class"   : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin",
37 |  *     "oovPOS"  : [ "補助記号", "一般", "*", "*", "*", "*" ],
38 |  *     "leftId"  : 5968,
39 |  *     "rightId" : 5968,
40 |  *     "cost"    : 3857
41 |  *   }
42 |  * }
43 |  * 
44 | * 45 | * {@code oovPOS} is the part of speech of the OOVs. {@code leftId} is the 46 | * left-ID of the OOVs. {@code rightId} is the right-ID of the OOVs. 47 | * {@code cost} is the cost of the OOVs. 48 | */ 49 | class SimpleOovProviderPlugin extends OovProviderPlugin { 50 | /* All four values are assigned in setUp from the plugin settings. */ 51 | short oovPOSId; 52 | short leftId; 53 | short rightId; 54 | short cost; 55 | /** Reads "oovPOS", "leftId", "rightId" and "cost" (the ints are narrowed to short) and resolves the POS id via posIdOf, using the USER_POS setting (default USER_POS_FORBID). */ 56 | @Override 57 | public void setUp(Grammar grammar) { 58 | POS pos = new POS(settings.getStringList("oovPOS")); 59 | leftId = (short) settings.getInt("leftId"); 60 | rightId = (short) settings.getInt("rightId"); 61 | cost = (short) settings.getInt("cost"); 62 | String userPosMode = settings.getString(USER_POS, USER_POS_FORBID); 63 | oovPOSId = posIdOf(grammar, pos, userPosMode); 64 | } 65 | /** When {@code otherWords} is 0, adds one OOV node covering the word-candidate length at {@code offset} (surface used for all string fields) and returns 1; otherwise adds nothing and returns 0. */ 66 | @Override 67 | public int provideOOV(InputText inputText, int offset, long otherWords, List nodes) { 68 | if (otherWords == 0) { 69 | LatticeNodeImpl node = createNode(); 70 | node.setParameter(leftId, rightId, cost); 71 | int length = inputText.getWordCandidateLength(offset); 72 | String s = inputText.getSubstring(offset, offset + length); 73 | WordInfo info = new WordInfo(s, (short) length, oovPOSId, s, s, ""); 74 | node.setWordInfo(info); 75 | nodes.add(node); 76 | return 1; 77 | } else { 78 | return 0; 79 | } 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/StringUtil.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | import java.io.IOException; 20 | import java.io.InputStream; 21 | import java.io.InputStreamReader; 22 | import java.net.URL; 23 | import java.nio.ByteBuffer; 24 | import java.nio.ByteOrder; 25 | import java.nio.CharBuffer; 26 | import java.nio.charset.StandardCharsets; 27 | import java.nio.file.Files; 28 | import java.nio.file.Path; 29 | import java.util.Arrays; 30 | /** Static helpers that read a whole URL, path or stream into a String (decoded as UTF-8) or a ByteBuffer. */ 31 | public class StringUtil { 32 | private StringUtil() { 33 | } 34 | /** Opens the URL, reads it fully as UTF-8 text and closes the stream. */ 35 | public static String readFully(URL url) throws IOException { 36 | try (InputStream inputStream = url.openStream()) { 37 | return readFully(inputStream); 38 | } 39 | } 40 | /** Opens the file, reads it fully as UTF-8 text and closes the stream. */ 41 | public static String readFully(Path path) throws IOException { 42 | try (InputStream is = Files.newInputStream(path)) { 43 | return readFully(is); 44 | } 45 | } 46 | /** Reads the stream to its end as UTF-8 text in 1024-char chunks; the stream is not closed here. */ 47 | public static String readFully(InputStream stream) throws IOException { 48 | InputStreamReader isr = new InputStreamReader(stream, StandardCharsets.UTF_8); 49 | StringBuilder sb = new StringBuilder(); 50 | CharBuffer cb = CharBuffer.allocate(1024); 51 | while (isr.read(cb) != -1) { 52 | cb.flip(); 53 | sb.append(cb); 54 | cb.clear(); 55 | } 56 | return sb.toString(); 57 | } 58 | /** Same as readAllBytes(url, ByteOrder.LITTLE_ENDIAN). */ 59 | public static ByteBuffer readAllBytes(URL url) throws IOException { 60 | return readAllBytes(url, ByteOrder.LITTLE_ENDIAN); 61 | } 62 | /** Opens the URL, reads all bytes into a buffer with the given byte order and closes the stream. */ 63 | public static ByteBuffer readAllBytes(URL url, ByteOrder order) throws IOException { 64 | try (InputStream is = url.openStream()) { 65 | return readAllBytes(is, 
order); 66 | } 67 | } 68 | /** Same as readAllBytes(inputStream, ByteOrder.LITTLE_ENDIAN). */ 69 | public static ByteBuffer readAllBytes(InputStream inputStream) throws IOException { 70 | return readAllBytes(inputStream, ByteOrder.LITTLE_ENDIAN); 71 | } 72 | /** Reads the stream to its end into a byte array that starts at available() + 1024 bytes and doubles whenever it fills up; returns it wrapped in a ByteBuffer whose limit is the number of bytes read and whose order is {@code order}. The stream is not closed here. */ 73 | public static ByteBuffer readAllBytes(InputStream inputStream, ByteOrder order) throws IOException { 74 | byte[] buffer = new byte[inputStream.available() + 1024]; 75 | int offset = 0; 76 | 77 | while (true) { 78 | int nread = inputStream.read(buffer, offset, buffer.length - offset); 79 | if (nread >= 0) { 80 | offset += nread; 81 | if (offset == buffer.length) { 82 | buffer = Arrays.copyOf(buffer, buffer.length * 2); 83 | } 84 | } else { 85 | break; 86 | } 87 | } 88 | ByteBuffer bbuf = ByteBuffer.wrap(buffer); 89 | bbuf.limit(offset); 90 | bbuf.order(order); 91 | return bbuf; 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/WordId.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | public class WordId { 20 | private WordId() { 21 | } 22 | 23 | /** 24 | * Internal word ids can't be larger than this number 25 | */ 26 | public static final int MAX_WORD_ID = 0x0fffffff; 27 | 28 | /** 29 | * Dictionary ids can't be larger than this number 30 | */ 31 | public static final int MAX_DIC_ID = 0xe; 32 | /** Combines the parts without any range check: the dictionary id is shifted left by 28 bits and OR-ed with the word id. */ 33 | public static int makeUnchecked(int dic, int word) { 34 | int dicPart = dicIdMask(dic); 35 | return dicPart | word; 36 | } 37 | 38 | /** 39 | * Make combined WordId from dictionary and internal parts. This method checks 40 | * only the upper bounds (IndexOutOfBoundsException); negative values are not rejected. 41 | * 42 | * @param dic 43 | * dictionary id. 0 is system, 1 and above are user. 44 | * @param word 45 | * word id inside the dictionary. 46 | * @return combined word id. 47 | */ 48 | public static int make(int dic, int word) { 49 | if (word > MAX_WORD_ID) { 50 | throw new IndexOutOfBoundsException("wordId is too large: " + word); 51 | } 52 | if (dic > MAX_DIC_ID) { 53 | throw new IndexOutOfBoundsException("dictionaryId is too large: " + dic); 54 | } 55 | return makeUnchecked(dic, word); 56 | } 57 | 58 | /** 59 | * Extract dictionary number from the combined word id 60 | * 61 | * @param wordId 62 | * combined word id 63 | * @return dictionary number 64 | */ 65 | public static int dic(int wordId) { 66 | return wordId >>> 28; 67 | } 68 | 69 | /** 70 | * Extract internal word id from the combined word id 71 | * 72 | * @param wordId 73 | * combined word id 74 | * @return internal word id 75 | */ 76 | public static int word(int wordId) { 77 | return wordId & MAX_WORD_ID; 78 | } 79 | /** Returns the dictionary id moved into the top four bits ({@code dicId << 28}). */ 80 | public static int dicIdMask(int dicId) { 81 | return dicId << 28; 82 | } 83 | /** Keeps the low 28 bits of {@code wordId} and ORs in an already shifted dictionary mask. */ 84 | public static int applyMask(int wordId, int dicIdMask) { 85 | return (wordId & MAX_WORD_ID) | dicIdMask; 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/WordMask.java: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | /** Helpers for a 64-bit set of 1-based positions stored in a {@code long}; position p is bit p - 1, and larger positions share the highest bit. */ 19 | public class WordMask { 20 | public static final int MAX_LENGTH = 63; 21 | 22 | // instance creation is forbidden 23 | private WordMask() { 24 | 25 | } 26 | 27 | /** 28 | * Add n-th element to wordMask 29 | * 30 | * @param positions 31 | * current mask of word positions 32 | * @param position 33 | * new position to add 34 | * @return position mask with the new element added 35 | */ 36 | public static long addNth(long positions, int position) { 37 | return positions | nth(position); 38 | } 39 | 40 | /** 41 | * Create a word mask with nth position set. Positions are 1-based; the bit index is clamped to MAX_LENGTH. 42 | * 43 | * @param position 44 | * number of set position 45 | * @return a word mask bitset 46 | */ 47 | public static long nth(int position) { 48 | assert position > 0; 49 | int fixedPosition = Math.min(position - 1, MAX_LENGTH); 50 | return 1L << fixedPosition; 51 | } 52 | 53 | /** 54 | * Checks that a word mask has nth position set 55 | * 56 | * @param positions 57 | * word mask of positions 58 | * @param position 59 | * position to check 60 | * @return whether the checked position was included in the set 61 | */ 62 | public static boolean hasNth(long positions, int position) { 63 | return (positions & nth(position)) != 0; 64 | } 65 
| } 66 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/WordSegmentationFormatter.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | import java.io.IOException; 20 | import java.io.PrintStream; 21 | import java.util.List; 22 | 23 | /** 24 | * Provides a formatter for {@link Morpheme} 25 | * 26 | *

27 | * The following is an example of settings. 28 | * 29 | *

30 |  * {@code
31 |  *   {
32 |  *     "class" : "com.worksap.nlp.sudachi.WordSegmentationFormatter",
33 |  *     "delimiter" : " ",
34 |  *     "eos" : "\n"
35 |  *   }
36 |  * }
37 |  * 
38 | * 39 | * {@code delimiter} is the delimiter of the morphemes. {@code eos} is printed 40 | * at the position of EOS. 41 | */ 42 | public class WordSegmentationFormatter extends MorphemeFormatterPlugin { 43 | /** Runs the base set-up, then reads "delimiter" (default: a single space) and "eos" (default: newline) from the settings. */ 44 | @Override 45 | public void setUp() throws IOException { 46 | super.setUp(); 47 | delimiter = settings.getString("delimiter", " "); 48 | eosString = settings.getString("eos", "\n"); 49 | } 50 | /** Returns only the surface of the morpheme. */ 51 | @Override 52 | public String formatMorpheme(Morpheme morpheme) { 53 | return morpheme.surface(); 54 | } 55 | /** Prints the surfaces separated by {@code delimiter}, skipping morphemes whose surface is empty or equal to the delimiter itself, then prints {@code eosString}. */ 56 | @Override 57 | void printSentence(List sentence, PrintStream output) { 58 | boolean isFirst = true; 59 | for (Morpheme m : sentence) { 60 | String morpheme = formatMorpheme(m); 61 | if (morpheme.equals("")) { 62 | continue; 63 | } 64 | if (morpheme.equals(delimiter)) { 65 | continue; 66 | } 67 | if (isFirst) { 68 | isFirst = false; 69 | } else { 70 | output.print(delimiter); 71 | } 72 | output.print(morpheme); 73 | } 74 | output.print(eosString); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/dictionary/BinaryDictionary.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 |  */
16 |
17 | package com.worksap.nlp.sudachi.dictionary;
18 |
19 | import java.io.Closeable;
20 | import java.io.IOException;
21 | import java.nio.ByteBuffer;
22 |
23 | import com.worksap.nlp.sudachi.Config;
24 | import com.worksap.nlp.sudachi.MMap;
25 |
26 | public class BinaryDictionary implements Closeable, DictionaryAccess { // header + optional grammar + lexicon read from one buffer
27 |
28 |     private final ByteBuffer bytes;
29 |     private final DictionaryHeader header;
30 |     private final GrammarImpl grammar;
31 |     private final DoubleArrayLexicon lexicon;
32 |
33 |     public BinaryDictionary(String fileName) throws IOException { // maps the file and delegates to the buffer constructor
34 |         this(MMap.map(fileName));
35 |     }
36 |
37 |     public BinaryDictionary(ByteBuffer dictionary) throws IOException { // parts are read sequentially; offset tracks the next part
38 |         int offset = 0;
39 |         bytes = dictionary;
40 |
41 |         header = new DictionaryHeader(bytes, offset);
42 |         offset += header.storageSize();
43 |
44 |         long version = header.getVersion();
45 |         if (DictionaryVersion.hasGrammar(version)) {
46 |             grammar = new GrammarImpl(bytes, offset);
47 |             offset += grammar.storageSize();
48 |         } else if (header.isUserDictionary()) { // user dictionary version without an embedded grammar
49 |             grammar = new GrammarImpl();
50 |         } else {
51 |             MMap.unmap(bytes); // release the buffer before rejecting the data
52 |             throw new IOException("invalid dictionary");
53 |         }
54 |
55 |         lexicon = new DoubleArrayLexicon(bytes, offset, DictionaryVersion.hasSynonymGroupIds(version)); // NOTE(review): if a part constructor throws, bytes is not unmapped here - confirm callers handle that
56 |     }
57 |
58 |     public static BinaryDictionary loadSystem(String fileName) throws IOException {
59 |         return loadSystem(MMap.map(fileName));
60 |     }
61 |
62 |     public static BinaryDictionary loadUser(String fileName) throws IOException {
63 |         return loadUser(MMap.map(fileName));
64 |     }
65 |
66 |     public static BinaryDictionary loadSystem(ByteBuffer buffer) throws IOException { // fails unless the header marks a system dictionary
67 |         BinaryDictionary dict = new BinaryDictionary(buffer);
68 |         if (!dict.getDictionaryHeader().isSystemDictionary()) {
69 |             dict.close(); // close() unmaps the buffer
70 |             throw new IOException("invalid system dictionary");
71 |         }
72 |         return dict;
73 |     }
74 |
75 |     public static BinaryDictionary loadUser(ByteBuffer buffer) throws IOException { // fails unless the header marks a user dictionary
76 |         BinaryDictionary dict = new BinaryDictionary(buffer);
77 |         if (!dict.getDictionaryHeader().isUserDictionary()) {
78 |             dict.close();
79 |             throw new IOException("invalid user dictionary");
80 |         }
81 |         return dict;
82 |     }
83 |
84 |     public static BinaryDictionary loadSystem(Config.Resource resource) throws IOException {
85 |         return resource.consume(res -> loadSystem(res.asByteBuffer()));
86 |     }
87 |
88 |     public static BinaryDictionary loadUser(Config.Resource resource) throws IOException {
89 |         return resource.consume(res -> loadUser(res.asByteBuffer()));
90 |     }
91 |
92 |     @Override
93 |     public void close() throws IOException { // unmaps the buffer this dictionary was built from
94 |         MMap.unmap(bytes);
95 |     }
96 |
97 |     public DictionaryHeader getDictionaryHeader() {
98 |         return header;
99 |     }
100 |
101 |     public GrammarImpl getGrammar() {
102 |         return grammar;
103 |     }
104 |
105 |     public DoubleArrayLexicon getLexicon() {
106 |         return lexicon;
107 |     }
108 | }
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/CategoryType.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2021 Works Applications Co., Ltd.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 |
17 | package com.worksap.nlp.sudachi.dictionary;
18 |
19 | /**
20 |  * Categories of characters.
21 | * 22 | * These categories are used in the 23 | * {@link com.worksap.nlp.sudachi.OovProviderPlugin} and 24 | * {@link com.worksap.nlp.sudachi.PathRewritePlugin}. 25 | * 26 | *

27 | * You can defined the range of each category in the file which specified 28 | * "characterDefinitionFile" of the settings. 29 | */ 30 | public enum CategoryType { 31 | /** The fall back category. */ 32 | DEFAULT(1), 33 | /** White spaces. */ 34 | SPACE(1 << 1), 35 | /** CJKV ideographic characters. */ 36 | KANJI(1 << 2), 37 | /** Symbols. */ 38 | SYMBOL(1 << 3), 39 | /** Numerical characters. */ 40 | NUMERIC(1 << 4), 41 | /** Latin alphabets. */ 42 | ALPHA(1 << 5), 43 | /** Hiragana characters. */ 44 | HIRAGANA(1 << 6), 45 | /** Katakana characters. */ 46 | KATAKANA(1 << 7), 47 | /** Kanji numeric characters. */ 48 | KANJINUMERIC(1 << 8), 49 | /** Greek alphabets. */ 50 | GREEK(1 << 9), 51 | /** Cyrillic alphabets. */ 52 | CYRILLIC(1 << 10), 53 | /** User defined category. */ 54 | USER1(1 << 11), 55 | /** User defined category. */ 56 | USER2(1 << 12), 57 | /** User defined category. */ 58 | USER3(1 << 13), 59 | /** User defined category. */ 60 | USER4(1 << 14), 61 | /** Characters that cannot be the beginning of word */ 62 | NOOOVBOW(1 << 15); 63 | 64 | private final int id; 65 | 66 | private CategoryType(int id) { 67 | this.id = id; 68 | } 69 | 70 | /** 71 | * Returns the integer ID number of the category. 72 | * 73 | * @return the ID number of the category 74 | */ 75 | public int getId() { 76 | return id; 77 | } 78 | 79 | /** 80 | * Returns the category to which the specified ID is mapped, or {@code null} if 81 | * there is no associated category. 82 | * 83 | * @param id 84 | * the ID number of category 85 | * @return the category to which the specified ID is mapped, or {@code null} if 86 | * there is no associated category. 
87 | */ 88 | public static CategoryType getType(int id) { 89 | for (CategoryType type : CategoryType.values()) { 90 | if (type.getId() == id) { 91 | return type; 92 | } 93 | } 94 | return null; 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/dictionary/Connection.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.worksap.nlp.sudachi.dictionary; 18 | 19 | import java.nio.ShortBuffer; 20 | 21 | /** 22 | * CRF weights compressed into 2D u16 matrix in MeCab manner 23 | */ 24 | public final class Connection { 25 | private final ShortBuffer matrix; 26 | private final int leftSize; 27 | private final int rightSize; 28 | 29 | public Connection(ShortBuffer matrix, int leftSize, int rightSize) { 30 | this.matrix = matrix; 31 | this.leftSize = leftSize; 32 | this.rightSize = rightSize; 33 | } 34 | 35 | private int ix(int left, int right) { 36 | assert left < leftSize; 37 | assert right < rightSize; 38 | return right * leftSize + left; 39 | } 40 | 41 | /** 42 | * 43 | * @param left 44 | * left connection index 45 | * @param right 46 | * right connection index 47 | * @return connection weight in the matrix 48 | */ 49 | public short cost(int left, int right) { 50 | return matrix.get(ix(left, right)); 51 | } 52 | 53 | public int getLeftSize() { 54 | return leftSize; 55 | } 56 | 57 | public int getRightSize() { 58 | return rightSize; 59 | } 60 | 61 | public void setCost(int left, int right, short cost) { 62 | matrix.put(ix(left, right), cost); 63 | } 64 | 65 | /** 66 | * @return a copy of itself with the buffer owned, instead of slice 67 | */ 68 | public Connection ownedCopy() { 69 | ShortBuffer copy = ShortBuffer.allocate(matrix.limit()); 70 | copy.put(matrix); 71 | 72 | return new Connection(copy, leftSize, rightSize); 73 | } 74 | 75 | public void validate(int leftId) { 76 | if (matrix == null) { 77 | // should never happen, but elides compiler checks 78 | throw new NullPointerException("matrix"); 79 | } 80 | 81 | if (leftId >= leftSize) { 82 | // should never happen, but adds a compiler precondition to the inlined method 83 | throw new IllegalArgumentException(String.format("leftId < leftSize: (%d, %d)", leftId, leftSize)); 84 | } 85 | } 86 | } 87 | -------------------------------------------------------------------------------- 
/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryAccess.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2021 Works Applications Co., Ltd.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 |
17 | package com.worksap.nlp.sudachi.dictionary;
18 |
19 | /**
20 |  * Marks access into dictionary internals: the lexicon and the grammar.
21 |  */
22 | public interface DictionaryAccess {
23 |     /**
24 |      * Gets the current lexicon.
25 |      *
26 |      * @return Lexicon implementation
27 |      */
28 |     Lexicon getLexicon();
29 |
30 |     /**
31 |      * Gets the current grammar.
32 |      *
33 |      * @return current grammar, as the concrete {@code GrammarImpl} type
34 |      */
35 |     GrammarImpl getGrammar();
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderPrinter.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2021 Works Applications Co., Ltd.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 |
17 | package com.worksap.nlp.sudachi.dictionary;
18 |
19 | import java.io.FileInputStream;
20 | import java.io.IOException;
21 | import java.io.PrintStream;
22 | import java.nio.ByteBuffer;
23 | import java.nio.ByteOrder;
24 | import java.nio.channels.FileChannel;
25 | import java.time.Instant;
26 | import java.time.ZoneId;
27 |
28 | /**
29 |  * A dictionary header printing tool.
30 |  */
31 | public class DictionaryHeaderPrinter {
32 |
33 |     private DictionaryHeaderPrinter() { // static-only tool, not instantiable
34 |     }
35 |
36 |     static void printHeader(String filename, PrintStream output) throws IOException { // prints filename, type, createTime and description
37 |         ByteBuffer bytes;
38 |         try (FileInputStream input = new FileInputStream(filename); FileChannel inputFile = input.getChannel()) {
39 |             bytes = inputFile.map(FileChannel.MapMode.READ_ONLY, 0, inputFile.size()); // the mapping is used after the channel is closed
40 |             bytes.order(ByteOrder.LITTLE_ENDIAN);
41 |         }
42 |         DictionaryHeader header = new DictionaryHeader(bytes, 0);
43 |
44 |         output.println("filename: " + filename);
45 |
46 |         if (header.isSystemDictionary()) {
47 |             output.println("type: system dictionary");
48 |         } else if (header.isUserDictionary()) {
49 |             output.println("type: user dictionary");
50 |         } else {
51 |             output.println("invalid file"); // neither kind: the remaining fields are not printed
52 |             return;
53 |         }
54 |
55 |         output.println("createTime: "
56 |                 + Instant.ofEpochSecond(header.getCreateTime()).atZone(ZoneId.systemDefault()).toString()); // create time is read as epoch seconds, shown in the system zone
57 |         output.println("description: " + header.getDescription());
58 |     }
59 |
60 |     /**
61 |      * Prints the contents of dictionary header.
62 |      *
63 |      * This tool requires filenames of dictionaries.
64 |      *
65 |      * @param args
66 |      *            the input filenames
67 |      * @throws IOException
68 |      *             if an I/O error occurs while reading a dictionary file
69 |      */
70 |     public static void main(String[] args) throws IOException {
71 |         for (String filename : args) {
72 |             printHeader(filename, System.out);
73 |         }
74 |     }
75 | }
76 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/DictionaryVersion.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2021 Works Applications Co., Ltd.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 |
17 | package com.worksap.nlp.sudachi.dictionary;
18 |
19 | /**
20 |  * Versions of dictionaries.
21 | */ 22 | public class DictionaryVersion { 23 | 24 | private DictionaryVersion() { 25 | } 26 | 27 | /** the first version of system dictionries */ 28 | public static final long SYSTEM_DICT_VERSION_1 = 0x7366d3f18bd111e7L; 29 | 30 | /** the second version of system dictionries */ 31 | public static final long SYSTEM_DICT_VERSION_2 = 0xce9f011a92394434L; 32 | 33 | /** the first version of user dictionries */ 34 | public static final long USER_DICT_VERSION_1 = 0xa50f31188bd211e7L; 35 | 36 | /** the second version of user dictionries */ 37 | public static final long USER_DICT_VERSION_2 = 0x9fdeb5a90168d868L; 38 | 39 | /** the third version of user dictionries */ 40 | public static final long USER_DICT_VERSION_3 = 0xca9811756ff64fb0L; 41 | 42 | public static boolean isSystemDictionary(long version) { 43 | return version == SYSTEM_DICT_VERSION_1 || version == SYSTEM_DICT_VERSION_2; 44 | } 45 | 46 | public static boolean isUserDictionary(long version) { 47 | return version == USER_DICT_VERSION_1 || version == USER_DICT_VERSION_2 || version == USER_DICT_VERSION_3; 48 | } 49 | 50 | static boolean hasGrammar(long version) { 51 | return isSystemDictionary(version) || version == USER_DICT_VERSION_2 || version == USER_DICT_VERSION_3; 52 | } 53 | 54 | static boolean hasSynonymGroupIds(long version) { 55 | return version == SYSTEM_DICT_VERSION_2 || version == USER_DICT_VERSION_3; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/dictionary/DoubleArrayLookup.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 |
17 | package com.worksap.nlp.sudachi.dictionary;
18 |
19 | import java.nio.IntBuffer;
20 |
21 | /**
22 |  * This class implements common prefix lookup in the double array with a
23 |  * different API. It uses fields to return current values of end offset and a
24 |  * value stored in trie to reduce GC pressure. It also modifies the hot loop to
25 |  * reduce the number of non-elidable field writes.
26 |  */
27 | public final class DoubleArrayLookup {
28 |     private IntBuffer array;
29 |     private byte[] key;
30 |     private int limit;
31 |     private int startOffset;
32 |     private int offset;
33 |     private int nodePos;
34 |     private int nodeValue;
35 |
36 |     public DoubleArrayLookup() { // no array yet: setArray() must be called before reset()/next()
37 |         this(null);
38 |     }
39 |
40 |     public DoubleArrayLookup(IntBuffer array) {
41 |         this.array = array;
42 |     }
43 |
44 |     public DoubleArrayLookup(IntBuffer array, byte[] key, int offset, int limit) {
45 |         this(array);
46 |         reset(key, offset, limit);
47 |     }
48 |
49 |     private static boolean hasLeaf(int unit) { // tests bit 8 of the unit
50 |         return ((unit >>> 8) & 1) == 1;
51 |     }
52 |
53 |     private static int value(int unit) { // (1 << 31) - 1 wraps to 0x7FFFFFFF: clears the top bit
54 |         return unit & ((1 << 31) - 1);
55 |     }
56 |
57 |     private static int label(int unit) { // keeps the top bit and the low 8 bits
58 |         return unit & ((1 << 31) | 0xFF);
59 |     }
60 |
61 |     private static int offset(int unit) { // bits 10 and up, shifted left by 8 when bit 9 is set
62 |         return ((unit >>> 10) << ((unit & (1 << 9)) >>> 6));
63 |     }
64 |
65 |     public void setArray(IntBuffer array) { // swaps the array and restarts the lookup with the stored key, start offset and limit
66 |         this.array = array;
67 |         reset(this.key, this.startOffset, this.limit);
68 |     }
69 |
70 |     public void reset(byte[] key, int offset, int limit) { // starts a new lookup over key[offset, limit) from the root unit
71 |         this.key = key;
72 |         this.offset = offset;
73 |         this.startOffset = offset;
74 |         this.limit = limit;
75 |         nodePos = 0;
76 |         int unit = array.get(nodePos);
77 |         nodePos ^= offset(unit);
78 |     }
79 |
80 |     public boolean next() { // advances to the next matching prefix; true when getValue()/getOffset() hold a new result
81 |         IntBuffer array = this.array; // fields are cached in locals and written back only on exit
82 |         byte[] key = this.key;
83 |         int nodePos = this.nodePos;
84 |         int limit = this.limit;
85 |
86 |         for (int offset = this.offset; offset < limit; ++offset) {
87 |             int k = Byte.toUnsignedInt(key[offset]);
88 |             nodePos ^= k;
89 |             int unit = array.get(nodePos);
90 |             if (label(unit) != k) {
91 |                 this.offset = limit; // no more loop
92 |                 this.nodePos = nodePos;
93 |                 return false;
94 |             }
95 |
96 |             nodePos ^= offset(unit);
97 |             if (hasLeaf(unit)) {
98 |                 nodeValue = value(array.get(nodePos));
99 |                 this.offset = offset + 1; // end offset of the matched prefix, also the resume point
100 |                 this.nodePos = nodePos;
101 |                 return true;
102 |             }
103 |         }
104 |         return false; // input exhausted; offset and nodePos fields are left as they were
105 |     }
106 |
107 |     public int getValue() { // value stored by the last successful next()
108 |         return nodeValue;
109 |     }
110 |
111 |     public int getOffset() {
112 |         return offset;
113 |     }
114 | }
115 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/Lexicon.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2021 Works Applications Co., Ltd.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 |
17 | package com.worksap.nlp.sudachi.dictionary;
18 |
19 | import com.worksap.nlp.sudachi.WordId;
20 |
21 | import java.util.Iterator;
22 |
23 | /**
24 |  * The lexicon of morphemes.
25 |  */
26 | public interface Lexicon {
27 |
28 |     Iterator lookup(byte[] text, int offset); // NOTE(review): the iterator's element type is not visible in this listing - confirm against the implementation
29 |
30 |     int getWordId(String headword, short posId, String readingForm);
31 |
32 |     /**
33 |      * Returns the left-ID of the morpheme specified by the word ID.
34 |      *
35 |      * <p>
36 |      * When the word ID is out of range, the behavior is undefined.
37 |      *
38 |      * @param wordId
39 |      *            the word ID of the morpheme
40 |      * @return the left-ID of the morpheme
41 |      */
42 |     short getLeftId(int wordId);
43 |
44 |     /**
45 |      * Returns the right-ID of the morpheme specified by the word ID.
46 |      *
47 |      * <p>
48 |      * When the word ID is out of range, the behavior is undefined.
49 |      *
50 |      * @param wordId
51 |      *            the word ID of the morpheme
52 |      * @return the right-ID of the morpheme.
53 |      */
54 |     short getRightId(int wordId);
55 |
56 |     /**
57 |      * Returns the word occurrence cost of the morpheme specified by the word ID.
58 |      *
59 |      * <p>
60 |      * When the word ID is out of range, the behavior is undefined.
61 |      *
62 |      * @param wordId
63 |      *            the word ID of the morpheme
64 |      * @return the word occurrence cost
65 |      */
66 |     short getCost(int wordId);
67 |
68 |     /**
69 |      * Returns the information of the morpheme specified by the word ID.
70 |      *
71 |      * <p>
72 |      * When the word ID is out of range, the behavior is undefined.
73 |      *
74 |      * @param wordId
75 |      *            the word ID of the morpheme
76 |      * @return the information of the morpheme
77 |      * @see WordInfo
78 |      */
79 |     WordInfo getWordInfo(int wordId);
80 |
81 |     /**
82 |      * Returns the ID of the dictionary containing the morpheme specified by the
83 |      * word ID.
84 |      *
85 |      * If the morpheme is in the system dictionary, it returns {@code 0}.
86 |      *
87 |      * @param wordId
88 |      *            the word ID of the morpheme
89 |      * @return the dictionary ID
90 |      * @deprecated use {@link WordId#dic(int)}
91 |      */
92 |     @Deprecated
93 |     default int getDictionaryId(int wordId) {
94 |         return WordId.dic(wordId);
95 |     }
96 |
97 |     /**
98 |      * Returns the number of morphemes in the dictionary.
99 |      *
100 |      * @return the number of morphemes
101 |      */
102 |     int size();
103 | }
104 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/POS.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2021 Works Applications Co., Ltd.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 |
17 | package com.worksap.nlp.sudachi.dictionary;
18 |
19 | import java.util.AbstractList;
20 | import java.util.Arrays;
21 | import java.util.List;
22 |
23 | /**
24 |  * Part-of-Speech
25 |  * <p>

26 | * Sudachi POS are 6-component and consist of: 4 layers of POS tags, conjugation 27 | * type, conjugation form. 28 | */ 29 | public final class POS extends AbstractList { 30 | public final static int DEPTH = 6; 31 | public final static int MAX_COMPONENT_LENGTH = 127; 32 | private final String[] elems; 33 | 34 | /** 35 | * @param elems 36 | * non-null string array of exactly six elements 37 | */ 38 | public POS(String... elems) { 39 | if (elems == null) { 40 | throw new IllegalArgumentException("pos must not be null"); 41 | } 42 | if (elems.length != DEPTH) { 43 | throw new IllegalArgumentException(String.format("pos must have exactly 6 elements, was %s: %s", 44 | elems.length, String.join(",", elems))); 45 | } 46 | for (String e : elems) { 47 | if (e == null) { 48 | throw new IllegalArgumentException("POS components can't be null"); 49 | } 50 | 51 | if (e.length() > MAX_COMPONENT_LENGTH) { 52 | throw new IllegalArgumentException( 53 | String.format("POS component had length (%d) > %d: %s", e.length(), MAX_COMPONENT_LENGTH, e)); 54 | } 55 | } 56 | this.elems = elems; 57 | } 58 | 59 | /** 60 | * Creates new POS instance from elements. Elements must be 6-length string 61 | * list. 
62 | * 63 | * @param elems 64 | * POS object elements 65 | */ 66 | public POS(List elems) { 67 | this(elems.toArray(new String[0])); 68 | } 69 | 70 | @Override 71 | public String get(int i) { 72 | return elems[i]; 73 | } 74 | 75 | @Override 76 | public int size() { 77 | return DEPTH; 78 | } 79 | 80 | @Override 81 | public boolean equals(Object o) { 82 | if (this == o) 83 | return true; 84 | if (o instanceof POS) { 85 | POS strings = (POS) o; 86 | return Arrays.equals(elems, strings.elems); 87 | } 88 | return super.equals(o); 89 | } 90 | 91 | @Override 92 | public int hashCode() { 93 | int result = 0xfeed; 94 | result = 31 * result + Arrays.hashCode(elems); 95 | return result; 96 | } 97 | 98 | @Override 99 | public String toString() { 100 | return String.join(",", elems); 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/dictionary/WordIdTable.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 |  */
16 |
17 | package com.worksap.nlp.sudachi.dictionary;
18 |
19 | import com.worksap.nlp.sudachi.WordId;
20 |
21 | import java.nio.ByteBuffer;
22 |
23 | class WordIdTable { // reads lists of word IDs (1-byte count followed by 4-byte IDs) from a byte buffer
24 |     private final ByteBuffer bytes;
25 |     private final int size;
26 |     private final int offset;
27 |     private int dicIdMask = 0;
28 |
29 |     WordIdTable(ByteBuffer bytes, int offset) {
30 |         this.bytes = bytes;
31 |         size = bytes.getInt(offset); // 4-byte size field precedes the table data
32 |         this.offset = offset + 4;
33 |     }
34 |
35 |     int storageSize() { // size field plus the table data
36 |         return 4 + size;
37 |     }
38 |
39 |     Integer[] get(int index) { // NOTE(review): unlike readWordIds, the IDs are returned without dicIdMask applied - confirm this is intended
40 |         int length = Byte.toUnsignedInt(bytes.get(offset + index++));
41 |         Integer[] result = new Integer[length];
42 |         for (int i = 0; i < length; i++) {
43 |             result[i] = bytes.getInt(offset + index);
44 |             index += 4;
45 |         }
46 |         return result;
47 |     }
48 |
49 |     /**
50 |      * Reads the word IDs to the passed WordLookup object. The dictionary ID mask
51 |      * set by {@code setDictionaryId} is applied to every ID.
52 |      *
53 |      * @param index
54 |      *            index in the word array
55 |      * @param lookup
56 |      *            object to read word IDs into
57 |      * @return number of read IDs
58 |      */
59 |     int readWordIds(int index, WordLookup lookup) {
60 |         int offset = this.offset + index;
61 |         ByteBuffer bytes = this.bytes;
62 |         int length = Byte.toUnsignedInt(bytes.get(offset));
63 |         offset += 1;
64 |         int[] result = lookup.outputBuffer(length);
65 |         int dicIdMask = this.dicIdMask;
66 |         for (int i = 0; i < length; i++) {
67 |             int wordId = bytes.getInt(offset);
68 |             result[i] = WordId.applyMask(wordId, dicIdMask);
69 |             offset += 4;
70 |         }
71 |         return length;
72 |     }
73 |
74 |     void setDictionaryId(int id) {
75 |         dicIdMask = WordId.dicIdMask(id);
76 |     }
77 | }
78 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/WordInfoList.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2021 Works Applications Co., Ltd.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 |
17 | package com.worksap.nlp.sudachi.dictionary;
18 |
19 | import java.nio.Buffer;
20 | import java.nio.ByteBuffer;
21 |
22 | class WordInfoList { // decodes WordInfo records from a byte buffer through a table of per-word offsets
23 |
24 |     private final ByteBuffer bytes;
25 |     private final int offset;
26 |     private final int wordSize;
27 |     private final boolean hasSynonymGid;
28 |
29 |     WordInfoList(ByteBuffer bytes, int offset, int wordSize, boolean hasSysnoymGid) { // NOTE(review): last parameter name is misspelled ("Sysnoym")
30 |         this.bytes = bytes;
31 |         this.offset = offset;
32 |         this.wordSize = wordSize;
33 |         this.hasSynonymGid = hasSysnoymGid;
34 |     }
35 |
36 |     WordInfo getWordInfo(int wordId) { // fields are read sequentially; the order of the reads below is the record layout
37 |         ByteBuffer buf = bytes.asReadOnlyBuffer(); // private view with its own position
38 |         buf.order(bytes.order()); // a new view does not inherit the byte order
39 |         ((Buffer) buf).position(wordIdToOffset(wordId)); // a kludge for Java 9
40 |
41 |         String surface = bufferToString(buf);
42 |         short headwordLength = (short) bufferToStringLength(buf);
43 |         short posId = buf.getShort();
44 |         String normalizedForm = bufferToString(buf);
45 |         if (normalizedForm.isEmpty()) { // an empty stored form means "same as surface"
46 |             normalizedForm = surface;
47 |         }
48 |         int dictionaryFormWordId = buf.getInt();
49 |         String readingForm = bufferToString(buf);
50 |         if (readingForm.isEmpty()) {
51 |             readingForm = surface;
52 |         }
53 |         int[] aUnitSplit = bufferToIntArray(buf);
54 |         int[] bUnitSplit = bufferToIntArray(buf);
55 |         int[] wordStructure = bufferToIntArray(buf);
56 |
57 |         int[] synonymGids = new int[0];
58 |         if (hasSynonymGid) { // the synonym group ID array is only read when the flag is set
59 |             synonymGids = bufferToIntArray(buf);
60 |         }
61 |
62 |         String dictionaryForm = surface;
63 |         if (dictionaryFormWordId >= 0 && dictionaryFormWordId != wordId) { // NOTE(review): recursion ends only if the referenced entries do not form a cycle - confirm the builder guarantees that
64 |             WordInfo wi = getWordInfo(dictionaryFormWordId);
65 |             dictionaryForm = wi.getSurface();
66 |         }
67 |
68 |         return new WordInfo(surface, headwordLength, posId, normalizedForm, dictionaryFormWordId, dictionaryForm,
69 |                 readingForm, aUnitSplit, bUnitSplit, wordStructure, synonymGids);
70 |     }
71 |
72 |     int size() {
73 |         return wordSize;
74 |     }
75 |
76 |     private int wordIdToOffset(int wordId) { // the offset table holds one 4-byte entry per word ID
77 |         return bytes.getInt(offset + 4 * wordId);
78 |     }
79 |
80 |     private int bufferToStringLength(ByteBuffer buffer) { // 1 byte, or 2 bytes (15-bit value) when the top bit of the first byte is set
81 |         byte length = buffer.get();
82 |         if (length < 0) {
83 |             int high = Byte.toUnsignedInt(length);
84 |             int low = Byte.toUnsignedInt(buffer.get());
85 |             return ((high & 0x7F) << 8) | low;
86 |         }
87 |         return length;
88 |     }
89 |
90 |     private String bufferToString(ByteBuffer buffer) { // length prefix followed by that many 2-byte chars
91 |         int length = bufferToStringLength(buffer);
92 |         char[] str = new char[length];
93 |         for (int i = 0; i < length; i++) {
94 |             str[i] = buffer.getChar();
95 |         }
96 |         return new String(str);
97 |     }
98 |
99 |     private int[] bufferToIntArray(ByteBuffer buffer) { // unsigned 1-byte count followed by that many 4-byte ints
100 |         int length = Byte.toUnsignedInt(buffer.get());
101 |         int[] array = new int[length];
102 |         for (int i = 0; i < length; i++) {
103 |             array[i] = buffer.getInt();
104 |         }
105 |         return array;
106 |     }
107 | }
108 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/WordParameterList.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2021 Works Applications Co., Ltd.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 |
17 | package com.worksap.nlp.sudachi.dictionary;
18 |
19 | import java.nio.Buffer;
20 | import java.nio.ByteBuffer;
21 | import java.nio.ByteOrder;
22 |
23 | class WordParameterList { // per-word (left ID, right ID, cost) triples of shorts stored in a byte buffer
24 |
25 |     private static final int ELEMENT_SIZE = 2 * 3; // three 2-byte shorts per word
26 |
27 |     private ByteBuffer bytes;
28 |     private final int size;
29 |     private int offset;
30 |     private boolean isCopied;
31 |
32 |     WordParameterList(ByteBuffer bytes, int offset) {
33 |         this.bytes = bytes;
34 |         size = bytes.getInt(offset); // element count comes first
35 |         this.offset = offset + 4; // from here on, offset points at the first element
36 |         isCopied = false;
37 |     }
38 |
39 |     int storageSize() { // count field plus all elements
40 |         return 4 + ELEMENT_SIZE * size;
41 |     }
42 |
43 |     int size() {
44 |         return size;
45 |     }
46 |
47 |     short getLeftId(int wordId) {
48 |         return bytes.getShort(offset + ELEMENT_SIZE * wordId);
49 |     }
50 |
51 |     short getRightId(int wordId) {
52 |         return bytes.getShort(offset + ELEMENT_SIZE * wordId + 2);
53 |     }
54 |
55 |     short getCost(int wordId) {
56 |         return bytes.getShort(offset + ELEMENT_SIZE * wordId + 4);
57 |     }
58 |
59 |     void setCost(int wordId, short cost) { // the first write switches to a private copy of the elements
60 |         if (!isCopied) {
61 |             copyBuffer();
62 |         }
63 |         bytes.putShort(offset + ELEMENT_SIZE * wordId + 4, cost);
64 |     }
65 |
66 |     int endOffset() { // NOTE(review): offset already points past the 4-byte count (see constructor), so this is 4 bytes beyond the last element, and offset is 0 after copyBuffer() - confirm intent
67 |         return offset + 4 + ELEMENT_SIZE * size;
68 |     }
69 |
70 |     synchronized void copyBuffer() { // replaces bytes with a heap copy of the elements only; offset becomes 0
71 |         ByteBuffer newBuffer = ByteBuffer.allocate(ELEMENT_SIZE * size);
72 |         newBuffer.order(ByteOrder.LITTLE_ENDIAN);
73 |         ByteBuffer srcBuffer = bytes.duplicate();
74 |         Buffer buffer = srcBuffer; // a kludge for Java 9
75 |         buffer.position(offset);
76 |         buffer.limit(offset + ELEMENT_SIZE * size);
77 |         newBuffer.put(srcBuffer);
78 |         bytes = newBuffer;
79 |         offset = 0;
80 |         isCopied = true;
81 |     }
82 | }
83 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/build/BuildStats.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2021 Works Applications Co., Ltd.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 |  * You may obtain a copy of the License at
7 |  *
8 |  *     http://www.apache.org/licenses/LICENSE-2.0
9 |  *
10 |  * Unless required by applicable law or agreed to in writing, software
11 |  * distributed under the License is distributed on an "AS IS" BASIS,
12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 |  * See the License for the specific language governing permissions and
14 |  * limitations under the License.
15 |  */
16 |
17 | package com.worksap.nlp.sudachi.dictionary.build;
18 |
19 | import java.util.List;
20 |
21 | public class BuildStats { // holder for two lists, "inputs" and "parts"
22 |     private final List inputs; // NOTE(review): element types of both lists are not visible in this listing
23 |     private final List parts;
24 |
25 |     public BuildStats(List inputs, List parts) { // the lists are stored as passed, without copying
26 |         this.inputs = inputs;
27 |         this.parts = parts;
28 |     }
29 |
30 |     public List getInputs() {
31 |         return inputs;
32 |     }
33 |
34 |     public List getParts() {
35 |         return parts;
36 |     }
37 | }
38 |
--------------------------------------------------------------------------------
/src/main/java/com/worksap/nlp/sudachi/dictionary/build/Index.java:
--------------------------------------------------------------------------------
1 | /*
2 |  * Copyright (c) 2021 Works Applications Co., Ltd.
3 |  *
4 |  * Licensed under the Apache License, Version 2.0 (the "License");
5 |  * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi.dictionary.build; 18 | 19 | import com.worksap.nlp.dartsclone.DoubleArray; 20 | 21 | import java.io.IOException; 22 | import java.nio.ByteBuffer; 23 | import java.nio.ByteOrder; 24 | import java.nio.charset.StandardCharsets; 25 | import java.util.*; 26 | 27 | /** 28 | * Dictionary Parts: Trie index and entry offsets 29 | */ 30 | public class Index implements WriteDictionary { 31 | private final SortedMap> elements = new TreeMap<>((byte[] l, byte[] r) -> { 32 | int llen = l.length; 33 | int rlen = r.length; 34 | for (int i = 0; i < Math.min(llen, rlen); i++) { 35 | if (l[i] != r[i]) { 36 | return (l[i] & 0xff) - (r[i] & 0xff); 37 | } 38 | } 39 | return l.length - r.length; 40 | }); 41 | 42 | private int count = 0; 43 | 44 | public int add(String key, int wordId) { 45 | byte[] bytes = key.getBytes(StandardCharsets.UTF_8); 46 | List entries = elements.computeIfAbsent(bytes, k -> new ArrayList<>()); 47 | if (entries.size() >= 255) { 48 | throw new IllegalArgumentException(String.format("key %s has >= 255 entries in the dictionary", key)); 49 | } 50 | entries.add(wordId); 51 | count += 1; 52 | return bytes.length; 53 | } 54 | 55 | public void writeTo(ModelOutput output) throws IOException { 56 | DoubleArray trie = new DoubleArray(); 57 | 58 | int size = this.elements.size(); 59 | 60 | byte[][] keys = new byte[size][]; 61 | int[] values = new int[size]; 62 | ByteBuffer wordIdTable = ByteBuffer.allocate(count * (4 + 2)); 63 | wordIdTable.order(ByteOrder.LITTLE_ENDIAN); 64 | 65 | 
output.withSizedPart("WordId table", () -> { 66 | int i = 0; 67 | int numEntries = this.elements.entrySet().size(); 68 | for (Map.Entry> entry : this.elements.entrySet()) { 69 | keys[i] = entry.getKey(); 70 | values[i] = wordIdTable.position(); 71 | i++; 72 | List wordIds = entry.getValue(); 73 | wordIdTable.put((byte) wordIds.size()); 74 | for (int wid : wordIds) { 75 | wordIdTable.putInt(wid); 76 | } 77 | output.progress(i, numEntries); 78 | } 79 | return wordIdTable.position() + 4; 80 | }); 81 | 82 | DicBuffer buffer = new DicBuffer(4); 83 | output.withPart("double array Trie", () -> { 84 | trie.build(keys, values, output::progress); 85 | buffer.putInt(trie.size()); 86 | buffer.consume(output::write); 87 | output.write(trie.byteArray()); 88 | }); 89 | 90 | buffer.putInt(wordIdTable.position()); 91 | buffer.consume(output::write); 92 | 93 | wordIdTable.flip(); 94 | output.write(wordIdTable); 95 | } 96 | } 97 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/dictionary/build/InputFileException.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
/**
 * {@link IllegalArgumentException} whose message starts with a line number,
 * in the form {@code line:<number> <text>}.
 */
public class InputFileException extends IllegalArgumentException {
    /**
     * @param line
     *            line number placed at the start of the message
     * @param s
     *            text placed after the line number
     * @param cause
     *            underlying exception, passed on as the cause
     */
    public InputFileException(int line, String s, Exception cause) {
        super(describe(line, s), cause);
    }

    /** Builds the exception message from the line number and the text. */
    private static String describe(int line, String s) {
        return String.format("line:%d %s", line, s);
    }
}
15 | */ 16 | 17 | package com.worksap.nlp.sudachi.dictionary.build; 18 | 19 | import com.worksap.nlp.sudachi.dictionary.Grammar; 20 | import com.worksap.nlp.sudachi.dictionary.POS; 21 | 22 | import java.io.IOException; 23 | import java.util.ArrayList; 24 | import java.util.HashMap; 25 | import java.util.List; 26 | 27 | public class POSTable implements WriteDictionary { 28 | private final List table = new ArrayList<>(); 29 | private final HashMap lookup = new HashMap<>(); 30 | private int builtin = 0; 31 | 32 | short getId(POS s) { 33 | return lookup.computeIfAbsent(s, p -> { 34 | int next = table.size(); 35 | if (next >= Short.MAX_VALUE) { 36 | throw new IllegalArgumentException("maximum POS number exceeded by " + s); 37 | } 38 | table.add(s); 39 | return (short) next; 40 | }); 41 | } 42 | 43 | public void preloadFrom(Grammar grammar) { 44 | int partOfSpeechSize = grammar.getPartOfSpeechSize(); 45 | for (short i = 0; i < partOfSpeechSize; ++i) { 46 | POS pos = grammar.getPartOfSpeechString(i); 47 | table.add(pos); 48 | lookup.put(pos, i); 49 | } 50 | builtin += partOfSpeechSize; 51 | } 52 | 53 | List getList() { 54 | return table; 55 | } 56 | 57 | @Override 58 | public void writeTo(ModelOutput output) throws IOException { 59 | output.withPart("POS table", () -> { 60 | DicBuffer buffer = new DicBuffer(128 * 1024); 61 | buffer.putShort((short) ownedLength()); 62 | for (int i = builtin; i < table.size(); ++i) { 63 | for (String s : table.get(i)) { 64 | if (!buffer.put(s)) { 65 | // handle buffer overflow, this should be extremely rare 66 | buffer.consume(output::write); 67 | buffer.put(s); 68 | } 69 | } 70 | } 71 | buffer.consume(output::write); 72 | }); 73 | } 74 | 75 | public int ownedLength() { 76 | return table.size() - builtin; 77 | } 78 | 79 | } 80 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/dictionary/build/Parameters.java: 
-------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi.dictionary.build; 18 | 19 | import java.io.IOException; 20 | import java.nio.ByteBuffer; 21 | import java.nio.ByteOrder; 22 | import java.nio.ShortBuffer; 23 | 24 | /** 25 | * Compiles model parameters into the binary format 26 | */ 27 | public class Parameters implements WriteDictionary { 28 | private ByteBuffer data; 29 | private ShortBuffer params; 30 | private int maxLeft = Integer.MAX_VALUE; 31 | private int maxRight = Integer.MAX_VALUE; 32 | 33 | public Parameters(int initialSize) { 34 | data = ByteBuffer.allocate(initialSize); 35 | data.order(ByteOrder.LITTLE_ENDIAN); 36 | params = data.asShortBuffer(); 37 | } 38 | 39 | public Parameters() { 40 | this(1024 * 1024); // default 1M 41 | } 42 | 43 | public void add(short left, short right, short cost) { 44 | maybeResize(); 45 | if (left >= maxLeft) { 46 | throw new IllegalArgumentException(String.format("left %d is larger than max value %d", left, maxLeft)); 47 | } 48 | if (right >= maxRight) { 49 | throw new IllegalArgumentException(String.format("right %d is larger than max value %d", right, maxRight)); 50 | } 51 | params.put(left); 52 | params.put(right); 53 | params.put(cost); 54 | } 55 | 56 | public void setLimits(int left, int right) 
{ 57 | this.maxLeft = left; 58 | this.maxRight = right; 59 | } 60 | 61 | private void maybeResize() { 62 | if (params.remaining() < 3) { 63 | ByteBuffer newData = ByteBuffer.allocate(data.capacity() * 2); 64 | newData.order(ByteOrder.LITTLE_ENDIAN); 65 | int position = params.position(); 66 | data.position(0); 67 | data.limit(position * 2); 68 | newData.put(data); 69 | newData.clear(); 70 | data = newData; 71 | params = newData.asShortBuffer(); 72 | params.position(position); 73 | assert params.remaining() > 3; 74 | } 75 | } 76 | 77 | @Override 78 | public void writeTo(ModelOutput output) throws IOException { 79 | output.withPart("word parameters", () -> { 80 | data.limit(params.position() * 2); 81 | output.write(data); 82 | }); 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /src/main/java/com/worksap/nlp/sudachi/dictionary/build/Progress.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
/**
 * Rate limiter for progress notifications.
 *
 * <p>
 * Work is reported in blocks: {@link #startBlock}, any number of
 * {@link #progress(long, long)} calls, then {@link #endBlock}. Intermediate
 * progress is forwarded to the callback only when it has passed the next step
 * (1 / {@code maxUpdates}) and more than 100 ms have elapsed since the last
 * forwarded update. Completion (ratio &gt;= 1) is forwarded once per block.
 */
public class Progress {
    private static final long MS_100 = 100_000_000L; // 100ms in nanos
    private final int maxUpdates;
    private final Callback callback;
    /** Ratio that must be exceeded before the next update is considered. */
    private float currentProgress;
    /** {@link System#nanoTime()} value of the last forwarded update. */
    private long lastUpdate;

    public Progress(int maxUpdates, Callback callback) {
        this.maxUpdates = maxUpdates;
        this.callback = callback;
    }

    /**
     * Starts a new block: notifies the callback and resets the throttling state.
     *
     * @param name
     *            block name passed to the callback
     * @param start
     *            start time, on the {@link System#nanoTime()} time line
     * @param kind
     *            block kind passed to the callback
     */
    public void startBlock(String name, long start, Kind kind) {
        lastUpdate = start;
        callback.start(name, kind);
        currentProgress = step();
    }

    /** Width of one update step; slightly below 1 / maxUpdates. */
    private float step() {
        return 1.0f / maxUpdates - 1e-6f;
    }

    /**
     * This function limits calls to the progress function
     *
     * @param cur
     *            current state
     * @param max
     *            maximum state
     */
    public void progress(long cur, long max) {
        double ratio = cur / (double) max;
        // written as a negated ">" so that a NaN ratio (0 / 0) is ignored as well
        if (!(ratio > currentProgress)) {
            return;
        }

        if (ratio >= 1.0) {
            callback.progress(1.0f);
            // nothing can exceed this threshold: later calls in the block are ignored
            currentProgress = Float.MAX_VALUE;
            // stop here, otherwise completion could be forwarded a second time below
            return;
        }

        long curTime = System.nanoTime();
        if (curTime - lastUpdate > MS_100) {
            callback.progress((float) ratio);
            float step = step();
            // The next threshold is the first step boundary above the current ratio.
            // It is set, not accumulated, so it can never overshoot past 1.0.
            currentProgress = (float) ((Math.floor(ratio / step) + 1) * step);
            lastUpdate = curTime;
        }
    }

    /**
     * Ends the current block.
     *
     * @param size
     *            size passed to the callback
     * @param time
     *            elapsed time in nanoseconds, passed to the callback as a Duration
     */
    public void endBlock(long size, long time) {
        callback.end(size, Duration.ofNanos(time));
    }

    public enum Kind {
        INPUT, OUTPUT
    }

    /**
     * Progress callback
     */
    public interface Callback {
        /**
         * This function will be called for each step at the beginning
         *
         * @param name
         *            step name
         * @param kind
         *            step kind
         */
        default void start(String name, Kind kind) {
        }

        /**
         * This function will be called as progress is happening
         *
         * @param progress
         *            ratio of the progress
         */
        void progress(float progress);

        /**
         * This function will be called for each step at the end
         *
         * @param size
         *            size reported for the step
         * @param time
         *            duration reported for the step
         */
        default void end(long size, Duration time) {
        }
    }
}
/**
 * Input stream wrapper that counts how many bytes have been consumed from the
 * wrapped stream, through any of the read methods or through skip.
 */
public class TrackingInputStream extends InputStream {
    private final InputStream inner;
    /** Number of bytes read or skipped so far. */
    private long position;

    public TrackingInputStream(InputStream inner) {
        this.inner = inner;
    }

    @Override
    public int read() throws IOException {
        int value = inner.read();
        if (value != -1) {
            // a single byte was consumed
            position += 1;
        }
        return value;
    }

    @Override
    public int read(byte[] b) throws IOException {
        int read = inner.read(b);
        if (read != -1) {
            position += read;
        }
        return read;
    }

    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        int read = inner.read(b, off, len);
        if (read != -1) {
            position += read;
        }
        return read;
    }

    @Override
    public long skip(long n) throws IOException {
        // Delegate to the wrapped stream and count what was actually skipped.
        // The inherited skip() would go through read(byte[], int, int) above,
        // which already counts, and the request may exceed what is available.
        long skipped = inner.skip(n);
        if (skipped > 0) {
            position += skipped;
        }
        return skipped;
    }

    // NOTE(review): close() is not overridden, so closing this wrapper does not
    // close the wrapped stream; confirm that callers close it themselves.

    /** @return number of bytes consumed from the wrapped stream so far */
    public long getPosition() {
        return position;
    }
}
/**
 * Lookup and validation of word ids.
 *
 * <p>
 * NOTE(review): the contract of the methods below is not visible in this file;
 * the descriptions are inferred from the signatures and should be confirmed
 * against the implementations.
 */
public interface WordIdResolver {
    /**
     * Presumably finds the id of the word described by the arguments.
     *
     * @param headword
     *            headword of the word
     * @param posId
     *            part-of-speech id of the word
     * @param reading
     *            reading of the word
     * @return an int result; presumably the word id (the value used when nothing
     *         matches is not visible here)
     */
    int lookup(String headword, short posId, String reading);

    /**
     * Checks the given word id; how a failed check is reported is not visible
     * here.
     *
     * @param wordId
     *            word id to check
     */
    void validate(int wordId);

    /**
     * @return a flag; presumably whether this resolver belongs to a user
     *         dictionary
     */
    boolean isUser();
}
15 | */ 16 | 17 | package com.worksap.nlp.sudachi.dictionary.build; 18 | 19 | import java.io.IOException; 20 | 21 | public interface WriteDictionary { 22 | void writeTo(ModelOutput output) throws IOException; 23 | } 24 | -------------------------------------------------------------------------------- /src/main/resources/sudachi.json: -------------------------------------------------------------------------------- 1 | { 2 | "systemDict" : "system_core.dic", 3 | "characterDefinitionFile": "char.def", 4 | "inputTextPlugin" : [ 5 | { "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin" }, 6 | { "class" : "com.worksap.nlp.sudachi.ProlongedSoundMarkInputTextPlugin", 7 | "prolongedSoundMarks": ["ー", "-", "⁓", "〜", "〰"], 8 | "replacementSymbol": "ー"}, 9 | { "class": "com.worksap.nlp.sudachi.IgnoreYomiganaPlugin", 10 | "leftBrackets": ["(", "("], 11 | "rightBrackets": [")", ")"], 12 | "maxYomiganaLength": 4} 13 | ], 14 | "oovProviderPlugin" : [ 15 | { "class" : "com.worksap.nlp.sudachi.MeCabOovProviderPlugin" }, 16 | { "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin", 17 | "oovPOS" : [ "補助記号", "一般", "*", "*", "*", "*" ], 18 | "leftId" : 5968, 19 | "rightId" : 5968, 20 | "cost" : 3857 } 21 | ], 22 | "pathRewritePlugin" : [ 23 | { "class" : "com.worksap.nlp.sudachi.JoinNumericPlugin", 24 | "enableNormalize" : true }, 25 | { "class" : "com.worksap.nlp.sudachi.JoinKatakanaOovPlugin", 26 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], 27 | "minLength" : 3 28 | } 29 | ], 30 | "formatterPlugin" : [ 31 | { "class" : "com.worksap.nlp.sudachi.SimpleMorphemeFormatter" }, 32 | { "class" : "com.worksap.nlp.sudachi.WordSegmentationFormatter", 33 | "eos" : "\n" }, 34 | { "class" : "com.worksap.nlp.sudachi.WordSegmentationFormatter", 35 | "eos" : " " } 36 | ] 37 | } 38 | -------------------------------------------------------------------------------- /src/main/resources/sudachi.logging.properties: 
-------------------------------------------------------------------------------- 1 | java.util.logging.SimpleFormatter.format=%5$s%n 2 | 3 | com.worksap.nlp.sudachi.handlers=java.util.logging.ConsoleHandler 4 | com.worksap.nlp.sudachi.level=INFO 5 | java.util.logging.ConsoleHandler.level=ALL -------------------------------------------------------------------------------- /src/main/resources/unk.def: -------------------------------------------------------------------------------- 1 | DEFAULT,5968,5968,3857,補助記号,一般,*,*,*,* 2 | SPACE,5966,5966,6056,空白,*,*,*,*,* 3 | KANJI,5139,5139,14657,名詞,普通名詞,一般,*,*,* 4 | KANJI,5129,5129,17308,名詞,普通名詞,サ変可能,*,*,* 5 | KANJI,4785,4785,18181,名詞,固有名詞,一般,*,*,* 6 | KANJI,4787,4787,18086,名詞,固有名詞,人名,一般,*,* 7 | KANJI,4791,4791,19198,名詞,固有名詞,地名,一般,*,* 8 | SYMBOL,5129,5129,17094,名詞,普通名詞,サ変可能,*,*,* 9 | NUMERIC,4794,4794,12450,名詞,数詞,*,*,*,* 10 | ALPHA,5139,5139,11633,名詞,普通名詞,一般,*,*,* 11 | ALPHA,4785,4785,13620,名詞,固有名詞,一般,*,*,* 12 | ALPHA,4787,4787,14228,名詞,固有名詞,人名,一般,*,* 13 | ALPHA,4791,4791,15793,名詞,固有名詞,地名,一般,*,* 14 | ALPHA,5687,5687,15246,感動詞,一般,*,*,*,* 15 | HIRAGANA,5139,5139,16012,名詞,普通名詞,一般,*,*,* 16 | HIRAGANA,5129,5129,20012,名詞,普通名詞,サ変可能,*,*,* 17 | HIRAGANA,4785,4785,18282,名詞,固有名詞,一般,*,*,* 18 | HIRAGANA,4787,4787,18269,名詞,固有名詞,人名,一般,*,* 19 | HIRAGANA,4791,4791,20474,名詞,固有名詞,地名,一般,*,* 20 | HIRAGANA,5687,5687,17786,感動詞,一般,*,*,*,* 21 | KATAKANA,5139,5139,10980,名詞,普通名詞,一般,*,*,* 22 | KATAKANA,5129,5129,14802,名詞,普通名詞,サ変可能,*,*,* 23 | KATAKANA,4785,4785,13451,名詞,固有名詞,一般,*,*,* 24 | KATAKANA,4787,4787,13759,名詞,固有名詞,人名,一般,*,* 25 | KATAKANA,4791,4791,14554,名詞,固有名詞,地名,一般,*,* 26 | KATAKANA,5687,5687,15272,感動詞,一般,*,*,*,* 27 | KANJINUMERIC,4794,4794,14170,名詞,数詞,*,*,*,* 28 | GREEK,5139,5139,11051,名詞,普通名詞,一般,*,*,* 29 | GREEK,4785,4785,13353,名詞,固有名詞,一般,*,*,* 30 | GREEK,4787,4787,13671,名詞,固有名詞,人名,一般,*,* 31 | GREEK,4791,4791,14862,名詞,固有名詞,地名,一般,*,* 32 | CYRILLIC,5139,5139,11140,名詞,普通名詞,一般,*,*,* 33 | CYRILLIC,4785,4785,13174,名詞,固有名詞,一般,*,*,* 34 | 
CYRILLIC,4787,4787,13495,名詞,固有名詞,人名,一般,*,* 35 | CYRILLIC,4791,4791,14700,名詞,固有名詞,地名,一般,*,* 36 | -------------------------------------------------------------------------------- /src/test/dict/lex.csv: -------------------------------------------------------------------------------- 1 | た,1,1,8729,た,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,た,*,A,*,*,*,* 2 | に,2,2,11406,に,助詞,接続助詞,*,*,*,*,ニ,に,*,A,*,*,*,* 3 | に,3,3,4481,に,助詞,格助詞,*,*,*,*,ニ,に,*,A,*,*,*,* 4 | 京都,6,6,5293,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*,1/5 5 | 東,7,7,4675,東,名詞,普通名詞,一般,*,*,*,ヒガシ,東,*,A,*,*,*,* 6 | 東京,6,6,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* 7 | 東京都,6,8,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,5/9,*,5/9,* 8 | 行く,4,4,5105,行く,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,*,A,*,*,*,* 9 | 行っ,5,5,5122,行っ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,7,A,*,*,*,* 10 | 都,8,8,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,* 11 | アイ,7,7,4675,アイ,名詞,普通名詞,一般,*,*,*,アイ,アイ,*,A,*,*,*,* 12 | アイウ,7,7,4675,アイウ,名詞,普通名詞,一般,*,*,*,アイウ,アイウ,*,A,*,*,*,* 13 | アイアイウ,6,6,32766,アイウ,名詞,固有名詞,地名,一般,*,*,アイアイウ,アイアイウ,*,A,*,*,*,* 14 | 0,9,9,2478,0,名詞,数詞,*,*,*,*,ゼロ,0,*,A,*,*,*,* 15 | 1,9,9,2478,1,名詞,数詞,*,*,*,*,イチ,1,*,A,*,*,*,* 16 | 2,9,9,2478,2,名詞,数詞,*,*,*,*,ニ,2,*,A,*,*,*,* 17 | 3,9,9,2478,3,名詞,数詞,*,*,*,*,サン,3,*,A,*,*,*,* 18 | 4,9,9,2478,4,名詞,数詞,*,*,*,*,ヨン,4,*,A,*,*,*,* 19 | 5,9,9,2478,5,名詞,数詞,*,*,*,*,ゴ,5,*,A,*,*,*,* 20 | 6,9,9,2478,6,名詞,数詞,*,*,*,*,ロク,6,*,A,*,*,*,* 21 | 7,9,9,2478,7,名詞,数詞,*,*,*,*,ナナ,7,*,A,*,*,*,* 22 | 8,9,9,2478,8,名詞,数詞,*,*,*,*,ハチ,8,*,A,*,*,*,* 23 | 9,9,9,2478,9,名詞,数詞,*,*,*,*,キュウ,9,*,A,*,*,*,* 24 | 〇,9,9,2478,〇,名詞,数詞,*,*,*,*,ゼロ,〇,*,A,*,*,*,* 25 | 一,9,9,2478,一,名詞,数詞,*,*,*,*,イチ,一,*,A,*,*,*,* 26 | 二,9,9,2478,二,名詞,数詞,*,*,*,*,ニ,二,*,A,*,*,*,* 27 | 三,9,9,2478,三,名詞,数詞,*,*,*,*,サン,三,*,A,*,*,*,* 28 | 四,9,9,2478,四,名詞,数詞,*,*,*,*,ヨン,四,*,A,*,*,*,* 29 | 五,9,9,2478,五,名詞,数詞,*,*,*,*,ゴ,五,*,A,*,*,*,* 30 | 六,9,9,2478,六,名詞,数詞,*,*,*,*,ロク,六,*,A,*,*,*,* 31 | 七,9,9,2478,七,名詞,数詞,*,*,*,*,ナナ,七,*,A,*,*,*,* 32 | 八,9,9,2478,八,名詞,数詞,*,*,*,*,ハチ,八,*,A,*,*,*,* 33 | 
九,9,9,2478,九,名詞,数詞,*,*,*,*,キュウ,九,*,A,*,*,*,* 34 | 六三四,6,6,0,六三四,名詞,固有名詞,地名,一般,*,*,ムサシ,六三四,*,A,*,*,*,* 35 | いく,4,4,5105,いく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,*,A,*,*,*,* 36 | いっ,5,5,5122,いっ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,34,A,*,*,*,* 37 | 012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,2478,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,*,A,*,*,*,* 38 | 特a,8,8,2914,特A,名詞,普通名詞,一般,*,*,*,トクエー,特A,*,A,*,*,*,* 39 | な。な,8,8,2914,な。な,名詞,普通名詞,一般,*,*,*,ナナ,な。な,*,A,*,*,*,* -------------------------------------------------------------------------------- /src/test/dict/matrix.def: 
-------------------------------------------------------------------------------- 1 | 10 10 2 | 0 0 0 3 | 0 1 863 4 | 0 2 2124 5 | 0 3 1032 6 | 0 4 591 7 | 0 5 -162 8 | 0 6 -79 9 | 0 7 887 10 | 0 8 447 11 | 0 9 -535 12 | 1 0 -3689 13 | 1 1 -3361 14 | 1 2 -7643 15 | 1 3 -3267 16 | 1 4 809 17 | 1 5 -1098 18 | 1 6 4606 19 | 1 7 4269 20 | 1 8 4567 21 | 1 9 1635 22 | 2 0 -1959 23 | 2 1 2457 24 | 2 2 811 25 | 2 3 840 26 | 2 4 903 27 | 2 5 -958 28 | 2 6 517 29 | 2 7 2037 30 | 2 8 1392 31 | 2 9 -193 32 | 3 0 -2288 33 | 3 1 1741 34 | 3 2 487 35 | 3 3 792 36 | 3 4 -1474 37 | 3 5 -3429 38 | 3 6 126 39 | 3 7 437 40 | 3 8 605 41 | 3 9 -547 42 | 4 0 -2809 43 | 4 1 -3584 44 | 4 2 -6743 45 | 4 3 -2869 46 | 4 4 -2805 47 | 4 5 -407 48 | 4 6 3422 49 | 4 7 5642 50 | 4 8 6382 51 | 4 9 2165 52 | 5 0 -509 53 | 5 1 -3665 54 | 5 2 -3882 55 | 5 3 -572 56 | 5 4 -1036 57 | 5 5 -54 58 | 5 6 2570 59 | 5 7 3319 60 | 5 8 4059 61 | 5 9 882 62 | 6 0 101 63 | 6 1 2933 64 | 6 2 2198 65 | 6 3 -2004 66 | 6 4 4392 67 | 6 5 4017 68 | 6 6 569 69 | 6 7 475 70 | 6 8 -390 71 | 6 9 852 72 | 7 0 -852 73 | 7 1 2079 74 | 7 2 1180 75 | 7 3 -3084 76 | 7 4 2010 77 | 7 5 1570 78 | 7 6 746 79 | 7 7 2341 80 | 7 8 2051 81 | 7 9 1393 82 | 8 0 -522 83 | 8 1 3354 84 | 8 2 2037 85 | 8 3 -2542 86 | 8 4 3071 87 | 8 5 2631 88 | 8 6 -352 89 | 8 7 2847 90 | 8 8 1134 91 | 8 9 1256 92 | 9 0 -975 93 | 9 1 2498 94 | 9 2 1690 95 | 9 3 -1523 96 | 9 4 3023 97 | 9 5 3139 98 | 9 6 2562 99 | 9 7 3962 100 | 9 8 418 101 | 9 9 -2490 102 | -------------------------------------------------------------------------------- /src/test/dict/user.csv: -------------------------------------------------------------------------------- 1 | ぴらる,8,8,-32768,ぴらる,名詞,普通名詞,一般,*,*,*,ピラル,ぴらる,*,A,*,*,*,* 2 | 府,8,8,2914,府,名詞,普通名詞,一般,*,*,*,フ,府,*,A,*,*,*,* 3 | 東京府,6,6,2816,東京府,名詞,固有名詞,地名,一般,*,*,トウキョウフ,東京府,*,B,5/U1,*,5/U1,1/3 4 | すだち,6,6,2816,すだち,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,スダチ,スダチ,すだち,*,A,*,*,*,* 5 | 
-------------------------------------------------------------------------------- /src/test/dict/user2.csv: -------------------------------------------------------------------------------- 1 | ぴさる,8,8,-32768,ぴさる,名詞,普通名詞,一般,*,*,*,ピサル,ぴさる,*,A,*,*,*,* 2 | かぼす,6,6,2816,かぼす,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,カボス,カボス,かぼす,*,A,*,*,*,* 3 | -------------------------------------------------------------------------------- /src/test/java/com/worksap/nlp/sudachi/DictionaryFactoryTest.kt: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
/** Checks the error messages of the deprecated `DictionaryFactory.create` overload. */
class DictionaryFactoryTest {
  @Test
  @Deprecated(
      "testing deprecated methods",
      ReplaceWith("DictionaryFactory().create()", "com.worksap.nlp.sudachi.DictionaryFactory"))
  fun everythingNull() {
    // no path and no settings
    val failure = assertFails { DictionaryFactory().create(null, null, false) }
    val message = failure.message!!
    assert(message.contains("Failed to resolve file: system.dic"))
  }

  @Test
  @Deprecated(
      "testing deprecated methods",
      ReplaceWith("DictionaryFactory().create()", "com.worksap.nlp.sudachi.DictionaryFactory"))
  fun notNullPath() {
    // the given path must show up in the error message
    val failure = assertFails { DictionaryFactory().create("does-not-exist", null, false) }
    val message = failure.message!!
    assert(message.contains("base=does-not-exist"))
  }

  @Test
  @Deprecated(
      "testing deprecated methods",
      ReplaceWith("DictionaryFactory().create()", "com.worksap.nlp.sudachi.DictionaryFactory"))
  fun notNullPathSettings() {
    // the dictionary name from the settings must show up in the error message
    val failure = assertFails {
      DictionaryFactory().create("", """{"systemDict": "test.dic"}""", true)
    }
    val message = failure.message!!
    assert(message.contains("Failed to resolve file: test.dic"))
  }
}
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | import static org.hamcrest.CoreMatchers.is; 20 | import static org.hamcrest.MatcherAssert.assertThat; 21 | 22 | import java.util.Arrays; 23 | import java.util.Collections; 24 | 25 | import org.junit.Test; 26 | 27 | import com.worksap.nlp.sudachi.dictionary.Grammar; 28 | 29 | public class InhibitConnectionPluginTest { 30 | 31 | @Test 32 | public void edit() { 33 | short left = 0; 34 | short right = 0; 35 | MockGrammar grammar = new MockGrammar(); 36 | InhibitConnectionPlugin plugin = new InhibitConnectionPlugin(); 37 | plugin.inhibitedPairs = Collections.singletonList(Arrays.asList((int) left, (int) right)); 38 | plugin.edit(grammar); 39 | assertThat(grammar.getConnectCost(left, right), is(Grammar.INHIBITED_CONNECTION)); 40 | } 41 | 42 | } -------------------------------------------------------------------------------- /src/test/java/com/worksap/nlp/sudachi/JapaneseDictionaryTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | import static org.hamcrest.CoreMatchers.is; 20 | import static org.hamcrest.CoreMatchers.isA; 21 | import static org.hamcrest.CoreMatchers.notNullValue; 22 | import static org.hamcrest.MatcherAssert.assertThat; 23 | 24 | import java.io.IOException; 25 | import java.util.List; 26 | 27 | import org.junit.After; 28 | import org.junit.Before; 29 | import org.junit.Test; 30 | 31 | public class JapaneseDictionaryTest { 32 | Dictionary dict; 33 | 34 | @Before 35 | public void setUp() throws IOException { 36 | dict = TestDictionary.INSTANCE.user0(); 37 | } 38 | 39 | @After 40 | public void tearDown() throws IOException { 41 | dict.close(); 42 | } 43 | 44 | @Test 45 | public void create() { 46 | assertThat(dict.create(), isA(Tokenizer.class)); 47 | } 48 | 49 | @Test 50 | public void getPartOfSpeechSize() { 51 | assertThat(dict.getPartOfSpeechSize(), is(8)); 52 | } 53 | 54 | @Test 55 | public void getPartOfSpeechString() { 56 | List pos = dict.getPartOfSpeechString((short) 0); 57 | assertThat(pos, notNullValue()); 58 | assertThat(pos.get(0), is("助動詞")); 59 | } 60 | 61 | @Test 62 | public void instantiateConfigWithoutCharDef() throws IOException { 63 | Config cfg = Config.fromClasspath("sudachi_minimum.json"); 64 | cfg.systemDictionary(TestDictionary.INSTANCE.getSystemDict()); 65 | try (JapaneseDictionary jd = (JapaneseDictionary) new DictionaryFactory().create(cfg)) { 66 | assertThat(jd, notNullValue()); 67 | assertThat(jd.create(), notNullValue()); 68 | } 69 | } 70 | 71 | 
private JapaneseDictionary makeDictionaryIncorrectly() throws IOException { 72 | Config cfg = Config.fromClasspath("sudachi_minimum.json"); 73 | cfg.systemDictionary(TestDictionary.INSTANCE.getSystemDict()); 74 | try (JapaneseDictionary jd = (JapaneseDictionary) new DictionaryFactory().create(cfg)) { 75 | return jd; 76 | } 77 | } 78 | 79 | @Test(expected = IllegalStateException.class) 80 | public void throwExceptionOnDictionaryUsageAfterClose() throws IOException { 81 | JapaneseDictionary dic = makeDictionaryIncorrectly(); 82 | Tokenizer ignored = dic.create(); 83 | } 84 | 85 | private Tokenizer makeTokenizerIncorrectly() throws IOException { 86 | Config cfg = Config.fromClasspath("sudachi_minimum.json"); 87 | cfg.systemDictionary(TestDictionary.INSTANCE.getSystemDict()); 88 | try (JapaneseDictionary jd = (JapaneseDictionary) new DictionaryFactory().create(cfg)) { 89 | return jd.create(); 90 | } 91 | } 92 | 93 | @Test(expected = IllegalStateException.class) 94 | public void throwExceptionOnTokenizerUsageAfterClose() throws IOException { 95 | Tokenizer tok = makeTokenizerIncorrectly(); 96 | tok.tokenize("a"); 97 | } 98 | 99 | } 100 | -------------------------------------------------------------------------------- /src/test/java/com/worksap/nlp/sudachi/JapaneseTokenizerMaskTest.kt: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi 18 | 19 | import kotlin.test.Test 20 | import kotlin.test.assertEquals 21 | import kotlin.test.assertIs 22 |
/**
 * Checks which "other words" mask an OOV provider receives, depending on
 * whether it is registered before or after [SimpleOovProviderPlugin].
 */
class JapaneseTokenizerMaskTest {
  /** OOV provider that creates nothing and only records each (offset, otherWords) call. */
  private class CaptureOtherWords : OovProviderPlugin() {
    // Pair<Int, Long> follows from `offset to otherWords` below.
    val otherWords = ArrayList<Pair<Int, Long>>()
    // NOTE(review): the element type of `result` appears to have been stripped by
    // extraction and cannot be determined from this file; restore it from
    // OovProviderPlugin.provideOOV.
    override fun provideOOV(
        inputText: InputText?,
        offset: Int,
        otherWords: Long,
        result: MutableList?
    ): Int {
      this.otherWords.add(offset to otherWords)
      return 0
    }
  }

  @Test
  fun correctMasksWithFirstProvider() {
    val cfg0 = Config.empty()
    cfg0.addOovProviderPlugin(CaptureOtherWords::class.java)
    cfg0.addOovProviderPlugin(SimpleOovProviderPlugin::class.java)
    val cfg = cfg0.withFallback(TestDictionary.user0Cfg())
    val dic = DictionaryFactory().create(cfg) as JapaneseDictionary
    val tokenizer = dic.create()

    // Registration order above: the capturing provider first, the simple one second.
    assertEquals(2, dic.oovProviderPlugins.size)
    assertIs<CaptureOtherWords>(dic.oovProviderPlugins[0])
    assertIs<SimpleOovProviderPlugin>(dic.oovProviderPlugins[1])

    tokenizer.tokenize("かaiueoか")
    val provider = dic.oovProviderPlugins.first { it is CaptureOtherWords } as CaptureOtherWords
    val otherWords = provider.otherWords
    assertEquals(3, otherWords.size)
    // in this order word masks are empty
    assertEquals(0 to 0L, otherWords[0])
    assertEquals(3 to 0L, otherWords[1])
    assertEquals(8 to 0L, otherWords[2])
  }

  @Test
  fun correctMasksWithSecondProvider() {
    val cfg = TestDictionary.user0Cfg()
    cfg.addOovProviderPlugin(CaptureOtherWords::class.java)
    val dic = DictionaryFactory().create(cfg) as JapaneseDictionary
    val tokenizer = dic.create()

    // The capturing provider is appended after whatever the base config provides.
    // NOTE(review): index 0 is presumed to be SimpleOovProviderPlugin from the
    // default config — confirm against the default sudachi.json.
    assertIs<SimpleOovProviderPlugin>(dic.oovProviderPlugins[0])
    assertIs<CaptureOtherWords>(dic.oovProviderPlugins[1])

    tokenizer.tokenize("かaiueoか")
    val provider = dic.oovProviderPlugins.first { it is CaptureOtherWords } as CaptureOtherWords
    val otherWords = provider.otherWords
    assertEquals(3, otherWords.size)
    // in this order word masks are not empty
    assertEquals(0 to WordMask.nth(3), otherWords[0])
    assertEquals(3 to WordMask.nth(5), otherWords[1])
    assertEquals(8 to WordMask.nth(3), otherWords[2])
  }
}
-------------------------------------------------------------------------------- /src/test/java/com/worksap/nlp/sudachi/JoinKatakanaOovPluginTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | import static org.junit.Assert.assertEquals; 20 | import static org.junit.Assert.assertFalse; 21 | 22 | import java.io.IOException; 23 | import java.util.List; 24 | 25 | import org.junit.Before; 26 | import org.junit.Test; 27 | 28 | public class JoinKatakanaOovPluginTest { 29 | JapaneseTokenizer tokenizer; 30 | JoinKatakanaOovPlugin plugin; 31 | 32 | @Before 33 | public void setUp() throws IOException { 34 | Dictionary dict = TestDictionary.INSTANCE.user1(); 35 | tokenizer = (JapaneseTokenizer) dict.create(); 36 | plugin = new JoinKatakanaOovPlugin(); 37 | } 38 | 39 | @Test 40 | public void testKatakanaLength() { 41 | // アイ, アイウ in the dictionary 42 | 43 | plugin.minLength = 0; 44 | List path = getPath("アイアイウ"); 45 | assertEquals(2, path.size()); 46 | 47 | plugin.minLength = 1; 48 | path = getPath("アイアイウ"); 49 | assertEquals(2, path.size()); 50 | 51 | plugin.minLength = 2; 52 | path = getPath("アイアイウ"); 53 | assertEquals(2, path.size()); 54 | 55 | plugin.minLength = 3; 56 | path = getPath("アイアイウ"); 57 | assertEquals(1, path.size()); 58 | } 59 | 60 | @Test 61 | public void testPOS() { 62 | // アイアイウ is 名詞-固有名詞-地名-一般 in the dictionary 63 | plugin.minLength = 3; 64 | List path = getPath("アイアイウ"); 65 | assertEquals(1, path.size()); 66 | assertFalse(path.get(0).isOOV()); // use the word in dictionary 67 | } 68 | 69 | @Test 70 | public void testStartWithMiddle() { 71 | plugin.minLength = 3; 72 | List path = getPath("アイウアイアイウ"); 73 | assertEquals(1, path.size()); 74 | } 75 | 76 | @Test 77 | public void testStartWithTail() { 78 | plugin.minLength = 3; 79 | List path = getPath("アイウアイウアイ"); 80 | assertEquals(1, path.size()); 81 | } 82 | 83 | @Test 84 | public void testWithNOOOVBOW() { 85 | plugin.minLength = 3; 86 | List path = getPath("ァアイアイウ"); 87 | assertEquals(2, path.size()); 88 | assertEquals("ァ", path.get(0).getWordInfo().getSurface()); 89 | 90 | path = getPath("アイウァアイウ"); 91 | assertEquals(1, path.size()); 92 
| } 93 | 94 | private List getPath(String text) { 95 | UTF8InputText input = new UTF8InputTextBuilder(text, tokenizer.grammar).build(); 96 | LatticeImpl lattice = tokenizer.buildLattice(input); 97 | List path = lattice.getBestPath(); 98 | plugin.rewrite(input, path, lattice); 99 | lattice.clear(); 100 | return path; 101 | } 102 | 103 | } 104 | -------------------------------------------------------------------------------- /src/test/java/com/worksap/nlp/sudachi/MMapTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | import org.junit.Before; 20 | import org.junit.Rule; 21 | import org.junit.Test; 22 | import org.junit.rules.TemporaryFolder; 23 | 24 | import java.io.IOException; 25 | import java.nio.ByteBuffer; 26 | import java.nio.file.NoSuchFileException; 27 | import java.nio.file.Path; 28 | 29 | import static org.hamcrest.CoreMatchers.isA; 30 | import static org.hamcrest.MatcherAssert.assertThat; 31 |
/** Tests mapping and unmapping of a dictionary file written to a temporary folder. */
public class MMapTest {

    @Rule
    public TemporaryFolder temporaryFolder = new TemporaryFolder();

    Path path;

    /** Writes the in-memory test system dictionary to {@code system.dic} in the temporary folder. */
    @Before
    public void setUp() throws IOException {
        path = temporaryFolder.getRoot().toPath();
        TestDictionary.INSTANCE.getSystemDictData().writeData(path.resolve("system.dic"));
    }

    @Test
    public void map() throws IOException {
        String filename = path.resolve("system.dic").toString();
        ByteBuffer mapped = MMap.map(filename);
        try {
            assertThat(mapped, isA(ByteBuffer.class));
        } finally {
            // Release the mapping so the temporary folder can be removed
            // afterwards; a live mapping can keep the file locked on some
            // platforms.
            MMap.unmap(mapped);
        }
    }

    @Test(expected = NoSuchFileException.class)
    public void mapWithNotExist() throws IOException {
        String filename = path.resolve("does_not_exist").toString();
        MMap.map(filename);
    }

    @Test
    public void unmap() throws IOException {
        String filename = path.resolve("system.dic").toString();
        ByteBuffer buffer = MMap.map(filename);
        assertThat(buffer, isA(ByteBuffer.class));
        MMap.unmap(buffer);
    }

    /** Unmapping a plain heap buffer is expected to complete without throwing. */
    @Test
    public void unmapWithoutMappedByteBuffer() throws IOException {
        ByteBuffer buffer = ByteBuffer.wrap(new byte[] { 0x00, 0x00 });
        MMap.unmap(buffer);
    }

}
-------------------------------------------------------------------------------- /src/test/java/com/worksap/nlp/sudachi/MockGrammar.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd.
3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | import java.io.IOException; 20 | import java.util.Collections; 21 | import java.util.HashMap; 22 | import java.util.List; 23 | import java.util.Map; 24 | 25 | import com.worksap.nlp.sudachi.dictionary.CharacterCategory; 26 | import com.worksap.nlp.sudachi.dictionary.Grammar; 27 | import com.worksap.nlp.sudachi.dictionary.POS; 28 |
/**
 * Minimal {@link Grammar} for tests. Connection costs live in a nested map and
 * default to 0; the character category is loaded from the classpath
 * {@code char.def}; the remaining methods return fixed placeholder values.
 */
public class MockGrammar implements Grammar {

    // left id -> (right id -> cost). Type arguments follow from the
    // getOrDefault/put calls below.
    Map<Short, Map<Short, Short>> matrix = new HashMap<>();
    private final CharacterCategory category = defaultCharCategory();

    @Override
    public int getPartOfSpeechSize() {
        return 0;
    }

    @Override
    public POS getPartOfSpeechString(short posId) {
        return null;
    }

    // NOTE(review): the element type of `pos` appears to have been stripped by
    // extraction; restore it from the Grammar interface.
    @Override
    public short getPartOfSpeechId(List pos) {
        return 0;
    }

    /** Returns the stored cost for the pair, or 0 when none was set. */
    @Override
    public short getConnectCost(short left, short right) {
        Map<Short, Short> row = matrix.getOrDefault(left, Collections.emptyMap());
        return row.getOrDefault(right, (short) 0);
    }

    /** Stores {@code cost} for the pair, creating the row for {@code left} on demand. */
    @Override
    public void setConnectCost(short left, short right, short cost) {
        matrix.computeIfAbsent(left, k -> new HashMap<>()).put(right, cost);
    }

    @Override
    public short[] getBOSParameter() {
        return null;
    }

    @Override
    public short[] getEOSParameter() {
        return null;
    }

    @Override
    public CharacterCategory getCharacterCategory() {
        return category;
    }

    /** Loads {@code char.def} from the classpath, rethrowing I/O failures unchecked. */
    public static CharacterCategory defaultCharCategory() {
        try {
            return CharacterCategory.load(PathAnchor.classpath().resource("char.def"));
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /** Ignored: this mock always keeps the category loaded at construction. */
    @Override
    public void setCharacterCategory(CharacterCategory charCategory) {
    }

    @Override
    public boolean isValid() {
        return true;
    }
}
-------------------------------------------------------------------------------- /src/test/java/com/worksap/nlp/sudachi/MockInputText.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | import java.util.EnumSet; 20 | import java.util.Set; 21 | 22 | import com.worksap.nlp.sudachi.dictionary.CategoryType; 23 |
/**
 * {@link InputText} test double backed by a plain {@code String}. Indices are
 * {@code String} (UTF-16) indices, and category types are assigned by hand
 * through {@link #setCategoryType}.
 */
class MockInputText implements InputText {

    String text;
    // One category set per UTF-16 index of `text`.
    EnumSet<CategoryType>[] types;

    @SuppressWarnings("unchecked")
    MockInputText(String text) {
        this.text = text;
        types = new EnumSet[text.length()];
        for (int i = 0; i < text.length(); i++) {
            types[i] = EnumSet.noneOf(CategoryType.class);
        }
    }

    /** Adds every given type to each index in {@code [begin, end)}. */
    void setCategoryType(int begin, int end, CategoryType... types) {
        for (int i = begin; i < end; i++) {
            for (CategoryType type : types) {
                this.types[i].add(type);
            }
        }
    }

    @Override
    public String getText() {
        return text;
    }

    @Override
    public String getOriginalText() {
        return text;
    }

    @Override
    public String getSubstring(int begin, int end) {
        return text.substring(begin, end);
    }

    /** Not supported by this mock; always returns {@code null}. */
    @Override
    public InputText slice(int begin, int end) {
        return null;
    }

    @Override
    public int getOriginalIndex(int index) {
        return index;
    }

    @Override
    public Set<CategoryType> getCharCategoryTypes(int index) {
        return types[index];
    }

    /** Intersection of the category sets of every code point starting in {@code [begin, end)}. */
    @Override
    public Set<CategoryType> getCharCategoryTypes(int begin, int end) {
        Set<CategoryType> continuousCategory = types[begin].clone();
        for (int i = text.offsetByCodePoints(begin, 1); i < end; i = text.offsetByCodePoints(i, 1)) {
            continuousCategory.retainAll(types[i]);
        }
        return continuousCategory;
    }

    /**
     * Length, from {@code index}, of the run over which the intersection of
     * category sets stays non-empty; runs to the end of the text otherwise.
     */
    @Override
    public int getCharCategoryContinuousLength(int index) {
        Set<CategoryType> continuousCategory = types[index].clone();
        for (int i = text.offsetByCodePoints(index, 1); i < text.length(); i = text.offsetByCodePoints(i, 1)) {
            continuousCategory.retainAll(types[i]);
            if (continuousCategory.isEmpty()) {
                return i - index;
            }
        }
        return text.length() - index;
    }

    @Override
    public int getCodePointsOffsetLength(int index, int codePointOffset) {
        return text.offsetByCodePoints(index, codePointOffset) - index;
    }

    @Override
    public int codePointCount(int begin, int end) {
        return Character.codePointCount(text, begin, end);
    }

    @Override
    public boolean canBow(int index) {
        return true;
    }

    @Override
    public int getWordCandidateLength(int index) {
        return 1;
    }

    @Override
    public int getNextInOriginal(int index) {
        return index + 1;
    }

    @Override
    public int modifiedOffset(int index) {
        return 0;
    }

    @Override
    public byte[] getByteText() {
        return new byte[0];
    }
}
-------------------------------------------------------------------------------- /src/test/java/com/worksap/nlp/sudachi/MorphemeImplTest.kt: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi 18 | 19 | import kotlin.test.Test 20 | import kotlin.test.assertEquals 21 |
/** Pins the `toString()` rendering of the first morpheme of a tokenized sample. */
class MorphemeImplTest {
  @Test
  fun useToString() {
    val dic = TestDictionary.user0()
    val sudachi = dic.create().tokenize("すだち")
    assertEquals(
        "MorphemeImpl{begin=0, end=1, surface=す, pos=4/名詞,普通名詞,一般,*,*,*, wid=(0,0)}",
        sudachi[0].toString())
  }
}
-------------------------------------------------------------------------------- /src/test/java/com/worksap/nlp/sudachi/StringUtilTest.kt: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi 18 | 19 | import kotlin.test.Test 20 | import kotlin.test.assertContentEquals 21 | import kotlin.test.assertEquals 22 |
/** Cross-checks `StringUtil.readAllBytes` against `StringUtil.readFully` on `/char.def`. */
class StringUtilTest {
  @Test
  fun readAllBytes() {
    val url = javaClass.getResource("/char.def")
    val byteBuffer = StringUtil.readAllBytes(url)
    // Re-encoding the decoded text must give back exactly the raw bytes.
    val expected = StringUtil.readFully(url).encodeToByteArray()
    assertEquals(expected.size, byteBuffer.remaining())
    val actual = ByteArray(expected.size)
    byteBuffer.get(actual)
    assertContentEquals(expected, actual)
  }
}
-------------------------------------------------------------------------------- /src/test/java/com/worksap/nlp/sudachi/TestDictionary.kt: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | package com.worksap.nlp.sudachi 18 | 19 | import com.worksap.nlp.sudachi.dictionary.BinaryDictionary 20 | import com.worksap.nlp.sudachi.dictionary.build.DicBuilder 21 | import com.worksap.nlp.sudachi.dictionary.build.MemChannel 22 | import com.worksap.nlp.sudachi.dictionary.build.res 23 |
/** Utility for lazily creating binary dictionaries for test */
object TestDictionary {
  /** System dictionary image, built in memory on first access from the test matrix and lexicon. */
  val systemDictData: MemChannel by lazy {
    MemChannel().also { channel ->
      DicBuilder.system()
          .matrix(res("/dict/matrix.def"))
          .lexicon(res("/dict/lex.csv"))
          .description("the system dictionary for the unit tests")
          .build(channel)
    }
  }

  /** First user dictionary image, built in memory on first access from `user.csv`. */
  val userDict1Data: MemChannel by lazy {
    MemChannel().also { channel ->
      DicBuilder.user(systemDict).lexicon(res("/dict/user.csv")).build(channel)
    }
  }

  /** Loads a system dictionary from [systemDictData] on every access. */
  val systemDict: BinaryDictionary
    get() = BinaryDictionary.loadSystem(systemDictData.buffer())

  /** Loads a user dictionary from [userDict1Data] on every access. */
  val userDict1: BinaryDictionary
    get() = BinaryDictionary.loadUser(userDict1Data.buffer())

  /** Second user dictionary, built from `user2.csv` and loaded once on first access. */
  val userDict2: BinaryDictionary by lazy {
    val channel = MemChannel()
    DicBuilder.user(systemDict).lexicon(res("/dict/user2.csv")).build(channel)
    BinaryDictionary.loadUser(channel.buffer())
  }

  /** Default config with user dictionaries cleared and the test system dictionary set. */
  fun user0Cfg(): Config =
      Config.defaultConfig().clearUserDictionaries().systemDictionary(systemDict)

  /** [user0Cfg] plus the first user dictionary. */
  fun user1Cfg(): Config = user0Cfg().addUserDictionary(userDict1)

  /** [user1Cfg] plus the second user dictionary. */
  fun user2Cfg(): Config = user1Cfg().addUserDictionary(userDict2)

  /** System only */
  fun user0(): JapaneseDictionary = DictionaryFactory().create(user0Cfg()) as JapaneseDictionary

  /** System + One User dictionary */
  fun user1(): JapaneseDictionary = DictionaryFactory().create(user1Cfg()) as JapaneseDictionary
}
-------------------------------------------------------------------------------- /src/test/java/com/worksap/nlp/sudachi/TestLoggingConfig.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | import java.io.IOException; 20 | import java.io.InputStream; 21 | import java.util.logging.LogManager; 22 |
/**
 * Applies the {@code logging.properties} classpath resource to
 * {@code java.util.logging} when instantiated.
 */
public class TestLoggingConfig {
    /**
     * Reads {@code logging.properties} through this class's class loader and
     * hands it to the global {@link LogManager}.
     *
     * @throws IOException
     *             if the resource is missing or cannot be read
     */
    public TestLoggingConfig() throws IOException {
        try (InputStream is = TestLoggingConfig.class.getClassLoader().getResourceAsStream("logging.properties")) {
            // getResourceAsStream signals a missing resource with null; fail
            // with a clear message rather than passing null on.
            if (is == null) {
                throw new IOException("classpath resource not found: logging.properties");
            }
            LogManager.getLogManager().readConfiguration(is);
        }
    }
}
-------------------------------------------------------------------------------- /src/test/java/com/worksap/nlp/sudachi/TextNormalizerTest.kt: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi 18 | 19 | import com.worksap.nlp.sudachi.dictionary.CharacterCategory 20 | import com.worksap.nlp.sudachi.dictionary.GrammarImpl 21 | import kotlin.test.* 22 |
/** Tests construction of [TextNormalizer] and the text it produces. */
class TextNormalizerTest {

  // Dictionary with both test user dictionaries and the default character definition.
  private val dic =
      DictionaryFactory()
          .create(TestDictionary.user2Cfg().characterDefinition(CharacterCategory.loadDefault()))
          as JapaneseDictionary

  /** Every supported way of obtaining a normalizer must complete without throwing. */
  @Test
  fun instantiation() {
    TextNormalizer.fromDictionary(dic)
    TextNormalizer(dic.getGrammar())
    TextNormalizer(dic.getGrammar(), dic.inputTextPlugins)
    TextNormalizer.defaultTextNormalizer()
  }

  /** A freshly constructed `GrammarImpl` is rejected by the constructor. */
  @Test
  fun failToInstantiateWithoutCharCategory() {
    val emptyGrammar = GrammarImpl()
    assertFails { TextNormalizer(emptyGrammar) }
  }

  @Test
  fun normalizeText() {
    val normalizer = TextNormalizer.defaultTextNormalizer()

    // from DefaultInputTextPlugin test
    assertEquals("âbγд(株)ガヴ⼼ⅲ", normalizer.normalize("ÂBΓД㈱ガウ゛⼼Ⅲ"))
  }

  @Test
  fun normalizeTextWithDefaultConfig() {
    // will use default config, which has InputTextPlugins of
    // [Default, ProlongedSoundMark, IgnoreYomigana]
    val normalizer = TextNormalizer.fromDictionary(dic)
    print(dic.inputTextPlugins)

    assertEquals("âbγд(株)ガヴ⼼ⅲ", normalizer.normalize("ÂBΓД㈱ガウ゛⼼Ⅲ")) // default
    assertEquals("うわーい", normalizer.normalize("うわーーーい")) // prolonged sound mark
    assertEquals("小鳥遊", normalizer.normalize("小鳥遊(タカナシ)")) // ignore yomigana
  }
}
-------------------------------------------------------------------------------- /src/test/java/com/worksap/nlp/sudachi/Utils.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi; 18 | 19 | import java.io.IOException; 20 | import java.net.URISyntaxException; 21 | import java.net.URL; 22 | import java.nio.file.Files; 23 | import java.nio.file.Path; 24 | import java.nio.file.Paths; 25 |
/** Test helpers for working with classpath resources. */
public class Utils {
    /**
     * Copies each named classpath resource into {@code folder}, keeping only
     * the last path segment of the resource as the file name.
     *
     * @param folder
     *            destination directory
     * @param files
     *            resource names, resolved with {@link Class#getResource(String)}
     *            relative to this class
     * @throws IOException
     *             if a resource does not exist, its URL cannot be converted to
     *             a URI, or the copy fails
     */
    public static void copyResource(Path folder, String... files) throws IOException {
        for (String file : files) {
            URL src = Utils.class.getResource(file);
            // getResource signals a missing resource with null.
            if (src == null) {
                throw new IOException("classpath resource not found: " + file);
            }
            Path dest;
            try {
                dest = Paths.get(src.toURI()).getFileName();
            } catch (URISyntaxException e) {
                throw new IOException(e);
            }
            // Files.copy(InputStream, Path) leaves the stream open, so it is
            // closed here.
            try (java.io.InputStream in = src.openStream()) {
                Files.copy(in, folder.resolve(dest));
            }
        }
    }
}
-------------------------------------------------------------------------------- /src/test/java/com/worksap/nlp/sudachi/WordIdTest.kt: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi 18 | 19 | import kotlin.test.assertEquals 20 | import kotlin.test.assertFails 21 | import kotlin.test.assertNotEquals 22 | import org.junit.Test 23 |
/** Tests packing a (dictionary id, word id) pair into one value with `WordId`. */
class WordIdTest {
  // Assertions take (expected, actual) so that failure messages label the
  // two values correctly.
  @Test
  fun valid() {
    assertEquals(0, WordId.make(0, 0))
    assertEquals(5, WordId.make(0, 5))
    assertNotEquals(5, WordId.make(1, 5))
  }

  /** `dic` and `word` recover the two components passed to `make`. */
  @Test
  fun deconstruct() {
    val wid = WordId.make(12, 51612312)
    assertEquals(12, WordId.dic(wid))
    assertEquals(51612312, WordId.word(wid))
  }

  /** Components one above the declared maxima are rejected. */
  @Test
  fun invalid() {
    assertFails { WordId.make(0, WordId.MAX_WORD_ID + 1) }
    assertFails { WordId.make(WordId.MAX_DIC_ID + 1, 0) }
  }
}
-------------------------------------------------------------------------------- /src/test/java/com/worksap/nlp/sudachi/WordMaskTest.kt: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2022 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package com.worksap.nlp.sudachi 18 | 19 | import kotlin.test.Test 20 | import kotlin.test.assertFalse 21 | import kotlin.test.assertTrue 22 |
/** Tests for the `WordMask` helpers `nth`, `addNth` and `hasNth`. */
class WordMaskTest {

  /** A mask made by `nth(i)` reports position `i` as set, for every i in 1..65. */
  @Test
  fun works() {
    for (position in 1..65) {
      val mask = WordMask.nth(position)
      assertTrue(WordMask.hasNth(mask, position))
    }
  }

  /**
   * Adds positions 1, 3 and 64 one after another and checks which positions
   * are then reported; position 65 is expected to be reported as well.
   */
  @Test
  fun addNth() {
    val withFirst = WordMask.addNth(0, 1)
    val withThird = WordMask.addNth(withFirst, 3)
    val combined = WordMask.addNth(withThird, 64)
    assertTrue(WordMask.hasNth(combined, 1))
    assertFalse(WordMask.hasNth(combined, 2))
    assertTrue(WordMask.hasNth(combined, 3))
    assertFalse(WordMask.hasNth(combined, 4))
    assertFalse(WordMask.hasNth(combined, 63))
    assertTrue(WordMask.hasNth(combined, 64))
    assertTrue(WordMask.hasNth(combined, 65))
  }
}
-------------------------------------------------------------------------------- /src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryHeaderTest.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2017-2022 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | package com.worksap.nlp.sudachi.dictionary; 18 | 19 | import static org.junit.Assert.assertEquals; 20 | import static org.junit.Assert.assertTrue; 21 | 22 | import java.io.IOException; 23 | 24 | import com.worksap.nlp.sudachi.TestDictionary; 25 | import org.junit.Before; 26 | import org.junit.Test; 27 | 28 | public class DictionaryHeaderTest { 29 | DictionaryHeader header; 30 | 31 | @Before 32 | public void setUp() throws IOException { 33 | header = new DictionaryHeader(TestDictionary.INSTANCE.getSystemDictData().buffer(), 0); 34 | } 35 | 36 | @Test 37 | public void getVersion() { 38 | assertEquals(DictionaryVersion.SYSTEM_DICT_VERSION_2, header.getVersion()); 39 | } 40 | 41 | @Test 42 | public void getCreateTime() { 43 | assertTrue(header.getCreateTime() > 0); 44 | } 45 | 46 | @Test 47 | public void getDescription() { 48 | assertEquals("the system dictionary for the unit tests", header.getDescription()); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/test/java/com/worksap/nlp/sudachi/dictionary/DictionaryReader.java: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2021 Works Applications Co., Ltd. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
/** Test helper that loads a classpath resource completely into memory. */
class DictionaryReader {

    /**
     * Reads the whole classpath resource {@code filename} (resolved relative to this class) into a
     * little-endian {@link ByteBuffer} positioned at 0 whose limit equals its capacity.
     *
     * @param filename
     *            resource name as accepted by {@link Class#getResourceAsStream(String)}
     * @return the resource content
     * @throws IOException
     *             if reading fails; a {@link java.io.FileNotFoundException} if the resource does
     *             not exist
     */
    static ByteBuffer read(String filename) throws IOException {
        // try-with-resources: the stream was previously never closed
        try (InputStream input = DictionaryReader.class.getResourceAsStream(filename)) {
            if (input == null) {
                // getResourceAsStream signals a missing resource with null, not an exception
                throw new java.io.FileNotFoundException("resource not found: " + filename);
            }
            // fully qualified on purpose: no new import is required for this class
            java.io.ByteArrayOutputStream collected = new java.io.ByteArrayOutputStream();
            byte[] chunk = new byte[8192];
            for (int n = input.read(chunk); n >= 0; n = input.read(chunk)) {
                collected.write(chunk, 0, n);
            }
            ByteBuffer bytes = ByteBuffer.wrap(collected.toByteArray());
            bytes.order(ByteOrder.LITTLE_ENDIAN);
            return bytes;
        }
    }
}
/** Opens classpath resources, resolved relative to this object's class, for tests. */
object Res {
  /**
   * Opens the resource [name], hands the stream to [fn] and returns what [fn] returns. The
   * stream is closed once [fn] has finished. The assertion fails if the resource does not exist.
   */
  // <R> restored: the signature referenced R without declaring it.
  operator fun <R> invoke(name: String, fn: (InputStream) -> R): R {
    Res.javaClass.getResourceAsStream(name).use {
      assertNotNull(it, "resource '$name' did not exist")
      return fn(it)
    }
  }
}

/** Tests parsing of connection matrix text by [ConnectionMatrix.readEntries]. */
class ConnectionMatrixTest {
  /** A 3x3 matrix yields nine entries, and the compiled costs can be read back. */
  @Test
  fun parse3x3() {
    val cm = ConnectionMatrix()
    assertEquals(9, Res("test.matrix") { cm.readEntries(it) })
    val conn = Connection(cm.compiledNoHeader.asShortBuffer(), 3, 3)
    assertEquals(conn.cost(0, 0), 0)
    assertEquals(conn.cost(1, 1), 4)
    assertEquals(conn.cost(2, 1), 7)
  }

  // NOTE(review): assertFailsWith needs an explicit exception type, and the one used here was
  // missing from the text. Throwable is the widest type that compiles; narrow it to the
  // exception readEntries is actually expected to throw.

  /** A header consisting of a single number is rejected. */
  @Test
  fun invalidHeader() {
    val cm = ConnectionMatrix()
    assertFailsWith<Throwable> { cm.readEntries("1".byteInputStream()) }
  }

  /** Empty input is rejected. */
  @Test
  fun emptyHeader() {
    val cm = ConnectionMatrix()
    assertFailsWith<Throwable> { cm.readEntries("".byteInputStream()) }
  }

  /** A header whose second field is not a number is rejected. */
  @Test
  fun badHeader() {
    val cm = ConnectionMatrix()
    assertFailsWith<Throwable> { cm.readEntries("5 a".byteInputStream()) }
  }
}
/** Checks the byte layout [DicBuffer] produces for int arrays and strings. */
class DicBufferTest {
  /** Returns a duplicate of the bytes currently held by [source]. */
  private fun written(source: DicBuffer) = source.consume { it.duplicate() }

  /** An empty int array is written as a single zero byte. */
  @Test
  fun writeEmptyIntArray() {
    val target = DicBuffer(1024)
    target.putInts(intArrayOf())
    val bytes = written(target)
    assertEquals(bytes.remaining(), 1)
    assertEquals(bytes.get(), 0)
    assertEquals(bytes.remaining(), 0)
  }

  /** Three ints are written as a one-byte count followed by the little-endian values. */
  @Test
  fun writeIntArray() {
    val target = DicBuffer(1024)
    target.putInts(intArrayOf(1, 2, 3))
    val bytes = written(target).apply { order(ByteOrder.LITTLE_ENDIAN) }
    assertEquals(bytes.remaining(), 4 * 3 + 1)
    assertEquals(bytes.get(), 3)
    assertEquals(bytes.int, 1)
    assertEquals(bytes.int, 2)
    assertEquals(bytes.int, 3)
    assertEquals(bytes.remaining(), 0)
  }

  /** An empty string is written as a single zero byte. */
  @Test
  fun writeEmptyString() {
    val target = DicBuffer(1024)
    target.put("")
    val bytes = written(target)
    assertEquals(bytes.remaining(), 1)
    assertEquals(bytes.get(), 0)
    assertEquals(bytes.remaining(), 0)
  }

  /** A short string is written as a one-byte length in UTF-16 units followed by those units. */
  @Test
  fun writeSmallString() {
    val target = DicBuffer(1024)
    target.put("あ𠮟")
    val bytes = written(target).apply { order(ByteOrder.LITTLE_ENDIAN) }
    assertEquals(bytes.remaining(), 1 + 2 * 3)
    assertEquals(bytes.get(), 3)
    assertEquals(bytes.char, 'あ')
    assertEquals(bytes.char, '\uD842')
    assertEquals(bytes.char, '\uDF9F')
    assertEquals(bytes.remaining(), 0)
  }

  /** A 200-character string gets a two-byte length whose first byte has the high bit set. */
  @Test
  fun writeLargeString() {
    val target = DicBuffer(1024)
    val text = "0123456789".repeat(20)
    target.put(text)
    val bytes = written(target).apply { order(ByteOrder.LITTLE_ENDIAN) }
    val length = text.length
    assertEquals(bytes.remaining(), 2 + length * 2)
    assertEquals(bytes.get(), ((length shr 8) or 0x80).toByte())
    assertEquals(bytes.get(), (length and 0xff).toByte())
  }

  /** Putting a string longer than [DicBuffer.MAX_STRING] fails. */
  @Test
  fun failWriteHugeString() {
    val target = DicBuffer(1024)
    val text = "0123456789".repeat(DicBuffer.MAX_STRING / 10 + 1)
    assertFails { target.put(text) }
  }

  /** In a 10-byte buffer the first put of "asdf" returns true and the second returns false. */
  @Test
  fun checkedPut() {
    val target = DicBuffer(10)
    assertTrue(target.put("asdf"))
    assertFalse(target.put("asdf"))
  }
}
/** Round-trips part-of-speech tables through [ModelOutput] and reads them back via [GrammarImpl]. */
class GrammarTest {
  /**
   * Writes [table] followed by the `test.matrix` connection matrix into a fresh [MemChannel] and
   * parses the written bytes as a grammar starting at offset 0.
   */
  private fun compile(table: POSTable): GrammarImpl {
    val matrix = ConnectionMatrix()
    Res("test.matrix") { matrix.readEntries(it) }
    val channel = MemChannel()
    val output = ModelOutput(channel)
    table.writeTo(output)
    matrix.writeTo(output)
    return GrammarImpl(channel.buffer(), 0)
  }

  /** A single POS gets id 0 and is read back unchanged. */
  @Test
  fun singlePos() {
    val table = POSTable()
    assertEquals(0, table.getId(POS("a", "b", "c", "d", "e", "f")))
    val grammar = compile(table)
    assertEquals(grammar.getPartOfSpeechString(0), POS("a", "b", "c", "d", "e", "f"))
  }

  /** Ids are handed out sequentially; one more POS after Short.MAX_VALUE entries fails. */
  @Test
  fun failPosData() {
    val table = POSTable()
    for (index in 0 until Short.MAX_VALUE.toInt()) {
      val entry = POS("a", "b", "c", "d", "e", index.toString())
      assertEquals(table.getId(entry), index.toShort())
    }
    assertFails { table.getId(POS("a", "a", "a", "a", "a", "a")) }
  }

  /** Wrong component counts, a null component and an over-long component are all rejected. */
  @Test
  fun invalidPos() {
    val attempts =
        listOf<() -> Unit>(
            { POS() },
            { POS("1") },
            { POS("1", "2") },
            { POS("1", "2", "3") },
            { POS("1", "2", "3", "4") },
            { POS("1", "2", "3", "4", "5") },
            { POS("1", "2", "3", "4", "5", null) },
            { POS("1", "2", "3", "4", "5", "6", "7") },
            { POS("1", "2", "3", "4", "5", "6".repeat(POS.MAX_COMPONENT_LENGTH + 1)) })
    for (attempt in attempts) {
      assertFails { attempt() }
    }
  }

  /** 1024 POS entries made of 127-character components survive the write/read round trip. */
  @Test
  fun worksWithEnormousPos() {
    val table = POSTable()
    val component = "あ".repeat(127)
    for (index in 0 until 1024) {
      val entry = POS(component, component, component, component, component, index.toString())
      assertEquals(table.getId(entry), index.toShort())
    }
    val grammar = compile(table)
    assertEquals(grammar.partOfSpeechSize, 1024)
    for (index in 0 until 1024) {
      val entry = POS(component, component, component, component, component, index.toString())
      assertEquals(entry, grammar.getPartOfSpeechString(index.toShort()))
    }
  }
}
/**
 * Write-only, in-memory [SeekableByteChannel].
 *
 * Bytes are stored in a little-endian heap [ByteBuffer] that starts at 1 MiB and is replaced by a
 * larger one whenever a write does not fit. [read] and [truncate] are not supported.
 */
class MemChannel : SeekableByteChannel {
  private var buffer: ByteBuffer = ByteBuffer.allocate(1024 * 1024)
  // Highest position any write has reached so far.
  private var size = 0L

  init {
    buffer.order(ByteOrder.LITTLE_ENDIAN)
  }

  /** Does nothing; the channel keeps reporting itself as open. */
  override fun close() {}

  /** Always returns `true`. */
  override fun isOpen(): Boolean {
    return true
  }

  /** Not supported; always throws [UnsupportedOperationException]. */
  override fun read(p0: ByteBuffer?): Int {
    throw UnsupportedOperationException()
  }

  /**
   * Copies all remaining bytes of [p0] to the current position, growing the storage if needed.
   *
   * @return the number of bytes written, i.e. what [p0] had remaining on entry
   */
  override fun write(p0: ByteBuffer?): Int {
    val remaining = p0!!.remaining()
    reserve(remaining)
    buffer.put(p0)
    val pos = buffer.position().toLong()
    if (pos > size) {
      size = pos
    }
    return remaining
  }

  /**
   * Makes sure [additional] more bytes fit after the current position.
   *
   * The capacity is doubled until it is large enough. A single doubling, as done before, is not
   * sufficient when one write exceeds the free space of the doubled buffer.
   */
  private fun reserve(additional: Int) {
    if (additional <= buffer.remaining()) {
      return
    }
    val required = buffer.position().toLong() + additional
    check(required <= Int.MAX_VALUE) { "MemChannel cannot hold $required bytes" }
    var newSize = buffer.capacity().toLong()
    while (newSize < required) {
      newSize *= 2
    }
    // Doubling may step past the Int range even though `required` fits.
    val newBuf = ByteBuffer.allocate(minOf(newSize, Int.MAX_VALUE.toLong()).toInt())
    newBuf.order(ByteOrder.LITTLE_ENDIAN)
    // flip() limits the old buffer to [0, position), which is the part carried over
    buffer.flip()
    newBuf.put(buffer)
    buffer = newBuf
  }

  /** Returns the current write position. */
  override fun position(): Long {
    return buffer.position().toLong()
  }

  /** Moves the write position to [p0] and returns this channel. */
  override fun position(p0: Long): SeekableByteChannel {
    buffer.position(p0.toInt())
    return this
  }

  /** Returns the highest position reached by any write so far. */
  override fun size(): Long {
    return this.size
  }

  /** Not supported; always throws [UnsupportedOperationException]. */
  override fun truncate(p0: Long): SeekableByteChannel {
    throw UnsupportedOperationException()
  }

  /**
   * Returns a little-endian view of the stored bytes from offset 0 up to the current position
   * (not up to [size]). The view shares content with this channel.
   */
  fun buffer(): ByteBuffer {
    val dup = buffer.duplicate()
    dup.position(0)
    dup.limit(buffer.position())
    dup.order(ByteOrder.LITTLE_ENDIAN)
    return dup
  }

  /** Writes the bytes returned by [buffer] to [path], creating or truncating the file. */
  fun writeData(path: Path) {
    Files.newByteChannel(
            path,
            StandardOpenOption.WRITE,
            StandardOpenOption.CREATE,
            StandardOpenOption.TRUNCATE_EXISTING)
        .use { channel ->
          val data = buffer()
          // A channel may accept fewer bytes than offered, so write until drained.
          while (data.hasRemaining()) {
            channel.write(data)
          }
        }
  }
}
15 | */ 16 | 17 | package com.worksap.nlp.sudachi.dictionary.build 18 | 19 | import kotlin.test.Test 20 | import kotlin.test.assertEquals 21 | 22 | class ParametersTest { 23 | @Test 24 | fun resizeWorks() { 25 | val params = Parameters(4) 26 | params.add(1, 1, 1) 27 | params.add(2, 2, 2) 28 | val ch = MemChannel() 29 | val out = ModelOutput(ch) 30 | params.writeTo(out) 31 | assertEquals(ch.position(), 12) 32 | val b = ch.buffer() 33 | assertEquals(b.short, 1) 34 | assertEquals(b.short, 1) 35 | assertEquals(b.short, 1) 36 | assertEquals(b.short, 2) 37 | assertEquals(b.short, 2) 38 | assertEquals(b.short, 2) 39 | assertEquals(b.remaining(), 0) 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/test/resources/char.def: -------------------------------------------------------------------------------- 1 | 0x0021..0x002F SYMBOL #!"#$%&'()*+,-./ 2 | 0x0030..0x0039 NUMERIC #0-9 3 | 0x0041..0x005A ALPHA #A-Z 4 | 0x0061..0x007A ALPHA #a-z 5 | 0x00C0..0x00FF ALPHA # Latin 1 #À->ÿ 6 | 0x3041..0x309F HIRAGANA 7 | 0x30A1..0x30FF KATAKANA 8 | 0x30A1 NOOOVBOW 9 | 0xFF66..0xFF9D KATAKANA 10 | 0xFF9E..0xFF9F KATAKANA 11 | 0x2E80..0x2EF3 KANJI # CJK Radicals Supplement 12 | 0x2F00..0x2FD5 KANJI 13 | 0x3005 KANJI 14 | 0x3007 KANJI 15 | 0x3400..0x4DB5 KANJI # CJK Unified Ideographs Extension 16 | 0x4E00..0x9FA5 KANJI 17 | 0xF900..0xFA2D KANJI 18 | 0xFA30..0xFA6A KANJI 19 | 0xFF10..0xFF19 NUMERIC 20 | 0xFF21..0xFF3A ALPHA 21 | 0xFF41..0xFF5A ALPHA 22 | -------------------------------------------------------------------------------- /src/test/resources/com/worksap/nlp/sudachi/dictionary/build/one.csv: -------------------------------------------------------------------------------- 1 | 東,1,1,4675,東,名詞,普通名詞,一般,*,*,*,ヒガシ,東,*,A,*,*,*,* 2 | -------------------------------------------------------------------------------- /src/test/resources/com/worksap/nlp/sudachi/dictionary/build/sudachi_dic_build.json:
-------------------------------------------------------------------------------- 1 | { 2 | "characterDefinitionFile" : "char.def", 3 | "inputTextPlugin" : [ 4 | { "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin" }, 5 | { "class" : "com.worksap.nlp.sudachi.IgnoreYomiganaPlugin", 6 | "leftBrackets": ["(", "("], 7 | "rightBrackets": [")", ")"], 8 | "maxYomiganaLength": 4} 9 | ], 10 | "oovProviderPlugin" : [ 11 | { "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin", 12 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], 13 | "leftId" : 8, 14 | "rightId" : 8, 15 | "cost" : 6000 } 16 | ], 17 | "formatterPlugin" : [ 18 | { "class" : "com.worksap.nlp.sudachi.SimpleMorphemeFormatter" }, 19 | { "class" : "com.worksap.nlp.sudachi.WordSegmentationFormatter", 20 | "eos" : "\n" }, 21 | { "class" : "com.worksap.nlp.sudachi.WordSegmentationFormatter", 22 | "eos" : " " } 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /src/test/resources/com/worksap/nlp/sudachi/dictionary/build/test.matrix: -------------------------------------------------------------------------------- 1 | 3 3 2 | 0 0 0 3 | 0 1 1 4 | 0 2 2 5 | 1 0 3 6 | 1 1 4 7 | 1 2 5 8 | 2 0 6 9 | 2 1 7 10 | 2 2 8 11 | 12 | -------------------------------------------------------------------------------- /src/test/resources/dict/lex.csv: -------------------------------------------------------------------------------- 1 | た,1,1,8729,た,助動詞,*,*,*,助動詞-タ,終止形-一般,タ,た,*,A,*,*,*,* 2 | に,2,2,11406,に,助詞,接続助詞,*,*,*,*,ニ,に,*,A,*,*,*,* 3 | に,3,3,4481,に,助詞,格助詞,*,*,*,*,ニ,に,*,A,*,*,*,* 4 | 京都,6,6,5293,京都,名詞,固有名詞,地名,一般,*,*,キョウト,京都,*,A,*,*,*,1/5 5 | 東,7,7,4675,東,名詞,普通名詞,一般,*,*,*,ヒガシ,東,*,A,*,*,*,* 6 | 東京,6,6,2816,東京,名詞,固有名詞,地名,一般,*,*,トウキョウ,東京,*,A,*,*,*,* 7 | 東京都,6,8,5320,東京都,名詞,固有名詞,地名,一般,*,*,トウキョウト,東京都,*,B,5/9,*,5/9,* 8 | 行く,4,4,5105,行く,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,*,A,*,*,*,* 9 | 行っ,5,5,5122,行っ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,7,A,*,*,*,* 10 | 
都,8,8,2914,都,名詞,普通名詞,一般,*,*,*,ト,都,*,A,*,*,*,* 11 | アイ,7,7,4675,アイ,名詞,普通名詞,一般,*,*,*,アイ,アイ,*,A,*,*,*,* 12 | アイウ,7,7,4675,アイウ,名詞,普通名詞,一般,*,*,*,アイウ,アイウ,*,A,*,*,*,* 13 | アイアイウ,6,6,32766,アイアイウ,名詞,固有名詞,地名,一般,*,*,アイアイウ,アイアイウ,*,A,*,*,*,* 14 | 0,9,9,2478,0,名詞,数詞,*,*,*,*,ゼロ,0,*,A,*,*,*,* 15 | 1,9,9,2478,1,名詞,数詞,*,*,*,*,イチ,1,*,A,*,*,*,* 16 | 2,9,9,2478,2,名詞,数詞,*,*,*,*,ニ,2,*,A,*,*,*,* 17 | 3,9,9,2478,3,名詞,数詞,*,*,*,*,サン,3,*,A,*,*,*,* 18 | 4,9,9,2478,4,名詞,数詞,*,*,*,*,ヨン,4,*,A,*,*,*,* 19 | 5,9,9,2478,5,名詞,数詞,*,*,*,*,ゴ,5,*,A,*,*,*,* 20 | 6,9,9,2478,6,名詞,数詞,*,*,*,*,ロク,6,*,A,*,*,*,* 21 | 7,9,9,2478,7,名詞,数詞,*,*,*,*,ナナ,7,*,A,*,*,*,* 22 | 8,9,9,2478,8,名詞,数詞,*,*,*,*,ハチ,8,*,A,*,*,*,* 23 | 9,9,9,2478,9,名詞,数詞,*,*,*,*,キュウ,9,*,A,*,*,*,* 24 | 〇,9,9,2478,〇,名詞,数詞,*,*,*,*,ゼロ,〇,*,A,*,*,*,* 25 | 一,9,9,2478,一,名詞,数詞,*,*,*,*,イチ,一,*,A,*,*,*,* 26 | 二,9,9,2478,二,名詞,数詞,*,*,*,*,ニ,二,*,A,*,*,*,* 27 | 三,9,9,2478,三,名詞,数詞,*,*,*,*,サン,三,*,A,*,*,*,* 28 | 四,9,9,2478,四,名詞,数詞,*,*,*,*,ヨン,四,*,A,*,*,*,* 29 | 五,9,9,2478,五,名詞,数詞,*,*,*,*,ゴ,五,*,A,*,*,*,* 30 | 六,9,9,2478,六,名詞,数詞,*,*,*,*,ロク,六,*,A,*,*,*,* 31 | 七,9,9,2478,七,名詞,数詞,*,*,*,*,ナナ,七,*,A,*,*,*,* 32 | 八,9,9,2478,八,名詞,数詞,*,*,*,*,ハチ,八,*,A,*,*,*,* 33 | 九,9,9,2478,九,名詞,数詞,*,*,*,*,キュウ,九,*,A,*,*,*,* 34 | 六三四,6,6,0,六三四,名詞,固有名詞,地名,一般,*,*,ムサシ,六三四,*,A,*,*,*,* 35 | いく,4,4,5105,いく,動詞,非自立可能,*,*,五段-カ行,終止形-一般,イク,行く,*,A,*,*,*,* 36 | いっ,5,5,5122,いっ,動詞,非自立可能,*,*,五段-カ行,連用形-促音便,イッ,行く,34,A,*,*,*,* 37 | 
012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,9,9,-9000,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,名詞,数詞,*,*,*,*,ゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウゼロイチニサンヨンゴロクナナハチキュウ,012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789,*,A,*,*,*,* 38 | 特a,8,8,2914,特A,名詞,普通名詞,一般,*,*,*,トクエー,特A,*,A,*,*,*,* 39 | 隠し,-1,-1,0,隠し,名詞,普通名詞,一般,*,*,*,カクシ,隠し,*,A,*,*,*,* 40 | な。な,8,8,2914,な。な,名詞,普通名詞,一般,*,*,*,ナナ,な。な,*,C,11,11,*,* 41 | -------------------------------------------------------------------------------- /src/test/resources/dict/matrix.def: -------------------------------------------------------------------------------- 1 | 10 10 2 | 0 0 0 3 | 0 1 863 4 | 0 2 2124 5 | 0 3 1032 6 | 0 4 591 7 | 0 5 -162 8 | 0 6 -79 9 | 0 7 887 10 | 0 
8 447 11 | 0 9 -535 12 | 1 0 -3689 13 | 1 1 -3361 14 | 1 2 -7643 15 | 1 3 -3267 16 | 1 4 809 17 | 1 5 -1098 18 | 1 6 4606 19 | 1 7 4269 20 | 1 8 4567 21 | 1 9 1635 22 | 2 0 -1959 23 | 2 1 2457 24 | 2 2 811 25 | 2 3 840 26 | 2 4 903 27 | 2 5 -958 28 | 2 6 517 29 | 2 7 2037 30 | 2 8 1392 31 | 2 9 -193 32 | 3 0 -2288 33 | 3 1 1741 34 | 3 2 487 35 | 3 3 792 36 | 3 4 -1474 37 | 3 5 -3429 38 | 3 6 126 39 | 3 7 437 40 | 3 8 605 41 | 3 9 -547 42 | 4 0 -2809 43 | 4 1 -3584 44 | 4 2 -6743 45 | 4 3 -2869 46 | 4 4 -2805 47 | 4 5 -407 48 | 4 6 3422 49 | 4 7 5642 50 | 4 8 6382 51 | 4 9 2165 52 | 5 0 -509 53 | 5 1 -3665 54 | 5 2 -3882 55 | 5 3 -572 56 | 5 4 -1036 57 | 5 5 -54 58 | 5 6 2570 59 | 5 7 3319 60 | 5 8 4059 61 | 5 9 882 62 | 6 0 101 63 | 6 1 2933 64 | 6 2 2198 65 | 6 3 -2004 66 | 6 4 4392 67 | 6 5 4017 68 | 6 6 569 69 | 6 7 475 70 | 6 8 -390 71 | 6 9 852 72 | 7 0 -852 73 | 7 1 2079 74 | 7 2 1180 75 | 7 3 -3084 76 | 7 4 2010 77 | 7 5 1570 78 | 7 6 746 79 | 7 7 2341 80 | 7 8 2051 81 | 7 9 1393 82 | 8 0 -522 83 | 8 1 3354 84 | 8 2 2037 85 | 8 3 -2542 86 | 8 4 3071 87 | 8 5 2631 88 | 8 6 -352 89 | 8 7 2847 90 | 8 8 1134 91 | 8 9 1256 92 | 9 0 -975 93 | 9 1 2498 94 | 9 2 1690 95 | 9 3 -1523 96 | 9 4 3023 97 | 9 5 3139 98 | 9 6 2562 99 | 9 7 3962 100 | 9 8 418 101 | 9 9 -2490 102 | -------------------------------------------------------------------------------- /src/test/resources/dict/user.csv: -------------------------------------------------------------------------------- 1 | ぴらる,8,8,-32768,ぴらる,名詞,普通名詞,一般,*,*,*,ピラル,ぴらる,*,A,*,*,*,* 2 | 府,8,8,2914,府,名詞,普通名詞,一般,*,*,*,フ,府,*,A,*,*,*,* 3 | 東京府,6,6,2816,東京府,名詞,固有名詞,地名,一般,*,*,トウキョウフ,東京府,*,B,5/U1,*,5/U1,1/3 4 | すだち,6,6,2816,すだち,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,スダチ,スダチ,すだち,*,A,*,*,*,* 5 | -------------------------------------------------------------------------------- /src/test/resources/dict/user2.csv: -------------------------------------------------------------------------------- 1 | 
ぴさる,8,8,-32768,ぴさる,名詞,普通名詞,一般,*,*,*,ピサル,ぴさる,*,A,*,*,*,* 2 | かぼす,6,6,2816,かぼす,被子植物門,双子葉植物綱,ムクロジ目,ミカン科,ミカン属,カボス,カボス,かぼす,*,A,*,*,*,* 3 | -------------------------------------------------------------------------------- /src/test/resources/joinnumeric/char.def: -------------------------------------------------------------------------------- 1 | # 2 | # Japanese character category map 3 | # 4 | # $Id: char.def 9 2012-12-12 04:13:15Z togiso $; 5 | # 6 | 7 | ################################################################################### 8 | # 9 | # CHARACTER CATEGORY DEFINITION 10 | # 11 | # CATEGORY_NAME INVOKE GROUP LENGTH 12 | # 13 | # - CATEGORY_NAME: Name of category. you have to define DEFAULT class. 14 | # - INVOKE: 1/0: always invoke unknown word processing, even when the word can be found in the lexicon 15 | # - GROUP: 1/0: make a new word by grouping the same character category 16 | # - LENGTH: n: 1 to n length new words are added 17 | # 18 | DEFAULT 0 1 0 # DEFAULT is a mandatory category!
19 | SPACE 0 1 0 20 | KANJI 0 0 2 21 | SYMBOL 1 1 0 22 | NUMERIC 1 1 0 23 | ALPHA 1 1 0 24 | HIRAGANA 0 1 2 25 | KATAKANA 1 1 2 26 | KANJINUMERIC 0 1 0 #change INVOKE 1->0 27 | GREEK 1 1 0 28 | CYRILLIC 1 1 0 29 | 30 | ################################################################################### 31 | # 32 | # CODE(UCS2) TO CATEGORY MAPPING 33 | # 34 | 35 | # SPACE 36 | 0x0020 SPACE # DO NOT REMOVE THIS LINE, 0x0020 is reserved for SPACE 37 | 38 | # ASCII 39 | 0x0030..0x0039 NUMERIC #0-9 40 | 41 | # KANJI-NUMERIC (〇 一 二 三 四 五 六 七 八 九 十 百 千 万 億 兆) 42 | 0x3007 KANJINUMERIC KANJI 43 | 0x4E00 KANJINUMERIC KANJI 44 | 0x4E8C KANJINUMERIC KANJI 45 | 0x4E09 KANJINUMERIC KANJI 46 | 0x56DB KANJINUMERIC KANJI 47 | 0x4E94 KANJINUMERIC KANJI 48 | 0x516D KANJINUMERIC KANJI 49 | 0x4E03 KANJINUMERIC KANJI 50 | 0x516B KANJINUMERIC KANJI 51 | 0x4E5D KANJINUMERIC KANJI 52 | 0x5341 KANJINUMERIC KANJI 53 | 0x767E KANJINUMERIC KANJI 54 | 0x5343 KANJINUMERIC KANJI 55 | 0x4E07 KANJINUMERIC KANJI 56 | 0x5104 KANJINUMERIC KANJI 57 | 0x5146 KANJINUMERIC KANJI 58 | -------------------------------------------------------------------------------- /src/test/resources/logging.properties: -------------------------------------------------------------------------------- 1 | java.util.logging.SimpleFormatter.format=%5$s%n 2 | 3 | com.worksap.nlp.sudachi.handlers=java.util.logging.ConsoleHandler 4 | com.worksap.nlp.sudachi.level=FINEST 5 | java.util.logging.ConsoleHandler.level=ALL -------------------------------------------------------------------------------- /src/test/resources/rewrite.def: -------------------------------------------------------------------------------- 1 | # ignore normalize list 2 | Ⅲ 3 | ⅲ 4 | ⼼ 5 | 6 | # replace char list 7 | ガ ガ 8 | ウ゛ ヴ 9 | -------------------------------------------------------------------------------- /src/test/resources/rewrite_error_dup.def: -------------------------------------------------------------------------------- 1 | # there are ad uplicated 
replacement. 2 | 12 21 3 | 12 31 4 | -------------------------------------------------------------------------------- /src/test/resources/rewrite_error_ignorelist.def: -------------------------------------------------------------------------------- 1 | # there are two characters in ignore list 2 | 12 -------------------------------------------------------------------------------- /src/test/resources/rewrite_error_replacelist.def: -------------------------------------------------------------------------------- 1 | # there are three columns in replace list 2 | 12 21 31 3 | -------------------------------------------------------------------------------- /src/test/resources/sudachi.json: -------------------------------------------------------------------------------- 1 | { 2 | "systemDict" : "system.dic", 3 | "userDict" : [ "user.dic" ], 4 | "characterDefinitionFile" : "char.def", 5 | "inputTextPlugin" : [ 6 | { "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin" }, 7 | { "class" : "com.worksap.nlp.sudachi.ProlongedSoundMarkInputTextPlugin", 8 | "prolongedSoundMarks": ["ー", "〜", "〰"], 9 | "replacementSymbol": "ー"}, 10 | { "class" : "com.worksap.nlp.sudachi.IgnoreYomiganaPlugin", 11 | "leftBrackets": ["(", "("], 12 | "rightBrackets": [")", ")"], 13 | "maxYomiganaLength": 4} 14 | ], 15 | "oovProviderPlugin" : [ 16 | { "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin", 17 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], 18 | "leftId" : 8, 19 | "rightId" : 8, 20 | "cost" : 6000 } 21 | ], 22 | "formatterPlugin" : [ 23 | { "class" : "com.worksap.nlp.sudachi.SimpleMorphemeFormatter" }, 24 | { "class" : "com.worksap.nlp.sudachi.WordSegmentationFormatter", 25 | "eos" : "\n" }, 26 | { "class" : "com.worksap.nlp.sudachi.WordSegmentationFormatter", 27 | "eos" : " " } 28 | ] 29 | } 30 | -------------------------------------------------------------------------------- /src/test/resources/sudachi_minimum.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "systemDict" : "system.dic", 3 | "inputTextPlugin" : [ 4 | { "class" : "com.worksap.nlp.sudachi.DefaultInputTextPlugin" } 5 | ], 6 | "oovProviderPlugin" : [ 7 | { "class" : "com.worksap.nlp.sudachi.SimpleOovProviderPlugin", 8 | "oovPOS" : [ "名詞", "普通名詞", "一般", "*", "*", "*" ], 9 | "leftId" : 8, 10 | "rightId" : 8, 11 | "cost" : 6000 } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /src/test/resources/sudachi_test_empty.json: -------------------------------------------------------------------------------- 1 | {} --------------------------------------------------------------------------------