├── src
    ├── main
    │   └── java
    │   │   └── kss
    │   │       ├── util
    │   │           └── IntToBool.java
    │   │       ├── base
    │   │           ├── enumerate
    │   │           │   ├── Id.java
    │   │           │   └── Stats.java
    │   │           ├── ChunkWithIndex.java
    │   │           ├── SentenceIndex.java
    │   │           ├── Base.java
    │   │           ├── Const.java
    │   │           └── BackupManager.java
    │   │       ├── Kss.java
    │   │       ├── rule
    │   │           └── Rule.java
    │   │       └── core
    │   │           └── Backend.java
    └── test
    │   └── java
    │       └── KssTest.java
├── LICENSE
├── pom.xml
├── .gitignore
└── README.md


/src/main/java/kss/util/IntToBool.java:
--------------------------------------------------------------------------------
 1 | package kss.util;
 2 | /*
 3 |  * Korean Sentence Splitter
 4 |  * Split Korean text into sentences using heuristic algorithm.
 5 |  *
 6 |  * Copyright (C) 2021 Sang-ji Lee <tkdwl06@gmail.com>
 7 |  * Copyright (C) 2021 Hyun-woong Ko <kevin.woong@tunib.ai> and Sang-Kil Park <skpark1224@hyundai.com>
 8 |  * All rights reserved.
 9 |  *
10 |  * This software may be modified and distributed under the terms
11 |  * of the BSD license.  See the LICENSE file for details.
12 |  */
public class IntToBool {

    /**
     * Interprets an integer as a boolean flag.
     *
     * @param input the integer to interpret
     * @return {@code true} when {@code input} is strictly positive; {@code false} for zero
     *     and for negative values
     */
    public static boolean intToBool(int input) {
        return input > 0;
    }
}
24 | 


--------------------------------------------------------------------------------
/src/main/java/kss/base/enumerate/Id.java:
--------------------------------------------------------------------------------
 1 | package kss.base.enumerate;
 2 | /*
 3 |  * Korean Sentence Splitter
 4 |  * Split Korean text into sentences using heuristic algorithm.
 5 |  *
 6 |  * Copyright (C) 2021 Sang-ji Lee <tkdwl06@gmail.com>
 7 |  * Copyright (C) 2021 Hyun-woong Ko <kevin.woong@tunib.ai> and Sang-Kil Park <skpark1224@hyundai.com>
 8 |  * All rights reserved.
 9 |  *
10 |  * This software may be modified and distributed under the terms
11 |  * of the BSD license.  See the LICENSE file for details.
12 |  */
13 | 
/**
 * Identifier constants, each paired with an integer value.
 *
 * <p>{@link #NONE} is 0; every other constant holds a distinct single-bit value
 * (1, 2, 4, 8, 16).
 * NOTE(review): the single-bit layout suggests the values are meant to be OR-ed together
 * as a mask - confirm against the callers.
 */
public enum Id {
    NONE(0),
    PREV(1 << 0),   // 1
    CONT(1 << 1),   // 2
    NEXT(1 << 2),   // 4
    NEXT1(1 << 3),  // 8
    NEXT2(1 << 4);  // 16

    /** Integer value associated with this constant. */
    private final int value;

    Id(int flag) {
        value = flag;
    }

    /**
     * @return the integer value associated with this constant
     */
    public int getValue() {
        return value;
    }
}
32 | 


--------------------------------------------------------------------------------
/src/main/java/kss/base/enumerate/Stats.java:
--------------------------------------------------------------------------------
 1 | package kss.base.enumerate;
 2 | /*
 3 |  * Korean Sentence Splitter
 4 |  * Split Korean text into sentences using heuristic algorithm.
 5 |  *
 6 |  * Copyright (C) 2021 Sang-ji Lee <tkdwl06@gmail.com>
 7 |  * Copyright (C) 2021 Hyun-woong Ko <kevin.woong@tunib.ai> and Sang-Kil Park <skpark1224@hyundai.com>
 8 |  * All rights reserved.
 9 |  *
10 |  * This software may be modified and distributed under the terms
11 |  * of the BSD license.  See the LICENSE file for details.
12 |  */
13 | 
/**
 * Status constants, each paired with a consecutive integer value from 0 to 7.
 *
 * NOTE(review): the names (DA, YO, JYO, HAM, UM, ...) look like categories of Korean
 * sentence endings - confirm against the rule tables that use them.
 */
public enum Stats {
    DEFAULT(0),
    DA(1),
    YO(2),
    JYO(3),
    HAM(4),
    UM(5),
    SB(6),
    COMMON(7);

    /** Integer value associated with this constant. */
    private final int value;

    Stats(int code) {
        value = code;
    }

    /**
     * @return the integer value associated with this constant
     */
    public int getValue() {
        return value;
    }
}
34 | 


--------------------------------------------------------------------------------
/src/main/java/kss/base/ChunkWithIndex.java:
--------------------------------------------------------------------------------
 1 | package kss.base;
 2 | /*
 3 |  * Korean Sentence Splitter
 4 |  * Split Korean text into sentences using heuristic algorithm.
 5 |  *
 6 |  * Copyright (C) 2021 Sang-ji Lee <tkdwl06@gmail.com>
 7 |  * Copyright (C) 2021 Hyun-woong Ko <kevin.woong@tunib.ai> and Sang-Kil Park <skpark1224@hyundai.com>
 8 |  * All rights reserved.
 9 |  *
10 |  * This software may be modified and distributed under the terms
11 |  * of the BSD license.  See the LICENSE file for details.
12 |  */
/**
 * Mutable holder pairing a piece of text with an integer start value.
 *
 * NOTE(review): {@code start} is presumably the offset of {@code text} within the
 * original input - confirm against the code that produces chunks.
 */
public class ChunkWithIndex {

    private int start;
    private String text;

    /**
     * Creates a holder with the given start value and text.
     *
     * @param start the start value to store
     * @param text the text to store
     */
    public ChunkWithIndex(int start, String text) {
        this.text = text;
        this.start = start;
    }

    /** @return the stored start value */
    public int getStart() {
        return this.start;
    }

    /** @return the stored text */
    public String getText() {
        return this.text;
    }

    /** @param start the new start value */
    public void setStart(int start) {
        this.start = start;
    }

    /** @param text the new text */
    public void setText(String text) {
        this.text = text;
    }
}
39 | 


--------------------------------------------------------------------------------
/src/main/java/kss/base/SentenceIndex.java:
--------------------------------------------------------------------------------
 1 | package kss.base;
 2 | /*
 3 |  * Korean Sentence Splitter
 4 |  * Split Korean text into sentences using heuristic algorithm.
 5 |  *
 6 |  * Copyright (C) 2021 Sang-ji Lee <tkdwl06@gmail.com>
 7 |  * Copyright (C) 2021 Hyun-woong Ko <kevin.woong@tunib.ai> and Sang-Kil Park <skpark1224@hyundai.com>
 8 |  * All rights reserved.
 9 |  *
10 |  * This software may be modified and distributed under the terms
11 |  * of the BSD license.  See the LICENSE file for details.
12 |  */
13 | 
/**
 * Mutable pair of integer values named {@code start} and {@code end}.
 *
 * NOTE(review): presumably the boundaries of a sentence inside the input text; whether
 * {@code end} is inclusive or exclusive cannot be told from this class - confirm.
 */
public class SentenceIndex {

    private int start;
    private int end;

    /**
     * Creates a pair from the given values.
     *
     * @param start the start value to store
     * @param end the end value to store
     */
    public SentenceIndex(int start, int end) {
        this.end = end;
        this.start = start;
    }

    /** @return the stored start value */
    public int getStart() {
        return this.start;
    }

    /** @return the stored end value */
    public int getEnd() {
        return this.end;
    }

    /** @param start the new start value */
    public void setStart(int start) {
        this.start = start;
    }

    /** @param end the new end value */
    public void setEnd(int end) {
        this.end = end;
    }

    /**
     * @return a debug representation of the form {@code SentenceIndex{start=S, end=E}}
     */
    @Override
    public String toString() {
        StringBuilder out = new StringBuilder("SentenceIndex{");
        out.append("start=").append(start);
        out.append(", end=").append(end);
        out.append('}');
        return out.toString();
    }
}
48 | 


--------------------------------------------------------------------------------
/src/main/java/kss/base/Base.java:
--------------------------------------------------------------------------------
 1 | package kss.base;
 2 | /*
 3 |  * Korean Sentence Splitter
 4 |  * Split Korean text into sentences using heuristic algorithm.
 5 |  *
 6 |  * Copyright (C) 2021 Sang-ji Lee <tkdwl06@gmail.com>
 7 |  * Copyright (C) 2021 Hyun-woong Ko <kevin.woong@tunib.ai> and Sang-Kil Park <skpark1224@hyundai.com>
 8 |  * All rights reserved.
 9 |  *
10 |  * This software may be modified and distributed under the terms
11 |  * of the BSD license.  See the LICENSE file for details.
12 |  */
13 | 
14 | import java.util.LinkedList;
15 | import java.util.List;
16 | import java.util.Objects;
17 | 
/**
 * Small static helpers: emptiness checks, a symbol stack kept in a {@link LinkedList},
 * and flushing of a finished sentence into a result list.
 *
 * <p>The stack helpers treat the <em>tail</em> of the list as the top of the stack:
 * symbols are appended with {@code add} and inspected/removed from the tail.
 */
public class Base {

    /**
     * Tells whether a string or list has no content.
     *
     * @param o a {@link String} or a {@link List}
     * @return {@code true} when the string has length 0 or the list has no elements
     * @throws ClassCastException if {@code o} is neither a {@code String} nor a {@code List}
     * @throws NullPointerException if {@code o} is {@code null}
     */
    public static boolean empty(Object o) {
        if (o instanceof String) {
            return ((String) o).isEmpty();
        }
        return ((List<?>) o).isEmpty();
    }

    /**
     * Checks whether the most recently added symbol equals {@code symbol}.
     *
     * @param stack the symbol stack; its last element is the top
     * @param symbol the symbol to compare with the top of the stack
     * @return {@code true} when the stack is non-empty and its top equals {@code symbol};
     *     an empty stack only matches a {@code null} symbol
     */
    public static boolean top(LinkedList<String> stack, String symbol) {
        // peekLast() matches add(), which appends to the tail. The previous peek() looked
        // at the head, i.e. the oldest symbol, so nested symbols never matched.
        return Objects.equals(stack.peekLast(), symbol);
    }

    /**
     * Removes the top of the stack when it equals {@code currentCh}; otherwise pushes
     * {@code symbol}.
     *
     * @param stack the symbol stack, modified in place
     * @param symbol the symbol to push when the top does not match
     * @param currentCh the value compared against the top of the stack
     * @return {@code currentCh}, unchanged
     */
    public static String doPushPopSymbol(
        LinkedList<String> stack,
        String symbol,
        String currentCh
    ) {
        if (!stack.isEmpty() && top(stack, currentCh)) {
            // Remove from the same end that add() appends to and top() inspects.
            stack.removeLast();
        } else {
            stack.add(symbol);
        }
        return currentCh;
    }

    /**
     * Appends the stripped sentence to {@code results} and returns a fresh empty string
     * for the caller to use as its new current sentence.
     *
     * @param curSentence the sentence collected so far
     * @param results the list receiving the stripped sentence
     * @return the empty string
     */
    public static String doTrimSentPushResults(String curSentence, List<String> results) {
        results.add(curSentence.strip());
        return "";
    }
}
54 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (C) 2021 Sang-ji Lee <tkdwl06@gmail.com>
 2 | Copyright (C) 2021 Hyun-woong Ko <kevin.woong@tunib.ai> and Sang-Kil Park <skpark1224@hyundai.com>
 3 | All rights reserved.
 4 | 
 5 | Redistribution and use in source and binary forms, with or without
 6 | modification, are permitted provided that the following conditions are met:
 7 | 
 8 | 1. Redistributions of source code must retain the above copyright notice, this
 9 |    list of conditions and the following disclaimer.
10 | 
11 | 2. Redistributions in binary form must reproduce the above copyright notice,
12 |    this list of conditions and the following disclaimer in the documentation
13 |    and/or other materials provided with the distribution.
14 | 
15 | 3. Neither the name of the copyright holder nor the names of its
16 |    contributors may be used to endorse or promote products derived from
17 |    this software without specific prior written permission.
18 | 
19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 | 


--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8"?>
  2 | <project xmlns="http://maven.apache.org/POM/4.0.0"
  3 |   xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  4 |   xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  5 |   <modelVersion>4.0.0</modelVersion>
  6 | 
  7 |   <groupId>io.github.sangdee</groupId>
  8 |   <artifactId>kss-java</artifactId>
  9 |   <version>2.6.1</version>
 10 |   <packaging>jar</packaging>
 11 | 
 12 |   <name>kss-java</name>
 13 |   <description>Korean Sentence Splitter</description>
 14 |   <url>https://github.com/sangdee/kss-java</url>
 15 | 
 16 |   <licenses>
 17 |     <license>
 18 |       <name>BSD 3-Clause "New" or "Revised" License</name>
 19 |       <url>https://choosealicense.com/licenses/bsd-3-clause/</url>
 20 |     </license>
 21 |   </licenses>
 22 | 
 23 |   <developers>
 24 |     <developer>
 25 |       <name>sangdee</name>
 26 |       <email>tkdwl06@gmail.com</email>
 27 |     </developer>
 28 |   </developers>
 29 | 
 30 |   <scm>
 31 |     <connection>scm:git:git://github.com/sangdee/kss-java.git</connection>
 32 |     <url>https://github.com/sangdee/kss-java</url>
 33 |   </scm>
 34 | 
 35 |   <distributionManagement>
 36 |     <snapshotRepository>
 37 |       <id>ossrh</id>
 38 |       <url>https://s01.oss.sonatype.org/content/repositories/snapshots</url>
 39 |     </snapshotRepository>
 40 |     <repository>
 41 |       <id>ossrh</id>
 42 |       <url>https://s01.oss.sonatype.org/service/local/staging/deploy/maven2/</url>
 43 |     </repository>
 44 |   </distributionManagement>
 45 | 
 46 |   <build>
 47 |     <plugins>
 48 |       <plugin>
 49 |         <groupId>org.apache.maven.plugins</groupId>
 50 |         <artifactId>maven-jar-plugin</artifactId>
 51 |         <version>2.3.2</version>
 52 |       </plugin>
 53 |       <plugin>
 54 |         <groupId>org.apache.maven.plugins</groupId>
 55 |         <artifactId>maven-source-plugin</artifactId>
 56 |         <version>2.3</version>
 57 |         <executions>
 58 |           <execution>
 59 |             <id>attach-sources</id>
 60 |             <phase>verify</phase>
 61 |             <goals>
 62 |               <goal>jar</goal>
 63 |             </goals>
 64 |           </execution>
 65 |         </executions>
 66 |       </plugin>
 67 |       <plugin>
 68 |         <groupId>org.apache.maven.plugins</groupId>
 69 |         <artifactId>maven-javadoc-plugin</artifactId>
 70 |         <version>2.9.1</version>
 71 |         <executions>
 72 |           <execution>
 73 |             <id>attach-javadocs</id>
 74 |             <phase>verify</phase>
 75 |             <goals>
 76 |               <goal>jar</goal>
 77 |             </goals>
 78 |           </execution>
 79 |         </executions>
 80 |       </plugin>
 81 | 
 82 |       <plugin>
 83 |         <groupId>org.apache.maven.plugins</groupId>
 84 |         <artifactId>maven-gpg-plugin</artifactId>
 85 |         <version>1.5</version>
 86 |         <executions>
 87 |           <execution>
 88 |             <id>sign-artifacts</id>
 89 |             <phase>verify</phase>
 90 |             <goals>
 91 |               <goal>sign</goal>
 92 |             </goals>
 93 |           </execution>
 94 |         </executions>
 95 |       </plugin>
 96 | 
 97 |       <!-- GPG sign -->
 98 | 
 99 |     </plugins>
100 |   </build>
101 | 
102 |   <dependencies>
103 |     <!-- https://mvnrepository.com/artifact/junit/junit -->
104 |     <dependency>
105 |       <groupId>junit</groupId>
106 |       <artifactId>junit</artifactId>
107 |       <version>4.13.2</version>
108 |       <scope>test</scope>
109 |     </dependency>
110 |     <dependency>
111 |       <groupId>org.junit.jupiter</groupId>
112 |       <artifactId>junit-jupiter</artifactId>
113 |       <version>RELEASE</version>
114 |       <scope>test</scope>
115 |     </dependency>
116 |     <dependency>
117 |       <groupId>org.assertj</groupId>
118 |       <artifactId>assertj-core</artifactId>
119 |       <version>3.18.1</version>
120 |       <scope>test</scope>
121 |     </dependency>
122 |   </dependencies>
123 |   <properties>
124 |     <maven.compiler.source>11</maven.compiler.source>
125 |     <maven.compiler.target>11</maven.compiler.target>
126 |   </properties>
127 | 
128 | </project>


--------------------------------------------------------------------------------
/src/main/java/kss/base/Const.java:
--------------------------------------------------------------------------------
  1 | package kss.base;
  2 | /*
  3 |  * Korean Sentence Splitter
  4 |  * Split Korean text into sentences using heuristic algorithm.
  5 |  *
  6 |  * Copyright (C) 2021 Sang-ji Lee <tkdwl06@gmail.com>
  7 |  * Copyright (C) 2021 Hyun-woong Ko <kevin.woong@tunib.ai> and Sang-Kil Park <skpark1224@hyundai.com>
  8 |  * All rights reserved.
  9 |  *
 10 |  * This software may be modified and distributed under the terms
 11 |  * of the BSD license.  See the LICENSE file for details.
 12 |  */
 13 | 
 14 | import java.util.ArrayList;
 15 | import java.util.Arrays;
 16 | import java.util.HashMap;
 17 | import java.util.List;
 18 | 
 19 | public class Const {
 20 | 
 21 |     static List<String> numbersArr = Arrays
 22 |         .asList(
 23 |             "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"
 24 |         );
 25 |     public static final ArrayList<String> numbers = new ArrayList<>(numbersArr);
 26 | 
 27 |     static List<String> alphabetArr = Arrays
 28 |         .asList(
 29 |             "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l",
 30 |             "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y",
 31 |             "z"
 32 |         );
 33 |     public static final ArrayList<String> alphabets = new ArrayList<>(alphabetArr);
 34 | 
 35 |     static List<String> bracketArr = Arrays
 36 |         .asList(
 37 |             ")", "）", "〉", "》", "]", "］", "〕", "】", "}", "｝", "』", "」",
 38 |             "(", "（", "〈", "《", "[", "［", "〔", "【", "{", "｛", "「", "『"
 39 |         );
 40 |     public static final ArrayList<String> bracket = new ArrayList<>(bracketArr);
 41 | 
 42 |     static List<String> punctuationArr = Arrays
 43 |         .asList(";", ".", "?", "!", "~", "…");
 44 |     public static final ArrayList<String> punctuation = new ArrayList<>(punctuationArr);
 45 | 
 46 |     static List<String> doubleQuotesArr = Arrays
 47 |         .asList("\"", "“", "”");
 48 |     public static final ArrayList<String> doubleQuotes = new ArrayList<>(doubleQuotesArr);
 49 | 
 50 |     static List<String> singleQuotesArr = Arrays
 51 |         .asList("'", "‘", "’");
 52 |     public static final ArrayList<String> singleQuotes = new ArrayList<>(singleQuotesArr);
 53 | 
 54 |     public static HashMap<String, String> doubleQuotesOpenToClose = new HashMap<String, String>() {{
 55 |         put("“", "”");
 56 |         put("\"", "\"");
 57 |     }};
 58 |     public static HashMap<String, String> doubleQuotesCloseToOpen = new HashMap<String, String>() {{
 59 |         put("”", "“");
 60 |         put("\"", "\"");
 61 |     }};
 62 |     public static HashMap<String, String> singleQuotesOpenToClose = new HashMap<String, String>() {{
 63 |         put("‘", "’");
 64 |         put("'", "'");
 65 |     }};
 66 |     public static HashMap<String, String> singleQuotesCloseToOpen = new HashMap<String, String>() {{
 67 |         put("’", "‘");
 68 |         put("'", "'");
 69 |     }};
 70 |     public static HashMap<String, String> bracketOpenToClose = new HashMap<String, String>() {{
 71 |         put("(", ")");
 72 |         put("（", "）");
 73 |         put("〈", "〉");
 74 |         put("《", "》");
 75 |         put("[", "]");
 76 |         put("［", "］");
 77 |         put("〔", "〕");
 78 |         put("【", "】");
 79 |         put("{", "}");
 80 |         put("｛", "｝");
 81 |         put("「", "」");
 82 |         put("『", "』");
 83 |     }};
 84 |     public static HashMap<String, String> bracketCloseToOpen = new HashMap<String, String>() {{
 85 |         put(")", "(");
 86 |         put("）", "（");
 87 |         put("〉", "〈");
 88 |         put("》", "《");
 89 |         put("]", "[");
 90 |         put("］", "［");
 91 |         put("〕", "〔");
 92 |         put("】", "【");
 93 |         put("}", "{");
 94 |         put("｝", "｛");
 95 |         put("」", "「");
 96 |         put("』", "『");
 97 |     }};
 98 | 
 99 | 
100 |     public static final ArrayList<String> lowerAlphabets = alphabets;
101 |     public static ArrayList<String> upperAlphabets = setUpperAlphabets();
102 |     public static ArrayList<String> special = setSpecial();
103 | 
104 |     private static ArrayList<String> setUpperAlphabets() {
105 |         upperAlphabets = new ArrayList<>();
106 |         for (String s : alphabets) {
107 |             upperAlphabets.add(s.toUpperCase());
108 |         }
109 |         return upperAlphabets;
110 |     }
111 | 
112 |     private static ArrayList<String> setSpecial() {
113 |         special = new ArrayList<>();
114 |         special.addAll(punctuation);
115 |         special.addAll(bracket);
116 |         return special;
117 |     }
118 | }
119 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | 
  2 | # Created by https://www.toptal.com/developers/gitignore/api/java,intellij,maven,gradle
  3 | # Edit at https://www.toptal.com/developers/gitignore?templates=java,intellij,maven,gradle
  4 | 
  5 | ### Intellij ###
  6 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
  7 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
  8 | 
  9 | # User-specific stuff
 10 | .idea/**/workspace.xml
 11 | .idea/**/tasks.xml
 12 | .idea/**/usage.statistics.xml
 13 | .idea/**/dictionaries
 14 | .idea/**/shelf
 15 | 
 16 | # AWS User-specific
 17 | .idea/**/aws.xml
 18 | 
 19 | # Generated files
 20 | .idea/**/contentModel.xml
 21 | 
 22 | # Sensitive or high-churn files
 23 | .idea/**/dataSources/
 24 | .idea/**/dataSources.ids
 25 | .idea/**/dataSources.local.xml
 26 | .idea/**/sqlDataSources.xml
 27 | .idea/**/dynamic.xml
 28 | .idea/**/uiDesigner.xml
 29 | .idea/**/dbnavigator.xml
 30 | 
 31 | # Gradle
 32 | .idea/**/gradle.xml
 33 | .idea/**/libraries
 34 | 
 35 | # Gradle and Maven with auto-import
 36 | # When using Gradle or Maven with auto-import, you should exclude module files,
 37 | # since they will be recreated, and may cause churn.  Uncomment if using
 38 | # auto-import.
 39 | # .idea/artifacts
 40 | # .idea/compiler.xml
 41 | # .idea/jarRepositories.xml
 42 | # .idea/modules.xml
 43 | # .idea/*.iml
 44 | # .idea/modules
 45 | # *.iml
 46 | # *.ipr
 47 | 
 48 | # CMake
 49 | cmake-build-*/
 50 | 
 51 | # Mongo Explorer plugin
 52 | .idea/**/mongoSettings.xml
 53 | 
 54 | # File-based project format
 55 | *.iws
 56 | 
 57 | # IntelliJ
 58 | out/
 59 | 
 60 | # mpeltonen/sbt-idea plugin
 61 | .idea_modules/
 62 | 
 63 | # JIRA plugin
 64 | atlassian-ide-plugin.xml
 65 | 
 66 | # Cursive Clojure plugin
 67 | .idea/replstate.xml
 68 | 
 69 | # Crashlytics plugin (for Android Studio and IntelliJ)
 70 | com_crashlytics_export_strings.xml
 71 | crashlytics.properties
 72 | crashlytics-build.properties
 73 | fabric.properties
 74 | 
 75 | # Editor-based Rest Client
 76 | .idea/httpRequests
 77 | 
 78 | # Android studio 3.1+ serialized cache file
 79 | .idea/caches/build_file_checksums.ser
 80 | 
 81 | ### Intellij Patch ###
 82 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
 83 | 
 84 | # *.iml
 85 | # modules.xml
 86 | # .idea/misc.xml
 87 | # *.ipr
 88 | 
 89 | # Sonarlint plugin
 90 | # https://plugins.jetbrains.com/plugin/7973-sonarlint
 91 | .idea/**/sonarlint/
 92 | 
 93 | # SonarQube Plugin
 94 | # https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
 95 | .idea/**/sonarIssues.xml
 96 | 
 97 | # Markdown Navigator plugin
 98 | # https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
 99 | .idea/**/markdown-navigator.xml
100 | .idea/**/markdown-navigator-enh.xml
101 | .idea/**/markdown-navigator/
102 | 
103 | # Cache file creation bug
104 | # See https://youtrack.jetbrains.com/issue/JBR-2257
105 | .idea/$CACHE_FILE$
106 | 
107 | # CodeStream plugin
108 | # https://plugins.jetbrains.com/plugin/12206-codestream
109 | .idea/codestream.xml
110 | 
111 | ### Java ###
112 | # Compiled class file
113 | *.class
114 | 
115 | # Log file
116 | *.log
117 | 
118 | # BlueJ files
119 | *.ctxt
120 | 
121 | # Mobile Tools for Java (J2ME)
122 | .mtj.tmp/
123 | 
124 | # Package Files #
125 | *.jar
126 | *.war
127 | *.nar
128 | *.ear
129 | *.zip
130 | *.tar.gz
131 | *.rar
132 | 
133 | # virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
134 | hs_err_pid*
135 | 
136 | ### Maven ###
137 | target/
138 | pom.xml.tag
139 | pom.xml.releaseBackup
140 | pom.xml.versionsBackup
141 | pom.xml.next
142 | release.properties
143 | dependency-reduced-pom.xml
144 | buildNumber.properties
145 | .mvn/timing.properties
146 | # https://github.com/takari/maven-wrapper#usage-without-binary-jar
147 | .mvn/wrapper/maven-wrapper.jar
148 | 
149 | ### Maven Patch ###
150 | # Eclipse m2e generated files
151 | # Eclipse Core
152 | .project
153 | # JDT-specific (Eclipse Java Development Tools)
154 | .classpath
155 | 
156 | ### Gradle ###
157 | .gradle
158 | build/
159 | 
160 | # Ignore Gradle GUI config
161 | gradle-app.setting
162 | 
163 | # Avoid ignoring Gradle wrapper jar file (.jar files are usually ignored)
164 | !gradle-wrapper.jar
165 | 
166 | # Cache of project
167 | .gradletasknamecache
168 | 
169 | # # Work around https://youtrack.jetbrains.com/issue/IDEA-116898
170 | # gradle/wrapper/gradle-wrapper.properties
171 | 
172 | ### Gradle Patch ###
173 | **/build/
174 | 
175 | # Eclipse Gradle plugin generated files
176 | # Eclipse Core
177 | # JDT-specific (Eclipse Java Development Tools)
178 | 
179 | # End of https://www.toptal.com/developers/gitignore/api/java,intellij,maven,gradle


--------------------------------------------------------------------------------
/src/main/java/kss/base/BackupManager.java:
--------------------------------------------------------------------------------
  1 | package kss.base;
  2 | /*
  3 |  * Korean Sentence Splitter
  4 |  * Split Korean text into sentences using heuristic algorithm.
  5 |  *
  6 |  * Copyright (C) 2021 Sang-ji Lee <tkdwl06@gmail.com>
  7 |  * Copyright (C) 2021 Hyun-woong Ko <kevin.woong@tunib.ai> and Sang-Kil Park <skpark1224@hyundai.com>
  8 |  * All rights reserved.
  9 |  *
 10 |  * This software may be modified and distributed under the terms
 11 |  * of the BSD license.  See the LICENSE file for details.
 12 |  */
 13 | 
 14 | import static kss.base.Const.doubleQuotes;
 15 | import static kss.base.Const.lowerAlphabets;
 16 | import static kss.base.Const.numbers;
 17 | import static kss.base.Const.singleQuotes;
 18 | import static kss.base.Const.upperAlphabets;
 19 | 
 20 | import java.util.ArrayList;
 21 | import java.util.Arrays;
 22 | import java.util.HashMap;
 23 | import java.util.List;
 24 | import java.util.Map;
 25 | 
 26 | public class BackupManager {
 27 | 
 28 |     private final Map<String, String> backupDict = new HashMap<>();
 29 | 
 30 |     public BackupManager() {
 31 |         for (String s : getData()) {
 32 |             this.backupDict.put(s, String.valueOf(Math.abs(s.hashCode())));
 33 |         }
 34 |     }
 35 | 
 36 |     public List<String> getData() {
 37 |         List<String> faces = Arrays.asList(":)", ":(", ":'(", "O:)", "&)", ">:(", "3:)", "<(\")");
 38 |         List<String> lowUpperNum = lowerAlphabets;
 39 |         lowUpperNum.addAll(upperAlphabets);
 40 | 
 41 |         List<String> apostrophe = new ArrayList<>();
 42 |         for (String i : lowUpperNum) {
 43 |             for (String j : lowUpperNum) {
 44 |                 apostrophe.add(String.format("%s'%s", i, j));
 45 |             }
 46 |         }
 47 | 
 48 |         List<String> years = new ArrayList<>();
 49 |         for (String i : numbers) {
 50 |             years.add(String.format("%s's", i));
 51 |             years.add(String.format("%s'S", i));
 52 |         }
 53 | 
 54 |         List<String> time = new ArrayList<>();
 55 |         for (String i : numbers) {
 56 |             for (String j : numbers) {
 57 |                 for (String k : singleQuotes) {
 58 |                     time.add(String.format("%s%s%s", i, j, k));
 59 |                 }
 60 |             }
 61 |         }
 62 | 
 63 |         List<String> inch = new ArrayList<>();
 64 |         List<String> numersAdd = numbers;
 65 |         numersAdd.add(".");
 66 |         for (String i : numersAdd) {
 67 |             for (String j : numbers) {
 68 |                 for (String k : doubleQuotes) {
 69 |                     inch.add(String.format("%s%s%s", i, j, k));
 70 |                 }
 71 |             }
 72 |         }
 73 | 
 74 |         List<String> ecCases = Arrays.asList(
 75 |             "쌓이다",
 76 |             "보이다",
 77 |             "먹이다",
 78 |             "죽이다",
 79 |             "끼이다",
 80 |             "트이다",
 81 |             "까이다",
 82 |             "꼬이다",
 83 |             "데이다",
 84 |             "치이다",
 85 |             "쬐이다",
 86 |             "꺾이다",
 87 |             "낚이다",
 88 |             "녹이다",
 89 |             "벌이다",
 90 |             "다 적발",
 91 |             "다 말하",
 92 |             "다 말한",
 93 |             "다 말했",
 94 |             "다 밝혀",
 95 |             "다 밝혔",
 96 |             "다 밝히",
 97 |             "다 밝힌",
 98 |             "다 주장",
 99 |             "요 라고",
100 |             "요. 라고",
101 |             "죠 라고",
102 |             "죠. 라고",
103 |             "다 라고",
104 |             "다. 라고",
105 |             "다 하여",
106 |             "다 거나",
107 |             "다. 거나",
108 |             "다 시피",
109 |             "다. 시피",
110 |             "다 응답",
111 |             "다 로 응답",
112 |             "다. 로 응답",
113 |             "요 로 응답",
114 |             "요. 로 응답",
115 |             "죠 로 응답",
116 |             "죠. 로 응답",
117 |             "다 에서",
118 |             "다. 에서",
119 |             "요 에서",
120 |             "요. 에서",
121 |             "죠 에서",
122 |             "죠. 에서",
123 |             "타다 금지법",
124 |             "다 온 사실",
125 |             "다 온 것",
126 |             "다 온 사람",
127 |             "다 왔다",
128 |             "다 왔더",
129 |             "다 와보",
130 |             "우간다",
131 |             "사이다");
132 | 
133 |         List<String> data = new ArrayList<>();
134 |         data.addAll(faces);
135 |         data.addAll(apostrophe);
136 |         data.addAll(years);
137 |         data.addAll(ecCases);
138 |         data.addAll(time);
139 |         data.addAll(inch);
140 | 
141 |         return data;
142 |     }
143 | 
144 |     public String process(String text, Map<String, String> purposeDict) {
145 |         for (Map.Entry<String, String> entry : purposeDict.entrySet()) {
146 |             text = text.replace(entry.getKey(), entry.getValue());
147 |         }
148 |         return text.strip();
149 |     }
150 | 
151 |     public void addItem2Dict(String key, String value) {
152 |         this.backupDict.put(key, value);
153 |     }
154 | 
155 |     public String backup(String text) {
156 |         return process(text, backupDict);
157 |     }
158 | 
159 |     public String restore(String text) {
160 |         Map<String, String> purposeDict = new HashMap<>();
161 |         for (Map.Entry<String, String> entry : backupDict.entrySet()) {
162 |             purposeDict.put(entry.getValue(), entry.getKey());
163 |         }
164 |         return process(text, purposeDict);
165 |     }
166 | }
167 | 


--------------------------------------------------------------------------------
/src/main/java/kss/Kss.java:
--------------------------------------------------------------------------------
  1 | package kss;
  2 | /*
  3 |  * Korean Sentence Splitter
  4 |  * Split Korean text into sentences using heuristic algorithm.
  5 |  *
  6 |  * Copyright (C) 2021 Sang-ji Lee <tkdwl06@gmail.com>
  7 |  * Copyright (C) 2021 Hyun-woong Ko <kevin.woong@tunib.ai> and Sang-Kil Park <skpark1224@hyundai.com>
  8 |  * All rights reserved.
  9 |  *
 10 |  * This software may be modified and distributed under the terms
 11 |  * of the BSD license.  See the LICENSE file for details.
 12 |  */
 13 | import kss.base.ChunkWithIndex;
 14 | import java.util.ArrayList;
 15 | import kss.core.Backend;
 16 | 
 17 | /**
 18 |  * Public facade of the Korean Sentence Splitter.
 19 |  *
 20 |  * <p>Every method forwards to one {@link Backend} instance. Overloads that take fewer
 21 |  * arguments pass fixed default values for the omitted ones.
 22 |  */
 23 | public class Kss {
 24 | 
 25 |     /** Passed as {@code maxRecoverStep} by overloads that do not take that argument. */
 26 |     private static final int DEFAULT_MAX_RECOVER_STEP = 5;
 27 | 
 28 |     /** Passed as {@code maxRecoverLength} by overloads that do not take that argument. */
 29 |     private static final int DEFAULT_MAX_RECOVER_LENGTH = 20000;
 30 | 
 31 |     /** Backend that every public method delegates to. */
 32 |     private final Backend kss;
 33 | 
 34 |     /** Creates a splitter backed by a new {@link Backend}. */
 35 |     public Kss() {
 36 |         kss = new Backend();
 37 |     }
 38 | 
 39 |     /**
 40 |      * Splits {@code text} into sentences with every public option supplied by the caller.
 41 |      * The backend additionally receives the fixed trailing arguments {@code 0} and {@code true}.
 42 |      *
 43 |      * @param text                       text to split
 44 |      * @param useHeuristic               forwarded to the backend unchanged
 45 |      * @param useQuotesBracketProcessing forwarded to the backend unchanged
 46 |      * @param maxRecoverStep             forwarded to the backend unchanged
 47 |      * @param maxRecoverLength           forwarded to the backend unchanged
 48 |      * @return the list returned by the backend
 49 |      */
 50 |     public ArrayList<String> splitSentences(
 51 |         String text,
 52 |         boolean useHeuristic,
 53 |         boolean useQuotesBracketProcessing,
 54 |         int maxRecoverStep,
 55 |         int maxRecoverLength
 56 |     ) {
 57 |         return kss.splitSentences(
 58 |             text, useHeuristic, useQuotesBracketProcessing,
 59 |             maxRecoverStep, maxRecoverLength, 0, true);
 60 |     }
 61 | 
 62 |     /**
 63 |      * Splits {@code text} into sentences using {@code maxRecoverLength = 20000}.
 64 |      *
 65 |      * @return the list returned by the backend
 66 |      */
 67 |     public ArrayList<String> splitSentences(
 68 |         String text,
 69 |         boolean useHeuristic,
 70 |         boolean useQuotesBracketProcessing,
 71 |         int maxRecoverStep
 72 |     ) {
 73 |         return kss.splitSentences(
 74 |             text, useHeuristic, useQuotesBracketProcessing,
 75 |             maxRecoverStep, DEFAULT_MAX_RECOVER_LENGTH, 0, true);
 76 |     }
 77 | 
 78 |     /**
 79 |      * Splits {@code text} into sentences using {@code maxRecoverStep = 5} and
 80 |      * {@code maxRecoverLength = 20000}.
 81 |      *
 82 |      * @return the list returned by the backend
 83 |      */
 84 |     public ArrayList<String> splitSentences(
 85 |         String text,
 86 |         boolean useHeuristic,
 87 |         boolean useQuotesBracketProcessing
 88 |     ) {
 89 |         return kss.splitSentences(
 90 |             text, useHeuristic, useQuotesBracketProcessing,
 91 |             DEFAULT_MAX_RECOVER_STEP, DEFAULT_MAX_RECOVER_LENGTH, 0, true);
 92 |     }
 93 | 
 94 |     /**
 95 |      * Splits {@code text} into sentences with quote/bracket processing enabled and the
 96 |      * default {@code maxRecoverStep} and {@code maxRecoverLength}.
 97 |      *
 98 |      * @return the list returned by the backend
 99 |      */
100 |     public ArrayList<String> splitSentences(String text, boolean useHeuristic) {
101 |         return kss.splitSentences(
102 |             text, useHeuristic, true,
103 |             DEFAULT_MAX_RECOVER_STEP, DEFAULT_MAX_RECOVER_LENGTH, 0, true);
104 |     }
105 | 
106 |     /**
107 |      * Splits {@code text} into sentences with the heuristic and quote/bracket processing
108 |      * both enabled and the default {@code maxRecoverStep} and {@code maxRecoverLength}.
109 |      *
110 |      * @return the list returned by the backend
111 |      */
112 |     public ArrayList<String> splitSentences(String text) {
113 |         return kss.splitSentences(
114 |             text, true, true,
115 |             DEFAULT_MAX_RECOVER_STEP, DEFAULT_MAX_RECOVER_LENGTH, 0, true);
116 |     }
117 | 
118 |     /**
119 |      * Splits {@code text} into chunks with every option supplied by the caller.
120 |      *
121 |      * @param text                        text to split
122 |      * @param maxLength                   forwarded to the backend unchanged
123 |      * @param overlap                     forwarded to the backend unchanged
124 |      * @param useHeuristic                forwarded to the backend unchanged
125 |      * @param useQuotesBracketsProcessing forwarded to the backend unchanged
126 |      * @param maxRecoverStep              forwarded to the backend unchanged
127 |      * @param maxRecoverLength            forwarded to the backend unchanged
128 |      * @return the chunks returned by the backend
129 |      */
130 |     public ArrayList<ChunkWithIndex> splitChunks(
131 |         String text,
132 |         int maxLength,
133 |         boolean overlap,
134 |         boolean useHeuristic,
135 |         boolean useQuotesBracketsProcessing,
136 |         int maxRecoverStep,
137 |         int maxRecoverLength
138 |     ) {
139 |         return kss.splitChunks(
140 |             text, maxLength, overlap, useHeuristic, useQuotesBracketsProcessing,
141 |             maxRecoverStep, maxRecoverLength);
142 |     }
143 | 
144 |     /**
145 |      * Splits {@code text} into chunks using {@code maxRecoverLength = 20000}.
146 |      *
147 |      * @return the chunks returned by the backend
148 |      */
149 |     public ArrayList<ChunkWithIndex> splitChunks(
150 |         String text,
151 |         int maxLength,
152 |         boolean overlap,
153 |         boolean useHeuristic,
154 |         boolean useQuotesBracketsProcessing,
155 |         int maxRecoverStep
156 |     ) {
157 |         return kss.splitChunks(
158 |             text, maxLength, overlap, useHeuristic, useQuotesBracketsProcessing,
159 |             maxRecoverStep, DEFAULT_MAX_RECOVER_LENGTH);
160 |     }
161 | 
162 |     /**
163 |      * Splits {@code text} into chunks using {@code maxRecoverStep = 5} and
164 |      * {@code maxRecoverLength = 20000}.
165 |      *
166 |      * @return the chunks returned by the backend
167 |      */
168 |     public ArrayList<ChunkWithIndex> splitChunks(
169 |         String text,
170 |         int maxLength,
171 |         boolean overlap,
172 |         boolean useHeuristic,
173 |         boolean useQuotesBracketsProcessing
174 |     ) {
175 |         return kss.splitChunks(
176 |             text, maxLength, overlap, useHeuristic, useQuotesBracketsProcessing,
177 |             DEFAULT_MAX_RECOVER_STEP, DEFAULT_MAX_RECOVER_LENGTH);
178 |     }
179 | 
180 |     /**
181 |      * Splits into chunks with quote/bracket processing enabled and default recovery limits.
182 |      */
183 |     public ArrayList<ChunkWithIndex> splitChunks(
184 |         String text,
185 |         int maxLength,
186 |         boolean overlap,
187 |         boolean useHeuristic
188 |     ) {
189 |         return kss.splitChunks(
190 |             text, maxLength, overlap, useHeuristic, true,
191 |             DEFAULT_MAX_RECOVER_STEP, DEFAULT_MAX_RECOVER_LENGTH);
192 |     }
193 | 
194 |     /**
195 |      * Like the four-argument overload, with the heuristic enabled as well.
196 |      */
197 |     public ArrayList<ChunkWithIndex> splitChunks(String text, int maxLength, boolean overlap) {
198 |         return kss.splitChunks(
199 |             text, maxLength, overlap, true, true,
200 |             DEFAULT_MAX_RECOVER_STEP, DEFAULT_MAX_RECOVER_LENGTH);
201 |     }
202 | 
203 |     /** Like the three-argument overload, with {@code overlap = false}. */
204 |     public ArrayList<ChunkWithIndex> splitChunks(String text, int maxLength) {
205 |         return kss.splitChunks(
206 |             text, maxLength, false, true, true,
207 |             DEFAULT_MAX_RECOVER_STEP, DEFAULT_MAX_RECOVER_LENGTH);
208 |     }
209 | }
210 | 


--------------------------------------------------------------------------------
/src/test/java/KssTest.java:
--------------------------------------------------------------------------------
  1 | import java.util.List;
  2 | import kss.Kss;
  3 | import org.assertj.core.api.Assertions;
  4 | import org.junit.Test;
  5 | 
  6 | /** Sentence-count tests for {@code Kss.splitSentences(String)}. */
  7 | public class KssTest {
  8 | 
  9 |     Kss kss = new Kss();
 10 | 
 11 |     /** Splits {@code text} with the one-argument overload and asserts the sentence count. */
 12 |     private void assertSentenceCount(String text, int expected) {
 13 |         List<String> sentences = kss.splitSentences(text);
 14 |         Assertions.assertThat(sentences.size()).isEqualTo(expected);
 15 |     }
 16 | 
 17 |     /** Text whose only punctuation is one comma and the final period; expects three sentences. */
 18 |     @Test
 19 |     public void testSplit() {
 20 |         String text = "회사 동료 분들과 다녀왔는데 분위기도 좋고 음식도 맛있었어요 다만, 강남 토끼정이 강남 쉑쉑버거 골목길로 쭉 올라가야 하는데 다들 쉑쉑버거의 유혹에 넘어갈 뻔 했답니다 강남역 맛집 토끼정의 외부 모습.";
 21 |         assertSentenceCount(text, 3);
 22 |     }
 23 | 
 24 |     /** Text with curly single quotes, straight double quotes and parentheses; expects six sentences. */
 25 |     @Test
 26 |     public void testSingleQuotes() {
 27 |         String text = "여당이 내놓은 상가건물 임대차보호법 개정안, 이른바 ‘임대료 멈춤법’에 대한 논의가 급물살을 타면서 자영업자들과 임대인의 의견이 팽팽하게 맞서고 있다. 신종 코로나바이러스 감염증(코로나19) 확진자 급증세로 집합 제한·집합 금지 기간이 길어지면서 한계에 직면한 소상공인과 자영업자들 사이에서는 임대료 부담을 호소하며 \"법안을 조속히 시행해 달라\"는 목소리가 터져나오고 있다. 반면 임대인들은 임대료 인하를 강제화하는 것은 부당하다며 정부와 여당이 ‘나쁜 임대인(건물주)’ 프레임을 만들고 있다고 비판했다. 업계 전문가들 사이에서도 우려의 목소리가 여럿 있다. 재산권 침해 소지가 있고, 월세 수익이 끊기면 생활이 곤란해지는 ‘생계형 임대인’들이 피해를 볼 수 있는 등 또 다른 부작용이 발생할 수 있다는 것이다. 법 개정 자체만으로 상가 거래 시장이 위축될 가능성도 지적됐다.";
 28 |         assertSentenceCount(text, 6);
 29 |     }
 30 | 
 31 |     /** Texts in which an apostrophe or quote character appears without a matching partner. */
 32 |     @Test
 33 |     public void testQuoteMisalignment() {
 34 |         String text = "부부 싸움 규칙 가운데 ‘돈 히트 언더 더 벨트’(Don’t hit under the belt)가 있다. 권투할 때 벨트 아래를 치면 반칙이듯이, 상대가 너무 아파할 만한 것을 건드리면 회복하기 어렵다. 그 부분은 사람마다 다르다.";
 35 |         assertSentenceCount(text, 3);
 36 | 
 37 |         text = "안녕하십니까? 삼성전자가 11월 13일 삼성전자 서초사옥 다목적홀 5층에서 진행되는 2013 S'데이 멤버십 블루 강연회 \"Challenge BLUE, 박찬호&이동우의 삶과 도전\" 멤버십 블루 고객 480명을 초청한다.강연회는 삼성전자 멤버십 블루 회원들을 위해 마련된 고객 혜택 행사로 한국인 최초의 메이저리거 박찬호와 시각장애 개그맨 이동우를 초청, 그들의 삶 속에서 펼쳐진 다양한 도전기를 들을 수 있도록 마련했다.";
 38 |         assertSentenceCount(text, 3);
 39 | 
 40 |         text = "삼성전자가 11월 13일 삼성전자 서초사옥 다목적홀 5층에서 진행되는 2013 S'데이 멤버십 블루 강연회 \" Challenge BLUE, 박찬호&이동우의 삶과 도전\" 멤버십 블루 고객 480명을 초청한다.강연회는 삼성전자 멤버십 블루 회원들을 위해 마련된 고객 혜택 행사로 한국인 최초의 메이저리거 박찬호와 시각장애 개그맨 이동우를 초청, 그들의 삶 속에서 펼쳐진 다양한 도전기를 들을 수 있도록 마련했다.";
 41 |         assertSentenceCount(text, 2);
 42 | 
 43 |         text = "삼성전자가 11월 13일 삼성전자 서초사옥 다목적홀 5층에서 진행되는 2013 S\"데이 멤버십 블루 강연회 \"Challenge BLUE, 박찬호&이동우의 삶과 도전\" 멤버십 블루 고객 480명을 초청한다.강연회는 삼성전자 멤버십 블루 회원들을 위해 마련된 고객 혜택 행사로 한국인 최초의 메이저리거 박찬호와 시각장애 개그맨 이동우를 초청, 그들의 삶 속에서 펼쳐진 다양한 도전기를 들을 수 있도록 마련했다.";
 44 |         assertSentenceCount(text, 2);
 45 | 
 46 |         text = "삼성전자가 11월 13일 삼성전자 서초사옥 다목적홀 5층에서 진행되는 2013 S\"데'이 멤버십 블루 강연회 \"Challenge BLUE, 박찬호&이동우의 삶과 도전\" 멤버십 블루 고객 480명을 초청한다.강연회는 삼성전자 멤버십 블루 회원들을 위해 마련된 고객 혜택 행사로 한국인 최초의 메이저리거 박찬호와 시각장애 개그맨 이동우를 초청, 그들의 삶 속에서 펼쳐진 다양한 도전기를 들을 수 있도록 마련했'다.";
 47 |         assertSentenceCount(text, 2);
 48 |     }
 49 | 
 50 |     /** Expects three sentences. */
 51 |     @Test
 52 |     public void testRealignment() {
 53 |         // NOTE(review): this input and expectation are identical to testSplit.
 54 |         String text = "회사 동료 분들과 다녀왔는데 분위기도 좋고 음식도 맛있었어요 다만, 강남 토끼정이 강남 쉑쉑버거 골목길로 쭉 올라가야 하는데 다들 쉑쉑버거의 유혹에 넘어갈 뻔 했답니다 강남역 맛집 토끼정의 외부 모습.";
 55 |         assertSentenceCount(text, 3);
 56 |     }
 57 | 
 58 |     /** Text that uses the double-quote character as an inch mark; expects seven sentences. */
 59 |     @Test
 60 |     public void testPrime() {
 61 |         String text = "TV산업은 1926년 흑백 TV 개발, 1954년 RCA사가 Color TV(21\") 양산/판매를 시작한 이래로 트리니트론 브라운관(1967년), 완전평면 브라운관(1996년) 개발 등 기술적인 발전을 거듭해 왔으나, 주요 국가 보급률이 90%를 넘어서면서 브라운관 TV사업의 성장은 정체되었습니다. 그러나 Flat Panel TV(LCD, PDP) 출시, 디지털 방송 확산(영/미 1998년~ )을 통해 TV 시장은 성장 모멘텀을 되찾았으며, FPTV는 화질, 디자인 등 제품 성능 향상과 지속적인 Set가격 하락을 통해 성장을 지속하며 기존 CRT 시장을 빠르게 대체하였습니다. 또한 2010년 입체감을 느낄 수 있는 3D TV가 출시되었고, 2011년부터 2012년에 걸쳐 인터넷 동영상 서비스 업체들의 부상과 스마트기기에 대한 사용자들의 관심 확대로 스마트 TV 시장이 태동하였습니다. 2013년에는 화질 및 해상도가 혁신적으로 높아진 UHD TV, 2014년에는 새로운 Form Factor인 Curved TV가 출시되었으며 2015년에는 퀀텀닷TV가 상용화되는 등 TV 시장은 끊임없이 진화하였습니다.전체 TV 수요는 2017년 기준 2억 1,510만대 수준으로 LCD TV 수요가 2억 1천만대로 99% 이상의 시장 점유를 이어 나갔으며, OLED 수요는 159만대로 성장하였으나 비중은 0.7%로 영향이 미미하였습니다. 2018년도 전체 TV 수요는 2억 2,100만대 이상을 기록하며 전년 대비 2.9% 성장하였습니다. 최근 TV시장은 고해상도 대형화면에 대한 Needs가 지속적으로 증가하여, UHD TV는 전년비 26% 증가한 99.6백만대로 시장 비중 45% 수준이 될 전망이며, 60\"이상 대형시장은 약 19.7백만대를 초과하여 전년비 35% 성장, 75\"이상 초대형 시장도 당사의 판매 드라이브로 전년비 76% 이상 성장이 전망되고 있습니다.";
 62 |         assertSentenceCount(text, 7);
 63 |     }
 64 | 
 65 |     /** NOTE(review): reuses the testSplit input, which contains no apostrophe. */
 66 |     @Test
 67 |     public void testApostrophe() {
 68 |         String text = "회사 동료 분들과 다녀왔는데 분위기도 좋고 음식도 맛있었어요 다만, 강남 토끼정이 강남 쉑쉑버거 골목길로 쭉 올라가야 하는데 다들 쉑쉑버거의 유혹에 넘어갈 뻔 했답니다 강남역 맛집 토끼정의 외부 모습.";
 69 |         assertSentenceCount(text, 3);
 70 |     }
 71 | 
 72 |     /** Text without any punctuation; expects six sentences. */
 73 |     @Test
 74 |     public void testJyo() {
 75 |         String text = "아무래도 그땐 그랬었죠 많이 힘들었으니까요 근데 이제는 괜찮아요 친구들이 많아졌어요 그때만 힘들었던거죠 이젠 괜찮아요";
 76 |         assertSentenceCount(text, 6);
 77 |     }
 78 | 
 79 |     /** Four inputs that must each come back as exactly one sentence. */
 80 |     @Test
 81 |     public void testEcEfCases() {
 82 |         String text = "국내에 판매하는 OEM 수입차의 판매량이 2017년까지 하락세를 보이다 지난해 반등했으며 수입차 대비 비중도 높아질 전망이다.";
 83 |         assertSentenceCount(text, 1);
 84 | 
 85 |         text = "전과 8범 A씨는 지난 17일 전자발찌를 끊고 도주하다 붙잡혀 전자발찌 부착기간이 2020년 8월14일까지 늘었다.";
 86 |         assertSentenceCount(text, 1);
 87 | 
 88 |         text = "국내에 판매하는 OEM 수입차의 판매량은 내년 보다 높아질 전망이다.";
 89 |         assertSentenceCount(text, 1);
 90 | 
 91 |         text = "개그맨 김병만 씨가 아르바이트를 하다 목숨을 잃을 뻔한 사연을 TV조선 '별별톡쇼'에서 전했다.";
 92 |         assertSentenceCount(text, 1);
 93 |     }
 94 | 
 95 |     /** Quoted spans, including a curly-open/straight-close pair and quotes nested in quotes. */
 96 |     @Test
 97 |     public void testQuotes() {
 98 |         String text = "연비테스트를 진행하면서 들었던 의문점인 ‘트립 컴퓨터 정보'에 대한 신뢰도 문제였다. 3대의 차량 모두 연료를 더 이상 들어가지 않을 때 까지 가득 주유한 뒤 193km를 이동했다.";
 99 |         assertSentenceCount(text, 2);
100 | 
101 |         text = "우리 팀 촬영 PD는 \"지금까지 탔던 차 중에 가장 편했다\"라고 말했다. 이런 쉐슬람 같은! 아니다.";
102 |         assertSentenceCount(text, 2);
103 | 
104 |         text = "한 시민은 \"코로나로 인해 '2020년'이란 시간은 멈춘 듯 하다\"고 말했다.";
105 |         assertSentenceCount(text, 1);
106 |     }
107 | }
108 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Korean Sentence Splitter
  2 | <img alt="latest version" src="https://img.shields.io/badge/latest%20version-2.6.1-blue.svg"/> <a href="https://github.com/sangdee/kss-java/blob/master/LICENSE"><img alt="BSD 3-Clause" src="https://img.shields.io/badge/license-BSD%203%20Clause-blue.svg"/></a>
  3 | 
  4 | Split Korean text into sentences using heuristic algorithm.
  5 | 
  6 | <br><br>
  7 | 
  8 | ## 1. Installation
  9 | - Maven
 10 | ```console
 11 | <dependency>
 12 |   <groupId>io.github.sangdee</groupId>
 13 |   <artifactId>kss-java</artifactId>
 14 |   <version>2.6.1</version>
 15 | </dependency>
 16 | ```
 17 | - Gradle
 18 | ```console
 19 | repositories {
 20 |     mavenCentral()
 21 | }
 22 | 
 23 | dependencies {
 24 |     implementation 'io.github.sangdee:kss-java:2.6.1'
 25 | }
 26 | ```
 27 | 
 28 | <br><br>
 29 | 
 30 | ## 2. Usage of `splitSentences`
 31 | 
 32 | ```java
 33 | ArrayList<String> splitSentences(
 34 |         String text,
 35 |         boolean useHeuristic,  //default = true
 36 |         boolean useQuotesBracketsProcessing, //default = true
 37 |         int maxRecoverStep, //default = 5
 38 |         int maxRecoverLength, // default = 20000
 39 |         int recoverStep //default = 0
 40 |     ) 
 41 | ```
 42 | 
 43 | ### 2.1. Split sentences with heuristic algorithm.
 44 | - `splitSentences` is the key method of Kss.
 45 | - You can segment text to sentences with this method.
 46 | 
 47 | ```java
 48 | import kss.Kss;
 49 | 
 50 | Kss kss = new Kss();
 51 | String text = "회사 동료 분들과 다녀왔는데 분위기도 좋고 음식도 맛있었어요 다만, 강남 토끼정이 강남 쉑쉑버거 골목길로 쭉 올라가야 하는데 다들 쉑쉑버거의 유혹에 넘어갈 뻔 했답니다 강남역 맛집 토끼정의 외부 모습.";
 52 | kss.splitSentences(text);
 53 | ```
 54 | ```java
 55 | ["회사 동료 분들과 다녀왔는데 분위기도 좋고 음식도 맛있었어요",
 56 |  "다만, 강남 토끼정이 강남 쉑쉑버거 골목길로 쭉 올라가야 하는데 다들 쉑쉑버거의 유혹에 넘어갈 뻔 했답니다",
 57 |  "강남역 맛집 토끼정의 외부 모습."]
 58 | ```
 59 | 
 60 | ### 2.2. Split sentences without heuristic algorithm.
 61 | - If your articles follow the punctuation rules relatively well, we recommend that you set `useHeuristic = false`. (default is `true`)
 62 | - In these cases, Kss segments text depending only on punctuation, so you can segment text much more safely.
 63 |   - Formal articles (Wiki, News, Essay, ...) : recommend `useHeuristic = false`
 64 |   - Informal articles (SNS, Blogs, Messages, ...) : recommend `useHeuristic = true`
 65 | 
 66 | ```java
 67 | import kss.Kss;
 68 | 
 69 | Kss kss = new Kss();
 70 | String text = "미리 예약을 할 수 있는 시스템으로 합리적인 가격에 여러 종류의 생선, 그리고 다양한 부위를 즐길 수 있기 때문이다. 계절에 따라 모둠회의 종류는 조금씩 달라지지만 자주 올려주는 참돔 마스까와는 특히 맛이 매우 좋다. 일반 모둠회도 좋지만 좀 더 특별한 맛을 즐기고 싶다면 특수 부위 모둠회를 추천한다 제철 생선 5~6가지 구성에 평소 접하지 못했던 부위까지 색다르게 즐길 수 있다.";
 71 | kss.splitSentences(text, false);  
 72 | ```
 73 | ```java
 74 | ["미리 예약을 할 수 있는 시스템으로 합리적인 가격에 여러 종류의 생선, 그리고 다양한 부위를 즐길 수 있기 때문이다.", 
 75 |  "계절에 따라 모둠회의 종류는 조금씩 달라지지만 자주 올려주는 참돔 마스까와는 특히 맛이 매우 좋다.", 
 76 |  "일반 모둠회도 좋지만 좀 더 특별한 맛을 즐기고 싶다면 특수 부위 모둠회를 추천한다 제철 생선 5~6가지 구성에 평소 접하지 못했던 부위까지 색다르게 즐길 수 있다."]
 77 | ```
 78 | 
 79 | ### 2.3. Brackets and quotation marks processing
 80 | - Kss provides a technique for not segmenting sentences enclosed in brackets (괄호) or quotation marks (따옴표).
 81 | 
 82 | ```java
 83 | import kss.Kss;
 84 | 
 85 | Kss kss = new Kss();
 86 | String text = "그가 말했다. '거기는 가지 마세요. 위험하니까요. 알겠죠?' 그러자 그가 말했다. 알겠어요.";
 87 | kss.splitSentences(text)
 88 |         
 89 | ["그가 말했다.","'거기는 가지 마세요. 위험하니까요. 알겠죠?' 그러자 그가 말했다.","알겠어요."]
 90 | ```
 91 | 
 92 | #### 2.3.1. Several options to optimize recursion
 93 | - However, this can cause problems when brackets and quotation marks are misaligned, and it was a chronic problem of Kss 1.x (C++ version).
 94 | - From Kss 2.xx, we provide a quotes and brackets calibration feature to solve this problem, but it uses recursion and has very poor time complexity O(2^n).
 95 | - So, we also provide several options to optimize recursion. You can save your precious time with these options.
 96 |   - The depth of the recursion can be modified through a parameter `maxRecoverStep`. (default is 5)
 97 |   - You can turn off calibration using the `maxRecoverLength` parameter. (default is 20,000)
 98 | 
 99 | ```java
100 | import kss.Kss;
101 | 
102 | Kss kss = new Kss();
103 | String text = "VERY_LONG_TEXT";
104 | 
105 | kss.splitSentences(text, true, true, 5);
106 | // you can adjust recursion depth using `maxRecoverStep` (default is 5)
107 | kss.splitSentences(text, true, true, 5, 20000);
108 | // you can turn it off when you input very long text using `maxRecoverLength` (default is 20000)
109 | ```
110 | 
111 | #### 2.3.2. Turn off brackets and quotation marks processing
112 | - You can also turn off brackets and quotation marks processing if you want.
113 | - Set `useQuotesBracketsProcessing = false` to turn it off.
114 | 
115 | ```java
116 | import kss.Kss;
117 | 
118 | Kss kss = new Kss();
119 | String text = "그가 말했다. (거기는 가지 마세요. 위험하니까요. 알겠죠?) 그러자 그가 말했다. 알겠어요.";
120 | 
121 | kss.splitSentences(text);
122 | ['그가 말했다.','(거기는 가지 마세요. 위험하니까요. 알겠죠?) 그러자 그가 말했다.','알겠어요.']
123 | 
124 | kss.splitSentences(text, true, false);
125 | ['그가 말했다.','(거기는 가지 마세요.','위험하니까요.','알겠죠?',') 그러자 그가 말했다.','알겠어요.']
126 | ```
127 | 
128 | <br><br>
129 | 
130 | ## 3. Usage of `splitChunks`
131 | ```java
132 |  ArrayList<ChunkWithIndex> splitChunks(
133 |         String text, 
134 |         int maxLength,
135 |         boolean overlap, //default = false
136 |         boolean useHeuristic, //default = true
137 |         boolean useQuotesBracketsProcessing,  //default = true
138 |         int maxRecoverStep,  //default = 5
139 |         int maxRecoverLength  //default = 20000
140 |     ) 
141 | ```
142 | 
143 | ### 3.1. Set maximum length of chunks via `maxLength`
144 | - `splitChunks` combines sentences into chunks of `maxLength` or less.
145 | - You can set the maximum length of one chunk to `maxLength`.
146 | 
147 | ```java
148 | import kss.Kss;
149 | 
150 | Kss kss = new Kss();
151 | String text = "NoSQL이라고 하는 말은 No 'English'라고 하는 말과 마찬가지다. 세상에는 영어 말고도 수많은 언어가 존재한다. MongoDB에서 사용하는 쿼리 언어와 CouchDB에서 사용하는 쿼리 언어는 서로 전혀 다르다. 그럼에도 이 두 쿼리 언어는 같은 NoSQL 카테고리에 속한다. 어쨌거나 SQL이 아니기 때문이다. 또한 NoSQL이 No RDBMS를 의미하지는 않는다. BerkleyDB같은 예외가 있기 때문이다. 그리고 No RDBMS가 NoSQL인 것도 아니다. SQL호환 레이어를 제공하는 KV-store라는 예외가 역시 존재한다. 물론 KV-store의 특징상 range query를 where절에 넣을 수 없으므로 완전한 SQL은 못 되고 SQL의 부분집합 정도를 제공한다.";
152 | kss.splitChunks(text, 128);
153 | ```
154 | ```java
155 | [ChunkWithIndex(start = 0, text = "NoSQL이라고 하는 말은 No 'English'라고 하는 말과 마찬가지다. 세상에는 영어 말고도 수많은 언어가 존재한다. MongoDB에서 사용하는 쿼리 언어와 CouchDB에서 사용하는 쿼리 언어는 서로 전혀 다르다."),
156 |  ChunkWithIndex(start = 124, text = "그럼에도 이 두 쿼리 언어는 같은 NoSQL 카테고리에 속한다. 어쨌거나 SQL이 아니기 때문이다. 또한 NoSQL이 No RDBMS를 의미하지는 않는다. BerkleyDB같은 예외가 있기 때문이다."),
157 |  ChunkWithIndex(start = 236, text = "그리고 No RDBMS가 NoSQL인 것도 아니다. SQL호환 레이어를 제공하는 KV-store라는 예외가 역시 존재한다."),
158 |  ChunkWithIndex(start = 305, text = "물론 KV-store의 특징상 range query를 where절에 넣을 수 없으므로 완전한 SQL은 못 되고 SQL의 부분집합 정도를 제공한다.")]
159 | ```
160 | 
161 | ### 3.2. Overlap sentences across chunks
162 | - If `overlap` is `true`, text will be chunked in a manner similar to a sliding window.
163 | - Each chunk allows for duplicate sentences if you turn this feature on.
164 | 
165 | ```java
166 | import kss.Kss;
167 | 
168 | Kss kss = new Kss();
169 | String text = "NoSQL이라고 하는 말은 No 'English'라고 하는 말과 마찬가지다. 세상에는 영어 말고도 수많은 언어가 존재한다. MongoDB에서 사용하는 쿼리 언어와 CouchDB에서 사용하는 쿼리 언어는 서로 전혀 다르다. 그럼에도 이 두 쿼리 언어는 같은 NoSQL 카테고리에 속한다. 어쨌거나 SQL이 아니기 때문이다. 또한 NoSQL이 No RDBMS를 의미하지는 않는다. BerkleyDB같은 예외가 있기 때문이다. 그리고 No RDBMS가 NoSQL인 것도 아니다. SQL호환 레이어를 제공하는 KV-store라는 예외가 역시 존재한다. 물론 KV-store의 특징상 range query를 where절에 넣을 수 없으므로 완전한 SQL은 못 되고 SQL의 부분집합 정도를 제공한다.";
170 | kss.splitChunks(text, 128, true, true); // text, maxLength, overlap, useHeuristic
171 | ```
172 | ```java
173 | [ChunkWithIndex(start = 0, text = "NoSQL이라고 하는 말은 No 'English'라고 하는 말과 마찬가지다. 세상에는 영어 말고도 수많은 언어가 존재한다. MongoDB에서 사용하는 쿼리 언어와 CouchDB에서 사용하는 쿼리 언어는 서로 전혀 다르다."),
174 |  ChunkWithIndex(start = 43, text = "세상에는 영어 말고도 수많은 언어가 존재한다. MongoDB에서 사용하는 쿼리 언어와 CouchDB에서 사용하는 쿼리 언어는 서로 전혀 다르다. 그럼에도 이 두 쿼리 언어는 같은 NoSQL 카테고리에 속한다."),
175 |  ChunkWithIndex(start = 69, text = "MongoDB에서 사용하는 쿼리 언어와 CouchDB에서 사용하는 쿼리 언어는 서로 전혀 다르다. 그럼에도 이 두 쿼리 언어는 같은 NoSQL 카테고리에 속한다. 어쨌거나 SQL이 아니기 때문이다."),
176 |  ChunkWithIndex(start = 124, text = "그럼에도 이 두 쿼리 언어는 같은 NoSQL 카테고리에 속한다. 어쨌거나 SQL이 아니기 때문이다. 또한 NoSQL이 No RDBMS를 의미하지는 않는다. BerkleyDB같은 예외가 있기 때문이다."),
177 |  ChunkWithIndex(start = 180, text = "또한 NoSQL이 No RDBMS를 의미하지는 않는다. BerkleyDB같은 예외가 있기 때문이다. 그리고 No RDBMS가 NoSQL인 것도 아니다. SQL호환 레이어를 제공하는 KV-store라는 예외가 역시 존재한다."),
178 |  ChunkWithIndex(start = 236, text = "그리고 No RDBMS가 NoSQL인 것도 아니다. SQL호환 레이어를 제공하는 KV-store라는 예외가 역시 존재한다. 물론 KV-store의 특징상 range query를 where절에 넣을 수 없으므로 완전한 SQL은 못 되고 SQL의 부분집합 정도를 제공한다.")]
179 | ```
180 | 
181 | ### 3.3. Use every options used in `splitSentences`
182 | - You can use EVERY option available in `splitSentences`.
183 | - For example, if you want to turn off the processing of quotation marks, you can set `useQuotesBracketsProcessing` the same way as in `splitSentences`.
184 | 
185 | ```java
186 | import kss.Kss;
187 | 
188 | Kss kss = new Kss();
189 | String text = "NoSQL이라고 하는 말은 No 'English'라고 하는 말과 마찬가지다. 세상에는 영어 말고도 수많은 언어가 존재한다. MongoDB에서 사용하는 쿼리 언어와 CouchDB에서 사용하는 쿼리 언어는 서로 전혀 다르다. 그럼에도 이 두 쿼리 언어는 같은 NoSQL 카테고리에 속한다. 어쨌거나 SQL이 아니기 때문이다. 또한 NoSQL이 No RDBMS를 의미하지는 않는다. BerkleyDB같은 예외가 있기 때문이다. 그리고 No RDBMS가 NoSQL인 것도 아니다. SQL호환 레이어를 제공하는 KV-store라는 예외가 역시 존재한다. 물론 KV-store의 특징상 range query를 where절에 넣을 수 없으므로 완전한 SQL은 못 되고 SQL의 부분집합 정도를 제공한다.";
190 | kss.splitChunks(text, 128, false, true, false); // text, maxLength, overlap, useHeuristic, useQuotesBracketsProcessing
191 | ```
192 | <br><br>
193 | 
194 | ## 4. References
195 | Kss is available in various programming languages.
196 | - [Java version (this repo, ver 2.6.1)](https://github.com/sangdee/kss-java) is based on [Kss 2.6.0](https://github.com/hyunwoongko/kss/blob/main/docs/UPDATE.md#kss-260) and will be updated to 3.xx in the future.
197 | - [Python version](https://github.com/hyunwoongko/kss) contains the most recent changes to Kss.
198 | - [C++ version (ver 1.3.1)](https://github.com/likejazz/korean-sentence-splitter) has the original implementation of Kss but is deprecated now.
199 | 


--------------------------------------------------------------------------------
/src/main/java/kss/rule/Rule.java:
--------------------------------------------------------------------------------
  1 | package kss.rule;
  2 | /*
  3 |  * Korean Sentence Splitter
  4 |  * Split Korean text into sentences using heuristic algorithm.
  5 |  *
  6 |  * Copyright (C) 2021 Sang-ji Lee <tkdwl06@gmail.com>
  7 |  * Copyright (C) 2021 Hyun-woong Ko <kevin.woong@tunib.ai> and Sang-Kil Park <skpark1224@hyundai.com>
  8 |  * All rights reserved.
  9 |  *
 10 |  * This software may be modified and distributed under the terms
 11 |  * of the BSD license.  See the LICENSE file for details.
 12 |  */
 13 | 
 14 | import kss.base.enumerate.Id;
 15 | import kss.base.enumerate.Stats;
 16 | import java.util.ArrayList;
 17 | import java.util.Arrays;
 18 | import java.util.HashMap;
 19 | import java.util.HashSet;
 20 | import java.util.List;
 21 | import java.util.Map;
 22 | import java.util.Set;
 23 | 
 24 | public class Rule {
 25 | 
 26 |     static List<String> top500 = Arrays
 27 |         .asList("가", "간", "갈", "갉", "감", "갔", "갖", "같", "갚", "개", "걔", "걘", "거", "건", "걷", "걸", "검",
 28 |             "겪", "곤", "골", "곪", "곱", "괴",
 29 |             "구", "군", "굵", "굶", "굼", "굽", "궤", "귑", "귓", "규", "균", "긁", "긋", "기", "길", "긺", "깊",
 30 |             "까", "깎", "깐", "깖", "깜", "깠",
 31 |             "깨", "깬", "깼", "꺼", "꺾", "껐", "껴", "꼈", "꼬", "꼽", "꽂", "꽤", "꾸", "꾼", "꿇", "꿈", "꿔",
 32 |             "꿨", "꿰", "뀌", "끈", "끊", "끌",
 33 |             "끎", "끓", "끔", "끼", "낀", "낌", "나", "낚", "난", "날", "낡", "남", "났", "낮", "내", "낸", "냄",
 34 |             "냅", "냈", "넓", "넘", "넣", "녹",
 35 |             "논", "놀", "놂", "높", "놓", "놔", "놨", "누", "눈", "눕", "눠", "늘", "늙", "늚", "늦", "닦", "단",
 36 |             "닫", "달", "닮", "닳", "담", "답",
 37 |             "닿", "대", "댄", "댐", "댔", "던", "덜", "덞", "덥", "덮", "데", "덴", "뎀", "뎄", "돈", "돋", "돌",
 38 |             "돕", "돼", "됐", "되", "된", "됨",
 39 |             "두", "둔", "둠", "뒀", "든", "듣", "들", "듦", "듬", "딛", "딪", "따", "딴", "땀", "땄", "땋", "땠",
 40 |             "떠", "떨", "떴", "떼", "뗀", "뗌",
 41 |             "뛰", "뜀", "뜨", "뜯", "뜸", "띄", "띈", "띔", "띠", "띤", "막", "만", "많", "말", "맑", "맒", "맞",
 42 |             "맡", "매", "맨", "맴", "맵", "맸",
 43 |             "맺", "먹", "멀", "멂", "메", "멘", "멨", "몬", "몰", "몲", "묵", "묶", "묻", "물", "묽", "묾", "뭍",
 44 |             "뭘", "민", "믿", "밀", "밂", "밈",
 45 |             "밉", "박", "받", "밝", "밟", "배", "밴", "뱀", "뱄", "뱉", "번", "벌", "벎", "벗", "베", "벤", "보",
 46 |             "볶", "본", "봄", "봤", "봬", "뵀",
 47 |             "뵈", "뵌", "분", "붇", "불", "붉", "붊", "붓", "붙", "비", "빈", "빌", "빎", "빔", "빚", "빤", "빨",
 48 |             "빪", "빻", "빼", "뺀", "뺌", "뺐",
 49 |             "뻗", "뻤", "뼜", "뿜", "삔", "삠", "사", "산", "살", "삵", "삶", "삼", "샀", "새", "샌", "샘", "샛",
 50 |             "샜", "서", "섞", "선", "섰", "세",
 51 |             "셈", "셌", "속", "솎", "솟", "숨", "쉬", "쉰", "쉼", "쉽", "시", "식", "싣", "싫", "싶", "싸", "싼",
 52 |             "쌈", "쌌", "쌓", "쌔", "쌨", "써",
 53 |             "썩", "썰", "썲", "썼", "쎄", "쏘", "쏜", "쏟", "쏨", "쏴", "쐈", "쑤", "쑨", "쓰", "쓴", "쓸", "쓺",
 54 |             "씀", "씌", "씐", "씹", "안", "앉",
 55 |             "않", "알", "앎", "앓", "암", "약", "얇", "얕", "얘", "얜", "언", "얹", "얻", "얼", "없", "엎", "엮",
 56 |             "연", "열", "엶", "옅", "옌", "옛",
 57 |             "오", "온", "옭", "옮", "옳", "옴", "와", "왔", "왜", "운", "울", "읊", "일", "읽", "잃", "입", "있",
 58 |             "잊", "자", "작", "잔", "잡", "잤",
 59 |             "잦", "재", "잰", "잼", "쟀", "쟤", "쟨", "적", "전", "절", "젊", "접", "젓", "져", "졌", "존", "졸",
 60 |             "졺", "좁", "좇", "좋", "주", "죽",
 61 |             "준", "줌", "줍", "줘", "줬", "쥐", "쥠", "지", "진", "질", "집", "짓", "짖", "짙", "짜", "짧", "짰",
 62 |             "째", "짼", "쨌", "쩐", "쩔", "쪄",
 63 |             "쪘", "쫀", "쬐", "쬠", "찌", "찍", "찐", "찜", "찝", "찢", "차", "찬", "참", "찼", "찾", "채", "챈",
 64 |             "챘", "쳐", "쳤", "추", "춘", "춤",
 65 |             "춥", "춰", "췄", "치", "친", "침", "캐", "캠", "캤", "커", "컸", "켜", "켠", "켬", "켰", "크", "큼",
 66 |             "키", "킨", "킴", "타", "탄", "탐",
 67 |             "탔", "터", "턺", "텁", "텨", "튀", "튄", "튐", "트", "튼", "틂", "틈", "파", "팔", "팜", "팠", "패",
 68 |             "팼", "퍼", "펌", "펐", "펴", "편",
 69 |             "폄", "폈", "푼", "품", "피", "핀", "핌", "하", "핥", "함", "해", "했", "헌", "휘", "희");
 70 | 
 71 |     static List<String> da = Arrays.asList("간", "같", "걔", "거", "건", "검", "곤", "곱",
 72 |         "구", "군", "굵", "길", "깊", "깐", "깠",
 73 |         "깬", "깼", "껐", "꼈", "꾼", "꿨",
 74 |         "끼", "낀", "난", "낡", "났", "낮", "낸", "냈", "넓",
 75 |         "논", "높", "놨", "눈", "늙", "늦", "단", "답",
 76 |         "닿", "댄", "댔", "덥", "덴", "뎄", "돈", "돼", "됐", "되", "된",
 77 |         "둔", "뒀", "딴", "땄", "땠", "떴", "뗀",
 78 |         "띈", "띤", "많", "맑", "맨", "맵", "맸",
 79 |         "멀", "멘", "멨", "몬", "묽", "민",
 80 |         "밉", "밝", "밴", "뱄", "번", "벤", "본", "봤", "뵀",
 81 |         "뵌", "분", "붉", "비", "빈", "빤", "뺀", "뺐",
 82 |         "뻤", "뼜", "삔", "산", "샀", "샌", "샜", "선", "섰",
 83 |         "셌", "쉰", "쉽", "시", "싶", "싼", "쌈", "쌌", "쌔", "쌨", "써",
 84 |         "썼", "쏜", "쐈", "쑨", "쓴",
 85 |         "않", "얇", "얕", "얘", "언", "없", "연", "옅",
 86 |         "옳", "왔", "운", "있", "작", "잔", "잤",
 87 |         "잦", "잰", "쟀", "쟤", "적", "전", "젊", "졌", "존", "좁", "좋",
 88 |         "준", "줍", "줬", "진", "짙", "짜", "짧", "짰", "짼", "쨌", "쩐",
 89 |         "쪘", "쫀", "찐", "차", "찬", "찼", "챈", "챘", "쳤", "춘",
 90 |         "춥", "췄", "친", "캤", "컸", "켠", "켰", "크", "킨", "탄",
 91 |         "탔", "튀", "튄", "트", "튼", "팠", "팼", "펐", "편",
 92 |         "폈", "푼", "핀", "했", "희");
 93 | 
 94 |     static List<String> yo = Arrays.asList("가", "감", "개", "걔", "걘", "괴",
 95 |         "까",
 96 |         "깨", "껴", "꿈", "꿔", "꿰",
 97 |         "끔", "낌", "나", "남", "내", "냄",
 98 |         "놂", "놔", "눠", "늚",
 99 |         "대", "댐", "데", "돼", "되", "됨",
100 |         "둠", "듦", "듬", "따", "땀", "떠", "떼",
101 |         "뜀", "뜸", "띔", "매",
102 |         "메",
103 |         "배", "베", "봬",
104 |         "뵈", "빎", "빔", "빪", "뺌",
105 |         "삠", "사", "삶", "삼", "새", "서", "세",
106 |         "셈", "싸", "쌈", "쌔", "써",
107 |         "썲", "쎄", "쏨", "쏴", "쓺", "씀",
108 |         "앎", "암", "얘", "얜", "엶",
109 |         "옮", "옴", "와", "왜", "자",
110 |         "재", "잼", "쟤", "쟨", "젊", "져", "졺",
111 |         "줌", "줘", "짜", "째", "쪄",
112 |         "쬠", "찜", "차", "채", "쳐", "춤",
113 |         "춰", "캐", "커", "켜", "켬", "킴", "타", "탐",
114 |         "터", "텨", "튐", "틈", "팜", "패", "퍼", "펌", "펴",
115 |         "폄", "품", "핌", "함", "해");
116 | 
117 |     static List<String> jyo = Arrays
118 |         .asList("가", "갉", "갔", "갖", "같", "갚", "개", "걔", "걷", "걸", "검", "겪", "골", "곪", "곱", "괴",
119 |             "굵", "굶", "굼", "굽", "긁", "긋", "길", "깊", "깎", "깠",
120 |             "깨", "깼", "꺾", "껐", "꼈", "꼽", "꽂", "꾸", "꿇", "꿨", "꿰", "뀌", "끊", "끌",
121 |             "끓", "끼", "낚", "날", "낡", "남", "났", "낮", "내", "냈", "넓", "넘", "넣", "녹",
122 |             "놀", "높", "놓", "놨", "누", "눕", "늙", "늦", "닦", "닫", "달", "닮", "닳", "답",
123 |             "닿", "대", "댔", "덜", "덥", "덮", "데", "뎄", "돋", "돌", "돕", "돼", "됐", "되",
124 |             "두", "뒀", "듣", "들", "딛", "딪", "따", "땄", "땋", "땠", "떨", "떴", "떼",
125 |             "뛰", "뜨", "뜯", "띄", "띠", "막", "많", "말", "맑", "맞", "맡", "매", "맵", "맸",
126 |             "맺", "먹", "멀", "메", "멨", "몰", "묵", "묶", "묻", "묽", "뭍", "믿", "밀",
127 |             "밉", "박", "받", "밝", "밟", "배", "뱄", "뱉", "벌", "벗", "베", "보", "볶", "봤", "봬", "뵀",
128 |             "뵈", "붇", "불", "붉", "붓", "붙", "비", "빌", "빚", "빨", "빻", "빼", "뺐",
129 |             "뻗", "뻤", "뼜", "사", "살", "삵", "샀", "새", "샌", "샛", "샜", "서", "섞", "섰", "세",
130 |             "셌", "속", "솎", "솟", "숨", "쉬", "쉽", "시", "식", "싣", "싫", "싶", "싸", "쌌", "쌓", "쌔", "쌨",
131 |             "써",
132 |             "썩", "썰", "썼", "쎄", "쏘", "쏟", "쏴", "쐈", "쑤", "쓰", "쓸", "씌", "씹", "앉",
133 |             "않", "알", "앓", "약", "얇", "얕", "얘", "얹", "얻", "얼", "없", "엎", "엮", "열", "옅", "옛",
134 |             "오", "온", "옭", "옮", "옳", "와", "왔", "울", "읊", "일", "읽", "잃", "입", "있", "잊", "자", "작",
135 |             "잡", "잤",
136 |             "잦", "재", "잰", "쟀", "쟤", "적", "절", "젊", "접", "젓", "졌", "졸", "좁", "좇", "좋", "주", "죽",
137 |             "줍", "줬", "쥐", "지", "질", "집", "짓", "짖", "짙", "짜", "짧", "짰", "째", "쨌", "쩔",
138 |             "쪘", "쬐", "찌", "찍", "찐", "찝", "찢", "차", "찼", "찾", "채", "챘", "쳐", "쳤", "추",
139 |             "춥", "춰", "췄", "치", "캐", "캤", "커", "컸", "켜", "켠", "켰", "크", "키", "타",
140 |             "탔", "터", "튀", "트", "파", "팔", "팠", "패", "팼", "펐", "펴",
141 |             "폈", "피", "하", "핥", "했", "휘", "희");
142 | 
143 |     static List<String> ham = Arrays
144 |         .asList("리", "절", "용", "편", "륭", "듯", "야", "족", "못", "끗", "안", "천",
145 |             "정", "각", "실", "소", "끔", "분", "이", "약");
146 | 
147 |     static List<String> um = Arrays
148 |         .asList("았", "었", "했", "없", "좋", "있", "웠", "였", "않", "같", "많", "겠", "찮", "났", "좁", "작", "싶",
149 |             "셨", "졌", "넓");
150 | 
151 |     static List<String> before = Arrays.asList(
152 |         // Particles (josa)
153 |         "이", "가", "에서", "은", "는", "을", "를", "도", "에", "게", "께", "한테", "로", "써",
154 |         "와", "과", "랑", "까지", "부터", "뿐", "만", "따라", "토록", "도록", "든지", "던지", "란",
155 |         "만큼", "만치", "때",
156 | 
157 |         // Adverbs
158 |         "너무", "잘", "못", "빨리", "매우", "몹시", "별로", "아까", "내일", "일찍", "금방",
159 |         "이미", "이리", "저리", "아니", "과연", "설마", "제발", "정말", "결코", "가득", "히",
160 | 
161 |         // Pronouns
162 |         "나", "저", "우리", "저희", "너", "너희", "당신", "그대", "그", "그녀", "분", "놈", "거", "것",
163 |         "여기", "저기", "쪽", "곳", "님"
164 |     );
165 |     public static Map<String, Integer> daValue = new HashMap<>() {{ // Id flag bits per syllable; registered in table under Stats.DA
166 |         put("갔", Id.PREV.getValue());
167 |         put("간", Id.PREV.getValue());
168 |         put("겠", Id.PREV.getValue());
169 |         put("겼", Id.PREV.getValue());
170 |         put("같", Id.PREV.getValue());
171 |         put("놨", Id.PREV.getValue());
172 |         put("녔", Id.PREV.getValue());
173 |         put("니", Id.PREV.getValue());
174 |         put("논", Id.PREV.getValue());
175 |         put("낸", Id.PREV.getValue());
176 |         put("냈", Id.PREV.getValue());
177 |         put("뒀", Id.PREV.getValue());
178 |         put("때", Id.PREV.getValue());
179 |         put("랐", Id.PREV.getValue());
180 |         put("럽", Id.PREV.getValue());
181 |         put("렵", Id.PREV.getValue());
182 |         put("렸", Id.PREV.getValue());
183 |         put("뤘", Id.PREV.getValue());
184 |         put("몄", Id.PREV.getValue());
185 |         put("밌", Id.PREV.getValue());
186 |         put("볐", Id.PREV.getValue());
187 |         put("볍", Id.PREV.getValue());
188 |         put("봤", Id.PREV.getValue());
189 |         put("섰", Id.PREV.getValue());
190 |         put("샜", Id.PREV.getValue());
191 |         put("셨", Id.PREV.getValue());
192 |         put("싼", Id.PREV.getValue());
193 |         put("싸", Id.PREV.getValue());
194 |         put("않", Id.PREV.getValue());
195 |         put("았", Id.PREV.getValue());
196 |         put("없", Id.PREV.getValue());
197 |         put("었", Id.PREV.getValue());
198 |         put("였", Id.PREV.getValue());
199 |         put("온", Id.PREV.getValue());
200 |         put("웠", Id.PREV.getValue());
201 |         put("이", Id.PREV.getValue());
202 |         put("인", Id.PREV.getValue());
203 |         put("있", Id.PREV.getValue());
204 |         put("진", Id.PREV.getValue());
205 |         put("졌", Id.PREV.getValue());
206 |         put("쳤", Id.PREV.getValue());
207 |         put("췄", Id.PREV.getValue());
208 |         put("챘", Id.PREV.getValue());
209 |         put("켰", Id.PREV.getValue());
210 |         put("켠", Id.PREV.getValue());
211 |         put("팠", Id.PREV.getValue());
212 |         put("펐", Id.PREV.getValue());
213 |         put("폈", Id.PREV.getValue());
214 |         put("했", Id.PREV.getValue());
215 |         put("혔", Id.PREV.getValue());
216 |         put("한", Id.NEXT.getValue());
217 |         put("가", Id.NEXT.getValue());
218 |         put("고", Id.NEXT.getValue() | Id.NEXT2.getValue());
219 |         put("는", Id.NEXT.getValue() | Id.NEXT2.getValue());
220 |         put("라", Id.NEXT.getValue());
221 |         put("시", Id.NEXT.getValue());
222 |         put("등", Id.NEXT.getValue());
223 |         put("던", Id.NEXT.getValue());
224 |         put("든", Id.NEXT.getValue());
225 |         put("지", Id.NEXT1.getValue() | Id.NEXT2.getValue());
226 |         put("를", Id.NEXT.getValue());
227 |         put("운", Id.NEXT.getValue());  // as in "~다운"
228 |         put("만", Id.NEXT.getValue());
229 |         put("며", Id.NEXT.getValue() | Id.NEXT2.getValue());
230 |         put("면", Id.NEXT.getValue() | Id.NEXT1.getValue() | Id.NEXT2.getValue());
231 |         put("서", Id.NEXT2.getValue());
232 |         put("싶", Id.PREV.getValue() | Id.NEXT.getValue());
233 |         put("죠", Id.NEXT.getValue());
234 |         put("죵", Id.NEXT.getValue());
235 |         put("쥬", Id.NEXT.getValue());
236 |         put("하", Id.NEXT1.getValue());
237 |         put("해", Id.NEXT1.getValue());
238 |         put("도", Id.NEXT2.getValue());
239 |         put("", Id.NONE.getValue());
240 |     }};
241 |     public static Map<String, Integer> yoValue = new HashMap<>() {{ // Id flag bits per syllable; registered in table under Stats.YO
242 |         put("겨", Id.PREV.getValue());
243 |         put("거", Id.PREV.getValue());
244 |         put("구", Id.PREV.getValue());
245 |         put("군", Id.PREV.getValue());
246 |         put("걸", Id.PREV.getValue());
247 |         put("까", Id.PREV.getValue());
248 |         put("께", Id.PREV.getValue());
249 |         put("껴", Id.PREV.getValue());
250 |         put("네", Id.PREV.getValue());
251 |         put("나", Id.PREV.getValue());
252 |         put("니", Id.PREV.getValue());
253 |         put("데", Id.PREV.getValue());
254 |         put("든", Id.PREV.getValue());
255 |         put("려", Id.PREV.getValue());
256 |         put("서", Id.PREV.getValue());
257 |         put("세", Id.PREV.getValue());
258 |         put("아", Id.PREV.getValue());
259 |         put("어", Id.PREV.getValue());
260 |         put("워", Id.PREV.getValue());
261 |         put("에", Id.PREV.getValue());
262 |         put("예", Id.PREV.getValue());
263 |         put("을", Id.PREV.getValue());
264 |         put("져", Id.PREV.getValue());
265 |         put("줘", Id.PREV.getValue());
266 |         put("지", Id.PREV.getValue());
267 |         put("춰", Id.PREV.getValue());
268 |         put("해", Id.PREV.getValue());
269 |         put("고", Id.NEXT2.getValue());
270 |         put("는", Id.NEXT.getValue());
271 |         put("라", Id.NEXT1.getValue());
272 |         put("등", Id.NEXT.getValue());
273 |         put("를", Id.NEXT.getValue());
274 |         put("즘", Id.NEXT.getValue());
275 |         put("소", Id.NEXT.getValue());
276 |         put("며", Id.NEXT2.getValue());
277 |         put("면", Id.PREV.getValue() | Id.NEXT2.getValue());
278 |         put("하", Id.NEXT1.getValue());
279 |         put("", Id.NONE.getValue());
280 |     }};
281 |     public static Map<String, Integer> jyoValue = new HashMap<>() {{ // Id flag bits per syllable; registered in table under Stats.JYO
282 |         put("거", Id.PREV.getValue());
283 |         put("가", Id.PREV.getValue());
284 |         put("갔", Id.PREV.getValue());
285 |         put("겠", Id.PREV.getValue());
286 |         put("같", Id.PREV.getValue());
287 |         put("놨", Id.PREV.getValue());
288 |         put("녔", Id.PREV.getValue());
289 |         put("냈", Id.PREV.getValue());
290 |         put("니", Id.PREV.getValue());
291 |         put("뒀", Id.PREV.getValue());
292 |         put("았", Id.PREV.getValue());
293 |         put("르", Id.PREV.getValue());
294 |         put("랐", Id.PREV.getValue());
295 |         put("럽", Id.PREV.getValue());
296 |         put("렵", Id.PREV.getValue());
297 |         put("렸", Id.PREV.getValue());
298 |         put("맞", Id.PREV.getValue());
299 |         put("몄", Id.PREV.getValue());
300 |         put("밌", Id.PREV.getValue());
301 |         put("볐", Id.PREV.getValue());
302 |         put("볍", Id.PREV.getValue());
303 |         put("봤", Id.PREV.getValue());
304 |         put("서", Id.PREV.getValue());
305 |         put("섰", Id.PREV.getValue());
306 |         put("셨", Id.PREV.getValue());
307 |         put("샜", Id.PREV.getValue());
308 |         put("않", Id.PREV.getValue());
309 |         put("없", Id.PREV.getValue());
310 |         put("었", Id.PREV.getValue());
311 |         put("였", Id.PREV.getValue());
312 |         put("이", Id.PREV.getValue());
313 |         put("졌", Id.PREV.getValue());
314 |         put("쳤", Id.PREV.getValue());
315 |         put("챘", Id.PREV.getValue());
316 |         put("켰", Id.PREV.getValue());
317 |         put("팠", Id.PREV.getValue());
318 |         put("폈", Id.PREV.getValue());
319 |         put("하", Id.PREV.getValue());
320 |         put("했", Id.PREV.getValue());
321 |         put("혔", Id.PREV.getValue());
322 |         put("고", Id.PREV.getValue() | Id.NEXT2.getValue());
323 |         put("는", Id.NEXT.getValue());
324 |         put("등", Id.NEXT.getValue());
325 |         put("라", Id.NEXT1.getValue());
326 |         put("를", Id.NEXT.getValue());
327 |         put("며", Id.NEXT2.getValue());
328 |         put("면", Id.PREV.getValue() | Id.NEXT2.getValue());
329 |         put("", Id.NONE.getValue());
330 |     }};
331 |     public static Map<String, Integer> umValue = new HashMap<>() {{ // Id flag bits per syllable; registered in table under Stats.UM
332 |         put("았", Id.PREV.getValue());
333 |         put("없", Id.PREV.getValue());
334 |         put("었", Id.PREV.getValue());
335 |         put("했", Id.PREV.getValue());
336 |         put("있", Id.PREV.getValue());
337 |         put("좋", Id.PREV.getValue());
338 |         put("웠", Id.PREV.getValue());
339 |         put("였", Id.PREV.getValue());
340 |         put("않", Id.PREV.getValue());
341 |         put("같", Id.PREV.getValue());
342 |         put("겠", Id.PREV.getValue());
343 |         put("봤", Id.PREV.getValue());
344 |         put("밌", Id.PREV.getValue());
345 |         put("많", Id.PREV.getValue());
346 |         put("찮", Id.PREV.getValue());
347 |         put("났", Id.PREV.getValue());
348 |         put("처", Id.PREV.getValue());
349 |         put("렸", Id.PREV.getValue());
350 |         put("졌", Id.PREV.getValue());
351 |         put("싶", Id.PREV.getValue());
352 |         put("이", Id.NEXT.getValue());
353 |         put("에", Id.NEXT.getValue());
354 |         put("악", Id.NEXT.getValue());
355 |         put("식", Id.NEXT.getValue());
356 |         put("을", Id.NEXT.getValue());
357 |         put("으", Id.NEXT.getValue());
358 |         put("부", Id.NEXT.getValue());
359 |         put("도", Id.NEXT.getValue());
360 |         put("은", Id.NEXT.getValue());
361 |         put("엔", Id.NEXT.getValue());
362 |         put("날", Id.NEXT.getValue());
363 |         put("료", Id.NEXT.getValue());
364 |         put("과", Id.NEXT.getValue());
365 |         put("의", Id.NEXT.getValue());
366 |         put("만", Id.NEXT.getValue());
367 |         put("보", Id.NEXT.getValue());
368 |         put("인", Id.NEXT.getValue());
369 |         put("속", Id.NEXT.getValue());
370 |         put("", Id.NONE.getValue());
371 |     }};
372 |     public static Map<String, Integer> hamValue = new HashMap<>() {{ // Id flag bits per syllable; registered in table under Stats.HAM
373 |         put("루", Id.PREV.getValue());
374 |         put("편", Id.PREV.getValue());
375 |         put("절", Id.PREV.getValue());
376 |         put("포", Id.PREV.getValue());
377 |         put("안", Id.PREV.getValue());
378 |         put("못", Id.PREV.getValue());
379 |         put("만", Id.PREV.getValue() | Id.NEXT.getValue());
380 |         put("족", Id.PREV.getValue());
381 |         put("야", Id.PREV.getValue());
382 |         put("치", Id.PREV.getValue());
383 |         put("결", Id.PREV.getValue());
384 |         put("수", Id.PREV.getValue());
385 |         put("각", Id.PREV.getValue());
386 |         put("끗", Id.PREV.getValue());
387 |         put("리", Id.PREV.getValue());
388 |         put("답", Id.PREV.getValue());
389 |         put("중", Id.PREV.getValue());
390 |         put("용", Id.PREV.getValue());
391 |         put("심", Id.PREV.getValue());
392 |         put("쾌", Id.PREV.getValue());
393 |         put("께", Id.NEXT.getValue());
394 |         put("이", Id.NEXT.getValue());
395 |         put("을", Id.NEXT.getValue());
396 |         put("과", Id.NEXT.getValue());
397 |         put("에", Id.NEXT.getValue());
398 |         put("은", Id.NEXT.getValue());
399 |         put("의", Id.NEXT.getValue());
400 |         put("도", Id.NEXT.getValue());
401 |         put("으", Id.NEXT.getValue());
402 |         put("되", Id.NEXT.getValue());
403 |         put("없", Id.NEXT.getValue());
404 |         put("부", Id.NEXT.getValue());
405 |         put("된", Id.NEXT.getValue());
406 |         put("정", Id.NEXT.getValue());
407 |         put("해", Id.NEXT.getValue());
408 |         put("한", Id.NEXT.getValue());
409 |         put("까", Id.NEXT.getValue());
410 |         put("축", Id.NEXT.getValue());
411 |         put("", Id.NONE.getValue());
412 |     }};
413 | 
414 |     public static Map<String, Integer> sbValue = new HashMap<>() {{ // Id flag bits per syllable; registered in table under Stats.SB
415 |         put("것", Id.PREV.getValue());
416 |         put("가", Id.PREV.getValue());
417 |         put("까", Id.PREV.getValue());
418 |         put("거", Id.PREV.getValue());
419 |         put("게", Id.PREV.getValue());
420 |         put("걸", Id.PREV.getValue());
421 |         put("껄", Id.PREV.getValue());
422 |         put("나", Id.PREV.getValue());
423 |         put("니", Id.PREV.getValue());
424 |         put("네", Id.PREV.getValue());
425 |         put("다", Id.PREV.getValue());
426 |         put("쎄", Id.PREV.getValue());
427 |         put("래", Id.PREV.getValue());
428 |         put("데", Id.PREV.getValue());
429 |         put("지", Id.PREV.getValue());
430 |         put("든", Id.PREV.getValue());
431 |         put("덩", Id.PREV.getValue());
432 |         put("등", Id.PREV.getValue());
433 |         put("랴", Id.PREV.getValue());
434 |         put("마", Id.PREV.getValue());
435 |         put("봐", Id.PREV.getValue());
436 |         put("서", Id.PREV.getValue());
437 |         put("아", Id.PREV.getValue());
438 |         put("어", Id.PREV.getValue());
439 |         put("오", Id.PREV.getValue());
440 |         put("요", Id.PREV.getValue());
441 |         put("을", Id.PREV.getValue());
442 |         put("자", Id.PREV.getValue());
443 |         put("죠", Id.PREV.getValue());
444 |         put("고", Id.NEXT2.getValue());
445 |         put("는", Id.NEXT.getValue());
446 |         put("라", Id.PREV.getValue() | Id.NEXT.getValue());
447 |         put("며", Id.NEXT2.getValue());
448 |         put("면", Id.NEXT2.getValue());
449 |         put("하", Id.NEXT1.getValue());
450 |         put("", Id.NONE.getValue());
451 |     }};
452 |     public static Map<String, Integer> commonValue = new HashMap<>() {{ // jamo and punctuation flagged CONT; registered in table under Stats.COMMON
453 |         put("ㄱ", Id.CONT.getValue());
454 |         put("ㄴ", Id.CONT.getValue());
455 |         put("ㄷ", Id.CONT.getValue());
456 |         put("ㄹ", Id.CONT.getValue());
457 |         put("ㅁ", Id.CONT.getValue());
458 |         put("ㅂ", Id.CONT.getValue());
459 |         put("ㅅ", Id.CONT.getValue());
460 |         put("ㅇ", Id.CONT.getValue());
461 |         put("ㅈ", Id.CONT.getValue());
462 |         put("ㅊ", Id.CONT.getValue());
463 |         put("ㅋ", Id.CONT.getValue());
464 |         put("ㅌ", Id.CONT.getValue());
465 |         put("ㅍ", Id.CONT.getValue());
466 |         put("ㅎ", Id.CONT.getValue());
467 |         put("ㅏ", Id.CONT.getValue());
468 |         put("ㅑ", Id.CONT.getValue());
469 |         put("ㅓ", Id.CONT.getValue());
470 |         put("ㅕ", Id.CONT.getValue());
471 |         put("ㅗ", Id.CONT.getValue());
472 |         put("ㅛ", Id.CONT.getValue());
473 |         put("ㅜ", Id.CONT.getValue());
474 |         put("ㅠ", Id.CONT.getValue());
475 |         put("ㅡ", Id.CONT.getValue());
476 |         put("ㅣ", Id.CONT.getValue());
477 |         put("^", Id.CONT.getValue());
478 |         put(";", Id.CONT.getValue());
479 |         put(".", Id.CONT.getValue());
480 |         put("?", Id.CONT.getValue());
481 |         put("!", Id.CONT.getValue());
482 |         put("~", Id.CONT.getValue());
483 |         put("…", Id.CONT.getValue());
484 |         put(",", Id.CONT.getValue());
485 |         put("", Id.NONE.getValue());
486 |     }};
487 |     public static Map<Integer, Map<String, Integer>> table = new HashMap<>() {{ // Stats value -> per-syllable flag map
488 |         put(Stats.DEFAULT.getValue(), new HashMap<>()); // DEFAULT starts with an empty flag map
489 |         put(Stats.DA.getValue(), daValue);
490 |         put(Stats.YO.getValue(), yoValue);
491 |         put(Stats.JYO.getValue(), jyoValue);
492 |         put(Stats.UM.getValue(), umValue);
493 |         put(Stats.HAM.getValue(), hamValue);
494 |         put(Stats.SB.getValue(), sbValue);
495 |         put(Stats.COMMON.getValue(), commonValue);
496 |     }};
497 | 
498 |     public static List<String> postProcessingDa = setPostProcessingDa(); // "<before> <stem>다" patterns
499 |     public static List<String> postProcessingYo = setPostProcessingYo(); // "<before> <stem>요" patterns
500 |     public static List<String> postProcessingJyo = setPostProcessingJyo(); // "<before> <stem>죠" patterns
501 |     public static List<String> postProcessingHam = setPostProcessingHam(); // "<before> <stem>함" patterns
502 |     public static List<String> postProcessingUm = setPostProcessingUm(); // "<before> <stem>음" patterns
503 | 
504 | 
505 |     /** Builds the {@code "<before> <stem>다"} patterns for every stem in {@code da}. */
506 |     private static ArrayList<String> setPostProcessingDa() {
507 |         return buildPatterns(da, "다");
508 |     }
509 | 
510 |     /** Builds the {@code "<before> <stem>요"} patterns for every stem in {@code yo}. */
511 |     private static List<String> setPostProcessingYo() {
512 |         return buildPatterns(yo, "요");
513 |     }
514 | 
515 |     /** Builds the {@code "<before> <stem>죠"} patterns for every stem in {@code jyo}. */
516 |     private static List<String> setPostProcessingJyo() {
517 |         return buildPatterns(jyo, "죠");
518 |     }
519 | 
520 |     /** Builds the {@code "<before> <stem>함"} patterns for every stem in {@code ham}. */
521 |     private static List<String> setPostProcessingHam() {
522 |         return buildPatterns(ham, "함");
523 |     }
524 | 
525 |     /** Builds the {@code "<before> <stem>음"} patterns for every stem in {@code um}. */
526 |     private static List<String> setPostProcessingUm() {
527 |         return buildPatterns(um, "음");
528 |     }
529 | 
530 |     /**
531 |      * Combines every entry of {@code before} with every stem and appends the ending.
532 |      *
533 |      * <p>Each pattern is {@code lead + " " + stem + ending}. Patterns are collected in a
534 |      * {@link HashSet} so duplicates collapse; the returned list therefore follows the
535 |      * set's iteration order, not the declaration order of the inputs.
536 |      *
537 |      * <p>Shared by the five {@code setPostProcessing*} initialisers above.
538 |      *
539 |      * @param stems  syllables that directly precede the ending
540 |      * @param ending sentence-final syllable appended to every stem
541 |      * @return a new mutable list holding each distinct pattern once
542 |      */
543 |     private static ArrayList<String> buildPatterns(Iterable<String> stems, String ending) {
544 |         Set<String> patterns = new HashSet<>();
545 |         // Stems stay in the outer loop so the insertion order matches the five
546 |         // hand-written loops this helper replaces.
547 |         for (String stem : stems) {
548 |             for (String lead : before) {
549 |                 patterns.add(lead + " " + stem + ending);
550 |             }
551 |         }
552 |         return new ArrayList<>(patterns);
553 |     }
554 | }
555 | 


--------------------------------------------------------------------------------
/src/main/java/kss/core/Backend.java:
--------------------------------------------------------------------------------
  1 | package kss.core;
  2 | /*
  3 |  * Korean Sentence Splitter
  4 |  * Split Korean text into sentences using heuristic algorithm.
  5 |  *
  6 |  * Copyright (C) 2021 Sang-ji Lee <tkdwl06@gmail.com>
  7 |  * Copyright (C) 2021 Hyun-woong Ko <kevin.woong@tunib.ai> and Sang-Kil Park <skpark1224@hyundai.com>
  8 |  * All rights reserved.
  9 |  *
 10 |  * This software may be modified and distributed under the terms
 11 |  * of the BSD license.  See the LICENSE file for details.
 12 |  */
 13 | 
 14 | import static kss.base.Base.doPushPopSymbol;
 15 | import static kss.base.Base.doTrimSentPushResults;
 16 | import static kss.base.Const.bracket;
 17 | import static kss.base.Const.bracketCloseToOpen;
 18 | import static kss.base.Const.bracketOpenToClose;
 19 | import static kss.base.Const.doubleQuotes;
 20 | import static kss.base.Const.doubleQuotesCloseToOpen;
 21 | import static kss.base.Const.doubleQuotesOpenToClose;
 22 | import static kss.base.Const.punctuation;
 23 | import static kss.base.Const.singleQuotes;
 24 | import static kss.base.Const.singleQuotesCloseToOpen;
 25 | import static kss.base.Const.singleQuotesOpenToClose;
 26 | import static kss.rule.Rule.commonValue;
 27 | import static kss.rule.Rule.postProcessingDa;
 28 | import static kss.rule.Rule.postProcessingHam;
 29 | import static kss.rule.Rule.postProcessingJyo;
 30 | import static kss.rule.Rule.postProcessingUm;
 31 | import static kss.rule.Rule.postProcessingYo;
 32 | import static kss.rule.Rule.table;
 33 | import static kss.util.IntToBool.intToBool;
 34 | 
 35 | import kss.base.BackupManager;
 36 | import kss.base.ChunkWithIndex;
 37 | import kss.base.SentenceIndex;
 38 | import kss.base.enumerate.Id;
 39 | import kss.base.enumerate.Stats;
 40 | import java.util.ArrayList;
 41 | import java.util.Arrays;
 42 | import java.util.Collections;
 43 | import java.util.LinkedList;
 44 | import java.util.List;
 45 | 
 46 | 
 47 | public class Backend {
 48 | 
 49 |     /**
 50 |      * Splits the text on both sides of a quote character and rejoins the two sentences
 51 |      * that touch the quote into a single sentence.
 52 |      *
 53 |      * <p>{@code text} before {@code lastQuotePos} and {@code text} after it are each passed
 54 |      * to {@code splitSentences} (with {@code useStrip = false}). The last sentence of the
 55 |      * first half, {@code quoteType} and the first sentence of the second half are
 56 |      * concatenated; the character at {@code lastQuotePos} itself is not copied. A half that
 57 |      * yields no sentences contributes an empty string instead of raising an exception.
 58 |      *
 59 |      * @param text text that contains the quote
 60 |      * @param lastQuotePos index in {@code text} at which the text is cut
 61 |      * @param quoteType string placed between the two rejoined sentences
 62 |      * @param useHeuristic forwarded unchanged to {@code splitSentences}
 63 |      * @param useQuotesBracketsProcessing forwarded unchanged to {@code splitSentences}
 64 |      * @param maxRecoverStep forwarded unchanged to {@code splitSentences}
 65 |      * @param maxRecoverLength forwarded unchanged to {@code splitSentences}
 66 |      * @param recoverStep forwarded unchanged to {@code splitSentences}
 67 |      * @return sentences before the quote, the rejoined sentence, then sentences after it
 68 |      * @throws StringIndexOutOfBoundsException if {@code lastQuotePos} is negative or not
 69 |      *     smaller than {@code text.length()}
 70 |      */
 71 |     public List<String> realignByQuote(
 72 |         String text,
 73 |         int lastQuotePos,
 74 |         String quoteType,
 75 |         boolean useHeuristic,
 76 |         boolean useQuotesBracketsProcessing,
 77 |         int maxRecoverStep,
 78 |         int maxRecoverLength,
 79 |         int recoverStep
 80 |     ) {
 81 |         List<String> head = splitSentences(text.substring(0, lastQuotePos), useHeuristic,
 82 |             useQuotesBracketsProcessing, maxRecoverStep, maxRecoverLength, recoverStep, false);
 83 |         List<String> tail = splitSentences(text.substring(lastQuotePos + 1), useHeuristic,
 84 |             useQuotesBracketsProcessing, maxRecoverStep, maxRecoverLength, recoverStep, false);
 85 | 
 86 |         // Guard on isEmpty(): subList(0, -1) or subList(1, 0) on an empty half would throw.
 87 |         String beforeLast = head.isEmpty() ? "" : head.get(head.size() - 1);
 88 |         String afterFirst = tail.isEmpty() ? "" : tail.get(0);
 89 | 
 90 |         List<String> results = new ArrayList<>();
 91 |         if (head.size() > 1) {
 92 |             results.addAll(head.subList(0, head.size() - 1));
 93 |         }
 94 |         results.add(beforeLast + quoteType + afterFirst);
 95 |         if (tail.size() > 1) {
 96 |             results.addAll(tail.subList(1, tail.size()));
 97 |         }
 98 |         return results;
 99 |     }
100 | 
101 |     public List<String> lindexSplit(String text, List<Integer> indices) {
102 |         List<Integer> args = new ArrayList<>();
103 |         args.add(0);
104 | 
105 |         for (Integer data : indices) {
106 |             args.add(data + 1);
107 |         }
108 |         args.add(text.length() + 1);
109 | 
110 |         List<List<Integer>> zipped = new ArrayList<>();
111 |         for (int i = 0; i < args.size(); i++) {
112 |             if (i != args.size() - 1) {
113 |                 List<Integer> newList = new ArrayList<>();
114 |                 newList.add(args.get(i));
115 |                 newList.add(args.get(i + 1));
116 |                 zipped.add(newList);
117 | 
118 |             }
119 |         }
120 |         List<String> newList = new ArrayList<>();
121 |         for (List<Integer> zip : zipped) {
122 |             newList.add(text.substring(zip.get(0), zip.get(1) - 1));
123 |         }
124 |         return newList;
125 |     }
126 | 
127 |     /**
128 |      * Returns the end offset (start + {@code sub.length()}) of each non-overlapping match of
129 |      * {@code sub} in {@code aStr}; an empty {@code sub} yields no matches instead of looping.
130 |      */
131 |     public List<Integer> findAll(String aStr, String sub) {
132 |         List<Integer> ends = new ArrayList<>();
133 |         if (sub.isEmpty()) {
134 |             return ends;
135 |         }
136 |         for (int at = aStr.indexOf(sub); at != -1; at = aStr.indexOf(sub, at + sub.length())) {
137 |             ends.add(at + sub.length());
138 |         }
139 |         return ends;
140 |     }
141 | 
142 |     /**
143 |      * Re-splits each sentence at the offsets {@code findAll} reports for the given patterns;
144 |      * sentences containing a quote or bracket symbol are passed to {@code lindexSplit} with
145 |      * no split offsets.
146 |      */
147 |     public List<String> postProcessing(List<String> results, List<String> postProcessingList) {
148 |         // The symbol list is the same for every sentence, so build it once up front.
149 |         List<String> quotes = new ArrayList<>(singleQuotes);
150 |         quotes.addAll(doubleQuotes);
151 |         quotes.addAll(bracket);
152 | 
153 |         List<String> finalResults = new ArrayList<>();
154 |         for (String res : results) {
155 |             List<Integer> splitIdx = new ArrayList<>();
156 |             if (quotes.stream().noneMatch(res::contains)) {
157 |                 for (String post : postProcessingList) {
158 |                     // NOTE(review): `post + 1` searches for the pattern followed by the
159 |                     // literal character '1'; confirm that this is the intended needle.
160 |                     if (res.contains(post)) {
161 |                         splitIdx.addAll(findAll(res, post + 1));
162 |                     }
163 |                 }
164 |             }
165 |             Collections.sort(splitIdx);
166 |             finalResults.addAll(lindexSplit(res, splitIdx));
167 |         }
168 |         return finalResults;
169 |     }
170 | 
171 |     ArrayList<String> endPoint = setEndPoint(); // splitSentences backs up a 다/요/죠/함/음 pair only when the following character is NOT in this list
172 |     ArrayList<String> needToReplaceZwsp = setNeedToReplaceZwsp(); // splitSentences wraps each of these symbols in U+200B on both sides
173 | 
174 |     private ArrayList<String> setEndPoint() {
175 |         // Order: single quotes, double quotes, brackets, punctuation, " ", "",
176 |         // then every key of commonValue.
177 |         ArrayList<String> symbols = new ArrayList<>(singleQuotes);
178 |         symbols.addAll(doubleQuotes);
179 |         symbols.addAll(bracket);
180 |         symbols.addAll(punctuation);
181 |         Collections.addAll(symbols, " ", "");
182 |         symbols.addAll(commonValue.keySet());
183 |         return symbols;
184 |     }
185 | 
186 |     private ArrayList<String> setNeedToReplaceZwsp() {
187 |         // Single quotes first, then double quotes, then brackets.
188 |         ArrayList<String> wrapped = new ArrayList<>(singleQuotes);
189 |         wrapped.addAll(doubleQuotes);
190 |         wrapped.addAll(bracket);
191 |         return wrapped;
192 |     }
193 | 
194 |     public ArrayList<String> splitSentences(
195 |         String text,
196 |         boolean useHeuristic,
197 |         boolean useQuotesBracketsProcessing,
198 |         int maxRecoverStep,
199 |         int maxRecoverLength,
200 |         int recoverStep,
201 |         boolean useStrip
202 |     ) {
203 | 
204 |         if (text.length() > maxRecoverLength) {
205 |             maxRecoverStep = 0;
206 |         }
207 | 
208 |         text = text.replace("\u200b", "");
209 |         BackupManager backupManager = new BackupManager();
210 | 
211 |         LinkedList<String> doubleQuoteStack = new LinkedList<>();
212 |         LinkedList<String> singleQuoteStack = new LinkedList<>();
213 |         LinkedList<String> bracketStack = new LinkedList<>();
214 |         List<String> tests = Arrays.asList("다", "요", "죠", "함", "음");
215 | 
216 |         for (int i = 0; i < text.length(); i++) {
217 |             String ch = Character.toString(text.charAt(i));
218 |             if (tests.contains(ch)) {
219 |                 if (i != text.length() - 1) {
220 |                     if (!endPoint.contains(Character.toString(text.charAt(i + 1)))) {
221 |                         String targetToBackup = ch + text.charAt(i + 1);
222 |                         backupManager.addItem2Dict(
223 |                             targetToBackup,
224 |                             String.valueOf(Math.abs(targetToBackup.hashCode()))
225 |                         );
226 |                     }
227 |                 }
228 |             }
229 |         }
230 | 
231 |         text = backupManager.backup(text);
232 | 
233 |         for (String s : needToReplaceZwsp) {
234 |             text = text.replace(s, String.format("\u200b%s\u200b", s));
235 |         }
236 | 
237 |         String prev = "";
238 |         String curSentence = "";
239 |         List<String> results = new ArrayList<>();
240 |         int curStat = Stats.DEFAULT.getValue();
241 | 
242 |         int lastSingleQuotePos = 0;
243 |         int lastDoubleQuotePos = 0;
244 |         int lastBracketPos = 0;
245 | 
246 |         String singleQuotePop = "'";
247 |         String doubleQuotePop = "\"";
248 |         String bracketPoP = " ";
249 | 
250 |         for (int i = 0; i < text.length(); i++) {
251 |             List<String> code = Arrays.asList(".", "!", "?");
252 |             String ch = Character.toString(text.charAt(i));
253 | 
254 |             if (curStat == Stats.DEFAULT.getValue()) {
255 |                 if (doubleQuotes.contains(ch)) {
256 |                     if (useQuotesBracketsProcessing) {
257 |                         if (doubleQuotesOpenToClose.containsKey(ch)) {
258 |                             doubleQuotePop = doPushPopSymbol(
259 |                                 doubleQuoteStack,
260 |                                 doubleQuotesOpenToClose.get(ch),
261 |                                 ch
262 |                             );
263 |                         } else {
264 |                             doubleQuotePop = doPushPopSymbol(
265 |                                 doubleQuoteStack,
266 |                                 doubleQuotesCloseToOpen.get(ch),
267 |                                 ch
268 |                             );
269 |                         }
270 |                         lastDoubleQuotePos = i;
271 |                     }
272 |                 } else if (singleQuotes.contains(ch)) {
273 |                     if (useQuotesBracketsProcessing) {
274 |                         if (singleQuotesOpenToClose.containsKey(ch)) {
275 |                             singleQuotePop = doPushPopSymbol(
276 |                                 singleQuoteStack,
277 |                                 singleQuotesOpenToClose.get(ch),
278 |                                 ch
279 |                             );
280 |                         } else {
281 |                             singleQuotePop = doPushPopSymbol(
282 |                                 singleQuoteStack,
283 |                                 singleQuotesCloseToOpen.get(ch),
284 |                                 ch
285 |                             );
286 |                         }
287 |                         lastSingleQuotePos = i;
288 |                     }
289 |                 } else if (bracket.contains(ch)) {
290 |                     if (useQuotesBracketsProcessing) {
291 |                         if (bracketOpenToClose.containsKey(ch)) {
292 |                             bracketPoP = doPushPopSymbol(
293 |                                 bracketStack,
294 |                                 bracketOpenToClose.get(ch),
295 |                                 ch
296 |                             );
297 |                         } else {
298 |                             bracketPoP = doPushPopSymbol(
299 |                                 bracketStack,
300 |                                 bracketCloseToOpen.get(ch),
301 |                                 ch
302 |                             );
303 |                         }
304 |                         lastBracketPos = i;
305 |                     }
306 |                 } else if (code.contains(ch)) {
307 |                     if (doubleQuoteStack.isEmpty()
308 |                         && singleQuoteStack.isEmpty()
309 |                         && bracketStack.isEmpty()
310 |                         && intToBool(table.get(Stats.SB.getValue()).getOrDefault(prev, 0)
311 |                         & Id.PREV.getValue())) {
312 |                         curStat = Stats.SB.getValue();
313 |                     }
314 |                 }
315 | 
316 |                 if (useHeuristic) {
317 |                     if (ch.equals("다")) {
318 | 
319 |                         if (doubleQuoteStack.isEmpty()
320 |                             && singleQuoteStack.isEmpty()
321 |                             && bracketStack.isEmpty()
322 |                             && intToBool(
323 |                             table.get(Stats.DA.getValue()).getOrDefault(prev, 0) & Id.PREV
324 |                                 .getValue())) {
325 |                             curStat = Stats.DA.getValue();
326 |                         }
327 |                     }
328 | 
329 |                     if (ch.equals("요")) {
330 |                         if (doubleQuoteStack.isEmpty()
331 |                             && singleQuoteStack.isEmpty()
332 |                             && bracketStack.isEmpty()
333 |                             && intToBool(table.get(Stats.YO.getValue()).getOrDefault(prev, 0)
334 |                             & Id.PREV.getValue())) {
335 |                             curStat = Stats.YO.getValue();
336 |                         }
337 |                     }
338 |                     if (ch.equals("죠")) {
339 |                         if (doubleQuoteStack.isEmpty()
340 |                             && singleQuoteStack.isEmpty()
341 |                             && bracketStack.isEmpty()
342 |                             && intToBool(table.get(Stats.JYO.getValue()).getOrDefault(prev, 0)
343 |                             & Id.PREV.getValue())) {
344 |                             curStat = Stats.JYO.getValue();
345 |                         }
346 |                     }
347 |                     if (ch.equals("함")) {
348 |                         if (doubleQuoteStack.isEmpty()
349 |                             && singleQuoteStack.isEmpty()
350 |                             && bracketStack.isEmpty()
351 |                             && intToBool(table.get(Stats.HAM.getValue()).getOrDefault(prev, 0)
352 |                             & Id.PREV.getValue())) {
353 |                             curStat = Stats.HAM.getValue();
354 |                         }
355 |                     }
356 |                     if (ch.equals("음")) {
357 |                         if (doubleQuoteStack.isEmpty()
358 |                             && singleQuoteStack.isEmpty()
359 |                             && bracketStack.isEmpty()
360 |                             && intToBool(table.get(Stats.UM.getValue()).getOrDefault(prev, 0)
361 |                             & Id.PREV.getValue())) {
362 |                             curStat = Stats.UM.getValue();
363 |                         }
364 |                     }
365 |                 }
366 |             } else {
367 |                 if (doubleQuotes.contains(ch)) {
368 |                     lastDoubleQuotePos = i;
369 |                 } else if (singleQuotes
370 |                     .contains(ch)) {
371 |                     lastSingleQuotePos = i;
372 |                 } else if (bracket.contains(ch)) {
373 |                     lastBracketPos = i;
374 |                 }
375 | 
376 |                 boolean endIf = false;
377 |                 if (!endIf) {
378 |                     if (ch.equals(" ")
379 |                         || (intToBool(table.get(Stats.COMMON.getValue())
380 |                         .getOrDefault(ch, 0) & Id.CONT.getValue()))) {
381 | 
382 |                         if (intToBool(table.get(curStat)
383 |                             .getOrDefault(prev, 0) & Id.NEXT1.getValue())) {
384 | 
385 |                             curSentence = doTrimSentPushResults(
386 |                                 curSentence,
387 |                                 results
388 |                             );
389 | 
390 |                             curSentence += prev;
391 |                             curStat = Stats.DEFAULT.getValue();
392 |                         }
393 |                         endIf = true;
394 |                     }
395 |                 }
396 |                 if (!endIf) {
397 |                     if (intToBool(table.get(curStat)
398 |                         .getOrDefault(ch, 0) & Id.NEXT
399 |                         .getValue())) {
400 | 
401 |                         if (intToBool(table.get(curStat).getOrDefault(prev, 0) & Id.NEXT1
402 |                             .getValue())) {
403 |                             curSentence += prev;
404 |                         }
405 |                         curStat = Stats.DEFAULT.getValue();
406 |                         endIf = true;
407 |                     }
408 |                 }
409 |                 if (!endIf) {
410 |                     if (intToBool(table.get(curStat)
411 |                         .getOrDefault(ch, 0) & Id.NEXT1
412 |                         .getValue())) {
413 | 
414 |                         if (intToBool(table.get(curStat).getOrDefault(prev, 0)
415 |                             & Id.NEXT1.getValue())) {
416 | 
417 |                             curSentence = doTrimSentPushResults(
418 |                                 curSentence,
419 |                                 results
420 |                             );
421 | 
422 |                             curSentence += prev;
423 |                             curStat = Stats.DEFAULT.getValue();
424 |                         }
425 |                         endIf = true;
426 |                     }
427 |                 }
428 | 
429 |                 if (!endIf) {
430 |                     if (intToBool(table.get(curStat)
431 |                         .getOrDefault(ch, 0)
432 |                         & Id.NEXT2
433 |                         .getValue())) {
434 |                         if (intToBool(table.get(curStat).getOrDefault(prev, 0) & Id.NEXT1
435 |                             .getValue())) {
436 |                             curSentence += prev;
437 |                         } else {
438 |                             curSentence = doTrimSentPushResults(curSentence,
439 |                                 results);
440 |                         }
441 |                         curStat = Stats.DEFAULT.getValue();
442 |                         endIf = true;
443 |                     }
444 |                 }
445 |                 if (!endIf) {
446 |                     if (!intToBool(table.get(curStat).getOrDefault(ch, 0))
447 |                         || intToBool(table.get(curStat).getOrDefault(ch, 0) & Id.PREV.getValue())) {
448 | 
449 |                         curSentence = doTrimSentPushResults(
450 |                             curSentence,
451 |                             results
452 |                         );
453 | 
454 |                         if (intToBool(table.get(curStat).getOrDefault(prev, 0)
455 |                             & Id.NEXT1.getValue())) {
456 |                             curSentence += prev;
457 |                         }
458 | 
459 |                         curStat = Stats.DEFAULT.getValue();
460 | 
461 |                         if (bracket.contains(ch)) {
462 |                             if (useQuotesBracketsProcessing) {
463 |                                 if (bracketOpenToClose.containsKey(ch)) {
464 | 
465 |                                     bracketPoP = doPushPopSymbol(
466 |                                         bracketStack,
467 |                                         bracketOpenToClose.get(ch),
468 |                                         ch
469 |                                     );
470 | 
471 |                                 } else {
472 |                                     bracketPoP = doPushPopSymbol(
473 |                                         bracketStack,
474 |                                         bracketCloseToOpen.get(ch),
475 |                                         ch
476 |                                     );
477 |                                 }
478 |                                 lastBracketPos = i;
479 |                             }
480 |                         } else if (doubleQuotes.contains(ch)) {
481 |                             if (useQuotesBracketsProcessing) {
482 |                                 if (doubleQuotesOpenToClose.containsKey(ch)) {
483 |                                     doubleQuotePop = doPushPopSymbol(
484 |                                         doubleQuoteStack,
485 |                                         doubleQuotesOpenToClose.get(ch),
486 |                                         ch
487 |                                     );
488 |                                 } else {
489 |                                     doubleQuotePop = doPushPopSymbol(
490 |                                         doubleQuoteStack,
491 |                                         doubleQuotesCloseToOpen.get(ch),
492 |                                         ch
493 |                                     );
494 |                                 }
495 |                                 lastDoubleQuotePos = i;
496 |                             }
497 |                         } else if (singleQuotes.contains(ch)) {
498 |                             if (useQuotesBracketsProcessing) {
499 |                                 if (singleQuotesOpenToClose.containsKey(ch)) {
500 |                                     singleQuotePop = doPushPopSymbol(
501 |                                         singleQuoteStack,
502 |                                         singleQuotesOpenToClose.get(ch),
503 |                                         ch
504 |                                     );
505 |                                 } else {
506 |                                     singleQuotePop = doPushPopSymbol(
507 |                                         singleQuoteStack,
508 |                                         singleQuotesCloseToOpen.get(ch),
509 |                                         ch
510 |                                     );
511 |                                 }
512 |                                 lastSingleQuotePos = i;
513 |                             }
514 |                         }
515 |                         endIf = true;
516 |                     }
517 |                 }
518 |             }
519 | 
520 |             if (curStat == Stats.DEFAULT.getValue()
521 |                 || !intToBool(
522 |                 (table.get(curStat).getOrDefault(ch, 0) & Id.NEXT1.getValue())
523 |             )) {
524 |                 curSentence += ch;
525 |             }
526 | 
527 |             prev = ch;
528 |         }
529 | 
530 |         if (!curSentence.isEmpty()) {
531 |             curSentence = doTrimSentPushResults(curSentence, results);
532 |         }
533 |         if (
534 |             intToBool(
535 |                 table.get(curStat).getOrDefault(prev, 0) & Id.NEXT1.getValue()
536 |             )) {
537 |             curSentence += prev;
538 |             doTrimSentPushResults(curSentence, results);
539 |         }
540 | 
541 |         if (useHeuristic) {
542 |             if (text.contains("다 ")) {
543 |                 results = postProcessing(results, postProcessingDa);
544 |             }
545 |             if (text.contains("요 ")) {
546 |                 results = postProcessing(results, postProcessingYo);
547 |             }
548 |             if (text.contains("죠 ")) {
549 |                 results = postProcessing(results, postProcessingJyo);
550 |             }
551 |             if (text.contains("함 ")) {
552 |                 results = postProcessing(results, postProcessingHam);
553 |             }
554 |             if (text.contains("음 ")) {
555 |                 results = postProcessing(results, postProcessingUm);
556 |             }
557 |         }
558 |         if (singleQuoteStack.size() != 0 && recoverStep < maxRecoverStep) {
559 |             results = realignByQuote(
560 |                 text,
561 |                 lastSingleQuotePos,
562 |                 singleQuotePop,
563 |                 useHeuristic,
564 |                 useQuotesBracketsProcessing,
565 |                 maxRecoverStep,
566 |                 maxRecoverLength,
567 |                 recoverStep + 1
568 |             );
569 |         }
570 |         if (doubleQuoteStack.size() != 0 && recoverStep < maxRecoverStep) {
571 |             results = realignByQuote(
572 |                 text,
573 |                 lastDoubleQuotePos,
574 |                 doubleQuotePop,
575 |                 useHeuristic,
576 |                 useQuotesBracketsProcessing,
577 |                 maxRecoverStep,
578 |                 maxRecoverLength,
579 |                 recoverStep + 1
580 |             );
581 |         }
582 |         if (bracketStack.size() != 0 && recoverStep < maxRecoverStep) {
583 |             results = realignByQuote(
584 |                 text,
585 |                 lastBracketPos,
586 |                 bracketPoP,
587 |                 useHeuristic,
588 |                 useQuotesBracketsProcessing,
589 |                 maxRecoverStep,
590 |                 maxRecoverLength,
591 |                 recoverStep + 1
592 |             );
593 |         }
594 | 
595 |         ArrayList<String> resultList = new ArrayList<>();
596 | 
597 |         for (String s : results) {
598 |             s = backupManager.restore(s);
599 |             s = s.replace("\u200b", "");
600 |             resultList.add(useStrip ? s.strip() : s);
601 |         }
602 | 
603 |         results.addAll(resultList);
604 |         return resultList;
605 |     }
606 | 
607 |     public ArrayList<SentenceIndex> splitSentencesIndex(
608 |         String text,
609 |         boolean useHeuristic,
610 |         boolean useQuotesBracketsProcessing,
611 |         int maxRecoverStep,
612 |         int maxRecoverLength
613 |     ) {
614 |         ArrayList<String> sentences = splitSentences(
615 |             text,
616 |             useHeuristic,
617 |             useQuotesBracketsProcessing,
618 |             maxRecoverStep,
619 |             maxRecoverLength,
620 |             0,
621 |             true
622 |         );
623 | 
624 |         ArrayList<SentenceIndex> sentenceIndexes = new ArrayList<>();
625 |         int offset = 0;
626 | 
627 |         for (String sentence : sentences) {
628 |             sentenceIndexes.add(new SentenceIndex(offset + text.indexOf(sentence),
629 |                 offset + text.indexOf(sentence) + sentence.length()));
630 | 
631 |             offset += text.indexOf(sentence) + sentence.length();
632 |             text = text.substring(text.indexOf(sentence) + sentence.length());
633 |         }
634 |         return sentenceIndexes;
635 |     }
636 | 
637 |     public ArrayList<ChunkWithIndex> splitChunks(
638 |         String text,
639 |         int maxLength,
640 |         boolean overlap,
641 |         boolean useHeuristic,
642 |         boolean useQuotesBracketsProcessing,
643 |         int maxRecoverStep,
644 |         int maxRecoverLength
645 |     ) {
646 | 
647 |         ArrayList<SentenceIndex> span = new ArrayList<>();
648 |         ArrayList<ChunkWithIndex> chunks = new ArrayList<>();
649 | 
650 |         ArrayList<SentenceIndex> indices = splitSentencesIndex(
651 |             text,
652 |             useHeuristic,
653 |             useQuotesBracketsProcessing,
654 |             maxRecoverStep,
655 |             maxRecoverLength
656 |         );
657 | 
658 |         for (SentenceIndex index : indices) {
659 |             if (span.size() > 0) {
660 |                 if (index.getEnd() - span.get(0).getStart() > maxLength) {
661 |                     chunks.add(getChunkWithIndex(span, text));
662 |                     if (overlap) {
663 |                         double halfSpanSize = span.size() / 2.0;
664 |                         span = new ArrayList<>(span.subList(
665 |                             (int) (halfSpanSize - (halfSpanSize % 1)),
666 |                             span.size()
667 |                         ));
668 | 
669 |                     } else {
670 |                         span = new ArrayList<>();
671 |                     }
672 |                 }
673 |             }
674 |             span.add(index);
675 |         }
676 | 
677 |         chunks.add(getChunkWithIndex(span, text));
678 |         return chunks;
679 |     }
680 | 
681 |     public ChunkWithIndex getChunkWithIndex(
682 |         List<SentenceIndex> span,
683 |         String text) {
684 |         int start = span.get(0).getStart();
685 |         int end = span.get(span.size() - 1).getEnd();
686 | 
687 |         return new ChunkWithIndex(
688 |             span.get(0).getStart(),
689 |             text.substring(start, end)
690 |         );
691 |     }
692 | }
693 | 


--------------------------------------------------------------------------------