├── .gradle └── 1.6 │ └── taskArtifacts │ ├── cache.properties.lock │ ├── cache.properties │ ├── fileHashes.bin │ ├── fileSnapshots.bin │ ├── taskArtifacts.bin │ └── outputFileStates.bin ├── settings.gradle ├── .gitignore ├── korean-analyzer-3.x ├── src │ ├── main │ │ ├── java │ │ │ ├── org │ │ │ │ └── apache │ │ │ │ │ └── lucene │ │ │ │ │ └── analysis │ │ │ │ │ └── kr │ │ │ │ │ ├── dic │ │ │ │ │ ├── cj.dic │ │ │ │ │ ├── prefix.dic │ │ │ │ │ ├── compounds.dic │ │ │ │ │ ├── occurrence.dic │ │ │ │ │ ├── suffix.dic │ │ │ │ │ └── josa.dic │ │ │ │ │ ├── AttributeWrapper.java │ │ │ │ │ ├── morph │ │ │ │ │ ├── NounProperty.java │ │ │ │ │ ├── Status.java │ │ │ │ │ ├── WSOuputComparator.java │ │ │ │ │ ├── WSCandidateComparator.java │ │ │ │ │ ├── MorphException.java │ │ │ │ │ ├── CompoundEntry.java │ │ │ │ │ ├── WSAOutput.java │ │ │ │ │ ├── MorphAnalyzerManager.java │ │ │ │ │ ├── AnalysisOutputComparator.java │ │ │ │ │ ├── SpaceOutput.java │ │ │ │ │ ├── WordEntry.java │ │ │ │ │ └── WSOutput.java │ │ │ │ │ ├── korean.properties │ │ │ │ │ ├── utils │ │ │ │ │ ├── UnhandledException.java │ │ │ │ │ ├── UnmodifiableIterator.java │ │ │ │ │ ├── HanjaUtils.java │ │ │ │ │ ├── ConstraintUtil.java │ │ │ │ │ ├── KoreanEnv.java │ │ │ │ │ └── JarResources.java │ │ │ │ │ ├── KoreanFilter.java │ │ │ │ │ ├── KoreanTokenizer.java │ │ │ │ │ ├── KoreanTokenizerImpl.java │ │ │ │ │ └── KoreanAnalyzer.java │ │ │ └── com │ │ │ │ └── tistory │ │ │ │ └── devyongsik │ │ │ │ ├── analyzer │ │ │ │ ├── dictionary │ │ │ │ │ ├── stop.txt │ │ │ │ │ ├── compounds.txt │ │ │ │ │ ├── synonym.txt │ │ │ │ │ └── DictionaryType.java │ │ │ │ ├── Engine.java │ │ │ │ ├── dictionary.properties │ │ │ │ ├── KoreanNounFilter.java │ │ │ │ ├── KoreanAnalyzer.java │ │ │ │ ├── KoreanStopFilter.java │ │ │ │ ├── DictionaryProperties.java │ │ │ │ ├── KoreanCompoundNounEngine.java │ │ │ │ ├── dictionaryindex │ │ │ │ │ └── SynonymDictionaryIndex.java │ │ │ │ ├── KoreanBaseNounEngine.java │ │ │ │ └── KoreanLongestNounEngine.java │ │ 
│ │ └── utils │ │ │ │ └── NounDictionaryDuplWordRemover.java │ │ └── resources │ │ │ └── logback.groovy │ └── test │ │ └── java │ │ └── com │ │ └── tistory │ │ └── devyongsik │ │ └── analyzer │ │ ├── DictionaryPropertiesTest.java │ │ ├── dictionary │ │ └── DictionaryFactoryTest.java │ │ ├── util │ │ ├── TestToken.java │ │ └── AnalyzerTestUtil.java │ │ ├── AnalyzerTest.java │ │ ├── KoreanStopFilterTest.java │ │ ├── KoreanMorphEngineTest.java │ │ ├── KoreanSynonymEngineTest.java │ │ ├── KoreanCompoundNounEngineTest.java │ │ └── KoreanCharacterTokenizerTest.java ├── deploy_to_local_repo.sh ├── README ├── NOTICE.txt └── build.gradle ├── korean-analyzer-4.x ├── src │ ├── main │ │ ├── java │ │ │ ├── org │ │ │ │ └── apache │ │ │ │ │ └── lucene │ │ │ │ │ └── analysis │ │ │ │ │ └── kr │ │ │ │ │ ├── dic │ │ │ │ │ ├── cj.dic │ │ │ │ │ ├── prefix.dic │ │ │ │ │ ├── compounds.dic │ │ │ │ │ ├── occurrence.dic │ │ │ │ │ ├── suffix.dic │ │ │ │ │ └── josa.dic │ │ │ │ │ ├── AttributeWrapper.java │ │ │ │ │ ├── korean.properties │ │ │ │ │ ├── KoreanFilter.java │ │ │ │ │ ├── KoreanTokenizer.java │ │ │ │ │ ├── utils │ │ │ │ │ ├── UnmodifiableIterator.java │ │ │ │ │ ├── UnhandledException.java │ │ │ │ │ ├── HanjaUtils.java │ │ │ │ │ ├── ConstraintUtil.java │ │ │ │ │ └── KoreanEnv.java │ │ │ │ │ ├── morph │ │ │ │ │ ├── NounProperty.java │ │ │ │ │ ├── MorphException.java │ │ │ │ │ ├── MorphAnalyzerManager.java │ │ │ │ │ ├── Status.java │ │ │ │ │ ├── WSOuputComparator.java │ │ │ │ │ ├── AnalysisOutputComparator.java │ │ │ │ │ ├── WSCandidateComparator.java │ │ │ │ │ ├── CompoundEntry.java │ │ │ │ │ ├── WSAOutput.java │ │ │ │ │ ├── WordEntry.java │ │ │ │ │ ├── SpaceOutput.java │ │ │ │ │ └── WSOutput.java │ │ │ │ │ ├── KoreanTokenizerImpl.java │ │ │ │ │ └── KoreanAnalyzer.java │ │ │ └── com │ │ │ │ └── tistory │ │ │ │ └── devyongsik │ │ │ │ └── analyzer │ │ │ │ ├── dictionary │ │ │ │ ├── stop.txt │ │ │ │ ├── compounds.txt │ │ │ │ ├── synonym.txt │ │ │ │ └── DictionaryType.java │ │ │ │ ├── 
dictionary.properties │ │ │ │ ├── Engine.java │ │ │ │ ├── ComparableState.java │ │ │ │ ├── KoreanNounFilter.java │ │ │ │ ├── KoreanAnalyzer.java │ │ │ │ ├── KoreanStopFilter.java │ │ │ │ ├── DictionaryProperties.java │ │ │ │ ├── KoreanCompoundNounEngine.java │ │ │ │ ├── dictionaryindex │ │ │ │ └── SynonymDictionaryIndex.java │ │ │ │ ├── KoreanBaseNounEngine.java │ │ │ │ └── KoreanLongestNounEngine.java │ │ └── resources │ │ │ └── logback.groovy │ └── test │ │ └── java │ │ └── com │ │ └── tistory │ │ └── devyongsik │ │ └── analyzer │ │ ├── DictionaryPropertiesTest.java │ │ ├── dictionary │ │ └── DictionaryFactoryTest.java │ │ ├── util │ │ ├── TestToken.java │ │ └── AnalyzerTestUtil.java │ │ ├── AnalyzerTest.java │ │ ├── KoreanStopFilterTest.java │ │ ├── KoreanMorphEngineTest.java │ │ ├── KoreanSynonymEngineTest.java │ │ ├── KoreanCompoundNounEngineTest.java │ │ └── KoreanCharacterTokenizerTest.java ├── deploy_to_local_repo.sh ├── README.md ├── README ├── NOTICE.txt └── build.gradle └── README.md /.gradle/1.6/taskArtifacts/cache.properties.lock: -------------------------------------------------------------------------------- 1 |  -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | include "korean-analyzer-3.x", "korean-analyzer-4.x" -------------------------------------------------------------------------------- /.gradle/1.6/taskArtifacts/cache.properties: -------------------------------------------------------------------------------- 1 | #Thu Sep 05 23:49:40 KST 2013 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | .settings 3 | .project 4 | .classpath 5 | bin 6 | .gradle 7 | build 8 | -------------------------------------------------------------------------------- 
/korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/dic/cj.dic: -------------------------------------------------------------------------------- 1 | ################### 2 | 金融:금융 -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/dic/cj.dic: -------------------------------------------------------------------------------- 1 | ################### 2 | 金融:금융 -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/com/tistory/devyongsik/analyzer/dictionary/stop.txt: -------------------------------------------------------------------------------- 1 | 꼭 2 | 잘 3 | nbsp 4 | the 5 | . -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/com/tistory/devyongsik/analyzer/dictionary/stop.txt: -------------------------------------------------------------------------------- 1 | 꼭 2 | 잘 3 | nbsp 4 | the 5 | . 
-------------------------------------------------------------------------------- /.gradle/1.6/taskArtifacts/fileHashes.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/need4spd/lucene-Korean-Analyzer/HEAD/.gradle/1.6/taskArtifacts/fileHashes.bin -------------------------------------------------------------------------------- /.gradle/1.6/taskArtifacts/fileSnapshots.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/need4spd/lucene-Korean-Analyzer/HEAD/.gradle/1.6/taskArtifacts/fileSnapshots.bin -------------------------------------------------------------------------------- /.gradle/1.6/taskArtifacts/taskArtifacts.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/need4spd/lucene-Korean-Analyzer/HEAD/.gradle/1.6/taskArtifacts/taskArtifacts.bin -------------------------------------------------------------------------------- /.gradle/1.6/taskArtifacts/outputFileStates.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/need4spd/lucene-Korean-Analyzer/HEAD/.gradle/1.6/taskArtifacts/outputFileStates.bin -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/dic/prefix.dic: -------------------------------------------------------------------------------- 1 | #### 2 | 최 3 | 고 4 | 남 5 | 여 6 | 비 7 | 유 8 | 무 9 | 군 10 | 각 11 | 기 -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/dic/prefix.dic: -------------------------------------------------------------------------------- 1 | #### 2 | 최 3 | 고 4 | 남 5 | 여 6 | 비 7 | 유 8 | 무 9 | 군 10 | 각 11 | 기 
-------------------------------------------------------------------------------- /korean-analyzer-3.x/deploy_to_local_repo.sh: -------------------------------------------------------------------------------- 1 | #mvn -DaltDeploymentRepository=snapshot-repo::default::file:../need4spd-maven-repo/snapshots clean deploy 2 | gradle uploadArchives 3 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/deploy_to_local_repo.sh: -------------------------------------------------------------------------------- 1 | #mvn -DaltDeploymentRepository=snapshot-repo::default::file:../need4spd-maven-repo/snapshots clean deploy 2 | gradle uploadArchives 3 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/com/tistory/devyongsik/analyzer/dictionary/compounds.txt: -------------------------------------------------------------------------------- 1 | 컴퓨터공략:컴퓨터,공략 2 | 일본어공부:일본어,공부 3 | 스프링프로그래밍공부:스프링,프로그래밍,공부 4 | 랑콤아이크림:랑콤,아이크림 5 | 월드컵조직위원회분과위:월드컵,조직,위원회,분과위 -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/com/tistory/devyongsik/analyzer/dictionary/compounds.txt: -------------------------------------------------------------------------------- 1 | 컴퓨터공략:컴퓨터,공략 2 | 일본어공부:일본어,공부 3 | 스프링프로그래밍공부:스프링,프로그래밍,공부 4 | 랑콤아이크림:랑콤,아이크림 5 | 월드컵조직위원회분과위:월드컵,조직,위원회,분과위 -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/dic/compounds.dic: -------------------------------------------------------------------------------- 1 | ################### 2 | 밤하늘:밤,하늘 3 | 경전철:경,전철 4 | 가서명:가,서명 5 | 가입국:가,입국 6 | 갓김치:갓,김치 7 | 과소비:과,소비 8 | 고투자율:고투자,투자,투자율 -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/dic/compounds.dic: 
-------------------------------------------------------------------------------- 1 | ################### 2 | 밤하늘:밤,하늘 3 | 경전철:경,전철 4 | 가서명:가,서명 5 | 가입국:가,입국 6 | 갓김치:갓,김치 7 | 과소비:과,소비 8 | 고투자율:고투자,투자,투자율 -------------------------------------------------------------------------------- /korean-analyzer-4.x/README.md: -------------------------------------------------------------------------------- 1 | lucene-Korean-Analyzer_4x 2 | ========================= 3 | 4 | lucene korean analyzer for lucene4.x 5 | 6 | 7 | more information in https://github.com/need4spd/lucene-Korean-Analyzer 8 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/AttributeWrapper.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr; 2 | 3 | 4 | @Deprecated 5 | public class AttributeWrapper { 6 | private AttributeWrapper() { 7 | 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/AttributeWrapper.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr; 2 | 3 | 4 | @Deprecated 5 | public class AttributeWrapper { 6 | private AttributeWrapper() { 7 | 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | lucene-Korean-Analyzer 2 | ====================== 3 | 4 | Lucene Analyzer For Korean 5 | 6 | 이수명님의 Analyzer(http://cafe.naver.com/korlucene)를 형태소분석 Filter로 변형하여 사용하고 있으며 7 | 직접 개발한 동의어 Filter, 복합명사 Filter등을 추가로 붙여 개발한 루씬용 한글 분석기 입니다. 8 | 9 | 루씬3.X, 루씬4.X 두가지 버전이 있으며, 3.X버전은 추가 업데이트는 없고 현재는 10 | 4.X 버전에 대해서만 업데이트를 진행하고 있습니다. 
11 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/com/tistory/devyongsik/analyzer/dictionary.properties: -------------------------------------------------------------------------------- 1 | compounds.txt = com/tistory/devyongsik/analyzer/dictionary/compounds.txt 2 | custom.txt = com/tistory/devyongsik/analyzer/dictionary/custom.txt 3 | stop.txt = com/tistory/devyongsik/analyzer/dictionary/stop.txt 4 | synonym.txt = com/tistory/devyongsik/analyzer/dictionary/synonym.txt -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/morph/NounProperty.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.morph; 2 | 3 | /** 4 | * 명사의 유형별 분류 5 | * @author user 6 | * 7 | */ 8 | public class NounProperty { 9 | 10 | // 위치, 장소 11 | public static final String NP_LOCATION = "L"; 12 | 13 | // 물리 측정량 (속도, 각도) 14 | public static final String NP_MEASURE = "M"; 15 | 16 | 17 | } 18 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/com/tistory/devyongsik/analyzer/dictionary/synonym.txt: -------------------------------------------------------------------------------- 1 | 오라클,oracle 2 | db,database,데이터베이스 3 | 노트북,노트북pc,노트북컴퓨터,노트북피씨,notebook 4 | 튜닝,tunning 5 | sql,쿼리 6 | 서버,server 7 | 이클립스,eclipse 8 | 배너,banner 9 | 우리벤처,우리벤쳐 10 | 데이타,데이터,data 11 | 모델링,modeling 12 | 평,평형 13 | 코롱,코오롱 14 | 엔유씨,nuc 15 | 아디다스,adidas 16 | 필라,fila,휠라 17 | 테팔,tefal 18 | 캐논,canon 19 | 니콘,nikon 20 | 코원,cowon 21 | 론,lone -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/com/tistory/devyongsik/analyzer/dictionary/synonym.txt: -------------------------------------------------------------------------------- 1 | 오라클,oracle 2 | db,database,데이터베이스 3 | 
노트북,노트북pc,노트북컴퓨터,노트북피씨,notebook 4 | 튜닝,tunning 5 | sql,쿼리 6 | 서버,server 7 | 이클립스,eclipse 8 | 배너,banner 9 | 우리벤처,우리벤쳐 10 | 데이타,데이터,data 11 | 모델링,modeling 12 | 평,평형 13 | 코롱,코오롱 14 | 엔유씨,nuc 15 | 아디다스,adidas 16 | 필라,fila,휠라 17 | 테팔,tefal 18 | 캐논,canon 19 | 니콘,nikon 20 | 코원,cowon 21 | 론,lone -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/test/java/com/tistory/devyongsik/analyzer/DictionaryPropertiesTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import junit.framework.Assert; 4 | 5 | import org.junit.Test; 6 | 7 | public class DictionaryPropertiesTest { 8 | 9 | @Test 10 | public void propertiesLoad() { 11 | DictionaryProperties dp = DictionaryProperties.getInstance(); 12 | Assert.assertNotNull(dp); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/com/tistory/devyongsik/analyzer/Engine.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.util.List; 4 | import java.util.Map; 5 | 6 | import org.apache.lucene.util.AttributeSource; 7 | 8 | public interface Engine { 9 | void collectNounState(AttributeSource attributeSource, List comparableStateList, Map returnedTokens) throws Exception; 10 | } 11 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/test/java/com/tistory/devyongsik/analyzer/DictionaryPropertiesTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import junit.framework.Assert; 4 | 5 | import org.junit.Test; 6 | 7 | public class DictionaryPropertiesTest { 8 | 9 | @Test 10 | public void propertiesLoad() { 11 | DictionaryProperties dp = DictionaryProperties.getInstance(); 12 | 
Assert.assertNotNull(dp); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/com/tistory/devyongsik/analyzer/Engine.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.util.Map; 4 | import java.util.Stack; 5 | 6 | import org.apache.lucene.util.AttributeSource; 7 | import org.apache.lucene.util.AttributeSource.State; 8 | 9 | public interface Engine { 10 | void collectNounState(AttributeSource attributeSource, Stack nounsStack, Map returnedTokens) throws Exception; 11 | } 12 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/com/tistory/devyongsik/analyzer/dictionary.properties: -------------------------------------------------------------------------------- 1 | compounds.txt = com/tistory/devyongsik/analyzer/dictionary/compounds.txt 2 | custom.txt = com/tistory/devyongsik/analyzer/dictionary/custom.txt 3 | eomi_josa.txt = com/tistory/devyongsik/analyzer/dictionary/eomi_josa.txt 4 | noun.txt = com/tistory/devyongsik/analyzer/dictionary/noun.txt 5 | stop.txt = com/tistory/devyongsik/analyzer/dictionary/stop.txt 6 | synonym.txt = com/tistory/devyongsik/analyzer/dictionary/synonym.txt -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/test/java/com/tistory/devyongsik/analyzer/dictionary/DictionaryFactoryTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer.dictionary; 2 | 3 | import java.util.List; 4 | 5 | import junit.framework.Assert; 6 | 7 | import org.junit.Test; 8 | 9 | public class DictionaryFactoryTest { 10 | 11 | @Test 12 | public void loadDictionary() { 13 | DictionaryFactory factory = DictionaryFactory.getFactory(); 14 | List readWords = factory.getSynonymList(); 15 | 16 | 
Assert.assertTrue(readWords.size() > 0); 17 | 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/test/java/com/tistory/devyongsik/analyzer/dictionary/DictionaryFactoryTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer.dictionary; 2 | 3 | import java.util.List; 4 | 5 | import junit.framework.Assert; 6 | 7 | import org.junit.Test; 8 | 9 | public class DictionaryFactoryTest { 10 | 11 | @Test 12 | public void loadDictionary() { 13 | DictionaryFactory factory = DictionaryFactory.getFactory(); 14 | List readWords = factory.getSynonymList(); 15 | 16 | Assert.assertTrue(readWords.size() > 0); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/README: -------------------------------------------------------------------------------- 1 | Lucene Korean Analyzer 2 | 3 | 1. 루씬의 Analyzer를 활용한 한글 명사 추출 모듈입니다. 4.2.1 버전을 기반으로 개발 되어있습니다. 4 | 5 | 2. 키워드를 추출하는 방법은 크게 형태소분석과 사전기반의 키워드탐색으로 나누어집니다. 6 | 7 | 3. 형태소분석은 이수명님께서 개발하신 한글형태소분석 Analyzer를 4.2.1버전으로 변환하여 사용하고 있습니다. 8 | (http://cafe.naver.com/korlucene) 9 | 10 | 4. 사전기반의 키워드탐색은 명사사전 (기본사전, 사용자정의 사전)을 사용하여, 들어온 문장을 탐색하여 명사를 찾아냅니다. 11 | 12 | 5. 그외 동의어필터, 스테머필터, 불용어필터등이 사전기반으로 작동하도록 되어있습니다. 13 | 14 | 6. 형태소 분석 테스트는 아래의 페이지에서 해보실 수 있습니다. 15 | 16 | http://ec2-54-248-115-161.ap-northeast-1.compute.amazonaws.com/morphMain.devys 17 | 18 | 더 자세한 사항은 Wiki페이지를 참고하여 주세요. -------------------------------------------------------------------------------- /korean-analyzer-3.x/README: -------------------------------------------------------------------------------- 1 | Lucene Korean Analyzer 2 | 3 | 1. 루씬의 Analyzer를 활용한 한글 명사 추출 모듈입니다. 3.6.1 버전을 기반으로 개발 되어있습니다. 4 | 5 | 2. 키워드를 추출하는 방법은 크게 형태소분석과 사전기반의 키워드탐색으로 나누어집니다. 6 | 7 | 3. 형태소분석은 이수명님께서 개발하신 한글형태소분석 Analyzer를 3.6.1버전으로 변환하여 사용하고 있습니다. 
8 | (http://cafe.naver.com/korlucene) 9 | 10 | 4. 사전기반의 키워드탐색은 명사사전 (기본사전, 사용자정의 사전)을 사용하여, 들어온 문장을 탐색하여 명사를 찾아냅니다. 11 | 12 | 5. 그외 동의어필터, 스테머필터, 불용어필터등이 사전기반으로 작동하도록 되어있습니다. 13 | 14 | 6. 형태소 분석 테스트는 아래의 페이지에서 해보실 수 있습니다. 15 | 16 | http://ec2-54-248-115-161.ap-northeast-1.compute.amazonaws.com:8080/crescent/morphMain.devys 17 | 18 | 더 자세한 사항은 Wiki페이지를 참고하여 주세요. 19 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/NOTICE.txt: -------------------------------------------------------------------------------- 1 | Apache Commons logging is The Apache Software License, Version 2.0 2 | http://commons.apache.org/logging/ 3 | 4 | Junit is Common Public License - v 1.0 5 | http://www.junit.org/ 6 | 7 | Log4j is The Apache Software License, Version 2.0 8 | http://logging.apache.org/log4j/1.2/download.html 9 | 10 | Lucene is Apache License, Version 2.0 11 | http://lucene.apache.org/java/docs/index.html 12 | 13 | Twitter4j is Apache License 2.0 14 | http://twitter4j.org/en/index.html 15 | 16 | koreananalyzer is Apache License 2.0 17 | http://sourceforge.net/projects/lucenekorean/ 18 | 19 | 사전은 세종21 프로젝트에 문의 하여 사용의 허락을 득하였습니다. -------------------------------------------------------------------------------- /korean-analyzer-4.x/NOTICE.txt: -------------------------------------------------------------------------------- 1 | Apache Commons logging is The Apache Software License, Version 2.0 2 | http://commons.apache.org/logging/ 3 | 4 | Junit is Common Public License - v 1.0 5 | http://www.junit.org/ 6 | 7 | Log4j is The Apache Software License, Version 2.0 8 | http://logging.apache.org/log4j/1.2/download.html 9 | 10 | Lucene is Apache License, Version 2.0 11 | http://lucene.apache.org/java/docs/index.html 12 | 13 | Twitter4j is Apache License 2.0 14 | http://twitter4j.org/en/index.html 15 | 16 | koreananalyzer is Apache License 2.0 17 | http://sourceforge.net/projects/lucenekorean/ 18 | 19 | 사전은 세종21 프로젝트에 문의 하여 사용의 허락을 득하였습니다. 
-------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/dic/occurrence.dic: -------------------------------------------------------------------------------- 1 | //##################### 2 | F:NILL/에/0:대하^S/NILL/11:0 3 | F:NILL/에/0:관하^S/NILL/11:0 4 | F:NILL/에/0:따르^S/NILL/11:0 5 | F:NILL/기/0:위하^S/NILL/11:0 6 | F:NILL/을,를,ㄹ/0:수^W/NILL/1:0 7 | F:NILL/ㄴ,는,은/0:지^W/NILL/1:0 8 | F:NILL/NILL/0:포함^S/고/0:0 9 | F:NILL/으로/0:하는^W/NILL/0:0 10 | F:NILL/NILL/1:풀^S/NILL/1,2:0 11 | F:NILL/을,를/2:가지^S/는/11:0 12 | F:NILL/을,를/0:둔^W/NILL/11:0 13 | F:NILL/의/0:양^S/NILL/0:0 14 | F:NILL/의/0:량^S/NILL/0:0 15 | R:메^S/지를/0:못하,안하^S/NILL/0:1 16 | R:NILL/ㄴ/0:것^S/NILL/0:0 17 | R:NILL/는/0:것^S/NILL/0:0 18 | R:NILL/은/0:것^S/NILL/0:0 19 | R:NILL/은/0:것^S/NILL/0:0 -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/dic/occurrence.dic: -------------------------------------------------------------------------------- 1 | //##################### 2 | F:NILL/에/0:대하^S/NILL/11:0 3 | F:NILL/에/0:관하^S/NILL/11:0 4 | F:NILL/에/0:따르^S/NILL/11:0 5 | F:NILL/기/0:위하^S/NILL/11:0 6 | F:NILL/을,를,ㄹ/0:수^W/NILL/1:0 7 | F:NILL/ㄴ,는,은/0:지^W/NILL/1:0 8 | F:NILL/NILL/0:포함^S/고/0:0 9 | F:NILL/으로/0:하는^W/NILL/0:0 10 | F:NILL/NILL/1:풀^S/NILL/1,2:0 11 | F:NILL/을,를/2:가지^S/는/11:0 12 | F:NILL/을,를/0:둔^W/NILL/11:0 13 | F:NILL/의/0:양^S/NILL/0:0 14 | F:NILL/의/0:량^S/NILL/0:0 15 | R:메^S/지를/0:못하,안하^S/NILL/0:1 16 | R:NILL/ㄴ/0:것^S/NILL/0:0 17 | R:NILL/는/0:것^S/NILL/0:0 18 | R:NILL/은/0:것^S/NILL/0:0 19 | R:NILL/은/0:것^S/NILL/0:0 -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/korean.properties: -------------------------------------------------------------------------------- 1 | syllable.dic = org/apache/lucene/analysis/kr/dic/syllable.dic 2 | josa.dic = 
org/apache/lucene/analysis/kr/dic/josa.dic 3 | eomi.dic = org/apache/lucene/analysis/kr/dic/eomi.dic 4 | dictionary.dic = org/apache/lucene/analysis/kr/dic/total.dic 5 | extension.dic = org/apache/lucene/analysis/kr/dic/extension.dic 6 | prefix.dic = org/apache/lucene/analysis/kr/dic/prefix.dic 7 | suffix.dic = org/apache/lucene/analysis/kr/dic/suffix.dic 8 | compounds.dic = org/apache/lucene/analysis/kr/dic/compounds.dic 9 | tagger.dic = org/apache/lucene/analysis/kr/dic/occurrence.dic 10 | cj.dic = org/apache/lucene/analysis/kr/dic/cj.dic -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/korean.properties: -------------------------------------------------------------------------------- 1 | syllable.dic = org/apache/lucene/analysis/kr/dic/syllable.dic 2 | josa.dic = org/apache/lucene/analysis/kr/dic/josa.dic 3 | eomi.dic = org/apache/lucene/analysis/kr/dic/eomi.dic 4 | dictionary.dic = org/apache/lucene/analysis/kr/dic/total.dic 5 | extension.dic = org/apache/lucene/analysis/kr/dic/extension.dic 6 | prefix.dic = org/apache/lucene/analysis/kr/dic/prefix.dic 7 | suffix.dic = org/apache/lucene/analysis/kr/dic/suffix.dic 8 | compounds.dic = org/apache/lucene/analysis/kr/dic/compounds.dic 9 | tagger.dic = org/apache/lucene/analysis/kr/dic/occurrence.dic 10 | cj.dic = org/apache/lucene/analysis/kr/dic/cj.dic -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/com/tistory/devyongsik/analyzer/ComparableState.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import org.apache.lucene.util.AttributeSource.State; 4 | 5 | public class ComparableState implements Comparable { 6 | 7 | private State state; 8 | private int startOffset; 9 | 10 | public State getState() { 11 | return state; 12 | } 13 | public void setState(State state) { 
14 | this.state = state; 15 | } 16 | public int getStartOffset() { 17 | return startOffset; 18 | } 19 | public void setStartOffset(int startOffset) { 20 | this.startOffset = startOffset; 21 | } 22 | 23 | @Override 24 | public int compareTo(ComparableState comparableState) { 25 | return getStartOffset() - comparableState.getStartOffset(); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/dic/suffix.dic: -------------------------------------------------------------------------------- 1 | ##### 2 | 각 3 | 감 4 | 값 5 | 객 6 | 계 7 | 길 8 | 고 9 | 공 10 | 관 11 | 국 12 | 권 13 | 금 14 | 급 15 | 기 16 | 내 17 | 난 18 | 단 19 | 대 20 | 땅 21 | 량 22 | 록 23 | 론 24 | 력 25 | 령 26 | 료 27 | 류 28 | 률 29 | 말 30 | 망 31 | 맵 32 | 문 33 | 물 34 | 면 35 | 밤 36 | 방 37 | 법 38 | 부 39 | 분 40 | 병 41 | 비 42 | 사 43 | 생 44 | 서 45 | 세 46 | 선 47 | 성 48 | 시 49 | 식 50 | 심 51 | 실 52 | 쇼 53 | 수 54 | 속 55 | 안 56 | 어 57 | 액 58 | 염 59 | 율 60 | 원 61 | 용 62 | 음 63 | 인 64 | 일 65 | 위 66 | 자 67 | 장 68 | 족 69 | 제 70 | 증 71 | 주 72 | 중 73 | 직 74 | 진 75 | 집 76 | 적 77 | 전 78 | 점 79 | 죄 80 | 컴 81 | 폭 82 | 품 83 | 표 84 | 판 85 | 팀 86 | 차 87 | 창 88 | 책 89 | 청 90 | 철 91 | 체 92 | 층 93 | 학 94 | 항 95 | 화 96 | 형 97 | 회 -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/dic/suffix.dic: -------------------------------------------------------------------------------- 1 | ##### 2 | 각 3 | 감 4 | 값 5 | 객 6 | 계 7 | 길 8 | 고 9 | 공 10 | 관 11 | 국 12 | 권 13 | 금 14 | 급 15 | 기 16 | 내 17 | 난 18 | 단 19 | 대 20 | 땅 21 | 량 22 | 록 23 | 론 24 | 력 25 | 령 26 | 료 27 | 류 28 | 률 29 | 말 30 | 망 31 | 맵 32 | 문 33 | 물 34 | 면 35 | 밤 36 | 방 37 | 법 38 | 부 39 | 분 40 | 병 41 | 비 42 | 사 43 | 생 44 | 서 45 | 세 46 | 선 47 | 성 48 | 시 49 | 식 50 | 심 51 | 실 52 | 쇼 53 | 수 54 | 속 55 | 안 56 | 어 57 | 액 58 | 염 59 | 율 60 | 원 61 | 용 62 | 음 63 | 인 64 | 일 65 | 위 66 | 자 67 | 장 68 | 족 69 | 제 70 
| 증 71 | 주 72 | 중 73 | 직 74 | 진 75 | 집 76 | 적 77 | 전 78 | 점 79 | 죄 80 | 컴 81 | 폭 82 | 품 83 | 표 84 | 판 85 | 팀 86 | 차 87 | 창 88 | 책 89 | 청 90 | 철 91 | 체 92 | 층 93 | 학 94 | 항 95 | 화 96 | 형 97 | 회 -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/morph/Status.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.morph; 2 | 3 | public class Status { 4 | 5 | private int josaMaxStart = 0; 6 | 7 | private int eomiMaxStart = 0; 8 | 9 | private int maxStart = 0; 10 | 11 | public void apply(int num) { 12 | if(maxStart { 6 | 7 | public int compare(AnalysisOutput o1, AnalysisOutput o2) { 8 | 9 | // 길이의 역순으로 정렬한다. 10 | 11 | int score = o2.getScore() - o1.getScore(); 12 | if(score!=0) return score; 13 | 14 | int len = o2.getSource().length() - o1.getSource().length(); 15 | if(len!=0) return len; 16 | 17 | 18 | int ptn = getPtnScore(o2.getPatn()) - getPtnScore(o1.getPatn()); 19 | if(ptn!=0) return ptn; 20 | 21 | int stem = o1.getStem().length() - o2.getStem().length(); 22 | if(stem!=0) return stem; 23 | 24 | 25 | return 0; 26 | } 27 | 28 | private int getPtnScore(int ptn) { 29 | 30 | if(ptn==PatternConstants.PTN_N) ptn = 7; 31 | else if(ptn==PatternConstants.PTN_AID) return 50; 32 | 33 | return ptn; 34 | 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/KoreanFilter.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 
7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | 21 | @Deprecated 22 | public class KoreanFilter { 23 | 24 | private KoreanFilter() { 25 | 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/KoreanFilter.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | 21 | @Deprecated 22 | public class KoreanFilter { 23 | 24 | private KoreanFilter() { 25 | 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/KoreanTokenizer.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | 21 | @Deprecated 22 | public class KoreanTokenizer { 23 | private KoreanTokenizer() { 24 | 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/KoreanTokenizer.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 
7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | 21 | @Deprecated 22 | public class KoreanTokenizer { 23 | private KoreanTokenizer() { 24 | 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/morph/WSCandidateComparator.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.morph; 2 | 3 | import java.util.Comparator; 4 | import java.util.List; 5 | 6 | public class WSCandidateComparator implements Comparator { 7 | 8 | public int compare(WSOutput o1, WSOutput o2) { 9 | 10 | int end = o2.getLastEnd() - o1.getLastEnd(); 11 | if(end!=0) return end; 12 | 13 | int s1 = o1.getPhrases().size()==0 ? 999999999 : o1.getPhrases().size(); 14 | int s2 = o2.getPhrases().size()==0 ? 999999999 : o2.getPhrases().size(); 15 | 16 | int size = s1-s2; 17 | if(size!=0) return size; 18 | 19 | int score = calculateScore(o2)-calculateScore(o1); 20 | if(score!=0) return score; 21 | 22 | return 0; 23 | } 24 | 25 | private int calculateScore(WSOutput o) { 26 | 27 | List entries = o.getPhrases(); 28 | 29 | if(entries.size()==0) return 0; 30 | 31 | int sum = 0; 32 | for(int i=0;iJFlex 1.4.1 27 | * on 12. 1. 
3 오전 3:51 from the specification file 28 | * D:/eclipse-workspace/search/kr.analyzer.3x/src/org/apache/lucene/analysis/kr/KoreanTokenizerImpl.jflex 29 | */ 30 | @Deprecated 31 | class KoreanTokenizerImpl { 32 | private KoreanTokenizerImpl() { 33 | 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/KoreanTokenizerImpl.java: -------------------------------------------------------------------------------- 1 | /* The following code was generated by JFlex 1.4.1 on 12. 1. 3 오전 3:51 */ 2 | 3 | package org.apache.lucene.analysis.kr; 4 | 5 | /** 6 | * Licensed to the Apache Software Foundation (ASF) under one or more 7 | * contributor license agreements. See the NOTICE file distributed with 8 | * this work for additional information regarding copyright ownership. 9 | * The ASF licenses this file to You under the Apache License, Version 2.0 10 | * (the "License"); you may not use this file except in compliance with 11 | * the License. You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | */ 21 | 22 | 23 | 24 | /** 25 | * This class is a scanner generated by 26 | * JFlex 1.4.1 27 | * on 12. 1. 
3 오전 3:51 from the specification file 28 | * D:/eclipse-workspace/search/kr.analyzer.3x/src/org/apache/lucene/analysis/kr/KoreanTokenizerImpl.jflex 29 | */ 30 | @Deprecated 31 | class KoreanTokenizerImpl { 32 | private KoreanTokenizerImpl() { 33 | 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/morph/WSAOutput.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.morph; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | public class WSAOutput { 7 | 8 | private String source; 9 | 10 | private List results; 11 | 12 | private int wds = 0; 13 | 14 | private int end = 0; 15 | 16 | public WSAOutput() { 17 | results = new ArrayList(); 18 | } 19 | 20 | public WSAOutput(String src) { 21 | source = src; 22 | results = new ArrayList(); 23 | } 24 | 25 | public WSAOutput(String src, List list) { 26 | source = src; 27 | results = list; 28 | } 29 | 30 | public String getSource() { 31 | return source; 32 | } 33 | 34 | public void setSource(String source) { 35 | this.source = source; 36 | } 37 | 38 | public List getResults() { 39 | return results; 40 | } 41 | 42 | public void setResults(List results) { 43 | this.results = results; 44 | } 45 | 46 | public void addNounResults(String word) { 47 | addNounResults(word, null); 48 | } 49 | 50 | public void addNounResults(String word, String end) { 51 | addNounResults(word, end, AnalysisOutput.SCORE_ANALYSIS); 52 | } 53 | 54 | public void addNounResults(String word, String end, int score) { 55 | 56 | AnalysisOutput output = new AnalysisOutput(word, end, null, PatternConstants.PTN_NJ); 57 | if(end==null) output.setPatn(PatternConstants.PTN_N); 58 | 59 | output.setPos(PatternConstants.POS_NOUN); 60 | output.setScore(score); 61 | 62 | this.results.add(output); 63 | } 64 | 65 | } 66 | 
-------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/KoreanAnalyzer.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr; 2 | 3 | 4 | /** 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | 21 | import org.apache.lucene.analysis.LowerCaseFilter; 22 | import org.apache.lucene.analysis.StopFilter; 23 | import org.apache.lucene.analysis.standard.StandardFilter; 24 | import org.apache.lucene.analysis.standard.StandardTokenizer; 25 | 26 | /** 27 | * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link 28 | * LowerCaseFilter} and {@link StopFilter}, using a list of English stop words. 
29 | * 30 | * @version $Id: KoreanAnalyzer.java,v 1.1 2012/02/08 15:00:11 smlee0818 Exp $ 31 | * @version 2012.11.20 need4spd, Analyzer를 하나로 통일하기 위해 생성하지 못 하도록 수정함 32 | */ 33 | @Deprecated 34 | public class KoreanAnalyzer { 35 | 36 | private KoreanAnalyzer() { 37 | 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/KoreanAnalyzer.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr; 2 | 3 | 4 | /** 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | 21 | import org.apache.lucene.analysis.core.LowerCaseFilter; 22 | import org.apache.lucene.analysis.core.StopFilter; 23 | import org.apache.lucene.analysis.standard.StandardFilter; 24 | import org.apache.lucene.analysis.standard.StandardTokenizer; 25 | 26 | /** 27 | * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link 28 | * LowerCaseFilter} and {@link StopFilter}, using a list of English stop words. 
29 | * 30 | * @version $Id: KoreanAnalyzer.java,v 1.1 2012/02/08 15:00:11 smlee0818 Exp $ 31 | * @version 2012.11.20 need4spd, Analyzer를 하나로 통일하기 위해 생성하지 못 하도록 수정함 32 | */ 33 | @Deprecated 34 | public class KoreanAnalyzer { 35 | 36 | private KoreanAnalyzer() { 37 | 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/morph/MorphAnalyzerManager.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.morph; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | import java.util.List; 21 | 22 | 23 | public class MorphAnalyzerManager { 24 | 25 | public void analyze(String strs) { 26 | MorphAnalyzer analyzer = new MorphAnalyzer(); 27 | String[] tokens = strs.split(" "); 28 | for(String token:tokens) { 29 | try { 30 | List results = analyzer.analyze(token); 31 | for(AnalysisOutput o:results) { 32 | System.out.print(o.toString()+"->"); 33 | for(int i=0;i"); 37 | } 38 | } catch (MorphException e) { 39 | e.printStackTrace(); 40 | } 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/morph/MorphAnalyzerManager.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.morph; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | import java.util.List; 21 | 22 | 23 | public class MorphAnalyzerManager { 24 | 25 | public void analyze(String strs) { 26 | MorphAnalyzer analyzer = new MorphAnalyzer(); 27 | String[] tokens = strs.split(" "); 28 | for(String token:tokens) { 29 | try { 30 | List results = analyzer.analyze(token); 31 | for(AnalysisOutput o:results) { 32 | System.out.print(o.toString()+"->"); 33 | for(int i=0;i"); 37 | } 38 | } catch (MorphException e) { 39 | e.printStackTrace(); 40 | } 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/utils/HanjaUtils.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.utils; 2 | 3 | import java.io.IOException; 4 | 5 | import java.util.ArrayList; 6 | import java.util.HashMap; 7 | import java.util.List; 8 | import java.util.Map; 9 | 10 | import org.apache.lucene.analysis.kr.morph.MorphException; 11 | 12 | public class HanjaUtils { 13 | 14 | private static Map mapHanja; 15 | 16 | public synchronized static void loadDictionary() throws MorphException { 17 | try { 18 | List strList = FileUtil.readLines("org/apache/lucene/analysis/kr/dic/mapHanja.dic","UTF-8"); 19 | mapHanja = new HashMap(); 20 | 21 | for(int i=0;i0x9FFF||hanja<0x3400) return new char[]{hanja}; 51 | 52 | char[] result = mapHanja.get(new String(new char[]{hanja})); 53 | if(result==null) return new char[]{hanja}; 54 | 55 | return result; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/test/java/com/tistory/devyongsik/analyzer/util/TestToken.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer.util; 2 | 3 | 4 | public class TestToken { 5 | private String term; 6 | private int startOffset; 7 | private int endOffset; 8 | public 
String getTerm() { 9 | return term; 10 | } 11 | public void setTerm(String term) { 12 | this.term = term; 13 | } 14 | public int getStartOffset() { 15 | return startOffset; 16 | } 17 | public void setStartOffset(int startOffset) { 18 | this.startOffset = startOffset; 19 | } 20 | public int getEndOffset() { 21 | return endOffset; 22 | } 23 | public void setEndOffset(int endOffset) { 24 | this.endOffset = endOffset; 25 | } 26 | @Override 27 | public String toString() { 28 | return "TestToken [term=" + term + ", startOffset=" + startOffset 29 | + ", endOffset=" + endOffset + "]"; 30 | } 31 | @Override 32 | public int hashCode() { 33 | final int prime = 31; 34 | int result = 1; 35 | result = prime * result + endOffset; 36 | result = prime * result + startOffset; 37 | result = prime * result + ((term == null) ? 0 : term.hashCode()); 38 | return result; 39 | } 40 | @Override 41 | public boolean equals(Object obj) { 42 | if (this == obj) 43 | return true; 44 | if (obj == null) 45 | return false; 46 | if (getClass() != obj.getClass()) 47 | return false; 48 | TestToken other = (TestToken) obj; 49 | if (endOffset != other.endOffset) 50 | return false; 51 | if (startOffset != other.startOffset) 52 | return false; 53 | if (term == null) { 54 | if (other.term != null) 55 | return false; 56 | } else if (!term.equals(other.term)) 57 | return false; 58 | return true; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/test/java/com/tistory/devyongsik/analyzer/util/TestToken.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer.util; 2 | 3 | 4 | public class TestToken { 5 | private String term; 6 | private int startOffset; 7 | private int endOffset; 8 | public String getTerm() { 9 | return term; 10 | } 11 | public void setTerm(String term) { 12 | this.term = term; 13 | } 14 | public int getStartOffset() { 15 | return startOffset; 16 | 
} 17 | public void setStartOffset(int startOffset) { 18 | this.startOffset = startOffset; 19 | } 20 | public int getEndOffset() { 21 | return endOffset; 22 | } 23 | public void setEndOffset(int endOffset) { 24 | this.endOffset = endOffset; 25 | } 26 | @Override 27 | public String toString() { 28 | return "TestToken [term=" + term + ", startOffset=" + startOffset 29 | + ", endOffset=" + endOffset + "]"; 30 | } 31 | @Override 32 | public int hashCode() { 33 | final int prime = 31; 34 | int result = 1; 35 | result = prime * result + endOffset; 36 | result = prime * result + startOffset; 37 | result = prime * result + ((term == null) ? 0 : term.hashCode()); 38 | return result; 39 | } 40 | @Override 41 | public boolean equals(Object obj) { 42 | if (this == obj) 43 | return true; 44 | if (obj == null) 45 | return false; 46 | if (getClass() != obj.getClass()) 47 | return false; 48 | TestToken other = (TestToken) obj; 49 | if (endOffset != other.endOffset) 50 | return false; 51 | if (startOffset != other.startOffset) 52 | return false; 53 | if (term == null) { 54 | if (other.term != null) 55 | return false; 56 | } else if (!term.equals(other.term)) 57 | return false; 58 | return true; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/morph/Status.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.lucene.analysis.kr.morph; 19 | 20 | public class Status { 21 | 22 | private int josaMaxStart = 0; 23 | 24 | private int eomiMaxStart = 0; 25 | 26 | private int maxStart = 0; 27 | 28 | public void apply(int num) { 29 | if(maxStart { 23 | public int compare(AnalysisOutput out1, AnalysisOutput out2) { 24 | 25 | int score = out2.getScore()-out1.getScore(); 26 | int pattern = out2.getPatn()-out1.getPatn(); 27 | int len = out1.getStem().length()-out2.getStem().length(); 28 | 29 | if(score!=0) return score; 30 | 31 | if(out2.getScore()==AnalysisOutput.SCORE_CORRECT && 32 | out1.getScore()==AnalysisOutput.SCORE_CORRECT) { 33 | pattern = out1.getPatn()==PatternConstants.PTN_N || out1.getPatn()==PatternConstants.PTN_AID ? -1 : pattern; 34 | pattern = out2.getPatn()==PatternConstants.PTN_N || out2.getPatn()==PatternConstants.PTN_AID ? 1 : pattern; 35 | } 36 | 37 | if(pattern!=0) return pattern; 38 | 39 | return len; 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/morph/WSOuputComparator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.lucene.analysis.kr.morph; 18 | 19 | import java.util.Comparator; 20 | 21 | public class WSOuputComparator implements Comparator { 22 | 23 | public int compare(AnalysisOutput o1, AnalysisOutput o2) { 24 | 25 | // 길이의 역순으로 정렬한다. 26 | 27 | int score = o2.getScore() - o1.getScore(); 28 | if(score!=0) return score; 29 | 30 | int len = o2.getSource().length() - o1.getSource().length(); 31 | if(len!=0) return len; 32 | 33 | 34 | int ptn = getPtnScore(o2.getPatn()) - getPtnScore(o1.getPatn()); 35 | if(ptn!=0) return ptn; 36 | 37 | int stem = o1.getStem().length() - o2.getStem().length(); 38 | if(stem!=0) return stem; 39 | 40 | 41 | return 0; 42 | } 43 | 44 | private int getPtnScore(int ptn) { 45 | 46 | if(ptn==PatternConstants.PTN_N) ptn = 7; 47 | else if(ptn==PatternConstants.PTN_AID) return 50; 48 | 49 | return ptn; 50 | 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/test/java/com/tistory/devyongsik/analyzer/AnalyzerTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.StringReader; 4 | import java.util.HashMap; 5 | import java.util.List; 6 | import java.util.Map; 7 | 8 | import org.apache.lucene.analysis.Analyzer; 9 | import 
org.apache.lucene.analysis.TokenStream; 10 | import org.junit.Before; 11 | import org.junit.Test; 12 | 13 | import com.google.common.collect.Lists; 14 | import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory; 15 | import com.tistory.devyongsik.analyzer.util.AnalyzerTestUtil; 16 | import com.tistory.devyongsik.analyzer.util.TestToken; 17 | 18 | public class AnalyzerTest extends AnalyzerTestUtil { 19 | private List nouns = null; 20 | private DictionaryFactory dictionaryFactory; 21 | 22 | @Before 23 | public void initDictionary() { 24 | nouns = Lists.newArrayList(); 25 | dictionaryFactory = DictionaryFactory.getFactory(); 26 | } 27 | 28 | @Test 29 | public void testCase1() throws Exception { 30 | 31 | Map customNounDictionaryMap = new HashMap(); 32 | customNounDictionaryMap.put("고속도로", null); 33 | customNounDictionaryMap.put("고속", null); 34 | customNounDictionaryMap.put("도로", null); 35 | 36 | dictionaryFactory.setCustomNounDictionaryMap(customNounDictionaryMap); 37 | 38 | StringReader reader = new StringReader("고속도로"); 39 | 40 | nouns.add(getToken("고속도로", 0, 4)); 41 | nouns.add(getToken("고속도", 0, 3)); 42 | nouns.add(getToken("고속", 0, 2)); 43 | nouns.add(getToken("속도", 1, 3)); 44 | 45 | Analyzer analyzer = new KoreanAnalyzer(true); 46 | TokenStream stream = analyzer.tokenStream("dummy", reader); 47 | stream.reset(); 48 | 49 | List extractedTokens = collectExtractedNouns(stream); 50 | 51 | analyzer.close(); 52 | 53 | verify(nouns, extractedTokens); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/test/java/com/tistory/devyongsik/analyzer/AnalyzerTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.StringReader; 4 | import java.util.HashMap; 5 | import java.util.List; 6 | import java.util.Map; 7 | 8 | import org.apache.lucene.analysis.Analyzer; 9 | import 
org.apache.lucene.analysis.TokenStream; 10 | import org.junit.Before; 11 | import org.junit.Test; 12 | 13 | import com.google.common.collect.Lists; 14 | import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory; 15 | import com.tistory.devyongsik.analyzer.util.AnalyzerTestUtil; 16 | import com.tistory.devyongsik.analyzer.util.TestToken; 17 | 18 | public class AnalyzerTest extends AnalyzerTestUtil { 19 | private List nouns = null; 20 | private DictionaryFactory dictionaryFactory; 21 | 22 | @Before 23 | public void initDictionary() { 24 | nouns = Lists.newArrayList(); 25 | dictionaryFactory = DictionaryFactory.getFactory(); 26 | } 27 | 28 | @Test 29 | public void testCase1() throws Exception { 30 | 31 | Map customNounDictionaryMap = new HashMap(); 32 | customNounDictionaryMap.put("고속도로", null); 33 | customNounDictionaryMap.put("고속", null); 34 | customNounDictionaryMap.put("도로", null); 35 | 36 | dictionaryFactory.setCustomNounDictionaryMap(customNounDictionaryMap); 37 | 38 | StringReader reader = new StringReader("고속도로"); 39 | 40 | nouns.add(getToken("고속도로", 0, 4)); 41 | nouns.add(getToken("고속도", 0, 3)); 42 | nouns.add(getToken("고속", 0, 2)); 43 | nouns.add(getToken("속도", 1, 3)); 44 | 45 | Analyzer analyzer = new KoreanAnalyzer(true); 46 | TokenStream stream = analyzer.tokenStream("dummy", reader); 47 | stream.reset(); 48 | 49 | List extractedTokens = collectExtractedNouns(stream); 50 | 51 | analyzer.close(); 52 | 53 | verify(nouns, extractedTokens); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/com/tistory/devyongsik/analyzer/KoreanNounFilter.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.IOException; 4 | import java.util.HashMap; 5 | import java.util.List; 6 | import java.util.Map; 7 | import java.util.Stack; 8 | 9 | import org.apache.lucene.analysis.TokenFilter; 
10 | import org.apache.lucene.analysis.TokenStream; 11 | import org.slf4j.Logger; 12 | import org.slf4j.LoggerFactory; 13 | 14 | public class KoreanNounFilter extends TokenFilter { 15 | private Logger logger = LoggerFactory.getLogger(KoreanNounFilter.class); 16 | 17 | private Stack nounsStack = new Stack(); 18 | private List engines; 19 | private Map returnedTokens = new HashMap(); 20 | 21 | protected KoreanNounFilter(TokenStream input, List engines) { 22 | super(input); 23 | this.engines = engines; 24 | } 25 | 26 | @Override 27 | public final boolean incrementToken() throws IOException { 28 | 29 | 30 | if(logger.isDebugEnabled()) 31 | logger.debug("incrementToken KoreanNounFilter"); 32 | 33 | if(engines == null) { 34 | throw new IllegalStateException("KoreanNounFilter의 engines가 Null입니다."); 35 | } 36 | 37 | 38 | if (nounsStack.size() > 0) { 39 | if(logger.isDebugEnabled()) 40 | logger.debug("명사 Stack에서 토큰 리턴함"); 41 | 42 | State synState = nounsStack.pop(); 43 | restoreState(synState); 44 | 45 | return true; 46 | } 47 | 48 | if (!input.incrementToken()) 49 | return false; 50 | 51 | try { 52 | 53 | for(Engine engine : engines) { 54 | engine.collectNounState(input.cloneAttributes(), nounsStack , returnedTokens); 55 | } 56 | 57 | returnedTokens.clear(); 58 | 59 | } catch (Exception e) { 60 | logger.error("명사필터에서 목록 조회 오류"); 61 | e.printStackTrace(); 62 | } 63 | 64 | return true; 65 | } 66 | 67 | } 68 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/morph/AnalysisOutputComparator.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.morph; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 
7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import java.util.Comparator; 21 | 22 | public class AnalysisOutputComparator implements Comparator { 23 | public int compare(Object o1, Object o2) { 24 | 25 | AnalysisOutput out1 = (AnalysisOutput)o1; 26 | AnalysisOutput out2 = (AnalysisOutput)o2; 27 | 28 | int score = out2.getScore()-out1.getScore(); 29 | int pattern = out2.getPatn()-out1.getPatn(); 30 | int len = out1.getStem().length()-out2.getStem().length(); 31 | 32 | if(score!=0) return score; 33 | 34 | if(out2.getScore()==AnalysisOutput.SCORE_CORRECT && 35 | out1.getScore()==AnalysisOutput.SCORE_CORRECT) { 36 | pattern = out1.getPatn()==PatternConstants.PTN_N || out1.getPatn()==PatternConstants.PTN_AID ? -1 : pattern; 37 | pattern = out2.getPatn()==PatternConstants.PTN_N || out2.getPatn()==PatternConstants.PTN_AID ? 
1 : pattern; 38 | } 39 | 40 | if(pattern!=0) return pattern; 41 | 42 | return len; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/test/java/com/tistory/devyongsik/analyzer/KoreanStopFilterTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.IOException; 4 | import java.io.StringReader; 5 | import java.util.List; 6 | import java.util.Map; 7 | 8 | import org.apache.lucene.analysis.TokenStream; 9 | import org.junit.Before; 10 | import org.junit.Test; 11 | 12 | import com.google.common.collect.Lists; 13 | import com.google.common.collect.Maps; 14 | import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory; 15 | import com.tistory.devyongsik.analyzer.util.AnalyzerTestUtil; 16 | import com.tistory.devyongsik.analyzer.util.TestToken; 17 | 18 | public class KoreanStopFilterTest extends AnalyzerTestUtil { 19 | private List tokens = null; 20 | //불용어는 the와 . 21 | private StringReader reader = new StringReader("the 개발하고 꼭 이것을 잘 해야합니다. 
공백입니다."); 22 | private DictionaryFactory dictionaryFactory = null; 23 | 24 | @Before 25 | public void setUp() { 26 | tokens = Lists.newArrayList(); 27 | dictionaryFactory = DictionaryFactory.getFactory(); 28 | 29 | tokens.add(getToken("공백입니다", 24, 29)); 30 | tokens.add(getToken("해야합니다", 17, 22)); 31 | tokens.add(getToken("이것을", 11, 14)); 32 | tokens.add(getToken("개발하고", 4, 8)); 33 | tokens.add(getToken("꼭", 9, 10)); 34 | tokens.add(getToken("잘", 15, 16)); 35 | } 36 | 37 | 38 | @Test 39 | public void stopFilter() throws IOException { 40 | 41 | Map stopWordDictionaryMap = Maps.newHashMap(); 42 | stopWordDictionaryMap.put("the", null); 43 | stopWordDictionaryMap.put(".", null); 44 | 45 | dictionaryFactory.setStopWordDictionaryMap(stopWordDictionaryMap); 46 | 47 | TokenStream stream = new KoreanStopFilter(new KoreanCharacterTokenizer(reader)); 48 | stream.reset(); 49 | 50 | List extractedTokens = collectExtractedNouns(stream); 51 | 52 | stream.close(); 53 | 54 | verify(tokens, extractedTokens); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/morph/WSCandidateComparator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.lucene.analysis.kr.morph; 18 | 19 | import java.util.Comparator; 20 | import java.util.List; 21 | 22 | public class WSCandidateComparator implements Comparator { 23 | 24 | public int compare(WSOutput o1, WSOutput o2) { 25 | 26 | int end = o2.getLastEnd() - o1.getLastEnd(); 27 | if(end!=0) return end; 28 | 29 | int s1 = o1.getPhrases().size()==0 ? 999999999 : o1.getPhrases().size(); 30 | int s2 = o2.getPhrases().size()==0 ? 999999999 : o2.getPhrases().size(); 31 | 32 | int size = s1-s2; 33 | if(size!=0) return size; 34 | 35 | int score = calculateScore(o2)-calculateScore(o1); 36 | if(score!=0) return score; 37 | 38 | return 0; 39 | } 40 | 41 | private int calculateScore(WSOutput o) { 42 | 43 | List entries = o.getPhrases(); 44 | 45 | if(entries.size()==0) return 0; 46 | 47 | int sum = 0; 48 | for(int i=0;i tokens = null; 20 | //불용어는 the와 . 21 | private StringReader reader = new StringReader("the 개발하고 꼭 이것을 잘 해야합니다. 
공백입니다."); 22 | private DictionaryFactory dictionaryFactory = null; 23 | 24 | @Before 25 | public void setUp() { 26 | tokens = Lists.newArrayList(); 27 | dictionaryFactory = DictionaryFactory.getFactory(); 28 | 29 | tokens.add(getToken("공백입니다", 24, 29)); 30 | tokens.add(getToken("해야합니다", 17, 22)); 31 | tokens.add(getToken("이것을", 11, 14)); 32 | tokens.add(getToken("개발하고", 4, 8)); 33 | tokens.add(getToken("꼭", 9, 10)); 34 | tokens.add(getToken("잘", 15, 16)); 35 | } 36 | 37 | 38 | @Test 39 | public void stopFilter() throws IOException { 40 | 41 | Map stopWordDictionaryMap = Maps.newHashMap(); 42 | stopWordDictionaryMap.put("the", null); 43 | stopWordDictionaryMap.put(".", null); 44 | 45 | dictionaryFactory.setStopWordDictionaryMap(stopWordDictionaryMap); 46 | 47 | TokenStream stream = new KoreanStopFilter(new KoreanCharacterTokenizer(reader)); 48 | stream.reset(); 49 | 50 | List extractedTokens = collectExtractedNouns(stream); 51 | 52 | stream.close(); 53 | 54 | verify(tokens, extractedTokens); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/test/java/com/tistory/devyongsik/analyzer/KoreanMorphEngineTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.StringReader; 4 | import java.util.List; 5 | 6 | import org.apache.lucene.analysis.TokenStream; 7 | import org.junit.Before; 8 | import org.junit.Test; 9 | 10 | import com.google.common.collect.Lists; 11 | import com.tistory.devyongsik.analyzer.util.AnalyzerTestUtil; 12 | import com.tistory.devyongsik.analyzer.util.TestToken; 13 | 14 | /** 15 | * @author need4spd, need4spd@cplanet.co.kr, 2011. 10. 14. 
16 | * 17 | */ 18 | public class KoreanMorphEngineTest extends AnalyzerTestUtil { 19 | private List nouns = null; 20 | 21 | private List engines = null; 22 | 23 | @Before 24 | public void initDictionary() { 25 | nouns = Lists.newArrayList(); 26 | engines = Lists.newArrayList(); 27 | 28 | engines.add(new KoreanMorphEngine()); 29 | } 30 | 31 | @Test 32 | public void testCase1() throws Exception { 33 | StringReader reader = new StringReader("기본사전이변경되었습니다"); 34 | nouns.add(getToken("기본사전이변경", 0, 7)); 35 | nouns.add(getToken("기본", 0, 2)); 36 | nouns.add(getToken("전이", 3, 5)); 37 | nouns.add(getToken("변경", 5, 7)); 38 | nouns.add(getToken("기본사전이변경되었습니다", 0, 12)); 39 | 40 | TokenStream stream = new KoreanNounFilter(new KoreanCharacterTokenizer(reader), engines); 41 | stream.reset(); 42 | 43 | List extractedTokens = collectExtractedNouns(stream); 44 | 45 | stream.close(); 46 | 47 | verify(nouns, extractedTokens); 48 | } 49 | 50 | @Test 51 | public void testCase2() throws Exception { 52 | StringReader reader = new StringReader("worldcup경기장"); 53 | nouns.add(getToken("worldcup", 0, 8)); 54 | nouns.add(getToken("경기장", 8, 11)); 55 | 56 | TokenStream stream = new KoreanNounFilter(new KoreanCharacterTokenizer(reader), engines); 57 | stream.reset(); 58 | 59 | List extractedTokens = collectExtractedNouns(stream); 60 | 61 | stream.close(); 62 | 63 | verify(nouns, extractedTokens); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/test/java/com/tistory/devyongsik/analyzer/KoreanSynonymEngineTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.IOException; 4 | import java.io.StringReader; 5 | import java.util.List; 6 | 7 | import org.apache.lucene.analysis.TokenStream; 8 | import org.junit.Before; 9 | import org.junit.Test; 10 | 11 | import com.google.common.collect.Lists; 12 | import 
com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory; 13 | import com.tistory.devyongsik.analyzer.util.AnalyzerTestUtil; 14 | import com.tistory.devyongsik.analyzer.util.TestToken; 15 | 16 | public class KoreanSynonymEngineTest extends AnalyzerTestUtil { 17 | private List synonymWordList = null; 18 | private List engines = null; 19 | private DictionaryFactory dictionaryFactory = null; 20 | private List nouns = null; 21 | 22 | @Before 23 | public void setUp() throws Exception { 24 | 25 | synonymWordList = Lists.newArrayList(); 26 | engines = Lists.newArrayList(); 27 | dictionaryFactory = DictionaryFactory.getFactory(); 28 | nouns = Lists.newArrayList(); 29 | 30 | synonymWordList.add("노트북"); 31 | synonymWordList.add("노트북pc"); 32 | synonymWordList.add("노트북컴퓨터"); 33 | synonymWordList.add("노트북피씨"); 34 | synonymWordList.add("notebook"); 35 | 36 | engines.add(new KoreanSynonymEngine()); 37 | 38 | dictionaryFactory.setSynonymList(synonymWordList); 39 | } 40 | 41 | @Test 42 | public void testSynonym() throws IOException { 43 | StringReader reader = new StringReader("노트북"); 44 | nouns.add(getToken("노트북", 0, 3)); 45 | nouns.add(getToken("노트북pc", 0, 3)); 46 | nouns.add(getToken("노트북컴퓨터", 0, 3)); 47 | nouns.add(getToken("노트북피씨", 0, 3)); 48 | nouns.add(getToken("notebook", 0, 3)); 49 | 50 | TokenStream stream = new KoreanNounFilter(new KoreanCharacterTokenizer(reader), engines); 51 | stream.reset(); 52 | 53 | List extractedTokens = collectExtractedNouns(stream); 54 | 55 | stream.close(); 56 | 57 | verify(nouns, extractedTokens); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/test/java/com/tistory/devyongsik/analyzer/KoreanMorphEngineTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.StringReader; 4 | import java.util.List; 5 | 6 | import org.apache.lucene.analysis.TokenStream; 7 | import 
org.junit.Before; 8 | import org.junit.Test; 9 | 10 | import com.google.common.collect.Lists; 11 | import com.tistory.devyongsik.analyzer.util.AnalyzerTestUtil; 12 | import com.tistory.devyongsik.analyzer.util.TestToken; 13 | 14 | /** 15 | * @author need4spd, need4spd@cplanet.co.kr, 2011. 10. 14. 16 | * 17 | */ 18 | public class KoreanMorphEngineTest extends AnalyzerTestUtil { 19 | private List nouns = null; 20 | 21 | private List engines = null; 22 | 23 | @Before 24 | public void initDictionary() { 25 | nouns = Lists.newArrayList(); 26 | engines = Lists.newArrayList(); 27 | 28 | engines.add(new KoreanMorphEngine()); 29 | } 30 | 31 | @Test 32 | public void testCase1() throws Exception { 33 | StringReader reader = new StringReader("기본사전이변경되었습니다"); 34 | nouns.add(getToken("기본사전이변경", 0, 7)); 35 | nouns.add(getToken("기본", 0, 2)); 36 | nouns.add(getToken("전이", 3, 5)); 37 | nouns.add(getToken("변경", 5, 7)); 38 | nouns.add(getToken("기본사전이변경되었습니다", 0, 12)); 39 | 40 | TokenStream stream = new KoreanNounFilter(new KoreanCharacterTokenizer(reader), engines); 41 | stream.reset(); 42 | 43 | List extractedTokens = collectExtractedNouns(stream); 44 | 45 | stream.close(); 46 | 47 | verify(nouns, extractedTokens); 48 | } 49 | 50 | @Test 51 | public void testCase2() throws Exception { 52 | StringReader reader = new StringReader("worldcup경기장"); 53 | nouns.add(getToken("worldcup", 0, 8)); 54 | nouns.add(getToken("경기장", 8, 11)); 55 | 56 | TokenStream stream = new KoreanNounFilter(new KoreanCharacterTokenizer(reader), engines); 57 | stream.reset(); 58 | 59 | List extractedTokens = collectExtractedNouns(stream); 60 | 61 | stream.close(); 62 | 63 | verify(nouns, extractedTokens); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/test/java/com/tistory/devyongsik/analyzer/KoreanSynonymEngineTest.java: -------------------------------------------------------------------------------- 1 | package 
com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.IOException; 4 | import java.io.StringReader; 5 | import java.util.List; 6 | 7 | import org.apache.lucene.analysis.TokenStream; 8 | import org.junit.Before; 9 | import org.junit.Test; 10 | 11 | import com.google.common.collect.Lists; 12 | import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory; 13 | import com.tistory.devyongsik.analyzer.util.AnalyzerTestUtil; 14 | import com.tistory.devyongsik.analyzer.util.TestToken; 15 | 16 | public class KoreanSynonymEngineTest extends AnalyzerTestUtil { 17 | private List synonymWordList = null; 18 | private List engines = null; 19 | private DictionaryFactory dictionaryFactory = null; 20 | private List nouns = null; 21 | 22 | @Before 23 | public void setUp() throws Exception { 24 | 25 | synonymWordList = Lists.newArrayList(); 26 | engines = Lists.newArrayList(); 27 | dictionaryFactory = DictionaryFactory.getFactory(); 28 | nouns = Lists.newArrayList(); 29 | 30 | synonymWordList.add("노트북"); 31 | synonymWordList.add("노트북pc"); 32 | synonymWordList.add("노트북컴퓨터"); 33 | synonymWordList.add("노트북피씨"); 34 | synonymWordList.add("notebook"); 35 | 36 | engines.add(new KoreanSynonymEngine()); 37 | 38 | dictionaryFactory.setSynonymList(synonymWordList); 39 | } 40 | 41 | @Test 42 | public void testSynonym() throws IOException { 43 | StringReader reader = new StringReader("노트북"); 44 | nouns.add(getToken("노트북", 0, 3)); 45 | nouns.add(getToken("노트북pc", 0, 3)); 46 | nouns.add(getToken("노트북컴퓨터", 0, 3)); 47 | nouns.add(getToken("노트북피씨", 0, 3)); 48 | nouns.add(getToken("notebook", 0, 3)); 49 | 50 | TokenStream stream = new KoreanNounFilter(new KoreanCharacterTokenizer(reader), engines); 51 | stream.reset(); 52 | 53 | List extractedTokens = collectExtractedNouns(stream); 54 | 55 | stream.close(); 56 | 57 | verify(nouns, extractedTokens); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- 
/korean-analyzer-3.x/src/test/java/com/tistory/devyongsik/analyzer/util/AnalyzerTestUtil.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer.util; 2 | 3 | import java.io.IOException; 4 | import java.util.List; 5 | 6 | import org.apache.lucene.analysis.TokenStream; 7 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 8 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 9 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 10 | 11 | import com.google.common.collect.Lists; 12 | 13 | import junit.framework.Assert; 14 | 15 | 16 | public class AnalyzerTestUtil { 17 | protected TestToken getToken(String term, int start, int end) { 18 | TestToken t = new TestToken(); 19 | t.setTerm(term); 20 | t.setStartOffset(start); 21 | t.setEndOffset(end); 22 | 23 | return t; 24 | } 25 | 26 | protected void verify(List expactedTokens, List extractedTokens) { 27 | 28 | for(TestToken testToken : expactedTokens) { 29 | Assert.assertTrue("[" + testToken + "] is expacted but not.", extractedTokens.contains(testToken)); 30 | } 31 | } 32 | 33 | protected List collectExtractedNouns(TokenStream stream) throws IOException { 34 | CharTermAttribute charTermAtt = stream.addAttribute(CharTermAttribute.class); 35 | OffsetAttribute offSetAtt = stream.addAttribute(OffsetAttribute.class); 36 | TypeAttribute typeAttr = stream.addAttribute(TypeAttribute.class); 37 | 38 | List extractedTokens = Lists.newArrayList(); 39 | 40 | while(stream.incrementToken()) { 41 | TestToken t = getToken(charTermAtt.toString(), offSetAtt.startOffset(), offSetAtt.endOffset()); 42 | 43 | System.out.println("termAtt.term() : " + charTermAtt.toString()); 44 | System.out.println("startoffSetAtt : " + offSetAtt.startOffset()); 45 | System.out.println("endoffSetAtt : " + offSetAtt.endOffset()); 46 | System.out.println("typeAttr : " + typeAttr.toString()); 47 | 48 | extractedTokens.add(t); 49 | } 50 | 
51 | return extractedTokens; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/test/java/com/tistory/devyongsik/analyzer/util/AnalyzerTestUtil.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer.util; 2 | 3 | import java.io.IOException; 4 | import java.util.List; 5 | 6 | import org.apache.lucene.analysis.TokenStream; 7 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 8 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 9 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 10 | 11 | import com.google.common.collect.Lists; 12 | 13 | import junit.framework.Assert; 14 | 15 | 16 | public class AnalyzerTestUtil { 17 | protected TestToken getToken(String term, int start, int end) { 18 | TestToken t = new TestToken(); 19 | t.setTerm(term); 20 | t.setStartOffset(start); 21 | t.setEndOffset(end); 22 | 23 | return t; 24 | } 25 | 26 | protected void verify(List expactedTokens, List extractedTokens) { 27 | 28 | for(TestToken testToken : expactedTokens) { 29 | Assert.assertTrue("[" + testToken + "] is expacted but not.", extractedTokens.contains(testToken)); 30 | } 31 | } 32 | 33 | protected List collectExtractedNouns(TokenStream stream) throws IOException { 34 | CharTermAttribute charTermAtt = stream.addAttribute(CharTermAttribute.class); 35 | OffsetAttribute offSetAtt = stream.addAttribute(OffsetAttribute.class); 36 | TypeAttribute typeAttr = stream.addAttribute(TypeAttribute.class); 37 | 38 | List extractedTokens = Lists.newArrayList(); 39 | 40 | while(stream.incrementToken()) { 41 | TestToken t = getToken(charTermAtt.toString(), offSetAtt.startOffset(), offSetAtt.endOffset()); 42 | 43 | System.out.println("termAtt.term() : " + charTermAtt.toString()); 44 | System.out.println("startoffSetAtt : " + offSetAtt.startOffset()); 45 | System.out.println("endoffSetAtt : " + 
offSetAtt.endOffset()); 46 | System.out.println("typeAttr : " + typeAttr.toString()); 47 | 48 | extractedTokens.add(t); 49 | } 50 | 51 | return extractedTokens; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/com/tistory/devyongsik/analyzer/KoreanNounFilter.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.Collections; 6 | import java.util.HashMap; 7 | import java.util.List; 8 | import java.util.Map; 9 | 10 | import org.apache.lucene.analysis.TokenFilter; 11 | import org.apache.lucene.analysis.TokenStream; 12 | import org.slf4j.Logger; 13 | import org.slf4j.LoggerFactory; 14 | 15 | public class KoreanNounFilter extends TokenFilter { 16 | private Logger logger = LoggerFactory.getLogger(KoreanNounFilter.class); 17 | 18 | private List comparableStateList = new ArrayList(); 19 | private List engines; 20 | private Map returnedTokens = new HashMap(); 21 | 22 | protected KoreanNounFilter(TokenStream input, List engines) { 23 | super(input); 24 | this.engines = engines; 25 | } 26 | 27 | @Override 28 | public final boolean incrementToken() throws IOException { 29 | 30 | 31 | if(logger.isDebugEnabled()) 32 | logger.debug("incrementToken KoreanNounFilter"); 33 | 34 | if(engines == null) { 35 | throw new IllegalStateException("KoreanNounFilter의 engines가 Null입니다."); 36 | } 37 | 38 | 39 | if (comparableStateList.size() > 0) { 40 | if(logger.isDebugEnabled()) 41 | logger.debug("명사 Stack에서 토큰 리턴함"); 42 | 43 | ComparableState comparableState = comparableStateList.get(0); 44 | comparableStateList.remove(0); 45 | State synState = comparableState.getState(); 46 | restoreState(synState); 47 | 48 | return true; 49 | } 50 | 51 | if (!input.incrementToken()) 52 | return false; 53 | 54 | try { 55 | 56 | for(Engine engine : engines) { 57 
| engine.collectNounState(input.cloneAttributes(), comparableStateList , returnedTokens); 58 | } 59 | 60 | returnedTokens.clear(); 61 | Collections.sort(comparableStateList); //startoffset이 순서대로 나오도록... 62 | 63 | } catch (Exception e) { 64 | logger.error("명사필터에서 목록 조회 오류"); 65 | e.printStackTrace(); 66 | } 67 | 68 | return true; 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/com/tistory/devyongsik/analyzer/KoreanAnalyzer.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.Reader; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | 7 | import org.apache.lucene.analysis.Analyzer; 8 | import org.apache.lucene.analysis.TokenStream; 9 | import org.apache.lucene.analysis.Tokenizer; 10 | import org.apache.lucene.util.Version; 11 | 12 | /** 13 | * @author need4spd, need4spd@cplanet.co.kr, 2011. 8. 31. 
14 | * 15 | */ 16 | public class KoreanAnalyzer extends Analyzer { 17 | 18 | private boolean isIndexingMode = false; 19 | 20 | public KoreanAnalyzer() { 21 | isIndexingMode = true; 22 | } 23 | 24 | public KoreanAnalyzer(boolean isIndexingMode) { 25 | this.isIndexingMode = isIndexingMode; 26 | } 27 | 28 | @Override 29 | protected TokenStreamComponents createComponents(final String fieldName, 30 | final Reader reader) { 31 | 32 | if(isIndexingMode) { 33 | List nounExtractEngines = new ArrayList(); 34 | nounExtractEngines.add(new KoreanCompoundNounEngine()); 35 | nounExtractEngines.add(new KoreanBaseNounEngine()); 36 | nounExtractEngines.add(new KoreanLongestNounEngine()); 37 | nounExtractEngines.add(new KoreanSynonymEngine()); 38 | nounExtractEngines.add(new KoreanMorphEngine()); 39 | 40 | Tokenizer tokenizer = new KoreanCharacterTokenizer(Version.LUCENE_44, reader); 41 | TokenStream tok = new KoreanNounFilter(tokenizer, nounExtractEngines); 42 | tok = new KoreanStopFilter(tok); 43 | 44 | return new TokenStreamComponents(tokenizer, tok); 45 | } else { 46 | List nounExtractEngines = new ArrayList(); 47 | nounExtractEngines.add(new KoreanCompoundNounEngine()); 48 | nounExtractEngines.add(new KoreanLongestNounEngine()); 49 | nounExtractEngines.add(new KoreanSynonymEngine()); 50 | nounExtractEngines.add(new KoreanMorphEngine()); 51 | 52 | Tokenizer tokenizer = new KoreanCharacterTokenizer(Version.LUCENE_44, reader); 53 | TokenStream tok = new KoreanNounFilter(tokenizer, nounExtractEngines); 54 | tok = new KoreanStopFilter(tok); 55 | 56 | return new TokenStreamComponents(tokenizer, tok); 57 | } 58 | } 59 | 60 | } -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/com/tistory/devyongsik/analyzer/KoreanAnalyzer.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.Reader; 4 | import java.util.ArrayList; 5 | 
import java.util.List; 6 | 7 | import org.apache.lucene.analysis.ReusableAnalyzerBase; 8 | import org.apache.lucene.analysis.TokenStream; 9 | import org.apache.lucene.analysis.Tokenizer; 10 | import org.apache.lucene.util.Version; 11 | 12 | /** 13 | * @author need4spd, need4spd@cplanet.co.kr, 2011. 8. 31. 14 | * 15 | */ 16 | public class KoreanAnalyzer extends ReusableAnalyzerBase { 17 | 18 | private boolean isIndexingMode = false; 19 | 20 | public KoreanAnalyzer() { 21 | isIndexingMode = true; 22 | } 23 | 24 | public KoreanAnalyzer(boolean isIndexingMode) { 25 | this.isIndexingMode = isIndexingMode; 26 | } 27 | 28 | @Override 29 | protected TokenStreamComponents createComponents(final String fieldName, 30 | final Reader reader) { 31 | 32 | if(isIndexingMode) { 33 | List nounExtractEngines = new ArrayList(); 34 | nounExtractEngines.add(new KoreanCompoundNounEngine()); 35 | nounExtractEngines.add(new KoreanBaseNounEngine()); 36 | nounExtractEngines.add(new KoreanLongestNounEngine()); 37 | nounExtractEngines.add(new KoreanSynonymEngine()); 38 | nounExtractEngines.add(new KoreanMorphEngine()); 39 | 40 | Tokenizer tokenizer = new KoreanCharacterTokenizer(Version.LUCENE_36, reader); 41 | TokenStream tok = new KoreanNounFilter(tokenizer, nounExtractEngines); 42 | tok = new KoreanStopFilter(tok); 43 | 44 | return new TokenStreamComponents(tokenizer, tok); 45 | } else { 46 | List nounExtractEngines = new ArrayList(); 47 | nounExtractEngines.add(new KoreanCompoundNounEngine()); 48 | nounExtractEngines.add(new KoreanLongestNounEngine()); 49 | nounExtractEngines.add(new KoreanSynonymEngine()); 50 | nounExtractEngines.add(new KoreanMorphEngine()); 51 | 52 | Tokenizer tokenizer = new KoreanCharacterTokenizer(Version.LUCENE_36, reader); 53 | TokenStream tok = new KoreanNounFilter(tokenizer, nounExtractEngines); 54 | tok = new KoreanStopFilter(tok); 55 | 56 | return new TokenStreamComponents(tokenizer, tok); 57 | } 58 | } 59 | 60 | } 
-------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/morph/SpaceOutput.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.morph; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | /** 7 | * 공백을 분석한 결과를 저장한다. 8 | * @author smlee 9 | * 10 | */ 11 | public class SpaceOutput { 12 | 13 | // 분석된 결과 14 | private AnalysisOutput output; 15 | 16 | // 분석 결과 앞에 있는 미등록어, 사람 이름은 대부분 이런 경우임. 17 | private List nrWords = new ArrayList(); 18 | 19 | // 분석하기 이전의 어절 20 | private String source; 21 | 22 | public void initialize() { 23 | output = null; 24 | nrWords = new ArrayList(); 25 | source = null; 26 | } 27 | 28 | /** 29 | * @return the output 30 | */ 31 | public AnalysisOutput getOutput() { 32 | return output; 33 | } 34 | 35 | /** 36 | * @param output the output to set 37 | */ 38 | public void setOutput(AnalysisOutput output) { 39 | this.output = output; 40 | } 41 | 42 | /** 43 | * @return the nrWord 44 | */ 45 | public List getNRWords() { 46 | return nrWords; 47 | } 48 | 49 | /** 50 | * @param nrWord the nrWord to set 51 | */ 52 | public void setNRWords(List words) { 53 | this.nrWords = words; 54 | } 55 | 56 | /** 57 | * 58 | * @param word 59 | */ 60 | public void addNRWord(String word) { 61 | addNRWord(word, AnalysisOutput.SCORE_CORRECT); 62 | } 63 | 64 | /** 65 | * 66 | * @param word 67 | * @param score 68 | */ 69 | public void addNRWord(String word, int score) { 70 | AnalysisOutput output = new AnalysisOutput(word,null,null,PatternConstants.PTN_N,score); 71 | output.setSource(word); 72 | output.setPos(PatternConstants.POS_NOUN); 73 | this.nrWords.add(0,output); 74 | } 75 | 76 | /** 77 | * @return the source 78 | */ 79 | public String getSource() { 80 | return source; 81 | } 82 | 83 | /** 84 | * @param source the source to set 85 | */ 86 | public void setSource(String source) { 87 | 
this.source = source; 88 | } 89 | 90 | /** 91 | * 분석된 전체 단어의 길이를 반환한다. 92 | * @return 93 | */ 94 | public int getLength() { 95 | 96 | if(this.source ==null) return 0; 97 | 98 | return this.source.length(); 99 | } 100 | 101 | } 102 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/test/java/com/tistory/devyongsik/analyzer/KoreanCompoundNounEngineTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.StringReader; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | import java.util.Map; 7 | 8 | import org.apache.lucene.analysis.TokenStream; 9 | import org.junit.Before; 10 | import org.junit.Test; 11 | 12 | import com.google.common.collect.Lists; 13 | import com.google.common.collect.Maps; 14 | import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory; 15 | import com.tistory.devyongsik.analyzer.util.AnalyzerTestUtil; 16 | import com.tistory.devyongsik.analyzer.util.TestToken; 17 | 18 | public class KoreanCompoundNounEngineTest extends AnalyzerTestUtil { 19 | private List compondNouns = Lists.newArrayList(); 20 | private StringReader reader = new StringReader("월드컵조직위원회분과위"); 21 | private List engines = new ArrayList(); 22 | private DictionaryFactory dictionaryFactory; 23 | 24 | @Before 25 | public void initDictionary() { 26 | compondNouns.add(getToken("분과위", 8, 11)); 27 | compondNouns.add(getToken("위원회", 5, 8)); 28 | compondNouns.add(getToken("조직", 3, 5)); 29 | compondNouns.add(getToken("월드컵", 0, 3)); 30 | compondNouns.add(getToken("월드컵조직위원회분과위", 0, 11)); 31 | 32 | dictionaryFactory = DictionaryFactory.getFactory(); 33 | } 34 | 35 | @Test 36 | public void testCompoundNounExtract() throws Exception { 37 | Map> compoundNounDictionaryMap = Maps.newHashMap(); 38 | List compoundList = Lists.newArrayList(); 39 | compoundList.add("분과위"); 40 | compoundList.add("위원회"); 41 | 
compoundList.add("조직"); 42 | compoundList.add("월드컵"); 43 | 44 | compoundNounDictionaryMap.put("월드컵조직위원회분과위", compoundList); 45 | 46 | dictionaryFactory.setCompoundDictionaryMap(compoundNounDictionaryMap); 47 | 48 | createEngines(); 49 | 50 | TokenStream stream = new KoreanNounFilter(new KoreanCharacterTokenizer(reader), engines); 51 | 52 | stream.reset(); 53 | 54 | List extractedTokens = collectExtractedNouns(stream); 55 | 56 | stream.close(); 57 | 58 | verify(compondNouns, extractedTokens); 59 | } 60 | 61 | private void createEngines() { 62 | engines.add(new KoreanCompoundNounEngine()); 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/test/java/com/tistory/devyongsik/analyzer/KoreanCompoundNounEngineTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.StringReader; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | import java.util.Map; 7 | 8 | import org.apache.lucene.analysis.TokenStream; 9 | import org.junit.Before; 10 | import org.junit.Test; 11 | 12 | import com.google.common.collect.Lists; 13 | import com.google.common.collect.Maps; 14 | import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory; 15 | import com.tistory.devyongsik.analyzer.util.AnalyzerTestUtil; 16 | import com.tistory.devyongsik.analyzer.util.TestToken; 17 | 18 | public class KoreanCompoundNounEngineTest extends AnalyzerTestUtil { 19 | private List compondNouns = Lists.newArrayList(); 20 | private StringReader reader = new StringReader("월드컵조직위원회분과위"); 21 | private List engines = new ArrayList(); 22 | private DictionaryFactory dictionaryFactory; 23 | 24 | @Before 25 | public void initDictionary() { 26 | compondNouns.add(getToken("분과위", 8, 11)); 27 | compondNouns.add(getToken("위원회", 5, 8)); 28 | compondNouns.add(getToken("조직", 3, 5)); 29 | compondNouns.add(getToken("월드컵", 0, 3)); 30 | 
compondNouns.add(getToken("월드컵조직위원회분과위", 0, 11)); 31 | 32 | dictionaryFactory = DictionaryFactory.getFactory(); 33 | } 34 | 35 | @Test 36 | public void testCompoundNounExtract() throws Exception { 37 | Map> compoundNounDictionaryMap = Maps.newHashMap(); 38 | List compoundList = Lists.newArrayList(); 39 | compoundList.add("분과위"); 40 | compoundList.add("위원회"); 41 | compoundList.add("조직"); 42 | compoundList.add("월드컵"); 43 | 44 | compoundNounDictionaryMap.put("월드컵조직위원회분과위", compoundList); 45 | 46 | dictionaryFactory.setCompoundDictionaryMap(compoundNounDictionaryMap); 47 | 48 | createEngines(); 49 | 50 | TokenStream stream = new KoreanNounFilter(new KoreanCharacterTokenizer(reader), engines); 51 | 52 | stream.reset(); 53 | 54 | List extractedTokens = collectExtractedNouns(stream); 55 | 56 | stream.close(); 57 | 58 | verify(compondNouns, extractedTokens); 59 | } 60 | 61 | private void createEngines() { 62 | engines.add(new KoreanCompoundNounEngine()); 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/morph/CompoundEntry.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.lucene.analysis.kr.morph; 19 | 20 | /** 21 | * 복합명사의 개별단어에 대한 정보를 담고있는 클래스 22 | * @author S.M.Lee 23 | * 24 | */ 25 | public class CompoundEntry { 26 | 27 | private String word; 28 | 29 | private int offset = -1; 30 | 31 | private boolean exist = true; 32 | 33 | private char pos = PatternConstants.POS_NOUN; 34 | 35 | public CompoundEntry() { 36 | 37 | } 38 | 39 | public CompoundEntry(String w) { 40 | this.word = w; 41 | } 42 | 43 | public CompoundEntry(String w,int o) { 44 | this(w); 45 | this.offset = o; 46 | } 47 | 48 | public CompoundEntry(String w,int o, boolean is) { 49 | this(w,o); 50 | this.exist = is; 51 | } 52 | 53 | public CompoundEntry(String w,int o, boolean is, char p) { 54 | this(w,o,is); 55 | this.pos = p; 56 | } 57 | 58 | public void setWord(String w) { 59 | this.word = w; 60 | } 61 | 62 | public void setOffset(int o) { 63 | this.offset = o; 64 | } 65 | 66 | public String getWord() { 67 | return this.word; 68 | } 69 | 70 | public int getOffset() { 71 | return this.offset; 72 | } 73 | 74 | public boolean isExist() { 75 | return exist; 76 | } 77 | 78 | public void setExist(boolean is) { 79 | this.exist = is; 80 | } 81 | 82 | public char getPos() { 83 | return pos; 84 | } 85 | 86 | public void setPos(char pos) { 87 | this.pos = pos; 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/test/java/com/tistory/devyongsik/analyzer/KoreanCharacterTokenizerTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | 4 | 5 | import java.io.IOException; 6 | import java.io.StringReader; 7 | import java.util.HashSet; 8 | import java.util.Set; 9 | 10 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 11 | import 
org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 12 | import org.junit.Assert; 13 | import org.junit.Before; 14 | import org.junit.Test; 15 | 16 | import com.tistory.devyongsik.analyzer.util.AnalyzerTestUtil; 17 | import com.tistory.devyongsik.analyzer.util.TestToken; 18 | 19 | /** 20 | * 21 | * @author 장용석, 2011.07.16 need4spd@naver.com 22 | */ 23 | 24 | public class KoreanCharacterTokenizerTest extends AnalyzerTestUtil { 25 | 26 | private Set tokenizedToken = new HashSet(); 27 | private StringReader content = new StringReader("삼성전자absc1234엠피3mp3버전1.2 띄어쓰기"); 28 | private KoreanCharacterTokenizer tokenizer = new KoreanCharacterTokenizer(content); 29 | 30 | @Before 31 | public void setUp() throws IOException { 32 | tokenizedToken.add(getToken("띄어쓰기", 25, 29)); 33 | tokenizedToken.add(getToken("2", 22, 23)); 34 | tokenizedToken.add(getToken("1", 20, 21)); 35 | tokenizedToken.add(getToken("버전", 18, 20)); 36 | tokenizedToken.add(getToken("3",17, 18)); 37 | tokenizedToken.add(getToken("mp", 15, 17)); 38 | tokenizedToken.add(getToken("3", 14, 15)); 39 | tokenizedToken.add(getToken("엠피", 12, 14)); 40 | tokenizedToken.add(getToken("1234", 8, 12)); 41 | tokenizedToken.add(getToken("absc", 4, 8)); 42 | tokenizedToken.add(getToken("삼성전자", 0, 4)); 43 | 44 | tokenizer.reset(); 45 | } 46 | 47 | @Test 48 | public void testIncrementToken() throws IOException { 49 | CharTermAttribute charTermAtt = tokenizer.getAttribute(CharTermAttribute.class); 50 | OffsetAttribute offSetAtt = tokenizer.getAttribute(OffsetAttribute.class); 51 | 52 | while(tokenizer.incrementToken()) { 53 | TestToken t = getToken(charTermAtt.toString(), offSetAtt.startOffset(), offSetAtt.endOffset()); 54 | System.out.println("termAtt.term() : " + charTermAtt.toString()); 55 | System.out.println("offSetAtt : " + offSetAtt.startOffset()); 56 | System.out.println("offSetAtt : " + offSetAtt.endOffset()); 57 | 58 | Assert.assertTrue(tokenizedToken.contains(t)); 59 | } 60 | } 61 | } 62 | 
-------------------------------------------------------------------------------- /korean-analyzer-4.x/src/test/java/com/tistory/devyongsik/analyzer/KoreanCharacterTokenizerTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | 4 | 5 | import java.io.IOException; 6 | import java.io.StringReader; 7 | import java.util.HashSet; 8 | import java.util.Set; 9 | 10 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 11 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 12 | import org.junit.Assert; 13 | import org.junit.Before; 14 | import org.junit.Test; 15 | 16 | import com.tistory.devyongsik.analyzer.util.AnalyzerTestUtil; 17 | import com.tistory.devyongsik.analyzer.util.TestToken; 18 | 19 | /** 20 | * 21 | * @author 장용석, 2011.07.16 need4spd@naver.com 22 | */ 23 | 24 | public class KoreanCharacterTokenizerTest extends AnalyzerTestUtil { 25 | 26 | private Set tokenizedToken = new HashSet(); 27 | private StringReader content = new StringReader("삼성전자absc1234엠피3mp3버전1.2 띄어쓰기"); 28 | private KoreanCharacterTokenizer tokenizer = new KoreanCharacterTokenizer(content); 29 | 30 | @Before 31 | public void setUp() throws IOException { 32 | tokenizedToken.add(getToken("띄어쓰기", 25, 29)); 33 | tokenizedToken.add(getToken("2", 22, 23)); 34 | tokenizedToken.add(getToken("1", 20, 21)); 35 | tokenizedToken.add(getToken("버전", 18, 20)); 36 | tokenizedToken.add(getToken("3",17, 18)); 37 | tokenizedToken.add(getToken("mp", 15, 17)); 38 | tokenizedToken.add(getToken("3", 14, 15)); 39 | tokenizedToken.add(getToken("엠피", 12, 14)); 40 | tokenizedToken.add(getToken("1234", 8, 12)); 41 | tokenizedToken.add(getToken("absc", 4, 8)); 42 | tokenizedToken.add(getToken("삼성전자", 0, 4)); 43 | 44 | tokenizer.reset(); 45 | } 46 | 47 | @Test 48 | public void testIncrementToken() throws IOException { 49 | CharTermAttribute charTermAtt = tokenizer.getAttribute(CharTermAttribute.class); 50 
| OffsetAttribute offSetAtt = tokenizer.getAttribute(OffsetAttribute.class); 51 | 52 | while(tokenizer.incrementToken()) { 53 | TestToken t = getToken(charTermAtt.toString(), offSetAtt.startOffset(), offSetAtt.endOffset()); 54 | System.out.println("termAtt.term() : " + charTermAtt.toString()); 55 | System.out.println("offSetAtt : " + offSetAtt.startOffset()); 56 | System.out.println("offSetAtt : " + offSetAtt.endOffset()); 57 | 58 | Assert.assertTrue(tokenizedToken.contains(t)); 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/morph/WSAOutput.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.lucene.analysis.kr.morph; 18 | 19 | import java.util.ArrayList; 20 | import java.util.List; 21 | 22 | public class WSAOutput { 23 | 24 | private String source; 25 | 26 | private List results; 27 | 28 | private int wds = 0; 29 | 30 | private int end = 0; 31 | 32 | public WSAOutput() { 33 | results = new ArrayList(); 34 | } 35 | 36 | public WSAOutput(String src) { 37 | source = src; 38 | results = new ArrayList(); 39 | } 40 | 41 | public WSAOutput(String src, List list) { 42 | source = src; 43 | results = list; 44 | } 45 | 46 | public String getSource() { 47 | return source; 48 | } 49 | 50 | public void setSource(String source) { 51 | this.source = source; 52 | } 53 | 54 | public List getResults() { 55 | return results; 56 | } 57 | 58 | public void setResults(List results) { 59 | this.results = results; 60 | } 61 | 62 | public void addNounResults(String word) { 63 | addNounResults(word, null); 64 | } 65 | 66 | public void addNounResults(String word, String end) { 67 | addNounResults(word, end, AnalysisOutput.SCORE_ANALYSIS); 68 | } 69 | 70 | public void addNounResults(String word, String end, int score) { 71 | 72 | AnalysisOutput output = new AnalysisOutput(word, end, null, PatternConstants.PTN_NJ); 73 | if(end==null) output.setPatn(PatternConstants.PTN_N); 74 | 75 | output.setPos(PatternConstants.POS_NOUN); 76 | output.setScore(score); 77 | 78 | this.results.add(output); 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/com/tistory/devyongsik/analyzer/KoreanStopFilter.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | 6 | import org.apache.lucene.analysis.TokenFilter; 7 | import org.apache.lucene.analysis.TokenStream; 8 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 9 | import 
org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 10 | import org.slf4j.Logger; 11 | import org.slf4j.LoggerFactory; 12 | 13 | import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory; 14 | 15 | public class KoreanStopFilter extends TokenFilter { 16 | 17 | private boolean enablePositionIncrements = false; 18 | 19 | private CharTermAttribute charTermAtt; 20 | private PositionIncrementAttribute posIncrAtt; 21 | private Logger logger = LoggerFactory.getLogger(KoreanStopFilter.class); 22 | private static Map stopWordsMap = null; 23 | 24 | protected KoreanStopFilter(TokenStream input) { 25 | super(input); 26 | 27 | if(logger.isInfoEnabled()) { 28 | logger.info("init KoreanStopFilter"); 29 | } 30 | charTermAtt = getAttribute(CharTermAttribute.class); 31 | posIncrAtt = getAttribute(PositionIncrementAttribute.class); 32 | 33 | DictionaryFactory dictionaryFactory = DictionaryFactory.getFactory(); 34 | stopWordsMap = dictionaryFactory.getStopWordDictionaryMap(); 35 | } 36 | 37 | public void setEnablePositionIncrements(boolean enable) { 38 | this.enablePositionIncrements = enable; 39 | } 40 | 41 | public boolean getEnablePositionIncrements() { 42 | return enablePositionIncrements; 43 | } 44 | 45 | @Override 46 | public final boolean incrementToken() throws IOException { 47 | 48 | if(logger.isDebugEnabled()) 49 | logger.debug("incrementToken KoreanStopFilter"); 50 | 51 | 52 | // return the first non-stop word found 53 | int skippedPositions = 0; 54 | 55 | while(input.incrementToken()) { 56 | 57 | if(logger.isDebugEnabled()) 58 | logger.debug("원래 리턴 될 TermAtt : " + charTermAtt.toString() + " , stopWordDic.isExist : " + stopWordsMap.containsKey(charTermAtt.toString())); 59 | 60 | if(!stopWordsMap.containsKey(charTermAtt.toString())) { 61 | if(enablePositionIncrements) { 62 | posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); 63 | } 64 | 65 | return true; 66 | } 67 | 68 | skippedPositions += 
posIncrAtt.getPositionIncrement(); 69 | } 70 | 71 | return false; 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/com/tistory/devyongsik/analyzer/KoreanStopFilter.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | 6 | import org.apache.lucene.analysis.TokenFilter; 7 | import org.apache.lucene.analysis.TokenStream; 8 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 9 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 10 | import org.slf4j.Logger; 11 | import org.slf4j.LoggerFactory; 12 | 13 | import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory; 14 | 15 | public class KoreanStopFilter extends TokenFilter { 16 | 17 | private boolean enablePositionIncrements = false; 18 | 19 | private CharTermAttribute charTermAtt; 20 | private PositionIncrementAttribute posIncrAtt; 21 | private Logger logger = LoggerFactory.getLogger(KoreanStopFilter.class); 22 | private static Map stopWordsMap = null; 23 | 24 | protected KoreanStopFilter(TokenStream input) { 25 | super(input); 26 | 27 | if(logger.isInfoEnabled()) { 28 | logger.info("init KoreanStopFilter"); 29 | } 30 | charTermAtt = getAttribute(CharTermAttribute.class); 31 | posIncrAtt = getAttribute(PositionIncrementAttribute.class); 32 | 33 | DictionaryFactory dictionaryFactory = DictionaryFactory.getFactory(); 34 | stopWordsMap = dictionaryFactory.getStopWordDictionaryMap(); 35 | } 36 | 37 | public void setEnablePositionIncrements(boolean enable) { 38 | this.enablePositionIncrements = enable; 39 | } 40 | 41 | public boolean getEnablePositionIncrements() { 42 | return enablePositionIncrements; 43 | } 44 | 45 | @Override 46 | public final boolean incrementToken() throws IOException { 47 | 48 | if(logger.isDebugEnabled()) 49 | 
logger.debug("incrementToken KoreanStopFilter"); 50 | 51 | 52 | // return the first non-stop word found 53 | int skippedPositions = 0; 54 | 55 | while(input.incrementToken()) { 56 | 57 | if(logger.isDebugEnabled()) 58 | logger.debug("원래 리턴 될 TermAtt : " + charTermAtt.toString() + " , stopWordDic.isExist : " + stopWordsMap.containsKey(charTermAtt.toString())); 59 | 60 | if(!stopWordsMap.containsKey(charTermAtt.toString())) { 61 | if(enablePositionIncrements) { 62 | posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); 63 | } 64 | 65 | return true; 66 | } 67 | 68 | skippedPositions += posIncrAtt.getPositionIncrement(); 69 | } 70 | 71 | return false; 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/utils/HanjaUtils.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.lucene.analysis.kr.utils; 18 | 19 | import java.io.IOException; 20 | 21 | import java.util.ArrayList; 22 | import java.util.HashMap; 23 | import java.util.List; 24 | import java.util.Map; 25 | 26 | import org.apache.lucene.analysis.kr.morph.MorphException; 27 | 28 | public class HanjaUtils { 29 | 30 | private static Map mapHanja; 31 | 32 | public synchronized static void loadDictionary() throws MorphException { 33 | try { 34 | List strList = FileUtil.readLines("org/apache/lucene/analysis/kr/dic/mapHanja.dic","UTF-8"); 35 | mapHanja = new HashMap(); 36 | 37 | for(int i=0;i0x9FFF||hanja<0x3400) return new char[]{hanja}; 67 | 68 | char[] result = mapHanja.get(new String(new char[]{hanja})); 69 | if(result==null) return new char[]{hanja}; 70 | 71 | return result; 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'java' 2 | apply plugin: 'eclipse' 3 | apply plugin: 'eclipse-wtp' 4 | apply plugin: 'maven' 5 | 6 | 7 | sourceCompatibility = 1.7 8 | 9 | group = 'com.tistory.devyongsik' 10 | version = '0.6-SNAPSHOT' 11 | 12 | repositories { 13 | mavenCentral() 14 | } 15 | 16 | def versions = [ 17 | lucene : "3.6.2" 18 | ] 19 | 20 | dependencies { 21 | compile( 22 | [group: 'junit', name: 'junit', version: '4.4'], 23 | [group: 'org.apache.lucene', name: 'lucene-core', version: "${versions.lucene}"], 24 | [group: 'org.slf4j', name: 'slf4j-api', version: '1.6.6'], 25 | [group: 'org.slf4j', name: 'slf4j-simple', version: '1.6.6'], 26 | [group: 'ch.qos.logback', name: 'logback-core', version: '1.0.13'], 27 | [group: 'ch.qos.logback', name: 'logback-classic', version: '1.0.13'], 28 | [group: 'org.codehaus.groovy', name: 'groovy-all', version: '2.1.6'], 29 | [group: 'com.google.guava', name: 'guava', version: 'r09'] 30 | ) 31 | 32 | testRuntime( 33 | [group: 'junit', 
name: 'junit', version: '4.4'], 34 | [group: 'org.apache.lucene', name: 'lucene-core', version: "${versions.lucene}"], 35 | [group: 'org.slf4j', name: 'slf4j-api', version: '1.6.6'], 36 | [group: 'org.slf4j', name: 'slf4j-simple', version: '1.6.6'], 37 | [group: 'ch.qos.logback', name: 'logback-core', version: '1.0.13'], 38 | [group: 'ch.qos.logback', name: 'logback-classic', version: '1.0.13'], 39 | [group: 'org.codehaus.groovy', name: 'groovy-all', version: '2.1.6'], 40 | [group: 'com.google.guava', name: 'guava', version: 'r09'] 41 | ) 42 | } 43 | 44 | test { 45 | jvmArgs = ['-ea', '-Xmx256m'] 46 | logging.captureStandardOutput(LogLevel.INFO) 47 | } 48 | 49 | task copyDictionary(type: Copy) { 50 | from 'src/main/java' 51 | into 'target/classes/main' 52 | include '**/*.properties' 53 | include '**/*.dic' 54 | include '**/*.jflex' 55 | include '**/*.txt' 56 | 57 | includeEmptyDirs = false 58 | } 59 | 60 | eclipse { 61 | classpath { 62 | downloadSources=true 63 | } 64 | 65 | jdt { 66 | file { 67 | withProperties { 68 | properties -> properties.setProperty("encoding//src/main/java", "utf-8") 69 | properties.setProperty("encoding//src/main/resources", "utf-8") 70 | properties.setProperty("encoding//src/test/java", "utf-8") 71 | properties.setProperty("encoding//src/test/resources", "utf-8") 72 | } 73 | } 74 | } 75 | } 76 | 77 | uploadArchives { 78 | repositories.mavenDeployer { 79 | repository(url: "file:///Users/need4spd/Programming/need4spd-maven-repo/snapshots") 80 | //repository(url: "file:///Programming/Java/need4spd-maven-repo/snapshots") 81 | } 82 | } 83 | 84 | tasks.test.dependsOn copyDictionary 85 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/morph/WordEntry.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.morph; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) 
under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import java.util.ArrayList; 21 | import java.util.List; 22 | 23 | public class WordEntry { 24 | 25 | public static final int IDX_NOUN = 0; 26 | public static final int IDX_VERB = 1; 27 | public static final int IDX_BUSA = 2; 28 | public static final int IDX_DOV = 3; 29 | public static final int IDX_BEV = 4; 30 | public static final int IDX_NE = 5; 31 | public static final int IDX_ADJ = 6; // 형용사 32 | public static final int IDX_NPR = 7; // 명사의 분류 (M:Measure) 33 | public static final int IDX_CNOUNX = 8; 34 | public static final int IDX_REGURA = 9; 35 | 36 | /** 37 | * 단어 38 | */ 39 | private String word; 40 | 41 | /** 42 | * 단어특성 43 | */ 44 | private char[] features; 45 | 46 | private List compounds = new ArrayList(); 47 | 48 | public WordEntry() { 49 | 50 | } 51 | 52 | public WordEntry(String word) { 53 | this.word = word; 54 | } 55 | 56 | public WordEntry(String word, char[] cs) { 57 | this.word = word; 58 | this.features = cs; 59 | } 60 | 61 | public WordEntry(String word, List c) { 62 | this.word = word; 63 | this.compounds = c; 64 | } 65 | 66 | public void setWord(String w) { 67 | this.word = w; 68 | } 69 | 70 | public String getWord() { 71 | return this.word; 72 | } 73 | 74 
| public void setFeatures(char[] cs) { 75 | this.features = cs; 76 | } 77 | 78 | public char getFeature(int index) { 79 | if(features==null||features.length c) { 88 | this.compounds = c; 89 | } 90 | 91 | public List getCompounds() { 92 | return this.compounds; 93 | } 94 | 95 | } 96 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/morph/WordEntry.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.lucene.analysis.kr.morph; 18 | 19 | 20 | import java.util.ArrayList; 21 | import java.util.List; 22 | 23 | public class WordEntry { 24 | 25 | public static final int IDX_NOUN = 0; 26 | public static final int IDX_VERB = 1; 27 | public static final int IDX_BUSA = 2; 28 | public static final int IDX_DOV = 3; 29 | public static final int IDX_BEV = 4; 30 | public static final int IDX_NE = 5; 31 | public static final int IDX_ADJ = 6; // 형용사 32 | public static final int IDX_NPR = 7; // 명사의 분류 (M:Measure) 33 | public static final int IDX_CNOUNX = 8; 34 | public static final int IDX_REGURA = 9; 35 | 36 | /** 37 | * 단어 38 | */ 39 | private String word; 40 | 41 | /** 42 | * 단어특성 43 | */ 44 | private char[] features; 45 | 46 | private List compounds = new ArrayList(); 47 | 48 | public WordEntry() { 49 | 50 | } 51 | 52 | public WordEntry(String word) { 53 | this.word = word; 54 | } 55 | 56 | public WordEntry(String word, char[] cs) { 57 | this.word = word; 58 | this.features = cs; 59 | } 60 | 61 | public WordEntry(String word, List c) { 62 | this.word = word; 63 | this.compounds = c; 64 | } 65 | 66 | public void setWord(String w) { 67 | this.word = w; 68 | } 69 | 70 | public String getWord() { 71 | return this.word; 72 | } 73 | 74 | public void setFeatures(char[] cs) { 75 | this.features = cs; 76 | } 77 | 78 | public char getFeature(int index) { 79 | if(features==null||features.length c) { 88 | this.compounds = c; 89 | } 90 | 91 | public List getCompounds() { 92 | return this.compounds; 93 | } 94 | 95 | } 96 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/com/tistory/devyongsik/analyzer/DictionaryProperties.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.util.Properties; 6 | 7 | import org.slf4j.Logger; 8 | 
import org.slf4j.LoggerFactory; 9 | 10 | public class DictionaryProperties { 11 | private Logger logger = LoggerFactory.getLogger(DictionaryProperties.class); 12 | 13 | private static DictionaryProperties instance = new DictionaryProperties(); 14 | 15 | private Properties defaultProp = new Properties(); 16 | private Properties customProp = new Properties(); 17 | 18 | private String resourceName = "dictionary.properties"; 19 | private final String defaultResourceName = "com/tistory/devyongsik/analyzer/dictionary.properties"; 20 | 21 | private DictionaryProperties() { 22 | loadDefaultProperties(); 23 | loadCustomProperties(); 24 | } 25 | 26 | private void loadDefaultProperties() { 27 | if(logger.isDebugEnabled()) 28 | logger.debug("load analyzer default properties..... : " + defaultResourceName); 29 | 30 | Class clazz = DictionaryProperties.class; 31 | 32 | InputStream in = clazz.getClassLoader().getResourceAsStream(defaultResourceName); 33 | 34 | if(in == null) { 35 | logger.error(defaultResourceName + " was not found!!!"); 36 | throw new IllegalStateException(defaultResourceName + " was not found!!!"); 37 | } 38 | 39 | try { 40 | defaultProp.load(in); 41 | in.close(); 42 | } catch (IOException e) { 43 | logger.error(e.toString()); 44 | } 45 | 46 | if(logger.isInfoEnabled()) { 47 | logger.info("default dictionary.properties : " + defaultProp); 48 | } 49 | } 50 | 51 | private void loadCustomProperties() { 52 | if(logger.isDebugEnabled()) 53 | logger.debug("load analyzer custom properties..... : " + resourceName); 54 | 55 | Class clazz = DictionaryProperties.class; 56 | 57 | InputStream in = clazz.getClassLoader().getResourceAsStream(resourceName); 58 | 59 | if(in == null) { 60 | logger.warn(customProp + " was not found!!! 
skip load custom properties"); 61 | return; 62 | } 63 | 64 | try { 65 | customProp.load(in); 66 | in.close(); 67 | } catch (IOException e) { 68 | logger.error(e.toString()); 69 | } 70 | 71 | if(logger.isInfoEnabled()) { 72 | logger.info("custom dictionary.properties : " + customProp); 73 | } 74 | } 75 | 76 | public static DictionaryProperties getInstance() { 77 | return instance; 78 | } 79 | 80 | public String getProperty(String key) { 81 | //read property value from custom properties first 82 | String value = customProp.getProperty(key); 83 | 84 | if(value == null) { 85 | value = defaultProp.getProperty(key); 86 | } 87 | 88 | return value.trim(); 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/com/tistory/devyongsik/analyzer/DictionaryProperties.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.util.Properties; 6 | 7 | import org.slf4j.Logger; 8 | import org.slf4j.LoggerFactory; 9 | 10 | public class DictionaryProperties { 11 | private Logger logger = LoggerFactory.getLogger(DictionaryProperties.class); 12 | 13 | private static DictionaryProperties instance = new DictionaryProperties(); 14 | 15 | private Properties defaultProp = new Properties(); 16 | private Properties customProp = new Properties(); 17 | 18 | private String resourceName = "dictionary.properties"; 19 | private final String defaultResourceName = "com/tistory/devyongsik/analyzer/dictionary.properties"; 20 | 21 | private DictionaryProperties() { 22 | loadDefaultProperties(); 23 | loadCustomProperties(); 24 | } 25 | 26 | private void loadDefaultProperties() { 27 | if(logger.isDebugEnabled()) 28 | logger.debug("load analyzer default properties..... 
: " + defaultResourceName); 29 | 30 | Class clazz = DictionaryProperties.class; 31 | 32 | InputStream in = clazz.getClassLoader().getResourceAsStream(defaultResourceName); 33 | 34 | if(in == null) { 35 | logger.error(defaultResourceName + " was not found!!!"); 36 | throw new IllegalStateException(defaultResourceName + " was not found!!!"); 37 | } 38 | 39 | try { 40 | defaultProp.load(in); 41 | in.close(); 42 | } catch (IOException e) { 43 | logger.error(e.toString()); 44 | } 45 | 46 | if(logger.isInfoEnabled()) { 47 | logger.info("default dictionary.properties : " + defaultProp); 48 | } 49 | } 50 | 51 | private void loadCustomProperties() { 52 | if(logger.isDebugEnabled()) 53 | logger.debug("load analyzer custom properties..... : " + resourceName); 54 | 55 | Class clazz = DictionaryProperties.class; 56 | 57 | InputStream in = clazz.getClassLoader().getResourceAsStream(resourceName); 58 | 59 | if(in == null) { 60 | logger.warn(customProp + " was not found!!! skip load custom properties"); 61 | return; 62 | } 63 | 64 | try { 65 | customProp.load(in); 66 | in.close(); 67 | } catch (IOException e) { 68 | logger.error(e.toString()); 69 | } 70 | 71 | if(logger.isInfoEnabled()) { 72 | logger.info("custom dictionary.properties : " + customProp); 73 | } 74 | } 75 | 76 | public static DictionaryProperties getInstance() { 77 | return instance; 78 | } 79 | 80 | public String getProperty(String key) { 81 | //read property value from custom properties first 82 | String value = customProp.getProperty(key); 83 | 84 | if(value == null) { 85 | value = defaultProp.getProperty(key); 86 | } 87 | 88 | return value.trim(); 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/com/tistory/devyongsik/utils/NounDictionaryDuplWordRemover.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.utils; 2 | 3 | import java.io.BufferedReader; 4 | 
import java.io.BufferedWriter; 5 | import java.io.File; 6 | import java.io.FileInputStream; 7 | import java.io.FileOutputStream; 8 | import java.io.IOException; 9 | import java.io.InputStream; 10 | import java.io.InputStreamReader; 11 | import java.io.OutputStream; 12 | import java.io.OutputStreamWriter; 13 | import java.util.ArrayList; 14 | import java.util.Collections; 15 | import java.util.HashMap; 16 | import java.util.List; 17 | import java.util.Map; 18 | import java.util.Set; 19 | 20 | public class NounDictionaryDuplWordRemover { 21 | public static void main(String[] args) throws IOException { 22 | 23 | File nounFile = new File("/Users/need4spd/Programming/Java/workspace/walkingword/src/com/tistory/devyongsik/analyzer/dictionary/noun.txt"); 24 | File customNounFile = new File("/Users/need4spd/Programming/Java/workspace/walkingword/src/com/tistory/devyongsik/analyzer/dictionary/custom.txt"); 25 | 26 | 27 | InputStream nounIs = new FileInputStream(nounFile); 28 | InputStreamReader nounIsr = new InputStreamReader(nounIs); 29 | BufferedReader nounBr = new BufferedReader(nounIsr); 30 | 31 | Map nounsMap = new HashMap(); 32 | 33 | String nounTemp = ""; 34 | while((nounTemp = nounBr.readLine()) != null) { 35 | nounsMap.put(nounTemp, ""); 36 | } 37 | 38 | InputStream customIs = new FileInputStream(customNounFile); 39 | InputStreamReader customIsr = new InputStreamReader(customIs); 40 | BufferedReader customBr = new BufferedReader(customIsr); 41 | 42 | Map customMap = new HashMap(); 43 | 44 | String customTemp = ""; 45 | while((customTemp = customBr.readLine()) != null) { 46 | customMap.put(customTemp, ""); 47 | } 48 | 49 | int dupCount = 0; 50 | Set customNounsKeySet = customMap.keySet(); 51 | 52 | for(String customNoun : customNounsKeySet) { 53 | if (nounsMap.containsKey(customNoun)) { 54 | nounsMap.remove(customNoun); 55 | dupCount++; 56 | } 57 | } 58 | 59 | System.out.println("dup count : " + dupCount); 60 | 61 | customBr.close(); 62 | customIsr.close(); 63 | 
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.kr.morph;

import java.util.ArrayList;
import java.util.List;

/**
 * Holds the result of analyzing one whitespace-delimited segment:
 * the analysis output, any unregistered words found in front of it
 * (mostly person names), and the original surface string.
 *
 * @author smlee
 */
public class SpaceOutput {

	// the analysis result for this segment
	private AnalysisOutput output;

	// unregistered words preceding the result; typically person names
	private List<AnalysisOutput> nrWords = new ArrayList<AnalysisOutput>();

	// the surface string before analysis
	private String source;

	/** Resets all state so this instance can be reused for the next segment. */
	public void initialize() {
		output = null;
		nrWords = new ArrayList<AnalysisOutput>();
		source = null;
	}

	/**
	 * @return the output
	 */
	public AnalysisOutput getOutput() {
		return output;
	}

	/**
	 * @param output the output to set
	 */
	public void setOutput(AnalysisOutput output) {
		this.output = output;
	}

	/**
	 * @return the unregistered-word list
	 */
	public List<AnalysisOutput> getNRWords() {
		return nrWords;
	}

	/**
	 * @param words the unregistered-word list to set
	 */
	public void setNRWords(List<AnalysisOutput> words) {
		this.nrWords = words;
	}

	/**
	 * Registers an unregistered word with the default (correct) score.
	 *
	 * @param word surface form to register
	 */
	public void addNRWord(String word) {
		addNRWord(word, AnalysisOutput.SCORE_CORRECT);
	}

	/**
	 * Registers an unregistered word as a noun analysis and prepends it to
	 * the list.
	 *
	 * @param word  surface form to register
	 * @param score analysis confidence score
	 */
	public void addNRWord(String word, int score) {
		// local renamed from "output" to avoid shadowing the field
		AnalysisOutput nrOutput = new AnalysisOutput(word, null, null, PatternConstants.PTN_N, score);
		nrOutput.setSource(word);
		nrOutput.setPos(PatternConstants.POS_NOUN);
		nrWords.add(0, nrOutput);
	}

	/**
	 * @return the source
	 */
	public String getSource() {
		return source;
	}

	/**
	 * @param source the source to set
	 */
	public void setSource(String source) {
		this.source = source;
	}

	/**
	 * Returns the length of the analyzed surface string.
	 *
	 * @return length of {@code source}, or 0 when no source is set
	 */
	public int getLength() {
		return source == null ? 0 : source.length();
	}
}
package com.tistory.devyongsik.analyzer;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.State;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory;

/**
 * Engine that expands a token into its component nouns when the token is
 * found in the compound-noun dictionary, pushing one captured attribute
 * state per component onto the nouns stack.
 */
public class KoreanCompoundNounEngine implements Engine {

	private Logger logger = LoggerFactory.getLogger(KoreanCompoundNounEngine.class);

	// compound noun -> list of its component nouns, loaded once from the dictionary
	private Map<String, List<String>> compoundNouns = new HashMap<String, List<String>>();

	public KoreanCompoundNounEngine() {
		if (logger.isInfoEnabled()) {
			logger.info("init KoreanCompoundNounEngine");
		}

		compoundNouns = DictionaryFactory.getFactory().getCompoundDictionaryMap();
	}

	/**
	 * If the current token is a compound noun, captures one state per
	 * component noun onto {@code nounsStack}; components already present in
	 * {@code returnedTokens} are skipped to avoid duplicate emission.
	 *
	 * @param attributeSource the token stream's attribute source (mutated in place)
	 * @param nounsStack      receives one captured state per extracted component
	 * @param returnedTokens  dedup set keyed by "term_start_end"
	 */
	@Override
	public void collectNounState(AttributeSource attributeSource, Stack<State> nounsStack, Map<String, String> returnedTokens) throws Exception {

		CharTermAttribute termAttr = attributeSource.getAttribute(CharTermAttribute.class);
		TypeAttribute typeAttr = attributeSource.getAttribute(TypeAttribute.class);
		OffsetAttribute offSetAttr = attributeSource.getAttribute(OffsetAttribute.class);
		PositionIncrementAttribute positionAttr = attributeSource.getAttribute(PositionIncrementAttribute.class);

		String termString = termAttr.toString();
		// record the token itself so other engines do not emit it again
		returnedTokens.put(termString + "_" + offSetAttr.startOffset() + "_" + offSetAttr.endOffset(), "");

		// FIX: capture the token's original start offset BEFORE the loop below
		// mutates the OffsetAttribute via setOffset()
		int orgStartOffset = offSetAttr.startOffset();

		// the word exists in the compound-noun dictionary
		List<String> matchedData = compoundNouns.get(termString);
		if (matchedData != null) {
			typeAttr.setType("compounds");

			for (String noun : matchedData) {

				if (logger.isDebugEnabled()) {
					logger.debug("복합명사추출 : " + noun);
				}

				// FIX: offsets must be absolute positions in the input text,
				// not positions inside the token; the previous code used
				// termString.indexOf(noun) alone, which breaks highlighting
				// (and dedup keys) for any token not starting at offset 0.
				// Same pattern as KoreanBaseNounEngine.
				int startOffSet = orgStartOffset + termString.indexOf(noun);
				int endOffSet = startOffSet + noun.length();

				String makeKeyForCheck = noun + "_" + startOffSet + "_" + endOffSet;

				if (returnedTokens.containsKey(makeKeyForCheck)) {
					if (logger.isDebugEnabled()) {
						logger.debug("[" + makeKeyForCheck + "] 는 이미 추출된 Token입니다. Skip");
					}

					continue;

				} else {
					returnedTokens.put(makeKeyForCheck, "");
				}

				termAttr.setEmpty();
				termAttr.append(noun);

				positionAttr.setPositionIncrement(1);

				offSetAttr.setOffset(startOffSet, endOffSet);

				typeAttr.setType("compound");
				nounsStack.add(attributeSource.captureState());
			}
		}
	}
}
searcherManager = null; 28 | private Logger logger = LoggerFactory.getLogger(SynonymDictionaryIndex.class); 29 | 30 | private static SynonymDictionaryIndex indexingModule = new SynonymDictionaryIndex(); 31 | 32 | private IndexWriter indexWriter = null; 33 | 34 | private SynonymDictionaryIndex() { 35 | try { 36 | 37 | Analyzer analyzer = new SimpleAnalyzer(Version.LUCENE_36); //문서 내용을 분석 할 때 사용 될 Analyzer 38 | IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, analyzer); 39 | iwc.setOpenMode(OpenMode.CREATE); 40 | 41 | indexWriter = new IndexWriter(ramDirectory, iwc); 42 | searcherManager = new SearcherManager(indexWriter,true, new SearcherFactory()); 43 | 44 | } catch (IOException e) { 45 | e.printStackTrace(); 46 | throw new IllegalStateException(); 47 | } 48 | } 49 | 50 | public static SynonymDictionaryIndex getIndexingModule() { 51 | return indexingModule; 52 | } 53 | 54 | public SearcherManager getSearcherManager() { 55 | 56 | return searcherManager; 57 | } 58 | 59 | public synchronized void indexingDictionary(List synonyms) { 60 | 61 | try { 62 | 63 | indexWriter.deleteAll(); 64 | indexWriter.commit(); 65 | 66 | int recordCnt = 0; 67 | //동의어들을 ,로 잘라내어 색인합니다. 68 | //하나의 document에 syn이라는 이름의 필드를 여러개 추가합니다. 69 | //나중에 syn=노트북 으로 검색한다면 그때 나온 결과 Document로부터 70 | //모든 동의어 리스트를 얻을 수 있습니다. 
package com.tistory.devyongsik.analyzer;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory;

/**
 * Engine that expands a token into its component nouns when the token is
 * found in the compound-noun dictionary, adding one {@link ComparableState}
 * per component to the state list.
 */
public class KoreanCompoundNounEngine implements Engine {

	private Logger logger = LoggerFactory.getLogger(KoreanCompoundNounEngine.class);

	// compound noun -> list of its component nouns, loaded once from the dictionary
	private Map<String, List<String>> compoundNouns = new HashMap<String, List<String>>();

	public KoreanCompoundNounEngine() {
		if (logger.isInfoEnabled()) {
			logger.info("init KoreanCompoundNounEngine");
		}

		compoundNouns = DictionaryFactory.getFactory().getCompoundDictionaryMap();
	}

	/**
	 * If the current token is a compound noun, captures one state per
	 * component noun into {@code comparableStateList}; components already
	 * present in {@code returnedTokens} are skipped to avoid duplicates.
	 *
	 * @param attributeSource     the token stream's attribute source (mutated in place)
	 * @param comparableStateList receives one ComparableState per extracted component
	 * @param returnedTokens      dedup set keyed by "term_start_end"
	 */
	@Override
	public void collectNounState(AttributeSource attributeSource, List<ComparableState> comparableStateList, Map<String, String> returnedTokens) throws Exception {

		CharTermAttribute termAttr = attributeSource.getAttribute(CharTermAttribute.class);
		TypeAttribute typeAttr = attributeSource.getAttribute(TypeAttribute.class);
		OffsetAttribute offSetAttr = attributeSource.getAttribute(OffsetAttribute.class);
		PositionIncrementAttribute positionAttr = attributeSource.getAttribute(PositionIncrementAttribute.class);

		String termString = termAttr.toString();
		// record the token itself so other engines do not emit it again
		returnedTokens.put(termString + "_" + offSetAttr.startOffset() + "_" + offSetAttr.endOffset(), "");

		// FIX: capture the token's original start offset BEFORE the loop below
		// mutates the OffsetAttribute via setOffset()
		int orgStartOffset = offSetAttr.startOffset();

		// the word exists in the compound-noun dictionary
		List<String> matchedData = compoundNouns.get(termString);
		if (matchedData != null) {
			typeAttr.setType("compounds");

			for (String noun : matchedData) {

				if (logger.isDebugEnabled()) {
					logger.debug("복합명사추출 : " + noun);
				}

				// FIX: offsets must be absolute positions in the input text,
				// not positions inside the token; the previous code used
				// termString.indexOf(noun) alone, which breaks highlighting
				// (and dedup keys) for any token not starting at offset 0.
				// Same pattern as KoreanBaseNounEngine.
				int startOffSet = orgStartOffset + termString.indexOf(noun);
				int endOffSet = startOffSet + noun.length();

				String makeKeyForCheck = noun + "_" + startOffSet + "_" + endOffSet;

				if (returnedTokens.containsKey(makeKeyForCheck)) {
					if (logger.isDebugEnabled()) {
						logger.debug("[" + makeKeyForCheck + "] 는 이미 추출된 Token입니다. Skip");
					}

					continue;

				} else {
					returnedTokens.put(makeKeyForCheck, "");
				}

				termAttr.setEmpty();
				termAttr.append(noun);

				positionAttr.setPositionIncrement(1);

				offSetAttr.setOffset(startOffSet, endOffSet);

				typeAttr.setType("compound");

				ComparableState comparableState = new ComparableState();
				comparableState.setState(attributeSource.captureState());
				comparableState.setStartOffset(offSetAttr.startOffset());

				comparableStateList.add(comparableState);
			}
		}
	}
}
LoggerFactory.getLogger(SynonymDictionaryIndex.class); 28 | 29 | private static SynonymDictionaryIndex indexingModule = new SynonymDictionaryIndex(); 30 | 31 | private IndexWriter indexWriter = null; 32 | 33 | private SynonymDictionaryIndex() { 34 | try { 35 | 36 | Analyzer analyzer = new SimpleAnalyzer(Version.LUCENE_44); //문서 내용을 분석 할 때 사용 될 Analyzer 37 | IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, analyzer); 38 | iwc.setOpenMode(OpenMode.CREATE); 39 | 40 | indexWriter = new IndexWriter(ramDirectory, iwc); 41 | searcherManager = new SearcherManager(indexWriter,true, new SearcherFactory()); 42 | 43 | } catch (IOException e) { 44 | e.printStackTrace(); 45 | throw new IllegalStateException(); 46 | } 47 | } 48 | 49 | public static SynonymDictionaryIndex getIndexingModule() { 50 | return indexingModule; 51 | } 52 | 53 | public SearcherManager getSearcherManager() { 54 | 55 | return searcherManager; 56 | } 57 | 58 | public synchronized void indexingDictionary(List synonyms) { 59 | 60 | try { 61 | 62 | indexWriter.deleteAll(); 63 | indexWriter.commit(); 64 | 65 | int recordCnt = 0; 66 | //동의어들을 ,로 잘라내어 색인합니다. 67 | //하나의 document에 syn이라는 이름의 필드를 여러개 추가합니다. 68 | //나중에 syn=노트북 으로 검색한다면 그때 나온 결과 Document로부터 69 | //모든 동의어 리스트를 얻을 수 있습니다. 
70 | 71 | FieldType fieldType = new FieldType(); 72 | fieldType.setIndexed(true); 73 | fieldType.setStored(true); 74 | fieldType.setIndexOptions(IndexOptions.DOCS_ONLY); 75 | fieldType.setTokenized(false); 76 | 77 | for(String syn : synonyms) { 78 | String[] synonymWords = syn.split(","); 79 | Document doc = new Document(); 80 | for(int i = 0, size = synonymWords.length; i < size ; i++) { 81 | 82 | 83 | String fieldValue = synonymWords[i]; 84 | Field field = new Field("syn", fieldValue, fieldType); 85 | doc.add(field); 86 | 87 | recordCnt++; 88 | }//end inner for 89 | indexWriter.addDocument(doc); 90 | }//end outer for 91 | 92 | indexWriter.commit(); 93 | 94 | logger.info("동의어 색인 단어 갯수 : {}", recordCnt); 95 | 96 | } catch (Exception e) { 97 | throw new IllegalStateException(); 98 | } 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/morph/WSOutput.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.morph; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Collections; 5 | import java.util.List; 6 | 7 | public class WSOutput implements Cloneable { 8 | 9 | private int lastStart = 0; 10 | 11 | private int lastEnd = 0; 12 | 13 | private List phrases = new ArrayList(); 14 | 15 | public WSOutput() { 16 | 17 | } 18 | 19 | public WSOutput(AnalysisOutput o) { 20 | addPhrase(o); 21 | } 22 | 23 | public int getLastStart() { 24 | return lastStart; 25 | } 26 | 27 | public void setLastStart(int start) { 28 | this.lastStart = start; 29 | } 30 | 31 | public int getLastEnd() { 32 | return lastEnd; 33 | } 34 | 35 | public void setLastEnd(int end) { 36 | this.lastStart = end; 37 | } 38 | 39 | 40 | public List getPhrases() { 41 | return phrases; 42 | } 43 | 44 | public void removeLast() { 45 | 46 | if(this.phrases.size()==0) return; 47 | 48 | AnalysisOutput o = 
this.phrases.remove(this.phrases.size()-1); 49 | 50 | if(this.phrases.size()==0) { 51 | 52 | this.lastStart = 0; 53 | this.lastEnd = 0; 54 | 55 | } else { 56 | 57 | this.lastEnd -= o.getSource().length(); 58 | 59 | if(this.phrases.size()>1) { 60 | AnalysisOutput o1 = this.phrases.get(this.phrases.size()-1); 61 | this.lastStart = lastEnd-o1.getSource().length(); 62 | } else { 63 | this.lastStart = 0; 64 | } 65 | 66 | } 67 | 68 | } 69 | 70 | public void addPhrase(AnalysisOutput o) { 71 | 72 | this.lastStart = this.lastEnd; 73 | this.lastEnd += o.getSource().length(); 74 | 75 | if(o.getCNounList().size()==0) 76 | this.phrases.add(o); 77 | else 78 | addCompounds(o); 79 | 80 | } 81 | 82 | private void addCompounds(AnalysisOutput o) { 83 | 84 | List cnouns = o.getCNounList(); 85 | 86 | String source = o.getSource(); 87 | int rmstemlen = 0; 88 | 89 | // for(int i=0;i=cnouns.size()-2) break; 106 | 107 | int score = AnalysisOutput.SCORE_CORRECT; 108 | if(!cnouns.get(i).isExist()) score=AnalysisOutput.SCORE_CANDIDATE; 109 | 110 | AnalysisOutput o1 = new AnalysisOutput(noun, null, null, 111 | PatternConstants.POS_NOUN, PatternConstants.PTN_N, score); 112 | 113 | o1.setSource(noun); 114 | 115 | if(isOnechar) { 116 | o1.addCNoun(cnouns.get(i)); 117 | o1.addCNoun(cnouns.get(i+1)); 118 | } 119 | 120 | if(source.length()>noun.length()) 121 | source = source.substring(noun.length()); 122 | 123 | this.phrases.add(o1); 124 | cnouns.remove(cnouns.get(0)); 125 | i--; 126 | 127 | if(isOnechar) { 128 | cnouns.remove(cnouns.get(0)); 129 | } 130 | 131 | } 132 | 133 | o.setStem(o.getStem().substring(o.getSource().length()-source.length())); 134 | o.setSource(source); 135 | if(cnouns.size()==1) cnouns.remove(0); 136 | 137 | this.phrases.add(o); 138 | 139 | } 140 | 141 | public void setPhrases(List phrases) { 142 | this.phrases = phrases; 143 | } 144 | 145 | public WSOutput clone() throws CloneNotSupportedException { 146 | 147 | WSOutput candidate = (WSOutput)super.clone(); 148 | 149 | 
candidate.setLastStart(lastStart); 150 | 151 | candidate.setLastEnd(lastEnd); 152 | 153 | List list = new ArrayList(); 154 | list.addAll(phrases); 155 | candidate.setPhrases(list); 156 | 157 | return candidate; 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/utils/ConstraintUtil.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.utils; 2 | 3 | import java.util.HashMap; 4 | 5 | import java.util.Map; 6 | 7 | import org.apache.lucene.analysis.kr.morph.PatternConstants; 8 | 9 | /** 10 | * 결합이 가능한 조건을 처리하는 클래스 11 | * @author smlee 12 | * 13 | */ 14 | public class ConstraintUtil { 15 | 16 | private static Map hahes = new HashMap(); // "글로벌화해 ", "민족화해" 처럼 화해와 결합이 가능한 명사 17 | static { 18 | hahes.put("민족", "Y");hahes.put("동서", "Y");hahes.put("남북", "Y"); 19 | } 20 | 21 | private static Map eomiPnouns = new HashMap(); 22 | static { 23 | eomiPnouns.put("ㄴ", "Y");eomiPnouns.put("ㄹ", "Y");eomiPnouns.put("ㅁ", "Y"); 24 | } 25 | 26 | private static Map PTN_MLIST= new HashMap(); 27 | static { 28 | PTN_MLIST.put(PatternConstants.PTN_NSM, PatternConstants.PTN_NSM); 29 | PTN_MLIST.put(PatternConstants.PTN_NSMXM, PatternConstants.PTN_NSMXM); 30 | PTN_MLIST.put(PatternConstants.PTN_NJCM, PatternConstants.PTN_NJCM); 31 | PTN_MLIST.put(PatternConstants.PTN_VM, PatternConstants.PTN_VM); 32 | PTN_MLIST.put(PatternConstants.PTN_VMCM, PatternConstants.PTN_VMCM); 33 | PTN_MLIST.put(PatternConstants.PTN_VMXM, PatternConstants.PTN_VMXM); 34 | PTN_MLIST.put(PatternConstants.PTN_NVM, PatternConstants.PTN_NVM); 35 | } 36 | 37 | private static Map PTN_JLIST= new HashMap(); 38 | static { 39 | PTN_JLIST.put(PatternConstants.PTN_NJ, PatternConstants.PTN_NJ); 40 | PTN_JLIST.put(PatternConstants.PTN_NSMJ, PatternConstants.PTN_NSMJ); 41 | PTN_JLIST.put(PatternConstants.PTN_VMJ, PatternConstants.PTN_VMJ); 42 | 
PTN_JLIST.put(PatternConstants.PTN_VMXMJ, PatternConstants.PTN_VMXMJ); 43 | } 44 | 45 | private static Map WORD_GUKS= new HashMap(); 46 | static { 47 | WORD_GUKS.put("날것", "Y"); 48 | WORD_GUKS.put("들것", "Y"); 49 | WORD_GUKS.put("별것", "Y"); 50 | WORD_GUKS.put("찰것", "Y"); 51 | WORD_GUKS.put("탈것", "Y"); 52 | WORD_GUKS.put("하잘것", "Y"); 53 | } 54 | 55 | // 종성이 있는 음절과 연결될 수 없는 조사 56 | private static Map JOSA_TWO= new HashMap(); 57 | static { 58 | JOSA_TWO.put("가", "Y"); 59 | JOSA_TWO.put("는", "Y"); 60 | JOSA_TWO.put("다", "Y"); 61 | JOSA_TWO.put("나", "Y"); 62 | JOSA_TWO.put("니", "Y"); 63 | JOSA_TWO.put("고", "Y"); 64 | JOSA_TWO.put("라", "Y"); 65 | JOSA_TWO.put("와", "Y"); 66 | JOSA_TWO.put("랑", "Y"); 67 | JOSA_TWO.put("를", "Y"); 68 | JOSA_TWO.put("며", "Y"); 69 | JOSA_TWO.put("든", "Y"); 70 | JOSA_TWO.put("야", "Y"); 71 | JOSA_TWO.put("여", "Y"); 72 | } 73 | 74 | // 종성이 없는 음절과 연결될 수 없는 조사 75 | private static Map JOSA_THREE= new HashMap(); 76 | static { 77 | JOSA_THREE.put("과", "Y"); 78 | JOSA_THREE.put("은", "Y"); 79 | JOSA_THREE.put("아", "Y"); 80 | JOSA_THREE.put("으", "Y"); 81 | JOSA_THREE.put("은", "Y"); 82 | JOSA_THREE.put("을", "Y"); 83 | } 84 | 85 | public static boolean canHaheCompound(String key) { 86 | if(hahes.get(key)!=null) return true; 87 | return false; 88 | } 89 | 90 | /** 91 | * 어미가 ㄴ,ㄹ,ㅁ 으로 끝나는지 조사한다. 
 * @param eomi the ending (eomi) string to test; may be null or empty
 * @return true if the ending terminates in ㄴ, ㄹ or ㅁ
 */
public static boolean isNLM(String eomi) {

    if(eomi==null || "".equals(eomi)) return false;

    // the ending may itself be a bare jamo (ㄴ/ㄹ/ㅁ)
    if(eomiPnouns.get(eomi)!=null) return true;

    // otherwise decompose the last syllable and inspect its final consonant
    char[] chrs = MorphUtil.decompose(eomi.charAt(eomi.length()-1));
    if(chrs.length==3 && eomiPnouns.get(Character.toString(chrs[2]))!=null) return true;

    // NOTE(review): BUG? this fall-through returns true, so every non-empty
    // input passes and the two checks above are dead code; it almost certainly
    // should be `return false;` (the 4.x copy of this class has the same defect).
    return true;

}

// true if the pattern is one of the eomi (verbal-ending) phrase patterns in PTN_MLIST
public static boolean isEomiPhrase(int ptn) {

    if(PTN_MLIST.get(ptn)!=null) return true;

    return false;

}

// true if the pattern is one of the josa noun-phrase patterns in PTN_JLIST
public static boolean isJosaNounPhrase(int ptn) {

    if(PTN_JLIST.get(ptn)!=null) return true;

    return false;

}

// true only for the adverb+josa pattern
public static boolean isJosaAdvPhrase(int ptn) {

    if(PatternConstants.PTN_ADVJ==ptn) return true;

    return false;

}

// true for either of the adverbial patterns (PTN_ADVJ or PTN_AID)
public static boolean isAdvPhrase(int ptn) {

    if(PatternConstants.PTN_ADVJ==ptn || PatternConstants.PTN_AID==ptn) return true;

    return false;

}

// true if the josa cannot follow a syllable that has a final consonant (jongseong)
public static boolean isTwoJosa(String josa) {

    return (JOSA_TWO.get(josa)!=null);

}

// true if the josa cannot follow a syllable without a final consonant
public static boolean isThreeJosa(String josa) {

    return (JOSA_THREE.get(josa)!=null);

}
}
--------------------------------------------------------------------------------
/korean-analyzer-4.x/src/main/java/com/tistory/devyongsik/analyzer/KoreanBaseNounEngine.java:
--------------------------------------------------------------------------------
package com.tistory.devyongsik.analyzer;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory;

/**
 * Noun-extraction engine (4.x line): scans a token of type "word" for every
 * substring that appears in the custom noun dictionary and captures one
 * attribute state per match into the supplied list.
 */
public class KoreanBaseNounEngine implements Engine {

    private Logger logger = LoggerFactory.getLogger(KoreanBaseNounEngine.class);

    // custom noun dictionary loaded from DictionaryFactory; only the keys are consulted
    private Map customNounsDic = new HashMap();

    public KoreanBaseNounEngine() {
        if(logger.isInfoEnabled()) {
            logger.info("init KoreanBaseNounEngine");
        }

        customNounsDic = DictionaryFactory.getFactory().getCustomNounDictionaryMap();
    }

    /**
     * Scans the current token for dictionary nouns.
     *
     * @param attributeSource     the token's attributes; mutated in place for each match
     * @param comparableStateList receives one captured state per extracted noun
     * @param returnedTokens      de-duplication set keyed by "term_start_end"
     */
    @Override
    public void collectNounState(AttributeSource attributeSource, List comparableStateList, Map returnedTokens) throws Exception {

        CharTermAttribute termAttr = attributeSource.getAttribute(CharTermAttribute.class);
        TypeAttribute typeAttr = attributeSource.getAttribute(TypeAttribute.class);
        OffsetAttribute offSetAttr = attributeSource.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionAttr = attributeSource.getAttribute(PositionIncrementAttribute.class);

        //Stack nounsStack = new Stack();

        // only tokens tagged "word" are analyzed for nouns
        if(!typeAttr.type().equals("word")) {

            if(logger.isDebugEnabled()) {
                logger.debug("명사 분석 대상이 아닙니다.");
            }

            return;
        }

        String term = termAttr.toString();
        // check whether the whole term itself is a dictionary noun
        if(customNounsDic.containsKey(term)) {
            typeAttr.setType("base_noun");
        }

        returnedTokens.put(term+"_"+offSetAttr.startOffset()+"_"+offSetAttr.endOffset(), "");

        String comparedWord = null;
        // 1. extract every dictionary match as it is found (sliding window scan)
        int startIndex = 0;
        int endIndex = startIndex + 1;

        int orgStartOffset = offSetAttr.startOffset();

        boolean isPrevMatch = false;

        while(true) {

            // window ran off the end: restart one character further right
            if(endIndex > term.length()) {
                startIndex ++;
                endIndex = startIndex + 1;
            }

            if(startIndex >= term.length()) {
                break;
            }

            comparedWord = term.substring(startIndex, endIndex);

            // capture a state whenever a proper substring matches the dictionary
            if(customNounsDic.containsKey(comparedWord) && !term.equals(comparedWord)) {

                // offsets must be recalculated so that highlighting works correctly
                int startOffSet = orgStartOffset + startIndex;
                int endOffSet = orgStartOffset + endIndex;

                String makeKeyForCheck = comparedWord + "_" + startOffSet + "_" + endOffSet;

                if(returnedTokens.containsKey(makeKeyForCheck)) {

                    if(logger.isDebugEnabled()) {
                        logger.debug("["+makeKeyForCheck+"] 는 이미 추출된 Token입니다. Skip");
                    }

                    endIndex++;
                    isPrevMatch = true;

                    continue;

                } else {
                    returnedTokens.put(makeKeyForCheck, "");
                }

                termAttr.setEmpty();
                termAttr.append(comparedWord);

                positionAttr.setPositionIncrement(1); // extracted noun: set position increment to 1
                // set the token type to a noun
                typeAttr.setType("base_noun");

                offSetAttr.setOffset(startOffSet , endOffSet);

                // capture the modified attribute state together with its start offset
                ComparableState comparableState = new ComparableState();
                comparableState.setState(attributeSource.captureState());
                comparableState.setStartOffset(offSetAttr.startOffset());

                comparableStateList.add(comparableState);

                endIndex++;
                isPrevMatch = true;

            } else {
                // no match: right after a match, restart the window just before
                // endIndex; otherwise simply widen the window by one character
                if(isPrevMatch) {
                    startIndex = endIndex - 1;
                    endIndex = startIndex + 1;
                } else {
                    endIndex++;
                }

                isPrevMatch = false;
            }
        }

        return;
    }
}
--------------------------------------------------------------------------------
/korean-analyzer-3.x/src/main/java/com/tistory/devyongsik/analyzer/KoreanBaseNounEngine.java:
--------------------------------------------------------------------------------
package com.tistory.devyongsik.analyzer;

import java.util.HashMap;
import java.util.Map;
import java.util.Stack;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.State;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory;

/**
 * Noun-extraction engine (3.x line): same substring scan as the 4.x variant,
 * but captured states are pushed onto a Stack instead of collected in a List.
 */
public class KoreanBaseNounEngine implements Engine {

    private Logger logger = LoggerFactory.getLogger(KoreanBaseNounEngine.class);
    // flag is written via the setter but never read within this file
    // -- presumably consumed by a subclass or caller; TODO confirm
    private boolean isUseForIndexing = true;

    // custom noun dictionary loaded from DictionaryFactory; only the keys are consulted
    private Map customNounsDic = new HashMap();


    protected void setIsUseForIndexing(boolean useForIndexing) {
        this.isUseForIndexing = useForIndexing;
    }

    protected boolean isUseForIndexing() {
        return isUseForIndexing;
    }

    public KoreanBaseNounEngine() {
        if(logger.isInfoEnabled()) {
            logger.info("init KoreanBaseNounEngine");
        }

        customNounsDic = DictionaryFactory.getFactory().getCustomNounDictionaryMap();
    }

    /**
     * Scans the current token for dictionary nouns and pushes one captured
     * attribute state per match onto {@code nounsStack}.
     */
    @Override
    public void collectNounState(AttributeSource attributeSource, Stack nounsStack, Map returnedTokens) throws Exception {

        CharTermAttribute termAttr = attributeSource.getAttribute(CharTermAttribute.class);
        TypeAttribute typeAttr = attributeSource.getAttribute(TypeAttribute.class);
        OffsetAttribute offSetAttr = attributeSource.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionAttr = attributeSource.getAttribute(PositionIncrementAttribute.class);

        //Stack nounsStack = new Stack();

        // only tokens tagged "word" are analyzed for nouns
        if(!typeAttr.type().equals("word")) {

            if(logger.isDebugEnabled()) {
                logger.debug("명사 분석 대상이 아닙니다.");
            }

            return;
        }

        String term = termAttr.toString();
        // check whether the whole term itself is a dictionary noun
        if(customNounsDic.containsKey(term)) {
            typeAttr.setType("base_noun");
        }

        returnedTokens.put(term+"_"+offSetAttr.startOffset()+"_"+offSetAttr.endOffset(), "");

        String comparedWord = null;
        // 1. extract every dictionary match as it is found (sliding window scan)
        int startIndex = 0;
        int endIndex = startIndex + 1;

        int orgStartOffset = offSetAttr.startOffset();

        boolean isPrevMatch = false;

        while(true) {

            // window ran off the end: restart one character further right
            if(endIndex > term.length()) {
                startIndex ++;
                endIndex = startIndex + 1;
            }

            if(startIndex >= term.length()) {
                break;
            }

            comparedWord = term.substring(startIndex, endIndex);

            // capture a state whenever a proper substring matches the dictionary
            if(customNounsDic.containsKey(comparedWord) && !term.equals(comparedWord)) {

                // offsets must be recalculated so that highlighting works correctly
95 | int startOffSet = orgStartOffset + startIndex; 96 | int endOffSet = orgStartOffset + endIndex; 97 | 98 | String makeKeyForCheck = comparedWord + "_" + startOffSet + "_" + endOffSet; 99 | 100 | if(returnedTokens.containsKey(makeKeyForCheck)) { 101 | 102 | if(logger.isDebugEnabled()) { 103 | logger.debug("["+makeKeyForCheck+"] 는 이미 추출된 Token입니다. Skip"); 104 | } 105 | 106 | endIndex++; 107 | isPrevMatch = true; 108 | 109 | continue; 110 | 111 | } else { 112 | returnedTokens.put(makeKeyForCheck, ""); 113 | } 114 | 115 | termAttr.setEmpty(); 116 | termAttr.append(comparedWord); 117 | 118 | positionAttr.setPositionIncrement(1); //추출된 명사이기 때문에 위치정보를 1로 셋팅 119 | //타입을 noun으로 설정한다. 120 | typeAttr.setType("base_noun"); 121 | 122 | offSetAttr.setOffset(startOffSet , endOffSet); 123 | 124 | nounsStack.push(attributeSource.captureState()); //추출된 명사에 대한 AttributeSource를 Stack에 저장 125 | endIndex++; 126 | isPrevMatch = true; 127 | 128 | } else { 129 | if(isPrevMatch) { 130 | startIndex = endIndex - 1; 131 | endIndex = startIndex + 1; 132 | } else { 133 | endIndex++; 134 | } 135 | 136 | isPrevMatch = false; 137 | } 138 | } 139 | 140 | return; 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/dic/josa.dic: -------------------------------------------------------------------------------- 1 | //####### 2 | 가 3 | 같이 4 | 같이나 5 | 같이는 6 | 같이는야 7 | 같이는커녕 8 | 같이도 9 | 같이만 10 | 같인 11 | 고 12 | 과 13 | 과는 14 | 과는커녕 15 | 과도 16 | 과를 17 | 과만 18 | 과만은 19 | 과의 20 | 까지 21 | 까지가 22 | 까지나 23 | 까지나마 24 | 까지는 25 | 까지는야 26 | 까지는커녕 27 | 까지도 28 | 까지든지 29 | 까지라고 30 | 까지라고는 31 | 까지라고만은 32 | 까지라도 33 | 까지로 34 | 까지로나 35 | 까지로나마 36 | 까지로는 37 | 까지로는야 38 | 까지로는커녕 39 | 까지로도 40 | 까지로든 41 | 까지로든지 42 | 까지로라서 43 | 까지로라야 44 | 까지로만 45 | 까지로만은 46 | 까지로서 47 | 까지로써 48 | 까지를 49 | 까지만 50 | 까지만은 51 | 까지만이라도 52 | 까지야 53 | 까지야말로 54 | 까지에 55 | 까지와 56 | 까지의 57 | 까지조차 58 | 까지조차도 59 | 까진 60 | 끼리 61 | 께 62 | 께서 63 | 
께옵서 64 | 께옵서는 65 | 께옵서는야 66 | 께옵서는커녕 67 | 께옵서도 68 | 께옵서만 69 | 께옵서만은 70 | 께옵서만이 71 | 께옵선 72 | 나 73 | 나마 74 | 는 75 | 는야 76 | 는커녕 77 | 니 78 | 다 79 | 다가 80 | 다가는 81 | 다가도 82 | 다간 83 | 대로 84 | 대로가 85 | 대로는 86 | 대로의 87 | 더러 88 | 더러는 89 | 더러만은 90 | 도 91 | 든 92 | 든지 93 | 라 94 | 라고 95 | 라고까지 96 | 라고까지는 97 | 라고는 98 | 라고만은 99 | 라곤 100 | 라는 101 | 라도 102 | 라든지 103 | 라서 104 | 라야 105 | 라야만 106 | 라오 107 | 라지 108 | 라지요 109 | 랑 110 | 랑은 111 | 로 112 | 로고 113 | 로구나 114 | 로구려 115 | 로구먼 116 | 로군 117 | 로군요 118 | 로는 119 | 로다 120 | 로되 121 | 로서 122 | 로서의 123 | 로서는 124 | 로세 125 | 를 126 | 마다 127 | 마다라도 128 | 마다를 129 | 마다에게 130 | 마다의 131 | 마따나 132 | 마저 133 | 마저나마라도 134 | 마저도 135 | 마저라도 136 | 마저야 137 | 만 138 | 만도 139 | 만에 140 | 만으로 141 | 만으로는 142 | 만으로도 143 | 만으로라도 144 | 만으로써 145 | 만으론 146 | 만은 147 | 만을 148 | 만의 149 | 만이 150 | 만이라도 151 | 만치 152 | 만큼 153 | 만큼도 154 | 만큼만 155 | 만큼씩 156 | 만큼은 157 | 만큼의 158 | 만큼이나 159 | 만큼이라도 160 | 만큼이야 161 | 말고 162 | 말고는 163 | 말고도 164 | 며 165 | 밖에 166 | 밖에는 167 | 밖에도 168 | 밖엔 169 | 보고 170 | 보고는 171 | 보고도 172 | 보고만 173 | 보고만은 174 | 보고만이라도 175 | 보곤 176 | 보다 177 | 보다는 178 | 보다는야 179 | 보다도 180 | 보다만 181 | 보다야 182 | 보단 183 | 부터 184 | 부터가 185 | 부터나마 186 | 부터는 187 | 부터도 188 | 부터라도 189 | 부터를 190 | 부터만 191 | 부터만은 192 | 부터서는 193 | 부터야말로 194 | 부터의 195 | 부턴 196 | 아 197 | 야 198 | 야말로 199 | 에 200 | 에게 201 | 에게가 202 | 에게까지 203 | 에게까지는 204 | 에게까지는커녕 205 | 에게까지도 206 | 에게까지만 207 | 에게까지만은 208 | 에게나 209 | 에게는 210 | 에게는커녕 211 | 에게다 212 | 에게도 213 | 에게든 214 | 에게든지 215 | 에게라도 216 | 에게로 217 | 에게로는 218 | 에게마다 219 | 에게만 220 | 에게며 221 | 에게보다 222 | 에게보다는 223 | 에게부터 224 | 에게서 225 | 에게서가 226 | 에게서까지 227 | 에게서나 228 | 에게서는 229 | 에게서도 230 | 에게서든지 231 | 에게서라도 232 | 에게서만 233 | 에게서보다 234 | 에게서부터 235 | 에게서야 236 | 에게서와 237 | 에게서의 238 | 에게서처럼 239 | 에게선 240 | 에게야 241 | 에게와 242 | 에게의 243 | 에게처럼 244 | 에게하고 245 | 에게하며 246 | 에겐 247 | 에까지 248 | 에까지는 249 | 에까지도 250 | 에까지든지 251 | 에까지라도 252 | 에까지만 253 | 에까지만은 254 | 에까진 255 | 에나 256 | 에는 257 | 에다 258 | 에다가 259 | 에다가는 260 | 에다간 261 | 에도 262 | 에든 263 | 에든지 264 | 에라도 
265 | 에로 266 | 에로의 267 | 에를 268 | 에만 269 | 에만은 270 | 에부터 271 | 에서 272 | 에서가 273 | 에서까지 274 | 에서까지도 275 | 에서나 276 | 에서나마 277 | 에서는 278 | 에서도 279 | 에서든지 280 | 에서라도 281 | 에서만 282 | 에서만도 283 | 에서만이 284 | 에서만큼 285 | 에서만큼은 286 | 에서보다 287 | 에서부터 288 | 에서부터는 289 | 에서부터도 290 | 에서부터라도 291 | 에서부터만 292 | 에서부터만은 293 | 에서야 294 | 에서와 295 | 에서와는 296 | 에서와의 297 | 에서의 298 | 에서조차 299 | 에서처럼 300 | 에선 301 | 에야 302 | 에의 303 | 에조차도 304 | 에하며 305 | 엔 306 | 엔들 307 | 엘 308 | 엘랑 309 | 여 310 | 와 311 | 와는 312 | 와도 313 | 와라도 314 | 와를 315 | 와만 316 | 와만은 317 | 와에만 318 | 와의 319 | 와처럼 320 | 와한테 321 | 요 322 | 으로 323 | 으로가 324 | 으로까지 325 | 으로까지만은 326 | 으로나 327 | 으로나든지 328 | 으로는 329 | 으로도 330 | 으로든지 331 | 으로라도 332 | 으로랑 333 | 으로만 334 | 으로만은 335 | 으로부터 336 | 으로부터는 337 | 으로부터는커녕 338 | 으로부터도 339 | 으로부터만 340 | 으로부터만은 341 | 으로부터서는 342 | 으로부터서도 343 | 으로부터서만 344 | 으로부터의 345 | 으로서 346 | 으로서가 347 | 으로서나 348 | 으로서는 349 | 으로서도 350 | 으로서든지 351 | 으로서라도 352 | 으로서만 353 | 으로서만도 354 | 으로서만은 355 | 으로서야 356 | 으로서의 357 | 으로선 358 | 으로써 359 | 으로써나 360 | 으로써는 361 | 으로써라도 362 | 으로써만 363 | 으로써야 364 | 으로야 365 | 으로의 366 | 으론 367 | 은 368 | 은커녕 369 | 을 370 | 의 371 | 이 372 | 이고 373 | 이나 374 | 이나마 375 | 이니 376 | 이다 377 | 이든 378 | 이든지 379 | 이라 380 | 이라고 381 | 이라고는 382 | 이라고도 383 | 이라고만은 384 | 이라곤 385 | 이라는 386 | 이라도 387 | 이라든지 388 | 이라서 389 | 이라야 390 | 이라야만 391 | 이랑 392 | 이랑은 393 | 이며 394 | 이며에게 395 | 이며조차도 396 | 이야 397 | 이야말로 398 | 이여 399 | 인들 400 | 인즉 401 | 인즉슨 402 | 일랑 403 | 일랑은 404 | 조차 405 | 조차가 406 | 조차도 407 | 조차를 408 | 조차의 409 | 처럼 410 | 처럼과 411 | 처럼도 412 | 처럼만 413 | 처럼만은 414 | 처럼은 415 | 처럼이라도 416 | 처럼이야 417 | 치고 418 | 치고는 419 | 커녕 420 | 커녕은 421 | 커니와 422 | 토록 423 | 하고 424 | 하고가 425 | 하고는 426 | 하고는커녕 427 | 하고도 428 | 하고라도 429 | 하고마저 430 | 하고만 431 | 하고만은 432 | 하고야 433 | 하고에게 434 | 하고의 435 | 하고조차 436 | 하고조차도 437 | 하곤 -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/dic/josa.dic: 
-------------------------------------------------------------------------------- 1 | //####### 2 | 가 3 | 같이 4 | 같이나 5 | 같이는 6 | 같이는야 7 | 같이는커녕 8 | 같이도 9 | 같이만 10 | 같인 11 | 고 12 | 과 13 | 과는 14 | 과는커녕 15 | 과도 16 | 과를 17 | 과만 18 | 과만은 19 | 과의 20 | 까지 21 | 까지가 22 | 까지나 23 | 까지나마 24 | 까지는 25 | 까지는야 26 | 까지는커녕 27 | 까지도 28 | 까지든지 29 | 까지라고 30 | 까지라고는 31 | 까지라고만은 32 | 까지라도 33 | 까지로 34 | 까지로나 35 | 까지로나마 36 | 까지로는 37 | 까지로는야 38 | 까지로는커녕 39 | 까지로도 40 | 까지로든 41 | 까지로든지 42 | 까지로라서 43 | 까지로라야 44 | 까지로만 45 | 까지로만은 46 | 까지로서 47 | 까지로써 48 | 까지를 49 | 까지만 50 | 까지만은 51 | 까지만이라도 52 | 까지야 53 | 까지야말로 54 | 까지에 55 | 까지와 56 | 까지의 57 | 까지조차 58 | 까지조차도 59 | 까진 60 | 끼리 61 | 께 62 | 께서 63 | 께옵서 64 | 께옵서는 65 | 께옵서는야 66 | 께옵서는커녕 67 | 께옵서도 68 | 께옵서만 69 | 께옵서만은 70 | 께옵서만이 71 | 께옵선 72 | 나 73 | 나마 74 | 는 75 | 는야 76 | 는커녕 77 | 니 78 | 다 79 | 다가 80 | 다가는 81 | 다가도 82 | 다간 83 | 대로 84 | 대로가 85 | 대로는 86 | 대로의 87 | 더러 88 | 더러는 89 | 더러만은 90 | 도 91 | 든 92 | 든지 93 | 라 94 | 라고 95 | 라고까지 96 | 라고까지는 97 | 라고는 98 | 라고만은 99 | 라곤 100 | 라는 101 | 라도 102 | 라든지 103 | 라서 104 | 라야 105 | 라야만 106 | 라오 107 | 라지 108 | 라지요 109 | 랑 110 | 랑은 111 | 로 112 | 로고 113 | 로구나 114 | 로구려 115 | 로구먼 116 | 로군 117 | 로군요 118 | 로는 119 | 로다 120 | 로되 121 | 로서 122 | 로서의 123 | 로서는 124 | 로세 125 | 를 126 | 마다 127 | 마다라도 128 | 마다를 129 | 마다에게 130 | 마다의 131 | 마따나 132 | 마저 133 | 마저나마라도 134 | 마저도 135 | 마저라도 136 | 마저야 137 | 만 138 | 만도 139 | 만에 140 | 만으로 141 | 만으로는 142 | 만으로도 143 | 만으로라도 144 | 만으로써 145 | 만으론 146 | 만은 147 | 만을 148 | 만의 149 | 만이 150 | 만이라도 151 | 만치 152 | 만큼 153 | 만큼도 154 | 만큼만 155 | 만큼씩 156 | 만큼은 157 | 만큼의 158 | 만큼이나 159 | 만큼이라도 160 | 만큼이야 161 | 말고 162 | 말고는 163 | 말고도 164 | 며 165 | 밖에 166 | 밖에는 167 | 밖에도 168 | 밖엔 169 | 보고 170 | 보고는 171 | 보고도 172 | 보고만 173 | 보고만은 174 | 보고만이라도 175 | 보곤 176 | 보다 177 | 보다는 178 | 보다는야 179 | 보다도 180 | 보다만 181 | 보다야 182 | 보단 183 | 부터 184 | 부터가 185 | 부터나마 186 | 부터는 187 | 부터도 188 | 부터라도 189 | 부터를 190 | 부터만 191 | 부터만은 192 | 부터서는 193 | 부터야말로 194 | 부터의 195 | 부턴 196 | 아 197 | 야 198 | 야말로 199 | 에 200 | 에게 201 | 에게가 202 | 에게까지 
203 | 에게까지는 204 | 에게까지는커녕 205 | 에게까지도 206 | 에게까지만 207 | 에게까지만은 208 | 에게나 209 | 에게는 210 | 에게는커녕 211 | 에게다 212 | 에게도 213 | 에게든 214 | 에게든지 215 | 에게라도 216 | 에게로 217 | 에게로는 218 | 에게마다 219 | 에게만 220 | 에게며 221 | 에게보다 222 | 에게보다는 223 | 에게부터 224 | 에게서 225 | 에게서가 226 | 에게서까지 227 | 에게서나 228 | 에게서는 229 | 에게서도 230 | 에게서든지 231 | 에게서라도 232 | 에게서만 233 | 에게서보다 234 | 에게서부터 235 | 에게서야 236 | 에게서와 237 | 에게서의 238 | 에게서처럼 239 | 에게선 240 | 에게야 241 | 에게와 242 | 에게의 243 | 에게처럼 244 | 에게하고 245 | 에게하며 246 | 에겐 247 | 에까지 248 | 에까지는 249 | 에까지도 250 | 에까지든지 251 | 에까지라도 252 | 에까지만 253 | 에까지만은 254 | 에까진 255 | 에나 256 | 에는 257 | 에다 258 | 에다가 259 | 에다가는 260 | 에다간 261 | 에도 262 | 에든 263 | 에든지 264 | 에라도 265 | 에로 266 | 에로의 267 | 에를 268 | 에만 269 | 에만은 270 | 에부터 271 | 에서 272 | 에서가 273 | 에서까지 274 | 에서까지도 275 | 에서나 276 | 에서나마 277 | 에서는 278 | 에서도 279 | 에서든지 280 | 에서라도 281 | 에서만 282 | 에서만도 283 | 에서만이 284 | 에서만큼 285 | 에서만큼은 286 | 에서보다 287 | 에서부터 288 | 에서부터는 289 | 에서부터도 290 | 에서부터라도 291 | 에서부터만 292 | 에서부터만은 293 | 에서야 294 | 에서와 295 | 에서와는 296 | 에서와의 297 | 에서의 298 | 에서조차 299 | 에서처럼 300 | 에선 301 | 에야 302 | 에의 303 | 에조차도 304 | 에하며 305 | 엔 306 | 엔들 307 | 엘 308 | 엘랑 309 | 여 310 | 와 311 | 와는 312 | 와도 313 | 와라도 314 | 와를 315 | 와만 316 | 와만은 317 | 와에만 318 | 와의 319 | 와처럼 320 | 와한테 321 | 요 322 | 으로 323 | 으로가 324 | 으로까지 325 | 으로까지만은 326 | 으로나 327 | 으로나든지 328 | 으로는 329 | 으로도 330 | 으로든지 331 | 으로라도 332 | 으로랑 333 | 으로만 334 | 으로만은 335 | 으로부터 336 | 으로부터는 337 | 으로부터는커녕 338 | 으로부터도 339 | 으로부터만 340 | 으로부터만은 341 | 으로부터서는 342 | 으로부터서도 343 | 으로부터서만 344 | 으로부터의 345 | 으로서 346 | 으로서가 347 | 으로서나 348 | 으로서는 349 | 으로서도 350 | 으로서든지 351 | 으로서라도 352 | 으로서만 353 | 으로서만도 354 | 으로서만은 355 | 으로서야 356 | 으로서의 357 | 으로선 358 | 으로써 359 | 으로써나 360 | 으로써는 361 | 으로써라도 362 | 으로써만 363 | 으로써야 364 | 으로야 365 | 으로의 366 | 으론 367 | 은 368 | 은커녕 369 | 을 370 | 의 371 | 이 372 | 이고 373 | 이나 374 | 이나마 375 | 이니 376 | 이다 377 | 이든 378 | 이든지 379 | 이라 380 | 이라고 381 | 이라고는 382 | 이라고도 383 | 이라고만은 384 | 이라곤 385 | 이라는 386 | 이라도 387 | 이라든지 388 | 이라서 389 | 이라야 390 | 이라야만 391 | 이랑 392 | 이랑은 
393 | 이며 394 | 이며에게 395 | 이며조차도 396 | 이야 397 | 이야말로 398 | 이여 399 | 인들 400 | 인즉 401 | 인즉슨 402 | 일랑 403 | 일랑은 404 | 조차 405 | 조차가 406 | 조차도 407 | 조차를 408 | 조차의 409 | 처럼 410 | 처럼과 411 | 처럼도 412 | 처럼만 413 | 처럼만은 414 | 처럼은 415 | 처럼이라도 416 | 처럼이야 417 | 치고 418 | 치고는 419 | 커녕 420 | 커녕은 421 | 커니와 422 | 토록 423 | 하고 424 | 하고가 425 | 하고는 426 | 하고는커녕 427 | 하고도 428 | 하고라도 429 | 하고마저 430 | 하고만 431 | 하고만은 432 | 하고야 433 | 하고에게 434 | 하고의 435 | 하고조차 436 | 하고조차도 437 | 하곤 -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/com/tistory/devyongsik/analyzer/KoreanLongestNounEngine.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | import java.util.Stack; 6 | 7 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 8 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 9 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 10 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 11 | import org.apache.lucene.util.AttributeSource; 12 | import org.apache.lucene.util.AttributeSource.State; 13 | import org.slf4j.Logger; 14 | import org.slf4j.LoggerFactory; 15 | 16 | import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory; 17 | 18 | public class KoreanLongestNounEngine implements Engine { 19 | 20 | private Logger logger = LoggerFactory.getLogger(KoreanLongestNounEngine.class); 21 | 22 | private static Map customNounsDic = new HashMap(); 23 | 24 | 25 | public KoreanLongestNounEngine() { 26 | if(logger.isInfoEnabled()) { 27 | logger.info("init KoreanLongestNounEngine"); 28 | } 29 | 30 | customNounsDic = DictionaryFactory.getFactory().getCustomNounDictionaryMap(); 31 | } 32 | 33 | @Override 34 | public void collectNounState(AttributeSource attributeSource, Stack nounsStack, Map returnedTokens) throws Exception { 
        CharTermAttribute termAttr = attributeSource.getAttribute(CharTermAttribute.class);
        TypeAttribute typeAttr = attributeSource.getAttribute(TypeAttribute.class);
        OffsetAttribute offSetAttr = attributeSource.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionAttr = attributeSource.getAttribute(PositionIncrementAttribute.class);

        // only tokens tagged "word" are analyzed for nouns
        if(!typeAttr.type().equals("word")) {

            if(logger.isDebugEnabled()) {
                logger.debug("명사 분석 대상이 아닙니다.");
            }

            return;
        }

        String term = termAttr.toString();
        // check whether the whole term itself is a dictionary noun
        if(customNounsDic.containsKey(term)) {
            typeAttr.setType("long_noun");
        }

        returnedTokens.put(term+"_"+offSetAttr.startOffset()+"_"+offSetAttr.endOffset(), "");

        String comparedWord = null;

        // 1. extract the longest word that matches the dictionary
        int startIndex = 0;
        int endIndex = startIndex + 1;

        int orgStartOffSet = offSetAttr.startOffset();

        // span of the longest match found so far for the current start position
        int prevMatchedStartIndex = 0;
        int prevMatchedEndIndex = 0;

        String matchedTerm = "";

        while(true) {

            if(endIndex > term.length()) {

                if(matchedTerm.length() > 0 && !term.equals(matchedTerm)) { // endIndex reached the end and a matched keyword exists

                    // offsets must be recalculated so that highlighting works correctly
                    int startOffSet = orgStartOffSet + prevMatchedStartIndex;
                    int endOffSet = orgStartOffSet + prevMatchedEndIndex;

                    String makeKeyForCheck = matchedTerm + "_" + startOffSet + "_" + endOffSet;

                    if(returnedTokens.containsKey(makeKeyForCheck)) {

                        if(logger.isDebugEnabled()) {
                            logger.debug("["+makeKeyForCheck+"] 는 이미 추출된 Token입니다. Skip");
                        }

                        matchedTerm = "";

                        // resume scanning right after the already-emitted match
                        startIndex = prevMatchedEndIndex;
                        endIndex = startIndex + 1;

                        continue;

                    } else {
                        returnedTokens.put(makeKeyForCheck, "");
                    }

                    termAttr.setEmpty();
                    termAttr.append(matchedTerm);

                    positionAttr.setPositionIncrement(1); // extracted noun: set position increment to 1
                    // set the token type to a noun
                    typeAttr.setType("long_noun");

                    offSetAttr.setOffset(startOffSet , endOffSet);

                    nounsStack.push(attributeSource.captureState()); // push the captured state for the extracted noun

                    matchedTerm = "";

                    // resume scanning right after the emitted match
                    startIndex = prevMatchedEndIndex;
                    endIndex = startIndex + 1;
                } else {

                    // nothing matched from this start position: advance the window
                    if(startIndex == prevMatchedEndIndex) {
                        startIndex++;
                        endIndex = startIndex + 1;
                    } else {
                        startIndex = endIndex;
                        endIndex = startIndex + 1;
                    }
                }


            }

            if(startIndex >= term.length()) {
                break;
            }

            comparedWord = term.substring(startIndex, endIndex);

            // on a match, remember it as the longest-so-far candidate
            if(customNounsDic.containsKey(comparedWord)) {
                matchedTerm = comparedWord;
                prevMatchedStartIndex = startIndex;
                prevMatchedEndIndex = endIndex;
            }

            endIndex++;

        }//end while

        return;
    }

}
--------------------------------------------------------------------------------
/korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/morph/WSOutput.java:
--------------------------------------------------------------------------------
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.lucene.analysis.kr.morph; 18 | 19 | import java.util.ArrayList; 20 | import java.util.Collections; 21 | import java.util.List; 22 | 23 | public class WSOutput implements Cloneable { 24 | 25 | private int lastStart = 0; 26 | 27 | private int lastEnd = 0; 28 | 29 | private List phrases = new ArrayList(); 30 | 31 | public WSOutput() { 32 | 33 | } 34 | 35 | public WSOutput(AnalysisOutput o) { 36 | addPhrase(o); 37 | } 38 | 39 | public int getLastStart() { 40 | return lastStart; 41 | } 42 | 43 | public void setLastStart(int start) { 44 | this.lastStart = start; 45 | } 46 | 47 | public int getLastEnd() { 48 | return lastEnd; 49 | } 50 | 51 | public void setLastEnd(int end) { 52 | this.lastStart = end; 53 | } 54 | 55 | 56 | public List getPhrases() { 57 | return phrases; 58 | } 59 | 60 | public void removeLast() { 61 | 62 | if(this.phrases.size()==0) return; 63 | 64 | AnalysisOutput o = this.phrases.remove(this.phrases.size()-1); 65 | 66 | if(this.phrases.size()==0) { 67 | 68 | this.lastStart = 0; 69 | this.lastEnd = 0; 70 | 71 | } else { 72 | 73 | this.lastEnd -= o.getSource().length(); 74 | 75 | if(this.phrases.size()>1) { 76 | AnalysisOutput o1 = this.phrases.get(this.phrases.size()-1); 77 | this.lastStart = lastEnd-o1.getSource().length(); 78 | } else { 79 | this.lastStart = 0; 80 | } 81 | 82 | } 83 | 84 | } 85 | 86 | public void 
addPhrase(AnalysisOutput o) { 87 | 88 | this.lastStart = this.lastEnd; 89 | this.lastEnd += o.getSource().length(); 90 | 91 | if(o.getCNounList().size()==0) 92 | this.phrases.add(o); 93 | else 94 | addCompounds(o); 95 | 96 | } 97 | 98 | private void addCompounds(AnalysisOutput o) { 99 | 100 | List cnouns = o.getCNounList(); 101 | 102 | String source = o.getSource(); 103 | int rmstemlen = 0; 104 | 105 | // for(int i=0;i=cnouns.size()-2) break; 122 | 123 | int score = AnalysisOutput.SCORE_CORRECT; 124 | if(!cnouns.get(i).isExist()) score=AnalysisOutput.SCORE_CANDIDATE; 125 | 126 | AnalysisOutput o1 = new AnalysisOutput(noun, null, null, 127 | PatternConstants.POS_NOUN, PatternConstants.PTN_N, score); 128 | 129 | o1.setSource(noun); 130 | 131 | if(isOnechar) { 132 | o1.addCNoun(cnouns.get(i)); 133 | o1.addCNoun(cnouns.get(i+1)); 134 | } 135 | 136 | if(source.length()>noun.length()) 137 | source = source.substring(noun.length()); 138 | 139 | this.phrases.add(o1); 140 | cnouns.remove(cnouns.get(0)); 141 | i--; 142 | 143 | if(isOnechar) { 144 | cnouns.remove(cnouns.get(0)); 145 | } 146 | 147 | } 148 | 149 | o.setStem(o.getStem().substring(o.getSource().length()-source.length())); 150 | o.setSource(source); 151 | if(cnouns.size()==1) cnouns.remove(0); 152 | 153 | this.phrases.add(o); 154 | 155 | } 156 | 157 | public void setPhrases(List phrases) { 158 | this.phrases = phrases; 159 | } 160 | 161 | public WSOutput clone() throws CloneNotSupportedException { 162 | 163 | WSOutput candidate = (WSOutput)super.clone(); 164 | 165 | candidate.setLastStart(lastStart); 166 | 167 | candidate.setLastEnd(lastEnd); 168 | 169 | List list = new ArrayList(); 170 | list.addAll(phrases); 171 | candidate.setPhrases(list); 172 | 173 | return candidate; 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/com/tistory/devyongsik/analyzer/KoreanLongestNounEngine.java: 
--------------------------------------------------------------------------------
package com.tistory.devyongsik.analyzer;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory;

/**
 * Longest-match noun-extraction engine (4.x line): scans a "word" token and
 * emits, per start position, the longest substring found in the custom noun
 * dictionary, capturing one attribute state per match into the supplied list.
 */
public class KoreanLongestNounEngine implements Engine {

    private Logger logger = LoggerFactory.getLogger(KoreanLongestNounEngine.class);

    // custom noun dictionary shared by all instances; only the keys are consulted
    private static Map customNounsDic = new HashMap();


    public KoreanLongestNounEngine() {
        if(logger.isInfoEnabled()) {
            logger.info("init KoreanLongestNounEngine");
        }

        customNounsDic = DictionaryFactory.getFactory().getCustomNounDictionaryMap();
    }

    /**
     * Scans the current token for the longest dictionary nouns.
     *
     * @param attributeSource     the token's attributes; mutated in place for each match
     * @param comparableStateList receives one captured state per extracted noun
     * @param returnedTokens      de-duplication set keyed by "term_start_end"
     */
    @Override
    public void collectNounState(AttributeSource attributeSource, List comparableStateList, Map returnedTokens) throws Exception {


        CharTermAttribute termAttr = attributeSource.getAttribute(CharTermAttribute.class);
        TypeAttribute typeAttr = attributeSource.getAttribute(TypeAttribute.class);
        OffsetAttribute offSetAttr = attributeSource.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionAttr = attributeSource.getAttribute(PositionIncrementAttribute.class);

        // only tokens tagged "word" are analyzed for nouns
        if(!typeAttr.type().equals("word")) {

            if(logger.isDebugEnabled()) {
                logger.debug("명사 분석 대상이 아닙니다.");
            }

            return;
        }

        String term = termAttr.toString();
        // check whether the whole term itself is a dictionary noun
        if(customNounsDic.containsKey(term)) {
            typeAttr.setType("long_noun");
        }

        returnedTokens.put(term+"_"+offSetAttr.startOffset()+"_"+offSetAttr.endOffset(), "");

        String comparedWord = null;

        // 1. extract the longest word that matches the dictionary
        int startIndex = 0;
        int endIndex = startIndex + 1;

        int orgStartOffSet = offSetAttr.startOffset();

        // span of the longest match found so far for the current start position
        int prevMatchedStartIndex = 0;
        int prevMatchedEndIndex = 0;

        String matchedTerm = "";

        while(true) {

            if(endIndex > term.length()) {

                if(matchedTerm.length() > 0 && !term.equals(matchedTerm)) { // endIndex reached the end and a matched keyword exists

                    // offsets must be recalculated so that highlighting works correctly
                    int startOffSet = orgStartOffSet + prevMatchedStartIndex;
                    int endOffSet = orgStartOffSet + prevMatchedEndIndex;

                    String makeKeyForCheck = matchedTerm + "_" + startOffSet + "_" + endOffSet;

                    if(returnedTokens.containsKey(makeKeyForCheck)) {

                        if(logger.isDebugEnabled()) {
                            logger.debug("["+makeKeyForCheck+"] 는 이미 추출된 Token입니다. Skip");
                        }

                        matchedTerm = "";

                        // resume scanning right after the already-emitted match
                        startIndex = prevMatchedEndIndex;
                        endIndex = startIndex + 1;

                        continue;

                    } else {
                        returnedTokens.put(makeKeyForCheck, "");
                    }

                    termAttr.setEmpty();
                    termAttr.append(matchedTerm);

                    positionAttr.setPositionIncrement(1); // extracted noun: set position increment to 1
                    // set the token type to a noun
                    typeAttr.setType("long_noun");

                    offSetAttr.setOffset(startOffSet , endOffSet);

                    // capture the modified attribute state together with its start offset
                    ComparableState comparableState = new ComparableState();
                    comparableState.setState(attributeSource.captureState());
                    comparableState.setStartOffset(offSetAttr.startOffset());

                    comparableStateList.add(comparableState);

                    matchedTerm = "";

                    // resume scanning right after the emitted match
                    startIndex = prevMatchedEndIndex;
                    endIndex = startIndex + 1;
                } else {

                    // nothing matched from this start position: advance the window
                    if(startIndex == prevMatchedEndIndex) {
                        startIndex++;
                        endIndex = startIndex + 1;
                    } else {
                        startIndex = endIndex;
                        endIndex = startIndex + 1;
                    }
                }


            }

            if(startIndex >= term.length()) {
                break;
            }

            comparedWord = term.substring(startIndex, endIndex);

            // on a match, remember it as the longest-so-far candidate
            if(customNounsDic.containsKey(comparedWord)) {
                matchedTerm = comparedWord;
                prevMatchedStartIndex = startIndex;
                prevMatchedEndIndex = endIndex;
            }

            endIndex++;

        }//end while

        return;
    }

}
--------------------------------------------------------------------------------
/korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/utils/ConstraintUtil.java:
--------------------------------------------------------------------------------
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.lucene.analysis.kr.utils; 18 | 19 | import java.util.HashMap; 20 | 21 | import java.util.Map; 22 | 23 | import org.apache.lucene.analysis.kr.morph.PatternConstants; 24 | 25 | /** 26 | * 결합이 가능한 조건을 처리하는 클래스 27 | * @author smlee 28 | * 29 | */ 30 | public class ConstraintUtil { 31 | 32 | private static Map hahes = new HashMap(); // "글로벌화해 ", "민족화해" 처럼 화해와 결합이 가능한 명사 33 | static { 34 | hahes.put("민족", "Y");hahes.put("동서", "Y");hahes.put("남북", "Y"); 35 | } 36 | 37 | private static Map eomiPnouns = new HashMap(); 38 | static { 39 | eomiPnouns.put("ㄴ", "Y");eomiPnouns.put("ㄹ", "Y");eomiPnouns.put("ㅁ", "Y"); 40 | } 41 | 42 | private static Map PTN_MLIST= new HashMap(); 43 | static { 44 | PTN_MLIST.put(PatternConstants.PTN_NSM, PatternConstants.PTN_NSM); 45 | PTN_MLIST.put(PatternConstants.PTN_NSMXM, PatternConstants.PTN_NSMXM); 46 | PTN_MLIST.put(PatternConstants.PTN_NJCM, PatternConstants.PTN_NJCM); 47 | PTN_MLIST.put(PatternConstants.PTN_VM, PatternConstants.PTN_VM); 48 | PTN_MLIST.put(PatternConstants.PTN_VMCM, PatternConstants.PTN_VMCM); 49 | PTN_MLIST.put(PatternConstants.PTN_VMXM, PatternConstants.PTN_VMXM); 50 | PTN_MLIST.put(PatternConstants.PTN_NVM, PatternConstants.PTN_NVM); 51 | } 52 | 53 | private static Map PTN_JLIST= new HashMap(); 54 | static { 55 | PTN_JLIST.put(PatternConstants.PTN_NJ, PatternConstants.PTN_NJ); 56 | PTN_JLIST.put(PatternConstants.PTN_NSMJ, PatternConstants.PTN_NSMJ); 57 | PTN_JLIST.put(PatternConstants.PTN_VMJ, PatternConstants.PTN_VMJ); 58 | 
PTN_JLIST.put(PatternConstants.PTN_VMXMJ, PatternConstants.PTN_VMXMJ); 59 | } 60 | 61 | private static Map WORD_GUKS= new HashMap(); 62 | static { 63 | WORD_GUKS.put("날것", "Y"); 64 | WORD_GUKS.put("들것", "Y"); 65 | WORD_GUKS.put("별것", "Y"); 66 | WORD_GUKS.put("찰것", "Y"); 67 | WORD_GUKS.put("탈것", "Y"); 68 | WORD_GUKS.put("하잘것", "Y"); 69 | } 70 | 71 | // 종성이 있는 음절과 연결될 수 없는 조사 72 | private static Map JOSA_TWO= new HashMap(); 73 | static { 74 | JOSA_TWO.put("가", "Y"); 75 | JOSA_TWO.put("는", "Y"); 76 | JOSA_TWO.put("다", "Y"); 77 | JOSA_TWO.put("나", "Y"); 78 | JOSA_TWO.put("니", "Y"); 79 | JOSA_TWO.put("고", "Y"); 80 | JOSA_TWO.put("라", "Y"); 81 | JOSA_TWO.put("와", "Y"); 82 | JOSA_TWO.put("랑", "Y"); 83 | JOSA_TWO.put("를", "Y"); 84 | JOSA_TWO.put("며", "Y"); 85 | JOSA_TWO.put("든", "Y"); 86 | JOSA_TWO.put("야", "Y"); 87 | JOSA_TWO.put("여", "Y"); 88 | } 89 | 90 | // 종성이 없는 음절과 연결될 수 없는 조사 91 | private static Map JOSA_THREE= new HashMap(); 92 | static { 93 | JOSA_THREE.put("과", "Y"); 94 | JOSA_THREE.put("은", "Y"); 95 | JOSA_THREE.put("아", "Y"); 96 | JOSA_THREE.put("으", "Y"); 97 | JOSA_THREE.put("은", "Y"); 98 | JOSA_THREE.put("을", "Y"); 99 | } 100 | 101 | public static boolean canHaheCompound(String key) { 102 | if(hahes.get(key)!=null) return true; 103 | return false; 104 | } 105 | 106 | /** 107 | * 어미가 ㄴ,ㄹ,ㅁ 으로 끝나는지 조사한다. 
108 | * @param eomi 109 | * @return 110 | */ 111 | public static boolean isNLM(String eomi) { 112 | 113 | if(eomi==null || "".equals(eomi)) return false; 114 | 115 | if(eomiPnouns.get(eomi)!=null) return true; 116 | 117 | char[] chrs = MorphUtil.decompose(eomi.charAt(eomi.length()-1)); 118 | if(chrs.length==3 && eomiPnouns.get(Character.toString(chrs[2]))!=null) return true; 119 | 120 | return true; 121 | 122 | } 123 | 124 | public static boolean isEomiPhrase(int ptn) { 125 | 126 | if(PTN_MLIST.get(ptn)!=null) return true; 127 | 128 | return false; 129 | 130 | } 131 | 132 | public static boolean isJosaNounPhrase(int ptn) { 133 | 134 | if(PTN_JLIST.get(ptn)!=null) return true; 135 | 136 | return false; 137 | 138 | } 139 | 140 | public static boolean isJosaAdvPhrase(int ptn) { 141 | 142 | if(PatternConstants.PTN_ADVJ==ptn) return true; 143 | 144 | return false; 145 | 146 | } 147 | 148 | public static boolean isAdvPhrase(int ptn) { 149 | 150 | if(PatternConstants.PTN_ADVJ==ptn || PatternConstants.PTN_AID==ptn) return true; 151 | 152 | return false; 153 | 154 | } 155 | 156 | public static boolean isTwoJosa(String josa) { 157 | 158 | return (JOSA_TWO.get(josa)!=null); 159 | 160 | } 161 | public static boolean isThreeJosa(String josa) { 162 | 163 | return (JOSA_THREE.get(josa)!=null); 164 | 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/utils/KoreanEnv.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.utils; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 
7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import java.io.ByteArrayInputStream; 21 | import java.io.File; 22 | import java.io.FileInputStream; 23 | import java.util.Properties; 24 | 25 | import org.apache.lucene.analysis.kr.morph.MorphException; 26 | 27 | public class KoreanEnv { 28 | 29 | public static final String FILE_SYLLABLE_FEATURE = "syllable.dic"; 30 | 31 | public static final String FILE_DICTIONARY = "dictionary.dic"; 32 | 33 | public static final String FILE_JOSA = "josa.dic"; 34 | 35 | public static final String FILE_EOMI = "eomi.dic"; 36 | 37 | public static final String FILE_EXTENSION = "extension.dic"; 38 | 39 | public static final String FILE_PREFIX = "prefix.dic"; 40 | 41 | public static final String FILE_SUFFIX = "suffix.dic"; 42 | 43 | public static final String FILE_COMPOUNDS = "compounds.dic"; 44 | 45 | public static final String FILE_UNCOMPOUNDS = "uncompounds.dic"; 46 | 47 | public static final String FILE_CJ = "cj.dic"; 48 | 49 | public static final String FILE_KOREAN_PROPERTY = "org/apache/lucene/analysis/kr/korean.properties"; 50 | 51 | private Properties defaults = null; 52 | 53 | /** 54 | * The props member gets its values from the configuration in the property file. 55 | */ 56 | private Properties props = null; 57 | 58 | private static KoreanEnv instance = null; 59 | 60 | /** 61 | * The constructor loads property values from the property file. 
62 | */ 63 | private KoreanEnv() throws MorphException { 64 | try { 65 | initDefaultProperties(); 66 | props = loadProperties(defaults); 67 | } catch (MorphException e) { 68 | throw new MorphException ("Failure while initializing property values:\n"+e.getMessage()); 69 | } 70 | } 71 | 72 | public static KoreanEnv getInstance() throws MorphException { 73 | if(instance==null) 74 | instance = new KoreanEnv(); 75 | 76 | return instance; 77 | } 78 | 79 | /** 80 | * Initialize the default property values. 81 | */ 82 | private void initDefaultProperties() { 83 | defaults = new Properties(); 84 | 85 | defaults.setProperty(FILE_SYLLABLE_FEATURE,"org/apache/lucene/analysis/kr/dic/syllable.dic"); 86 | defaults.setProperty(FILE_DICTIONARY,"org/apache/lucene/analysis/kr/dic/dictionary.dic"); 87 | defaults.setProperty(FILE_DICTIONARY,"org/apache/lucene/analysis/kr/dic/extension.dic"); 88 | defaults.setProperty(FILE_JOSA,"org/apache/lucene/analysis/kr/dic/josa.dic"); 89 | defaults.setProperty(FILE_EOMI,"org/apache/lucene/analysis/kr/dic/eomi.dic"); 90 | defaults.setProperty(FILE_PREFIX,"org/apache/lucene/analysis/kr/dic/prefix.dic"); 91 | defaults.setProperty(FILE_SUFFIX,"org/apache/lucene/analysis/kr/dic/suffix.dic"); 92 | defaults.setProperty(FILE_COMPOUNDS,"org/apache/lucene/analysis/kr/dic/compounds.dic"); 93 | defaults.setProperty(FILE_UNCOMPOUNDS,"org/apache/lucene/analysis/kr/dic/uncompounds.dic"); 94 | defaults.setProperty(FILE_CJ,"org/apache/lucene/analysis/kr/dic/cj.dic"); 95 | } 96 | 97 | 98 | /** 99 | * Given a property file name, load the property file and return an object 100 | * representing the property values. 101 | * 102 | * @param propertyFile The name of the property file to load. 103 | * @param def Default property values, or null if there are no defaults. 104 | * @return The loaded SortedProperties object. 
105 | */ 106 | private Properties loadProperties(Properties def) throws MorphException { 107 | Properties properties = new Properties(); 108 | 109 | if (def != null) { 110 | properties = new Properties(def); 111 | } 112 | 113 | File file = null; 114 | try { 115 | file = FileUtil.getClassLoaderFile(FILE_KOREAN_PROPERTY); 116 | if (file != null) { 117 | properties.load(new FileInputStream(file)); 118 | return properties; 119 | } 120 | 121 | byte[] in = FileUtil.readByteFromCurrentJar(FILE_KOREAN_PROPERTY); 122 | properties.load(new ByteArrayInputStream(in)); 123 | } catch (Exception e) { 124 | throw new MorphException("Failure while trying to load properties file " + file.getPath(), e); 125 | } 126 | return properties; 127 | } 128 | 129 | 130 | /** 131 | * Returns the value of a property. 132 | * 133 | * @param name The name of the property whose value is to be retrieved. 134 | * @return The value of the property. 135 | */ 136 | public String getValue(String name) { 137 | return props.getProperty(name); 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/utils/KoreanEnv.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.utils; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. 
You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | 21 | import java.io.ByteArrayInputStream; 22 | import java.io.File; 23 | import java.io.FileInputStream; 24 | import java.util.Properties; 25 | 26 | import org.apache.lucene.analysis.kr.morph.MorphException; 27 | 28 | public class KoreanEnv { 29 | 30 | public static final String FILE_SYLLABLE_FEATURE = "syllable.dic"; 31 | 32 | public static final String FILE_DICTIONARY = "dictionary.dic"; 33 | 34 | public static final String FILE_JOSA = "josa.dic"; 35 | 36 | public static final String FILE_EOMI = "eomi.dic"; 37 | 38 | public static final String FILE_EXTENSION = "extension.dic"; 39 | 40 | public static final String FILE_PREFIX = "prefix.dic"; 41 | 42 | public static final String FILE_SUFFIX = "suffix.dic"; 43 | 44 | public static final String FILE_COMPOUNDS = "compounds.dic"; 45 | 46 | public static final String FILE_UNCOMPOUNDS = "uncompounds.dic"; 47 | 48 | public static final String FILE_CJ = "cj.dic"; 49 | 50 | public static final String FILE_KOREAN_PROPERTY = "org/apache/lucene/analysis/kr/korean.properties"; 51 | 52 | private Properties defaults = null; 53 | 54 | /** 55 | * The props member gets its values from the configuration in the property file. 56 | */ 57 | private Properties props = null; 58 | 59 | private static KoreanEnv instance = null; 60 | 61 | /** 62 | * The constructor loads property values from the property file. 
63 | */ 64 | private KoreanEnv() throws MorphException { 65 | try { 66 | initDefaultProperties(); 67 | props = loadProperties(defaults); 68 | } catch (MorphException e) { 69 | throw new MorphException ("Failure while initializing property values:\n"+e.getMessage()); 70 | } 71 | } 72 | 73 | public static KoreanEnv getInstance() throws MorphException { 74 | if(instance==null) 75 | instance = new KoreanEnv(); 76 | 77 | return instance; 78 | } 79 | 80 | /** 81 | * Initialize the default property values. 82 | */ 83 | private void initDefaultProperties() { 84 | defaults = new Properties(); 85 | 86 | defaults.setProperty(FILE_SYLLABLE_FEATURE,"org/apache/lucene/analysis/kr/dic/syllable.dic"); 87 | defaults.setProperty(FILE_DICTIONARY,"org/apache/lucene/analysis/kr/dic/dictionary.dic"); 88 | defaults.setProperty(FILE_DICTIONARY,"org/apache/lucene/analysis/kr/dic/extension.dic"); 89 | defaults.setProperty(FILE_JOSA,"org/apache/lucene/analysis/kr/dic/josa.dic"); 90 | defaults.setProperty(FILE_EOMI,"org/apache/lucene/analysis/kr/dic/eomi.dic"); 91 | defaults.setProperty(FILE_PREFIX,"org/apache/lucene/analysis/kr/dic/prefix.dic"); 92 | defaults.setProperty(FILE_SUFFIX,"org/apache/lucene/analysis/kr/dic/suffix.dic"); 93 | defaults.setProperty(FILE_COMPOUNDS,"org/apache/lucene/analysis/kr/dic/compounds.dic"); 94 | defaults.setProperty(FILE_UNCOMPOUNDS,"org/apache/lucene/analysis/kr/dic/uncompounds.dic"); 95 | defaults.setProperty(FILE_CJ,"org/apache/lucene/analysis/kr/dic/cj.dic"); 96 | } 97 | 98 | 99 | /** 100 | * Given a property file name, load the property file and return an object 101 | * representing the property values. 102 | * 103 | * @param propertyFile The name of the property file to load. 104 | * @param def Default property values, or null if there are no defaults. 105 | * @return The loaded SortedProperties object. 
106 | */ 107 | private Properties loadProperties(Properties def) throws MorphException { 108 | Properties properties = new Properties(); 109 | 110 | if (def != null) { 111 | properties = new Properties(def); 112 | } 113 | 114 | File file = null; 115 | try { 116 | file = FileUtil.getClassLoaderFile(FILE_KOREAN_PROPERTY); 117 | if (file != null) { 118 | properties.load(new FileInputStream(file)); 119 | return properties; 120 | } 121 | 122 | byte[] in = FileUtil.readByteFromCurrentJar(FILE_KOREAN_PROPERTY); 123 | properties.load(new ByteArrayInputStream(in)); 124 | } catch (Exception e) { 125 | throw new MorphException("Failure while trying to load properties file " + file.getPath(), e); 126 | } 127 | return properties; 128 | } 129 | 130 | 131 | /** 132 | * Returns the value of a property. 133 | * 134 | * @param name The name of the property whose value is to be retrieved. 135 | * @return The value of the property. 136 | */ 137 | public String getValue(String name) { 138 | return props.getProperty(name); 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/utils/JarResources.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.utils; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. 
You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import java.io.*; 21 | import java.util.*; 22 | import java.util.zip.*; 23 | 24 | /** 25 | * JarResources: JarResources maps all resources included in a 26 | * Zip or Jar file. Additionaly, it provides a method to extract one 27 | * as a blob. 28 | */ 29 | public final class JarResources { 30 | 31 | // external debug flag 32 | public boolean debugOn=false; 33 | 34 | // jar resource mapping tables 35 | private Hashtable htSizes=new Hashtable(); 36 | 37 | // a jar file 38 | private String jarFileName; 39 | 40 | /** 41 | * creates a JarResources. It extracts all resources from a Jar 42 | * into an internal hashtable, keyed by resource names. 43 | * @param jarFileName a jar or zip file 44 | */ 45 | public JarResources(String jarFileName) { 46 | this.jarFileName=jarFileName; 47 | } 48 | 49 | /** 50 | * Extracts a jar resource as a blob. 51 | * @param name a resource name. 52 | */ 53 | public byte[] getResource(String name) { 54 | return read(name); 55 | } 56 | 57 | /** 58 | * initializes internal hash tables with Jar file resources. 59 | */ 60 | private byte[] read(String name) { 61 | try { 62 | // extracts just sizes only. 63 | ZipFile zf=new ZipFile(jarFileName); 64 | Enumeration e=zf.entries(); 65 | while (e.hasMoreElements()) { 66 | ZipEntry ze=(ZipEntry)e.nextElement(); 67 | if (debugOn) { 68 | System.out.println(dumpZipEntry(ze)); 69 | } 70 | htSizes.put(ze.getName(),new Integer((int)ze.getSize())); 71 | } 72 | zf.close(); 73 | 74 | // extract resources and put them into the hashtable. 
75 | FileInputStream fis=new FileInputStream(jarFileName); 76 | BufferedInputStream bis=new BufferedInputStream(fis); 77 | ZipInputStream zis=new ZipInputStream(bis); 78 | ZipEntry ze=null; 79 | while ((ze=zis.getNextEntry())!=null) { 80 | if (ze.isDirectory()) { 81 | continue; 82 | } 83 | if (debugOn) { 84 | System.out.println( 85 | "ze.getName()="+ze.getName()+","+"getSize()="+ze.getSize() 86 | ); 87 | } 88 | int size=(int)ze.getSize(); 89 | // -1 means unknown size. 90 | if (size==-1) { 91 | size=((Integer)htSizes.get(ze.getName())).intValue(); 92 | } 93 | byte[] b=new byte[(int)size]; 94 | int rb=0; 95 | int chunk=0; 96 | while (((int)size - rb) > 0) { 97 | chunk=zis.read(b,rb,(int)size - rb); 98 | if (chunk==-1) { 99 | break; 100 | } 101 | rb+=chunk; 102 | } 103 | 104 | if (debugOn) { 105 | System.out.println( 106 | ze.getName()+" rb="+rb+ 107 | ",size="+size+ 108 | ",csize="+ze.getCompressedSize() 109 | ); 110 | } 111 | 112 | if(ze.getName().equals(name)) { 113 | return b; 114 | } 115 | } 116 | } catch (NullPointerException e) { 117 | System.out.println("done."); 118 | } catch (FileNotFoundException e) { 119 | e.printStackTrace(); 120 | } catch (IOException e) { 121 | e.printStackTrace(); 122 | } 123 | 124 | return null; 125 | } 126 | 127 | /** 128 | * Dumps a zip entry into a string. 129 | * @param ze a ZipEntry 130 | */ 131 | private String dumpZipEntry(ZipEntry ze) { 132 | StringBuffer sb=new StringBuffer(); 133 | if (ze.isDirectory()) { 134 | sb.append("d "); 135 | } else { 136 | sb.append("f "); 137 | } 138 | if (ze.getMethod()==ZipEntry.STORED) { 139 | sb.append("stored "); 140 | } else { 141 | sb.append("defalted "); 142 | } 143 | sb.append(ze.getName()); 144 | sb.append("\t"); 145 | sb.append(""+ze.getSize()); 146 | if (ze.getMethod()==ZipEntry.DEFLATED) { 147 | sb.append("/"+ze.getCompressedSize()); 148 | } 149 | return (sb.toString()); 150 | } 151 | 152 | } --------------------------------------------------------------------------------