├── .gradle └── 1.6 │ └── taskArtifacts │ ├── cache.properties.lock │ ├── cache.properties │ ├── fileHashes.bin │ ├── fileSnapshots.bin │ ├── taskArtifacts.bin │ └── outputFileStates.bin ├── settings.gradle ├── .gitignore ├── korean-analyzer-3.x ├── src │ ├── main │ │ ├── java │ │ │ ├── org │ │ │ │ └── apache │ │ │ │ │ └── lucene │ │ │ │ │ └── analysis │ │ │ │ │ └── kr │ │ │ │ │ ├── dic │ │ │ │ │ ├── cj.dic │ │ │ │ │ ├── prefix.dic │ │ │ │ │ ├── compounds.dic │ │ │ │ │ ├── occurrence.dic │ │ │ │ │ ├── suffix.dic │ │ │ │ │ └── josa.dic │ │ │ │ │ ├── AttributeWrapper.java │ │ │ │ │ ├── morph │ │ │ │ │ ├── NounProperty.java │ │ │ │ │ ├── Status.java │ │ │ │ │ ├── WSOuputComparator.java │ │ │ │ │ ├── WSCandidateComparator.java │ │ │ │ │ ├── MorphException.java │ │ │ │ │ ├── CompoundEntry.java │ │ │ │ │ ├── WSAOutput.java │ │ │ │ │ ├── MorphAnalyzerManager.java │ │ │ │ │ ├── AnalysisOutputComparator.java │ │ │ │ │ ├── SpaceOutput.java │ │ │ │ │ ├── WordEntry.java │ │ │ │ │ └── WSOutput.java │ │ │ │ │ ├── korean.properties │ │ │ │ │ ├── utils │ │ │ │ │ ├── UnhandledException.java │ │ │ │ │ ├── UnmodifiableIterator.java │ │ │ │ │ ├── HanjaUtils.java │ │ │ │ │ ├── ConstraintUtil.java │ │ │ │ │ ├── KoreanEnv.java │ │ │ │ │ └── JarResources.java │ │ │ │ │ ├── KoreanFilter.java │ │ │ │ │ ├── KoreanTokenizer.java │ │ │ │ │ ├── KoreanTokenizerImpl.java │ │ │ │ │ └── KoreanAnalyzer.java │ │ │ └── com │ │ │ │ └── tistory │ │ │ │ └── devyongsik │ │ │ │ ├── analyzer │ │ │ │ ├── dictionary │ │ │ │ │ ├── stop.txt │ │ │ │ │ ├── compounds.txt │ │ │ │ │ ├── synonym.txt │ │ │ │ │ └── DictionaryType.java │ │ │ │ ├── Engine.java │ │ │ │ ├── dictionary.properties │ │ │ │ ├── KoreanNounFilter.java │ │ │ │ ├── KoreanAnalyzer.java │ │ │ │ ├── KoreanStopFilter.java │ │ │ │ ├── DictionaryProperties.java │ │ │ │ ├── KoreanCompoundNounEngine.java │ │ │ │ ├── dictionaryindex │ │ │ │ │ └── SynonymDictionaryIndex.java │ │ │ │ ├── KoreanBaseNounEngine.java │ │ │ │ └── KoreanLongestNounEngine.java │ │ 
│ │ └── utils │ │ │ │ └── NounDictionaryDuplWordRemover.java │ │ └── resources │ │ │ └── logback.groovy │ └── test │ │ └── java │ │ └── com │ │ └── tistory │ │ └── devyongsik │ │ └── analyzer │ │ ├── DictionaryPropertiesTest.java │ │ ├── dictionary │ │ └── DictionaryFactoryTest.java │ │ ├── util │ │ ├── TestToken.java │ │ └── AnalyzerTestUtil.java │ │ ├── AnalyzerTest.java │ │ ├── KoreanStopFilterTest.java │ │ ├── KoreanMorphEngineTest.java │ │ ├── KoreanSynonymEngineTest.java │ │ ├── KoreanCompoundNounEngineTest.java │ │ └── KoreanCharacterTokenizerTest.java ├── deploy_to_local_repo.sh ├── README ├── NOTICE.txt └── build.gradle ├── korean-analyzer-4.x ├── src │ ├── main │ │ ├── java │ │ │ ├── org │ │ │ │ └── apache │ │ │ │ │ └── lucene │ │ │ │ │ └── analysis │ │ │ │ │ └── kr │ │ │ │ │ ├── dic │ │ │ │ │ ├── cj.dic │ │ │ │ │ ├── prefix.dic │ │ │ │ │ ├── compounds.dic │ │ │ │ │ ├── occurrence.dic │ │ │ │ │ ├── suffix.dic │ │ │ │ │ └── josa.dic │ │ │ │ │ ├── AttributeWrapper.java │ │ │ │ │ ├── korean.properties │ │ │ │ │ ├── KoreanFilter.java │ │ │ │ │ ├── KoreanTokenizer.java │ │ │ │ │ ├── utils │ │ │ │ │ ├── UnmodifiableIterator.java │ │ │ │ │ ├── UnhandledException.java │ │ │ │ │ ├── HanjaUtils.java │ │ │ │ │ ├── ConstraintUtil.java │ │ │ │ │ └── KoreanEnv.java │ │ │ │ │ ├── morph │ │ │ │ │ ├── NounProperty.java │ │ │ │ │ ├── MorphException.java │ │ │ │ │ ├── MorphAnalyzerManager.java │ │ │ │ │ ├── Status.java │ │ │ │ │ ├── WSOuputComparator.java │ │ │ │ │ ├── AnalysisOutputComparator.java │ │ │ │ │ ├── WSCandidateComparator.java │ │ │ │ │ ├── CompoundEntry.java │ │ │ │ │ ├── WSAOutput.java │ │ │ │ │ ├── WordEntry.java │ │ │ │ │ ├── SpaceOutput.java │ │ │ │ │ └── WSOutput.java │ │ │ │ │ ├── KoreanTokenizerImpl.java │ │ │ │ │ └── KoreanAnalyzer.java │ │ │ └── com │ │ │ │ └── tistory │ │ │ │ └── devyongsik │ │ │ │ └── analyzer │ │ │ │ ├── dictionary │ │ │ │ ├── stop.txt │ │ │ │ ├── compounds.txt │ │ │ │ ├── synonym.txt │ │ │ │ └── DictionaryType.java │ │ │ │ ├── 
dictionary.properties │ │ │ │ ├── Engine.java │ │ │ │ ├── ComparableState.java │ │ │ │ ├── KoreanNounFilter.java │ │ │ │ ├── KoreanAnalyzer.java │ │ │ │ ├── KoreanStopFilter.java │ │ │ │ ├── DictionaryProperties.java │ │ │ │ ├── KoreanCompoundNounEngine.java │ │ │ │ ├── dictionaryindex │ │ │ │ └── SynonymDictionaryIndex.java │ │ │ │ ├── KoreanBaseNounEngine.java │ │ │ │ └── KoreanLongestNounEngine.java │ │ └── resources │ │ │ └── logback.groovy │ └── test │ │ └── java │ │ └── com │ │ └── tistory │ │ └── devyongsik │ │ └── analyzer │ │ ├── DictionaryPropertiesTest.java │ │ ├── dictionary │ │ └── DictionaryFactoryTest.java │ │ ├── util │ │ ├── TestToken.java │ │ └── AnalyzerTestUtil.java │ │ ├── AnalyzerTest.java │ │ ├── KoreanStopFilterTest.java │ │ ├── KoreanMorphEngineTest.java │ │ ├── KoreanSynonymEngineTest.java │ │ ├── KoreanCompoundNounEngineTest.java │ │ └── KoreanCharacterTokenizerTest.java ├── deploy_to_local_repo.sh ├── README.md ├── README ├── NOTICE.txt └── build.gradle └── README.md /.gradle/1.6/taskArtifacts/cache.properties.lock: -------------------------------------------------------------------------------- 1 |  -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- 1 | include "korean-analyzer-3.x", "korean-analyzer-4.x" -------------------------------------------------------------------------------- /.gradle/1.6/taskArtifacts/cache.properties: -------------------------------------------------------------------------------- 1 | #Thu Sep 05 23:49:40 KST 2013 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | .settings 3 | .project 4 | .classpath 5 | bin 6 | .gradle 7 | build 8 | -------------------------------------------------------------------------------- 
/korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/dic/cj.dic: -------------------------------------------------------------------------------- 1 | ################### 2 | 金融:금융 -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/dic/cj.dic: -------------------------------------------------------------------------------- 1 | ################### 2 | 金融:금융 -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/com/tistory/devyongsik/analyzer/dictionary/stop.txt: -------------------------------------------------------------------------------- 1 | 꼭 2 | 잘 3 | nbsp 4 | the 5 | . -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/com/tistory/devyongsik/analyzer/dictionary/stop.txt: -------------------------------------------------------------------------------- 1 | 꼭 2 | 잘 3 | nbsp 4 | the 5 | . 
-------------------------------------------------------------------------------- /.gradle/1.6/taskArtifacts/fileHashes.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/need4spd/lucene-Korean-Analyzer/HEAD/.gradle/1.6/taskArtifacts/fileHashes.bin -------------------------------------------------------------------------------- /.gradle/1.6/taskArtifacts/fileSnapshots.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/need4spd/lucene-Korean-Analyzer/HEAD/.gradle/1.6/taskArtifacts/fileSnapshots.bin -------------------------------------------------------------------------------- /.gradle/1.6/taskArtifacts/taskArtifacts.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/need4spd/lucene-Korean-Analyzer/HEAD/.gradle/1.6/taskArtifacts/taskArtifacts.bin -------------------------------------------------------------------------------- /.gradle/1.6/taskArtifacts/outputFileStates.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/need4spd/lucene-Korean-Analyzer/HEAD/.gradle/1.6/taskArtifacts/outputFileStates.bin -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/dic/prefix.dic: -------------------------------------------------------------------------------- 1 | #### 2 | 최 3 | 고 4 | 남 5 | 여 6 | 비 7 | 유 8 | 무 9 | 군 10 | 각 11 | 기 -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/dic/prefix.dic: -------------------------------------------------------------------------------- 1 | #### 2 | 최 3 | 고 4 | 남 5 | 여 6 | 비 7 | 유 8 | 무 9 | 군 10 | 각 11 | 기 
-------------------------------------------------------------------------------- /korean-analyzer-3.x/deploy_to_local_repo.sh: -------------------------------------------------------------------------------- 1 | #mvn -DaltDeploymentRepository=snapshot-repo::default::file:../need4spd-maven-repo/snapshots clean deploy 2 | gradle uploadArchives 3 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/deploy_to_local_repo.sh: -------------------------------------------------------------------------------- 1 | #mvn -DaltDeploymentRepository=snapshot-repo::default::file:../need4spd-maven-repo/snapshots clean deploy 2 | gradle uploadArchives 3 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/com/tistory/devyongsik/analyzer/dictionary/compounds.txt: -------------------------------------------------------------------------------- 1 | 컴퓨터공략:컴퓨터,공략 2 | 일본어공부:일본어,공부 3 | 스프링프로그래밍공부:스프링,프로그래밍,공부 4 | 랑콤아이크림:랑콤,아이크림 5 | 월드컵조직위원회분과위:월드컵,조직,위원회,분과위 -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/com/tistory/devyongsik/analyzer/dictionary/compounds.txt: -------------------------------------------------------------------------------- 1 | 컴퓨터공략:컴퓨터,공략 2 | 일본어공부:일본어,공부 3 | 스프링프로그래밍공부:스프링,프로그래밍,공부 4 | 랑콤아이크림:랑콤,아이크림 5 | 월드컵조직위원회분과위:월드컵,조직,위원회,분과위 -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/dic/compounds.dic: -------------------------------------------------------------------------------- 1 | ################### 2 | 밤하늘:밤,하늘 3 | 경전철:경,전철 4 | 가서명:가,서명 5 | 가입국:가,입국 6 | 갓김치:갓,김치 7 | 과소비:과,소비 8 | 고투자율:고투자,투자,투자율 -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/dic/compounds.dic: 
-------------------------------------------------------------------------------- 1 | ################### 2 | 밤하늘:밤,하늘 3 | 경전철:경,전철 4 | 가서명:가,서명 5 | 가입국:가,입국 6 | 갓김치:갓,김치 7 | 과소비:과,소비 8 | 고투자율:고투자,투자,투자율 -------------------------------------------------------------------------------- /korean-analyzer-4.x/README.md: -------------------------------------------------------------------------------- 1 | lucene-Korean-Analyzer_4x 2 | ========================= 3 | 4 | lucene korean analyzer for lucene4.x 5 | 6 | 7 | more information in https://github.com/need4spd/lucene-Korean-Analyzer 8 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/AttributeWrapper.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr; 2 | 3 | 4 | @Deprecated 5 | public class AttributeWrapper { 6 | private AttributeWrapper() { 7 | 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/AttributeWrapper.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr; 2 | 3 | 4 | @Deprecated 5 | public class AttributeWrapper { 6 | private AttributeWrapper() { 7 | 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | lucene-Korean-Analyzer 2 | ====================== 3 | 4 | Lucene Analyzer For Korean 5 | 6 | 이수명님의 Analyzer(http://cafe.naver.com/korlucene)를 형태소분석 Filter로 변형하여 사용하고 있으며 7 | 직접 개발한 동의어 Filter, 복합명사 Filter등을 추가로 붙여 개발한 루씬용 한글 분석기 입니다. 8 | 9 | 루씬3.X, 루씬4.X 두가지 버전이 있으며, 3.X버전은 추가 업데이트는 없고 현재는 10 | 4.X 버전에 대해서만 업데이트를 진행하고 있습니다. 
11 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/com/tistory/devyongsik/analyzer/dictionary.properties: -------------------------------------------------------------------------------- 1 | compounds.txt = com/tistory/devyongsik/analyzer/dictionary/compounds.txt 2 | custom.txt = com/tistory/devyongsik/analyzer/dictionary/custom.txt 3 | stop.txt = com/tistory/devyongsik/analyzer/dictionary/stop.txt 4 | synonym.txt = com/tistory/devyongsik/analyzer/dictionary/synonym.txt -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/morph/NounProperty.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.morph; 2 | 3 | /** 4 | * 명사의 유형별 분류 5 | * @author user 6 | * 7 | */ 8 | public class NounProperty { 9 | 10 | // 위치, 장소 11 | public static final String NP_LOCATION = "L"; 12 | 13 | // 물리 측정량 (속도, 각도) 14 | public static final String NP_MEASURE = "M"; 15 | 16 | 17 | } 18 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/com/tistory/devyongsik/analyzer/dictionary/synonym.txt: -------------------------------------------------------------------------------- 1 | 오라클,oracle 2 | db,database,데이터베이스 3 | 노트북,노트북pc,노트북컴퓨터,노트북피씨,notebook 4 | 튜닝,tunning 5 | sql,쿼리 6 | 서버,server 7 | 이클립스,eclipse 8 | 배너,banner 9 | 우리벤처,우리벤쳐 10 | 데이타,데이터,data 11 | 모델링,modeling 12 | 평,평형 13 | 코롱,코오롱 14 | 엔유씨,nuc 15 | 아디다스,adidas 16 | 필라,fila,휠라 17 | 테팔,tefal 18 | 캐논,canon 19 | 니콘,nikon 20 | 코원,cowon 21 | 론,lone -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/com/tistory/devyongsik/analyzer/dictionary/synonym.txt: -------------------------------------------------------------------------------- 1 | 오라클,oracle 2 | db,database,데이터베이스 3 | 
노트북,노트북pc,노트북컴퓨터,노트북피씨,notebook 4 | 튜닝,tunning 5 | sql,쿼리 6 | 서버,server 7 | 이클립스,eclipse 8 | 배너,banner 9 | 우리벤처,우리벤쳐 10 | 데이타,데이터,data 11 | 모델링,modeling 12 | 평,평형 13 | 코롱,코오롱 14 | 엔유씨,nuc 15 | 아디다스,adidas 16 | 필라,fila,휠라 17 | 테팔,tefal 18 | 캐논,canon 19 | 니콘,nikon 20 | 코원,cowon 21 | 론,lone -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/test/java/com/tistory/devyongsik/analyzer/DictionaryPropertiesTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import junit.framework.Assert; 4 | 5 | import org.junit.Test; 6 | 7 | public class DictionaryPropertiesTest { 8 | 9 | @Test 10 | public void propertiesLoad() { 11 | DictionaryProperties dp = DictionaryProperties.getInstance(); 12 | Assert.assertNotNull(dp); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/com/tistory/devyongsik/analyzer/Engine.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.util.List; 4 | import java.util.Map; 5 | 6 | import org.apache.lucene.util.AttributeSource; 7 | 8 | public interface Engine { 9 | void collectNounState(AttributeSource attributeSource, List comparableStateList, Map returnedTokens) throws Exception; 10 | } 11 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/test/java/com/tistory/devyongsik/analyzer/DictionaryPropertiesTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import junit.framework.Assert; 4 | 5 | import org.junit.Test; 6 | 7 | public class DictionaryPropertiesTest { 8 | 9 | @Test 10 | public void propertiesLoad() { 11 | DictionaryProperties dp = DictionaryProperties.getInstance(); 12 | 
Assert.assertNotNull(dp); 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/com/tistory/devyongsik/analyzer/Engine.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.util.Map; 4 | import java.util.Stack; 5 | 6 | import org.apache.lucene.util.AttributeSource; 7 | import org.apache.lucene.util.AttributeSource.State; 8 | 9 | public interface Engine { 10 | void collectNounState(AttributeSource attributeSource, Stack nounsStack, Map returnedTokens) throws Exception; 11 | } 12 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/com/tistory/devyongsik/analyzer/dictionary.properties: -------------------------------------------------------------------------------- 1 | compounds.txt = com/tistory/devyongsik/analyzer/dictionary/compounds.txt 2 | custom.txt = com/tistory/devyongsik/analyzer/dictionary/custom.txt 3 | eomi_josa.txt = com/tistory/devyongsik/analyzer/dictionary/eomi_josa.txt 4 | noun.txt = com/tistory/devyongsik/analyzer/dictionary/noun.txt 5 | stop.txt = com/tistory/devyongsik/analyzer/dictionary/stop.txt 6 | synonym.txt = com/tistory/devyongsik/analyzer/dictionary/synonym.txt -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/test/java/com/tistory/devyongsik/analyzer/dictionary/DictionaryFactoryTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer.dictionary; 2 | 3 | import java.util.List; 4 | 5 | import junit.framework.Assert; 6 | 7 | import org.junit.Test; 8 | 9 | public class DictionaryFactoryTest { 10 | 11 | @Test 12 | public void loadDictionary() { 13 | DictionaryFactory factory = DictionaryFactory.getFactory(); 14 | List readWords = factory.getSynonymList(); 15 | 16 | 
Assert.assertTrue(readWords.size() > 0); 17 | 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/test/java/com/tistory/devyongsik/analyzer/dictionary/DictionaryFactoryTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer.dictionary; 2 | 3 | import java.util.List; 4 | 5 | import junit.framework.Assert; 6 | 7 | import org.junit.Test; 8 | 9 | public class DictionaryFactoryTest { 10 | 11 | @Test 12 | public void loadDictionary() { 13 | DictionaryFactory factory = DictionaryFactory.getFactory(); 14 | List readWords = factory.getSynonymList(); 15 | 16 | Assert.assertTrue(readWords.size() > 0); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/README: -------------------------------------------------------------------------------- 1 | Lucene Korean Analyzer 2 | 3 | 1. 루씬의 Analyzer를 활용한 한글 명사 추출 모듈입니다. 4.2.1 버전을 기반으로 개발 되어있습니다. 4 | 5 | 2. 키워드를 추출하는 방법은 크게 형태소분석과 사전기반의 키워드탐색으로 나누어집니다. 6 | 7 | 3. 형태소분석은 이수명님께서 개발하신 한글형태소분석 Analyzer를 4.2.1버전으로 변환하여 사용하고 있습니다. 8 | (http://cafe.naver.com/korlucene) 9 | 10 | 4. 사전기반의 키워드탐색은 명사사전 (기본사전, 사용자정의 사전)을 사용하여, 들어온 문장을 탐색하여 명사를 찾아냅니다. 11 | 12 | 5. 그외 동의어필터, 스테머필터, 불용어필터등이 사전기반으로 작동하도록 되어있습니다. 13 | 14 | 6. 형태소 분석 테스트는 아래의 페이지에서 해보실 수 있습니다. 15 | 16 | http://ec2-54-248-115-161.ap-northeast-1.compute.amazonaws.com/morphMain.devys 17 | 18 | 더 자세한 사항은 Wiki페이지를 참고하여 주세요. -------------------------------------------------------------------------------- /korean-analyzer-3.x/README: -------------------------------------------------------------------------------- 1 | Lucene Korean Analyzer 2 | 3 | 1. 루씬의 Analyzer를 활용한 한글 명사 추출 모듈입니다. 3.6.1 버전을 기반으로 개발 되어있습니다. 4 | 5 | 2. 키워드를 추출하는 방법은 크게 형태소분석과 사전기반의 키워드탐색으로 나누어집니다. 6 | 7 | 3. 형태소분석은 이수명님께서 개발하신 한글형태소분석 Analyzer를 3.6.1버전으로 변환하여 사용하고 있습니다. 
8 | (http://cafe.naver.com/korlucene) 9 | 10 | 4. 사전기반의 키워드탐색은 명사사전 (기본사전, 사용자정의 사전)을 사용하여, 들어온 문장을 탐색하여 명사를 찾아냅니다. 11 | 12 | 5. 그외 동의어필터, 스테머필터, 불용어필터등이 사전기반으로 작동하도록 되어있습니다. 13 | 14 | 6. 형태소 분석 테스트는 아래의 페이지에서 해보실 수 있습니다. 15 | 16 | http://ec2-54-248-115-161.ap-northeast-1.compute.amazonaws.com:8080/crescent/morphMain.devys 17 | 18 | 더 자세한 사항은 Wiki페이지를 참고하여 주세요. 19 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/NOTICE.txt: -------------------------------------------------------------------------------- 1 | Apache Commons logging is The Apache Software License, Version 2.0 2 | http://commons.apache.org/logging/ 3 | 4 | Junit is Common Public License - v 1.0 5 | http://www.junit.org/ 6 | 7 | Log4j is The Apache Software License, Version 2.0 8 | http://logging.apache.org/log4j/1.2/download.html 9 | 10 | Lucene is Apache License, Version 2.0 11 | http://lucene.apache.org/java/docs/index.html 12 | 13 | Twitter4j is Apache License 2.0 14 | http://twitter4j.org/en/index.html 15 | 16 | koreananalyzer is Apache License 2.0 17 | http://sourceforge.net/projects/lucenekorean/ 18 | 19 | 사전은 세종21 프로젝트에 문의 하여 사용의 허락을 득하였습니다. -------------------------------------------------------------------------------- /korean-analyzer-4.x/NOTICE.txt: -------------------------------------------------------------------------------- 1 | Apache Commons logging is The Apache Software License, Version 2.0 2 | http://commons.apache.org/logging/ 3 | 4 | Junit is Common Public License - v 1.0 5 | http://www.junit.org/ 6 | 7 | Log4j is The Apache Software License, Version 2.0 8 | http://logging.apache.org/log4j/1.2/download.html 9 | 10 | Lucene is Apache License, Version 2.0 11 | http://lucene.apache.org/java/docs/index.html 12 | 13 | Twitter4j is Apache License 2.0 14 | http://twitter4j.org/en/index.html 15 | 16 | koreananalyzer is Apache License 2.0 17 | http://sourceforge.net/projects/lucenekorean/ 18 | 19 | 사전은 세종21 프로젝트에 문의 하여 사용의 허락을 득하였습니다. 
-------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/dic/occurrence.dic: -------------------------------------------------------------------------------- 1 | //##################### 2 | F:NILL/에/0:대하^S/NILL/11:0 3 | F:NILL/에/0:관하^S/NILL/11:0 4 | F:NILL/에/0:따르^S/NILL/11:0 5 | F:NILL/기/0:위하^S/NILL/11:0 6 | F:NILL/을,를,ㄹ/0:수^W/NILL/1:0 7 | F:NILL/ㄴ,는,은/0:지^W/NILL/1:0 8 | F:NILL/NILL/0:포함^S/고/0:0 9 | F:NILL/으로/0:하는^W/NILL/0:0 10 | F:NILL/NILL/1:풀^S/NILL/1,2:0 11 | F:NILL/을,를/2:가지^S/는/11:0 12 | F:NILL/을,를/0:둔^W/NILL/11:0 13 | F:NILL/의/0:양^S/NILL/0:0 14 | F:NILL/의/0:량^S/NILL/0:0 15 | R:메^S/지를/0:못하,안하^S/NILL/0:1 16 | R:NILL/ㄴ/0:것^S/NILL/0:0 17 | R:NILL/는/0:것^S/NILL/0:0 18 | R:NILL/은/0:것^S/NILL/0:0 19 | R:NILL/은/0:것^S/NILL/0:0 -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/dic/occurrence.dic: -------------------------------------------------------------------------------- 1 | //##################### 2 | F:NILL/에/0:대하^S/NILL/11:0 3 | F:NILL/에/0:관하^S/NILL/11:0 4 | F:NILL/에/0:따르^S/NILL/11:0 5 | F:NILL/기/0:위하^S/NILL/11:0 6 | F:NILL/을,를,ㄹ/0:수^W/NILL/1:0 7 | F:NILL/ㄴ,는,은/0:지^W/NILL/1:0 8 | F:NILL/NILL/0:포함^S/고/0:0 9 | F:NILL/으로/0:하는^W/NILL/0:0 10 | F:NILL/NILL/1:풀^S/NILL/1,2:0 11 | F:NILL/을,를/2:가지^S/는/11:0 12 | F:NILL/을,를/0:둔^W/NILL/11:0 13 | F:NILL/의/0:양^S/NILL/0:0 14 | F:NILL/의/0:량^S/NILL/0:0 15 | R:메^S/지를/0:못하,안하^S/NILL/0:1 16 | R:NILL/ㄴ/0:것^S/NILL/0:0 17 | R:NILL/는/0:것^S/NILL/0:0 18 | R:NILL/은/0:것^S/NILL/0:0 19 | R:NILL/은/0:것^S/NILL/0:0 -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/korean.properties: -------------------------------------------------------------------------------- 1 | syllable.dic = org/apache/lucene/analysis/kr/dic/syllable.dic 2 | josa.dic = 
org/apache/lucene/analysis/kr/dic/josa.dic 3 | eomi.dic = org/apache/lucene/analysis/kr/dic/eomi.dic 4 | dictionary.dic = org/apache/lucene/analysis/kr/dic/total.dic 5 | extension.dic = org/apache/lucene/analysis/kr/dic/extension.dic 6 | prefix.dic = org/apache/lucene/analysis/kr/dic/prefix.dic 7 | suffix.dic = org/apache/lucene/analysis/kr/dic/suffix.dic 8 | compounds.dic = org/apache/lucene/analysis/kr/dic/compounds.dic 9 | tagger.dic = org/apache/lucene/analysis/kr/dic/occurrence.dic 10 | cj.dic = org/apache/lucene/analysis/kr/dic/cj.dic -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/korean.properties: -------------------------------------------------------------------------------- 1 | syllable.dic = org/apache/lucene/analysis/kr/dic/syllable.dic 2 | josa.dic = org/apache/lucene/analysis/kr/dic/josa.dic 3 | eomi.dic = org/apache/lucene/analysis/kr/dic/eomi.dic 4 | dictionary.dic = org/apache/lucene/analysis/kr/dic/total.dic 5 | extension.dic = org/apache/lucene/analysis/kr/dic/extension.dic 6 | prefix.dic = org/apache/lucene/analysis/kr/dic/prefix.dic 7 | suffix.dic = org/apache/lucene/analysis/kr/dic/suffix.dic 8 | compounds.dic = org/apache/lucene/analysis/kr/dic/compounds.dic 9 | tagger.dic = org/apache/lucene/analysis/kr/dic/occurrence.dic 10 | cj.dic = org/apache/lucene/analysis/kr/dic/cj.dic -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/com/tistory/devyongsik/analyzer/ComparableState.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import org.apache.lucene.util.AttributeSource.State; 4 | 5 | public class ComparableState implements Comparable { 6 | 7 | private State state; 8 | private int startOffset; 9 | 10 | public State getState() { 11 | return state; 12 | } 13 | public void setState(State state) { 
14 | this.state = state; 15 | } 16 | public int getStartOffset() { 17 | return startOffset; 18 | } 19 | public void setStartOffset(int startOffset) { 20 | this.startOffset = startOffset; 21 | } 22 | 23 | @Override 24 | public int compareTo(ComparableState comparableState) { 25 | return getStartOffset() - comparableState.getStartOffset(); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/dic/suffix.dic: -------------------------------------------------------------------------------- 1 | ##### 2 | 각 3 | 감 4 | 값 5 | 객 6 | 계 7 | 길 8 | 고 9 | 공 10 | 관 11 | 국 12 | 권 13 | 금 14 | 급 15 | 기 16 | 내 17 | 난 18 | 단 19 | 대 20 | 땅 21 | 량 22 | 록 23 | 론 24 | 력 25 | 령 26 | 료 27 | 류 28 | 률 29 | 말 30 | 망 31 | 맵 32 | 문 33 | 물 34 | 면 35 | 밤 36 | 방 37 | 법 38 | 부 39 | 분 40 | 병 41 | 비 42 | 사 43 | 생 44 | 서 45 | 세 46 | 선 47 | 성 48 | 시 49 | 식 50 | 심 51 | 실 52 | 쇼 53 | 수 54 | 속 55 | 안 56 | 어 57 | 액 58 | 염 59 | 율 60 | 원 61 | 용 62 | 음 63 | 인 64 | 일 65 | 위 66 | 자 67 | 장 68 | 족 69 | 제 70 | 증 71 | 주 72 | 중 73 | 직 74 | 진 75 | 집 76 | 적 77 | 전 78 | 점 79 | 죄 80 | 컴 81 | 폭 82 | 품 83 | 표 84 | 판 85 | 팀 86 | 차 87 | 창 88 | 책 89 | 청 90 | 철 91 | 체 92 | 층 93 | 학 94 | 항 95 | 화 96 | 형 97 | 회 -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/dic/suffix.dic: -------------------------------------------------------------------------------- 1 | ##### 2 | 각 3 | 감 4 | 값 5 | 객 6 | 계 7 | 길 8 | 고 9 | 공 10 | 관 11 | 국 12 | 권 13 | 금 14 | 급 15 | 기 16 | 내 17 | 난 18 | 단 19 | 대 20 | 땅 21 | 량 22 | 록 23 | 론 24 | 력 25 | 령 26 | 료 27 | 류 28 | 률 29 | 말 30 | 망 31 | 맵 32 | 문 33 | 물 34 | 면 35 | 밤 36 | 방 37 | 법 38 | 부 39 | 분 40 | 병 41 | 비 42 | 사 43 | 생 44 | 서 45 | 세 46 | 선 47 | 성 48 | 시 49 | 식 50 | 심 51 | 실 52 | 쇼 53 | 수 54 | 속 55 | 안 56 | 어 57 | 액 58 | 염 59 | 율 60 | 원 61 | 용 62 | 음 63 | 인 64 | 일 65 | 위 66 | 자 67 | 장 68 | 족 69 | 제 70 
| 증 71 | 주 72 | 중 73 | 직 74 | 진 75 | 집 76 | 적 77 | 전 78 | 점 79 | 죄 80 | 컴 81 | 폭 82 | 품 83 | 표 84 | 판 85 | 팀 86 | 차 87 | 창 88 | 책 89 | 청 90 | 철 91 | 체 92 | 층 93 | 학 94 | 항 95 | 화 96 | 형 97 | 회 -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/morph/Status.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.morph; 2 | 3 | public class Status { 4 | 5 | private int josaMaxStart = 0; 6 | 7 | private int eomiMaxStart = 0; 8 | 9 | private int maxStart = 0; 10 | 11 | public void apply(int num) { 12 | if(maxStart { 6 | 7 | public int compare(AnalysisOutput o1, AnalysisOutput o2) { 8 | 9 | // 길이의 역순으로 정렬한다. 10 | 11 | int score = o2.getScore() - o1.getScore(); 12 | if(score!=0) return score; 13 | 14 | int len = o2.getSource().length() - o1.getSource().length(); 15 | if(len!=0) return len; 16 | 17 | 18 | int ptn = getPtnScore(o2.getPatn()) - getPtnScore(o1.getPatn()); 19 | if(ptn!=0) return ptn; 20 | 21 | int stem = o1.getStem().length() - o2.getStem().length(); 22 | if(stem!=0) return stem; 23 | 24 | 25 | return 0; 26 | } 27 | 28 | private int getPtnScore(int ptn) { 29 | 30 | if(ptn==PatternConstants.PTN_N) ptn = 7; 31 | else if(ptn==PatternConstants.PTN_AID) return 50; 32 | 33 | return ptn; 34 | 35 | } 36 | 37 | } 38 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/KoreanFilter.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 
7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | 21 | @Deprecated 22 | public class KoreanFilter { 23 | 24 | private KoreanFilter() { 25 | 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/KoreanFilter.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | 21 | @Deprecated 22 | public class KoreanFilter { 23 | 24 | private KoreanFilter() { 25 | 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/KoreanTokenizer.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | 21 | @Deprecated 22 | public class KoreanTokenizer { 23 | private KoreanTokenizer() { 24 | 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/KoreanTokenizer.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 
7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | 21 | @Deprecated 22 | public class KoreanTokenizer { 23 | private KoreanTokenizer() { 24 | 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/morph/WSCandidateComparator.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.morph; 2 | 3 | import java.util.Comparator; 4 | import java.util.List; 5 | 6 | public class WSCandidateComparator implements Comparator { 7 | 8 | public int compare(WSOutput o1, WSOutput o2) { 9 | 10 | int end = o2.getLastEnd() - o1.getLastEnd(); 11 | if(end!=0) return end; 12 | 13 | int s1 = o1.getPhrases().size()==0 ? 999999999 : o1.getPhrases().size(); 14 | int s2 = o2.getPhrases().size()==0 ? 999999999 : o2.getPhrases().size(); 15 | 16 | int size = s1-s2; 17 | if(size!=0) return size; 18 | 19 | int score = calculateScore(o2)-calculateScore(o1); 20 | if(score!=0) return score; 21 | 22 | return 0; 23 | } 24 | 25 | private int calculateScore(WSOutput o) { 26 | 27 | List entries = o.getPhrases(); 28 | 29 | if(entries.size()==0) return 0; 30 | 31 | int sum = 0; 32 | for(int i=0;iJFlex 1.4.1 27 | * on 12. 1. 
3 오전 3:51 from the specification file 28 | * D:/eclipse-workspace/search/kr.analyzer.3x/src/org/apache/lucene/analysis/kr/KoreanTokenizerImpl.jflex 29 | */ 30 | @Deprecated 31 | class KoreanTokenizerImpl { 32 | private KoreanTokenizerImpl() { 33 | 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/KoreanTokenizerImpl.java: -------------------------------------------------------------------------------- 1 | /* The following code was generated by JFlex 1.4.1 on 12. 1. 3 오전 3:51 */ 2 | 3 | package org.apache.lucene.analysis.kr; 4 | 5 | /** 6 | * Licensed to the Apache Software Foundation (ASF) under one or more 7 | * contributor license agreements. See the NOTICE file distributed with 8 | * this work for additional information regarding copyright ownership. 9 | * The ASF licenses this file to You under the Apache License, Version 2.0 10 | * (the "License"); you may not use this file except in compliance with 11 | * the License. You may obtain a copy of the License at 12 | * 13 | * http://www.apache.org/licenses/LICENSE-2.0 14 | * 15 | * Unless required by applicable law or agreed to in writing, software 16 | * distributed under the License is distributed on an "AS IS" BASIS, 17 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | * See the License for the specific language governing permissions and 19 | * limitations under the License. 20 | */ 21 | 22 | 23 | 24 | /** 25 | * This class is a scanner generated by 26 | * JFlex 1.4.1 27 | * on 12. 1. 
3 오전 3:51 from the specification file 28 | * D:/eclipse-workspace/search/kr.analyzer.3x/src/org/apache/lucene/analysis/kr/KoreanTokenizerImpl.jflex 29 | */ 30 | @Deprecated 31 | class KoreanTokenizerImpl { 32 | private KoreanTokenizerImpl() { 33 | 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/morph/WSAOutput.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.morph; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | public class WSAOutput { 7 | 8 | private String source; 9 | 10 | private List results; 11 | 12 | private int wds = 0; 13 | 14 | private int end = 0; 15 | 16 | public WSAOutput() { 17 | results = new ArrayList(); 18 | } 19 | 20 | public WSAOutput(String src) { 21 | source = src; 22 | results = new ArrayList(); 23 | } 24 | 25 | public WSAOutput(String src, List list) { 26 | source = src; 27 | results = list; 28 | } 29 | 30 | public String getSource() { 31 | return source; 32 | } 33 | 34 | public void setSource(String source) { 35 | this.source = source; 36 | } 37 | 38 | public List getResults() { 39 | return results; 40 | } 41 | 42 | public void setResults(List results) { 43 | this.results = results; 44 | } 45 | 46 | public void addNounResults(String word) { 47 | addNounResults(word, null); 48 | } 49 | 50 | public void addNounResults(String word, String end) { 51 | addNounResults(word, end, AnalysisOutput.SCORE_ANALYSIS); 52 | } 53 | 54 | public void addNounResults(String word, String end, int score) { 55 | 56 | AnalysisOutput output = new AnalysisOutput(word, end, null, PatternConstants.PTN_NJ); 57 | if(end==null) output.setPatn(PatternConstants.PTN_N); 58 | 59 | output.setPos(PatternConstants.POS_NOUN); 60 | output.setScore(score); 61 | 62 | this.results.add(output); 63 | } 64 | 65 | } 66 | 
-------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/KoreanAnalyzer.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr; 2 | 3 | 4 | /** 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | 21 | import org.apache.lucene.analysis.LowerCaseFilter; 22 | import org.apache.lucene.analysis.StopFilter; 23 | import org.apache.lucene.analysis.standard.StandardFilter; 24 | import org.apache.lucene.analysis.standard.StandardTokenizer; 25 | 26 | /** 27 | * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link 28 | * LowerCaseFilter} and {@link StopFilter}, using a list of English stop words. 
29 | * 30 | * @version $Id: KoreanAnalyzer.java,v 1.1 2012/02/08 15:00:11 smlee0818 Exp $ 31 | * @version 2012.11.20 need4spd, Analyzer를 하나로 통일하기 위해 생성하지 못 하도록 수정함 32 | */ 33 | @Deprecated 34 | public class KoreanAnalyzer { 35 | 36 | private KoreanAnalyzer() { 37 | 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/KoreanAnalyzer.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr; 2 | 3 | 4 | /** 5 | * Licensed to the Apache Software Foundation (ASF) under one or more 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | 21 | import org.apache.lucene.analysis.core.LowerCaseFilter; 22 | import org.apache.lucene.analysis.core.StopFilter; 23 | import org.apache.lucene.analysis.standard.StandardFilter; 24 | import org.apache.lucene.analysis.standard.StandardTokenizer; 25 | 26 | /** 27 | * Filters {@link StandardTokenizer} with {@link StandardFilter}, {@link 28 | * LowerCaseFilter} and {@link StopFilter}, using a list of English stop words. 
29 | * 30 | * @version $Id: KoreanAnalyzer.java,v 1.1 2012/02/08 15:00:11 smlee0818 Exp $ 31 | * @version 2012.11.20 need4spd, Analyzer를 하나로 통일하기 위해 생성하지 못 하도록 수정함 32 | */ 33 | @Deprecated 34 | public class KoreanAnalyzer { 35 | 36 | private KoreanAnalyzer() { 37 | 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/morph/MorphAnalyzerManager.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.morph; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | import java.util.List; 21 | 22 | 23 | public class MorphAnalyzerManager { 24 | 25 | public void analyze(String strs) { 26 | MorphAnalyzer analyzer = new MorphAnalyzer(); 27 | String[] tokens = strs.split(" "); 28 | for(String token:tokens) { 29 | try { 30 | List results = analyzer.analyze(token); 31 | for(AnalysisOutput o:results) { 32 | System.out.print(o.toString()+"->"); 33 | for(int i=0;i"); 37 | } 38 | } catch (MorphException e) { 39 | e.printStackTrace(); 40 | } 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/morph/MorphAnalyzerManager.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.morph; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | 20 | import java.util.List; 21 | 22 | 23 | public class MorphAnalyzerManager { 24 | 25 | public void analyze(String strs) { 26 | MorphAnalyzer analyzer = new MorphAnalyzer(); 27 | String[] tokens = strs.split(" "); 28 | for(String token:tokens) { 29 | try { 30 | List results = analyzer.analyze(token); 31 | for(AnalysisOutput o:results) { 32 | System.out.print(o.toString()+"->"); 33 | for(int i=0;i"); 37 | } 38 | } catch (MorphException e) { 39 | e.printStackTrace(); 40 | } 41 | } 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/utils/HanjaUtils.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.utils; 2 | 3 | import java.io.IOException; 4 | 5 | import java.util.ArrayList; 6 | import java.util.HashMap; 7 | import java.util.List; 8 | import java.util.Map; 9 | 10 | import org.apache.lucene.analysis.kr.morph.MorphException; 11 | 12 | public class HanjaUtils { 13 | 14 | private static Map mapHanja; 15 | 16 | public synchronized static void loadDictionary() throws MorphException { 17 | try { 18 | List strList = FileUtil.readLines("org/apache/lucene/analysis/kr/dic/mapHanja.dic","UTF-8"); 19 | mapHanja = new HashMap(); 20 | 21 | for(int i=0;i0x9FFF||hanja<0x3400) return new char[]{hanja}; 51 | 52 | char[] result = mapHanja.get(new String(new char[]{hanja})); 53 | if(result==null) return new char[]{hanja}; 54 | 55 | return result; 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/test/java/com/tistory/devyongsik/analyzer/util/TestToken.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer.util; 2 | 3 | 4 | public class TestToken { 5 | private String term; 6 | private int startOffset; 7 | private int endOffset; 8 | public 
String getTerm() { 9 | return term; 10 | } 11 | public void setTerm(String term) { 12 | this.term = term; 13 | } 14 | public int getStartOffset() { 15 | return startOffset; 16 | } 17 | public void setStartOffset(int startOffset) { 18 | this.startOffset = startOffset; 19 | } 20 | public int getEndOffset() { 21 | return endOffset; 22 | } 23 | public void setEndOffset(int endOffset) { 24 | this.endOffset = endOffset; 25 | } 26 | @Override 27 | public String toString() { 28 | return "TestToken [term=" + term + ", startOffset=" + startOffset 29 | + ", endOffset=" + endOffset + "]"; 30 | } 31 | @Override 32 | public int hashCode() { 33 | final int prime = 31; 34 | int result = 1; 35 | result = prime * result + endOffset; 36 | result = prime * result + startOffset; 37 | result = prime * result + ((term == null) ? 0 : term.hashCode()); 38 | return result; 39 | } 40 | @Override 41 | public boolean equals(Object obj) { 42 | if (this == obj) 43 | return true; 44 | if (obj == null) 45 | return false; 46 | if (getClass() != obj.getClass()) 47 | return false; 48 | TestToken other = (TestToken) obj; 49 | if (endOffset != other.endOffset) 50 | return false; 51 | if (startOffset != other.startOffset) 52 | return false; 53 | if (term == null) { 54 | if (other.term != null) 55 | return false; 56 | } else if (!term.equals(other.term)) 57 | return false; 58 | return true; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/test/java/com/tistory/devyongsik/analyzer/util/TestToken.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer.util; 2 | 3 | 4 | public class TestToken { 5 | private String term; 6 | private int startOffset; 7 | private int endOffset; 8 | public String getTerm() { 9 | return term; 10 | } 11 | public void setTerm(String term) { 12 | this.term = term; 13 | } 14 | public int getStartOffset() { 15 | return startOffset; 16 | 
} 17 | public void setStartOffset(int startOffset) { 18 | this.startOffset = startOffset; 19 | } 20 | public int getEndOffset() { 21 | return endOffset; 22 | } 23 | public void setEndOffset(int endOffset) { 24 | this.endOffset = endOffset; 25 | } 26 | @Override 27 | public String toString() { 28 | return "TestToken [term=" + term + ", startOffset=" + startOffset 29 | + ", endOffset=" + endOffset + "]"; 30 | } 31 | @Override 32 | public int hashCode() { 33 | final int prime = 31; 34 | int result = 1; 35 | result = prime * result + endOffset; 36 | result = prime * result + startOffset; 37 | result = prime * result + ((term == null) ? 0 : term.hashCode()); 38 | return result; 39 | } 40 | @Override 41 | public boolean equals(Object obj) { 42 | if (this == obj) 43 | return true; 44 | if (obj == null) 45 | return false; 46 | if (getClass() != obj.getClass()) 47 | return false; 48 | TestToken other = (TestToken) obj; 49 | if (endOffset != other.endOffset) 50 | return false; 51 | if (startOffset != other.startOffset) 52 | return false; 53 | if (term == null) { 54 | if (other.term != null) 55 | return false; 56 | } else if (!term.equals(other.term)) 57 | return false; 58 | return true; 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/morph/Status.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. 
You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.lucene.analysis.kr.morph; 19 | 20 | public class Status { 21 | 22 | private int josaMaxStart = 0; 23 | 24 | private int eomiMaxStart = 0; 25 | 26 | private int maxStart = 0; 27 | 28 | public void apply(int num) { 29 | if(maxStart { 23 | public int compare(AnalysisOutput out1, AnalysisOutput out2) { 24 | 25 | int score = out2.getScore()-out1.getScore(); 26 | int pattern = out2.getPatn()-out1.getPatn(); 27 | int len = out1.getStem().length()-out2.getStem().length(); 28 | 29 | if(score!=0) return score; 30 | 31 | if(out2.getScore()==AnalysisOutput.SCORE_CORRECT && 32 | out1.getScore()==AnalysisOutput.SCORE_CORRECT) { 33 | pattern = out1.getPatn()==PatternConstants.PTN_N || out1.getPatn()==PatternConstants.PTN_AID ? -1 : pattern; 34 | pattern = out2.getPatn()==PatternConstants.PTN_N || out2.getPatn()==PatternConstants.PTN_AID ? 1 : pattern; 35 | } 36 | 37 | if(pattern!=0) return pattern; 38 | 39 | return len; 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/morph/WSOuputComparator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.lucene.analysis.kr.morph; 18 | 19 | import java.util.Comparator; 20 | 21 | public class WSOuputComparator implements Comparator { 22 | 23 | public int compare(AnalysisOutput o1, AnalysisOutput o2) { 24 | 25 | // 길이의 역순으로 정렬한다. 26 | 27 | int score = o2.getScore() - o1.getScore(); 28 | if(score!=0) return score; 29 | 30 | int len = o2.getSource().length() - o1.getSource().length(); 31 | if(len!=0) return len; 32 | 33 | 34 | int ptn = getPtnScore(o2.getPatn()) - getPtnScore(o1.getPatn()); 35 | if(ptn!=0) return ptn; 36 | 37 | int stem = o1.getStem().length() - o2.getStem().length(); 38 | if(stem!=0) return stem; 39 | 40 | 41 | return 0; 42 | } 43 | 44 | private int getPtnScore(int ptn) { 45 | 46 | if(ptn==PatternConstants.PTN_N) ptn = 7; 47 | else if(ptn==PatternConstants.PTN_AID) return 50; 48 | 49 | return ptn; 50 | 51 | } 52 | 53 | } 54 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/test/java/com/tistory/devyongsik/analyzer/AnalyzerTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.StringReader; 4 | import java.util.HashMap; 5 | import java.util.List; 6 | import java.util.Map; 7 | 8 | import org.apache.lucene.analysis.Analyzer; 9 | import 
org.apache.lucene.analysis.TokenStream; 10 | import org.junit.Before; 11 | import org.junit.Test; 12 | 13 | import com.google.common.collect.Lists; 14 | import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory; 15 | import com.tistory.devyongsik.analyzer.util.AnalyzerTestUtil; 16 | import com.tistory.devyongsik.analyzer.util.TestToken; 17 | 18 | public class AnalyzerTest extends AnalyzerTestUtil { 19 | private List nouns = null; 20 | private DictionaryFactory dictionaryFactory; 21 | 22 | @Before 23 | public void initDictionary() { 24 | nouns = Lists.newArrayList(); 25 | dictionaryFactory = DictionaryFactory.getFactory(); 26 | } 27 | 28 | @Test 29 | public void testCase1() throws Exception { 30 | 31 | Map customNounDictionaryMap = new HashMap(); 32 | customNounDictionaryMap.put("고속도로", null); 33 | customNounDictionaryMap.put("고속", null); 34 | customNounDictionaryMap.put("도로", null); 35 | 36 | dictionaryFactory.setCustomNounDictionaryMap(customNounDictionaryMap); 37 | 38 | StringReader reader = new StringReader("고속도로"); 39 | 40 | nouns.add(getToken("고속도로", 0, 4)); 41 | nouns.add(getToken("고속도", 0, 3)); 42 | nouns.add(getToken("고속", 0, 2)); 43 | nouns.add(getToken("속도", 1, 3)); 44 | 45 | Analyzer analyzer = new KoreanAnalyzer(true); 46 | TokenStream stream = analyzer.tokenStream("dummy", reader); 47 | stream.reset(); 48 | 49 | List extractedTokens = collectExtractedNouns(stream); 50 | 51 | analyzer.close(); 52 | 53 | verify(nouns, extractedTokens); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/test/java/com/tistory/devyongsik/analyzer/AnalyzerTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.StringReader; 4 | import java.util.HashMap; 5 | import java.util.List; 6 | import java.util.Map; 7 | 8 | import org.apache.lucene.analysis.Analyzer; 9 | import 
org.apache.lucene.analysis.TokenStream; 10 | import org.junit.Before; 11 | import org.junit.Test; 12 | 13 | import com.google.common.collect.Lists; 14 | import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory; 15 | import com.tistory.devyongsik.analyzer.util.AnalyzerTestUtil; 16 | import com.tistory.devyongsik.analyzer.util.TestToken; 17 | 18 | public class AnalyzerTest extends AnalyzerTestUtil { 19 | private List nouns = null; 20 | private DictionaryFactory dictionaryFactory; 21 | 22 | @Before 23 | public void initDictionary() { 24 | nouns = Lists.newArrayList(); 25 | dictionaryFactory = DictionaryFactory.getFactory(); 26 | } 27 | 28 | @Test 29 | public void testCase1() throws Exception { 30 | 31 | Map customNounDictionaryMap = new HashMap(); 32 | customNounDictionaryMap.put("고속도로", null); 33 | customNounDictionaryMap.put("고속", null); 34 | customNounDictionaryMap.put("도로", null); 35 | 36 | dictionaryFactory.setCustomNounDictionaryMap(customNounDictionaryMap); 37 | 38 | StringReader reader = new StringReader("고속도로"); 39 | 40 | nouns.add(getToken("고속도로", 0, 4)); 41 | nouns.add(getToken("고속도", 0, 3)); 42 | nouns.add(getToken("고속", 0, 2)); 43 | nouns.add(getToken("속도", 1, 3)); 44 | 45 | Analyzer analyzer = new KoreanAnalyzer(true); 46 | TokenStream stream = analyzer.tokenStream("dummy", reader); 47 | stream.reset(); 48 | 49 | List extractedTokens = collectExtractedNouns(stream); 50 | 51 | analyzer.close(); 52 | 53 | verify(nouns, extractedTokens); 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/com/tistory/devyongsik/analyzer/KoreanNounFilter.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.IOException; 4 | import java.util.HashMap; 5 | import java.util.List; 6 | import java.util.Map; 7 | import java.util.Stack; 8 | 9 | import org.apache.lucene.analysis.TokenFilter; 
10 | import org.apache.lucene.analysis.TokenStream; 11 | import org.slf4j.Logger; 12 | import org.slf4j.LoggerFactory; 13 | 14 | public class KoreanNounFilter extends TokenFilter { 15 | private Logger logger = LoggerFactory.getLogger(KoreanNounFilter.class); 16 | 17 | private Stack nounsStack = new Stack(); 18 | private List engines; 19 | private Map returnedTokens = new HashMap(); 20 | 21 | protected KoreanNounFilter(TokenStream input, List engines) { 22 | super(input); 23 | this.engines = engines; 24 | } 25 | 26 | @Override 27 | public final boolean incrementToken() throws IOException { 28 | 29 | 30 | if(logger.isDebugEnabled()) 31 | logger.debug("incrementToken KoreanNounFilter"); 32 | 33 | if(engines == null) { 34 | throw new IllegalStateException("KoreanNounFilter의 engines가 Null입니다."); 35 | } 36 | 37 | 38 | if (nounsStack.size() > 0) { 39 | if(logger.isDebugEnabled()) 40 | logger.debug("명사 Stack에서 토큰 리턴함"); 41 | 42 | State synState = nounsStack.pop(); 43 | restoreState(synState); 44 | 45 | return true; 46 | } 47 | 48 | if (!input.incrementToken()) 49 | return false; 50 | 51 | try { 52 | 53 | for(Engine engine : engines) { 54 | engine.collectNounState(input.cloneAttributes(), nounsStack , returnedTokens); 55 | } 56 | 57 | returnedTokens.clear(); 58 | 59 | } catch (Exception e) { 60 | logger.error("명사필터에서 목록 조회 오류"); 61 | e.printStackTrace(); 62 | } 63 | 64 | return true; 65 | } 66 | 67 | } 68 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/morph/AnalysisOutputComparator.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.morph; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 
7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import java.util.Comparator; 21 | 22 | public class AnalysisOutputComparator implements Comparator { 23 | public int compare(Object o1, Object o2) { 24 | 25 | AnalysisOutput out1 = (AnalysisOutput)o1; 26 | AnalysisOutput out2 = (AnalysisOutput)o2; 27 | 28 | int score = out2.getScore()-out1.getScore(); 29 | int pattern = out2.getPatn()-out1.getPatn(); 30 | int len = out1.getStem().length()-out2.getStem().length(); 31 | 32 | if(score!=0) return score; 33 | 34 | if(out2.getScore()==AnalysisOutput.SCORE_CORRECT && 35 | out1.getScore()==AnalysisOutput.SCORE_CORRECT) { 36 | pattern = out1.getPatn()==PatternConstants.PTN_N || out1.getPatn()==PatternConstants.PTN_AID ? -1 : pattern; 37 | pattern = out2.getPatn()==PatternConstants.PTN_N || out2.getPatn()==PatternConstants.PTN_AID ? 
1 : pattern; 38 | } 39 | 40 | if(pattern!=0) return pattern; 41 | 42 | return len; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/test/java/com/tistory/devyongsik/analyzer/KoreanStopFilterTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.IOException; 4 | import java.io.StringReader; 5 | import java.util.List; 6 | import java.util.Map; 7 | 8 | import org.apache.lucene.analysis.TokenStream; 9 | import org.junit.Before; 10 | import org.junit.Test; 11 | 12 | import com.google.common.collect.Lists; 13 | import com.google.common.collect.Maps; 14 | import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory; 15 | import com.tistory.devyongsik.analyzer.util.AnalyzerTestUtil; 16 | import com.tistory.devyongsik.analyzer.util.TestToken; 17 | 18 | public class KoreanStopFilterTest extends AnalyzerTestUtil { 19 | private List tokens = null; 20 | //불용어는 the와 . 21 | private StringReader reader = new StringReader("the 개발하고 꼭 이것을 잘 해야합니다. 
공백입니다."); 22 | private DictionaryFactory dictionaryFactory = null; 23 | 24 | @Before 25 | public void setUp() { 26 | tokens = Lists.newArrayList(); 27 | dictionaryFactory = DictionaryFactory.getFactory(); 28 | 29 | tokens.add(getToken("공백입니다", 24, 29)); 30 | tokens.add(getToken("해야합니다", 17, 22)); 31 | tokens.add(getToken("이것을", 11, 14)); 32 | tokens.add(getToken("개발하고", 4, 8)); 33 | tokens.add(getToken("꼭", 9, 10)); 34 | tokens.add(getToken("잘", 15, 16)); 35 | } 36 | 37 | 38 | @Test 39 | public void stopFilter() throws IOException { 40 | 41 | Map stopWordDictionaryMap = Maps.newHashMap(); 42 | stopWordDictionaryMap.put("the", null); 43 | stopWordDictionaryMap.put(".", null); 44 | 45 | dictionaryFactory.setStopWordDictionaryMap(stopWordDictionaryMap); 46 | 47 | TokenStream stream = new KoreanStopFilter(new KoreanCharacterTokenizer(reader)); 48 | stream.reset(); 49 | 50 | List extractedTokens = collectExtractedNouns(stream); 51 | 52 | stream.close(); 53 | 54 | verify(tokens, extractedTokens); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/morph/WSCandidateComparator.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. 
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.lucene.analysis.kr.morph; 18 | 19 | import java.util.Comparator; 20 | import java.util.List; 21 | 22 | public class WSCandidateComparator implements Comparator { 23 | 24 | public int compare(WSOutput o1, WSOutput o2) { 25 | 26 | int end = o2.getLastEnd() - o1.getLastEnd(); 27 | if(end!=0) return end; 28 | 29 | int s1 = o1.getPhrases().size()==0 ? 999999999 : o1.getPhrases().size(); 30 | int s2 = o2.getPhrases().size()==0 ? 999999999 : o2.getPhrases().size(); 31 | 32 | int size = s1-s2; 33 | if(size!=0) return size; 34 | 35 | int score = calculateScore(o2)-calculateScore(o1); 36 | if(score!=0) return score; 37 | 38 | return 0; 39 | } 40 | 41 | private int calculateScore(WSOutput o) { 42 | 43 | List entries = o.getPhrases(); 44 | 45 | if(entries.size()==0) return 0; 46 | 47 | int sum = 0; 48 | for(int i=0;i tokens = null; 20 | //불용어는 the와 . 21 | private StringReader reader = new StringReader("the 개발하고 꼭 이것을 잘 해야합니다. 
공백입니다."); 22 | private DictionaryFactory dictionaryFactory = null; 23 | 24 | @Before 25 | public void setUp() { 26 | tokens = Lists.newArrayList(); 27 | dictionaryFactory = DictionaryFactory.getFactory(); 28 | 29 | tokens.add(getToken("공백입니다", 24, 29)); 30 | tokens.add(getToken("해야합니다", 17, 22)); 31 | tokens.add(getToken("이것을", 11, 14)); 32 | tokens.add(getToken("개발하고", 4, 8)); 33 | tokens.add(getToken("꼭", 9, 10)); 34 | tokens.add(getToken("잘", 15, 16)); 35 | } 36 | 37 | 38 | @Test 39 | public void stopFilter() throws IOException { 40 | 41 | Map stopWordDictionaryMap = Maps.newHashMap(); 42 | stopWordDictionaryMap.put("the", null); 43 | stopWordDictionaryMap.put(".", null); 44 | 45 | dictionaryFactory.setStopWordDictionaryMap(stopWordDictionaryMap); 46 | 47 | TokenStream stream = new KoreanStopFilter(new KoreanCharacterTokenizer(reader)); 48 | stream.reset(); 49 | 50 | List extractedTokens = collectExtractedNouns(stream); 51 | 52 | stream.close(); 53 | 54 | verify(tokens, extractedTokens); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/test/java/com/tistory/devyongsik/analyzer/KoreanMorphEngineTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.StringReader; 4 | import java.util.List; 5 | 6 | import org.apache.lucene.analysis.TokenStream; 7 | import org.junit.Before; 8 | import org.junit.Test; 9 | 10 | import com.google.common.collect.Lists; 11 | import com.tistory.devyongsik.analyzer.util.AnalyzerTestUtil; 12 | import com.tistory.devyongsik.analyzer.util.TestToken; 13 | 14 | /** 15 | * @author need4spd, need4spd@cplanet.co.kr, 2011. 10. 14. 
16 | * 17 | */ 18 | public class KoreanMorphEngineTest extends AnalyzerTestUtil { 19 | private List nouns = null; 20 | 21 | private List engines = null; 22 | 23 | @Before 24 | public void initDictionary() { 25 | nouns = Lists.newArrayList(); 26 | engines = Lists.newArrayList(); 27 | 28 | engines.add(new KoreanMorphEngine()); 29 | } 30 | 31 | @Test 32 | public void testCase1() throws Exception { 33 | StringReader reader = new StringReader("기본사전이변경되었습니다"); 34 | nouns.add(getToken("기본사전이변경", 0, 7)); 35 | nouns.add(getToken("기본", 0, 2)); 36 | nouns.add(getToken("전이", 3, 5)); 37 | nouns.add(getToken("변경", 5, 7)); 38 | nouns.add(getToken("기본사전이변경되었습니다", 0, 12)); 39 | 40 | TokenStream stream = new KoreanNounFilter(new KoreanCharacterTokenizer(reader), engines); 41 | stream.reset(); 42 | 43 | List extractedTokens = collectExtractedNouns(stream); 44 | 45 | stream.close(); 46 | 47 | verify(nouns, extractedTokens); 48 | } 49 | 50 | @Test 51 | public void testCase2() throws Exception { 52 | StringReader reader = new StringReader("worldcup경기장"); 53 | nouns.add(getToken("worldcup", 0, 8)); 54 | nouns.add(getToken("경기장", 8, 11)); 55 | 56 | TokenStream stream = new KoreanNounFilter(new KoreanCharacterTokenizer(reader), engines); 57 | stream.reset(); 58 | 59 | List extractedTokens = collectExtractedNouns(stream); 60 | 61 | stream.close(); 62 | 63 | verify(nouns, extractedTokens); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/test/java/com/tistory/devyongsik/analyzer/KoreanSynonymEngineTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.IOException; 4 | import java.io.StringReader; 5 | import java.util.List; 6 | 7 | import org.apache.lucene.analysis.TokenStream; 8 | import org.junit.Before; 9 | import org.junit.Test; 10 | 11 | import com.google.common.collect.Lists; 12 | import 
com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory; 13 | import com.tistory.devyongsik.analyzer.util.AnalyzerTestUtil; 14 | import com.tistory.devyongsik.analyzer.util.TestToken; 15 | 16 | public class KoreanSynonymEngineTest extends AnalyzerTestUtil { 17 | private List synonymWordList = null; 18 | private List engines = null; 19 | private DictionaryFactory dictionaryFactory = null; 20 | private List nouns = null; 21 | 22 | @Before 23 | public void setUp() throws Exception { 24 | 25 | synonymWordList = Lists.newArrayList(); 26 | engines = Lists.newArrayList(); 27 | dictionaryFactory = DictionaryFactory.getFactory(); 28 | nouns = Lists.newArrayList(); 29 | 30 | synonymWordList.add("노트북"); 31 | synonymWordList.add("노트북pc"); 32 | synonymWordList.add("노트북컴퓨터"); 33 | synonymWordList.add("노트북피씨"); 34 | synonymWordList.add("notebook"); 35 | 36 | engines.add(new KoreanSynonymEngine()); 37 | 38 | dictionaryFactory.setSynonymList(synonymWordList); 39 | } 40 | 41 | @Test 42 | public void testSynonym() throws IOException { 43 | StringReader reader = new StringReader("노트북"); 44 | nouns.add(getToken("노트북", 0, 3)); 45 | nouns.add(getToken("노트북pc", 0, 3)); 46 | nouns.add(getToken("노트북컴퓨터", 0, 3)); 47 | nouns.add(getToken("노트북피씨", 0, 3)); 48 | nouns.add(getToken("notebook", 0, 3)); 49 | 50 | TokenStream stream = new KoreanNounFilter(new KoreanCharacterTokenizer(reader), engines); 51 | stream.reset(); 52 | 53 | List extractedTokens = collectExtractedNouns(stream); 54 | 55 | stream.close(); 56 | 57 | verify(nouns, extractedTokens); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/test/java/com/tistory/devyongsik/analyzer/KoreanMorphEngineTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.StringReader; 4 | import java.util.List; 5 | 6 | import org.apache.lucene.analysis.TokenStream; 7 | import 
org.junit.Before; 8 | import org.junit.Test; 9 | 10 | import com.google.common.collect.Lists; 11 | import com.tistory.devyongsik.analyzer.util.AnalyzerTestUtil; 12 | import com.tistory.devyongsik.analyzer.util.TestToken; 13 | 14 | /** 15 | * @author need4spd, need4spd@cplanet.co.kr, 2011. 10. 14. 16 | * 17 | */ 18 | public class KoreanMorphEngineTest extends AnalyzerTestUtil { 19 | private List nouns = null; 20 | 21 | private List engines = null; 22 | 23 | @Before 24 | public void initDictionary() { 25 | nouns = Lists.newArrayList(); 26 | engines = Lists.newArrayList(); 27 | 28 | engines.add(new KoreanMorphEngine()); 29 | } 30 | 31 | @Test 32 | public void testCase1() throws Exception { 33 | StringReader reader = new StringReader("기본사전이변경되었습니다"); 34 | nouns.add(getToken("기본사전이변경", 0, 7)); 35 | nouns.add(getToken("기본", 0, 2)); 36 | nouns.add(getToken("전이", 3, 5)); 37 | nouns.add(getToken("변경", 5, 7)); 38 | nouns.add(getToken("기본사전이변경되었습니다", 0, 12)); 39 | 40 | TokenStream stream = new KoreanNounFilter(new KoreanCharacterTokenizer(reader), engines); 41 | stream.reset(); 42 | 43 | List extractedTokens = collectExtractedNouns(stream); 44 | 45 | stream.close(); 46 | 47 | verify(nouns, extractedTokens); 48 | } 49 | 50 | @Test 51 | public void testCase2() throws Exception { 52 | StringReader reader = new StringReader("worldcup경기장"); 53 | nouns.add(getToken("worldcup", 0, 8)); 54 | nouns.add(getToken("경기장", 8, 11)); 55 | 56 | TokenStream stream = new KoreanNounFilter(new KoreanCharacterTokenizer(reader), engines); 57 | stream.reset(); 58 | 59 | List extractedTokens = collectExtractedNouns(stream); 60 | 61 | stream.close(); 62 | 63 | verify(nouns, extractedTokens); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/test/java/com/tistory/devyongsik/analyzer/KoreanSynonymEngineTest.java: -------------------------------------------------------------------------------- 1 | package 
com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.IOException; 4 | import java.io.StringReader; 5 | import java.util.List; 6 | 7 | import org.apache.lucene.analysis.TokenStream; 8 | import org.junit.Before; 9 | import org.junit.Test; 10 | 11 | import com.google.common.collect.Lists; 12 | import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory; 13 | import com.tistory.devyongsik.analyzer.util.AnalyzerTestUtil; 14 | import com.tistory.devyongsik.analyzer.util.TestToken; 15 | 16 | public class KoreanSynonymEngineTest extends AnalyzerTestUtil { 17 | private List synonymWordList = null; 18 | private List engines = null; 19 | private DictionaryFactory dictionaryFactory = null; 20 | private List nouns = null; 21 | 22 | @Before 23 | public void setUp() throws Exception { 24 | 25 | synonymWordList = Lists.newArrayList(); 26 | engines = Lists.newArrayList(); 27 | dictionaryFactory = DictionaryFactory.getFactory(); 28 | nouns = Lists.newArrayList(); 29 | 30 | synonymWordList.add("노트북"); 31 | synonymWordList.add("노트북pc"); 32 | synonymWordList.add("노트북컴퓨터"); 33 | synonymWordList.add("노트북피씨"); 34 | synonymWordList.add("notebook"); 35 | 36 | engines.add(new KoreanSynonymEngine()); 37 | 38 | dictionaryFactory.setSynonymList(synonymWordList); 39 | } 40 | 41 | @Test 42 | public void testSynonym() throws IOException { 43 | StringReader reader = new StringReader("노트북"); 44 | nouns.add(getToken("노트북", 0, 3)); 45 | nouns.add(getToken("노트북pc", 0, 3)); 46 | nouns.add(getToken("노트북컴퓨터", 0, 3)); 47 | nouns.add(getToken("노트북피씨", 0, 3)); 48 | nouns.add(getToken("notebook", 0, 3)); 49 | 50 | TokenStream stream = new KoreanNounFilter(new KoreanCharacterTokenizer(reader), engines); 51 | stream.reset(); 52 | 53 | List extractedTokens = collectExtractedNouns(stream); 54 | 55 | stream.close(); 56 | 57 | verify(nouns, extractedTokens); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- 
/korean-analyzer-3.x/src/test/java/com/tistory/devyongsik/analyzer/util/AnalyzerTestUtil.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer.util; 2 | 3 | import java.io.IOException; 4 | import java.util.List; 5 | 6 | import org.apache.lucene.analysis.TokenStream; 7 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 8 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 9 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 10 | 11 | import com.google.common.collect.Lists; 12 | 13 | import junit.framework.Assert; 14 | 15 | 16 | public class AnalyzerTestUtil { 17 | protected TestToken getToken(String term, int start, int end) { 18 | TestToken t = new TestToken(); 19 | t.setTerm(term); 20 | t.setStartOffset(start); 21 | t.setEndOffset(end); 22 | 23 | return t; 24 | } 25 | 26 | protected void verify(List expactedTokens, List extractedTokens) { 27 | 28 | for(TestToken testToken : expactedTokens) { 29 | Assert.assertTrue("[" + testToken + "] is expacted but not.", extractedTokens.contains(testToken)); 30 | } 31 | } 32 | 33 | protected List collectExtractedNouns(TokenStream stream) throws IOException { 34 | CharTermAttribute charTermAtt = stream.addAttribute(CharTermAttribute.class); 35 | OffsetAttribute offSetAtt = stream.addAttribute(OffsetAttribute.class); 36 | TypeAttribute typeAttr = stream.addAttribute(TypeAttribute.class); 37 | 38 | List extractedTokens = Lists.newArrayList(); 39 | 40 | while(stream.incrementToken()) { 41 | TestToken t = getToken(charTermAtt.toString(), offSetAtt.startOffset(), offSetAtt.endOffset()); 42 | 43 | System.out.println("termAtt.term() : " + charTermAtt.toString()); 44 | System.out.println("startoffSetAtt : " + offSetAtt.startOffset()); 45 | System.out.println("endoffSetAtt : " + offSetAtt.endOffset()); 46 | System.out.println("typeAttr : " + typeAttr.toString()); 47 | 48 | extractedTokens.add(t); 49 | } 50 | 
51 | return extractedTokens; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/test/java/com/tistory/devyongsik/analyzer/util/AnalyzerTestUtil.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer.util; 2 | 3 | import java.io.IOException; 4 | import java.util.List; 5 | 6 | import org.apache.lucene.analysis.TokenStream; 7 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 8 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 9 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 10 | 11 | import com.google.common.collect.Lists; 12 | 13 | import junit.framework.Assert; 14 | 15 | 16 | public class AnalyzerTestUtil { 17 | protected TestToken getToken(String term, int start, int end) { 18 | TestToken t = new TestToken(); 19 | t.setTerm(term); 20 | t.setStartOffset(start); 21 | t.setEndOffset(end); 22 | 23 | return t; 24 | } 25 | 26 | protected void verify(List expactedTokens, List extractedTokens) { 27 | 28 | for(TestToken testToken : expactedTokens) { 29 | Assert.assertTrue("[" + testToken + "] is expacted but not.", extractedTokens.contains(testToken)); 30 | } 31 | } 32 | 33 | protected List collectExtractedNouns(TokenStream stream) throws IOException { 34 | CharTermAttribute charTermAtt = stream.addAttribute(CharTermAttribute.class); 35 | OffsetAttribute offSetAtt = stream.addAttribute(OffsetAttribute.class); 36 | TypeAttribute typeAttr = stream.addAttribute(TypeAttribute.class); 37 | 38 | List extractedTokens = Lists.newArrayList(); 39 | 40 | while(stream.incrementToken()) { 41 | TestToken t = getToken(charTermAtt.toString(), offSetAtt.startOffset(), offSetAtt.endOffset()); 42 | 43 | System.out.println("termAtt.term() : " + charTermAtt.toString()); 44 | System.out.println("startoffSetAtt : " + offSetAtt.startOffset()); 45 | System.out.println("endoffSetAtt : " + 
offSetAtt.endOffset()); 46 | System.out.println("typeAttr : " + typeAttr.toString()); 47 | 48 | extractedTokens.add(t); 49 | } 50 | 51 | return extractedTokens; 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/com/tistory/devyongsik/analyzer/KoreanNounFilter.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.IOException; 4 | import java.util.ArrayList; 5 | import java.util.Collections; 6 | import java.util.HashMap; 7 | import java.util.List; 8 | import java.util.Map; 9 | 10 | import org.apache.lucene.analysis.TokenFilter; 11 | import org.apache.lucene.analysis.TokenStream; 12 | import org.slf4j.Logger; 13 | import org.slf4j.LoggerFactory; 14 | 15 | public class KoreanNounFilter extends TokenFilter { 16 | private Logger logger = LoggerFactory.getLogger(KoreanNounFilter.class); 17 | 18 | private List comparableStateList = new ArrayList(); 19 | private List engines; 20 | private Map returnedTokens = new HashMap(); 21 | 22 | protected KoreanNounFilter(TokenStream input, List engines) { 23 | super(input); 24 | this.engines = engines; 25 | } 26 | 27 | @Override 28 | public final boolean incrementToken() throws IOException { 29 | 30 | 31 | if(logger.isDebugEnabled()) 32 | logger.debug("incrementToken KoreanNounFilter"); 33 | 34 | if(engines == null) { 35 | throw new IllegalStateException("KoreanNounFilter의 engines가 Null입니다."); 36 | } 37 | 38 | 39 | if (comparableStateList.size() > 0) { 40 | if(logger.isDebugEnabled()) 41 | logger.debug("명사 Stack에서 토큰 리턴함"); 42 | 43 | ComparableState comparableState = comparableStateList.get(0); 44 | comparableStateList.remove(0); 45 | State synState = comparableState.getState(); 46 | restoreState(synState); 47 | 48 | return true; 49 | } 50 | 51 | if (!input.incrementToken()) 52 | return false; 53 | 54 | try { 55 | 56 | for(Engine engine : engines) { 57 
| engine.collectNounState(input.cloneAttributes(), comparableStateList , returnedTokens); 58 | } 59 | 60 | returnedTokens.clear(); 61 | Collections.sort(comparableStateList); //startoffset이 순서대로 나오도록... 62 | 63 | } catch (Exception e) { 64 | logger.error("명사필터에서 목록 조회 오류"); 65 | e.printStackTrace(); 66 | } 67 | 68 | return true; 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/com/tistory/devyongsik/analyzer/KoreanAnalyzer.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.Reader; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | 7 | import org.apache.lucene.analysis.Analyzer; 8 | import org.apache.lucene.analysis.TokenStream; 9 | import org.apache.lucene.analysis.Tokenizer; 10 | import org.apache.lucene.util.Version; 11 | 12 | /** 13 | * @author need4spd, need4spd@cplanet.co.kr, 2011. 8. 31. 
14 | * 15 | */ 16 | public class KoreanAnalyzer extends Analyzer { 17 | 18 | private boolean isIndexingMode = false; 19 | 20 | public KoreanAnalyzer() { 21 | isIndexingMode = true; 22 | } 23 | 24 | public KoreanAnalyzer(boolean isIndexingMode) { 25 | this.isIndexingMode = isIndexingMode; 26 | } 27 | 28 | @Override 29 | protected TokenStreamComponents createComponents(final String fieldName, 30 | final Reader reader) { 31 | 32 | if(isIndexingMode) { 33 | List nounExtractEngines = new ArrayList(); 34 | nounExtractEngines.add(new KoreanCompoundNounEngine()); 35 | nounExtractEngines.add(new KoreanBaseNounEngine()); 36 | nounExtractEngines.add(new KoreanLongestNounEngine()); 37 | nounExtractEngines.add(new KoreanSynonymEngine()); 38 | nounExtractEngines.add(new KoreanMorphEngine()); 39 | 40 | Tokenizer tokenizer = new KoreanCharacterTokenizer(Version.LUCENE_44, reader); 41 | TokenStream tok = new KoreanNounFilter(tokenizer, nounExtractEngines); 42 | tok = new KoreanStopFilter(tok); 43 | 44 | return new TokenStreamComponents(tokenizer, tok); 45 | } else { 46 | List nounExtractEngines = new ArrayList(); 47 | nounExtractEngines.add(new KoreanCompoundNounEngine()); 48 | nounExtractEngines.add(new KoreanLongestNounEngine()); 49 | nounExtractEngines.add(new KoreanSynonymEngine()); 50 | nounExtractEngines.add(new KoreanMorphEngine()); 51 | 52 | Tokenizer tokenizer = new KoreanCharacterTokenizer(Version.LUCENE_44, reader); 53 | TokenStream tok = new KoreanNounFilter(tokenizer, nounExtractEngines); 54 | tok = new KoreanStopFilter(tok); 55 | 56 | return new TokenStreamComponents(tokenizer, tok); 57 | } 58 | } 59 | 60 | } -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/com/tistory/devyongsik/analyzer/KoreanAnalyzer.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.Reader; 4 | import java.util.ArrayList; 5 | 
import java.util.List; 6 | 7 | import org.apache.lucene.analysis.ReusableAnalyzerBase; 8 | import org.apache.lucene.analysis.TokenStream; 9 | import org.apache.lucene.analysis.Tokenizer; 10 | import org.apache.lucene.util.Version; 11 | 12 | /** 13 | * @author need4spd, need4spd@cplanet.co.kr, 2011. 8. 31. 14 | * 15 | */ 16 | public class KoreanAnalyzer extends ReusableAnalyzerBase { 17 | 18 | private boolean isIndexingMode = false; 19 | 20 | public KoreanAnalyzer() { 21 | isIndexingMode = true; 22 | } 23 | 24 | public KoreanAnalyzer(boolean isIndexingMode) { 25 | this.isIndexingMode = isIndexingMode; 26 | } 27 | 28 | @Override 29 | protected TokenStreamComponents createComponents(final String fieldName, 30 | final Reader reader) { 31 | 32 | if(isIndexingMode) { 33 | List nounExtractEngines = new ArrayList(); 34 | nounExtractEngines.add(new KoreanCompoundNounEngine()); 35 | nounExtractEngines.add(new KoreanBaseNounEngine()); 36 | nounExtractEngines.add(new KoreanLongestNounEngine()); 37 | nounExtractEngines.add(new KoreanSynonymEngine()); 38 | nounExtractEngines.add(new KoreanMorphEngine()); 39 | 40 | Tokenizer tokenizer = new KoreanCharacterTokenizer(Version.LUCENE_36, reader); 41 | TokenStream tok = new KoreanNounFilter(tokenizer, nounExtractEngines); 42 | tok = new KoreanStopFilter(tok); 43 | 44 | return new TokenStreamComponents(tokenizer, tok); 45 | } else { 46 | List nounExtractEngines = new ArrayList(); 47 | nounExtractEngines.add(new KoreanCompoundNounEngine()); 48 | nounExtractEngines.add(new KoreanLongestNounEngine()); 49 | nounExtractEngines.add(new KoreanSynonymEngine()); 50 | nounExtractEngines.add(new KoreanMorphEngine()); 51 | 52 | Tokenizer tokenizer = new KoreanCharacterTokenizer(Version.LUCENE_36, reader); 53 | TokenStream tok = new KoreanNounFilter(tokenizer, nounExtractEngines); 54 | tok = new KoreanStopFilter(tok); 55 | 56 | return new TokenStreamComponents(tokenizer, tok); 57 | } 58 | } 59 | 60 | } 
-------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/morph/SpaceOutput.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.morph; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | /** 7 | * 공백을 분석한 결과를 저장한다. 8 | * @author smlee 9 | * 10 | */ 11 | public class SpaceOutput { 12 | 13 | // 분석된 결과 14 | private AnalysisOutput output; 15 | 16 | // 분석 결과 앞에 있는 미등록어, 사람 이름은 대부분 이런 경우임. 17 | private List nrWords = new ArrayList(); 18 | 19 | // 분석하기 이전의 어절 20 | private String source; 21 | 22 | public void initialize() { 23 | output = null; 24 | nrWords = new ArrayList(); 25 | source = null; 26 | } 27 | 28 | /** 29 | * @return the output 30 | */ 31 | public AnalysisOutput getOutput() { 32 | return output; 33 | } 34 | 35 | /** 36 | * @param output the output to set 37 | */ 38 | public void setOutput(AnalysisOutput output) { 39 | this.output = output; 40 | } 41 | 42 | /** 43 | * @return the nrWord 44 | */ 45 | public List getNRWords() { 46 | return nrWords; 47 | } 48 | 49 | /** 50 | * @param nrWord the nrWord to set 51 | */ 52 | public void setNRWords(List words) { 53 | this.nrWords = words; 54 | } 55 | 56 | /** 57 | * 58 | * @param word 59 | */ 60 | public void addNRWord(String word) { 61 | addNRWord(word, AnalysisOutput.SCORE_CORRECT); 62 | } 63 | 64 | /** 65 | * 66 | * @param word 67 | * @param score 68 | */ 69 | public void addNRWord(String word, int score) { 70 | AnalysisOutput output = new AnalysisOutput(word,null,null,PatternConstants.PTN_N,score); 71 | output.setSource(word); 72 | output.setPos(PatternConstants.POS_NOUN); 73 | this.nrWords.add(0,output); 74 | } 75 | 76 | /** 77 | * @return the source 78 | */ 79 | public String getSource() { 80 | return source; 81 | } 82 | 83 | /** 84 | * @param source the source to set 85 | */ 86 | public void setSource(String source) { 87 | 
this.source = source; 88 | } 89 | 90 | /** 91 | * 분석된 전체 단어의 길이를 반환한다. 92 | * @return 93 | */ 94 | public int getLength() { 95 | 96 | if(this.source ==null) return 0; 97 | 98 | return this.source.length(); 99 | } 100 | 101 | } 102 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/test/java/com/tistory/devyongsik/analyzer/KoreanCompoundNounEngineTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.StringReader; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | import java.util.Map; 7 | 8 | import org.apache.lucene.analysis.TokenStream; 9 | import org.junit.Before; 10 | import org.junit.Test; 11 | 12 | import com.google.common.collect.Lists; 13 | import com.google.common.collect.Maps; 14 | import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory; 15 | import com.tistory.devyongsik.analyzer.util.AnalyzerTestUtil; 16 | import com.tistory.devyongsik.analyzer.util.TestToken; 17 | 18 | public class KoreanCompoundNounEngineTest extends AnalyzerTestUtil { 19 | private List compondNouns = Lists.newArrayList(); 20 | private StringReader reader = new StringReader("월드컵조직위원회분과위"); 21 | private List engines = new ArrayList(); 22 | private DictionaryFactory dictionaryFactory; 23 | 24 | @Before 25 | public void initDictionary() { 26 | compondNouns.add(getToken("분과위", 8, 11)); 27 | compondNouns.add(getToken("위원회", 5, 8)); 28 | compondNouns.add(getToken("조직", 3, 5)); 29 | compondNouns.add(getToken("월드컵", 0, 3)); 30 | compondNouns.add(getToken("월드컵조직위원회분과위", 0, 11)); 31 | 32 | dictionaryFactory = DictionaryFactory.getFactory(); 33 | } 34 | 35 | @Test 36 | public void testCompoundNounExtract() throws Exception { 37 | Map> compoundNounDictionaryMap = Maps.newHashMap(); 38 | List compoundList = Lists.newArrayList(); 39 | compoundList.add("분과위"); 40 | compoundList.add("위원회"); 41 | 
compoundList.add("조직"); 42 | compoundList.add("월드컵"); 43 | 44 | compoundNounDictionaryMap.put("월드컵조직위원회분과위", compoundList); 45 | 46 | dictionaryFactory.setCompoundDictionaryMap(compoundNounDictionaryMap); 47 | 48 | createEngines(); 49 | 50 | TokenStream stream = new KoreanNounFilter(new KoreanCharacterTokenizer(reader), engines); 51 | 52 | stream.reset(); 53 | 54 | List extractedTokens = collectExtractedNouns(stream); 55 | 56 | stream.close(); 57 | 58 | verify(compondNouns, extractedTokens); 59 | } 60 | 61 | private void createEngines() { 62 | engines.add(new KoreanCompoundNounEngine()); 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/test/java/com/tistory/devyongsik/analyzer/KoreanCompoundNounEngineTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.StringReader; 4 | import java.util.ArrayList; 5 | import java.util.List; 6 | import java.util.Map; 7 | 8 | import org.apache.lucene.analysis.TokenStream; 9 | import org.junit.Before; 10 | import org.junit.Test; 11 | 12 | import com.google.common.collect.Lists; 13 | import com.google.common.collect.Maps; 14 | import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory; 15 | import com.tistory.devyongsik.analyzer.util.AnalyzerTestUtil; 16 | import com.tistory.devyongsik.analyzer.util.TestToken; 17 | 18 | public class KoreanCompoundNounEngineTest extends AnalyzerTestUtil { 19 | private List compondNouns = Lists.newArrayList(); 20 | private StringReader reader = new StringReader("월드컵조직위원회분과위"); 21 | private List engines = new ArrayList(); 22 | private DictionaryFactory dictionaryFactory; 23 | 24 | @Before 25 | public void initDictionary() { 26 | compondNouns.add(getToken("분과위", 8, 11)); 27 | compondNouns.add(getToken("위원회", 5, 8)); 28 | compondNouns.add(getToken("조직", 3, 5)); 29 | compondNouns.add(getToken("월드컵", 0, 3)); 30 | 
compondNouns.add(getToken("월드컵조직위원회분과위", 0, 11)); 31 | 32 | dictionaryFactory = DictionaryFactory.getFactory(); 33 | } 34 | 35 | @Test 36 | public void testCompoundNounExtract() throws Exception { 37 | Map> compoundNounDictionaryMap = Maps.newHashMap(); 38 | List compoundList = Lists.newArrayList(); 39 | compoundList.add("분과위"); 40 | compoundList.add("위원회"); 41 | compoundList.add("조직"); 42 | compoundList.add("월드컵"); 43 | 44 | compoundNounDictionaryMap.put("월드컵조직위원회분과위", compoundList); 45 | 46 | dictionaryFactory.setCompoundDictionaryMap(compoundNounDictionaryMap); 47 | 48 | createEngines(); 49 | 50 | TokenStream stream = new KoreanNounFilter(new KoreanCharacterTokenizer(reader), engines); 51 | 52 | stream.reset(); 53 | 54 | List extractedTokens = collectExtractedNouns(stream); 55 | 56 | stream.close(); 57 | 58 | verify(compondNouns, extractedTokens); 59 | } 60 | 61 | private void createEngines() { 62 | engines.add(new KoreanCompoundNounEngine()); 63 | } 64 | 65 | } 66 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/morph/CompoundEntry.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | 4 | * contributor license agreements. See the NOTICE file distributed with 5 | * this work for additional information regarding copyright ownership. 6 | * The ASF licenses this file to You under the Apache License, Version 2.0 7 | * (the "License"); you may not use this file except in compliance with 8 | * the License. You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | package org.apache.lucene.analysis.kr.morph; 19 | 20 | /** 21 | * 복합명사의 개별단어에 대한 정보를 담고있는 클래스 22 | * @author S.M.Lee 23 | * 24 | */ 25 | public class CompoundEntry { 26 | 27 | private String word; 28 | 29 | private int offset = -1; 30 | 31 | private boolean exist = true; 32 | 33 | private char pos = PatternConstants.POS_NOUN; 34 | 35 | public CompoundEntry() { 36 | 37 | } 38 | 39 | public CompoundEntry(String w) { 40 | this.word = w; 41 | } 42 | 43 | public CompoundEntry(String w,int o) { 44 | this(w); 45 | this.offset = o; 46 | } 47 | 48 | public CompoundEntry(String w,int o, boolean is) { 49 | this(w,o); 50 | this.exist = is; 51 | } 52 | 53 | public CompoundEntry(String w,int o, boolean is, char p) { 54 | this(w,o,is); 55 | this.pos = p; 56 | } 57 | 58 | public void setWord(String w) { 59 | this.word = w; 60 | } 61 | 62 | public void setOffset(int o) { 63 | this.offset = o; 64 | } 65 | 66 | public String getWord() { 67 | return this.word; 68 | } 69 | 70 | public int getOffset() { 71 | return this.offset; 72 | } 73 | 74 | public boolean isExist() { 75 | return exist; 76 | } 77 | 78 | public void setExist(boolean is) { 79 | this.exist = is; 80 | } 81 | 82 | public char getPos() { 83 | return pos; 84 | } 85 | 86 | public void setPos(char pos) { 87 | this.pos = pos; 88 | } 89 | } 90 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/test/java/com/tistory/devyongsik/analyzer/KoreanCharacterTokenizerTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | 4 | 5 | import java.io.IOException; 6 | import java.io.StringReader; 7 | import java.util.HashSet; 8 | import java.util.Set; 9 | 10 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 11 | import 
org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 12 | import org.junit.Assert; 13 | import org.junit.Before; 14 | import org.junit.Test; 15 | 16 | import com.tistory.devyongsik.analyzer.util.AnalyzerTestUtil; 17 | import com.tistory.devyongsik.analyzer.util.TestToken; 18 | 19 | /** 20 | * 21 | * @author 장용석, 2011.07.16 need4spd@naver.com 22 | */ 23 | 24 | public class KoreanCharacterTokenizerTest extends AnalyzerTestUtil { 25 | 26 | private Set tokenizedToken = new HashSet(); 27 | private StringReader content = new StringReader("삼성전자absc1234엠피3mp3버전1.2 띄어쓰기"); 28 | private KoreanCharacterTokenizer tokenizer = new KoreanCharacterTokenizer(content); 29 | 30 | @Before 31 | public void setUp() throws IOException { 32 | tokenizedToken.add(getToken("띄어쓰기", 25, 29)); 33 | tokenizedToken.add(getToken("2", 22, 23)); 34 | tokenizedToken.add(getToken("1", 20, 21)); 35 | tokenizedToken.add(getToken("버전", 18, 20)); 36 | tokenizedToken.add(getToken("3",17, 18)); 37 | tokenizedToken.add(getToken("mp", 15, 17)); 38 | tokenizedToken.add(getToken("3", 14, 15)); 39 | tokenizedToken.add(getToken("엠피", 12, 14)); 40 | tokenizedToken.add(getToken("1234", 8, 12)); 41 | tokenizedToken.add(getToken("absc", 4, 8)); 42 | tokenizedToken.add(getToken("삼성전자", 0, 4)); 43 | 44 | tokenizer.reset(); 45 | } 46 | 47 | @Test 48 | public void testIncrementToken() throws IOException { 49 | CharTermAttribute charTermAtt = tokenizer.getAttribute(CharTermAttribute.class); 50 | OffsetAttribute offSetAtt = tokenizer.getAttribute(OffsetAttribute.class); 51 | 52 | while(tokenizer.incrementToken()) { 53 | TestToken t = getToken(charTermAtt.toString(), offSetAtt.startOffset(), offSetAtt.endOffset()); 54 | System.out.println("termAtt.term() : " + charTermAtt.toString()); 55 | System.out.println("offSetAtt : " + offSetAtt.startOffset()); 56 | System.out.println("offSetAtt : " + offSetAtt.endOffset()); 57 | 58 | Assert.assertTrue(tokenizedToken.contains(t)); 59 | } 60 | } 61 | } 62 | 
-------------------------------------------------------------------------------- /korean-analyzer-4.x/src/test/java/com/tistory/devyongsik/analyzer/KoreanCharacterTokenizerTest.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | 4 | 5 | import java.io.IOException; 6 | import java.io.StringReader; 7 | import java.util.HashSet; 8 | import java.util.Set; 9 | 10 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 11 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 12 | import org.junit.Assert; 13 | import org.junit.Before; 14 | import org.junit.Test; 15 | 16 | import com.tistory.devyongsik.analyzer.util.AnalyzerTestUtil; 17 | import com.tistory.devyongsik.analyzer.util.TestToken; 18 | 19 | /** 20 | * 21 | * @author 장용석, 2011.07.16 need4spd@naver.com 22 | */ 23 | 24 | public class KoreanCharacterTokenizerTest extends AnalyzerTestUtil { 25 | 26 | private Set tokenizedToken = new HashSet(); 27 | private StringReader content = new StringReader("삼성전자absc1234엠피3mp3버전1.2 띄어쓰기"); 28 | private KoreanCharacterTokenizer tokenizer = new KoreanCharacterTokenizer(content); 29 | 30 | @Before 31 | public void setUp() throws IOException { 32 | tokenizedToken.add(getToken("띄어쓰기", 25, 29)); 33 | tokenizedToken.add(getToken("2", 22, 23)); 34 | tokenizedToken.add(getToken("1", 20, 21)); 35 | tokenizedToken.add(getToken("버전", 18, 20)); 36 | tokenizedToken.add(getToken("3",17, 18)); 37 | tokenizedToken.add(getToken("mp", 15, 17)); 38 | tokenizedToken.add(getToken("3", 14, 15)); 39 | tokenizedToken.add(getToken("엠피", 12, 14)); 40 | tokenizedToken.add(getToken("1234", 8, 12)); 41 | tokenizedToken.add(getToken("absc", 4, 8)); 42 | tokenizedToken.add(getToken("삼성전자", 0, 4)); 43 | 44 | tokenizer.reset(); 45 | } 46 | 47 | @Test 48 | public void testIncrementToken() throws IOException { 49 | CharTermAttribute charTermAtt = tokenizer.getAttribute(CharTermAttribute.class); 50 
| OffsetAttribute offSetAtt = tokenizer.getAttribute(OffsetAttribute.class); 51 | 52 | while(tokenizer.incrementToken()) { 53 | TestToken t = getToken(charTermAtt.toString(), offSetAtt.startOffset(), offSetAtt.endOffset()); 54 | System.out.println("termAtt.term() : " + charTermAtt.toString()); 55 | System.out.println("offSetAtt : " + offSetAtt.startOffset()); 56 | System.out.println("offSetAtt : " + offSetAtt.endOffset()); 57 | 58 | Assert.assertTrue(tokenizedToken.contains(t)); 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/morph/WSAOutput.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.lucene.analysis.kr.morph; 18 | 19 | import java.util.ArrayList; 20 | import java.util.List; 21 | 22 | public class WSAOutput { 23 | 24 | private String source; 25 | 26 | private List results; 27 | 28 | private int wds = 0; 29 | 30 | private int end = 0; 31 | 32 | public WSAOutput() { 33 | results = new ArrayList(); 34 | } 35 | 36 | public WSAOutput(String src) { 37 | source = src; 38 | results = new ArrayList(); 39 | } 40 | 41 | public WSAOutput(String src, List list) { 42 | source = src; 43 | results = list; 44 | } 45 | 46 | public String getSource() { 47 | return source; 48 | } 49 | 50 | public void setSource(String source) { 51 | this.source = source; 52 | } 53 | 54 | public List getResults() { 55 | return results; 56 | } 57 | 58 | public void setResults(List results) { 59 | this.results = results; 60 | } 61 | 62 | public void addNounResults(String word) { 63 | addNounResults(word, null); 64 | } 65 | 66 | public void addNounResults(String word, String end) { 67 | addNounResults(word, end, AnalysisOutput.SCORE_ANALYSIS); 68 | } 69 | 70 | public void addNounResults(String word, String end, int score) { 71 | 72 | AnalysisOutput output = new AnalysisOutput(word, end, null, PatternConstants.PTN_NJ); 73 | if(end==null) output.setPatn(PatternConstants.PTN_N); 74 | 75 | output.setPos(PatternConstants.POS_NOUN); 76 | output.setScore(score); 77 | 78 | this.results.add(output); 79 | } 80 | 81 | } 82 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/com/tistory/devyongsik/analyzer/KoreanStopFilter.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | 6 | import org.apache.lucene.analysis.TokenFilter; 7 | import org.apache.lucene.analysis.TokenStream; 8 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 9 | import 
org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 10 | import org.slf4j.Logger; 11 | import org.slf4j.LoggerFactory; 12 | 13 | import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory; 14 | 15 | public class KoreanStopFilter extends TokenFilter { 16 | 17 | private boolean enablePositionIncrements = false; 18 | 19 | private CharTermAttribute charTermAtt; 20 | private PositionIncrementAttribute posIncrAtt; 21 | private Logger logger = LoggerFactory.getLogger(KoreanStopFilter.class); 22 | private static Map stopWordsMap = null; 23 | 24 | protected KoreanStopFilter(TokenStream input) { 25 | super(input); 26 | 27 | if(logger.isInfoEnabled()) { 28 | logger.info("init KoreanStopFilter"); 29 | } 30 | charTermAtt = getAttribute(CharTermAttribute.class); 31 | posIncrAtt = getAttribute(PositionIncrementAttribute.class); 32 | 33 | DictionaryFactory dictionaryFactory = DictionaryFactory.getFactory(); 34 | stopWordsMap = dictionaryFactory.getStopWordDictionaryMap(); 35 | } 36 | 37 | public void setEnablePositionIncrements(boolean enable) { 38 | this.enablePositionIncrements = enable; 39 | } 40 | 41 | public boolean getEnablePositionIncrements() { 42 | return enablePositionIncrements; 43 | } 44 | 45 | @Override 46 | public final boolean incrementToken() throws IOException { 47 | 48 | if(logger.isDebugEnabled()) 49 | logger.debug("incrementToken KoreanStopFilter"); 50 | 51 | 52 | // return the first non-stop word found 53 | int skippedPositions = 0; 54 | 55 | while(input.incrementToken()) { 56 | 57 | if(logger.isDebugEnabled()) 58 | logger.debug("원래 리턴 될 TermAtt : " + charTermAtt.toString() + " , stopWordDic.isExist : " + stopWordsMap.containsKey(charTermAtt.toString())); 59 | 60 | if(!stopWordsMap.containsKey(charTermAtt.toString())) { 61 | if(enablePositionIncrements) { 62 | posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); 63 | } 64 | 65 | return true; 66 | } 67 | 68 | skippedPositions += 
posIncrAtt.getPositionIncrement(); 69 | } 70 | 71 | return false; 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/com/tistory/devyongsik/analyzer/KoreanStopFilter.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.IOException; 4 | import java.util.Map; 5 | 6 | import org.apache.lucene.analysis.TokenFilter; 7 | import org.apache.lucene.analysis.TokenStream; 8 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 9 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 10 | import org.slf4j.Logger; 11 | import org.slf4j.LoggerFactory; 12 | 13 | import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory; 14 | 15 | public class KoreanStopFilter extends TokenFilter { 16 | 17 | private boolean enablePositionIncrements = false; 18 | 19 | private CharTermAttribute charTermAtt; 20 | private PositionIncrementAttribute posIncrAtt; 21 | private Logger logger = LoggerFactory.getLogger(KoreanStopFilter.class); 22 | private static Map stopWordsMap = null; 23 | 24 | protected KoreanStopFilter(TokenStream input) { 25 | super(input); 26 | 27 | if(logger.isInfoEnabled()) { 28 | logger.info("init KoreanStopFilter"); 29 | } 30 | charTermAtt = getAttribute(CharTermAttribute.class); 31 | posIncrAtt = getAttribute(PositionIncrementAttribute.class); 32 | 33 | DictionaryFactory dictionaryFactory = DictionaryFactory.getFactory(); 34 | stopWordsMap = dictionaryFactory.getStopWordDictionaryMap(); 35 | } 36 | 37 | public void setEnablePositionIncrements(boolean enable) { 38 | this.enablePositionIncrements = enable; 39 | } 40 | 41 | public boolean getEnablePositionIncrements() { 42 | return enablePositionIncrements; 43 | } 44 | 45 | @Override 46 | public final boolean incrementToken() throws IOException { 47 | 48 | if(logger.isDebugEnabled()) 49 | 
logger.debug("incrementToken KoreanStopFilter"); 50 | 51 | 52 | // return the first non-stop word found 53 | int skippedPositions = 0; 54 | 55 | while(input.incrementToken()) { 56 | 57 | if(logger.isDebugEnabled()) 58 | logger.debug("원래 리턴 될 TermAtt : " + charTermAtt.toString() + " , stopWordDic.isExist : " + stopWordsMap.containsKey(charTermAtt.toString())); 59 | 60 | if(!stopWordsMap.containsKey(charTermAtt.toString())) { 61 | if(enablePositionIncrements) { 62 | posIncrAtt.setPositionIncrement(posIncrAtt.getPositionIncrement() + skippedPositions); 63 | } 64 | 65 | return true; 66 | } 67 | 68 | skippedPositions += posIncrAtt.getPositionIncrement(); 69 | } 70 | 71 | return false; 72 | } 73 | 74 | } 75 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/utils/HanjaUtils.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.lucene.analysis.kr.utils; 18 | 19 | import java.io.IOException; 20 | 21 | import java.util.ArrayList; 22 | import java.util.HashMap; 23 | import java.util.List; 24 | import java.util.Map; 25 | 26 | import org.apache.lucene.analysis.kr.morph.MorphException; 27 | 28 | public class HanjaUtils { 29 | 30 | private static Map mapHanja; 31 | 32 | public synchronized static void loadDictionary() throws MorphException { 33 | try { 34 | List strList = FileUtil.readLines("org/apache/lucene/analysis/kr/dic/mapHanja.dic","UTF-8"); 35 | mapHanja = new HashMap(); 36 | 37 | for(int i=0;i0x9FFF||hanja<0x3400) return new char[]{hanja}; 67 | 68 | char[] result = mapHanja.get(new String(new char[]{hanja})); 69 | if(result==null) return new char[]{hanja}; 70 | 71 | return result; 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/build.gradle: -------------------------------------------------------------------------------- 1 | apply plugin: 'java' 2 | apply plugin: 'eclipse' 3 | apply plugin: 'eclipse-wtp' 4 | apply plugin: 'maven' 5 | 6 | 7 | sourceCompatibility = 1.7 8 | 9 | group = 'com.tistory.devyongsik' 10 | version = '0.6-SNAPSHOT' 11 | 12 | repositories { 13 | mavenCentral() 14 | } 15 | 16 | def versions = [ 17 | lucene : "3.6.2" 18 | ] 19 | 20 | dependencies { 21 | compile( 22 | [group: 'junit', name: 'junit', version: '4.4'], 23 | [group: 'org.apache.lucene', name: 'lucene-core', version: "${versions.lucene}"], 24 | [group: 'org.slf4j', name: 'slf4j-api', version: '1.6.6'], 25 | [group: 'org.slf4j', name: 'slf4j-simple', version: '1.6.6'], 26 | [group: 'ch.qos.logback', name: 'logback-core', version: '1.0.13'], 27 | [group: 'ch.qos.logback', name: 'logback-classic', version: '1.0.13'], 28 | [group: 'org.codehaus.groovy', name: 'groovy-all', version: '2.1.6'], 29 | [group: 'com.google.guava', name: 'guava', version: 'r09'] 30 | ) 31 | 32 | testRuntime( 33 | [group: 'junit', 
name: 'junit', version: '4.4'], 34 | [group: 'org.apache.lucene', name: 'lucene-core', version: "${versions.lucene}"], 35 | [group: 'org.slf4j', name: 'slf4j-api', version: '1.6.6'], 36 | [group: 'org.slf4j', name: 'slf4j-simple', version: '1.6.6'], 37 | [group: 'ch.qos.logback', name: 'logback-core', version: '1.0.13'], 38 | [group: 'ch.qos.logback', name: 'logback-classic', version: '1.0.13'], 39 | [group: 'org.codehaus.groovy', name: 'groovy-all', version: '2.1.6'], 40 | [group: 'com.google.guava', name: 'guava', version: 'r09'] 41 | ) 42 | } 43 | 44 | test { 45 | jvmArgs = ['-ea', '-Xmx256m'] 46 | logging.captureStandardOutput(LogLevel.INFO) 47 | } 48 | 49 | task copyDictionary(type: Copy) { 50 | from 'src/main/java' 51 | into 'target/classes/main' 52 | include '**/*.properties' 53 | include '**/*.dic' 54 | include '**/*.jflex' 55 | include '**/*.txt' 56 | 57 | includeEmptyDirs = false 58 | } 59 | 60 | eclipse { 61 | classpath { 62 | downloadSources=true 63 | } 64 | 65 | jdt { 66 | file { 67 | withProperties { 68 | properties -> properties.setProperty("encoding//src/main/java", "utf-8") 69 | properties.setProperty("encoding//src/main/resources", "utf-8") 70 | properties.setProperty("encoding//src/test/java", "utf-8") 71 | properties.setProperty("encoding//src/test/resources", "utf-8") 72 | } 73 | } 74 | } 75 | } 76 | 77 | uploadArchives { 78 | repositories.mavenDeployer { 79 | repository(url: "file:///Users/need4spd/Programming/need4spd-maven-repo/snapshots") 80 | //repository(url: "file:///Programming/Java/need4spd-maven-repo/snapshots") 81 | } 82 | } 83 | 84 | tasks.test.dependsOn copyDictionary 85 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/morph/WordEntry.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.morph; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) 
under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import java.util.ArrayList; 21 | import java.util.List; 22 | 23 | public class WordEntry { 24 | 25 | public static final int IDX_NOUN = 0; 26 | public static final int IDX_VERB = 1; 27 | public static final int IDX_BUSA = 2; 28 | public static final int IDX_DOV = 3; 29 | public static final int IDX_BEV = 4; 30 | public static final int IDX_NE = 5; 31 | public static final int IDX_ADJ = 6; // 형용사 32 | public static final int IDX_NPR = 7; // 명사의 분류 (M:Measure) 33 | public static final int IDX_CNOUNX = 8; 34 | public static final int IDX_REGURA = 9; 35 | 36 | /** 37 | * 단어 38 | */ 39 | private String word; 40 | 41 | /** 42 | * 단어특성 43 | */ 44 | private char[] features; 45 | 46 | private List compounds = new ArrayList(); 47 | 48 | public WordEntry() { 49 | 50 | } 51 | 52 | public WordEntry(String word) { 53 | this.word = word; 54 | } 55 | 56 | public WordEntry(String word, char[] cs) { 57 | this.word = word; 58 | this.features = cs; 59 | } 60 | 61 | public WordEntry(String word, List c) { 62 | this.word = word; 63 | this.compounds = c; 64 | } 65 | 66 | public void setWord(String w) { 67 | this.word = w; 68 | } 69 | 70 | public String getWord() { 71 | return this.word; 72 | } 73 | 74 
| public void setFeatures(char[] cs) { 75 | this.features = cs; 76 | } 77 | 78 | public char getFeature(int index) { 79 | if(features==null||features.length c) { 88 | this.compounds = c; 89 | } 90 | 91 | public List getCompounds() { 92 | return this.compounds; 93 | } 94 | 95 | } 96 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/morph/WordEntry.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package org.apache.lucene.analysis.kr.morph; 18 | 19 | 20 | import java.util.ArrayList; 21 | import java.util.List; 22 | 23 | public class WordEntry { 24 | 25 | public static final int IDX_NOUN = 0; 26 | public static final int IDX_VERB = 1; 27 | public static final int IDX_BUSA = 2; 28 | public static final int IDX_DOV = 3; 29 | public static final int IDX_BEV = 4; 30 | public static final int IDX_NE = 5; 31 | public static final int IDX_ADJ = 6; // 형용사 32 | public static final int IDX_NPR = 7; // 명사의 분류 (M:Measure) 33 | public static final int IDX_CNOUNX = 8; 34 | public static final int IDX_REGURA = 9; 35 | 36 | /** 37 | * 단어 38 | */ 39 | private String word; 40 | 41 | /** 42 | * 단어특성 43 | */ 44 | private char[] features; 45 | 46 | private List compounds = new ArrayList(); 47 | 48 | public WordEntry() { 49 | 50 | } 51 | 52 | public WordEntry(String word) { 53 | this.word = word; 54 | } 55 | 56 | public WordEntry(String word, char[] cs) { 57 | this.word = word; 58 | this.features = cs; 59 | } 60 | 61 | public WordEntry(String word, List c) { 62 | this.word = word; 63 | this.compounds = c; 64 | } 65 | 66 | public void setWord(String w) { 67 | this.word = w; 68 | } 69 | 70 | public String getWord() { 71 | return this.word; 72 | } 73 | 74 | public void setFeatures(char[] cs) { 75 | this.features = cs; 76 | } 77 | 78 | public char getFeature(int index) { 79 | if(features==null||features.length c) { 88 | this.compounds = c; 89 | } 90 | 91 | public List getCompounds() { 92 | return this.compounds; 93 | } 94 | 95 | } 96 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/com/tistory/devyongsik/analyzer/DictionaryProperties.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.util.Properties; 6 | 7 | import org.slf4j.Logger; 8 | 
import org.slf4j.LoggerFactory; 9 | 10 | public class DictionaryProperties { 11 | private Logger logger = LoggerFactory.getLogger(DictionaryProperties.class); 12 | 13 | private static DictionaryProperties instance = new DictionaryProperties(); 14 | 15 | private Properties defaultProp = new Properties(); 16 | private Properties customProp = new Properties(); 17 | 18 | private String resourceName = "dictionary.properties"; 19 | private final String defaultResourceName = "com/tistory/devyongsik/analyzer/dictionary.properties"; 20 | 21 | private DictionaryProperties() { 22 | loadDefaultProperties(); 23 | loadCustomProperties(); 24 | } 25 | 26 | private void loadDefaultProperties() { 27 | if(logger.isDebugEnabled()) 28 | logger.debug("load analyzer default properties..... : " + defaultResourceName); 29 | 30 | Class clazz = DictionaryProperties.class; 31 | 32 | InputStream in = clazz.getClassLoader().getResourceAsStream(defaultResourceName); 33 | 34 | if(in == null) { 35 | logger.error(defaultResourceName + " was not found!!!"); 36 | throw new IllegalStateException(defaultResourceName + " was not found!!!"); 37 | } 38 | 39 | try { 40 | defaultProp.load(in); 41 | in.close(); 42 | } catch (IOException e) { 43 | logger.error(e.toString()); 44 | } 45 | 46 | if(logger.isInfoEnabled()) { 47 | logger.info("default dictionary.properties : " + defaultProp); 48 | } 49 | } 50 | 51 | private void loadCustomProperties() { 52 | if(logger.isDebugEnabled()) 53 | logger.debug("load analyzer custom properties..... : " + resourceName); 54 | 55 | Class clazz = DictionaryProperties.class; 56 | 57 | InputStream in = clazz.getClassLoader().getResourceAsStream(resourceName); 58 | 59 | if(in == null) { 60 | logger.warn(customProp + " was not found!!! 
skip load custom properties"); 61 | return; 62 | } 63 | 64 | try { 65 | customProp.load(in); 66 | in.close(); 67 | } catch (IOException e) { 68 | logger.error(e.toString()); 69 | } 70 | 71 | if(logger.isInfoEnabled()) { 72 | logger.info("custom dictionary.properties : " + customProp); 73 | } 74 | } 75 | 76 | public static DictionaryProperties getInstance() { 77 | return instance; 78 | } 79 | 80 | public String getProperty(String key) { 81 | //read property value from custom properties first 82 | String value = customProp.getProperty(key); 83 | 84 | if(value == null) { 85 | value = defaultProp.getProperty(key); 86 | } 87 | 88 | return value.trim(); 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/com/tistory/devyongsik/analyzer/DictionaryProperties.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.io.IOException; 4 | import java.io.InputStream; 5 | import java.util.Properties; 6 | 7 | import org.slf4j.Logger; 8 | import org.slf4j.LoggerFactory; 9 | 10 | public class DictionaryProperties { 11 | private Logger logger = LoggerFactory.getLogger(DictionaryProperties.class); 12 | 13 | private static DictionaryProperties instance = new DictionaryProperties(); 14 | 15 | private Properties defaultProp = new Properties(); 16 | private Properties customProp = new Properties(); 17 | 18 | private String resourceName = "dictionary.properties"; 19 | private final String defaultResourceName = "com/tistory/devyongsik/analyzer/dictionary.properties"; 20 | 21 | private DictionaryProperties() { 22 | loadDefaultProperties(); 23 | loadCustomProperties(); 24 | } 25 | 26 | private void loadDefaultProperties() { 27 | if(logger.isDebugEnabled()) 28 | logger.debug("load analyzer default properties..... 
: " + defaultResourceName); 29 | 30 | Class clazz = DictionaryProperties.class; 31 | 32 | InputStream in = clazz.getClassLoader().getResourceAsStream(defaultResourceName); 33 | 34 | if(in == null) { 35 | logger.error(defaultResourceName + " was not found!!!"); 36 | throw new IllegalStateException(defaultResourceName + " was not found!!!"); 37 | } 38 | 39 | try { 40 | defaultProp.load(in); 41 | in.close(); 42 | } catch (IOException e) { 43 | logger.error(e.toString()); 44 | } 45 | 46 | if(logger.isInfoEnabled()) { 47 | logger.info("default dictionary.properties : " + defaultProp); 48 | } 49 | } 50 | 51 | private void loadCustomProperties() { 52 | if(logger.isDebugEnabled()) 53 | logger.debug("load analyzer custom properties..... : " + resourceName); 54 | 55 | Class clazz = DictionaryProperties.class; 56 | 57 | InputStream in = clazz.getClassLoader().getResourceAsStream(resourceName); 58 | 59 | if(in == null) { 60 | logger.warn(customProp + " was not found!!! skip load custom properties"); 61 | return; 62 | } 63 | 64 | try { 65 | customProp.load(in); 66 | in.close(); 67 | } catch (IOException e) { 68 | logger.error(e.toString()); 69 | } 70 | 71 | if(logger.isInfoEnabled()) { 72 | logger.info("custom dictionary.properties : " + customProp); 73 | } 74 | } 75 | 76 | public static DictionaryProperties getInstance() { 77 | return instance; 78 | } 79 | 80 | public String getProperty(String key) { 81 | //read property value from custom properties first 82 | String value = customProp.getProperty(key); 83 | 84 | if(value == null) { 85 | value = defaultProp.getProperty(key); 86 | } 87 | 88 | return value.trim(); 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/com/tistory/devyongsik/utils/NounDictionaryDuplWordRemover.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.utils; 2 | 3 | import java.io.BufferedReader; 4 | 
import java.io.BufferedWriter; 5 | import java.io.File; 6 | import java.io.FileInputStream; 7 | import java.io.FileOutputStream; 8 | import java.io.IOException; 9 | import java.io.InputStream; 10 | import java.io.InputStreamReader; 11 | import java.io.OutputStream; 12 | import java.io.OutputStreamWriter; 13 | import java.util.ArrayList; 14 | import java.util.Collections; 15 | import java.util.HashMap; 16 | import java.util.List; 17 | import java.util.Map; 18 | import java.util.Set; 19 | 20 | public class NounDictionaryDuplWordRemover { 21 | public static void main(String[] args) throws IOException { 22 | 23 | File nounFile = new File("/Users/need4spd/Programming/Java/workspace/walkingword/src/com/tistory/devyongsik/analyzer/dictionary/noun.txt"); 24 | File customNounFile = new File("/Users/need4spd/Programming/Java/workspace/walkingword/src/com/tistory/devyongsik/analyzer/dictionary/custom.txt"); 25 | 26 | 27 | InputStream nounIs = new FileInputStream(nounFile); 28 | InputStreamReader nounIsr = new InputStreamReader(nounIs); 29 | BufferedReader nounBr = new BufferedReader(nounIsr); 30 | 31 | Map nounsMap = new HashMap(); 32 | 33 | String nounTemp = ""; 34 | while((nounTemp = nounBr.readLine()) != null) { 35 | nounsMap.put(nounTemp, ""); 36 | } 37 | 38 | InputStream customIs = new FileInputStream(customNounFile); 39 | InputStreamReader customIsr = new InputStreamReader(customIs); 40 | BufferedReader customBr = new BufferedReader(customIsr); 41 | 42 | Map customMap = new HashMap(); 43 | 44 | String customTemp = ""; 45 | while((customTemp = customBr.readLine()) != null) { 46 | customMap.put(customTemp, ""); 47 | } 48 | 49 | int dupCount = 0; 50 | Set customNounsKeySet = customMap.keySet(); 51 | 52 | for(String customNoun : customNounsKeySet) { 53 | if (nounsMap.containsKey(customNoun)) { 54 | nounsMap.remove(customNoun); 55 | dupCount++; 56 | } 57 | } 58 | 59 | System.out.println("dup count : " + dupCount); 60 | 61 | customBr.close(); 62 | customIsr.close(); 63 | 
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.lucene.analysis.kr.morph;

import java.util.ArrayList;
import java.util.List;

/**
 * Holds the result of analyzing one whitespace-delimited segment:
 * the analysis output, any unregistered words found in front of it
 * (mostly person names), and the original surface string.
 *
 * @author smlee
 */
public class SpaceOutput {

	// the analysis result for this segment
	private AnalysisOutput output;

	// unregistered words preceding the result; typically person names
	private List<AnalysisOutput> nrWords = new ArrayList<AnalysisOutput>();

	// the surface string before analysis
	private String source;

	/** Resets all state so this instance can be reused for the next segment. */
	public void initialize() {
		output = null;
		nrWords = new ArrayList<AnalysisOutput>();
		source = null;
	}

	/**
	 * @return the output
	 */
	public AnalysisOutput getOutput() {
		return output;
	}

	/**
	 * @param output the output to set
	 */
	public void setOutput(AnalysisOutput output) {
		this.output = output;
	}

	/**
	 * @return the unregistered-word list
	 */
	public List<AnalysisOutput> getNRWords() {
		return nrWords;
	}

	/**
	 * @param words the unregistered-word list to set
	 */
	public void setNRWords(List<AnalysisOutput> words) {
		this.nrWords = words;
	}

	/**
	 * Registers an unregistered word with the default (correct) score.
	 *
	 * @param word surface form to register
	 */
	public void addNRWord(String word) {
		addNRWord(word, AnalysisOutput.SCORE_CORRECT);
	}

	/**
	 * Registers an unregistered word as a noun analysis and prepends it to
	 * the list.
	 *
	 * @param word  surface form to register
	 * @param score analysis confidence score
	 */
	public void addNRWord(String word, int score) {
		// local renamed from "output" to avoid shadowing the field
		AnalysisOutput nrOutput = new AnalysisOutput(word, null, null, PatternConstants.PTN_N, score);
		nrOutput.setSource(word);
		nrOutput.setPos(PatternConstants.POS_NOUN);
		nrWords.add(0, nrOutput);
	}

	/**
	 * @return the source
	 */
	public String getSource() {
		return source;
	}

	/**
	 * @param source the source to set
	 */
	public void setSource(String source) {
		this.source = source;
	}

	/**
	 * Returns the length of the analyzed surface string.
	 *
	 * @return length of {@code source}, or 0 when no source is set
	 */
	public int getLength() {
		return source == null ? 0 : source.length();
	}
}
package com.tistory.devyongsik.analyzer;

import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Stack;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.State;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory;

/**
 * Engine that expands a token into its component nouns when the token is
 * found in the compound-noun dictionary, pushing one captured attribute
 * state per component onto the nouns stack.
 */
public class KoreanCompoundNounEngine implements Engine {

	private Logger logger = LoggerFactory.getLogger(KoreanCompoundNounEngine.class);

	// compound noun -> list of its component nouns, loaded once from the dictionary
	private Map<String, List<String>> compoundNouns = new HashMap<String, List<String>>();

	public KoreanCompoundNounEngine() {
		if (logger.isInfoEnabled()) {
			logger.info("init KoreanCompoundNounEngine");
		}

		compoundNouns = DictionaryFactory.getFactory().getCompoundDictionaryMap();
	}

	/**
	 * If the current token is a compound noun, captures one state per
	 * component noun onto {@code nounsStack}; components already present in
	 * {@code returnedTokens} are skipped to avoid duplicate emission.
	 *
	 * @param attributeSource the token stream's attribute source (mutated in place)
	 * @param nounsStack      receives one captured state per extracted component
	 * @param returnedTokens  dedup set keyed by "term_start_end"
	 */
	@Override
	public void collectNounState(AttributeSource attributeSource, Stack<State> nounsStack, Map<String, String> returnedTokens) throws Exception {

		CharTermAttribute termAttr = attributeSource.getAttribute(CharTermAttribute.class);
		TypeAttribute typeAttr = attributeSource.getAttribute(TypeAttribute.class);
		OffsetAttribute offSetAttr = attributeSource.getAttribute(OffsetAttribute.class);
		PositionIncrementAttribute positionAttr = attributeSource.getAttribute(PositionIncrementAttribute.class);

		String termString = termAttr.toString();
		// record the token itself so other engines do not emit it again
		returnedTokens.put(termString + "_" + offSetAttr.startOffset() + "_" + offSetAttr.endOffset(), "");

		// FIX: capture the token's original start offset BEFORE the loop below
		// mutates the OffsetAttribute via setOffset()
		int orgStartOffset = offSetAttr.startOffset();

		// the word exists in the compound-noun dictionary
		List<String> matchedData = compoundNouns.get(termString);
		if (matchedData != null) {
			typeAttr.setType("compounds");

			for (String noun : matchedData) {

				if (logger.isDebugEnabled()) {
					logger.debug("복합명사추출 : " + noun);
				}

				// FIX: offsets must be absolute positions in the input text,
				// not positions inside the token; the previous code used
				// termString.indexOf(noun) alone, which breaks highlighting
				// (and dedup keys) for any token not starting at offset 0.
				// Same pattern as KoreanBaseNounEngine.
				int startOffSet = orgStartOffset + termString.indexOf(noun);
				int endOffSet = startOffSet + noun.length();

				String makeKeyForCheck = noun + "_" + startOffSet + "_" + endOffSet;

				if (returnedTokens.containsKey(makeKeyForCheck)) {
					if (logger.isDebugEnabled()) {
						logger.debug("[" + makeKeyForCheck + "] 는 이미 추출된 Token입니다. Skip");
					}

					continue;

				} else {
					returnedTokens.put(makeKeyForCheck, "");
				}

				termAttr.setEmpty();
				termAttr.append(noun);

				positionAttr.setPositionIncrement(1);

				offSetAttr.setOffset(startOffSet, endOffSet);

				typeAttr.setType("compound");
				nounsStack.add(attributeSource.captureState());
			}
		}
	}
}
searcherManager = null; 28 | private Logger logger = LoggerFactory.getLogger(SynonymDictionaryIndex.class); 29 | 30 | private static SynonymDictionaryIndex indexingModule = new SynonymDictionaryIndex(); 31 | 32 | private IndexWriter indexWriter = null; 33 | 34 | private SynonymDictionaryIndex() { 35 | try { 36 | 37 | Analyzer analyzer = new SimpleAnalyzer(Version.LUCENE_36); //문서 내용을 분석 할 때 사용 될 Analyzer 38 | IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_36, analyzer); 39 | iwc.setOpenMode(OpenMode.CREATE); 40 | 41 | indexWriter = new IndexWriter(ramDirectory, iwc); 42 | searcherManager = new SearcherManager(indexWriter,true, new SearcherFactory()); 43 | 44 | } catch (IOException e) { 45 | e.printStackTrace(); 46 | throw new IllegalStateException(); 47 | } 48 | } 49 | 50 | public static SynonymDictionaryIndex getIndexingModule() { 51 | return indexingModule; 52 | } 53 | 54 | public SearcherManager getSearcherManager() { 55 | 56 | return searcherManager; 57 | } 58 | 59 | public synchronized void indexingDictionary(List synonyms) { 60 | 61 | try { 62 | 63 | indexWriter.deleteAll(); 64 | indexWriter.commit(); 65 | 66 | int recordCnt = 0; 67 | //동의어들을 ,로 잘라내어 색인합니다. 68 | //하나의 document에 syn이라는 이름의 필드를 여러개 추가합니다. 69 | //나중에 syn=노트북 으로 검색한다면 그때 나온 결과 Document로부터 70 | //모든 동의어 리스트를 얻을 수 있습니다. 
package com.tistory.devyongsik.analyzer;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory;

/**
 * Engine that expands a token into its component nouns when the token is
 * found in the compound-noun dictionary, adding one {@link ComparableState}
 * per component to the state list.
 */
public class KoreanCompoundNounEngine implements Engine {

	private Logger logger = LoggerFactory.getLogger(KoreanCompoundNounEngine.class);

	// compound noun -> list of its component nouns, loaded once from the dictionary
	private Map<String, List<String>> compoundNouns = new HashMap<String, List<String>>();

	public KoreanCompoundNounEngine() {
		if (logger.isInfoEnabled()) {
			logger.info("init KoreanCompoundNounEngine");
		}

		compoundNouns = DictionaryFactory.getFactory().getCompoundDictionaryMap();
	}

	/**
	 * If the current token is a compound noun, captures one state per
	 * component noun into {@code comparableStateList}; components already
	 * present in {@code returnedTokens} are skipped to avoid duplicates.
	 *
	 * @param attributeSource     the token stream's attribute source (mutated in place)
	 * @param comparableStateList receives one ComparableState per extracted component
	 * @param returnedTokens      dedup set keyed by "term_start_end"
	 */
	@Override
	public void collectNounState(AttributeSource attributeSource, List<ComparableState> comparableStateList, Map<String, String> returnedTokens) throws Exception {

		CharTermAttribute termAttr = attributeSource.getAttribute(CharTermAttribute.class);
		TypeAttribute typeAttr = attributeSource.getAttribute(TypeAttribute.class);
		OffsetAttribute offSetAttr = attributeSource.getAttribute(OffsetAttribute.class);
		PositionIncrementAttribute positionAttr = attributeSource.getAttribute(PositionIncrementAttribute.class);

		String termString = termAttr.toString();
		// record the token itself so other engines do not emit it again
		returnedTokens.put(termString + "_" + offSetAttr.startOffset() + "_" + offSetAttr.endOffset(), "");

		// FIX: capture the token's original start offset BEFORE the loop below
		// mutates the OffsetAttribute via setOffset()
		int orgStartOffset = offSetAttr.startOffset();

		// the word exists in the compound-noun dictionary
		List<String> matchedData = compoundNouns.get(termString);
		if (matchedData != null) {
			typeAttr.setType("compounds");

			for (String noun : matchedData) {

				if (logger.isDebugEnabled()) {
					logger.debug("복합명사추출 : " + noun);
				}

				// FIX: offsets must be absolute positions in the input text,
				// not positions inside the token; the previous code used
				// termString.indexOf(noun) alone, which breaks highlighting
				// (and dedup keys) for any token not starting at offset 0.
				// Same pattern as KoreanBaseNounEngine.
				int startOffSet = orgStartOffset + termString.indexOf(noun);
				int endOffSet = startOffSet + noun.length();

				String makeKeyForCheck = noun + "_" + startOffSet + "_" + endOffSet;

				if (returnedTokens.containsKey(makeKeyForCheck)) {
					if (logger.isDebugEnabled()) {
						logger.debug("[" + makeKeyForCheck + "] 는 이미 추출된 Token입니다. Skip");
					}

					continue;

				} else {
					returnedTokens.put(makeKeyForCheck, "");
				}

				termAttr.setEmpty();
				termAttr.append(noun);

				positionAttr.setPositionIncrement(1);

				offSetAttr.setOffset(startOffSet, endOffSet);

				typeAttr.setType("compound");

				ComparableState comparableState = new ComparableState();
				comparableState.setState(attributeSource.captureState());
				comparableState.setStartOffset(offSetAttr.startOffset());

				comparableStateList.add(comparableState);
			}
		}
	}
}
LoggerFactory.getLogger(SynonymDictionaryIndex.class); 28 | 29 | private static SynonymDictionaryIndex indexingModule = new SynonymDictionaryIndex(); 30 | 31 | private IndexWriter indexWriter = null; 32 | 33 | private SynonymDictionaryIndex() { 34 | try { 35 | 36 | Analyzer analyzer = new SimpleAnalyzer(Version.LUCENE_44); //문서 내용을 분석 할 때 사용 될 Analyzer 37 | IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44, analyzer); 38 | iwc.setOpenMode(OpenMode.CREATE); 39 | 40 | indexWriter = new IndexWriter(ramDirectory, iwc); 41 | searcherManager = new SearcherManager(indexWriter,true, new SearcherFactory()); 42 | 43 | } catch (IOException e) { 44 | e.printStackTrace(); 45 | throw new IllegalStateException(); 46 | } 47 | } 48 | 49 | public static SynonymDictionaryIndex getIndexingModule() { 50 | return indexingModule; 51 | } 52 | 53 | public SearcherManager getSearcherManager() { 54 | 55 | return searcherManager; 56 | } 57 | 58 | public synchronized void indexingDictionary(List synonyms) { 59 | 60 | try { 61 | 62 | indexWriter.deleteAll(); 63 | indexWriter.commit(); 64 | 65 | int recordCnt = 0; 66 | //동의어들을 ,로 잘라내어 색인합니다. 67 | //하나의 document에 syn이라는 이름의 필드를 여러개 추가합니다. 68 | //나중에 syn=노트북 으로 검색한다면 그때 나온 결과 Document로부터 69 | //모든 동의어 리스트를 얻을 수 있습니다. 
70 | 71 | FieldType fieldType = new FieldType(); 72 | fieldType.setIndexed(true); 73 | fieldType.setStored(true); 74 | fieldType.setIndexOptions(IndexOptions.DOCS_ONLY); 75 | fieldType.setTokenized(false); 76 | 77 | for(String syn : synonyms) { 78 | String[] synonymWords = syn.split(","); 79 | Document doc = new Document(); 80 | for(int i = 0, size = synonymWords.length; i < size ; i++) { 81 | 82 | 83 | String fieldValue = synonymWords[i]; 84 | Field field = new Field("syn", fieldValue, fieldType); 85 | doc.add(field); 86 | 87 | recordCnt++; 88 | }//end inner for 89 | indexWriter.addDocument(doc); 90 | }//end outer for 91 | 92 | indexWriter.commit(); 93 | 94 | logger.info("동의어 색인 단어 갯수 : {}", recordCnt); 95 | 96 | } catch (Exception e) { 97 | throw new IllegalStateException(); 98 | } 99 | } 100 | } 101 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/morph/WSOutput.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.morph; 2 | 3 | import java.util.ArrayList; 4 | import java.util.Collections; 5 | import java.util.List; 6 | 7 | public class WSOutput implements Cloneable { 8 | 9 | private int lastStart = 0; 10 | 11 | private int lastEnd = 0; 12 | 13 | private List phrases = new ArrayList(); 14 | 15 | public WSOutput() { 16 | 17 | } 18 | 19 | public WSOutput(AnalysisOutput o) { 20 | addPhrase(o); 21 | } 22 | 23 | public int getLastStart() { 24 | return lastStart; 25 | } 26 | 27 | public void setLastStart(int start) { 28 | this.lastStart = start; 29 | } 30 | 31 | public int getLastEnd() { 32 | return lastEnd; 33 | } 34 | 35 | public void setLastEnd(int end) { 36 | this.lastStart = end; 37 | } 38 | 39 | 40 | public List getPhrases() { 41 | return phrases; 42 | } 43 | 44 | public void removeLast() { 45 | 46 | if(this.phrases.size()==0) return; 47 | 48 | AnalysisOutput o = 
this.phrases.remove(this.phrases.size()-1); 49 | 50 | if(this.phrases.size()==0) { 51 | 52 | this.lastStart = 0; 53 | this.lastEnd = 0; 54 | 55 | } else { 56 | 57 | this.lastEnd -= o.getSource().length(); 58 | 59 | if(this.phrases.size()>1) { 60 | AnalysisOutput o1 = this.phrases.get(this.phrases.size()-1); 61 | this.lastStart = lastEnd-o1.getSource().length(); 62 | } else { 63 | this.lastStart = 0; 64 | } 65 | 66 | } 67 | 68 | } 69 | 70 | public void addPhrase(AnalysisOutput o) { 71 | 72 | this.lastStart = this.lastEnd; 73 | this.lastEnd += o.getSource().length(); 74 | 75 | if(o.getCNounList().size()==0) 76 | this.phrases.add(o); 77 | else 78 | addCompounds(o); 79 | 80 | } 81 | 82 | private void addCompounds(AnalysisOutput o) { 83 | 84 | List cnouns = o.getCNounList(); 85 | 86 | String source = o.getSource(); 87 | int rmstemlen = 0; 88 | 89 | // for(int i=0;i=cnouns.size()-2) break; 106 | 107 | int score = AnalysisOutput.SCORE_CORRECT; 108 | if(!cnouns.get(i).isExist()) score=AnalysisOutput.SCORE_CANDIDATE; 109 | 110 | AnalysisOutput o1 = new AnalysisOutput(noun, null, null, 111 | PatternConstants.POS_NOUN, PatternConstants.PTN_N, score); 112 | 113 | o1.setSource(noun); 114 | 115 | if(isOnechar) { 116 | o1.addCNoun(cnouns.get(i)); 117 | o1.addCNoun(cnouns.get(i+1)); 118 | } 119 | 120 | if(source.length()>noun.length()) 121 | source = source.substring(noun.length()); 122 | 123 | this.phrases.add(o1); 124 | cnouns.remove(cnouns.get(0)); 125 | i--; 126 | 127 | if(isOnechar) { 128 | cnouns.remove(cnouns.get(0)); 129 | } 130 | 131 | } 132 | 133 | o.setStem(o.getStem().substring(o.getSource().length()-source.length())); 134 | o.setSource(source); 135 | if(cnouns.size()==1) cnouns.remove(0); 136 | 137 | this.phrases.add(o); 138 | 139 | } 140 | 141 | public void setPhrases(List phrases) { 142 | this.phrases = phrases; 143 | } 144 | 145 | public WSOutput clone() throws CloneNotSupportedException { 146 | 147 | WSOutput candidate = (WSOutput)super.clone(); 148 | 149 | 
candidate.setLastStart(lastStart); 150 | 151 | candidate.setLastEnd(lastEnd); 152 | 153 | List list = new ArrayList(); 154 | list.addAll(phrases); 155 | candidate.setPhrases(list); 156 | 157 | return candidate; 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/utils/ConstraintUtil.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.utils; 2 | 3 | import java.util.HashMap; 4 | 5 | import java.util.Map; 6 | 7 | import org.apache.lucene.analysis.kr.morph.PatternConstants; 8 | 9 | /** 10 | * 결합이 가능한 조건을 처리하는 클래스 11 | * @author smlee 12 | * 13 | */ 14 | public class ConstraintUtil { 15 | 16 | private static Map hahes = new HashMap(); // "글로벌화해 ", "민족화해" 처럼 화해와 결합이 가능한 명사 17 | static { 18 | hahes.put("민족", "Y");hahes.put("동서", "Y");hahes.put("남북", "Y"); 19 | } 20 | 21 | private static Map eomiPnouns = new HashMap(); 22 | static { 23 | eomiPnouns.put("ㄴ", "Y");eomiPnouns.put("ㄹ", "Y");eomiPnouns.put("ㅁ", "Y"); 24 | } 25 | 26 | private static Map PTN_MLIST= new HashMap(); 27 | static { 28 | PTN_MLIST.put(PatternConstants.PTN_NSM, PatternConstants.PTN_NSM); 29 | PTN_MLIST.put(PatternConstants.PTN_NSMXM, PatternConstants.PTN_NSMXM); 30 | PTN_MLIST.put(PatternConstants.PTN_NJCM, PatternConstants.PTN_NJCM); 31 | PTN_MLIST.put(PatternConstants.PTN_VM, PatternConstants.PTN_VM); 32 | PTN_MLIST.put(PatternConstants.PTN_VMCM, PatternConstants.PTN_VMCM); 33 | PTN_MLIST.put(PatternConstants.PTN_VMXM, PatternConstants.PTN_VMXM); 34 | PTN_MLIST.put(PatternConstants.PTN_NVM, PatternConstants.PTN_NVM); 35 | } 36 | 37 | private static Map PTN_JLIST= new HashMap(); 38 | static { 39 | PTN_JLIST.put(PatternConstants.PTN_NJ, PatternConstants.PTN_NJ); 40 | PTN_JLIST.put(PatternConstants.PTN_NSMJ, PatternConstants.PTN_NSMJ); 41 | PTN_JLIST.put(PatternConstants.PTN_VMJ, PatternConstants.PTN_VMJ); 42 | 
PTN_JLIST.put(PatternConstants.PTN_VMXMJ, PatternConstants.PTN_VMXMJ); 43 | } 44 | 45 | private static Map WORD_GUKS= new HashMap(); 46 | static { 47 | WORD_GUKS.put("날것", "Y"); 48 | WORD_GUKS.put("들것", "Y"); 49 | WORD_GUKS.put("별것", "Y"); 50 | WORD_GUKS.put("찰것", "Y"); 51 | WORD_GUKS.put("탈것", "Y"); 52 | WORD_GUKS.put("하잘것", "Y"); 53 | } 54 | 55 | // 종성이 있는 음절과 연결될 수 없는 조사 56 | private static Map JOSA_TWO= new HashMap(); 57 | static { 58 | JOSA_TWO.put("가", "Y"); 59 | JOSA_TWO.put("는", "Y"); 60 | JOSA_TWO.put("다", "Y"); 61 | JOSA_TWO.put("나", "Y"); 62 | JOSA_TWO.put("니", "Y"); 63 | JOSA_TWO.put("고", "Y"); 64 | JOSA_TWO.put("라", "Y"); 65 | JOSA_TWO.put("와", "Y"); 66 | JOSA_TWO.put("랑", "Y"); 67 | JOSA_TWO.put("를", "Y"); 68 | JOSA_TWO.put("며", "Y"); 69 | JOSA_TWO.put("든", "Y"); 70 | JOSA_TWO.put("야", "Y"); 71 | JOSA_TWO.put("여", "Y"); 72 | } 73 | 74 | // 종성이 없는 음절과 연결될 수 없는 조사 75 | private static Map JOSA_THREE= new HashMap(); 76 | static { 77 | JOSA_THREE.put("과", "Y"); 78 | JOSA_THREE.put("은", "Y"); 79 | JOSA_THREE.put("아", "Y"); 80 | JOSA_THREE.put("으", "Y"); 81 | JOSA_THREE.put("은", "Y"); 82 | JOSA_THREE.put("을", "Y"); 83 | } 84 | 85 | public static boolean canHaheCompound(String key) { 86 | if(hahes.get(key)!=null) return true; 87 | return false; 88 | } 89 | 90 | /** 91 | * 어미가 ㄴ,ㄹ,ㅁ 으로 끝나는지 조사한다. 
 * @param eomi the ending (eomi) string to test; may be null or empty
 * @return true if the ending terminates in ㄴ, ㄹ or ㅁ
 */
public static boolean isNLM(String eomi) {

    if(eomi==null || "".equals(eomi)) return false;

    // the ending may itself be a bare jamo (ㄴ/ㄹ/ㅁ)
    if(eomiPnouns.get(eomi)!=null) return true;

    // otherwise decompose the last syllable and inspect its final consonant
    char[] chrs = MorphUtil.decompose(eomi.charAt(eomi.length()-1));
    if(chrs.length==3 && eomiPnouns.get(Character.toString(chrs[2]))!=null) return true;

    // NOTE(review): BUG? this fall-through returns true, so every non-empty
    // input passes and the two checks above are dead code; it almost certainly
    // should be `return false;` (the 4.x copy of this class has the same defect).
    return true;

}

// true if the pattern is one of the eomi (verbal-ending) phrase patterns in PTN_MLIST
public static boolean isEomiPhrase(int ptn) {

    if(PTN_MLIST.get(ptn)!=null) return true;

    return false;

}

// true if the pattern is one of the josa noun-phrase patterns in PTN_JLIST
public static boolean isJosaNounPhrase(int ptn) {

    if(PTN_JLIST.get(ptn)!=null) return true;

    return false;

}

// true only for the adverb+josa pattern
public static boolean isJosaAdvPhrase(int ptn) {

    if(PatternConstants.PTN_ADVJ==ptn) return true;

    return false;

}

// true for either of the adverbial patterns (PTN_ADVJ or PTN_AID)
public static boolean isAdvPhrase(int ptn) {

    if(PatternConstants.PTN_ADVJ==ptn || PatternConstants.PTN_AID==ptn) return true;

    return false;

}

// true if the josa cannot follow a syllable that has a final consonant (jongseong)
public static boolean isTwoJosa(String josa) {

    return (JOSA_TWO.get(josa)!=null);

}

// true if the josa cannot follow a syllable without a final consonant
public static boolean isThreeJosa(String josa) {

    return (JOSA_THREE.get(josa)!=null);

}
}
--------------------------------------------------------------------------------
/korean-analyzer-4.x/src/main/java/com/tistory/devyongsik/analyzer/KoreanBaseNounEngine.java:
--------------------------------------------------------------------------------
package com.tistory.devyongsik.analyzer;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory;

/**
 * Noun-extraction engine (4.x line): scans a token of type "word" for every
 * substring that appears in the custom noun dictionary and captures one
 * attribute state per match into the supplied list.
 */
public class KoreanBaseNounEngine implements Engine {

    private Logger logger = LoggerFactory.getLogger(KoreanBaseNounEngine.class);

    // custom noun dictionary loaded from DictionaryFactory; only the keys are consulted
    private Map customNounsDic = new HashMap();

    public KoreanBaseNounEngine() {
        if(logger.isInfoEnabled()) {
            logger.info("init KoreanBaseNounEngine");
        }

        customNounsDic = DictionaryFactory.getFactory().getCustomNounDictionaryMap();
    }

    /**
     * Scans the current token for dictionary nouns.
     *
     * @param attributeSource     the token's attributes; mutated in place for each match
     * @param comparableStateList receives one captured state per extracted noun
     * @param returnedTokens      de-duplication set keyed by "term_start_end"
     */
    @Override
    public void collectNounState(AttributeSource attributeSource, List comparableStateList, Map returnedTokens) throws Exception {

        CharTermAttribute termAttr = attributeSource.getAttribute(CharTermAttribute.class);
        TypeAttribute typeAttr = attributeSource.getAttribute(TypeAttribute.class);
        OffsetAttribute offSetAttr = attributeSource.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionAttr = attributeSource.getAttribute(PositionIncrementAttribute.class);

        //Stack nounsStack = new Stack();

        // only tokens tagged "word" are analyzed for nouns
        if(!typeAttr.type().equals("word")) {

            if(logger.isDebugEnabled()) {
                logger.debug("명사 분석 대상이 아닙니다.");
            }

            return;
        }

        String term = termAttr.toString();
        // check whether the whole term itself is a dictionary noun
        if(customNounsDic.containsKey(term)) {
            typeAttr.setType("base_noun");
        }

        returnedTokens.put(term+"_"+offSetAttr.startOffset()+"_"+offSetAttr.endOffset(), "");

        String comparedWord = null;
        // 1. extract every dictionary match as it is found (sliding window scan)
        int startIndex = 0;
        int endIndex = startIndex + 1;

        int orgStartOffset = offSetAttr.startOffset();

        boolean isPrevMatch = false;

        while(true) {

            // window ran off the end: restart one character further right
            if(endIndex > term.length()) {
                startIndex ++;
                endIndex = startIndex + 1;
            }

            if(startIndex >= term.length()) {
                break;
            }

            comparedWord = term.substring(startIndex, endIndex);

            // capture a state whenever a proper substring matches the dictionary
            if(customNounsDic.containsKey(comparedWord) && !term.equals(comparedWord)) {

                // offsets must be recalculated so that highlighting works correctly
                int startOffSet = orgStartOffset + startIndex;
                int endOffSet = orgStartOffset + endIndex;

                String makeKeyForCheck = comparedWord + "_" + startOffSet + "_" + endOffSet;

                if(returnedTokens.containsKey(makeKeyForCheck)) {

                    if(logger.isDebugEnabled()) {
                        logger.debug("["+makeKeyForCheck+"] 는 이미 추출된 Token입니다. Skip");
                    }

                    endIndex++;
                    isPrevMatch = true;

                    continue;

                } else {
                    returnedTokens.put(makeKeyForCheck, "");
                }

                termAttr.setEmpty();
                termAttr.append(comparedWord);

                positionAttr.setPositionIncrement(1); // extracted noun: set position increment to 1
                // set the token type to a noun
                typeAttr.setType("base_noun");

                offSetAttr.setOffset(startOffSet , endOffSet);

                // capture the modified attribute state together with its start offset
                ComparableState comparableState = new ComparableState();
                comparableState.setState(attributeSource.captureState());
                comparableState.setStartOffset(offSetAttr.startOffset());

                comparableStateList.add(comparableState);

                endIndex++;
                isPrevMatch = true;

            } else {
                // no match: right after a match, restart the window just before
                // endIndex; otherwise simply widen the window by one character
                if(isPrevMatch) {
                    startIndex = endIndex - 1;
                    endIndex = startIndex + 1;
                } else {
                    endIndex++;
                }

                isPrevMatch = false;
            }
        }

        return;
    }
}
--------------------------------------------------------------------------------
/korean-analyzer-3.x/src/main/java/com/tistory/devyongsik/analyzer/KoreanBaseNounEngine.java:
--------------------------------------------------------------------------------
package com.tistory.devyongsik.analyzer;

import java.util.HashMap;
import java.util.Map;
import java.util.Stack;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.AttributeSource.State;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory;

/**
 * Noun-extraction engine (3.x line): same substring scan as the 4.x variant,
 * but captured states are pushed onto a Stack instead of collected in a List.
 */
public class KoreanBaseNounEngine implements Engine {

    private Logger logger = LoggerFactory.getLogger(KoreanBaseNounEngine.class);
    // flag is written via the setter but never read within this file
    // -- presumably consumed by a subclass or caller; TODO confirm
    private boolean isUseForIndexing = true;

    // custom noun dictionary loaded from DictionaryFactory; only the keys are consulted
    private Map customNounsDic = new HashMap();


    protected void setIsUseForIndexing(boolean useForIndexing) {
        this.isUseForIndexing = useForIndexing;
    }

    protected boolean isUseForIndexing() {
        return isUseForIndexing;
    }

    public KoreanBaseNounEngine() {
        if(logger.isInfoEnabled()) {
            logger.info("init KoreanBaseNounEngine");
        }

        customNounsDic = DictionaryFactory.getFactory().getCustomNounDictionaryMap();
    }

    /**
     * Scans the current token for dictionary nouns and pushes one captured
     * attribute state per match onto {@code nounsStack}.
     */
    @Override
    public void collectNounState(AttributeSource attributeSource, Stack nounsStack, Map returnedTokens) throws Exception {

        CharTermAttribute termAttr = attributeSource.getAttribute(CharTermAttribute.class);
        TypeAttribute typeAttr = attributeSource.getAttribute(TypeAttribute.class);
        OffsetAttribute offSetAttr = attributeSource.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionAttr = attributeSource.getAttribute(PositionIncrementAttribute.class);

        //Stack nounsStack = new Stack();

        // only tokens tagged "word" are analyzed for nouns
        if(!typeAttr.type().equals("word")) {

            if(logger.isDebugEnabled()) {
                logger.debug("명사 분석 대상이 아닙니다.");
            }

            return;
        }

        String term = termAttr.toString();
        // check whether the whole term itself is a dictionary noun
        if(customNounsDic.containsKey(term)) {
            typeAttr.setType("base_noun");
        }

        returnedTokens.put(term+"_"+offSetAttr.startOffset()+"_"+offSetAttr.endOffset(), "");

        String comparedWord = null;
        // 1. extract every dictionary match as it is found (sliding window scan)
        int startIndex = 0;
        int endIndex = startIndex + 1;

        int orgStartOffset = offSetAttr.startOffset();

        boolean isPrevMatch = false;

        while(true) {

            // window ran off the end: restart one character further right
            if(endIndex > term.length()) {
                startIndex ++;
                endIndex = startIndex + 1;
            }

            if(startIndex >= term.length()) {
                break;
            }

            comparedWord = term.substring(startIndex, endIndex);

            // capture a state whenever a proper substring matches the dictionary
            if(customNounsDic.containsKey(comparedWord) && !term.equals(comparedWord)) {

                // offsets must be recalculated so that highlighting works correctly
95 | int startOffSet = orgStartOffset + startIndex; 96 | int endOffSet = orgStartOffset + endIndex; 97 | 98 | String makeKeyForCheck = comparedWord + "_" + startOffSet + "_" + endOffSet; 99 | 100 | if(returnedTokens.containsKey(makeKeyForCheck)) { 101 | 102 | if(logger.isDebugEnabled()) { 103 | logger.debug("["+makeKeyForCheck+"] 는 이미 추출된 Token입니다. Skip"); 104 | } 105 | 106 | endIndex++; 107 | isPrevMatch = true; 108 | 109 | continue; 110 | 111 | } else { 112 | returnedTokens.put(makeKeyForCheck, ""); 113 | } 114 | 115 | termAttr.setEmpty(); 116 | termAttr.append(comparedWord); 117 | 118 | positionAttr.setPositionIncrement(1); //추출된 명사이기 때문에 위치정보를 1로 셋팅 119 | //타입을 noun으로 설정한다. 120 | typeAttr.setType("base_noun"); 121 | 122 | offSetAttr.setOffset(startOffSet , endOffSet); 123 | 124 | nounsStack.push(attributeSource.captureState()); //추출된 명사에 대한 AttributeSource를 Stack에 저장 125 | endIndex++; 126 | isPrevMatch = true; 127 | 128 | } else { 129 | if(isPrevMatch) { 130 | startIndex = endIndex - 1; 131 | endIndex = startIndex + 1; 132 | } else { 133 | endIndex++; 134 | } 135 | 136 | isPrevMatch = false; 137 | } 138 | } 139 | 140 | return; 141 | } 142 | } 143 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/dic/josa.dic: -------------------------------------------------------------------------------- 1 | //####### 2 | 가 3 | 같이 4 | 같이나 5 | 같이는 6 | 같이는야 7 | 같이는커녕 8 | 같이도 9 | 같이만 10 | 같인 11 | 고 12 | 과 13 | 과는 14 | 과는커녕 15 | 과도 16 | 과를 17 | 과만 18 | 과만은 19 | 과의 20 | 까지 21 | 까지가 22 | 까지나 23 | 까지나마 24 | 까지는 25 | 까지는야 26 | 까지는커녕 27 | 까지도 28 | 까지든지 29 | 까지라고 30 | 까지라고는 31 | 까지라고만은 32 | 까지라도 33 | 까지로 34 | 까지로나 35 | 까지로나마 36 | 까지로는 37 | 까지로는야 38 | 까지로는커녕 39 | 까지로도 40 | 까지로든 41 | 까지로든지 42 | 까지로라서 43 | 까지로라야 44 | 까지로만 45 | 까지로만은 46 | 까지로서 47 | 까지로써 48 | 까지를 49 | 까지만 50 | 까지만은 51 | 까지만이라도 52 | 까지야 53 | 까지야말로 54 | 까지에 55 | 까지와 56 | 까지의 57 | 까지조차 58 | 까지조차도 59 | 까진 60 | 끼리 61 | 께 62 | 께서 63 | 
께옵서 64 | 께옵서는 65 | 께옵서는야 66 | 께옵서는커녕 67 | 께옵서도 68 | 께옵서만 69 | 께옵서만은 70 | 께옵서만이 71 | 께옵선 72 | 나 73 | 나마 74 | 는 75 | 는야 76 | 는커녕 77 | 니 78 | 다 79 | 다가 80 | 다가는 81 | 다가도 82 | 다간 83 | 대로 84 | 대로가 85 | 대로는 86 | 대로의 87 | 더러 88 | 더러는 89 | 더러만은 90 | 도 91 | 든 92 | 든지 93 | 라 94 | 라고 95 | 라고까지 96 | 라고까지는 97 | 라고는 98 | 라고만은 99 | 라곤 100 | 라는 101 | 라도 102 | 라든지 103 | 라서 104 | 라야 105 | 라야만 106 | 라오 107 | 라지 108 | 라지요 109 | 랑 110 | 랑은 111 | 로 112 | 로고 113 | 로구나 114 | 로구려 115 | 로구먼 116 | 로군 117 | 로군요 118 | 로는 119 | 로다 120 | 로되 121 | 로서 122 | 로서의 123 | 로서는 124 | 로세 125 | 를 126 | 마다 127 | 마다라도 128 | 마다를 129 | 마다에게 130 | 마다의 131 | 마따나 132 | 마저 133 | 마저나마라도 134 | 마저도 135 | 마저라도 136 | 마저야 137 | 만 138 | 만도 139 | 만에 140 | 만으로 141 | 만으로는 142 | 만으로도 143 | 만으로라도 144 | 만으로써 145 | 만으론 146 | 만은 147 | 만을 148 | 만의 149 | 만이 150 | 만이라도 151 | 만치 152 | 만큼 153 | 만큼도 154 | 만큼만 155 | 만큼씩 156 | 만큼은 157 | 만큼의 158 | 만큼이나 159 | 만큼이라도 160 | 만큼이야 161 | 말고 162 | 말고는 163 | 말고도 164 | 며 165 | 밖에 166 | 밖에는 167 | 밖에도 168 | 밖엔 169 | 보고 170 | 보고는 171 | 보고도 172 | 보고만 173 | 보고만은 174 | 보고만이라도 175 | 보곤 176 | 보다 177 | 보다는 178 | 보다는야 179 | 보다도 180 | 보다만 181 | 보다야 182 | 보단 183 | 부터 184 | 부터가 185 | 부터나마 186 | 부터는 187 | 부터도 188 | 부터라도 189 | 부터를 190 | 부터만 191 | 부터만은 192 | 부터서는 193 | 부터야말로 194 | 부터의 195 | 부턴 196 | 아 197 | 야 198 | 야말로 199 | 에 200 | 에게 201 | 에게가 202 | 에게까지 203 | 에게까지는 204 | 에게까지는커녕 205 | 에게까지도 206 | 에게까지만 207 | 에게까지만은 208 | 에게나 209 | 에게는 210 | 에게는커녕 211 | 에게다 212 | 에게도 213 | 에게든 214 | 에게든지 215 | 에게라도 216 | 에게로 217 | 에게로는 218 | 에게마다 219 | 에게만 220 | 에게며 221 | 에게보다 222 | 에게보다는 223 | 에게부터 224 | 에게서 225 | 에게서가 226 | 에게서까지 227 | 에게서나 228 | 에게서는 229 | 에게서도 230 | 에게서든지 231 | 에게서라도 232 | 에게서만 233 | 에게서보다 234 | 에게서부터 235 | 에게서야 236 | 에게서와 237 | 에게서의 238 | 에게서처럼 239 | 에게선 240 | 에게야 241 | 에게와 242 | 에게의 243 | 에게처럼 244 | 에게하고 245 | 에게하며 246 | 에겐 247 | 에까지 248 | 에까지는 249 | 에까지도 250 | 에까지든지 251 | 에까지라도 252 | 에까지만 253 | 에까지만은 254 | 에까진 255 | 에나 256 | 에는 257 | 에다 258 | 에다가 259 | 에다가는 260 | 에다간 261 | 에도 262 | 에든 263 | 에든지 264 | 에라도 
265 | 에로 266 | 에로의 267 | 에를 268 | 에만 269 | 에만은 270 | 에부터 271 | 에서 272 | 에서가 273 | 에서까지 274 | 에서까지도 275 | 에서나 276 | 에서나마 277 | 에서는 278 | 에서도 279 | 에서든지 280 | 에서라도 281 | 에서만 282 | 에서만도 283 | 에서만이 284 | 에서만큼 285 | 에서만큼은 286 | 에서보다 287 | 에서부터 288 | 에서부터는 289 | 에서부터도 290 | 에서부터라도 291 | 에서부터만 292 | 에서부터만은 293 | 에서야 294 | 에서와 295 | 에서와는 296 | 에서와의 297 | 에서의 298 | 에서조차 299 | 에서처럼 300 | 에선 301 | 에야 302 | 에의 303 | 에조차도 304 | 에하며 305 | 엔 306 | 엔들 307 | 엘 308 | 엘랑 309 | 여 310 | 와 311 | 와는 312 | 와도 313 | 와라도 314 | 와를 315 | 와만 316 | 와만은 317 | 와에만 318 | 와의 319 | 와처럼 320 | 와한테 321 | 요 322 | 으로 323 | 으로가 324 | 으로까지 325 | 으로까지만은 326 | 으로나 327 | 으로나든지 328 | 으로는 329 | 으로도 330 | 으로든지 331 | 으로라도 332 | 으로랑 333 | 으로만 334 | 으로만은 335 | 으로부터 336 | 으로부터는 337 | 으로부터는커녕 338 | 으로부터도 339 | 으로부터만 340 | 으로부터만은 341 | 으로부터서는 342 | 으로부터서도 343 | 으로부터서만 344 | 으로부터의 345 | 으로서 346 | 으로서가 347 | 으로서나 348 | 으로서는 349 | 으로서도 350 | 으로서든지 351 | 으로서라도 352 | 으로서만 353 | 으로서만도 354 | 으로서만은 355 | 으로서야 356 | 으로서의 357 | 으로선 358 | 으로써 359 | 으로써나 360 | 으로써는 361 | 으로써라도 362 | 으로써만 363 | 으로써야 364 | 으로야 365 | 으로의 366 | 으론 367 | 은 368 | 은커녕 369 | 을 370 | 의 371 | 이 372 | 이고 373 | 이나 374 | 이나마 375 | 이니 376 | 이다 377 | 이든 378 | 이든지 379 | 이라 380 | 이라고 381 | 이라고는 382 | 이라고도 383 | 이라고만은 384 | 이라곤 385 | 이라는 386 | 이라도 387 | 이라든지 388 | 이라서 389 | 이라야 390 | 이라야만 391 | 이랑 392 | 이랑은 393 | 이며 394 | 이며에게 395 | 이며조차도 396 | 이야 397 | 이야말로 398 | 이여 399 | 인들 400 | 인즉 401 | 인즉슨 402 | 일랑 403 | 일랑은 404 | 조차 405 | 조차가 406 | 조차도 407 | 조차를 408 | 조차의 409 | 처럼 410 | 처럼과 411 | 처럼도 412 | 처럼만 413 | 처럼만은 414 | 처럼은 415 | 처럼이라도 416 | 처럼이야 417 | 치고 418 | 치고는 419 | 커녕 420 | 커녕은 421 | 커니와 422 | 토록 423 | 하고 424 | 하고가 425 | 하고는 426 | 하고는커녕 427 | 하고도 428 | 하고라도 429 | 하고마저 430 | 하고만 431 | 하고만은 432 | 하고야 433 | 하고에게 434 | 하고의 435 | 하고조차 436 | 하고조차도 437 | 하곤 -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/dic/josa.dic: 
-------------------------------------------------------------------------------- 1 | //####### 2 | 가 3 | 같이 4 | 같이나 5 | 같이는 6 | 같이는야 7 | 같이는커녕 8 | 같이도 9 | 같이만 10 | 같인 11 | 고 12 | 과 13 | 과는 14 | 과는커녕 15 | 과도 16 | 과를 17 | 과만 18 | 과만은 19 | 과의 20 | 까지 21 | 까지가 22 | 까지나 23 | 까지나마 24 | 까지는 25 | 까지는야 26 | 까지는커녕 27 | 까지도 28 | 까지든지 29 | 까지라고 30 | 까지라고는 31 | 까지라고만은 32 | 까지라도 33 | 까지로 34 | 까지로나 35 | 까지로나마 36 | 까지로는 37 | 까지로는야 38 | 까지로는커녕 39 | 까지로도 40 | 까지로든 41 | 까지로든지 42 | 까지로라서 43 | 까지로라야 44 | 까지로만 45 | 까지로만은 46 | 까지로서 47 | 까지로써 48 | 까지를 49 | 까지만 50 | 까지만은 51 | 까지만이라도 52 | 까지야 53 | 까지야말로 54 | 까지에 55 | 까지와 56 | 까지의 57 | 까지조차 58 | 까지조차도 59 | 까진 60 | 끼리 61 | 께 62 | 께서 63 | 께옵서 64 | 께옵서는 65 | 께옵서는야 66 | 께옵서는커녕 67 | 께옵서도 68 | 께옵서만 69 | 께옵서만은 70 | 께옵서만이 71 | 께옵선 72 | 나 73 | 나마 74 | 는 75 | 는야 76 | 는커녕 77 | 니 78 | 다 79 | 다가 80 | 다가는 81 | 다가도 82 | 다간 83 | 대로 84 | 대로가 85 | 대로는 86 | 대로의 87 | 더러 88 | 더러는 89 | 더러만은 90 | 도 91 | 든 92 | 든지 93 | 라 94 | 라고 95 | 라고까지 96 | 라고까지는 97 | 라고는 98 | 라고만은 99 | 라곤 100 | 라는 101 | 라도 102 | 라든지 103 | 라서 104 | 라야 105 | 라야만 106 | 라오 107 | 라지 108 | 라지요 109 | 랑 110 | 랑은 111 | 로 112 | 로고 113 | 로구나 114 | 로구려 115 | 로구먼 116 | 로군 117 | 로군요 118 | 로는 119 | 로다 120 | 로되 121 | 로서 122 | 로서의 123 | 로서는 124 | 로세 125 | 를 126 | 마다 127 | 마다라도 128 | 마다를 129 | 마다에게 130 | 마다의 131 | 마따나 132 | 마저 133 | 마저나마라도 134 | 마저도 135 | 마저라도 136 | 마저야 137 | 만 138 | 만도 139 | 만에 140 | 만으로 141 | 만으로는 142 | 만으로도 143 | 만으로라도 144 | 만으로써 145 | 만으론 146 | 만은 147 | 만을 148 | 만의 149 | 만이 150 | 만이라도 151 | 만치 152 | 만큼 153 | 만큼도 154 | 만큼만 155 | 만큼씩 156 | 만큼은 157 | 만큼의 158 | 만큼이나 159 | 만큼이라도 160 | 만큼이야 161 | 말고 162 | 말고는 163 | 말고도 164 | 며 165 | 밖에 166 | 밖에는 167 | 밖에도 168 | 밖엔 169 | 보고 170 | 보고는 171 | 보고도 172 | 보고만 173 | 보고만은 174 | 보고만이라도 175 | 보곤 176 | 보다 177 | 보다는 178 | 보다는야 179 | 보다도 180 | 보다만 181 | 보다야 182 | 보단 183 | 부터 184 | 부터가 185 | 부터나마 186 | 부터는 187 | 부터도 188 | 부터라도 189 | 부터를 190 | 부터만 191 | 부터만은 192 | 부터서는 193 | 부터야말로 194 | 부터의 195 | 부턴 196 | 아 197 | 야 198 | 야말로 199 | 에 200 | 에게 201 | 에게가 202 | 에게까지 
203 | 에게까지는 204 | 에게까지는커녕 205 | 에게까지도 206 | 에게까지만 207 | 에게까지만은 208 | 에게나 209 | 에게는 210 | 에게는커녕 211 | 에게다 212 | 에게도 213 | 에게든 214 | 에게든지 215 | 에게라도 216 | 에게로 217 | 에게로는 218 | 에게마다 219 | 에게만 220 | 에게며 221 | 에게보다 222 | 에게보다는 223 | 에게부터 224 | 에게서 225 | 에게서가 226 | 에게서까지 227 | 에게서나 228 | 에게서는 229 | 에게서도 230 | 에게서든지 231 | 에게서라도 232 | 에게서만 233 | 에게서보다 234 | 에게서부터 235 | 에게서야 236 | 에게서와 237 | 에게서의 238 | 에게서처럼 239 | 에게선 240 | 에게야 241 | 에게와 242 | 에게의 243 | 에게처럼 244 | 에게하고 245 | 에게하며 246 | 에겐 247 | 에까지 248 | 에까지는 249 | 에까지도 250 | 에까지든지 251 | 에까지라도 252 | 에까지만 253 | 에까지만은 254 | 에까진 255 | 에나 256 | 에는 257 | 에다 258 | 에다가 259 | 에다가는 260 | 에다간 261 | 에도 262 | 에든 263 | 에든지 264 | 에라도 265 | 에로 266 | 에로의 267 | 에를 268 | 에만 269 | 에만은 270 | 에부터 271 | 에서 272 | 에서가 273 | 에서까지 274 | 에서까지도 275 | 에서나 276 | 에서나마 277 | 에서는 278 | 에서도 279 | 에서든지 280 | 에서라도 281 | 에서만 282 | 에서만도 283 | 에서만이 284 | 에서만큼 285 | 에서만큼은 286 | 에서보다 287 | 에서부터 288 | 에서부터는 289 | 에서부터도 290 | 에서부터라도 291 | 에서부터만 292 | 에서부터만은 293 | 에서야 294 | 에서와 295 | 에서와는 296 | 에서와의 297 | 에서의 298 | 에서조차 299 | 에서처럼 300 | 에선 301 | 에야 302 | 에의 303 | 에조차도 304 | 에하며 305 | 엔 306 | 엔들 307 | 엘 308 | 엘랑 309 | 여 310 | 와 311 | 와는 312 | 와도 313 | 와라도 314 | 와를 315 | 와만 316 | 와만은 317 | 와에만 318 | 와의 319 | 와처럼 320 | 와한테 321 | 요 322 | 으로 323 | 으로가 324 | 으로까지 325 | 으로까지만은 326 | 으로나 327 | 으로나든지 328 | 으로는 329 | 으로도 330 | 으로든지 331 | 으로라도 332 | 으로랑 333 | 으로만 334 | 으로만은 335 | 으로부터 336 | 으로부터는 337 | 으로부터는커녕 338 | 으로부터도 339 | 으로부터만 340 | 으로부터만은 341 | 으로부터서는 342 | 으로부터서도 343 | 으로부터서만 344 | 으로부터의 345 | 으로서 346 | 으로서가 347 | 으로서나 348 | 으로서는 349 | 으로서도 350 | 으로서든지 351 | 으로서라도 352 | 으로서만 353 | 으로서만도 354 | 으로서만은 355 | 으로서야 356 | 으로서의 357 | 으로선 358 | 으로써 359 | 으로써나 360 | 으로써는 361 | 으로써라도 362 | 으로써만 363 | 으로써야 364 | 으로야 365 | 으로의 366 | 으론 367 | 은 368 | 은커녕 369 | 을 370 | 의 371 | 이 372 | 이고 373 | 이나 374 | 이나마 375 | 이니 376 | 이다 377 | 이든 378 | 이든지 379 | 이라 380 | 이라고 381 | 이라고는 382 | 이라고도 383 | 이라고만은 384 | 이라곤 385 | 이라는 386 | 이라도 387 | 이라든지 388 | 이라서 389 | 이라야 390 | 이라야만 391 | 이랑 392 | 이랑은 
393 | 이며 394 | 이며에게 395 | 이며조차도 396 | 이야 397 | 이야말로 398 | 이여 399 | 인들 400 | 인즉 401 | 인즉슨 402 | 일랑 403 | 일랑은 404 | 조차 405 | 조차가 406 | 조차도 407 | 조차를 408 | 조차의 409 | 처럼 410 | 처럼과 411 | 처럼도 412 | 처럼만 413 | 처럼만은 414 | 처럼은 415 | 처럼이라도 416 | 처럼이야 417 | 치고 418 | 치고는 419 | 커녕 420 | 커녕은 421 | 커니와 422 | 토록 423 | 하고 424 | 하고가 425 | 하고는 426 | 하고는커녕 427 | 하고도 428 | 하고라도 429 | 하고마저 430 | 하고만 431 | 하고만은 432 | 하고야 433 | 하고에게 434 | 하고의 435 | 하고조차 436 | 하고조차도 437 | 하곤 -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/com/tistory/devyongsik/analyzer/KoreanLongestNounEngine.java: -------------------------------------------------------------------------------- 1 | package com.tistory.devyongsik.analyzer; 2 | 3 | import java.util.HashMap; 4 | import java.util.Map; 5 | import java.util.Stack; 6 | 7 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 8 | import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; 9 | import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; 10 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 11 | import org.apache.lucene.util.AttributeSource; 12 | import org.apache.lucene.util.AttributeSource.State; 13 | import org.slf4j.Logger; 14 | import org.slf4j.LoggerFactory; 15 | 16 | import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory; 17 | 18 | public class KoreanLongestNounEngine implements Engine { 19 | 20 | private Logger logger = LoggerFactory.getLogger(KoreanLongestNounEngine.class); 21 | 22 | private static Map customNounsDic = new HashMap(); 23 | 24 | 25 | public KoreanLongestNounEngine() { 26 | if(logger.isInfoEnabled()) { 27 | logger.info("init KoreanLongestNounEngine"); 28 | } 29 | 30 | customNounsDic = DictionaryFactory.getFactory().getCustomNounDictionaryMap(); 31 | } 32 | 33 | @Override 34 | public void collectNounState(AttributeSource attributeSource, Stack nounsStack, Map returnedTokens) throws Exception { 
        CharTermAttribute termAttr = attributeSource.getAttribute(CharTermAttribute.class);
        TypeAttribute typeAttr = attributeSource.getAttribute(TypeAttribute.class);
        OffsetAttribute offSetAttr = attributeSource.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionAttr = attributeSource.getAttribute(PositionIncrementAttribute.class);

        // only tokens tagged "word" are analyzed for nouns
        if(!typeAttr.type().equals("word")) {

            if(logger.isDebugEnabled()) {
                logger.debug("명사 분석 대상이 아닙니다.");
            }

            return;
        }

        String term = termAttr.toString();
        // check whether the whole term itself is a dictionary noun
        if(customNounsDic.containsKey(term)) {
            typeAttr.setType("long_noun");
        }

        returnedTokens.put(term+"_"+offSetAttr.startOffset()+"_"+offSetAttr.endOffset(), "");

        String comparedWord = null;

        // 1. extract the longest word that matches the dictionary
        int startIndex = 0;
        int endIndex = startIndex + 1;

        int orgStartOffSet = offSetAttr.startOffset();

        // span of the longest match found so far for the current start position
        int prevMatchedStartIndex = 0;
        int prevMatchedEndIndex = 0;

        String matchedTerm = "";

        while(true) {

            if(endIndex > term.length()) {

                if(matchedTerm.length() > 0 && !term.equals(matchedTerm)) { // endIndex reached the end and a matched keyword exists

                    // offsets must be recalculated so that highlighting works correctly
                    int startOffSet = orgStartOffSet + prevMatchedStartIndex;
                    int endOffSet = orgStartOffSet + prevMatchedEndIndex;

                    String makeKeyForCheck = matchedTerm + "_" + startOffSet + "_" + endOffSet;

                    if(returnedTokens.containsKey(makeKeyForCheck)) {

                        if(logger.isDebugEnabled()) {
                            logger.debug("["+makeKeyForCheck+"] 는 이미 추출된 Token입니다. Skip");
                        }

                        matchedTerm = "";

                        // resume scanning right after the already-emitted match
                        startIndex = prevMatchedEndIndex;
                        endIndex = startIndex + 1;

                        continue;

                    } else {
                        returnedTokens.put(makeKeyForCheck, "");
                    }

                    termAttr.setEmpty();
                    termAttr.append(matchedTerm);

                    positionAttr.setPositionIncrement(1); // extracted noun: set position increment to 1
                    // set the token type to a noun
                    typeAttr.setType("long_noun");

                    offSetAttr.setOffset(startOffSet , endOffSet);

                    nounsStack.push(attributeSource.captureState()); // push the captured state for the extracted noun

                    matchedTerm = "";

                    // resume scanning right after the emitted match
                    startIndex = prevMatchedEndIndex;
                    endIndex = startIndex + 1;
                } else {

                    // nothing matched from this start position: advance the window
                    if(startIndex == prevMatchedEndIndex) {
                        startIndex++;
                        endIndex = startIndex + 1;
                    } else {
                        startIndex = endIndex;
                        endIndex = startIndex + 1;
                    }
                }


            }

            if(startIndex >= term.length()) {
                break;
            }

            comparedWord = term.substring(startIndex, endIndex);

            // on a match, remember it as the longest-so-far candidate
            if(customNounsDic.containsKey(comparedWord)) {
                matchedTerm = comparedWord;
                prevMatchedStartIndex = startIndex;
                prevMatchedEndIndex = endIndex;
            }

            endIndex++;

        }//end while

        return;
    }

}
--------------------------------------------------------------------------------
/korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/morph/WSOutput.java:
--------------------------------------------------------------------------------
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.lucene.analysis.kr.morph; 18 | 19 | import java.util.ArrayList; 20 | import java.util.Collections; 21 | import java.util.List; 22 | 23 | public class WSOutput implements Cloneable { 24 | 25 | private int lastStart = 0; 26 | 27 | private int lastEnd = 0; 28 | 29 | private List phrases = new ArrayList(); 30 | 31 | public WSOutput() { 32 | 33 | } 34 | 35 | public WSOutput(AnalysisOutput o) { 36 | addPhrase(o); 37 | } 38 | 39 | public int getLastStart() { 40 | return lastStart; 41 | } 42 | 43 | public void setLastStart(int start) { 44 | this.lastStart = start; 45 | } 46 | 47 | public int getLastEnd() { 48 | return lastEnd; 49 | } 50 | 51 | public void setLastEnd(int end) { 52 | this.lastStart = end; 53 | } 54 | 55 | 56 | public List getPhrases() { 57 | return phrases; 58 | } 59 | 60 | public void removeLast() { 61 | 62 | if(this.phrases.size()==0) return; 63 | 64 | AnalysisOutput o = this.phrases.remove(this.phrases.size()-1); 65 | 66 | if(this.phrases.size()==0) { 67 | 68 | this.lastStart = 0; 69 | this.lastEnd = 0; 70 | 71 | } else { 72 | 73 | this.lastEnd -= o.getSource().length(); 74 | 75 | if(this.phrases.size()>1) { 76 | AnalysisOutput o1 = this.phrases.get(this.phrases.size()-1); 77 | this.lastStart = lastEnd-o1.getSource().length(); 78 | } else { 79 | this.lastStart = 0; 80 | } 81 | 82 | } 83 | 84 | } 85 | 86 | public void 
addPhrase(AnalysisOutput o) { 87 | 88 | this.lastStart = this.lastEnd; 89 | this.lastEnd += o.getSource().length(); 90 | 91 | if(o.getCNounList().size()==0) 92 | this.phrases.add(o); 93 | else 94 | addCompounds(o); 95 | 96 | } 97 | 98 | private void addCompounds(AnalysisOutput o) { 99 | 100 | List cnouns = o.getCNounList(); 101 | 102 | String source = o.getSource(); 103 | int rmstemlen = 0; 104 | 105 | // for(int i=0;i=cnouns.size()-2) break; 122 | 123 | int score = AnalysisOutput.SCORE_CORRECT; 124 | if(!cnouns.get(i).isExist()) score=AnalysisOutput.SCORE_CANDIDATE; 125 | 126 | AnalysisOutput o1 = new AnalysisOutput(noun, null, null, 127 | PatternConstants.POS_NOUN, PatternConstants.PTN_N, score); 128 | 129 | o1.setSource(noun); 130 | 131 | if(isOnechar) { 132 | o1.addCNoun(cnouns.get(i)); 133 | o1.addCNoun(cnouns.get(i+1)); 134 | } 135 | 136 | if(source.length()>noun.length()) 137 | source = source.substring(noun.length()); 138 | 139 | this.phrases.add(o1); 140 | cnouns.remove(cnouns.get(0)); 141 | i--; 142 | 143 | if(isOnechar) { 144 | cnouns.remove(cnouns.get(0)); 145 | } 146 | 147 | } 148 | 149 | o.setStem(o.getStem().substring(o.getSource().length()-source.length())); 150 | o.setSource(source); 151 | if(cnouns.size()==1) cnouns.remove(0); 152 | 153 | this.phrases.add(o); 154 | 155 | } 156 | 157 | public void setPhrases(List phrases) { 158 | this.phrases = phrases; 159 | } 160 | 161 | public WSOutput clone() throws CloneNotSupportedException { 162 | 163 | WSOutput candidate = (WSOutput)super.clone(); 164 | 165 | candidate.setLastStart(lastStart); 166 | 167 | candidate.setLastEnd(lastEnd); 168 | 169 | List list = new ArrayList(); 170 | list.addAll(phrases); 171 | candidate.setPhrases(list); 172 | 173 | return candidate; 174 | } 175 | } 176 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/com/tistory/devyongsik/analyzer/KoreanLongestNounEngine.java: 
--------------------------------------------------------------------------------
package com.tistory.devyongsik.analyzer;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.AttributeSource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.tistory.devyongsik.analyzer.dictionary.DictionaryFactory;

/**
 * Longest-match noun-extraction engine (4.x line): scans a "word" token and
 * emits, per start position, the longest substring found in the custom noun
 * dictionary, capturing one attribute state per match into the supplied list.
 */
public class KoreanLongestNounEngine implements Engine {

    private Logger logger = LoggerFactory.getLogger(KoreanLongestNounEngine.class);

    // custom noun dictionary shared by all instances; only the keys are consulted
    private static Map customNounsDic = new HashMap();


    public KoreanLongestNounEngine() {
        if(logger.isInfoEnabled()) {
            logger.info("init KoreanLongestNounEngine");
        }

        customNounsDic = DictionaryFactory.getFactory().getCustomNounDictionaryMap();
    }

    /**
     * Scans the current token for the longest dictionary nouns.
     *
     * @param attributeSource     the token's attributes; mutated in place for each match
     * @param comparableStateList receives one captured state per extracted noun
     * @param returnedTokens      de-duplication set keyed by "term_start_end"
     */
    @Override
    public void collectNounState(AttributeSource attributeSource, List comparableStateList, Map returnedTokens) throws Exception {


        CharTermAttribute termAttr = attributeSource.getAttribute(CharTermAttribute.class);
        TypeAttribute typeAttr = attributeSource.getAttribute(TypeAttribute.class);
        OffsetAttribute offSetAttr = attributeSource.getAttribute(OffsetAttribute.class);
        PositionIncrementAttribute positionAttr = attributeSource.getAttribute(PositionIncrementAttribute.class);

        // only tokens tagged "word" are analyzed for nouns
        if(!typeAttr.type().equals("word")) {

            if(logger.isDebugEnabled()) {
                logger.debug("명사 분석 대상이 아닙니다.");
            }

            return;
        }

        String term = termAttr.toString();
        // check whether the whole term itself is a dictionary noun
        if(customNounsDic.containsKey(term)) {
            typeAttr.setType("long_noun");
        }

        returnedTokens.put(term+"_"+offSetAttr.startOffset()+"_"+offSetAttr.endOffset(), "");

        String comparedWord = null;

        // 1. extract the longest word that matches the dictionary
        int startIndex = 0;
        int endIndex = startIndex + 1;

        int orgStartOffSet = offSetAttr.startOffset();

        // span of the longest match found so far for the current start position
        int prevMatchedStartIndex = 0;
        int prevMatchedEndIndex = 0;

        String matchedTerm = "";

        while(true) {

            if(endIndex > term.length()) {

                if(matchedTerm.length() > 0 && !term.equals(matchedTerm)) { // endIndex reached the end and a matched keyword exists

                    // offsets must be recalculated so that highlighting works correctly
                    int startOffSet = orgStartOffSet + prevMatchedStartIndex;
                    int endOffSet = orgStartOffSet + prevMatchedEndIndex;

                    String makeKeyForCheck = matchedTerm + "_" + startOffSet + "_" + endOffSet;

                    if(returnedTokens.containsKey(makeKeyForCheck)) {

                        if(logger.isDebugEnabled()) {
                            logger.debug("["+makeKeyForCheck+"] 는 이미 추출된 Token입니다. Skip");
                        }

                        matchedTerm = "";

                        // resume scanning right after the already-emitted match
                        startIndex = prevMatchedEndIndex;
                        endIndex = startIndex + 1;

                        continue;

                    } else {
                        returnedTokens.put(makeKeyForCheck, "");
                    }

                    termAttr.setEmpty();
                    termAttr.append(matchedTerm);

                    positionAttr.setPositionIncrement(1); // extracted noun: set position increment to 1
                    // set the token type to a noun
                    typeAttr.setType("long_noun");

                    offSetAttr.setOffset(startOffSet , endOffSet);

                    // capture the modified attribute state together with its start offset
                    ComparableState comparableState = new ComparableState();
                    comparableState.setState(attributeSource.captureState());
                    comparableState.setStartOffset(offSetAttr.startOffset());

                    comparableStateList.add(comparableState);

                    matchedTerm = "";

                    // resume scanning right after the emitted match
                    startIndex = prevMatchedEndIndex;
                    endIndex = startIndex + 1;
                } else {

                    // nothing matched from this start position: advance the window
                    if(startIndex == prevMatchedEndIndex) {
                        startIndex++;
                        endIndex = startIndex + 1;
                    } else {
                        startIndex = endIndex;
                        endIndex = startIndex + 1;
                    }
                }


            }

            if(startIndex >= term.length()) {
                break;
            }

            comparedWord = term.substring(startIndex, endIndex);

            // on a match, remember it as the longest-so-far candidate
            if(customNounsDic.containsKey(comparedWord)) {
                matchedTerm = comparedWord;
                prevMatchedStartIndex = startIndex;
                prevMatchedEndIndex = endIndex;
            }

            endIndex++;

        }//end while

        return;
    }

}
--------------------------------------------------------------------------------
/korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/utils/ConstraintUtil.java:
--------------------------------------------------------------------------------
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.
You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package org.apache.lucene.analysis.kr.utils; 18 | 19 | import java.util.HashMap; 20 | 21 | import java.util.Map; 22 | 23 | import org.apache.lucene.analysis.kr.morph.PatternConstants; 24 | 25 | /** 26 | * 결합이 가능한 조건을 처리하는 클래스 27 | * @author smlee 28 | * 29 | */ 30 | public class ConstraintUtil { 31 | 32 | private static Map hahes = new HashMap(); // "글로벌화해 ", "민족화해" 처럼 화해와 결합이 가능한 명사 33 | static { 34 | hahes.put("민족", "Y");hahes.put("동서", "Y");hahes.put("남북", "Y"); 35 | } 36 | 37 | private static Map eomiPnouns = new HashMap(); 38 | static { 39 | eomiPnouns.put("ㄴ", "Y");eomiPnouns.put("ㄹ", "Y");eomiPnouns.put("ㅁ", "Y"); 40 | } 41 | 42 | private static Map PTN_MLIST= new HashMap(); 43 | static { 44 | PTN_MLIST.put(PatternConstants.PTN_NSM, PatternConstants.PTN_NSM); 45 | PTN_MLIST.put(PatternConstants.PTN_NSMXM, PatternConstants.PTN_NSMXM); 46 | PTN_MLIST.put(PatternConstants.PTN_NJCM, PatternConstants.PTN_NJCM); 47 | PTN_MLIST.put(PatternConstants.PTN_VM, PatternConstants.PTN_VM); 48 | PTN_MLIST.put(PatternConstants.PTN_VMCM, PatternConstants.PTN_VMCM); 49 | PTN_MLIST.put(PatternConstants.PTN_VMXM, PatternConstants.PTN_VMXM); 50 | PTN_MLIST.put(PatternConstants.PTN_NVM, PatternConstants.PTN_NVM); 51 | } 52 | 53 | private static Map PTN_JLIST= new HashMap(); 54 | static { 55 | PTN_JLIST.put(PatternConstants.PTN_NJ, PatternConstants.PTN_NJ); 56 | PTN_JLIST.put(PatternConstants.PTN_NSMJ, PatternConstants.PTN_NSMJ); 57 | PTN_JLIST.put(PatternConstants.PTN_VMJ, PatternConstants.PTN_VMJ); 58 | 
PTN_JLIST.put(PatternConstants.PTN_VMXMJ, PatternConstants.PTN_VMXMJ); 59 | } 60 | 61 | private static Map WORD_GUKS= new HashMap(); 62 | static { 63 | WORD_GUKS.put("날것", "Y"); 64 | WORD_GUKS.put("들것", "Y"); 65 | WORD_GUKS.put("별것", "Y"); 66 | WORD_GUKS.put("찰것", "Y"); 67 | WORD_GUKS.put("탈것", "Y"); 68 | WORD_GUKS.put("하잘것", "Y"); 69 | } 70 | 71 | // 종성이 있는 음절과 연결될 수 없는 조사 72 | private static Map JOSA_TWO= new HashMap(); 73 | static { 74 | JOSA_TWO.put("가", "Y"); 75 | JOSA_TWO.put("는", "Y"); 76 | JOSA_TWO.put("다", "Y"); 77 | JOSA_TWO.put("나", "Y"); 78 | JOSA_TWO.put("니", "Y"); 79 | JOSA_TWO.put("고", "Y"); 80 | JOSA_TWO.put("라", "Y"); 81 | JOSA_TWO.put("와", "Y"); 82 | JOSA_TWO.put("랑", "Y"); 83 | JOSA_TWO.put("를", "Y"); 84 | JOSA_TWO.put("며", "Y"); 85 | JOSA_TWO.put("든", "Y"); 86 | JOSA_TWO.put("야", "Y"); 87 | JOSA_TWO.put("여", "Y"); 88 | } 89 | 90 | // 종성이 없는 음절과 연결될 수 없는 조사 91 | private static Map JOSA_THREE= new HashMap(); 92 | static { 93 | JOSA_THREE.put("과", "Y"); 94 | JOSA_THREE.put("은", "Y"); 95 | JOSA_THREE.put("아", "Y"); 96 | JOSA_THREE.put("으", "Y"); 97 | JOSA_THREE.put("은", "Y"); 98 | JOSA_THREE.put("을", "Y"); 99 | } 100 | 101 | public static boolean canHaheCompound(String key) { 102 | if(hahes.get(key)!=null) return true; 103 | return false; 104 | } 105 | 106 | /** 107 | * 어미가 ㄴ,ㄹ,ㅁ 으로 끝나는지 조사한다. 
108 | * @param eomi 109 | * @return 110 | */ 111 | public static boolean isNLM(String eomi) { 112 | 113 | if(eomi==null || "".equals(eomi)) return false; 114 | 115 | if(eomiPnouns.get(eomi)!=null) return true; 116 | 117 | char[] chrs = MorphUtil.decompose(eomi.charAt(eomi.length()-1)); 118 | if(chrs.length==3 && eomiPnouns.get(Character.toString(chrs[2]))!=null) return true; 119 | 120 | return true; 121 | 122 | } 123 | 124 | public static boolean isEomiPhrase(int ptn) { 125 | 126 | if(PTN_MLIST.get(ptn)!=null) return true; 127 | 128 | return false; 129 | 130 | } 131 | 132 | public static boolean isJosaNounPhrase(int ptn) { 133 | 134 | if(PTN_JLIST.get(ptn)!=null) return true; 135 | 136 | return false; 137 | 138 | } 139 | 140 | public static boolean isJosaAdvPhrase(int ptn) { 141 | 142 | if(PatternConstants.PTN_ADVJ==ptn) return true; 143 | 144 | return false; 145 | 146 | } 147 | 148 | public static boolean isAdvPhrase(int ptn) { 149 | 150 | if(PatternConstants.PTN_ADVJ==ptn || PatternConstants.PTN_AID==ptn) return true; 151 | 152 | return false; 153 | 154 | } 155 | 156 | public static boolean isTwoJosa(String josa) { 157 | 158 | return (JOSA_TWO.get(josa)!=null); 159 | 160 | } 161 | public static boolean isThreeJosa(String josa) { 162 | 163 | return (JOSA_THREE.get(josa)!=null); 164 | 165 | } 166 | } 167 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/utils/KoreanEnv.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.utils; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 
7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import java.io.ByteArrayInputStream; 21 | import java.io.File; 22 | import java.io.FileInputStream; 23 | import java.util.Properties; 24 | 25 | import org.apache.lucene.analysis.kr.morph.MorphException; 26 | 27 | public class KoreanEnv { 28 | 29 | public static final String FILE_SYLLABLE_FEATURE = "syllable.dic"; 30 | 31 | public static final String FILE_DICTIONARY = "dictionary.dic"; 32 | 33 | public static final String FILE_JOSA = "josa.dic"; 34 | 35 | public static final String FILE_EOMI = "eomi.dic"; 36 | 37 | public static final String FILE_EXTENSION = "extension.dic"; 38 | 39 | public static final String FILE_PREFIX = "prefix.dic"; 40 | 41 | public static final String FILE_SUFFIX = "suffix.dic"; 42 | 43 | public static final String FILE_COMPOUNDS = "compounds.dic"; 44 | 45 | public static final String FILE_UNCOMPOUNDS = "uncompounds.dic"; 46 | 47 | public static final String FILE_CJ = "cj.dic"; 48 | 49 | public static final String FILE_KOREAN_PROPERTY = "org/apache/lucene/analysis/kr/korean.properties"; 50 | 51 | private Properties defaults = null; 52 | 53 | /** 54 | * The props member gets its values from the configuration in the property file. 55 | */ 56 | private Properties props = null; 57 | 58 | private static KoreanEnv instance = null; 59 | 60 | /** 61 | * The constructor loads property values from the property file. 
62 | */ 63 | private KoreanEnv() throws MorphException { 64 | try { 65 | initDefaultProperties(); 66 | props = loadProperties(defaults); 67 | } catch (MorphException e) { 68 | throw new MorphException ("Failure while initializing property values:\n"+e.getMessage()); 69 | } 70 | } 71 | 72 | public static KoreanEnv getInstance() throws MorphException { 73 | if(instance==null) 74 | instance = new KoreanEnv(); 75 | 76 | return instance; 77 | } 78 | 79 | /** 80 | * Initialize the default property values. 81 | */ 82 | private void initDefaultProperties() { 83 | defaults = new Properties(); 84 | 85 | defaults.setProperty(FILE_SYLLABLE_FEATURE,"org/apache/lucene/analysis/kr/dic/syllable.dic"); 86 | defaults.setProperty(FILE_DICTIONARY,"org/apache/lucene/analysis/kr/dic/dictionary.dic"); 87 | defaults.setProperty(FILE_DICTIONARY,"org/apache/lucene/analysis/kr/dic/extension.dic"); 88 | defaults.setProperty(FILE_JOSA,"org/apache/lucene/analysis/kr/dic/josa.dic"); 89 | defaults.setProperty(FILE_EOMI,"org/apache/lucene/analysis/kr/dic/eomi.dic"); 90 | defaults.setProperty(FILE_PREFIX,"org/apache/lucene/analysis/kr/dic/prefix.dic"); 91 | defaults.setProperty(FILE_SUFFIX,"org/apache/lucene/analysis/kr/dic/suffix.dic"); 92 | defaults.setProperty(FILE_COMPOUNDS,"org/apache/lucene/analysis/kr/dic/compounds.dic"); 93 | defaults.setProperty(FILE_UNCOMPOUNDS,"org/apache/lucene/analysis/kr/dic/uncompounds.dic"); 94 | defaults.setProperty(FILE_CJ,"org/apache/lucene/analysis/kr/dic/cj.dic"); 95 | } 96 | 97 | 98 | /** 99 | * Given a property file name, load the property file and return an object 100 | * representing the property values. 101 | * 102 | * @param propertyFile The name of the property file to load. 103 | * @param def Default property values, or null if there are no defaults. 104 | * @return The loaded SortedProperties object. 
105 | */ 106 | private Properties loadProperties(Properties def) throws MorphException { 107 | Properties properties = new Properties(); 108 | 109 | if (def != null) { 110 | properties = new Properties(def); 111 | } 112 | 113 | File file = null; 114 | try { 115 | file = FileUtil.getClassLoaderFile(FILE_KOREAN_PROPERTY); 116 | if (file != null) { 117 | properties.load(new FileInputStream(file)); 118 | return properties; 119 | } 120 | 121 | byte[] in = FileUtil.readByteFromCurrentJar(FILE_KOREAN_PROPERTY); 122 | properties.load(new ByteArrayInputStream(in)); 123 | } catch (Exception e) { 124 | throw new MorphException("Failure while trying to load properties file " + file.getPath(), e); 125 | } 126 | return properties; 127 | } 128 | 129 | 130 | /** 131 | * Returns the value of a property. 132 | * 133 | * @param name The name of the property whose value is to be retrieved. 134 | * @return The value of the property. 135 | */ 136 | public String getValue(String name) { 137 | return props.getProperty(name); 138 | } 139 | } 140 | -------------------------------------------------------------------------------- /korean-analyzer-4.x/src/main/java/org/apache/lucene/analysis/kr/utils/KoreanEnv.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.utils; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | 6 | * contributor license agreements. See the NOTICE file distributed with 7 | * this work for additional information regarding copyright ownership. 8 | * The ASF licenses this file to You under the Apache License, Version 2.0 9 | * (the "License"); you may not use this file except in compliance with 10 | * the License. 
You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | 21 | import java.io.ByteArrayInputStream; 22 | import java.io.File; 23 | import java.io.FileInputStream; 24 | import java.util.Properties; 25 | 26 | import org.apache.lucene.analysis.kr.morph.MorphException; 27 | 28 | public class KoreanEnv { 29 | 30 | public static final String FILE_SYLLABLE_FEATURE = "syllable.dic"; 31 | 32 | public static final String FILE_DICTIONARY = "dictionary.dic"; 33 | 34 | public static final String FILE_JOSA = "josa.dic"; 35 | 36 | public static final String FILE_EOMI = "eomi.dic"; 37 | 38 | public static final String FILE_EXTENSION = "extension.dic"; 39 | 40 | public static final String FILE_PREFIX = "prefix.dic"; 41 | 42 | public static final String FILE_SUFFIX = "suffix.dic"; 43 | 44 | public static final String FILE_COMPOUNDS = "compounds.dic"; 45 | 46 | public static final String FILE_UNCOMPOUNDS = "uncompounds.dic"; 47 | 48 | public static final String FILE_CJ = "cj.dic"; 49 | 50 | public static final String FILE_KOREAN_PROPERTY = "org/apache/lucene/analysis/kr/korean.properties"; 51 | 52 | private Properties defaults = null; 53 | 54 | /** 55 | * The props member gets its values from the configuration in the property file. 56 | */ 57 | private Properties props = null; 58 | 59 | private static KoreanEnv instance = null; 60 | 61 | /** 62 | * The constructor loads property values from the property file. 
63 | */ 64 | private KoreanEnv() throws MorphException { 65 | try { 66 | initDefaultProperties(); 67 | props = loadProperties(defaults); 68 | } catch (MorphException e) { 69 | throw new MorphException ("Failure while initializing property values:\n"+e.getMessage()); 70 | } 71 | } 72 | 73 | public static KoreanEnv getInstance() throws MorphException { 74 | if(instance==null) 75 | instance = new KoreanEnv(); 76 | 77 | return instance; 78 | } 79 | 80 | /** 81 | * Initialize the default property values. 82 | */ 83 | private void initDefaultProperties() { 84 | defaults = new Properties(); 85 | 86 | defaults.setProperty(FILE_SYLLABLE_FEATURE,"org/apache/lucene/analysis/kr/dic/syllable.dic"); 87 | defaults.setProperty(FILE_DICTIONARY,"org/apache/lucene/analysis/kr/dic/dictionary.dic"); 88 | defaults.setProperty(FILE_DICTIONARY,"org/apache/lucene/analysis/kr/dic/extension.dic"); 89 | defaults.setProperty(FILE_JOSA,"org/apache/lucene/analysis/kr/dic/josa.dic"); 90 | defaults.setProperty(FILE_EOMI,"org/apache/lucene/analysis/kr/dic/eomi.dic"); 91 | defaults.setProperty(FILE_PREFIX,"org/apache/lucene/analysis/kr/dic/prefix.dic"); 92 | defaults.setProperty(FILE_SUFFIX,"org/apache/lucene/analysis/kr/dic/suffix.dic"); 93 | defaults.setProperty(FILE_COMPOUNDS,"org/apache/lucene/analysis/kr/dic/compounds.dic"); 94 | defaults.setProperty(FILE_UNCOMPOUNDS,"org/apache/lucene/analysis/kr/dic/uncompounds.dic"); 95 | defaults.setProperty(FILE_CJ,"org/apache/lucene/analysis/kr/dic/cj.dic"); 96 | } 97 | 98 | 99 | /** 100 | * Given a property file name, load the property file and return an object 101 | * representing the property values. 102 | * 103 | * @param propertyFile The name of the property file to load. 104 | * @param def Default property values, or null if there are no defaults. 105 | * @return The loaded SortedProperties object. 
106 | */ 107 | private Properties loadProperties(Properties def) throws MorphException { 108 | Properties properties = new Properties(); 109 | 110 | if (def != null) { 111 | properties = new Properties(def); 112 | } 113 | 114 | File file = null; 115 | try { 116 | file = FileUtil.getClassLoaderFile(FILE_KOREAN_PROPERTY); 117 | if (file != null) { 118 | properties.load(new FileInputStream(file)); 119 | return properties; 120 | } 121 | 122 | byte[] in = FileUtil.readByteFromCurrentJar(FILE_KOREAN_PROPERTY); 123 | properties.load(new ByteArrayInputStream(in)); 124 | } catch (Exception e) { 125 | throw new MorphException("Failure while trying to load properties file " + file.getPath(), e); 126 | } 127 | return properties; 128 | } 129 | 130 | 131 | /** 132 | * Returns the value of a property. 133 | * 134 | * @param name The name of the property whose value is to be retrieved. 135 | * @return The value of the property. 136 | */ 137 | public String getValue(String name) { 138 | return props.getProperty(name); 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /korean-analyzer-3.x/src/main/java/org/apache/lucene/analysis/kr/utils/JarResources.java: -------------------------------------------------------------------------------- 1 | package org.apache.lucene.analysis.kr.utils; 2 | 3 | /** 4 | * Licensed to the Apache Software Foundation (ASF) under one or more 5 | * contributor license agreements. See the NOTICE file distributed with 6 | * this work for additional information regarding copyright ownership. 7 | * The ASF licenses this file to You under the Apache License, Version 2.0 8 | * (the "License"); you may not use this file except in compliance with 9 | * the License. 
You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 18 | */ 19 | 20 | import java.io.*; 21 | import java.util.*; 22 | import java.util.zip.*; 23 | 24 | /** 25 | * JarResources: JarResources maps all resources included in a 26 | * Zip or Jar file. Additionaly, it provides a method to extract one 27 | * as a blob. 28 | */ 29 | public final class JarResources { 30 | 31 | // external debug flag 32 | public boolean debugOn=false; 33 | 34 | // jar resource mapping tables 35 | private Hashtable htSizes=new Hashtable(); 36 | 37 | // a jar file 38 | private String jarFileName; 39 | 40 | /** 41 | * creates a JarResources. It extracts all resources from a Jar 42 | * into an internal hashtable, keyed by resource names. 43 | * @param jarFileName a jar or zip file 44 | */ 45 | public JarResources(String jarFileName) { 46 | this.jarFileName=jarFileName; 47 | } 48 | 49 | /** 50 | * Extracts a jar resource as a blob. 51 | * @param name a resource name. 52 | */ 53 | public byte[] getResource(String name) { 54 | return read(name); 55 | } 56 | 57 | /** 58 | * initializes internal hash tables with Jar file resources. 59 | */ 60 | private byte[] read(String name) { 61 | try { 62 | // extracts just sizes only. 63 | ZipFile zf=new ZipFile(jarFileName); 64 | Enumeration e=zf.entries(); 65 | while (e.hasMoreElements()) { 66 | ZipEntry ze=(ZipEntry)e.nextElement(); 67 | if (debugOn) { 68 | System.out.println(dumpZipEntry(ze)); 69 | } 70 | htSizes.put(ze.getName(),new Integer((int)ze.getSize())); 71 | } 72 | zf.close(); 73 | 74 | // extract resources and put them into the hashtable. 
75 | FileInputStream fis=new FileInputStream(jarFileName); 76 | BufferedInputStream bis=new BufferedInputStream(fis); 77 | ZipInputStream zis=new ZipInputStream(bis); 78 | ZipEntry ze=null; 79 | while ((ze=zis.getNextEntry())!=null) { 80 | if (ze.isDirectory()) { 81 | continue; 82 | } 83 | if (debugOn) { 84 | System.out.println( 85 | "ze.getName()="+ze.getName()+","+"getSize()="+ze.getSize() 86 | ); 87 | } 88 | int size=(int)ze.getSize(); 89 | // -1 means unknown size. 90 | if (size==-1) { 91 | size=((Integer)htSizes.get(ze.getName())).intValue(); 92 | } 93 | byte[] b=new byte[(int)size]; 94 | int rb=0; 95 | int chunk=0; 96 | while (((int)size - rb) > 0) { 97 | chunk=zis.read(b,rb,(int)size - rb); 98 | if (chunk==-1) { 99 | break; 100 | } 101 | rb+=chunk; 102 | } 103 | 104 | if (debugOn) { 105 | System.out.println( 106 | ze.getName()+" rb="+rb+ 107 | ",size="+size+ 108 | ",csize="+ze.getCompressedSize() 109 | ); 110 | } 111 | 112 | if(ze.getName().equals(name)) { 113 | return b; 114 | } 115 | } 116 | } catch (NullPointerException e) { 117 | System.out.println("done."); 118 | } catch (FileNotFoundException e) { 119 | e.printStackTrace(); 120 | } catch (IOException e) { 121 | e.printStackTrace(); 122 | } 123 | 124 | return null; 125 | } 126 | 127 | /** 128 | * Dumps a zip entry into a string. 129 | * @param ze a ZipEntry 130 | */ 131 | private String dumpZipEntry(ZipEntry ze) { 132 | StringBuffer sb=new StringBuffer(); 133 | if (ze.isDirectory()) { 134 | sb.append("d "); 135 | } else { 136 | sb.append("f "); 137 | } 138 | if (ze.getMethod()==ZipEntry.STORED) { 139 | sb.append("stored "); 140 | } else { 141 | sb.append("defalted "); 142 | } 143 | sb.append(ze.getName()); 144 | sb.append("\t"); 145 | sb.append(""+ze.getSize()); 146 | if (ze.getMethod()==ZipEntry.DEFLATED) { 147 | sb.append("/"+ze.getCompressedSize()); 148 | } 149 | return (sb.toString()); 150 | } 151 | 152 | } --------------------------------------------------------------------------------