├── .gitignore ├── .travis.yml ├── CREDITS.txt ├── LICENSE.txt ├── NOTICE.txt ├── README.adoc ├── bin └── langdetect.sh ├── build.gradle ├── config └── checkstyle │ └── checkstyle.xml ├── gradle.properties ├── gradle ├── ext.gradle ├── publish.gradle ├── sourcequality.gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── licenses ├── icu4j-62.1.jar.sha1 ├── icu4j-LICENSE.txt ├── icu4j-NOTICE.txt ├── standardnumber-1.0.1.jar.sha1 ├── standardnumber-LICENSE.txt └── standardnumber-NOTICE.txt ├── settings.gradle └── src ├── docs └── asciidoc │ ├── css │ └── foundation.css │ ├── hyphen.adoc │ ├── icu.adoc │ ├── langdetect.adoc │ └── standardnumbers.adoc ├── main ├── java │ └── org │ │ └── xbib │ │ └── elasticsearch │ │ └── plugin │ │ └── bundle │ │ ├── BundlePlugin.java │ │ ├── action │ │ ├── isbnformat │ │ │ ├── ISBNFormatAction.java │ │ │ ├── ISBNFormatRequest.java │ │ │ ├── ISBNFormatRequestBuilder.java │ │ │ ├── ISBNFormatResponse.java │ │ │ ├── TransportISBNFormatAction.java │ │ │ └── package-info.java │ │ └── langdetect │ │ │ ├── LangdetectAction.java │ │ │ ├── LangdetectRequest.java │ │ │ ├── LangdetectRequestBuilder.java │ │ │ ├── LangdetectResponse.java │ │ │ ├── TransportLangdetectAction.java │ │ │ └── package-info.java │ │ ├── common │ │ ├── decompound │ │ │ ├── fst │ │ │ │ ├── FstDecompounder.java │ │ │ │ └── package-info.java │ │ │ └── patricia │ │ │ │ ├── CompactPatriciaTrie.java │ │ │ │ ├── Decompounder.java │ │ │ │ ├── LFUCache.java │ │ │ │ ├── Node.java │ │ │ │ └── package-info.java │ │ ├── fsa │ │ │ ├── ConstantArcSizeFSA.java │ │ │ ├── Dictionary.java │ │ │ ├── FSA.java │ │ │ ├── FSABuilder.java │ │ │ ├── FSAFinalStatesIterator.java │ │ │ ├── FSAFlags.java │ │ │ ├── FSATraversal.java │ │ │ ├── MatchResult.java │ │ │ └── StateVisitor.java │ │ ├── fst │ │ │ └── FstCompiler.java │ │ ├── langdetect │ │ │ ├── LangProfile.java │ │ │ ├── LangdetectService.java │ │ │ ├── Language.java │ │ │ ├── LanguageDetectionException.java │ │ │ ├── NGram.java │ │ │ └── package-info.java │ │ ├── reference │ │ │ ├── ReferenceService.java │ │ │ └── package-info.java │ │ └── standardnumber │ │ │ ├── StandardnumberService.java │ │ │ └── package-info.java │ │ ├── index │ │ ├── analysis │ │ │ ├── autophrase │ │ │ │ ├── AutoPhrasingTokenFilter.java │ │ │ │ ├── AutoPhrasingTokenFilterFactory.java │ │ │ │ └── package-info.java │ │ │ ├── baseform │ │ │ │ ├── BaseformTokenFilter.java │ │ │ │ ├── BaseformTokenFilterFactory.java │ │ │ │ └── package-info.java │ │ │ ├── concat │ │ │ │ ├── ConcatTokenFilter.java │ │ │ │ ├── ConcatTokenFilterFactory.java │ │ │ │ ├── PairTokenFilter.java │ │ │ │ ├── PairTokenFilterFactory.java │ │ │ │ └── package-info.java │ │ │ ├── decompound │ │ │ │ ├── fst │ │ │ │ │ ├── FstDecompoundTokenFilter.java │ │ │ │ │ ├── FstDecompoundTokenFilterFactory.java │ │ │ │ │ └── package-info.java │ │ │ │ └── patricia │ │ │ │ │ ├── DecompoundTokenFilter.java │ │ │ │ │ ├── DecompoundTokenFilterFactory.java │ │ │ │ │ └── package-info.java │ │ │ ├── german │ │ │ │ ├── GermanNormalizationFilterFactory.java │ │ │ │ └── package-info.java │ │ │ ├── hyphen │ │ │ │ ├── HyphenAnalyzer.java │ │ │ │ ├── HyphenAnalyzerProvider.java │ │ │ │ ├── HyphenTokenFilter.java │ │ │ │ ├── HyphenTokenFilterFactory.java │ │ │ │ ├── HyphenTokenizer.java │ │ │ │ ├── HyphenTokenizerFactory.java │ │ │ │ └── package-info.java │ │ │ ├── icu │ │ │ │ ├── IcuCollationAttributeFactory.java │ │ │ │ ├── IcuCollationKeyAnalyzer.java │ │ │ │ ├── IcuCollationKeyAnalyzerProvider.java 
│ │ │ │ ├── IcuCollationTokenizerFactory.java │ │ │ │ ├── IcuFoldingCharFilterFactory.java │ │ │ │ ├── IcuFoldingTokenFilterFactory.java │ │ │ │ ├── IcuNormalizerCharFilter.java │ │ │ │ ├── IcuNormalizerCharFilterFactory.java │ │ │ │ ├── IcuNormalizerFilter.java │ │ │ │ ├── IcuNormalizerTokenFilterFactory.java │ │ │ │ ├── IcuNumberFormatTokenFilter.java │ │ │ │ ├── IcuNumberFormatTokenFilterFactory.java │ │ │ │ ├── IcuTransformTokenFilter.java │ │ │ │ ├── IcuTransformTokenFilterFactory.java │ │ │ │ ├── IndexableBinaryStringTools.java │ │ │ │ ├── package-info.java │ │ │ │ ├── segmentation │ │ │ │ │ ├── BreakIteratorWrapper.java │ │ │ │ │ ├── CharArrayIterator.java │ │ │ │ │ ├── CompositeBreakIterator.java │ │ │ │ │ ├── DefaultIcuTokenizerConfig.java │ │ │ │ │ ├── IcuTokenizer.java │ │ │ │ │ ├── IcuTokenizerConfig.java │ │ │ │ │ ├── IcuTokenizerFactory.java │ │ │ │ │ ├── ScriptIterator.java │ │ │ │ │ └── package-info.java │ │ │ │ ├── tokenattributes │ │ │ │ │ ├── ScriptAttribute.java │ │ │ │ │ ├── ScriptAttributeImpl.java │ │ │ │ │ └── package-info.java │ │ │ │ └── tools │ │ │ │ │ ├── RBBIRuleCompiler.java │ │ │ │ │ ├── UTR30DataFileGenerator.java │ │ │ │ │ └── package-info.java │ │ │ ├── lemmatize │ │ │ │ ├── LemmatizeTokenFilter.java │ │ │ │ └── LemmatizeTokenFilterFactory.java │ │ │ ├── naturalsort │ │ │ │ ├── NaturalSortKeyAnalyzer.java │ │ │ │ ├── NaturalSortKeyAnalyzerProvider.java │ │ │ │ ├── NaturalSortKeyAttributeFactory.java │ │ │ │ ├── NaturalSortKeyAttributeImpl.java │ │ │ │ ├── NaturalSortKeyTokenizerFactory.java │ │ │ │ └── package-info.java │ │ │ ├── sortform │ │ │ │ ├── SortformAnalyzerProvider.java │ │ │ │ ├── SortformTokenFilter.java │ │ │ │ └── SortformTokenFilterFactory.java │ │ │ ├── standardnumber │ │ │ │ ├── StandardnumberAnalyzer.java │ │ │ │ ├── StandardnumberAnalyzerProvider.java │ │ │ │ ├── StandardnumberTokenFilter.java │ │ │ │ └── StandardnumberTokenFilterFactory.java │ │ │ ├── symbolname │ │ │ │ ├── SymbolnameTokenFilter.java │ │ │ │ ├── SymbolnameTokenFilterFactory.java │ │ │ │ └── package-info.java │ │ │ ├── worddelimiter │ │ │ │ ├── WordDelimiterFilter.java │ │ │ │ ├── WordDelimiterFilter2.java │ │ │ │ ├── WordDelimiterFilter2Factory.java │ │ │ │ ├── WordDelimiterFilterFactory.java │ │ │ │ ├── WordDelimiterFlags.java │ │ │ │ ├── WordDelimiterIterator.java │ │ │ │ └── package-info.java │ │ │ └── year │ │ │ │ ├── GregorianYearTokenFilter.java │ │ │ │ └── GregorianYearTokenFilterFactory.java │ │ └── mapper │ │ │ ├── icu │ │ │ └── IcuCollationKeyFieldMapper.java │ │ │ ├── langdetect │ │ │ └── LangdetectMapper.java │ │ │ ├── reference │ │ │ ├── ReferenceMapper.java │ │ │ ├── ReferenceMapperModule.java │ │ │ └── ReferenceMapperTypeParser.java │ │ │ └── standardnumber │ │ │ ├── StandardnumberMapper.java │ │ │ ├── StandardnumberMapperModule.java │ │ │ └── StandardnumberMapperTypeParser.java │ │ ├── package-info.java │ │ ├── query │ │ └── decompound │ │ │ ├── CustomSpanPayloadCheckQuery.java │ │ │ ├── ExactPhraseQueryBuilder.java │ │ │ └── QueryTransformer.java │ │ └── rest │ │ └── action │ │ ├── isbnformat │ │ ├── RestISBNFormatterAction.java │ │ └── package-info.java │ │ └── langdetect │ │ └── RestLangdetectAction.java ├── jflex │ └── HyphenTokenizer.jflex ├── plugin-metadata │ └── plugin-security.policy └── resources │ └── org │ └── xbib │ └── elasticsearch │ └── plugin │ └── bundle │ ├── common │ └── langdetect │ │ ├── af │ │ ├── ar │ │ ├── bg │ │ ├── bn │ │ ├── cs │ │ ├── da │ │ ├── de │ │ ├── el │ │ ├── en │ │ ├── es │ │ ├── et │ │ ├── fa │ │ ├── fi │ │ ├── fr │ 
│ ├── gu │ │ ├── he │ │ ├── hi │ │ ├── hr │ │ ├── hu │ │ ├── id │ │ ├── it │ │ ├── ja │ │ ├── kn │ │ ├── ko │ │ ├── language.json │ │ ├── lt │ │ ├── lv │ │ ├── mk │ │ ├── ml │ │ ├── mr │ │ ├── ne │ │ ├── nl │ │ ├── no │ │ ├── pa │ │ ├── pl │ │ ├── pt │ │ ├── ro │ │ ├── ru │ │ ├── shorttext │ │ ├── bg │ │ ├── bn │ │ ├── cs │ │ ├── da │ │ ├── de │ │ ├── en │ │ ├── es │ │ ├── fa │ │ ├── fi │ │ ├── fr │ │ ├── gu │ │ ├── hi │ │ ├── hr │ │ ├── hu │ │ ├── id │ │ ├── it │ │ ├── lt │ │ ├── lv │ │ ├── mk │ │ ├── nl │ │ ├── no │ │ ├── pa │ │ ├── pl │ │ ├── pt │ │ ├── ro │ │ ├── sv │ │ ├── ta │ │ ├── te │ │ ├── tr │ │ ├── uk │ │ ├── ur │ │ └── vi │ │ ├── sk │ │ ├── sl │ │ ├── so │ │ ├── sq │ │ ├── sv │ │ ├── sw │ │ ├── ta │ │ ├── te │ │ ├── th │ │ ├── tl │ │ ├── tr │ │ ├── uk │ │ ├── ur │ │ ├── vi │ │ ├── zh-cn │ │ └── zh-tw │ ├── icu │ ├── KeywordTokenizer.brk │ ├── Latin-break-only-on-whitespace.brk │ ├── Latin-dont-break-on-hyphens.brk │ └── folding │ │ ├── BasicFoldings.txt │ │ ├── DiacriticFolding.txt │ │ ├── DingbatFolding.txt │ │ ├── HanRadicalFolding.txt │ │ ├── NativeDigitFolding.txt │ │ ├── nfc.txt │ │ ├── nfkc.txt │ │ └── nfkc_cf.txt │ └── index │ └── analysis │ ├── baseform │ ├── de-lemma-utf8.txt │ └── en-lemma-utf8.txt │ ├── decompound │ ├── fst │ │ └── words.fst │ └── patricia │ │ ├── grfExt.tree │ │ ├── kompVHic.tree │ │ └── kompVVic.tree │ └── icu │ ├── segmentation │ ├── Default.brk │ ├── Default.rbbi │ ├── KeywordTokenizer.rbbi │ ├── Latin-break-only-on-whitespace.rbbi │ ├── Latin-dont-break-on-hyphens.rbbi │ ├── MyanmarSyllable.brk │ └── MyanmarSyllable.rbbi │ └── utr30.nrm └── test ├── java └── org │ └── xbib │ └── elasticsearch │ └── plugin │ └── bundle │ └── test │ ├── MultiMap.java │ ├── TreeMultiMap.java │ ├── common │ └── decompound │ │ └── patricia │ │ ├── DecompounderTest.java │ │ └── LFUCacheTest.java │ ├── index │ ├── analysis │ │ ├── autophrase │ │ │ └── AutoPhrasingTokenFilterTests.java │ │ ├── baseform │ │ │ ├── BaseformTokenFilterTests.java │ │ │ └── DictionaryTest.java │ │ ├── concat │ │ │ └── ConcatTokenFilterTests.java │ │ ├── decompound │ │ │ ├── fst │ │ │ │ └── FstDecompoundTokenFilterTests.java │ │ │ └── patricia │ │ │ │ └── DecompoundTokenFilterTests.java │ │ ├── german │ │ │ ├── GermanNormalizationTests.java │ │ │ └── UnstemmedGermanNormalizationTests.java │ │ ├── hyphen │ │ │ └── HyphenTokenizerTests.java │ │ ├── icu │ │ │ ├── IcuAnalysisTests.java │ │ │ ├── IcuClientYamlTestSuiteIT.java │ │ │ ├── IcuCollationAnalyzerTests.java │ │ │ ├── IcuCollationKeyAnalyzerTests.java │ │ │ ├── IcuFoldingFilterTests.java │ │ │ ├── IcuNormalizeCharTests.java │ │ │ ├── IcuNormalizerFilterTests.java │ │ │ ├── IcuNumberFormatTests.java │ │ │ ├── IcuTokenizerTests.java │ │ │ ├── IcuTransformFilterTests.java │ │ │ ├── segmentation │ │ │ │ ├── CJKBigramFilterTests.java │ │ │ │ ├── CharArrayIteratorTests.java │ │ │ │ ├── IcuTokenizerCJKTests.java │ │ │ │ ├── IcuTokenizerFactoryTests.java │ │ │ │ ├── MyanmarSyllableTests.java │ │ │ │ └── SegmentationIcuTokenizerTests.java │ │ │ └── tools │ │ │ │ ├── RBBIRuleCompilerTest.java │ │ │ │ └── UTR30DataFileGeneratorTest.java │ │ ├── lemmatize │ │ │ ├── LemmatizeSearchTests.java │ │ │ └── LemmatizeTokenFilterTests.java │ │ ├── naturalsort │ │ │ └── NaturalSortKeyTests.java │ │ ├── sortform │ │ │ └── SortFormTests.java │ │ ├── symbolname │ │ │ └── SymbolnameTokenFilterTests.java │ │ └── worddelimiter │ │ │ └── WordDelimiterFilter2Tests.java │ └── mapper │ │ ├── langdetect │ │ ├── DetectLanguageTests.java │ │ ├── DetectorTests.java │ │ ├── 
LangDetectActionTests.java │ │ ├── LangDetectBinaryTests.java │ │ ├── LangDetectChineseTests.java │ │ ├── LangDetectGermanTests.java │ │ ├── LangProfileTests.java │ │ ├── LangdetectMappingTests.java │ │ ├── LanguageTests.java │ │ ├── NGramTests.java │ │ └── SimpleDetectorTests.java │ │ ├── reference │ │ ├── GNDReferenceMappingTests.java │ │ ├── ReferenceMappingTests.java │ │ └── SimpleReferenceMappingTests.java │ │ └── standardnumber │ │ └── StandardnumberMappingTests.java │ └── query │ └── decompound │ └── DecompoundQueryTests.java └── resources ├── log4j2.xml ├── org └── xbib │ └── elasticsearch │ └── plugin │ └── bundle │ └── test │ ├── index │ ├── analysis │ │ ├── concat │ │ │ └── concat_analysis.json │ │ ├── decompound │ │ │ ├── fst │ │ │ │ └── decompound_analysis.json │ │ │ └── patricia │ │ │ │ ├── decompound_analysis.json │ │ │ │ └── keywords_analysis.json │ │ ├── document.json │ │ ├── expansion │ │ │ └── expansion_analysis.json │ │ ├── german │ │ │ ├── german_normalization_analysis.json │ │ │ └── unstemmed.json │ │ ├── hyphen │ │ │ ├── custom_hyphen_tokenizer.json │ │ │ ├── hyphen_analyzer.json │ │ │ ├── hyphen_tokenizer.json │ │ │ └── hyphen_tokenizer_without_subwords.json │ │ ├── icu │ │ │ ├── icu_collation.json │ │ │ ├── icu_folding.json │ │ │ ├── icu_normalize.json │ │ │ ├── icu_numberformat.json │ │ │ ├── icu_tokenizer.json │ │ │ └── icu_transform.json │ │ ├── mapping.json │ │ ├── settings.json │ │ ├── sortform │ │ │ └── sortform.json │ │ └── worddelimiter │ │ │ └── worddelimiter.json │ └── mapper │ │ ├── langdetect │ │ ├── base64-2-decoded.txt │ │ ├── base64-2-mapping.json │ │ ├── base64-2.txt │ │ ├── base64-decoded.txt │ │ ├── base64-mapping.json │ │ ├── base64.txt │ │ ├── chinese.txt │ │ ├── english.txt │ │ ├── german.txt │ │ ├── japanese.txt │ │ ├── korean.txt │ │ ├── mapping-to-fields.json │ │ ├── mapping.json │ │ ├── settings.json │ │ ├── short-text-mapping.json │ │ └── simple-mapping.json │ │ ├── reference │ │ ├── doc-simple-document.json │ │ ├── doc-simple-mapping.json │ │ ├── doc-simple-settings.json │ │ ├── gnd-document.json │ │ ├── gnd-mapping.json │ │ ├── gnd-settings.json │ │ ├── ref-doc-book.json │ │ ├── ref-mapping-authorities.json │ │ ├── ref-mapping-books-test.json │ │ ├── ref-mapping-from-id.json │ │ ├── ref-mapping-nested.json │ │ ├── ref-mapping.json │ │ ├── ref-simple-document.json │ │ ├── ref-simple-mapping.json │ │ ├── ref-simple-settings.json │ │ ├── title-document-1.json │ │ ├── title-document-2.json │ │ ├── title-mapping.json │ │ └── title-settings.json │ │ └── standardnumber │ │ └── mapping.json │ └── query │ └── decompound │ └── decompound_query.json └── rest-api-spec └── test └── analysis_icu ├── 10_basic.yml └── 20_search.yml /.gitignore: -------------------------------------------------------------------------------- 1 | /data 2 | /work 3 | /logs 4 | /.idea 5 | /target 6 | .DS_Store 7 | *.iml 8 | /.settings 9 | /.classpath 10 | /.project 11 | /.gradle 12 | /build 13 | /plugins 14 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: java 3 | jdk: 4 | - oraclejdk9 5 | 6 | cache: 7 | directories: 8 | - $HOME/.m2 9 | -------------------------------------------------------------------------------- /CREDITS.txt: -------------------------------------------------------------------------------- 1 | The plugin bundle wouldn't be possible without the hard work of many authors 2 | who generously published 
their work under an open source license. 3 | 4 | This file should contain all the credits to them. If you miss a credit, please 5 | notify me about it and it will be added as soon as possible. 6 | 7 | The ICU analysis is heavily based on Apache Lucene ICU 8 | 9 | https://github.com/apache/lucene-solr/tree/master/lucene/analysis/icu 10 | 11 | The AutoPhrasingTokenFilter is derived from 12 | 13 | https://github.com/lucidworks/auto-phrase-tokenfilter 14 | 15 | The ConcatTokenFilter is authored by Sujit Pal and was taken from 16 | 17 | http://sujitpal.blogspot.de/2011/07/lucene-token-concatenating-tokenfilter_30.html 18 | 19 | The Decompound token filter is a reworked implementation of the 20 | link:http://wortschatz.uni-leipzig.de/~cbiemann/software/toolbox/Baseforms%20Tool.htm[Baseforms Tool] 21 | found in the http://wortschatz.uni-leipzig.de/~cbiemann/software/toolbox/index.htm[ASV toolbox] 22 | of http://asv.informatik.uni-leipzig.de/staff/Chris_Biemann[Chris Biemann], 23 | Automatische Sprachverarbeitung of Leipzig University. 24 | 25 | The FSA in package org.xbib.elasticsearch.common.fsa which provides the dictionary structure for 26 | the baseform tokenizer is a derived version of 27 | 28 | https://github.com/morfologik/morfologik-stemming/tree/master/morfologik-fsa/src/main/java/morfologik/fsa 29 | 30 | Thanks to GBI-Genios Deutsche Wirtschaftsdatenbank GmbH for adding the caching functionality and the "Exact phrase matches". 31 | The implementation of an exact phrase match query can ignore/skip decompounded tokens while matching phrases. 32 | The LFU cache for the Patricia Decompounder was inspired by the use of a ConcurrentHashMap cache 33 | in the original pull request: https://github.com/jprante/elasticsearch-analysis-decompound/pull/54/ 34 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/NOTICE.txt -------------------------------------------------------------------------------- /bin/langdetect.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | curl -XDELETE 'localhost:9200/test' 4 | 5 | curl -XPUT 'localhost:9200/test' 6 | 7 | curl -XPOST 'localhost:9200/test/article/_mapping' -d ' 8 | { 9 | "article" : { 10 | "properties" : { 11 | "content" : { "type" : "langdetect" } 12 | } 13 | } 14 | } 15 | ' 16 | 17 | curl -XPUT 'localhost:9200/test/article/1' -d ' 18 | { 19 | "title" : "Some title", 20 | "content" : "Oh, say can you see by the dawn`s early light, What so proudly we hailed at the twilight`s last gleaming?" 21 | } 22 | ' 23 | 24 | curl -XPUT 'localhost:9200/test/article/2' -d ' 25 | { 26 | "title" : "Ein Titel", 27 | "content" : "Einigkeit und Recht und Freiheit für das deutsche Vaterland!" 28 | } 29 | ' 30 | 31 | curl -XPUT 'localhost:9200/test/article/3' -d ' 32 | { 33 | "title" : "Un titre", 34 | "content" : "Allons enfants de la Patrie, Le jour de gloire est arrivé!"
35 | } 36 | ' 37 | 38 | curl -XGET 'localhost:9200/test/_refresh' 39 | 40 | curl -XPOST 'localhost:9200/test/article/_search' -d ' 41 | { 42 | "query" : { 43 | "term" : { 44 | "content" : "eng" 45 | } 46 | } 47 | } 48 | ' 49 | curl -XPOST 'localhost:9200/test/_search' -d ' 50 | { 51 | "query" : { 52 | "term" : { 53 | "content" : "ger" 54 | } 55 | } 56 | } 57 | ' 58 | 59 | curl -XPOST 'localhost:9200/test/_search' -d ' 60 | { 61 | "query" : { 62 | "term" : { 63 | "content" : "fre" 64 | } 65 | } 66 | } 67 | ' 68 | -------------------------------------------------------------------------------- /gradle.properties: -------------------------------------------------------------------------------- 1 | group = org.xbib.elasticsearch.plugin 2 | name = elasticsearch-plugin-bundle 3 | version = 6.3.2.3 4 | 5 | elasticsearch.version = 6.3.2 6 | lucene.version = 7.3.1 7 | 8 | icu4j.version = 62.1 9 | log4j.version = 2.11.0 10 | jackson.version = 2.8.11 11 | standardnumber.version = 1.0.1 12 | junit.version = 4.12 13 | wagon.version = 3.0.0 14 | spatial4j.version = 0.7 15 | jts.version = 1.15.1 16 | jna.version = 4.5.1 17 | checkstyle.version = 8.13 18 | 19 | org.gradle.warning.mode = all -------------------------------------------------------------------------------- /gradle/ext.gradle: -------------------------------------------------------------------------------- 1 | ext { 2 | pluginName = 'bundle' 3 | pluginClassname = 'org.xbib.elasticsearch.plugin.bundle.BundlePlugin' 4 | pluginDescription = 'A bundle of plugins for Elasticsearch' 5 | user = 'jprante' 6 | name = 'elasticsearch-plugin-bundle' 7 | scmUrl = 'https://github.com/' + user + '/' + name 8 | scmConnection = 'scm:git:git://github.com/' + user + '/' + name + '.git' 9 | scmDeveloperConnection = 'scm:git:git://github.com/' + user + '/' + name + '.git' 10 | } 11 | -------------------------------------------------------------------------------- /gradle/publish.gradle: -------------------------------------------------------------------------------- 1 | 2 | task xbibUpload(type: Upload) { 3 | group = 'publish' 4 | configuration = configurations.archives 5 | uploadDescriptor = true 6 | repositories { 7 | if (project.hasProperty('xbibUsername')) { 8 | mavenDeployer { 9 | configuration = configurations.wagon 10 | repository(url: uri(project.property('xbibUrl'))) { 11 | authentication(userName: xbibUsername, privateKey: xbibPrivateKey) 12 | } 13 | } 14 | } 15 | } 16 | } 17 | 18 | task sonatypeUpload(type: Upload) { 19 | group = 'publish' 20 | configuration = configurations.archives 21 | uploadDescriptor = true 22 | repositories { 23 | if (project.hasProperty('ossrhUsername')) { 24 | mavenDeployer { 25 | beforeDeployment { MavenDeployment deployment -> signing.signPom(deployment) } 26 | repository(url: uri(ossrhReleaseUrl)) { 27 | authentication(userName: ossrhUsername, password: ossrhPassword) 28 | } 29 | snapshotRepository(url: uri(ossrhSnapshotUrl)) { 30 | authentication(userName: ossrhUsername, password: ossrhPassword) 31 | } 32 | pom.project { 33 | groupId project.group 34 | artifactId project.name 35 | version project.version 36 | name project.name 37 | description pluginDescription 38 | packaging 'jar' 39 | inceptionYear '2012' 40 | url scmUrl 41 | organization { 42 | name 'xbib' 43 | url 'http://xbib.org' 44 | } 45 | developers { 46 | developer { 47 | id user 48 | name 'Jörg Prante' 49 | email 'joergprante@gmail.com' 50 | url 'https://github.com/jprante' 51 | } 52 | } 53 | scm { 54 | url scmUrl 55 | connection scmConnection 56 | 
developerConnection scmDeveloperConnection 57 | } 58 | licenses { 59 | license { 60 | name 'Affero GNU Public License Version 3' 61 | url 'http://www.gnu.org/licenses/agpl-3.0.html' 62 | } 63 | } 64 | } 65 | } 66 | } 67 | } 68 | } 69 | 70 | nexusStaging { 71 | packageGroup = "org.xbib" 72 | } 73 | -------------------------------------------------------------------------------- /gradle/sourcequality.gradle: -------------------------------------------------------------------------------- 1 | 2 | sonarqube { 3 | properties { 4 | property "sonar.projectName", "${project.group} ${project.name}" 5 | property "sonar.sourceEncoding", "UTF-8" 6 | property "sonar.tests", "src/test/java" 7 | property "sonar.scm.provider", "git" 8 | property "sonar.java.coveragePlugin", "jacoco" 9 | property "sonar.junit.reportsPath", "build/test-results/test/" 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Fri Mar 15 22:26:04 CET 2019 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | distributionUrl=https\://services.gradle.org/distributions/gradle-4.10.3-all.zip 7 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | set DIRNAME=%~dp0 12 | if "%DIRNAME%" == "" set DIRNAME=. 13 | set APP_BASE_NAME=%~n0 14 | set APP_HOME=%DIRNAME% 15 | 16 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 17 | set DEFAULT_JVM_OPTS="-Xmx64m" 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windows variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | 53 | :win9xME_args 54 | @rem Slurp the command line arguments. 
55 | set CMD_LINE_ARGS= 56 | set _SKIP=2 57 | 58 | :win9xME_args_slurp 59 | if "x%~1" == "x" goto execute 60 | 61 | set CMD_LINE_ARGS=%* 62 | 63 | :execute 64 | @rem Setup the command line 65 | 66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 67 | 68 | @rem Execute Gradle 69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 70 | 71 | :end 72 | @rem End local scope for the variables with windows NT shell 73 | if "%ERRORLEVEL%"=="0" goto mainEnd 74 | 75 | :fail 76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 77 | rem the _cmd.exe /c_ return code! 78 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 79 | exit /b 1 80 | 81 | :mainEnd 82 | if "%OS%"=="Windows_NT" endlocal 83 | 84 | :omega 85 | -------------------------------------------------------------------------------- /licenses/icu4j-62.1.jar.sha1: -------------------------------------------------------------------------------- 1 | 7a4d00d5ec5febd252a6182e8b6e87a0a9821f81 -------------------------------------------------------------------------------- /licenses/icu4j-LICENSE.txt: -------------------------------------------------------------------------------- 1 | UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE 2 | 3 | Unicode Data Files include all data files under the directories 4 | http://www.unicode.org/Public/, http://www.unicode.org/reports/, 5 | http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and 6 | http://www.unicode.org/utility/trac/browser/. 7 | 8 | Unicode Data Files do not include PDF online code charts under the 9 | directory http://www.unicode.org/Public/. 10 | 11 | Software includes any source code published in the Unicode Standard 12 | or under the directories 13 | http://www.unicode.org/Public/, http://www.unicode.org/reports/, 14 | http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and 15 | http://www.unicode.org/utility/trac/browser/. 16 | 17 | NOTICE TO USER: Carefully read the following legal agreement. 18 | BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S 19 | DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), 20 | YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE 21 | TERMS AND CONDITIONS OF THIS AGREEMENT. 22 | IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE 23 | THE DATA FILES OR SOFTWARE. 24 | 25 | COPYRIGHT AND PERMISSION NOTICE 26 | 27 | Copyright © 1991-2016 Unicode, Inc. All rights reserved. 28 | Distributed under the Terms of Use in http://www.unicode.org/copyright.html. 29 | 30 | Permission is hereby granted, free of charge, to any person obtaining 31 | a copy of the Unicode data files and any associated documentation 32 | (the "Data Files") or Unicode software and any associated documentation 33 | (the "Software") to deal in the Data Files or Software 34 | without restriction, including without limitation the rights to use, 35 | copy, modify, merge, publish, distribute, and/or sell copies of 36 | the Data Files or Software, and to permit persons to whom the Data Files 37 | or Software are furnished to do so, provided that either 38 | (a) this copyright and permission notice appear with all copies 39 | of the Data Files or Software, or 40 | (b) this copyright and permission notice appear in associated 41 | Documentation. 
42 | 43 | THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF 44 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 45 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 46 | NONINFRINGEMENT OF THIRD PARTY RIGHTS. 47 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS 48 | NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL 49 | DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, 50 | DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER 51 | TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 52 | PERFORMANCE OF THE DATA FILES OR SOFTWARE. 53 | 54 | Except as contained in this notice, the name of a copyright holder 55 | shall not be used in advertising or otherwise to promote the sale, 56 | use or other dealings in these Data Files or Software without prior 57 | written authorization of the copyright holder. -------------------------------------------------------------------------------- /licenses/icu4j-NOTICE.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/licenses/icu4j-NOTICE.txt -------------------------------------------------------------------------------- /licenses/standardnumber-1.0.1.jar.sha1: -------------------------------------------------------------------------------- 1 | 9d1cf31cbc87cc9cdfd505fd30d3598da4eee700 -------------------------------------------------------------------------------- /licenses/standardnumber-NOTICE.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/licenses/standardnumber-NOTICE.txt -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/settings.gradle -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/action/isbnformat/ISBNFormatAction.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.action.isbnformat; 2 | 3 | import org.elasticsearch.action.Action; 4 | import org.elasticsearch.client.ElasticsearchClient; 5 | 6 | /** 7 | * ISBN format action. 
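* 
* A hedged usage sketch, not taken from the original sources: it shows how a caller
* could invoke this action through its request builder, assuming an
* {@code ElasticsearchClient} instance named {@code client}.
* <pre>
* ISBNFormatResponse response = ISBNFormatAction.INSTANCE
*         .newRequestBuilder(client)
*         .setValue("978-3-16-148410-0")   // any ISBN-like value to format
*         .execute()
*         .actionGet();
* </pre>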
8 | */ 9 | public class ISBNFormatAction extends Action<ISBNFormatRequest, ISBNFormatResponse, ISBNFormatRequestBuilder> { 10 | 11 | public static final String NAME = "isbnformat"; 12 | 13 | public static final ISBNFormatAction INSTANCE = new ISBNFormatAction(); 14 | 15 | private ISBNFormatAction() { 16 | super(NAME); 17 | } 18 | 19 | @Override 20 | public ISBNFormatRequestBuilder newRequestBuilder(ElasticsearchClient client) { 21 | return new ISBNFormatRequestBuilder(client); 22 | } 23 | 24 | @Override 25 | public ISBNFormatResponse newResponse() { 26 | return new ISBNFormatResponse(); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/action/isbnformat/ISBNFormatRequest.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.action.isbnformat; 2 | 3 | import org.elasticsearch.action.ActionRequest; 4 | import org.elasticsearch.action.ActionRequestValidationException; 5 | import org.elasticsearch.common.io.stream.StreamInput; 6 | import org.elasticsearch.common.io.stream.StreamOutput; 7 | 8 | import java.io.IOException; 9 | 10 | import static org.elasticsearch.action.ValidateActions.addValidationError; 11 | 12 | /** 13 | * ISBN format request. 14 | */ 15 | public class ISBNFormatRequest extends ActionRequest { 16 | 17 | private String value; 18 | 19 | @Override 20 | public ActionRequestValidationException validate() { 21 | ActionRequestValidationException validationException = null; 22 | if (value == null) { 23 | validationException = addValidationError("value is missing", null); 24 | } 25 | return validationException; 26 | } 27 | 28 | public String getValue() { 29 | return value; 30 | } 31 | 32 | public ISBNFormatRequest setValue(String value) { 33 | this.value = value; 34 | return this; 35 | } 36 | 37 | @Override 38 | public void readFrom(StreamInput in) throws IOException { 39 | super.readFrom(in); 40 | value = in.readString(); 41 | } 42 | 43 | @Override 44 | public void writeTo(StreamOutput out) throws IOException { 45 | super.writeTo(out); 46 | out.writeString(value); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/action/isbnformat/ISBNFormatRequestBuilder.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.action.isbnformat; 2 | 3 | import org.elasticsearch.action.ActionRequestBuilder; 4 | import org.elasticsearch.client.ElasticsearchClient; 5 | 6 | /** 7 | * ISBN format request builder.
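* 
* Illustrative sketch only, not from the original sources (assumes a {@code client}
* variable; {@code get()} is inherited from {@code ActionRequestBuilder} and blocks
* for the response):
* <pre>
* ISBNFormatResponse response = new ISBNFormatRequestBuilder(client)
*         .setValue("3-16-148410-X")
*         .get();
* </pre>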
8 | */ 9 | public class ISBNFormatRequestBuilder 10 | extends ActionRequestBuilder<ISBNFormatRequest, ISBNFormatResponse, ISBNFormatRequestBuilder> { 11 | 12 | public ISBNFormatRequestBuilder(ElasticsearchClient client) { 13 | super(client, ISBNFormatAction.INSTANCE, new ISBNFormatRequest()); 14 | } 15 | 16 | public ISBNFormatRequestBuilder setValue(String string) { 17 | request.setValue(string); 18 | return this; 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/action/isbnformat/ISBNFormatResponse.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.action.isbnformat; 2 | 3 | import org.elasticsearch.action.ActionResponse; 4 | import org.elasticsearch.common.xcontent.StatusToXContentObject; 5 | import org.elasticsearch.common.xcontent.ToXContent; 6 | import org.elasticsearch.common.xcontent.XContentBuilder; 7 | import org.elasticsearch.rest.RestStatus; 8 | 9 | import java.io.IOException; 10 | 11 | import static org.elasticsearch.rest.RestStatus.OK; 12 | 13 | /** 14 | * ISBN format response. 15 | */ 16 | public class ISBNFormatResponse extends ActionResponse implements StatusToXContentObject { 17 | 18 | private String isbn10; 19 | 20 | private String isbn10Formatted; 21 | 22 | private String isbn13; 23 | 24 | private String isbn13Formatted; 25 | 26 | private String invalid; 27 | 28 | public ISBNFormatResponse setIsbn10(String value) { 29 | this.isbn10 = value; 30 | return this; 31 | } 32 | 33 | public ISBNFormatResponse setIsbn10Formatted(String value) { 34 | this.isbn10Formatted = value; 35 | return this; 36 | } 37 | 38 | public ISBNFormatResponse setIsbn13(String value) { 39 | this.isbn13 = value; 40 | return this; 41 | } 42 | 43 | public ISBNFormatResponse setIsbn13Formatted(String value) { 44 | this.isbn13Formatted = value; 45 | return this; 46 | } 47 | 48 | public ISBNFormatResponse setInvalid(String value) { 49 | this.invalid = value; 50 | return this; 51 | } 52 | 53 | @Override 54 | public XContentBuilder toXContent(XContentBuilder builder, ToXContent.Params params) throws IOException { 55 | builder.startObject() 56 | .startObject("result") 57 | .field("isbn10", isbn10) 58 | .field("isbn10formatted", isbn10Formatted) 59 | .field("isbn13", isbn13) 60 | .field("isbn13formatted", isbn13Formatted) 61 | .field("invalid", invalid) 62 | .endObject() 63 | .endObject(); 64 | return builder; 65 | } 66 | 67 | @Override 68 | public RestStatus status() { 69 | return OK; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/action/isbnformat/TransportISBNFormatAction.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.action.isbnformat; 2 | 3 | import org.elasticsearch.action.ActionListener; 4 | import org.elasticsearch.action.support.ActionFilters; 5 | import org.elasticsearch.action.support.TransportAction; 6 | import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver; 7 | import org.elasticsearch.common.inject.Inject; 8 | import org.elasticsearch.common.settings.Settings; 9 | import org.elasticsearch.threadpool.ThreadPool; 10 | import org.elasticsearch.transport.TransportService; 11 | import org.xbib.elasticsearch.plugin.bundle.common.standardnumber.StandardnumberService; 12 | 13 | /** 14 | * Transport action for ISBN format action.
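* Hands the request value to the {@code StandardnumberService}; when the service rejects
* the value with an {@code IllegalArgumentException}, the response reports it as invalid
* instead of failing the request (see {@code doExecute} below).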
15 | */ 16 | public class TransportISBNFormatAction extends TransportAction<ISBNFormatRequest, ISBNFormatResponse> { 17 | 18 | private final StandardnumberService standardnumberService; 19 | 20 | @Inject 21 | public TransportISBNFormatAction(Settings settings, ThreadPool threadPool, 22 | ActionFilters actionFilters, 23 | IndexNameExpressionResolver indexNameExpressionResolver, 24 | TransportService transportService, 25 | StandardnumberService standardnumberService) { 26 | super(settings, ISBNFormatAction.NAME, threadPool, actionFilters, indexNameExpressionResolver, 27 | transportService.getTaskManager()); 28 | this.standardnumberService = standardnumberService; 29 | } 30 | 31 | @Override 32 | protected void doExecute(ISBNFormatRequest request, ActionListener<ISBNFormatResponse> listener) { 33 | ISBNFormatResponse response = new ISBNFormatResponse(); 34 | try { 35 | standardnumberService.handle(request.getValue(), response); 36 | } catch (IllegalArgumentException e) { 37 | logger.debug(e.getMessage(), e); 38 | response.setInvalid(request.getValue()); 39 | } 40 | listener.onResponse(response); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/action/isbnformat/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for ISBN formatter action. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.action.isbnformat; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/action/langdetect/LangdetectAction.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.action.langdetect; 2 | 3 | import org.elasticsearch.action.Action; 4 | import org.elasticsearch.client.ElasticsearchClient; 5 | 6 | /** 7 | * Language detection action. 8 | */ 9 | public class LangdetectAction extends Action<LangdetectRequest, LangdetectResponse, LangdetectRequestBuilder> { 10 | 11 | public static final String NAME = "langdetect"; 12 | 13 | public static final LangdetectAction INSTANCE = new LangdetectAction(); 14 | 15 | private LangdetectAction() { 16 | super(NAME); 17 | } 18 | 19 | @Override 20 | public LangdetectRequestBuilder newRequestBuilder(ElasticsearchClient client) { 21 | return new LangdetectRequestBuilder(client); 22 | } 23 | 24 | @Override 25 | public LangdetectResponse newResponse() { 26 | return new LangdetectResponse(); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/action/langdetect/LangdetectRequest.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.action.langdetect; 2 | 3 | import org.elasticsearch.action.ActionRequest; 4 | import org.elasticsearch.action.ActionRequestValidationException; 5 | import org.elasticsearch.common.io.stream.StreamInput; 6 | import org.elasticsearch.common.io.stream.StreamOutput; 7 | 8 | import java.io.IOException; 9 | 10 | import static org.elasticsearch.action.ValidateActions.addValidationError; 11 | 12 | /** 13 | * Language detection request.
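* 
* An illustrative sketch, not from the original sources; the "shorttext" profile name
* matches the bundled resource directory of that name.
* <pre>
* LangdetectRequest request = new LangdetectRequest()
*         .setText("Einigkeit und Recht und Freiheit")
*         .setProfile("shorttext");
* </pre>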
14 | */ 15 | public class LangdetectRequest extends ActionRequest { 16 | 17 | private String profile; 18 | 19 | private String text; 20 | 21 | @Override 22 | public ActionRequestValidationException validate() { 23 | ActionRequestValidationException validationException = null; 24 | if (text == null) { 25 | validationException = addValidationError("text is missing", null); 26 | } 27 | return validationException; 28 | } 29 | 30 | public String getProfile() { 31 | return profile; 32 | } 33 | 34 | public LangdetectRequest setProfile(String profile) { 35 | this.profile = profile; 36 | return this; 37 | } 38 | 39 | public String getText() { 40 | return text; 41 | } 42 | 43 | public LangdetectRequest setText(String text) { 44 | this.text = text; 45 | return this; 46 | } 47 | 48 | @Override 49 | public void readFrom(StreamInput in) throws IOException { 50 | super.readFrom(in); 51 | text = in.readString(); 52 | profile = in.readOptionalString(); 53 | } 54 | 55 | @Override 56 | public void writeTo(StreamOutput out) throws IOException { 57 | super.writeTo(out); 58 | out.writeString(text); 59 | out.writeOptionalString(profile); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/action/langdetect/LangdetectRequestBuilder.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.action.langdetect; 2 | 3 | import org.elasticsearch.action.ActionRequestBuilder; 4 | import org.elasticsearch.client.ElasticsearchClient; 5 | 6 | /** 7 | * Language detection request builder. 8 | */ 9 | public class LangdetectRequestBuilder extends ActionRequestBuilder<LangdetectRequest, LangdetectResponse, LangdetectRequestBuilder> { 10 | 11 | public LangdetectRequestBuilder(ElasticsearchClient client) { 12 | super(client, LangdetectAction.INSTANCE, new LangdetectRequest()); 13 | } 14 | 15 | public LangdetectRequestBuilder setProfile(String string) { 16 | request.setProfile(string); 17 | return this; 18 | } 19 | 20 | public LangdetectRequestBuilder setText(String string) { 21 | request.setText(string); 22 | return this; 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/action/langdetect/LangdetectResponse.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.action.langdetect; 2 | 3 | import org.elasticsearch.action.ActionResponse; 4 | import org.elasticsearch.common.Strings; 5 | import org.elasticsearch.common.xcontent.StatusToXContentObject; 6 | import org.elasticsearch.common.xcontent.ToXContent; 7 | import org.elasticsearch.common.xcontent.XContentBuilder; 8 | import org.elasticsearch.rest.RestStatus; 9 | import org.xbib.elasticsearch.plugin.bundle.common.langdetect.Language; 10 | 11 | import java.io.IOException; 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | 15 | import static org.elasticsearch.rest.RestStatus.OK; 16 | 17 | /** 18 | * Language detection response.
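* 
* Rendered through {@code toXContent}, the body of a response looks roughly like the
* following (values are invented for illustration):
* <pre>
* "profile" : "shorttext",
* "languages" : [ { "language" : "de", "probability" : 0.99 } ]
* </pre>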
19 | */ 20 | public class LangdetectResponse extends ActionResponse implements StatusToXContentObject { 21 | 22 | private String profile; 23 | 24 | private List<Language> languages = new ArrayList<>(); 25 | 26 | public String getProfile() { 27 | return profile; 28 | } 29 | 30 | public LangdetectResponse setProfile(String profile) { 31 | this.profile = profile; 32 | return this; 33 | } 34 | 35 | public List<Language> getLanguages() { 36 | return languages; 37 | } 38 | 39 | public LangdetectResponse setLanguages(List<Language> languages) { 40 | this.languages = languages; 41 | return this; 42 | } 43 | 44 | @Override 45 | public XContentBuilder toXContent(XContentBuilder builder, ToXContent.Params params) throws IOException { 46 | if (!Strings.isNullOrEmpty(profile)) { 47 | builder.field("profile", profile); 48 | } 49 | builder.startArray("languages"); 50 | for (Language lang : languages) { 51 | builder.startObject().field("language", lang.getLanguage()) 52 | .field("probability", lang.getProbability()).endObject(); 53 | } 54 | builder.endArray(); 55 | return builder; 56 | } 57 | 58 | @Override 59 | public RestStatus status() { 60 | return OK; 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/action/langdetect/TransportLangdetectAction.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.action.langdetect; 2 | 3 | import org.elasticsearch.action.ActionListener; 4 | import org.elasticsearch.action.support.ActionFilters; 5 | import org.elasticsearch.action.support.TransportAction; 6 | import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver; 7 | import org.elasticsearch.common.inject.Inject; 8 | import org.elasticsearch.common.settings.Settings; 9 | import org.elasticsearch.threadpool.ThreadPool; 10 | import org.elasticsearch.transport.TransportService; 11 | import org.xbib.elasticsearch.plugin.bundle.common.langdetect.LangdetectService; 12 | import org.xbib.elasticsearch.plugin.bundle.common.langdetect.Language; 13 | 14 | import java.util.HashMap; 15 | import java.util.List; 16 | import java.util.Map; 17 | 18 | /** 19 | * Transport action for language detection.
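* Holds one {@code LangdetectService} per profile name in a static map; the default
* profile is registered at construction time and other profiles are created lazily on
* first use (see {@code doExecute} below).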
20 | */ 21 | public class TransportLangdetectAction extends TransportAction<LangdetectRequest, LangdetectResponse> { 22 | 23 | private static final Map<String, LangdetectService> services = new HashMap<>(); 24 | 25 | @Inject 26 | public TransportLangdetectAction(Settings settings, ThreadPool threadPool, 27 | ActionFilters actionFilters, 28 | IndexNameExpressionResolver indexNameExpressionResolver, 29 | TransportService transportService) { 30 | super(settings, LangdetectAction.NAME, threadPool, actionFilters, indexNameExpressionResolver, transportService.getTaskManager()); 31 | services.put("", new LangdetectService(settings)); 32 | } 33 | 34 | @Override 35 | protected void doExecute(LangdetectRequest request, ActionListener<LangdetectResponse> listener) { 36 | String profile = request.getProfile(); 37 | if (profile == null) { 38 | profile = ""; 39 | } 40 | if (!services.containsKey(profile)) { 41 | services.put(profile, new LangdetectService(settings, profile)); 42 | } 43 | List<Language> langs = services.get(profile).detectAll(request.getText()); 44 | listener.onResponse(new LangdetectResponse().setLanguages(langs).setProfile(request.getProfile())); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/action/langdetect/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for language detection action. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.action.langdetect; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/decompound/fst/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for Finite-State-Transducer based decompounder. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.common.decompound.fst; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/decompound/patricia/Node.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.common.decompound.patricia; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | /** 7 | * Node.
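* A node of the compact Patricia trie backing the decompounder: it carries a content
* fragment, a position, the list of classes attached to the fragment, and child nodes.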
8 | */ 9 | class Node { 10 | 11 | private String content; 12 | 13 | private int pos; 14 | 15 | private List<String> classes; 16 | 17 | private List<Node> children; 18 | 19 | Node() { 20 | this.content = ""; 21 | this.classes = new ArrayList<>(); 22 | this.children = new ArrayList<>(); 23 | } 24 | 25 | Node(String content) { 26 | this.content = content; 27 | this.classes = new ArrayList<>(); 28 | this.children = new ArrayList<>(); 29 | } 30 | 31 | public void setContent(String content) { 32 | this.content = content; 33 | } 34 | 35 | public String getContent() { 36 | return content; 37 | } 38 | 39 | public void setPos(int pos) { 40 | this.pos = pos; 41 | } 42 | 43 | public int getPos() { 44 | return pos; 45 | } 46 | 47 | public Node classes(List<String> classes) { 48 | this.classes = classes; 49 | return this; 50 | } 51 | 52 | public List<String> classes() { 53 | return classes; 54 | } 55 | 56 | public Node children(List<Node> children) { 57 | this.children = children; 58 | return this; 59 | } 60 | 61 | public List<Node> children() { 62 | return children; 63 | } 64 | 65 | @Override 66 | public String toString() { 67 | return "[" + content + ',' + classes + ']'; 68 | } 69 | } -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/decompound/patricia/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for Patricia-Trie based decompounder. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.common.decompound.patricia; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/fsa/FSAFlags.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.common.fsa; 2 | 3 | import java.util.Set; 4 | 5 | /** 6 | * FSA automaton flags. Where applicable, flags follow Daciuk's fsa package. 7 | */ 8 | public enum FSAFlags { 9 | /** 10 | * Daciuk: flexible FSA encoding. 11 | */ 12 | FLEXIBLE(1), 13 | 14 | /** 15 | * Daciuk: stop bit in use. 16 | */ 17 | STOPBIT(1 << 1), 18 | 19 | /** 20 | * Daciuk: next bit in use. 21 | */ 22 | NEXTBIT(1 << 2), 23 | 24 | /** 25 | * Daciuk: tails compression. 26 | */ 27 | TAILS(1 << 3), 28 | 29 | /* 30 | * These flags are outside of byte range (never occur in Daciuk's FSA). 31 | */ 32 | 33 | /** 34 | * The FSA contains right-language count numbers on states. 35 | * 36 | * @see FSA#getRightLanguageCount(int) 37 | */ 38 | NUMBERS(1 << 8), 39 | 40 | /** 41 | * The FSA supports legacy built-in separator and filler characters (Daciuk's FSA package 42 | * compatibility). 43 | */ 44 | SEPARATORS(1 << 9); 45 | 46 | /** 47 | * Bit mask for the corresponding flag. 48 | */ 49 | public final int bits; 50 | 51 | FSAFlags(int bits) { 52 | this.bits = bits; 53 | } 54 | 55 | /** 56 | * @param flag flag 57 | * @param flags flags 58 | * @return true if the corresponding flag is set in the bit set.
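* (for illustration, not in the original source: {@code isSet(FLEXIBLE.bits | NUMBERS.bits, NUMBERS)}
* yields {@code true}, while {@code isSet(FLEXIBLE.bits, NUMBERS)} yields {@code false})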
59 | */ 60 | public static boolean isSet(int flags, FSAFlags flag) { 61 | return (flags & flag.bits) != 0; 62 | } 63 | 64 | public static short asShort(Set<FSAFlags> flags) { 65 | short value = 0; 66 | for (FSAFlags f : flags) { 67 | value |= f.bits; 68 | } 69 | return value; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/fsa/MatchResult.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.common.fsa; 2 | 3 | /** 4 | * A matching result returned from {@link FSATraversal}. 5 | * 6 | * @see FSATraversal 7 | */ 8 | public final class MatchResult { 9 | /** 10 | * The automaton has exactly one match for the input sequence. 11 | */ 12 | public static final int EXACT_MATCH = 0; 13 | 14 | /** 15 | * The automaton has no match for the input sequence. 16 | */ 17 | public static final int NO_MATCH = -1; 18 | 19 | /** 20 | * The automaton contains a prefix of the input sequence. That is: 21 | * one of the input sequences used to build the automaton is a 22 | * prefix of the input sequence that is shorter than the sequence. 23 | * {@link MatchResult#index} will contain an index of the 24 | * first character of the input sequence not present in the 25 | * dictionary. 26 | */ 27 | public static final int AUTOMATON_HAS_PREFIX = -3; 28 | 29 | /** 30 | * The sequence is a prefix of at least one sequence in the automaton. 31 | * {@link MatchResult#node} returns the node from which all sequences 32 | * with the given prefix start in the automaton. 33 | */ 34 | public static final int SEQUENCE_IS_A_PREFIX = -4; 35 | 36 | /** 37 | * One of the match kind constants defined in this class. 38 | * 39 | * @see #NO_MATCH 40 | * @see #EXACT_MATCH 41 | * @see #AUTOMATON_HAS_PREFIX 42 | * @see #SEQUENCE_IS_A_PREFIX 43 | */ 44 | private int kind; 45 | 46 | /** 47 | * Input sequence's index, interpretation depends on {@link #kind}. 48 | */ 49 | private int index; 50 | 51 | /** 52 | * Automaton node, interpretation depends on the {@link #kind}. 53 | */ 54 | private int node; 55 | 56 | /** 57 | * Constructor. 58 | */ 59 | public MatchResult() { 60 | reset(NO_MATCH, 0, 0); 61 | } 62 | 63 | /** 64 | * Reset. 65 | * @param kind kind 66 | * @param index index 67 | * @param node node 68 | */ 69 | void reset(int kind, int index, int node) { 70 | this.kind = kind; 71 | this.index = index; 72 | this.node = node; 73 | } 74 | 75 | public int getKind() { 76 | return kind; 77 | } 78 | 79 | public int getIndex() { 80 | return index; 81 | } 82 | 83 | public int getNode() { 84 | return node; 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/fsa/StateVisitor.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.common.fsa; 2 | 3 | /** 4 | * State visitor.
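* Implementations receive each visited state number; judging from the boolean return
* type (an assumption, not stated in the original source), returning {@code false}
* from {@code accept} stops the traversal early.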
5 | * 6 | * @see FSA#visitInPostOrder(StateVisitor) 7 | * @see FSA#visitInPreOrder(StateVisitor) 8 | */ 9 | 10 | @FunctionalInterface 11 | public interface StateVisitor { 12 | 13 | boolean accept(int state); 14 | } 15 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/langdetect/LangProfile.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.common.langdetect; 2 | 3 | import org.elasticsearch.common.xcontent.XContentHelper; 4 | import org.elasticsearch.common.xcontent.json.JsonXContent; 5 | 6 | import java.io.IOException; 7 | import java.io.InputStream; 8 | import java.util.ArrayList; 9 | import java.util.HashMap; 10 | import java.util.List; 11 | import java.util.Map; 12 | 13 | /** 14 | * Language profile. 15 | */ 16 | public class LangProfile { 17 | 18 | private String name; 19 | 20 | private Map<String, Integer> freq; 21 | 22 | private List<Integer> nWords; 23 | 24 | public LangProfile() { 25 | this.freq = new HashMap<>(); 26 | this.nWords = new ArrayList<>(NGram.N_GRAM); 27 | for (int i = 0; i < NGram.N_GRAM; i++) { 28 | nWords.add(0); 29 | } 30 | } 31 | 32 | public void add(String gram) { 33 | if (name == null || gram == null) { 34 | return; 35 | } 36 | int len = gram.length(); 37 | if (len < 1 || len > NGram.N_GRAM) { 38 | return; 39 | } 40 | nWords.set(len - 1, nWords.get(len - 1) + 1); 41 | if (freq.containsKey(gram)) { 42 | freq.put(gram, freq.get(gram) + 1); 43 | } else { 44 | freq.put(gram, 1); 45 | } 46 | } 47 | 48 | public String getName() { 49 | return name; 50 | } 51 | 52 | public void setName(String name) { 53 | this.name = name; 54 | } 55 | 56 | public List<Integer> getNWords() { 57 | return nWords; 58 | } 59 | 60 | public Map<String, Integer> getFreq() { 61 | return freq; 62 | } 63 | 64 | public void setFreq(Map<String, Integer> freq) { 65 | this.freq = freq; 66 | } 67 | 68 | @SuppressWarnings("unchecked") 69 | public void read(InputStream input) throws IOException { 70 | Map<String, Object> map = XContentHelper.convertToMap(JsonXContent.jsonXContent, input, true); 71 | freq = (Map<String, Integer>) map.get("freq"); 72 | name = (String) map.get("name"); 73 | nWords = (List<Integer>) map.get("n_words"); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/langdetect/Language.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.common.langdetect; 2 | 3 | import org.elasticsearch.common.io.stream.StreamInput; 4 | import org.elasticsearch.common.io.stream.StreamOutput; 5 | import org.elasticsearch.common.io.stream.Streamable; 6 | 7 | import java.io.IOException; 8 | 9 | /** 10 | * Language.
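* A single detection result: a language code such as "de" or "en", as used by the
* bundled profiles, paired with the probability the detector assigned to it.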
11 | */ 12 | public class Language implements Streamable { 13 | 14 | private String lang; 15 | 16 | private double prob; 17 | 18 | public Language(String lang, double prob) { 19 | this.lang = lang; 20 | this.prob = prob; 21 | } 22 | 23 | public String getLanguage() { 24 | return lang; 25 | } 26 | 27 | public double getProbability() { 28 | return prob; 29 | } 30 | 31 | @Override 32 | public void readFrom(StreamInput in) throws IOException { 33 | this.lang = in.readString(); 34 | this.prob = in.readDouble(); 35 | } 36 | 37 | @Override 38 | public void writeTo(StreamOutput out) throws IOException { 39 | out.writeString(lang); 40 | out.writeDouble(prob); 41 | } 42 | 43 | @Override 44 | public String toString() { 45 | return lang + " (prob=" + prob + ")"; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/langdetect/LanguageDetectionException.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.common.langdetect; 2 | 3 | import java.io.IOException; 4 | 5 | /** 6 | * Language detection exception. 7 | */ 8 | public class LanguageDetectionException extends IOException { 9 | 10 | public LanguageDetectionException(String message) { 11 | super(message); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/langdetect/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for language detection implementation. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.common.langdetect; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/reference/ReferenceService.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.common.reference; 2 | 3 | import org.elasticsearch.client.Client; 4 | import org.elasticsearch.common.component.AbstractLifecycleComponent; 5 | import org.elasticsearch.common.inject.Inject; 6 | import org.elasticsearch.common.inject.Injector; 7 | import org.elasticsearch.common.settings.Settings; 8 | import org.xbib.elasticsearch.plugin.bundle.index.mapper.reference.ReferenceMapperTypeParser; 9 | 10 | /** 11 | * Reference service. 
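* On start, this component fetches the node {@code Client} from the injector and hands
* it to the {@code ReferenceMapperTypeParser} (see {@code doStart} below); presumably
* the reference mapper uses that client to fetch referenced documents.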
12 | */ 13 | public class ReferenceService extends AbstractLifecycleComponent { 14 | 15 | private final Injector injector; 16 | 17 | @Inject 18 | public ReferenceService(Settings settings, Injector injector) { 19 | super(settings); 20 | this.injector = injector; 21 | } 22 | 23 | @Override 24 | protected void doStart() { 25 | // get the client from the injector 26 | Client client = injector.getInstance(Client.class); 27 | // copy the client to the mapper type parser 28 | ReferenceMapperTypeParser referenceMapperTypeParser = injector.getInstance(ReferenceMapperTypeParser.class); 29 | referenceMapperTypeParser.setClient(client); 30 | } 31 | 32 | @Override 33 | protected void doStop() { 34 | // nothing to stop 35 | } 36 | 37 | @Override 38 | protected void doClose() { 39 | // nothing to close 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/reference/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for reference mapper implementation. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.common.reference; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/standardnumber/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for standard number implementation. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.common.standardnumber; -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/autophrase/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for auto phrase token filter. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.autophrase; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/baseform/BaseformTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.baseform; 2 | 3 | import org.apache.lucene.analysis.TokenStream; 4 | import org.elasticsearch.ElasticsearchException; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.IndexSettings; 8 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 9 | import org.xbib.elasticsearch.plugin.bundle.common.fsa.Dictionary; 10 | 11 | import java.io.IOException; 12 | import java.io.InputStreamReader; 13 | import java.nio.charset.StandardCharsets; 14 | 15 | /** 16 | * Base form token filter factory. 
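 * <p>
 * A minimal settings sketch for this filter (the filter type name "baseform"
 * is an assumption; "language" and "respect_keywords" are the keys this
 * factory actually reads, with a German lemma dictionary as the default):
 * <pre>{@code
 * "filter" : {
 *   "my_baseform" : {
 *     "type" : "baseform",
 *     "language" : "de",
 *     "respect_keywords" : true
 *   }
 * }
 * }</pre>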
17 | */ 18 | public class BaseformTokenFilterFactory extends AbstractTokenFilterFactory { 19 | 20 | private final boolean respectKeywords; 21 | 22 | private final Dictionary dictionary; 23 | 24 | public BaseformTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { 25 | super(indexSettings, name, settings); 26 | this.respectKeywords = settings.getAsBoolean("respect_keywords", false); 27 | this.dictionary = createDictionary(settings); 28 | } 29 | 30 | @Override 31 | public TokenStream create(TokenStream tokenStream) { 32 | return new BaseformTokenFilter(tokenStream, dictionary, respectKeywords); 33 | } 34 | 35 | private Dictionary createDictionary(Settings settings) { 36 | try { 37 | String lang = settings.get("language", "de"); 38 | String path = lang + "-lemma-utf8.txt"; 39 | return new Dictionary().loadLines(new InputStreamReader(getClass().getResourceAsStream(path), StandardCharsets.UTF_8)); 40 | } catch (IOException e) { 41 | throw new ElasticsearchException("resources in settings not found: " + settings, e); 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/baseform/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for baseform token filter. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.baseform; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/concat/ConcatTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.concat; 2 | 3 | import org.apache.lucene.analysis.TokenStream; 4 | import org.elasticsearch.common.settings.Settings; 5 | import org.elasticsearch.env.Environment; 6 | import org.elasticsearch.index.IndexSettings; 7 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 8 | 9 | /** 10 | * Concat token filter factory. 11 | */ 12 | public class ConcatTokenFilterFactory extends AbstractTokenFilterFactory { 13 | 14 | public ConcatTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { 15 | super(indexSettings, name, settings); 16 | } 17 | 18 | @Override 19 | public TokenStream create(TokenStream tokenStream) { 20 | return new ConcatTokenFilter(tokenStream); 21 | 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/concat/PairTokenFilter.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.concat; 2 | 3 | import org.apache.lucene.analysis.TokenFilter; 4 | import org.apache.lucene.analysis.TokenStream; 5 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 6 | 7 | import java.io.IOException; 8 | import java.util.ArrayDeque; 9 | import java.util.Deque; 10 | import java.util.LinkedList; 11 | import java.util.Map; 12 | import java.util.Queue; 13 | 14 | /** 15 | * Pair token filter. 
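 * <p>
 * Joins configured token pairs into a single token, e.g. "New" followed by
 * "York" becomes "New York". Instances are created by {@link PairTokenFilterFactory},
 * which reads the "pairs" settings object; a sketch (the filter type name
 * "pair" is an assumption):
 * <pre>{@code
 * "filter" : {
 *   "my_pairs" : {
 *     "type" : "pair",
 *     "pairs" : { "New" : "York" }
 *   }
 * }
 * }</pre>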
16 | */
17 | public final class PairTokenFilter extends TokenFilter {
18 |
19 |     private final CharTermAttribute termAttr;
20 |
21 |     private final Map<String, String> pairs;
22 |
23 |     private final Queue<String> queue;
24 |
25 |     protected PairTokenFilter(TokenStream input, Map<String, String> pairs) {
26 |         super(input);
27 |         this.termAttr = addAttribute(CharTermAttribute.class);
28 |         this.pairs = pairs;
29 |         this.queue = new LinkedList<>();
30 |     }
31 |
32 |     @Override
33 |     public boolean incrementToken() throws IOException {
34 |         if (!queue.isEmpty()) {
35 |             termAttr.setEmpty().append(queue.poll()); // clear the previous term before emitting a queued token
36 |             return true;
37 |         }
38 |         if (!input.incrementToken()) {
39 |             return false;
40 |         }
41 |         Deque<String> stack = new ArrayDeque<>();
42 |         while (pairs.containsKey(termAttr.toString())) {
43 |             String term = termAttr.toString();
44 |             stack.push(term);
45 |             if (!input.incrementToken()) {
46 |                 break;
47 |             }
48 |             String next = termAttr.toString();
49 |             if (pairs.get(term).equals(next)) {
50 |                 stack.pop();
51 |                 stack.push(term + " " + next);
52 |                 break;
53 |             } else if (!pairs.containsKey(next)) {
54 |                 stack.push(next);
55 |             }
56 |         }
57 |         for (String term : stack) {
58 |             queue.add(term);
59 |         }
60 |         if (!queue.isEmpty()) {
61 |             termAttr.setEmpty().append(queue.poll());
62 |         }
63 |         return true;
64 |     }
65 |
66 |     @Override
67 |     public boolean equals(Object object) {
68 |         return object instanceof PairTokenFilter &&
69 |                 pairs.equals(((PairTokenFilter) object).pairs);
70 |     }
71 |
72 |     @Override
73 |     public int hashCode() {
74 |         return pairs.hashCode();
75 |     }
76 | }
77 |
--------------------------------------------------------------------------------
/src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/concat/PairTokenFilterFactory.java:
--------------------------------------------------------------------------------
1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.concat;
2 |
3 | import org.apache.lucene.analysis.TokenStream;
4 | import org.elasticsearch.common.settings.Settings;
5 | import org.elasticsearch.env.Environment;
6 | import org.elasticsearch.index.IndexSettings;
7 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
8 |
9 | import java.util.LinkedHashMap;
10 | import java.util.Map;
11 |
12 | /**
13 |  * Pair token filter factory.
14 |  */
15 | public class PairTokenFilterFactory extends AbstractTokenFilterFactory {
16 |
17 |     private final Map<String, String> pairs;
18 |
19 |     public PairTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
20 |         super(indexSettings, name, settings);
21 |         this.pairs = new LinkedHashMap<>();
22 |         Settings pairsSettings = settings.getAsSettings("pairs");
23 |         for (String key : pairsSettings.keySet()) {
24 |             pairs.put(key, pairsSettings.get(key));
25 |         }
26 |     }
27 |
28 |     @Override
29 |     public TokenStream create(TokenStream tokenStream) {
30 |         return new PairTokenFilter(tokenStream, pairs);
31 |     }
32 | }
33 |
--------------------------------------------------------------------------------
/src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/concat/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * Classes for concat token filter.
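 * <p>
 * A sketch of wiring the concat filter into an analyzer (the filter name
 * "concat" is an assumption based on this package's naming):
 * <pre>{@code
 * "analyzer" : {
 *   "my_concat" : {
 *     "tokenizer" : "standard",
 *     "filter" : [ "concat" ]
 *   }
 * }
 * }</pre>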
3 |  */
4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.concat;
5 |
--------------------------------------------------------------------------------
/src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/decompound/fst/FstDecompoundTokenFilterFactory.java:
--------------------------------------------------------------------------------
1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.decompound.fst;
2 |
3 | import org.apache.lucene.analysis.TokenStream;
4 | import org.elasticsearch.common.settings.Settings;
5 | import org.elasticsearch.env.Environment;
6 | import org.elasticsearch.index.IndexSettings;
7 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
8 | import org.xbib.elasticsearch.plugin.bundle.common.decompound.fst.FstDecompounder;
9 |
10 | import java.io.IOException;
11 | import java.util.List;
12 |
13 | /**
14 |  * Finite state decompound token filter factory.
15 |  */
16 | public class FstDecompoundTokenFilterFactory extends AbstractTokenFilterFactory {
17 |
18 |     private final FstDecompounder decompounder;
19 |
20 |     private final boolean respectKeywords;
21 |
22 |     private final boolean subwordsOnly;
23 |
24 |     public FstDecompoundTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name,
25 |                                            Settings settings) {
26 |         super(indexSettings, name, settings);
27 |         this.decompounder = createDecompounder(settings);
28 |         this.respectKeywords = settings.getAsBoolean("respect_keywords", false);
29 |         this.subwordsOnly = settings.getAsBoolean("subwords_only", false);
30 |     }
31 |
32 |     @Override
33 |     public TokenStream create(TokenStream tokenStream) {
34 |         return new FstDecompoundTokenFilter(tokenStream, decompounder, respectKeywords, subwordsOnly);
35 |     }
36 |
37 |     private FstDecompounder createDecompounder(Settings settings) {
38 |         try {
39 |             String words = settings.get("fst", "words.fst");
40 |             List<String> glueMorphs = settings.getAsList("glue_morphs");
41 |             return new FstDecompounder(getClass().getResourceAsStream(words), glueMorphs);
42 |         } catch (IOException e) {
43 |             throw new IllegalArgumentException("fst decompounder resources in settings not found: " + settings, e);
44 |         }
45 |     }
46 | }
47 |
--------------------------------------------------------------------------------
/src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/decompound/fst/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * Finite-state-transducer (FST) based decompound token filter.
3 |  */
4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.decompound.fst;
5 |
--------------------------------------------------------------------------------
/src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/decompound/patricia/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * Patricia-Trie based decompounder.
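 * <p>
 * A settings sketch (the filter type name "decompound" is an assumption, and
 * the keys shown mirror those of the FST variant above):
 * <pre>{@code
 * "filter" : {
 *   "my_decompound" : {
 *     "type" : "decompound",
 *     "respect_keywords" : true,
 *     "subwords_only" : false
 *   }
 * }
 * }</pre>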
3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.decompound.patricia; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/german/GermanNormalizationFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.german; 2 | 3 | import org.apache.lucene.analysis.TokenStream; 4 | import org.apache.lucene.analysis.de.GermanNormalizationFilter; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.IndexSettings; 8 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 9 | 10 | /** 11 | * German normalization filter factory. 12 | */ 13 | public class GermanNormalizationFilterFactory extends AbstractTokenFilterFactory { 14 | 15 | public GermanNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, 16 | Settings settings) { 17 | super(indexSettings, name, settings); 18 | } 19 | 20 | @Override 21 | public TokenStream create(TokenStream tokenStream) { 22 | return new GermanNormalizationFilter(tokenStream); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/german/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * German normalization filter. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.german; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/hyphen/HyphenAnalyzer.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.hyphen; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.apache.lucene.analysis.Tokenizer; 5 | 6 | /** 7 | * Hyphen analyzer. 8 | */ 9 | public class HyphenAnalyzer extends Analyzer { 10 | 11 | private final HyphenTokenizerFactory tokenizerFactory; 12 | 13 | public HyphenAnalyzer(HyphenTokenizerFactory tokenizerFactory) { 14 | this.tokenizerFactory = tokenizerFactory; 15 | } 16 | 17 | @Override 18 | protected TokenStreamComponents createComponents(String fieldName) { 19 | Tokenizer tokenizer = tokenizerFactory.create(); 20 | return new TokenStreamComponents(tokenizer, tokenizer); 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/hyphen/HyphenTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.hyphen; 2 | 3 | import org.apache.lucene.analysis.TokenStream; 4 | import org.elasticsearch.common.settings.Settings; 5 | import org.elasticsearch.env.Environment; 6 | import org.elasticsearch.index.IndexSettings; 7 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 8 | 9 | /** 10 | * Hyphen token filter factory. 
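 * <p>
 * A minimal settings sketch (the filter type name "hyphen" is an assumption;
 * "hyphens", "subwords" and "respect_keywords" are the keys this factory reads):
 * <pre>{@code
 * "filter" : {
 *   "my_hyphen" : {
 *     "type" : "hyphen",
 *     "subwords" : true,
 *     "respect_keywords" : false
 *   }
 * }
 * }</pre>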
11 | */ 12 | public class HyphenTokenFilterFactory extends AbstractTokenFilterFactory { 13 | 14 | private final char[] hyphenchars; 15 | 16 | private final boolean subwords; 17 | 18 | private final boolean respectKeywords; 19 | 20 | public HyphenTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, 21 | Settings settings) { 22 | super(indexSettings, name, settings); 23 | this.hyphenchars = settings.get("hyphens") != null ? settings.get("hyphens").toCharArray() : HyphenTokenFilter.HYPHEN; 24 | this.subwords = settings.getAsBoolean("subwords", true); 25 | this.respectKeywords = settings.getAsBoolean("respect_keywords", false); 26 | } 27 | 28 | @Override 29 | public TokenStream create(TokenStream tokenStream) { 30 | return new HyphenTokenFilter(tokenStream, hyphenchars, subwords, respectKeywords); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/hyphen/HyphenTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.hyphen; 2 | 3 | import org.apache.lucene.analysis.Tokenizer; 4 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.IndexSettings; 8 | import org.elasticsearch.index.analysis.AbstractTokenizerFactory; 9 | 10 | /** 11 | * Hyphen tokenizer factory. 12 | */ 13 | public class HyphenTokenizerFactory extends AbstractTokenizerFactory { 14 | 15 | private final Integer maxTokenLength; 16 | 17 | public HyphenTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, 18 | Settings settings) { 19 | super(indexSettings, name, settings); 20 | this.maxTokenLength = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH); 21 | } 22 | 23 | @Override 24 | public Tokenizer create() { 25 | return new HyphenTokenizer(maxTokenLength); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/hyphen/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for hyphen analysis. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.hyphen; -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/IcuCollationKeyAnalyzer.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu; 2 | 3 | import com.ibm.icu.text.Collator; 4 | import org.apache.lucene.analysis.Analyzer; 5 | import org.apache.lucene.analysis.core.KeywordTokenizer; 6 | import org.apache.lucene.collation.CollationKeyAnalyzer; 7 | 8 | /** 9 | * Configures a {@link KeywordTokenizer} with an {@link IcuCollationAttributeFactory}. 10 | *

11 |  * Converts the token into its {@link com.ibm.icu.text.CollationKey} and
12 |  * then encodes the CollationKey directly.
13 |  * </p>
14 |  * <p>
15 |  * WARNING: Make sure you use exactly the same Collator at
16 |  * index and query time -- CollationKeys are only comparable when produced by
17 |  * the same Collator. {@link com.ibm.icu.text.RuleBasedCollator}s are
18 |  * independently versioned, so it is safe to search against stored
19 |  * CollationKeys if the following are exactly the same (best practice is
20 |  * to store this information with the index and check that they remain the
21 |  * same at query time):
22 |  * </p>
23 |  * <ol>
24 |  * <li>
25 |  * Collator version - see {@link Collator#getVersion()}
26 |  * </li>
27 |  * <li>
28 |  * The collation strength used - see {@link Collator#setStrength(int)}
29 |  * </li>
30 |  * </ol>
31 |  * <p>
32 |  * CollationKeys generated by ICU Collators are not compatible with those
33 |  * generated by java.text.Collators. Specifically, if you use
34 |  * ICUCollationKeyAnalyzer to generate index terms, do not use
35 |  * {@link CollationKeyAnalyzer} on the query side, or vice versa.
36 |  * </p>
37 |  * <p>
38 |  * ICUCollationKeyAnalyzer is significantly faster and generates significantly
39 |  * shorter keys than CollationKeyAnalyzer. See
40 |  * <a href="http://site.icu-project.org/charts/collation-icu4j-sun">
41 |  * http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
42 |  * generation timing and key length comparisons between ICU4J and
43 |  * java.text.Collator over several languages.
44 |  * </p>
45 | */ 46 | public final class IcuCollationKeyAnalyzer extends Analyzer { 47 | 48 | private final IcuCollationAttributeFactory factory; 49 | 50 | public IcuCollationKeyAnalyzer(Collator collator) { 51 | this.factory = new IcuCollationAttributeFactory(collator); 52 | } 53 | 54 | @Override 55 | protected TokenStreamComponents createComponents(String fieldName) { 56 | KeywordTokenizer tokenizer = new KeywordTokenizer(factory, KeywordTokenizer.DEFAULT_BUFFER_SIZE); 57 | return new TokenStreamComponents(tokenizer, tokenizer); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/IcuCollationTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu; 2 | 3 | import com.ibm.icu.text.Collator; 4 | import org.apache.lucene.analysis.Tokenizer; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.IndexSettings; 8 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.segmentation.IcuTokenizer; 9 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.segmentation.IcuTokenizerFactory; 10 | 11 | /** 12 | * This {@link IcuTokenizer} uses an ICU @{@link Collator} as a char attribute factory. 13 | */ 14 | public class IcuCollationTokenizerFactory extends IcuTokenizerFactory { 15 | 16 | private final IcuCollationAttributeFactory factory; 17 | 18 | public IcuCollationTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, 19 | Settings settings) { 20 | super(indexSettings, environment, name, settings); 21 | this.factory = new IcuCollationAttributeFactory(IcuCollationKeyAnalyzerProvider.createCollator(settings)); 22 | } 23 | 24 | @Override 25 | public Tokenizer create() { 26 | return new IcuTokenizer(factory, config); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/IcuFoldingCharFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu; 2 | 3 | import org.elasticsearch.common.settings.Settings; 4 | import org.elasticsearch.env.Environment; 5 | import org.elasticsearch.index.IndexSettings; 6 | 7 | import java.io.InputStream; 8 | 9 | /** 10 | * Applies foldings from UTR#30 Character Foldings. 11 | * Can be filtered to handle certain characters in a specified way. 12 | * See http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html 13 | * E.g national chars that should be retained, like unicodeSetFilter : "[^åäöÅÄÖ]". 
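 * <p>
 * Note that this factory reads the "unicode_set_filter" key. A settings sketch
 * (the char filter type name "icu_folding" is an assumption):
 * <pre>{@code
 * "char_filter" : {
 *   "my_folding" : {
 *     "type" : "icu_folding",
 *     "unicode_set_filter" : "[^åäöÅÄÖ]"
 *   }
 * }
 * }</pre>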
14 | */ 15 | public class IcuFoldingCharFilterFactory extends IcuNormalizerCharFilterFactory { 16 | 17 | public IcuFoldingCharFilterFactory(IndexSettings indexSettings, Environment environment, String name, 18 | Settings settings) { 19 | super(indexSettings, environment, name, settings); 20 | } 21 | 22 | @Override 23 | protected String getNormalizationName(Settings settings) { 24 | return settings.get("normalization_name", "utr30"); 25 | } 26 | 27 | @Override 28 | protected InputStream getNormalizationResource(Settings settings) { 29 | InputStream inputStream = null; 30 | if ("utr30".equals(getNormalizationName(settings))) { 31 | inputStream = getClass().getResourceAsStream("utr30.nrm"); 32 | } 33 | return inputStream; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/IcuFoldingTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu; 2 | 3 | import org.elasticsearch.common.settings.Settings; 4 | import org.elasticsearch.env.Environment; 5 | import org.elasticsearch.index.IndexSettings; 6 | 7 | import java.io.InputStream; 8 | 9 | /** 10 | * Applies foldings from UTR#30 Character Foldings. 11 | * Can be filtered to handle certain characters in a specified way. 12 | * See http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html 13 | * E.g national chars that should be retained, like unicode_set_filter : "[^åäöÅÄÖ]". 14 | */ 15 | public class IcuFoldingTokenFilterFactory extends IcuNormalizerTokenFilterFactory { 16 | 17 | public IcuFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, 18 | Settings settings) { 19 | super(indexSettings, environment, name, settings); 20 | } 21 | 22 | @Override 23 | public Object getMultiTermComponent() { 24 | return this; 25 | } 26 | 27 | @Override 28 | protected String getNormalizationName(Settings settings) { 29 | return settings.get("normalization_name", "utr30"); 30 | } 31 | 32 | @Override 33 | protected InputStream getNormalizationResource(Settings settings) { 34 | InputStream inputStream = null; 35 | if ("utr30".equals(getNormalizationName(settings))) { 36 | inputStream = getClass().getResourceAsStream("utr30.nrm"); 37 | } 38 | return inputStream; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/IcuNormalizerCharFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu; 2 | 3 | import com.ibm.icu.text.FilteredNormalizer2; 4 | import com.ibm.icu.text.Normalizer2; 5 | import com.ibm.icu.text.UnicodeSet; 6 | import org.elasticsearch.common.settings.Settings; 7 | import org.elasticsearch.env.Environment; 8 | import org.elasticsearch.index.IndexSettings; 9 | import org.elasticsearch.index.analysis.AbstractCharFilterFactory; 10 | import org.elasticsearch.index.analysis.MultiTermAwareComponent; 11 | 12 | import java.io.InputStream; 13 | import java.io.Reader; 14 | 15 | /** 16 | * ICU normalizer char filter factory. 
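 * <p>
 * Reads "normalization_name" (default "nfkc_cf"), "normalization_mode"
 * (default "compose") and "unicode_set_filter". A settings sketch (the char
 * filter type name "icu_normalizer" is an assumption):
 * <pre>{@code
 * "char_filter" : {
 *   "my_normalizer" : {
 *     "type" : "icu_normalizer",
 *     "normalization_name" : "nfkc_cf",
 *     "normalization_mode" : "decompose"
 *   }
 * }
 * }</pre>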
17 | */ 18 | public class IcuNormalizerCharFilterFactory extends AbstractCharFilterFactory implements MultiTermAwareComponent { 19 | 20 | private final Normalizer2 normalizer; 21 | 22 | public IcuNormalizerCharFilterFactory(IndexSettings indexSettings, Environment environment, String name, 23 | Settings settings) { 24 | super(indexSettings, name); 25 | Normalizer2 base = Normalizer2.getInstance(getNormalizationResource(settings), 26 | getNormalizationName(settings), getNormalizationMode(settings)); 27 | String unicodeSetFilter = settings.get("unicode_set_filter"); 28 | this.normalizer = unicodeSetFilter != null ? 29 | new FilteredNormalizer2(base, new UnicodeSet(unicodeSetFilter).freeze()) : base; 30 | } 31 | 32 | @Override 33 | public Reader create(Reader reader) { 34 | return new IcuNormalizerCharFilter(reader, normalizer); 35 | } 36 | 37 | @Override 38 | public Object getMultiTermComponent() { 39 | return this; 40 | } 41 | 42 | protected InputStream getNormalizationResource(Settings settings) { 43 | InputStream inputStream = null; 44 | if ("utr30".equals(getNormalizationName(settings))) { 45 | inputStream = getClass().getResourceAsStream("utr30.nrm"); 46 | } 47 | return inputStream; 48 | } 49 | 50 | protected String getNormalizationName(Settings settings) { 51 | return settings.get("normalization_name", "nfkc_cf"); 52 | } 53 | 54 | protected Normalizer2.Mode getNormalizationMode(Settings settings) { 55 | Normalizer2.Mode normalizationMode; 56 | switch (settings.get("normalization_mode", "compose")) { 57 | case "compose_contiguous": 58 | normalizationMode = Normalizer2.Mode.COMPOSE_CONTIGUOUS; 59 | break; 60 | case "decompose": 61 | normalizationMode = Normalizer2.Mode.DECOMPOSE; 62 | break; 63 | case "fcd": 64 | normalizationMode = Normalizer2.Mode.FCD; 65 | break; 66 | default: 67 | normalizationMode = Normalizer2.Mode.COMPOSE; 68 | break; 69 | } 70 | return normalizationMode; 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/IcuNormalizerFilter.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu; 2 | 3 | import com.ibm.icu.text.Normalizer; 4 | import com.ibm.icu.text.Normalizer2; 5 | import org.apache.lucene.analysis.TokenFilter; 6 | import org.apache.lucene.analysis.TokenStream; 7 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 8 | 9 | import java.io.IOException; 10 | 11 | /** 12 | * Normalize token text with ICU {@link Normalizer2}. 13 | *

14 |  * With this filter, you can normalize text in the following ways:
15 |  * <ul>
16 |  *   <li>NFKC Normalization, Case Folding, and removing Ignorables (the default)</li>
17 |  *   <li>Using a standard Normalization mode (NFC, NFD, NFKC, NFKD)</li>
18 |  *   <li>Based on rules from a custom normalization mapping.</li>
19 |  * </ul>
20 |  * <p>
21 |  * If you use the defaults, this filter is a simple way to standardize Unicode text
22 |  * in a language-independent way for search:
23 |  * <ul>
24 |  *   <li>The case folding that it does can be seen as a replacement for
25 |  *   LowerCaseFilter: For example, it handles cases such as the Greek sigma, so that
26 |  *   "Μάϊος" and "ΜΆΪΟΣ" will match correctly.</li>
27 |  *   <li>The normalization will standardize different forms of the same
28 |  *   character in Unicode. For example, CJK full-width numbers will be standardized
29 |  *   to their ASCII forms.</li>
30 |  *   <li>Ignorables such as Zero-Width Joiner and Variation Selectors are removed.
31 |  *   These are typically modifier characters that affect display.</li>
32 |  * </ul>
33 | * 34 | * @see Normalizer2 35 | * @see com.ibm.icu.text.FilteredNormalizer2 36 | */ 37 | public class IcuNormalizerFilter extends TokenFilter { 38 | 39 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 40 | 41 | private final Normalizer2 normalizer; 42 | 43 | private final StringBuilder buffer = new StringBuilder(); 44 | 45 | /** 46 | * Create a new Normalizer2Filter with the specified Normalizer2. 47 | * 48 | * @param input stream 49 | * @param normalizer normalizer to use 50 | */ 51 | public IcuNormalizerFilter(TokenStream input, Normalizer2 normalizer) { 52 | super(input); 53 | this.normalizer = normalizer; 54 | } 55 | 56 | @Override 57 | public final boolean incrementToken() throws IOException { 58 | if (input.incrementToken()) { 59 | if (normalizer.quickCheck(termAtt) != Normalizer.YES) { 60 | buffer.setLength(0); 61 | normalizer.normalize(termAtt, buffer); 62 | termAtt.setEmpty().append(buffer); 63 | } 64 | return true; 65 | } else { 66 | return false; 67 | } 68 | } 69 | 70 | @Override 71 | public boolean equals(Object object) { 72 | return object instanceof IcuNormalizerFilter; 73 | } 74 | 75 | @Override 76 | public int hashCode() { 77 | return 0; 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/IcuNormalizerTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu; 2 | 3 | import com.ibm.icu.text.FilteredNormalizer2; 4 | import com.ibm.icu.text.Normalizer2; 5 | import com.ibm.icu.text.UnicodeSet; 6 | import org.apache.lucene.analysis.TokenStream; 7 | import org.elasticsearch.common.settings.Settings; 8 | import org.elasticsearch.env.Environment; 9 | import org.elasticsearch.index.IndexSettings; 10 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 11 | import org.elasticsearch.index.analysis.MultiTermAwareComponent; 12 | 13 | import java.io.InputStream; 14 | 15 | /** 16 | * Uses the {@link IcuNormalizerFilter} to normalize tokens. 17 | * 18 | * The name can be used to provide the type of normalization to perform, 19 | * the mode can be used to provide the mode of normalization. 20 | */ 21 | public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { 22 | 23 | private final Normalizer2 normalizer; 24 | 25 | public IcuNormalizerTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, 26 | Settings settings) { 27 | super(indexSettings, name, settings); 28 | 29 | Normalizer2 base = Normalizer2.getInstance(getNormalizationResource(settings), 30 | getNormalizationName(settings), getNormalizationMode(settings)); 31 | 32 | String unicodeSetFilter = settings.get("unicode_set_filter"); 33 | this.normalizer = unicodeSetFilter != null ? 
34 | new FilteredNormalizer2(base, new UnicodeSet(unicodeSetFilter).freeze()) : base; 35 | } 36 | 37 | @Override 38 | public TokenStream create(TokenStream tokenStream) { 39 | return new IcuNormalizerFilter(tokenStream, normalizer); 40 | } 41 | 42 | @Override 43 | public Object getMultiTermComponent() { 44 | return this; 45 | } 46 | 47 | protected InputStream getNormalizationResource(Settings settings) { 48 | InputStream inputStream = null; 49 | if ("utr30".equals(getNormalizationName(settings))) { 50 | inputStream = getClass().getResourceAsStream("utr30.nrm"); 51 | } 52 | return inputStream; 53 | } 54 | 55 | protected String getNormalizationName(Settings settings) { 56 | return settings.get("normalization_name", "nfkc_cf"); 57 | } 58 | 59 | protected Normalizer2.Mode getNormalizationMode(Settings settings) { 60 | Normalizer2.Mode normalizationMode; 61 | switch (settings.get("normalization_mode", "compose")) { 62 | case "compose_contiguous": 63 | normalizationMode = Normalizer2.Mode.COMPOSE_CONTIGUOUS; 64 | break; 65 | case "decompose": 66 | normalizationMode = Normalizer2.Mode.DECOMPOSE; 67 | break; 68 | case "fcd": 69 | normalizationMode = Normalizer2.Mode.FCD; 70 | break; 71 | default: 72 | normalizationMode = Normalizer2.Mode.COMPOSE; 73 | break; 74 | } 75 | return normalizationMode; 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/IcuNumberFormatTokenFilter.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu; 2 | 3 | import com.ibm.icu.text.NumberFormat; 4 | import org.apache.lucene.analysis.TokenFilter; 5 | import org.apache.lucene.analysis.TokenStream; 6 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 7 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 8 | 9 | import java.io.IOException; 10 | import java.text.ParsePosition; 11 | 12 | /** 13 | * ICU number format token filter. 
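 * <p>
 * Parses a spelled-out number token and re-spells it in canonical form,
 * stripping soft hyphens and spaces. A construction sketch using ICU's
 * rule-based spellout format (the German input is illustrative):
 * <pre>{@code
 * NumberFormat spellout = new RuleBasedNumberFormat(ULocale.GERMAN, RuleBasedNumberFormat.SPELLOUT);
 * TokenStream stream = new IcuNumberFormatTokenFilter(input, spellout);
 * // "zehn-tausend" is parsed to 10000 and re-spelled as "zehntausend"
 * }</pre>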
14 | */ 15 | public final class IcuNumberFormatTokenFilter extends TokenFilter { 16 | 17 | private final NumberFormat numberFormat; 18 | 19 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 20 | private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); 21 | 22 | public IcuNumberFormatTokenFilter(TokenStream input, NumberFormat numberFormat) { 23 | super(input); 24 | this.numberFormat = numberFormat; 25 | } 26 | 27 | @Override 28 | public boolean incrementToken() throws IOException { 29 | if (!input.incrementToken()) { 30 | return false; 31 | } else { 32 | String s = termAtt.toString(); 33 | ParsePosition parsePosition = new ParsePosition(0); 34 | Number result = numberFormat.parse(s, parsePosition); 35 | if (parsePosition.getIndex() > 0) { 36 | // zehn-tausend -> zehntausend 37 | // one hundred thousand -> onehundredthousand 38 | s = numberFormat.format(result).replaceAll("[\u00AD\u0020]", ""); 39 | } 40 | termAtt.setEmpty().append(s); 41 | typeAtt.setType(""); 42 | return true; 43 | } 44 | } 45 | 46 | @Override 47 | public boolean equals(Object object) { 48 | return object instanceof IcuNumberFormatTokenFilter && 49 | numberFormat.equals(((IcuNumberFormatTokenFilter) object).numberFormat); 50 | } 51 | 52 | @Override 53 | public int hashCode() { 54 | return numberFormat.hashCode(); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/IcuNumberFormatTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu; 2 | 3 | import com.ibm.icu.text.RuleBasedNumberFormat; 4 | import com.ibm.icu.util.ULocale; 5 | import org.apache.lucene.analysis.TokenStream; 6 | import org.elasticsearch.common.settings.Settings; 7 | import org.elasticsearch.env.Environment; 8 | import org.elasticsearch.index.IndexSettings; 9 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 10 | 11 | import java.util.Locale; 12 | 13 | /** 14 | * ICU number format token filter factory. 15 | */ 16 | public class IcuNumberFormatTokenFilterFactory extends AbstractTokenFilterFactory { 17 | 18 | private final ULocale locale; 19 | 20 | private final int format; 21 | 22 | private final boolean lenient; 23 | 24 | private final boolean grouping; 25 | 26 | public IcuNumberFormatTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, 27 | Settings settings) { 28 | super(indexSettings, name, settings); 29 | this.locale = settings.get("locale") != null ? 
new ULocale(settings.get("locale")) : ULocale.getDefault(); 30 | String formatStr = settings.get("format", "SPELLOUT"); 31 | switch (formatStr.toUpperCase(Locale.ROOT)) { 32 | case "DURATION": 33 | format = RuleBasedNumberFormat.DURATION; 34 | break; 35 | case "NUMBERING_SYSTEM": 36 | format = RuleBasedNumberFormat.NUMBERING_SYSTEM; 37 | break; 38 | case "NUMBERSTYLE": 39 | format = RuleBasedNumberFormat.NUMBERSTYLE; 40 | break; 41 | case "ORDINAL": 42 | format = RuleBasedNumberFormat.ORDINAL; 43 | break; 44 | case "SPELLOUT": 45 | default: 46 | format = RuleBasedNumberFormat.SPELLOUT; 47 | break; 48 | } 49 | // RBNF parsing is incredibly slow when lenient is enabled but the only method to parse compound number words 50 | this.lenient = settings.getAsBoolean("lenient", true); 51 | this.grouping = settings.getAsBoolean("grouping", true); 52 | } 53 | 54 | @Override 55 | public TokenStream create(TokenStream tokenStream) { 56 | // create a new number format instance for each token stream 57 | RuleBasedNumberFormat ruleBasedNumberFormat = new RuleBasedNumberFormat(locale, format); 58 | ruleBasedNumberFormat.setLenientParseMode(lenient); 59 | ruleBasedNumberFormat.setGroupingUsed(grouping); 60 | return new IcuNumberFormatTokenFilter(tokenStream, ruleBasedNumberFormat); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/IcuTransformTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu; 2 | 3 | import com.ibm.icu.text.Transliterator; 4 | import com.ibm.icu.text.UnicodeSet; 5 | import org.apache.lucene.analysis.TokenStream; 6 | import org.elasticsearch.common.settings.Settings; 7 | import org.elasticsearch.env.Environment; 8 | import org.elasticsearch.index.IndexSettings; 9 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 10 | 11 | /** 12 | * ICU transform token filter factory. 13 | */ 14 | public class IcuTransformTokenFilterFactory extends AbstractTokenFilterFactory { 15 | 16 | private final Transliterator transliterator; 17 | 18 | public IcuTransformTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, 19 | Settings settings) { 20 | super(indexSettings, name, settings); 21 | String id = settings.get("id", "Null"); 22 | String direction = settings.get("dir", "forward"); 23 | int dir = "forward".equals(direction) ? Transliterator.FORWARD : Transliterator.REVERSE; 24 | String rules = settings.get("rules"); 25 | this.transliterator = rules != null ? 26 | Transliterator.createFromRules(id, rules, dir) : 27 | Transliterator.getInstance(id, dir); 28 | String unicodeSetFilter = settings.get("unicodeSetFilter"); 29 | if (unicodeSetFilter != null) { 30 | transliterator.setFilter(new UnicodeSet(unicodeSetFilter).freeze()); 31 | } 32 | } 33 | 34 | @Override 35 | public TokenStream create(TokenStream tokenStream) { 36 | return new IcuTransformTokenFilter(tokenStream, transliterator); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for Elasticsearch analysis by International Components for Unicode. 
3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/segmentation/CharArrayIterator.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.segmentation; 2 | 3 | import java.text.CharacterIterator; 4 | 5 | /** 6 | * Wraps a char[] as CharacterIterator for processing with a BreakIterator. 7 | */ 8 | public final class CharArrayIterator implements CharacterIterator { 9 | 10 | private char[] array; 11 | 12 | private int start; 13 | 14 | private int index; 15 | 16 | private int length; 17 | 18 | private int limit; 19 | 20 | public char[] getText() { 21 | return array; 22 | } 23 | 24 | public int getStart() { 25 | return start; 26 | } 27 | 28 | public int getLength() { 29 | return length; 30 | } 31 | 32 | /** 33 | * Set a new region of text to be examined by this iterator. 34 | * 35 | * @param array text buffer to examine 36 | * @param start offset into buffer 37 | * @param length maximum length to examine 38 | */ 39 | public void setText(final char[] array, int start, int length) { 40 | this.array = array; 41 | this.start = start; 42 | this.index = start; 43 | this.length = length; 44 | this.limit = start + length; 45 | } 46 | 47 | @Override 48 | public char current() { 49 | return (index == limit) ? DONE : array[index]; 50 | } 51 | 52 | @Override 53 | public char first() { 54 | index = start; 55 | return current(); 56 | } 57 | 58 | @Override 59 | public int getBeginIndex() { 60 | return 0; 61 | } 62 | 63 | @Override 64 | public int getEndIndex() { 65 | return length; 66 | } 67 | 68 | @Override 69 | public int getIndex() { 70 | return index - start; 71 | } 72 | 73 | @Override 74 | public char last() { 75 | index = (limit == start) ? limit : limit - 1; 76 | return current(); 77 | } 78 | 79 | @Override 80 | public char next() { 81 | if (++index >= limit) { 82 | index = limit; 83 | return DONE; 84 | } else { 85 | return current(); 86 | } 87 | } 88 | 89 | @Override 90 | public char previous() { 91 | if (--index < start) { 92 | index = start; 93 | return DONE; 94 | } else { 95 | return current(); 96 | } 97 | } 98 | 99 | @Override 100 | public char setIndex(int position) { 101 | if (position < getBeginIndex() || position > getEndIndex()) { 102 | throw new IllegalArgumentException("Illegal Position: " + position); 103 | } 104 | index = start + position; 105 | return current(); 106 | } 107 | 108 | @Override 109 | public CharArrayIterator clone() { 110 | CharArrayIterator clone = new CharArrayIterator(); 111 | clone.setText(array, start, length); 112 | clone.index = index; 113 | return clone; 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/segmentation/IcuTokenizerConfig.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.segmentation; 2 | 3 | import com.ibm.icu.text.BreakIterator; 4 | 5 | /** 6 | * Class that allows for tailored Unicode Text Segmentation on 7 | * a per-writing system basis. 8 | */ 9 | public interface IcuTokenizerConfig { 10 | 11 | /** 12 | * Return a breakiterator capable of processing a given script. 
13 | * 14 | * @param script script 15 | * @return iterator 16 | */ 17 | BreakIterator getBreakIterator(int script); 18 | 19 | /** 20 | * Return a token type value for a given script and BreakIterator 21 | * rule status. 22 | * 23 | * @param script script 24 | * @param ruleStatus rule status 25 | * @return type 26 | */ 27 | String getType(int script, int ruleStatus); 28 | 29 | /** 30 | * @return true if Han, Hiragana, and Katakana scripts should all be returned as Japanese 31 | */ 32 | boolean combineCJ(); 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/segmentation/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for text segmentation with International Components for Unicode. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.segmentation; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/tokenattributes/ScriptAttribute.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.tokenattributes; 2 | 3 | import org.apache.lucene.util.Attribute; 4 | 5 | /** 6 | * This attribute stores the UTR #24 script value for a token of text. 7 | */ 8 | public interface ScriptAttribute extends Attribute { 9 | /** 10 | * Get the numeric code for this script value. 11 | * This is the constant value from {@link com.ibm.icu.lang.UScript}. 12 | * 13 | * @return numeric code 14 | */ 15 | int getCode(); 16 | 17 | /** 18 | * Set the numeric code for this script value. 19 | * This is the constant value from {@link com.ibm.icu.lang.UScript}. 20 | * 21 | * @param code numeric code 22 | */ 23 | void setCode(int code); 24 | 25 | /** 26 | * Get the full name. 27 | * 28 | * @return UTR #24 full name. 29 | */ 30 | String getName(); 31 | 32 | /** 33 | * Get the abbreviated name. 34 | * 35 | * @return UTR #24 abbreviated name. 36 | */ 37 | String getShortName(); 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/tokenattributes/ScriptAttributeImpl.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.tokenattributes; 2 | 3 | import com.ibm.icu.lang.UScript; 4 | import org.apache.lucene.util.AttributeImpl; 5 | import org.apache.lucene.util.AttributeReflector; 6 | 7 | /** 8 | * Implementation of {@link ScriptAttribute} that stores the script as an integer. 
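 * <p>
 * A consumption sketch: Lucene resolves this implementation from the
 * {@link ScriptAttribute} interface by its naming convention:
 * <pre>{@code
 * ScriptAttribute scriptAtt = tokenStream.addAttribute(ScriptAttribute.class);
 * tokenStream.reset();
 * while (tokenStream.incrementToken()) {
 *     String script = scriptAtt.getName(); // e.g. "Latin"
 * }
 * }</pre>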
9 |  */
10 | public class ScriptAttributeImpl extends AttributeImpl implements ScriptAttribute, Cloneable {
11 |     private int code = UScript.COMMON;
12 |
13 |     public ScriptAttributeImpl() {}
14 |
15 |     @Override
16 |     public int getCode() {
17 |         return code;
18 |     }
19 |
20 |     @Override
21 |     public void setCode(int code) {
22 |         this.code = code;
23 |     }
24 |
25 |     @Override
26 |     public String getName() {
27 |         return UScript.getName(code);
28 |     }
29 |
30 |     @Override
31 |     public String getShortName() {
32 |         return UScript.getShortName(code);
33 |     }
34 |
35 |     @Override
36 |     public void clear() {
37 |         code = UScript.COMMON;
38 |     }
39 |
40 |     @Override
41 |     public void copyTo(AttributeImpl target) {
42 |         ScriptAttribute t = (ScriptAttribute) target;
43 |         t.setCode(code);
44 |     }
45 |
46 |     @Override
47 |     public boolean equals(Object other) {
48 |         return this == other || other instanceof ScriptAttributeImpl &&
49 |                 ((ScriptAttributeImpl) other).code == code;
50 |     }
51 |
52 |     @Override
53 |     public ScriptAttributeImpl clone() {
54 |         ScriptAttributeImpl attribute = (ScriptAttributeImpl) super.clone();
55 |         attribute.code = this.code;
56 |         return attribute;
57 |     }
58 |
59 |     @Override
60 |     public int hashCode() {
61 |         return code;
62 |     }
63 |
64 |     @Override
65 |     public void reflectWith(AttributeReflector reflector) {
66 |         String name = code == UScript.JAPANESE ? "Chinese/Japanese" : getName();
67 |         reflector.reflect(ScriptAttribute.class, "script", name);
68 |     }
69 | }
70 |
--------------------------------------------------------------------------------
/src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/tokenattributes/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * Classes for token attributes of International Components for Unicode.
3 |  */
4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.tokenattributes;
5 |
--------------------------------------------------------------------------------
/src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/tools/RBBIRuleCompiler.java:
--------------------------------------------------------------------------------
1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.tools;
2 |
3 | import com.ibm.icu.text.RuleBasedBreakIterator;
4 | import org.apache.logging.log4j.LogManager;
5 | import org.apache.logging.log4j.Logger;
6 |
7 | import java.io.BufferedReader;
8 | import java.io.IOException;
9 | import java.io.InputStream;
10 | import java.io.InputStreamReader;
11 | import java.io.OutputStream;
12 | import java.nio.charset.StandardCharsets;
13 | import java.nio.file.Files;
14 | import java.nio.file.Path;
15 |
16 | /**
17 |  * Utility to convert RuleBasedBreakIterator (.rbbi) files into binary compiled form (.brk).
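 * <p>
 * A usage sketch (the file names are hypothetical):
 * <pre>{@code
 * RBBIRuleCompiler compiler = new RBBIRuleCompiler();
 * compiler.compile(Paths.get("MyRules.rbbi"), Paths.get("MyRules.brk"));
 * }</pre>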
18 | */ 19 | public class RBBIRuleCompiler { 20 | 21 | private static final Logger logger = LogManager.getLogger(RBBIRuleCompiler.class.getName()); 22 | 23 | public void compile(Path inputPath, Path outputPath) throws IOException { 24 | compile(Files.newInputStream(inputPath), Files.newOutputStream(outputPath)); 25 | } 26 | 27 | public void compile(InputStream inputStream, OutputStream outputStream) throws IOException { 28 | String rules = getRules(inputStream); 29 | try (OutputStream os = outputStream) { 30 | new RuleBasedBreakIterator(rules); 31 | RuleBasedBreakIterator.compileRules(rules, os); 32 | } catch (IllegalArgumentException e) { 33 | logger.error(e.getMessage(), e); 34 | } 35 | } 36 | 37 | private String getRules(InputStream inputStream) throws IOException { 38 | StringBuilder rules = new StringBuilder(); 39 | try (BufferedReader bufferedReader = 40 | new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) { 41 | String line; 42 | while ((line = bufferedReader.readLine()) != null) { 43 | if (!line.startsWith("#")) { 44 | rules.append(line); 45 | rules.append('\n'); 46 | } 47 | } 48 | } 49 | return rules.toString(); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/tools/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for ICU tools. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.tools; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/lemmatize/LemmatizeTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.lemmatize; 2 | 3 | import org.apache.lucene.analysis.TokenStream; 4 | import org.elasticsearch.ElasticsearchException; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.IndexSettings; 8 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 9 | import org.xbib.elasticsearch.plugin.bundle.common.fsa.Dictionary; 10 | 11 | import java.io.InputStream; 12 | import java.io.InputStreamReader; 13 | import java.io.Reader; 14 | import java.nio.charset.StandardCharsets; 15 | import java.util.zip.GZIPInputStream; 16 | 17 | /** 18 | * Lemmatize token filter factory. 
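 * <p>
 * A settings sketch (the filter type name "lemmatize" is an assumption;
 * "language", "resource", "lemma_only" and "respect_keywords" are the keys
 * this factory reads):
 * <pre>{@code
 * "filter" : {
 *   "my_lemmatizer" : {
 *     "type" : "lemmatize",
 *     "language" : "en",
 *     "lemma_only" : true
 *   }
 * }
 * }</pre>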
19 | */ 20 | public class LemmatizeTokenFilterFactory extends AbstractTokenFilterFactory { 21 | 22 | private final Dictionary dictionary; 23 | 24 | private final boolean respectKeywords; 25 | 26 | private final boolean lemmaOnly; 27 | 28 | public LemmatizeTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { 29 | super(indexSettings, name, settings); 30 | this.respectKeywords = settings.getAsBoolean("respect_keywords", false); 31 | this.lemmaOnly = settings.getAsBoolean("lemma_only", true); 32 | this.dictionary = createDictionary(settings); 33 | } 34 | 35 | @Override 36 | public TokenStream create(TokenStream tokenStream) { 37 | return new LemmatizeTokenFilter(tokenStream, dictionary, respectKeywords, lemmaOnly); 38 | } 39 | 40 | private Dictionary createDictionary(Settings settings) { 41 | String language = settings.get("language", "en"); 42 | try { 43 | String resource = settings.get("resource", "lemmatization-" + language + ".fsa.gz"); 44 | if (resource.endsWith(".fsa") || resource.endsWith("fsa.gz")) { 45 | // FSA 46 | InputStream inputStream = getClass().getResourceAsStream(resource); 47 | if (resource.endsWith(".gz")) { 48 | inputStream = new GZIPInputStream(inputStream); 49 | } 50 | Dictionary dictionary = new Dictionary().loadFSA(inputStream); 51 | inputStream.close(); 52 | return dictionary; 53 | } else { 54 | // Text 55 | InputStream inputStream = getClass().getResourceAsStream(resource); 56 | if (resource.endsWith(".gz")) { 57 | inputStream = new GZIPInputStream(inputStream); 58 | } 59 | Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8); 60 | Dictionary dictionary = new Dictionary().loadLinesReverse(reader); 61 | reader.close(); 62 | return dictionary; 63 | } 64 | } catch (Exception e) { 65 | throw new ElasticsearchException("resources for language " + language + 66 | " in settings not found: " + settings, e); 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/naturalsort/NaturalSortKeyAnalyzer.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.naturalsort; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.apache.lucene.analysis.core.KeywordTokenizer; 5 | 6 | import java.text.Collator; 7 | 8 | /** 9 | * Natural sort key analyzer. 
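 * <p>
 * A construction sketch (the argument values are illustrative; digits and
 * maxTokens control the zero-padding of embedded digit runs, see
 * NaturalSortKeyAttributeImpl):
 * <pre>{@code
 * Analyzer analyzer = new NaturalSortKeyAnalyzer(
 *         Collator.getInstance(Locale.GERMAN), KeywordTokenizer.DEFAULT_BUFFER_SIZE, 1, 2);
 * }</pre>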
10 | */ 11 | public class NaturalSortKeyAnalyzer extends Analyzer { 12 | 13 | private final NaturalSortKeyAttributeFactory factory; 14 | 15 | private final int bufferSize; 16 | 17 | public NaturalSortKeyAnalyzer(Collator collator, int bufferSize, int digits, int maxtoken) { 18 | this.factory = new NaturalSortKeyAttributeFactory(collator, digits, maxtoken); 19 | this.bufferSize = bufferSize; 20 | } 21 | 22 | @Override 23 | protected TokenStreamComponents createComponents(String fieldName) { 24 | KeywordTokenizer tokenizer = new KeywordTokenizer(factory, bufferSize); 25 | return new TokenStreamComponents(tokenizer, tokenizer); 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/naturalsort/NaturalSortKeyAnalyzerProvider.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.naturalsort; 2 | 3 | import org.apache.lucene.analysis.core.KeywordTokenizer; 4 | import org.elasticsearch.common.settings.Settings; 5 | import org.elasticsearch.env.Environment; 6 | import org.elasticsearch.index.IndexSettings; 7 | import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider; 8 | 9 | import java.text.Collator; 10 | import java.util.Locale; 11 | 12 | /** 13 | * Natural sort key analyzer provider. 14 | */ 15 | public class NaturalSortKeyAnalyzerProvider extends AbstractIndexAnalyzerProvider { 16 | 17 | private final Collator collator; 18 | 19 | private final int digits; 20 | 21 | private final int maxTokens; 22 | 23 | private final int bufferSize; 24 | 25 | public NaturalSortKeyAnalyzerProvider(IndexSettings indexSettings, Environment environment, String name, 26 | Settings settings) { 27 | super(indexSettings, name, settings); 28 | this.collator = createCollator(settings); 29 | this.digits = settings.getAsInt("digits", 1); 30 | this.maxTokens = settings.getAsInt("maxTokens", 2); 31 | this.bufferSize = settings.getAsInt("bufferSize", KeywordTokenizer.DEFAULT_BUFFER_SIZE); 32 | } 33 | 34 | protected static Collator createCollator(Settings settings) { 35 | return Collator.getInstance(new Locale(settings.get("locale", Locale.getDefault().toString()))); 36 | } 37 | 38 | @Override 39 | public NaturalSortKeyAnalyzer get() { 40 | return new NaturalSortKeyAnalyzer(collator, bufferSize, digits, maxTokens); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/naturalsort/NaturalSortKeyAttributeFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.naturalsort; 2 | 3 | import org.apache.lucene.analysis.TokenStream; 4 | import org.apache.lucene.util.AttributeFactory; 5 | 6 | import java.text.Collator; 7 | 8 | /** 9 | * Natural sort key attribute factory. 
10 | */ 11 | public class NaturalSortKeyAttributeFactory 12 | extends AttributeFactory.StaticImplementationAttributeFactory<NaturalSortKeyAttributeImpl> { 13 | 14 | private final Collator collator; 15 | 16 | private final int digits; 17 | 18 | private final int maxTokens; 19 | 20 | public NaturalSortKeyAttributeFactory(Collator collator, int digits, int maxTokens) { 21 | this(TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, collator, digits, maxTokens); 22 | } 23 | 24 | public NaturalSortKeyAttributeFactory(AttributeFactory delegate, Collator collator, int digits, int maxTokens) { 25 | super(delegate, NaturalSortKeyAttributeImpl.class); 26 | this.collator = collator; 27 | this.digits = digits; 28 | this.maxTokens = maxTokens; 29 | } 30 | 31 | @Override 32 | protected NaturalSortKeyAttributeImpl createInstance() { 33 | return new NaturalSortKeyAttributeImpl(collator, digits, maxTokens); 34 | } 35 | 36 | @Override 37 | public boolean equals(Object object) { 38 | return object instanceof NaturalSortKeyAttributeFactory && 39 | collator.equals(((NaturalSortKeyAttributeFactory)object).collator) && 40 | Integer.compare(digits, ((NaturalSortKeyAttributeFactory)object).digits) == 0 && 41 | Integer.compare(maxTokens, ((NaturalSortKeyAttributeFactory)object).maxTokens) == 0; 42 | } 43 | 44 | @Override 45 | public int hashCode() { 46 | return collator.hashCode() ^ Integer.hashCode(digits) ^ Integer.hashCode(maxTokens); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/naturalsort/NaturalSortKeyAttributeImpl.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.naturalsort; 2 | 3 | import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl; 4 | import org.apache.lucene.util.BytesRef; 5 | 6 | import java.text.Collator; 7 | import java.util.Locale; 8 | import java.util.regex.Matcher; 9 | import java.util.regex.Pattern; 10 | 11 | /** 12 | * Natural sort key attribute implementation.
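 * <p>Worked example of the length-prefix rewrite performed by {@code natural(String)} with digits = 1: every digit run gets the count of its digits prepended, so {@code "file2"} becomes {@code "file12"} and {@code "file10"} becomes {@code "file210"}; under the collation key comparison, {@code "file12"} then sorts before {@code "file210"}, i.e. 2 before 10, as a human reader expects.</p>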
13 | */ 14 | public class NaturalSortKeyAttributeImpl extends CharTermAttributeImpl { 15 | 16 | private static final Pattern numberPattern = Pattern.compile("(\\+|\\-)?([0-9]+)"); 17 | 18 | private final Collator collator; 19 | 20 | private final int digits; 21 | 22 | private final int maxTokens; 23 | 24 | public NaturalSortKeyAttributeImpl(Collator collator, int digits, int maxTokens) { 25 | this.collator = collator; 26 | this.digits = digits; 27 | this.maxTokens = maxTokens; 28 | } 29 | 30 | @Override 31 | public BytesRef getBytesRef() { 32 | byte[] collationKey = collator.getCollationKey(natural(toString())).toByteArray(); 33 | final BytesRef ref = this.builder.get(); 34 | ref.bytes = collationKey; 35 | ref.offset = 0; 36 | ref.length = collationKey.length; 37 | return ref; 38 | } 39 | 40 | private String natural(String s) { 41 | StringBuffer sb = new StringBuffer(); 42 | Matcher m = numberPattern.matcher(s); 43 | int foundTokens = 0; 44 | while (m.find()) { 45 | int len = m.group(2).length(); 46 | String fmt = "%0" + digits + "d"; 47 | String repl = String.format(Locale.ROOT, fmt, len) + m.group(); 48 | m.appendReplacement(sb, repl); 49 | foundTokens++; 50 | if (foundTokens >= maxTokens) { 51 | break; 52 | } 53 | } 54 | m.appendTail(sb); 55 | return sb.toString(); 56 | } 57 | 58 | @Override 59 | public boolean equals(Object object) { 60 | return object instanceof NaturalSortKeyAttributeImpl && 61 | collator.equals(((NaturalSortKeyAttributeImpl)object).collator) && 62 | Integer.compare(digits, ((NaturalSortKeyAttributeImpl)object).digits) == 0 && 63 | Integer.compare(maxTokens, ((NaturalSortKeyAttributeImpl)object).maxTokens) == 0; 64 | } 65 | 66 | @Override 67 | public int hashCode() { 68 | return collator.hashCode() ^ Integer.hashCode(digits) ^ Integer.hashCode(maxTokens); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/naturalsort/NaturalSortKeyTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.naturalsort; 2 | 3 | import org.apache.lucene.analysis.Tokenizer; 4 | import org.apache.lucene.analysis.core.KeywordTokenizer; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.IndexSettings; 8 | import org.elasticsearch.index.analysis.AbstractTokenizerFactory; 9 | 10 | import java.text.Collator; 11 | 12 | /** 13 | * Natural sort key tokenizer factory. 
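 * <p>Hypothetical configuration sketch in the style of the bundled tests; the tokenizer name {@code naturalsort} is an assumption about how BundlePlugin registers this factory, while the setting keys ({@code locale}, {@code digits}, {@code maxTokens}, {@code bufferSize}) are read verbatim below:</p> <pre>{@code Settings settings = Settings.builder() .put("index.analysis.tokenizer.natsort.type", "naturalsort") // name assumed .put("index.analysis.tokenizer.natsort.locale", "de") .put("index.analysis.tokenizer.natsort.digits", 1) .put("index.analysis.tokenizer.natsort.maxTokens", 2) .build(); }</pre>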
14 | */ 15 | public class NaturalSortKeyTokenizerFactory extends AbstractTokenizerFactory { 16 | 17 | private final NaturalSortKeyAttributeFactory factory; 18 | 19 | private final int bufferSize; 20 | 21 | public NaturalSortKeyTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, 22 | Settings settings) { 23 | super(indexSettings, name, settings); 24 | Collator collator = NaturalSortKeyAnalyzerProvider.createCollator(settings); 25 | int digits = settings.getAsInt("digits", 1); 26 | int maxTokens = settings.getAsInt("maxTokens", 2); 27 | this.factory = new NaturalSortKeyAttributeFactory(collator, digits, maxTokens); 28 | this.bufferSize = settings.getAsInt("bufferSize", KeywordTokenizer.DEFAULT_BUFFER_SIZE); 29 | } 30 | 31 | @Override 32 | public Tokenizer create() { 33 | return new KeywordTokenizer(factory, bufferSize); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/naturalsort/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Natural sort. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.naturalsort; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/sortform/SortformTokenFilter.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.sortform; 2 | 3 | import org.apache.lucene.analysis.TokenFilter; 4 | import org.apache.lucene.analysis.TokenStream; 5 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 6 | 7 | import java.io.IOException; 8 | import java.util.regex.Pattern; 9 | 10 | /** 11 | * Sort form token filter. 
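 * <p>Example of the effect: bracketed non-sort markers of the kind used in library cataloging are stripped before sorting, so a term like {@code <<Der>> Name} is reduced to {@code Name}; each of the five marker pairs below is removed the same way.</p>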
12 | */ 13 | public class SortformTokenFilter extends TokenFilter { 14 | 15 | private static final Pattern[] patterns = { 16 | Pattern.compile("\\s*<<.*?>>\\s*"), 17 | Pattern.compile("\\s*<.*?>\\s*"), 18 | Pattern.compile("\\s*\u0098.*?\u009C\\s*"), 19 | Pattern.compile("\\s*\u02BE.*?\u02BB\\s*"), 20 | Pattern.compile("\\s*\u00AC.*?\u00AC\\s*") 21 | }; 22 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 23 | 24 | protected SortformTokenFilter(TokenStream input) { 25 | super(input); 26 | } 27 | 28 | @Override 29 | public final boolean incrementToken() throws IOException { 30 | if (!input.incrementToken()) { 31 | return false; 32 | } else { 33 | String s = termAtt.toString(); 34 | for (Pattern pattern : patterns) { 35 | s = pattern.matcher(s).replaceAll(""); 36 | } 37 | termAtt.setEmpty().append(s); 38 | return true; 39 | } 40 | } 41 | 42 | @Override 43 | public boolean equals(Object object) { 44 | return object instanceof SortformTokenFilter; 45 | } 46 | 47 | @Override 48 | public int hashCode() { 49 | return 0; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/sortform/SortformTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.sortform; 2 | 3 | import org.apache.lucene.analysis.TokenStream; 4 | import org.elasticsearch.common.settings.Settings; 5 | import org.elasticsearch.env.Environment; 6 | import org.elasticsearch.index.IndexSettings; 7 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 8 | 9 | /** 10 | * Sort form token filter factory. 11 | */ 12 | public class SortformTokenFilterFactory extends AbstractTokenFilterFactory { 13 | 14 | public SortformTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, 15 | Settings settings) { 16 | super(indexSettings, name, settings); 17 | } 18 | 19 | @Override 20 | public TokenStream create(TokenStream tokenStream) { 21 | return new SortformTokenFilter(tokenStream); 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/standardnumber/StandardnumberAnalyzer.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.standardnumber; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.apache.lucene.analysis.TokenStream; 5 | import org.apache.lucene.analysis.Tokenizer; 6 | import org.elasticsearch.index.analysis.TokenFilterFactory; 7 | import org.elasticsearch.index.analysis.TokenizerFactory; 8 | 9 | import java.util.Collections; 10 | 11 | /** 12 | * Standard number analyzer. 
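 * <p>Composition note: this analyzer simply chains the configured tokenizer (a whitespace tokenizer when built by the provider below) into the standard number token filter, which detects and normalizes identifiers such as ISBNs via the StandardnumberService; no further filters are applied.</p>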
13 | */ 14 | public class StandardnumberAnalyzer extends Analyzer { 15 | 16 | private final TokenizerFactory tokenizerFactory; 17 | private final StandardnumberTokenFilterFactory stdnumTokenFilterFactory; 18 | 19 | public StandardnumberAnalyzer(TokenizerFactory tokenizerFactory, 20 | StandardnumberTokenFilterFactory stdnumTokenFilterFactory) { 21 | this.tokenizerFactory = tokenizerFactory; 22 | this.stdnumTokenFilterFactory = stdnumTokenFilterFactory; 23 | } 24 | 25 | @Override 26 | protected TokenStreamComponents createComponents(String fieldName) { 27 | Tokenizer tokenizer = tokenizerFactory.create(); 28 | TokenStream tokenStream = tokenizer; 29 | for (TokenFilterFactory tokenFilter : Collections.singletonList(stdnumTokenFilterFactory)) { 30 | tokenStream = tokenFilter.create(tokenStream); 31 | } 32 | return new TokenStreamComponents(tokenizer, tokenStream); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/standardnumber/StandardnumberAnalyzerProvider.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.standardnumber; 2 | 3 | import org.elasticsearch.common.settings.Settings; 4 | import org.elasticsearch.env.Environment; 5 | import org.elasticsearch.index.IndexSettings; 6 | import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider; 7 | import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory; 8 | import org.xbib.elasticsearch.plugin.bundle.index.mapper.standardnumber.StandardnumberMapper; 9 | 10 | /** 11 | * Standard number analyzer provider. 12 | */ 13 | public class StandardnumberAnalyzerProvider extends AbstractIndexAnalyzerProvider<StandardnumberAnalyzer> { 14 | 15 | private final StandardnumberAnalyzer analyzer; 16 | 17 | public StandardnumberAnalyzerProvider(IndexSettings indexSettings, Environment environment, String name, 18 | Settings settings, StandardnumberMapper.TypeParser standardNumberTypeParser) { 19 | super(indexSettings, name, settings); 20 | WhitespaceTokenizerFactory tokenizerFactory = 21 | new WhitespaceTokenizerFactory(indexSettings, environment, name, settings); 22 | StandardnumberTokenFilterFactory stdnumTokenFilterFactory = 23 | new StandardnumberTokenFilterFactory(indexSettings, environment, name, settings, standardNumberTypeParser); 24 | this.analyzer = new StandardnumberAnalyzer(tokenizerFactory, stdnumTokenFilterFactory); 25 | } 26 | 27 | @Override 28 | public StandardnumberAnalyzer get() { 29 | return this.analyzer; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/standardnumber/StandardnumberTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.standardnumber; 2 | 3 | import org.apache.lucene.analysis.TokenStream; 4 | import org.elasticsearch.common.settings.Settings; 5 | import org.elasticsearch.env.Environment; 6 | import org.elasticsearch.index.IndexSettings; 7 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 8 | import org.xbib.elasticsearch.plugin.bundle.index.mapper.standardnumber.StandardnumberMapper; 9 | import org.xbib.elasticsearch.plugin.bundle.common.standardnumber.StandardnumberService; 10 | 11 | /** 12 | * Standard number token filter factory.
13 | */ 14 | public class StandardnumberTokenFilterFactory extends AbstractTokenFilterFactory { 15 | 16 | private final Settings settings; 17 | 18 | private final StandardnumberService standardnumberService; 19 | 20 | public StandardnumberTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, 21 | Settings settings, StandardnumberMapper.TypeParser standardNumberTypeParser) { 22 | super(indexSettings, name, settings); 23 | this.settings = settings; 24 | this.standardnumberService = new StandardnumberService(settings); 25 | this.standardnumberService.setStandardNumberTypeParser(standardNumberTypeParser); 26 | } 27 | 28 | @Override 29 | public TokenStream create(TokenStream tokenStream) { 30 | return new StandardnumberTokenFilter(tokenStream, standardnumberService, settings); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/symbolname/SymbolnameTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.symbolname; 2 | 3 | import org.apache.lucene.analysis.TokenStream; 4 | import org.elasticsearch.common.settings.Settings; 5 | import org.elasticsearch.env.Environment; 6 | import org.elasticsearch.index.IndexSettings; 7 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 8 | 9 | /** 10 | * Symbol name token filter factory. 11 | */ 12 | public class SymbolnameTokenFilterFactory extends AbstractTokenFilterFactory { 13 | 14 | public SymbolnameTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, 15 | Settings settings) { 16 | super(indexSettings, name, settings); 17 | } 18 | 19 | @Override 20 | public TokenStream create(TokenStream tokenStream) { 21 | return new SymbolnameTokenFilter(tokenStream); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/symbolname/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Symbol name token filter. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.symbolname; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/worddelimiter/WordDelimiterFlags.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.worddelimiter; 2 | 3 | /** 4 | * Flags for {@link WordDelimiterFilter}. 
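 * <p>The configuration constants below are bit flags meant to be OR-ed into a single int, e.g. (a usage sketch, not a shipped default): <pre>{@code int flags = WordDelimiterFlags.GENERATE_WORD_PARTS | WordDelimiterFlags.GENERATE_NUMBER_PARTS | WordDelimiterFlags.SPLIT_ON_CASE_CHANGE; }</pre> The LOWER/UPPER/DIGIT/SUBWORD_DELIM constants, by contrast, are character-class masks used internally by the filter, not configuration flags.</p>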
5 | */ 6 | public interface WordDelimiterFlags { 7 | int LOWER = 0x01; 8 | 9 | int UPPER = 0x02; 10 | 11 | int DIGIT = 0x04; 12 | 13 | int SUBWORD_DELIM = 0x08; 14 | 15 | int ALPHA = 0x03; 16 | 17 | int ALPHANUM = 0x07; 18 | 19 | /** 20 | * Causes parts of words to be generated: 21 | * "PowerShot" => "Power" "Shot" 22 | */ 23 | int GENERATE_WORD_PARTS = 1; 24 | 25 | /** 26 | * Causes number subwords to be generated: 27 | * "500-42" => "500" "42" 28 | */ 29 | int GENERATE_NUMBER_PARTS = 2; 30 | 31 | /** 32 | * Causes maximum runs of word parts to be catenated: 33 | * "wi-fi" => "wifi" 34 | */ 35 | int CATENATE_WORDS = 4; 36 | 37 | /** 38 | * Causes maximum runs of number parts to be catenated: 39 | * "500-42" => "50042" 40 | */ 41 | int CATENATE_NUMBERS = 8; 42 | 43 | /** 44 | * Causes all subword parts to be catenated: 45 | * "wi-fi-4000" => "wifi4000" 46 | */ 47 | int CATENATE_ALL = 16; 48 | 49 | /** 50 | * Causes original words to be preserved and added to the subword list (defaults to false): 51 | * "500-42" => "500" "42" "500-42" 52 | */ 53 | int PRESERVE_ORIGINAL = 32; 54 | 55 | /** 56 | * If not set, causes case changes to be ignored (subwords will only be generated 57 | * given SUBWORD_DELIM tokens) 58 | */ 59 | int SPLIT_ON_CASE_CHANGE = 64; 60 | 61 | /** 62 | * If not set, causes numeric changes to be ignored (subwords will only be generated 63 | * given SUBWORD_DELIM tokens). 64 | */ 65 | int SPLIT_ON_NUMERICS = 128; 66 | 67 | /** 68 | * Causes trailing "'s" to be removed for each subword 69 | * "O'Neil's" => "O", "Neil" 70 | */ 71 | int STEM_ENGLISH_POSSESSIVE = 256; 72 | 73 | /** 74 | * Causes all parts to share the same position. 75 | * The default is off and causes each intermediate part to take its own position. 76 | */ 77 | int ALL_PARTS_AT_SAME_POSITION = 512; 78 | 79 | } 80 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/worddelimiter/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Word delimiter filter. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.worddelimiter; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/year/GregorianYearTokenFilter.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.year; 2 | 3 | import org.apache.lucene.analysis.TokenFilter; 4 | import org.apache.lucene.analysis.TokenStream; 5 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 6 | 7 | import java.io.IOException; 8 | import java.util.regex.Matcher; 9 | import java.util.regex.Pattern; 10 | 11 | /** 12 | * Gregorian year token filter.
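 * <p>In brief: a token that is exactly a four-digit number, such as {@code 1999}, is kept as the year; any other token is replaced by the configured default year ({@code 0000} unless the {@code default_year} setting overrides it, see the factory below).</p>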
13 | */ 14 | public class GregorianYearTokenFilter extends TokenFilter { 15 | 16 | private static final Pattern pattern = Pattern.compile("(\\d{4})"); 17 | 18 | protected final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 19 | 20 | private final String defaultYear; 21 | 22 | protected GregorianYearTokenFilter(TokenStream input, String defaultYear) { 23 | super(input); 24 | this.defaultYear = defaultYear; 25 | } 26 | 27 | @Override 28 | public final boolean incrementToken() throws IOException { 29 | if (!input.incrementToken()) { 30 | return false; 31 | } else { 32 | String s = termAtt.toString(); 33 | Matcher m = pattern.matcher(s); 34 | termAtt.setEmpty(); 35 | if (m.matches()) { 36 | // the token is exactly a four-digit year: keep it 37 | termAtt.append(m.group(1)); 38 | } else { 39 | // otherwise emit the configured default year 40 | termAtt.append(defaultYear); 41 | } 42 | return true; 43 | } 44 | } 45 | 46 | @Override 47 | public boolean equals(Object object) { 48 | return object instanceof GregorianYearTokenFilter && 49 | defaultYear.equals(((GregorianYearTokenFilter)object).defaultYear); 50 | } 51 | 52 | @Override 53 | public int hashCode() { 54 | return defaultYear.hashCode(); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/year/GregorianYearTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.year; 2 | 3 | import org.apache.lucene.analysis.TokenStream; 4 | import org.elasticsearch.common.settings.Settings; 5 | import org.elasticsearch.env.Environment; 6 | import org.elasticsearch.index.IndexSettings; 7 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 8 | 9 | /** 10 | * Gregorian year token filter factory. 11 | */ 12 | public class GregorianYearTokenFilterFactory extends AbstractTokenFilterFactory { 13 | 14 | private final String defaultYear; 15 | 16 | public GregorianYearTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, 17 | Settings settings) { 18 | super(indexSettings, name, settings); 19 | defaultYear = settings.get("default_year", "0000"); 20 | } 21 | 22 | @Override 23 | public TokenStream create(TokenStream tokenStream) { 24 | return new GregorianYearTokenFilter(tokenStream, defaultYear); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/mapper/reference/ReferenceMapperModule.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.mapper.reference; 2 | 3 | import org.elasticsearch.common.inject.AbstractModule; 4 | import org.xbib.elasticsearch.plugin.bundle.common.reference.ReferenceService; 5 | 6 | /** 7 | * Reference field mapper module.
8 | */ 9 | public class ReferenceMapperModule extends AbstractModule { 10 | 11 | private final ReferenceMapperTypeParser typeParser; 12 | 13 | public ReferenceMapperModule(ReferenceMapperTypeParser typeParser) { 14 | this.typeParser = typeParser; 15 | } 16 | 17 | @Override 18 | protected void configure() { 19 | bind(ReferenceService.class).asEagerSingleton(); 20 | bind(ReferenceMapperTypeParser.class).toInstance(typeParser); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/mapper/reference/ReferenceMapperTypeParser.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.mapper.reference; 2 | 3 | /** 4 | * Reference field mapper type parser. 5 | */ 6 | public final class ReferenceMapperTypeParser extends ReferenceMapper.TypeParser { 7 | } 8 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/mapper/standardnumber/StandardnumberMapperModule.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.mapper.standardnumber; 2 | 3 | import org.elasticsearch.common.inject.AbstractModule; 4 | import org.xbib.elasticsearch.plugin.bundle.common.standardnumber.StandardnumberService; 5 | 6 | /** 7 | * Standard number field mapper module. 8 | */ 9 | public class StandardnumberMapperModule extends AbstractModule { 10 | 11 | private final StandardnumberMapperTypeParser typeParser; 12 | 13 | public StandardnumberMapperModule(StandardnumberMapperTypeParser typeParser) { 14 | this.typeParser = typeParser; 15 | } 16 | 17 | @Override 18 | protected void configure() { 19 | bind(StandardnumberService.class).asEagerSingleton(); 20 | bind(StandardnumberMapperTypeParser.class).toInstance(typeParser); 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/mapper/standardnumber/StandardnumberMapperTypeParser.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.mapper.standardnumber; 2 | 3 | /** 4 | * Standard number field mapper type parser. 5 | */ 6 | public final class StandardnumberMapperTypeParser extends StandardnumberMapper.TypeParser { 7 | } 8 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Plugin bundle for Elasticsearch. 
3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/rest/action/isbnformat/RestISBNFormatterAction.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.rest.action.isbnformat; 2 | 3 | import org.elasticsearch.client.node.NodeClient; 4 | import org.elasticsearch.common.inject.Inject; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.rest.BaseRestHandler; 7 | import org.elasticsearch.rest.RestController; 8 | import org.elasticsearch.rest.RestRequest; 9 | import org.elasticsearch.rest.action.RestStatusToXContentListener; 10 | import org.xbib.elasticsearch.plugin.bundle.action.isbnformat.ISBNFormatAction; 11 | import org.xbib.elasticsearch.plugin.bundle.action.isbnformat.ISBNFormatRequest; 12 | 13 | import java.io.IOException; 14 | 15 | import static org.elasticsearch.rest.RestRequest.Method.GET; 16 | 17 | /** 18 | * REST ISBN format action. 19 | */ 20 | public class RestISBNFormatterAction extends BaseRestHandler { 21 | 22 | @Inject 23 | public RestISBNFormatterAction(Settings settings, RestController controller) { 24 | super(settings); 25 | controller.registerHandler(GET, "/_isbn", this); 26 | controller.registerHandler(GET, "/_isbn/{value}", this); 27 | } 28 | 29 | @Override 30 | public String getName() { 31 | return "ISBN"; 32 | } 33 | 34 | @Override 35 | protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient client) throws IOException { 36 | final String value = request.param("value"); 37 | final ISBNFormatRequest isbnFormatRequest = new ISBNFormatRequest().setValue(value); 38 | return channel -> client.execute(ISBNFormatAction.INSTANCE, isbnFormatRequest, 39 | new RestStatusToXContentListener<>(channel)); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/rest/action/isbnformat/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Rest action for ISBN format. 
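 * <p>Usage sketch: the handler above registers the routes /_isbn and /_isbn/{value}, so a formatted ISBN can be requested with, for example, GET /_isbn/9783161484100; the response shape is defined by ISBNFormatResponse and is not shown here.</p>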
3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.rest.action.isbnformat; 5 | -------------------------------------------------------------------------------- /src/main/plugin-metadata/plugin-security.policy: -------------------------------------------------------------------------------- 1 | grant codeBase "${codebase.elasticsearch-plugin-bundle}" { 2 | permission java.io.FilePermission "*", "read"; 3 | permission java.lang.RuntimePermission "accessDeclaredMembers"; 4 | permission java.lang.RuntimePermission "getClassLoader"; 5 | permission java.lang.reflect.ReflectPermission "suppressAccessChecks"; 6 | }; 7 | 8 | grant codeBase "${codebase.icu4j}" { 9 | permission java.io.FilePermission "*", "read"; 10 | permission java.lang.RuntimePermission "accessDeclaredMembers"; 11 | permission java.lang.RuntimePermission "getClassLoader"; 12 | permission java.lang.reflect.ReflectPermission "suppressAccessChecks"; 13 | }; 14 | -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/icu/KeywordTokenizer.brk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/src/main/resources/org/xbib/elasticsearch/plugin/bundle/icu/KeywordTokenizer.brk -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/icu/Latin-break-only-on-whitespace.brk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/src/main/resources/org/xbib/elasticsearch/plugin/bundle/icu/Latin-break-only-on-whitespace.brk -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/icu/Latin-dont-break-on-hyphens.brk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/src/main/resources/org/xbib/elasticsearch/plugin/bundle/icu/Latin-dont-break-on-hyphens.brk -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/icu/folding/DingbatFolding.txt: -------------------------------------------------------------------------------- 1 | # Copyright 2001-2010 Unicode, Inc. 2 | # 3 | # Disclaimer 4 | # 5 | # This source code is provided as is by Unicode, Inc. No claims are 6 | # made as to fitness for any particular purpose. No warranties of any 7 | # kind are expressed or implied. The recipient agrees to determine 8 | # applicability of information provided. If this file has been 9 | # purchased on magnetic or optical media from Unicode, Inc., the 10 | # sole remedy for any claim will be exchange of defective media 11 | # within 90 days of receipt. 12 | # 13 | # Limitations on Rights to Redistribute This Code 14 | # 15 | # Unicode, Inc. hereby grants the right to freely use the information 16 | # supplied in this file in the creation of products supporting the 17 | # Unicode Standard, and to make copies of this file in any form 18 | # for internal or external distribution as long as this notice 19 | # remains attached. 
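# Each non-comment line below is a mapping "SOURCE>REPLACEMENT" in hex code points: for example, "24EB>0031 0031" folds U+24EB (NEGATIVE CIRCLED NUMBER ELEVEN) to the ASCII digits "11".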
20 | 21 | ### Custom Normalization mappings for UTR#30 22 | ### (http://www.unicode.org/reports/tr30/tr30-4.html) 23 | ### 24 | ### Created from Unicode 5.2 UCD 25 | ### 26 | 27 | #### WARNING #### 28 | #### Rule: lines direct content generation. 29 | #### All non-comments will be REMOVED when this file's contents 30 | #### are generated by 'ant gen-utr30-data-files'. 31 | #### Use "# Rule: verbatim" to keep non-comments up until 32 | #### the next "# Rule:" line. 33 | #### WARNING #### 34 | 35 | # Folds dingbats and other adorned forms 36 | # Generated from ASCIIFoldingFilter 37 | # Rule: verbatim 38 | 24EB>0031 0031 39 | 24EC>0031 0032 40 | 24ED>0031 0033 41 | 24EE>0031 0034 42 | 24EF>0031 0035 43 | 24F0>0031 0036 44 | 24F1>0031 0037 45 | 24F2>0031 0038 46 | 24F3>0031 0039 47 | 24F4>0032 0030 48 | 24F5>0031 49 | 24F6>0032 50 | 24F7>0033 51 | 24F8>0034 52 | 24F9>0035 53 | 24FA>0036 54 | 24FB>0037 55 | 24FC>0038 56 | 24FD>0039 57 | 24FE>0031 0030 58 | 24FF>0030 59 | 275B>0027 60 | 275C>0027 61 | 275D>0022 62 | 275E>0022 63 | 2768>0028 64 | 2769>0029 65 | 276A>0028 66 | 276B>0029 67 | 276C>003C 68 | 276D>003E 69 | 276E>0022 70 | 276F>0022 71 | 2770>003C 72 | 2771>003E 73 | 2772>005B 74 | 2773>005D 75 | 2774>007B 76 | 2775>007D 77 | 2776>0031 78 | 2777>0032 79 | 2778>0033 80 | 2779>0034 81 | 277A>0035 82 | 277B>0036 83 | 277C>0037 84 | 277D>0038 85 | 277E>0039 86 | 277F>0031 0030 87 | 2780>0031 88 | 2781>0032 89 | 2782>0033 90 | 2783>0034 91 | 2784>0035 92 | 2785>0036 93 | 2786>0037 94 | 2787>0038 95 | 2788>0039 96 | 2789>0031 0030 97 | 278A>0031 98 | 278B>0032 99 | 278C>0033 100 | 278D>0034 101 | 278E>0035 102 | 278F>0036 103 | 2790>0037 104 | 2791>0038 105 | 2792>0039 106 | 2793>0031 0030 107 | -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/decompound/fst/words.fst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/decompound/fst/words.fst -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/decompound/patricia/grfExt.tree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/decompound/patricia/grfExt.tree -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/decompound/patricia/kompVHic.tree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/decompound/patricia/kompVHic.tree -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/decompound/patricia/kompVVic.tree: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/decompound/patricia/kompVVic.tree -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/segmentation/Default.brk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/segmentation/Default.brk -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/segmentation/KeywordTokenizer.rbbi: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # RBBI Keyword tokenizer: keep everything as a single token. 18 | 19 | # Apply rule status {200}=RBBI.WORD_LETTER, which is mapped 20 | # to <ALPHANUM> token type by DefaultICUTokenizerConfig. 21 | .+ {200}; -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/segmentation/Latin-break-only-on-whitespace.rbbi: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | # 17 | # Break only on whitespace; assign token type from set { <ALPHANUM>, <NUM>, <OTHER> } 18 | # 19 | 20 | !!forward; 21 | 22 | $Whitespace = [\p{Whitespace}]; 23 | $NonWhitespace = [\P{Whitespace}]; 24 | $Letter = [\p{Letter}]; 25 | $Number = [\p{Number}]; 26 | 27 | # Default rule status is {0}=RBBI.WORD_NONE => not tokenized by ICUTokenizer 28 | $Whitespace; 29 | 30 | # Assign rule status {200}=RBBI.WORD_LETTER when the token contains a letter char 31 | # Mapped to <ALPHANUM> token type by DefaultICUTokenizerConfig 32 | $NonWhitespace* $Letter $NonWhitespace* {200}; 33 | 34 | # Assign rule status {100}=RBBI.WORD_NUM when the token contains a numeric char 35 | # Mapped to <NUM> token type by DefaultICUTokenizerConfig 36 | $NonWhitespace* $Number $NonWhitespace* {100}; 37 | 38 | # Assign rule status {1} (no RBBI equivalent) when the token contains neither a letter nor a numeric char 39 | # Mapped to <OTHER> token type by DefaultICUTokenizerConfig 40 | $NonWhitespace+ {1}; 41 | -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/segmentation/MyanmarSyllable.brk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/segmentation/MyanmarSyllable.brk -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/segmentation/MyanmarSyllable.rbbi: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # 18 | # Parses Myanmar text, with syllable as token. 19 | # 20 | 21 | $Cons = [[:Other_Letter:]&[:Myanmar:]]; 22 | $Virama = [\u1039]; 23 | $Asat = [\u103A]; 24 | 25 | $WordJoin = [:Line_Break=Word_Joiner:]; 26 | 27 | # 28 | # default numerical definitions 29 | # 30 | $Extend = [\p{Word_Break = Extend}]; 31 | $Format = [\p{Word_Break = Format}]; 32 | $MidNumLet = [\p{Word_Break = MidNumLet}]; 33 | $MidNum = [\p{Word_Break = MidNum}]; 34 | $Numeric = [\p{Word_Break = Numeric}]; 35 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; 36 | $MidNumLetEx = $MidNumLet ($Extend | $Format)*; 37 | $MidNumEx = $MidNum ($Extend | $Format)*; 38 | $NumericEx = $Numeric ($Extend | $Format)*; 39 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; 40 | 41 | $ConsEx = $Cons ($Extend | $Format)*; 42 | $AsatEx = $Cons $Asat ($Virama $ConsEx)? ($Extend | $Format)*; 43 | $MyanmarSyllableEx = $ConsEx ($Virama $ConsEx)?
($AsatEx)*; 44 | $MyanmarJoinedSyllableEx = $MyanmarSyllableEx ($WordJoin $MyanmarSyllableEx)*; 45 | 46 | !!forward; 47 | $MyanmarJoinedSyllableEx {200}; 48 | 49 | # default numeric rules 50 | $NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100}; -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/utr30.nrm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/utr30.nrm -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/MultiMap.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test; 2 | 3 | import java.util.Collection; 4 | import java.util.Set; 5 | 6 | public interface MultiMap<K, V> { 7 | 8 | void clear(); 9 | 10 | int size(); 11 | 12 | boolean isEmpty(); 13 | 14 | boolean containsKey(K key); 15 | 16 | Collection<V> get(K key); 17 | 18 | Set<K> keySet(); 19 | 20 | Collection<? extends Collection<V>> values(); 21 | 22 | Collection<V> put(K key, V value); 23 | 24 | Collection<V> remove(K key); 25 | 26 | Collection<V> remove(K key, V value); 27 | 28 | void putAll(K key, Collection<V> values); 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/TreeMultiMap.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test; 2 | 3 | import java.util.Collection; 4 | import java.util.LinkedHashSet; 5 | import java.util.Map; 6 | import java.util.Set; 7 | import java.util.TreeMap; 8 | import java.util.TreeSet; 9 | 10 | public class TreeMultiMap<K, V> implements MultiMap<K, V> { 11 | 12 | private final Map<K, Set<V>> map = new TreeMap<>(); 13 | 14 | @Override 15 | public int size() { 16 | return map.size(); 17 | } 18 | 19 | @Override 20 | public void clear() { 21 | map.clear(); 22 | } 23 | 24 | @Override 25 | public boolean isEmpty() { 26 | return map.isEmpty(); 27 | } 28 | 29 | @Override 30 | public boolean containsKey(K key) { 31 | return map.containsKey(key); 32 | } 33 | 34 | @Override 35 | public Set<K> keySet() { 36 | return map.keySet(); 37 | } 38 | 39 | @Override 40 | public Collection<Set<V>> values() { 41 | return map.values(); 42 | } 43 | 44 | @Override 45 | public Collection<V> put(K key, V value) { 46 | Set<V> set = map.get(key); 47 | if (set == null) { 48 | set = new TreeSet<>(); 49 | } 50 | set.add(value); 51 | return map.put(key, set); 52 | } 53 | 54 | @Override 55 | public void putAll(K key, Collection<V> values) { 56 | Set<V> set = map.get(key); 57 | if (set == null) { 58 | set = new LinkedHashSet<>(); 59 | map.put(key, set); 60 | } 61 | set.addAll(values); 62 | } 63 | 64 | @Override 65 | public Collection<V> get(K key) { 66 | return map.get(key); 67 | } 68 | 69 | @Override 70 | public Set<V> remove(K key) { 71 | return map.remove(key); 72 | } 73 | 74 | @Override 75 | public Set<V> remove(K key, V value) { 76 | Set<V> set = map.get(key); 77 | if (set != null) { 78 | set.remove(value); 79 | } 80 | return set; 81 | } 82 | 83 | @Override 84 | public boolean equals(Object obj) { 85 | return obj instanceof TreeMultiMap && map.equals(((TreeMultiMap<?, ?>) obj).map); 86 | } 87 | 88 |
@Override 89 | public int hashCode() { 90 | return map.hashCode(); 91 | } 92 | 93 | @Override 94 | public String toString() { 95 | return map.toString(); 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/common/decompound/patricia/LFUCacheTest.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.common.decompound.patricia; 2 | 3 | import com.carrotsearch.randomizedtesting.annotations.SuppressForbidden; 4 | import org.junit.Test; 5 | import org.xbib.elasticsearch.plugin.bundle.common.decompound.patricia.LFUCache; 6 | 7 | import static org.junit.Assert.assertEquals; 8 | 9 | public class LFUCacheTest { 10 | 11 | @SuppressForbidden(value = "execute this to test LFU cache") 12 | @Test 13 | public void testCache() { 14 | LFUCache<Integer, Integer> cache = new LFUCache<>(100, 0.90f); 15 | for (int i = 0; i < 500; i++) { 16 | cache.computeIfAbsent(i, f -> f % 2); 17 | } 18 | assertEquals(50, cache.size()); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/baseform/DictionaryTest.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.analysis.baseform; 2 | 3 | import org.apache.lucene.util.SuppressForbidden; 4 | import org.xbib.elasticsearch.plugin.bundle.common.fsa.Dictionary; 5 | 6 | import java.io.BufferedReader; 7 | import java.io.IOException; 8 | import java.io.InputStreamReader; 9 | import java.nio.charset.CharacterCodingException; 10 | import java.nio.charset.StandardCharsets; 11 | 12 | /** 13 | * Dictionary tests. 14 | */ 15 | public class DictionaryTest { 16 | 17 | @SuppressForbidden(reason = "accessing local resources from classpath") 18 | public void testVerifyDE() throws IOException { 19 | Dictionary dictionary = new Dictionary(); 20 | InputStreamReader reader = new InputStreamReader(getClass().getResourceAsStream("de-lemma-utf8.txt"), 21 | StandardCharsets.UTF_8); 22 | dictionary.loadLines(reader); 23 | reader.close(); 24 | BufferedReader br = new BufferedReader(new InputStreamReader(getClass().getResourceAsStream("de-lemma-utf8.txt"), 25 | StandardCharsets.UTF_8)); 26 | String line; 27 | while ((line = br.readLine()) != null) { 28 | if (!line.startsWith("#")) { 29 | if (!check(line, dictionary)) { 30 | break; 31 | } 32 | } 33 | } 34 | br.close(); 35 | } 36 | 37 | private boolean check(String line, Dictionary dictionary) throws CharacterCodingException { 38 | int pos = line.indexOf("\t"); 39 | String word = pos > 0 ?
line.substring(0, pos) : line; 40 | try { 41 | CharSequence baseform = dictionary.lookup(word); 42 | } catch (StackOverflowError e) { 43 | // if stack overflow error occurs, we have faulty entries 44 | return false; 45 | } 46 | return true; 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/concat/ConcatTokenFilterTests.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.analysis.concat; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.elasticsearch.common.settings.Settings; 5 | import org.elasticsearch.index.Index; 6 | import org.elasticsearch.test.ESTestCase; 7 | import org.elasticsearch.test.ESTokenStreamTestCase; 8 | import org.xbib.elasticsearch.plugin.bundle.BundlePlugin; 9 | 10 | /** 11 | * Concat token filter tests. 12 | */ 13 | public class ConcatTokenFilterTests extends ESTokenStreamTestCase { 14 | 15 | public void testConcat() throws Exception { 16 | String source = "Das ist ein Schlüsselwort, ein Bindestrichwort"; 17 | String[] expected = { 18 | "Das ist ein Schlüsselwort ein Bindestrichwort" 19 | }; 20 | String resource = "concat_analysis.json"; 21 | Settings settings = Settings.builder() 22 | .loadFromStream(resource, getClass().getResourceAsStream(resource), true) 23 | .build(); 24 | ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"), 25 | settings, 26 | new BundlePlugin(Settings.EMPTY)); 27 | Analyzer analyzer = analysis.indexAnalyzers.get("concat"); 28 | assertNotNull(analyzer); 29 | assertTokenStreamContents(analyzer.tokenStream("test-field", source), expected); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/decompound/fst/FstDecompoundTokenFilterTests.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.analysis.decompound.fst; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.elasticsearch.analysis.common.CommonAnalysisPlugin; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.index.Index; 7 | import org.elasticsearch.test.ESTestCase; 8 | import org.elasticsearch.test.ESTokenStreamTestCase; 9 | import org.xbib.elasticsearch.plugin.bundle.BundlePlugin; 10 | 11 | /** 12 | * Finite state transducer decompound token filter tests. 
13 | */ 14 | public class FstDecompoundTokenFilterTests extends ESTokenStreamTestCase { 15 | 16 | public void testDecompound() throws Exception { 17 | 18 | String source = "Die Jahresfeier der Rechtsanwaltskanzleien auf dem Donaudampfschiff hat viel Ökosteuer gekostet"; 19 | 20 | String[] expected = { 21 | "Die", 22 | "Jahresfeier", 23 | "jahres", 24 | "feier", 25 | "der", 26 | "Rechtsanwaltskanzleien", 27 | "rechts", 28 | "anwalts", 29 | "kanzleien", 30 | "auf", 31 | "dem", 32 | "Donaudampfschiff", 33 | "donau", 34 | "dampf", 35 | "schiff", 36 | "hat", 37 | "viel", 38 | "Ökosteuer", 39 | "ökos", 40 | "teuer", 41 | "gekostet" 42 | }; 43 | 44 | Settings settings = Settings.builder() 45 | .put("index.analysis.analyzer.myanalyzer.type", "custom") 46 | .put("index.analysis.analyzer.myanalyzer.tokenizer", "standard") 47 | .put("index.analysis.analyzer.myanalyzer.filter.0", "fst_decompound") 48 | .put("index.analysis.analyzer.myanalyzer.filter.1", "unique") 49 | .build(); 50 | ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"), 51 | settings, 52 | new BundlePlugin(Settings.EMPTY), new CommonAnalysisPlugin()); 53 | Analyzer myanalyzer = analysis.indexAnalyzers.get("myanalyzer"); 54 | assertAnalyzesTo(myanalyzer, source, expected); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/german/GermanNormalizationTests.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.analysis.german; 2 | 3 | import org.apache.lucene.analysis.Tokenizer; 4 | import org.elasticsearch.Version; 5 | import org.elasticsearch.cluster.metadata.IndexMetaData; 6 | import org.elasticsearch.common.settings.Settings; 7 | import org.elasticsearch.index.Index; 8 | import org.elasticsearch.index.analysis.TokenFilterFactory; 9 | import org.elasticsearch.test.ESTestCase; 10 | import org.elasticsearch.test.ESTokenStreamTestCase; 11 | import org.xbib.elasticsearch.plugin.bundle.BundlePlugin; 12 | 13 | import java.io.IOException; 14 | import java.io.StringReader; 15 | 16 | /** 17 | * German normalization tests. 
18 | */ 19 | public class GermanNormalizationTests extends ESTokenStreamTestCase { 20 | 21 | public void testGerman1() throws IOException { 22 | 23 | String source = "Ein schöner Tag in Köln im Café an der Straßenecke"; 24 | 25 | String[] expected = { 26 | "Ein", 27 | "schoner", 28 | "Tag", 29 | "in", 30 | "Koln", 31 | "im", 32 | "Café", 33 | "an", 34 | "der", 35 | "Strassenecke" 36 | }; 37 | String resource = "german_normalization_analysis.json"; 38 | Settings settings = Settings.builder() 39 | .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) 40 | .put("path.home", System.getProperty("path.home")) 41 | .loadFromStream(resource, getClass().getResourceAsStream(resource), true) 42 | .build(); 43 | ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"), 44 | settings, 45 | new BundlePlugin(Settings.EMPTY)); 46 | 47 | TokenFilterFactory tokenFilter = analysis.tokenFilter.get("umlaut"); 48 | Tokenizer tokenizer = analysis.tokenizer.get("standard").create(); 49 | tokenizer.setReader(new StringReader(source)); 50 | assertTokenStreamContents(tokenFilter.create(tokenizer), expected); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/IcuAnalysisTests.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.analysis.icu; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.elasticsearch.common.settings.Settings; 5 | import org.elasticsearch.index.Index; 6 | import org.elasticsearch.index.analysis.CharFilterFactory; 7 | import org.elasticsearch.index.analysis.NamedAnalyzer; 8 | import org.elasticsearch.index.analysis.TokenFilterFactory; 9 | import org.elasticsearch.index.analysis.TokenizerFactory; 10 | import org.elasticsearch.test.ESTestCase; 11 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.segmentation.IcuTokenizerFactory; 12 | import org.xbib.elasticsearch.plugin.bundle.BundlePlugin; 13 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.IcuFoldingTokenFilterFactory; 14 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.IcuNormalizerCharFilterFactory; 15 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.IcuNormalizerTokenFilterFactory; 16 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.IcuTransformTokenFilterFactory; 17 | 18 | import java.io.IOException; 19 | 20 | import static org.hamcrest.CoreMatchers.instanceOf; 21 | 22 | /** 23 | * ICU analysis tests 24 | */ 25 | public class IcuAnalysisTests extends ESTestCase { 26 | 27 | public void testDefaultsIcuAnalysis() throws IOException { 28 | 29 | TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), Settings.EMPTY, 30 | new BundlePlugin(Settings.EMPTY)); 31 | 32 | CharFilterFactory charFilterFactory = analysis.charFilter.get("icu_normalizer"); 33 | assertThat(charFilterFactory, instanceOf(IcuNormalizerCharFilterFactory.class)); 34 | 35 | TokenizerFactory tf = analysis.tokenizer.get("icu_tokenizer"); 36 | assertThat(tf, instanceOf(IcuTokenizerFactory.class)); 37 | 38 | TokenFilterFactory filterFactory = analysis.tokenFilter.get("icu_normalizer"); 39 | assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class)); 40 | 41 | filterFactory = analysis.tokenFilter.get("icu_folding"); 42 | assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class)); 43 | 44 | 
filterFactory = analysis.tokenFilter.get("icu_transform"); 45 | assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class)); 46 | 47 | Analyzer analyzer = analysis.indexAnalyzers.get("icu_collation"); 48 | assertThat(analyzer, instanceOf(NamedAnalyzer.class)); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/IcuClientYamlTestSuiteIT.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.analysis.icu; 2 | 3 | import com.carrotsearch.randomizedtesting.annotations.Name; 4 | import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; 5 | 6 | import org.elasticsearch.test.rest.yaml.ClientYamlTestCandidate; 7 | import org.elasticsearch.test.rest.yaml.ESClientYamlSuiteTestCase; 8 | 9 | public class IcuClientYamlTestSuiteIT extends ESClientYamlSuiteTestCase { 10 | 11 | public IcuClientYamlTestSuiteIT(@Name("yaml") ClientYamlTestCandidate testCandidate) { 12 | super(testCandidate); 13 | } 14 | 15 | @ParametersFactory 16 | public static Iterable<Object[]> parameters() throws Exception { 17 | return ESClientYamlSuiteTestCase.createParameters(); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/IcuNormalizerFilterTests.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.analysis.icu; 2 | 3 | import com.ibm.icu.text.Normalizer2; 4 | import org.apache.lucene.analysis.Analyzer; 5 | import org.apache.lucene.analysis.MockTokenizer; 6 | import org.apache.lucene.analysis.Tokenizer; 7 | import org.apache.lucene.analysis.core.KeywordTokenizer; 8 | import org.elasticsearch.test.ESTokenStreamTestCase; 9 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.IcuNormalizerFilter; 10 | 11 | /** 12 | * ICU normalizer filter tests.
13 | */ 14 | public class IcuNormalizerFilterTests extends ESTokenStreamTestCase { 15 | 16 | public void testDefaults() throws Exception { 17 | Analyzer a = new Analyzer() { 18 | @Override 19 | public TokenStreamComponents createComponents(String fieldName) { 20 | Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); 21 | return new TokenStreamComponents(tokenizer, 22 | new IcuNormalizerFilter(tokenizer, 23 | Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE))); 24 | } 25 | }; 26 | assertAnalyzesTo(a, "This is a test", new String[] { "this", "is", "a", "test" }); 27 | assertAnalyzesTo(a, "Ruß", new String[] { "russ" }); 28 | assertAnalyzesTo(a, "ΜΆΪΟΣ", new String[] { "μάϊοσ" }); 29 | assertAnalyzesTo(a, "Μάϊος", new String[] { "μάϊοσ" }); 30 | assertAnalyzesTo(a, "𐐖", new String[] { "𐐾" }); 31 | assertAnalyzesTo(a, "ﴳﴺﰧ", new String[] { "طمطمطم" }); 32 | assertAnalyzesTo(a, "क्‍ष", new String[] { "क्ष" }); 33 | a.close(); 34 | } 35 | 36 | public void testAlternate() throws Exception { 37 | Analyzer a = new Analyzer() { 38 | @Override 39 | public TokenStreamComponents createComponents(String fieldName) { 40 | Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); 41 | return new TokenStreamComponents(tokenizer, new IcuNormalizerFilter( 42 | tokenizer, 43 | Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE))); 44 | } 45 | }; 46 | assertAnalyzesTo(a, "\u00E9", new String[] { "\u0065\u0301" }); 47 | a.close(); 48 | } 49 | 50 | public void testEmptyTerm() throws Exception { 51 | Analyzer a = new Analyzer() { 52 | @Override 53 | protected TokenStreamComponents createComponents(String fieldName) { 54 | Tokenizer tokenizer = new KeywordTokenizer(); 55 | return new TokenStreamComponents(tokenizer, 56 | new IcuNormalizerFilter(tokenizer, 57 | Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE))); 58 | } 59 | }; 60 | checkOneTerm(a, "", ""); 61 | a.close(); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/segmentation/CharArrayIteratorTests.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.analysis.icu.segmentation; 2 | 3 | import org.elasticsearch.test.ESTestCase; 4 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.segmentation.CharArrayIterator; 5 | 6 | import java.text.CharacterIterator; 7 | 8 | /** 9 | * Char array iterator tests. 
10 | */ 11 | public class CharArrayIteratorTests extends ESTestCase { 12 | 13 | public void testBasicUsage() { 14 | CharArrayIterator ci = new CharArrayIterator(); 15 | ci.setText("testing".toCharArray(), 0, "testing".length()); 16 | assertEquals(0, ci.getBeginIndex()); 17 | assertEquals(7, ci.getEndIndex()); 18 | assertEquals(0, ci.getIndex()); 19 | assertEquals('t', ci.current()); 20 | assertEquals('e', ci.next()); 21 | assertEquals('g', ci.last()); 22 | assertEquals('n', ci.previous()); 23 | assertEquals('t', ci.first()); 24 | assertEquals(CharacterIterator.DONE, ci.previous()); 25 | } 26 | 27 | public void testFirst() { 28 | CharArrayIterator ci = new CharArrayIterator(); 29 | ci.setText("testing".toCharArray(), 0, "testing".length()); 30 | ci.next(); 31 | assertEquals('t', ci.first()); 32 | assertEquals(ci.getBeginIndex(), ci.getIndex()); 33 | ci.setText(new char[] {}, 0, 0); 34 | assertEquals(CharacterIterator.DONE, ci.first()); 35 | } 36 | 37 | public void testLast() { 38 | CharArrayIterator ci = new CharArrayIterator(); 39 | ci.setText("testing".toCharArray(), 0, "testing".length()); 40 | assertEquals('g', ci.last()); 41 | assertEquals(ci.getIndex(), ci.getEndIndex() - 1); 42 | ci.setText(new char[] {}, 0, 0); 43 | assertEquals(CharacterIterator.DONE, ci.last()); 44 | assertEquals(ci.getEndIndex(), ci.getIndex()); 45 | } 46 | 47 | public void testCurrent() { 48 | CharArrayIterator ci = new CharArrayIterator(); 49 | ci.setText("testing".toCharArray(), 0, "testing".length()); 50 | assertEquals('t', ci.current()); 51 | ci.last(); 52 | ci.next(); 53 | assertEquals(CharacterIterator.DONE, ci.current()); 54 | } 55 | 56 | public void testNext() { 57 | CharArrayIterator ci = new CharArrayIterator(); 58 | ci.setText("te".toCharArray(), 0, 2); 59 | assertEquals('e', ci.next()); 60 | assertEquals(1, ci.getIndex()); 61 | assertEquals(CharacterIterator.DONE, ci.next()); 62 | assertEquals(ci.getEndIndex(), ci.getIndex()); 63 | } 64 | 65 | /*public void testSetIndex() { 66 | CharArrayIterator ci = new CharArrayIterator(); 67 | ci.setText("test".toCharArray(), 0, "test".length()); 68 | ci.setIndex(5); 69 | }*/ 70 | 71 | public void testClone() { 72 | char text[] = "testing".toCharArray(); 73 | CharArrayIterator ci = new CharArrayIterator(); 74 | ci.setText(text, 0, text.length); 75 | ci.next(); 76 | CharArrayIterator ci2 = ci.clone(); 77 | assertEquals(ci.getIndex(), ci2.getIndex()); 78 | assertEquals(ci.next(), ci2.next()); 79 | assertEquals(ci.last(), ci2.last()); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/segmentation/IcuTokenizerCJKTests.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.analysis.icu.segmentation; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.apache.lucene.util.AttributeFactory; 5 | import org.elasticsearch.test.ESTokenStreamTestCase; 6 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.segmentation.DefaultIcuTokenizerConfig; 7 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.segmentation.IcuTokenizer; 8 | 9 | /** 10 | * ICU tokenizer CJK tests. 
11 | */ 12 | public class IcuTokenizerCJKTests extends ESTokenStreamTestCase { 13 | 14 | private static Analyzer create() { 15 | return new Analyzer() { 16 | @Override 17 | protected TokenStreamComponents createComponents(String fieldName) { 18 | return new TokenStreamComponents(new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, 19 | new DefaultIcuTokenizerConfig(true, true))); 20 | } 21 | }; 22 | } 23 | 24 | public static void destroyAnalyzer(Analyzer a) { 25 | a.close(); 26 | } 27 | 28 | public void testSimpleChinese() throws Exception { 29 | Analyzer a = create(); 30 | assertAnalyzesTo(a, "我购买了道具和服装。", 31 | new String[] { "我", "购买", "了", "道具", "和", "服装" } 32 | ); 33 | destroyAnalyzer(a); 34 | } 35 | 36 | public void testChineseNumerics() throws Exception { 37 | Analyzer a = create(); 38 | assertAnalyzesTo(a, "9483", new String[] { "9483" }); 39 | assertAnalyzesTo(a, "院內分機9483。", 40 | new String[] { "院", "內", "分機", "9483" }); 41 | assertAnalyzesTo(a, "院內分機9483。", 42 | new String[] { "院", "內", "分機", "9483" }); 43 | destroyAnalyzer(a); 44 | } 45 | 46 | public void testSimpleJapanese() throws Exception { 47 | Analyzer a = create(); 48 | assertAnalyzesTo(a, "それはまだ実験段階にあります", 49 | new String[] { "それ", "は", "まだ", "実験", "段階", "に", "あり", "ます" } 50 | ); 51 | destroyAnalyzer(a); 52 | } 53 | 54 | public void testJapaneseTypes() throws Exception { 55 | Analyzer a = create(); 56 | assertAnalyzesTo(a, "仮名遣い カタカナ", 57 | new String[] { "仮名遣い", "カタカナ" }, 58 | new String[] { "<IDEOGRAPHIC>", "<KATAKANA>" }); 59 | destroyAnalyzer(a); 60 | } 61 | 62 | public void testKorean() throws Exception { 63 | Analyzer a = create(); 64 | // Korean words 65 | assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"}); 66 | destroyAnalyzer(a); 67 | } 68 | 69 | public void testKoreanTypes() throws Exception { 70 | Analyzer a = create(); 71 | assertAnalyzesTo(a, "훈민정음", new String[] { "훈민정음" }, new String[] { "<HANGUL>" }); 72 | destroyAnalyzer(a); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/tools/RBBIRuleCompilerTest.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.analysis.icu.tools; 2 | 3 | import com.carrotsearch.randomizedtesting.annotations.SuppressForbidden; 4 | import org.junit.Test; 5 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.tools.RBBIRuleCompiler; 6 | 7 | import java.io.IOException; 8 | import java.io.InputStream; 9 | import java.io.OutputStream; 10 | import java.nio.file.Files; 11 | import java.nio.file.Paths; 12 | 13 | /** 14 | * RBBI rule compiler test. 
15 | */ 16 | public class RBBIRuleCompilerTest { 17 | 18 | @SuppressForbidden(value = "execute this test to create brk files") 19 | @Test 20 | public void testRBBICompile() throws IOException { 21 | RBBIRuleCompiler rbbiRuleCompiler = new RBBIRuleCompiler(); 22 | String[] rbbis = { 23 | "/icu/Default.rbbi", 24 | "/icu/KeywordTokenizer.rbbi", 25 | "/icu/Latin-break-only-on-whitespace.rbbi", 26 | "/icu/Latin-dont-break-on-hyphens.rbbi", 27 | "/icu/MyanmarSyllable.rbbi" 28 | }; 29 | for (String rbbi : rbbis) { 30 | InputStream inputStream = getClass().getResourceAsStream(rbbi); 31 | int pos1 = rbbi.lastIndexOf("/"); 32 | int pos2 = rbbi.lastIndexOf(".rbbi"); 33 | String basename = rbbi.substring(pos1, pos2); 34 | System.err.println(basename); 35 | OutputStream outputStream = Files.newOutputStream(Paths.get("build" + basename + ".brk")); 36 | rbbiRuleCompiler.compile(inputStream, outputStream); 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/tools/UTR30DataFileGeneratorTest.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.analysis.icu.tools; 2 | 3 | import com.carrotsearch.randomizedtesting.annotations.SuppressForbidden; 4 | import org.junit.Test; 5 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.tools.UTR30DataFileGenerator; 6 | 7 | /** 8 | * UTR-30 data file generator test. 9 | */ 10 | public class UTR30DataFileGeneratorTest { 11 | 12 | @SuppressForbidden(value = "execute this test to download utr30 files") 13 | @Test 14 | public void generateUTR30() throws Exception { 15 | UTR30DataFileGenerator utr30DataFileGenerator = new UTR30DataFileGenerator(); 16 | utr30DataFileGenerator.execute("release-62-1", "build/"); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/DetectLanguageTests.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.mapper.langdetect; 2 | 3 | import org.elasticsearch.common.io.Streams; 4 | import org.elasticsearch.test.ESTestCase; 5 | import org.xbib.elasticsearch.plugin.bundle.common.langdetect.LangdetectService; 6 | 7 | import java.io.InputStreamReader; 8 | import java.io.Reader; 9 | import java.io.StringWriter; 10 | import java.io.Writer; 11 | import java.nio.charset.StandardCharsets; 12 | 13 | public class DetectLanguageTests extends ESTestCase { 14 | 15 | public void testEnglish() throws Exception { 16 | testLanguage("english.txt", "en"); 17 | } 18 | 19 | public void testChinese() throws Exception { 20 | testLanguage("chinese.txt", "zh-cn"); 21 | } 22 | 23 | public void testJapanese() throws Exception { 24 | testLanguage("japanese.txt", "ja"); 25 | } 26 | 27 | public void testKorean() throws Exception { 28 | testLanguage("korean.txt", "ko"); 29 | } 30 | 31 | private void testLanguage(String path, String lang) throws Exception { 32 | Reader reader = new InputStreamReader(getClass().getResourceAsStream(path), StandardCharsets.UTF_8); 33 | Writer writer = new StringWriter(); 34 | Streams.copy(reader, writer); 35 | reader.close(); 36 | writer.close(); 37 | LangdetectService detect = new LangdetectService(); 38 | assertEquals(lang, detect.detectAll(writer.toString()).get(0).getLanguage()); 39 | } 40 | 41 | }
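
The langdetect tests above and below exercise LangdetectService directly. A minimal standalone sketch of that API, based only on the calls the tests make (no-arg constructor, detectAll(), Language.getLanguage() and getProbability()); the class name LangdetectExample is hypothetical and not part of the repository:

import org.xbib.elasticsearch.plugin.bundle.common.langdetect.LangdetectService;
import org.xbib.elasticsearch.plugin.bundle.common.langdetect.Language;

public class LangdetectExample { // hypothetical example class, not in this repository
    public static void main(String[] args) throws Exception {
        // The no-arg constructor loads the bundled default language profiles,
        // as in DetectLanguageTests above and SimpleDetectorTests below.
        LangdetectService service = new LangdetectService();
        // detectAll() returns candidate languages; the tests read the best match at index 0.
        for (Language language : service.detectAll("This is a very small test")) {
            System.out.println(language.getLanguage() + " " + language.getProbability());
        }
    }
}
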
42 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/DetectorTests.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.mapper.langdetect; 2 | 3 | import org.elasticsearch.common.settings.Settings; 4 | import org.elasticsearch.test.ESTestCase; 5 | 6 | import org.xbib.elasticsearch.plugin.bundle.common.langdetect.LangProfile; 7 | import org.xbib.elasticsearch.plugin.bundle.common.langdetect.LangdetectService; 8 | 9 | /** 10 | * Detector test. 11 | */ 12 | public class DetectorTests extends ESTestCase { 13 | 14 | private static final String TRAINING_EN = "a a a b b c c d e"; 15 | 16 | private static final String TRAINING_FR = "a b b c c c d d d"; 17 | 18 | private static final String TRAINING_JA = "\u3042 \u3042 \u3042 \u3044 \u3046 \u3048 \u3048"; 19 | 20 | public static LangdetectService create() throws Exception { 21 | LangdetectService detect = new LangdetectService(Settings.EMPTY); 22 | LangProfile profile_en = new LangProfile(); 23 | profile_en.setName("en_test"); 24 | for (String w : TRAINING_EN.split(" ")) { 25 | profile_en.add(w); 26 | } 27 | detect.addProfile(profile_en, 0, 3); 28 | LangProfile profile_fr = new LangProfile(); 29 | profile_fr.setName("fr_test"); 30 | for (String w : TRAINING_FR.split(" ")) { 31 | profile_fr.add(w); 32 | } 33 | detect.addProfile(profile_fr, 1, 3); 34 | LangProfile profile_ja = new LangProfile(); 35 | profile_ja.setName("ja_test"); 36 | for (String w : TRAINING_JA.split(" ")) { 37 | profile_ja.add(w); 38 | } 39 | detect.addProfile(profile_ja, 2, 3); 40 | return detect; 41 | } 42 | 43 | public void testDetector1() throws Exception { 44 | LangdetectService detect = create(); 45 | assertEquals(detect.detectAll("a").get(0).getLanguage(), "en_test"); 46 | } 47 | 48 | public void testDetector2() throws Exception { 49 | LangdetectService detect = create(); 50 | assertEquals(detect.detectAll("b d").get(0).getLanguage(), "fr_test"); 51 | } 52 | 53 | public void testDetector3() throws Exception { 54 | LangdetectService detect = create(); 55 | assertEquals(detect.detectAll("d e").get(0).getLanguage(), "en_test"); 56 | } 57 | 58 | public void testDetector4() throws Exception { 59 | LangdetectService detect = create(); 60 | assertEquals(detect.detectAll("\u3042\u3042\u3042\u3042a").get(0).getLanguage(), "ja_test"); 61 | } 62 | 63 | public void testPunctuation() throws Exception { 64 | LangdetectService detect = create(); 65 | assertTrue(detect.detectAll("...").isEmpty()); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/LangProfileTests.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.mapper.langdetect; 2 | 3 | import org.elasticsearch.test.ESTestCase; 4 | import org.xbib.elasticsearch.plugin.bundle.common.langdetect.LangProfile; 5 | 6 | public class LangProfileTests extends ESTestCase { 7 | 8 | public final void testLangProfile() { 9 | LangProfile profile = new LangProfile(); 10 | assertEquals(profile.getName(), null); 11 | } 12 | 13 | public final void testLangProfileStringInt() { 14 | LangProfile profile = new LangProfile(); 15 | profile.setName("en"); 16 | assertEquals(profile.getName(), "en"); 17 | } 18 | 19 | public final void 
testAdd() { 20 | LangProfile profile = new LangProfile(); 21 | profile.setName("en"); 22 | profile.add("a"); 23 | assertEquals((int) profile.getFreq().get("a"), 1); 24 | profile.add("a"); 25 | assertEquals((int) profile.getFreq().get("a"), 2); 26 | //profile.omitLessFreq(); 27 | } 28 | 29 | public final void testAddIllegally1() { 30 | LangProfile profile = new LangProfile(); 31 | profile.add("a"); 32 | assertEquals(profile.getFreq().get("a"), null); 33 | } 34 | 35 | public final void testAddIllegally2() { 36 | LangProfile profile = new LangProfile(); 37 | profile.setName("en"); 38 | profile.add("a"); 39 | profile.add(""); 40 | profile.add("abcd"); 41 | assertEquals((int) profile.getFreq().get("a"), 1); 42 | assertEquals(profile.getFreq().get(""), null); 43 | assertEquals(profile.getFreq().get("abcd"), null); 44 | } 45 | 46 | public final void testOmitLessFreq() { 47 | LangProfile profile = new LangProfile(); 48 | profile.setName("en"); 49 | String[] grams = "a b c \u3042 \u3044 \u3046 \u3048 \u304a \u304b \u304c \u304d \u304e \u304f".split(" "); 50 | for (int i = 0; i < 5; ++i) { 51 | for (String g : grams) { 52 | profile.add(g); 53 | } 54 | } 55 | profile.add("\u3050"); 56 | 57 | assertEquals((int) profile.getFreq().get("a"), 5); 58 | assertEquals((int) profile.getFreq().get("\u3042"), 5); 59 | assertEquals((int) profile.getFreq().get("\u3050"), 1); 60 | //profile.omitLessFreq(); 61 | //assertEquals(profile.freq.get("a"), null); 62 | //assertEquals((int) profile.freq.get("\u3042"), 5); 63 | //assertEquals(profile.freq.get("\u3050"), null); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/LanguageTests.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.mapper.langdetect; 2 | 3 | import org.elasticsearch.test.ESTestCase; 4 | import org.xbib.elasticsearch.plugin.bundle.common.langdetect.Language; 5 | 6 | public class LanguageTests extends ESTestCase { 7 | 8 | public final void testLanguage() { 9 | Language lang = new Language(null, 0); 10 | assertEquals(lang.getLanguage(), null); 11 | assertEquals(lang.getProbability(), 0.0, 0.0001); 12 | assertEquals(lang.getLanguage(), null); 13 | 14 | Language lang2 = new Language("en", 1.0); 15 | assertEquals(lang2.getLanguage(), "en"); 16 | assertEquals(lang2.getProbability(), 1.0, 0.0001); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/SimpleDetectorTests.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.mapper.langdetect; 2 | 3 | import org.elasticsearch.test.ESTestCase; 4 | import org.xbib.elasticsearch.plugin.bundle.common.langdetect.LangdetectService; 5 | 6 | public class SimpleDetectorTests extends ESTestCase { 7 | 8 | public void testDetector() throws Exception { 9 | LangdetectService detect = new LangdetectService(); 10 | assertEquals("de", detect.detectAll("Das kann deutsch sein").get(0).getLanguage()); 11 | assertEquals("en", detect.detectAll("This is a very small test").get(0).getLanguage()); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/test/resources/log4j2.xml: 
-------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/concat/concat_analysis.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "analyzer" : { 5 | "concat" : { 6 | "type" : "custom", 7 | "tokenizer" : "standard", 8 | "filter" : [ "concat" ] 9 | } 10 | } 11 | } 12 | } 13 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/decompound/fst/decompound_analysis.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "filter":{ 5 | "decomp":{ 6 | "type":"fst_decompound" 7 | } 8 | }, 9 | "analyzer" : { 10 | "decomp" : { 11 | "type": "custom", 12 | "filter" : ["decomp", "unique" ] 13 | } 14 | } 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/decompound/patricia/decompound_analysis.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "filter":{ 5 | "decomp":{ 6 | "type":"decompound" 7 | } 8 | }, 9 | "tokenizer" : { 10 | "decomp" : { 11 | "type":"standard", 12 | "filter" : "decomp" 13 | } 14 | } 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/decompound/patricia/keywords_analysis.json: -------------------------------------------------------------------------------- 1 | { 2 | "index": { 3 | "analysis": { 4 | "analyzer": { 5 | "decompounding_default": { 6 | "tokenizer": "decomp", 7 | "filter": [ 8 | "keywords", 9 | "decomp" 10 | ], 11 | "type": "custom" 12 | }, 13 | "with_keywords": { 14 | "tokenizer": "decomp", 15 | "filter": [ 16 | "keywords", 17 | "decomp_with_keywords" 18 | ], 19 | "type": "custom" 20 | }, 21 | "with_keywords_disabled": { 22 | "tokenizer": "decomp", 23 | "filter": [ 24 | "keywords", 25 | "decomp_with_keywords_disabled" 26 | ], 27 | "type": "custom" 28 | }, 29 | "with_subwords_only": { 30 | "tokenizer": "decomp", 31 | "filter": [ 32 | "decomp_subwords_only" 33 | ], 34 | "type": "custom" 35 | } 36 | }, 37 | "filter": { 38 | "keywords": { 39 | "type": "keyword_marker", 40 | "keywords": [ 41 | "Schlüsselwort" 42 | ] 43 | }, 44 | "decomp": { 45 | "type": "decompound" 46 | }, 47 | "decomp_with_keywords": { 48 | "type": "decompound", 49 | "respect_keywords": true 50 | }, 51 | "decomp_with_keywords_disabled": { 52 | "type": "decompound", 53 | "respect_keywords": false 54 | }, 55 | "decomp_subwords_only" : { 56 | "type" : "decompound", 57 | "subwords_only" : true 58 | } 59 | }, 60 | "tokenizer": { 61 | "decomp": { 62 | "type": "standard", 63 | "filter": "decomp" 64 | } 65 | } 66 | } 67 | } 68 | } 69 | 70 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/document.json: -------------------------------------------------------------------------------- 1 | { 2 | "text" : "Hello World" 3 | }
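
The analysis fixtures above (concat_analysis.json, the two decompound_analysis.json variants, keywords_analysis.json) are consumed by the unit tests through the settings-loading pattern seen in GermanNormalizationTests. A condensed sketch of that pattern; the test class name is hypothetical, and concat_analysis.json stands in for any of the fixtures:

import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.test.ESTestCase;
import org.xbib.elasticsearch.plugin.bundle.BundlePlugin;

public class ConcatFixtureExample extends ESTestCase { // hypothetical test class
    public void testLoadFixture() throws Exception {
        // Load a JSON fixture from the classpath into index settings,
        // then build the analysis components with the plugin registered.
        String resource = "concat_analysis.json";
        Settings settings = Settings.builder()
                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
                .put("path.home", System.getProperty("path.home"))
                .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
                .build();
        TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"),
                settings, new BundlePlugin(Settings.EMPTY));
        // The plugin registers the "concat" filter referenced by the fixture.
        TokenFilterFactory concat = analysis.tokenFilter.get("concat");
        assertNotNull(concat);
    }
}
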
-------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/expansion/expansion_analysis.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "analyzer" : { 5 | "expansion" : { 6 | "type": "custom", 7 | "filter" : ["expansion", "unique" ] 8 | } 9 | } 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/german/german_normalization_analysis.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "filter":{ 5 | "umlaut":{ 6 | "type":"german_normalize" 7 | } 8 | }, 9 | "tokenizer" : { 10 | "umlaut" : { 11 | "type":"standard", 12 | "filter" : "umlaut" 13 | } 14 | } 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/hyphen/custom_hyphen_tokenizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "tokenizer" : { 5 | "my_hyphen_tokenizer" : { 6 | "type" : "hyphen", 7 | "hyphens": "." 8 | } 9 | } 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/hyphen/hyphen_analyzer.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "analyzer" : { 5 | "my_hyphen_analyzer" : { 6 | "type" : "hyphen" 7 | } 8 | } 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/hyphen/hyphen_tokenizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "tokenizer" : { 5 | "my_icu_tokenizer" : { 6 | "type" : "icu_tokenizer", 7 | "rulefiles" : "Latn:Latin-dont-break-on-hyphens.rbbi" 8 | }, 9 | "my_hyphen_tokenizer" : { 10 | "type" : "hyphen" 11 | } 12 | } 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/hyphen/hyphen_tokenizer_without_subwords.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "tokenizer" : { 5 | "my_hyphen_tokenizer" : { 6 | "type" : "hyphen" 7 | } 8 | }, 9 | "filter" : { 10 | "my_hyphen_tokenfilter" : { 11 | "type" : "hyphen", 12 | "subwords" : false 13 | } 14 | } 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/icu_collation.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "analyzer" : { 5 | "icu_german_collate" : { 6 | "type" : "icu_collation", 7 | "language" : "de", 8 | "country" : "DE", 9 | "strength" : "primary", 10 | "rules" : "& ae , a\u0308 & AE , A\u0308& oe , o\u0308 & OE , O\u0308& ue , u\u0308 & UE , u\u0308" 11 | }, 12 | "icu_german_collate_without_punct" : { 13 | "type" : "icu_collation", 14 | "language" : "de", 15 | 
"country" : "DE", 16 | "strength" : "quaternary", 17 | "alternate" : "shifted", 18 | "rules" : "& ae , a\u0308 & AE , A\u0308& oe , o\u0308 & OE , O\u0308& ue , u\u0308 & UE , u\u0308" 19 | }, 20 | "german_phonebook" : { 21 | "type" : "icu_collation", 22 | "locale" : "de@collation=phonebook", 23 | "strength" : "primary" 24 | }, 25 | "reorder" : { 26 | "type" : "icu_collation", 27 | "rules" : "& ae , a\u0308 & AE , A\u0308& oe , o\u0308 & OE , O\u0308& ue , u\u0308 & UE , u\u0308", 28 | "strength" : "tertiary", 29 | "reorder" : [ "Latn", "digit", "punctuation", "symbol", "currency", "others", "space"] 30 | } 31 | } 32 | } 33 | } 34 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/icu_folding.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "char_filter" : { 5 | "my_icu_folder" : { 6 | "type" : "icu_folding" 7 | } 8 | }, 9 | "tokenizer" : { 10 | "my_icu_tokenizer" : { 11 | "type" : "icu_tokenizer" 12 | } 13 | }, 14 | "filter" : { 15 | "my_icu_folder_filter" : { 16 | "type" : "icu_folding", 17 | "normalization_name" : "utr30" 18 | }, 19 | "my_icu_folder_filter_with_exceptions" : { 20 | "type" : "icu_folding", 21 | "normalization_name" : "utr30", 22 | "unicode_set_filter" : "[^åäöÅÄÖ]" 23 | } 24 | }, 25 | "analyzer" : { 26 | "my_icu_analyzer" : { 27 | "type" : "custom", 28 | "tokenizer" : "my_icu_tokenizer", 29 | "filter" : [ "my_icu_folder_filter" ] 30 | }, 31 | "my_icu_analyzer_with_exceptions" : { 32 | "type" : "custom", 33 | "tokenizer" : "my_icu_tokenizer", 34 | "filter" : [ "my_icu_folder_filter_with_exceptions" ] 35 | } 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/icu_normalize.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "char_filter" : { 5 | "my_icu_normalizer" : { 6 | "type" : "icu_normalizer", 7 | "normalization_name" : "utr30" 8 | }, 9 | "my_icu_normalizer_with_exceptions" : { 10 | "type" : "icu_normalizer", 11 | "normalization_name" : "utr30", 12 | "unicode_set_filter" : "[^åäöÅÄÖ]" 13 | } 14 | }, 15 | "tokenizer" : { 16 | "my_icu_tokenizer" : { 17 | "type" : "icu_tokenizer" 18 | } 19 | }, 20 | "analyzer" : { 21 | "my_icu_analyzer" : { 22 | "type" : "custom", 23 | "tokenizer" : "my_icu_tokenizer", 24 | "char_filter" : "my_icu_normalizer" 25 | }, 26 | "my_icu_analyzer_with_exceptions" : { 27 | "type" : "custom", 28 | "tokenizer" : "my_icu_tokenizer", 29 | "char_filter" : [ "my_icu_normalizer_with_exceptions" ] 30 | } 31 | } 32 | } 33 | } 34 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/icu_numberformat.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "filter" : { 5 | "spellout_de" : { 6 | "type" : "icu_numberformat", 7 | "locale" : "de", 8 | "format" : "spellout" 9 | }, 10 | "spellout_en" : { 11 | "type" : "icu_numberformat", 12 | "locale" : "en_US", 13 | "format" : "spellout" 14 | } 15 | }, 16 | "tokenizer" : { 17 | "my_tokenizer" : { 18 | "type" : "icu_tokenizer", 19 | "filter" : "spellout_de" 20 | } 21 | } 22 | } 23 | } 24 | } 
-------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/icu_tokenizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "tokenizer" : { 5 | "my_icu_tokenizer" : { 6 | "type" : "icu_tokenizer" 7 | }, 8 | "my_hyphen_icu_tokenizer" : { 9 | "type" : "icu_tokenizer", 10 | "rulefiles" : "Latn:Latin-dont-break-on-hyphens.rbbi" 11 | } 12 | } 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/icu_transform.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "filter" : { 5 | "my_icu_transformer_ch" : { 6 | "type" : "icu_transform", 7 | "id" : "Traditional-Simplified" 8 | }, 9 | "my_icu_transformer_han" : { 10 | "type" : "icu_transform", 11 | "id" : "Han-Latin" 12 | }, 13 | "my_icu_transformer_katakana" : { 14 | "type" : "icu_transform", 15 | "id" : "Katakana-Hiragana" 16 | }, 17 | "my_icu_transformer_cyr" : { 18 | "type" : "icu_transform", 19 | "id" : "Cyrillic-Latin" 20 | }, 21 | "my_icu_transformer_cyr_reverse" : { 22 | "type" : "icu_transform", 23 | "id" : "Cyrillic-Latin", 24 | "dir" : "reverse" 25 | }, 26 | "my_icu_transformer_any_latin" : { 27 | "type" : "icu_transform", 28 | "id" : "Any-Latin" 29 | }, 30 | "my_icu_transformer_nfd" : { 31 | "type" : "icu_transform", 32 | "id" : "NFD; [:Nonspacing Mark:] Remove" 33 | }, 34 | "my_icu_transformer_rules" : { 35 | "type" : "icu_transform", 36 | "id" : "test", 37 | "dir" : "forward", 38 | "rules" : "a > b; b > c;" 39 | } 40 | }, 41 | "tokenizer" : { 42 | "my_icu_tokenizer_ch" : { 43 | "type" : "icu_tokenizer", 44 | "filter" : [ "my_icu_transformer_ch" ] 45 | }, 46 | "my_icu_tokenizer_han" : { 47 | "type" : "icu_tokenizer", 48 | "filter" : [ "my_icu_transformer_han" ] 49 | }, 50 | "my_icu_tokenizer_katakana" : { 51 | "type" : "icu_tokenizer", 52 | "filter" : [ "my_icu_transformer_katakana" ] 53 | }, 54 | "my_icu_tokenizer_cyr" : { 55 | "type" : "icu_tokenizer", 56 | "filter" : [ "my_icu_transformer_cyr" ] 57 | }, 58 | "my_icu_tokenizer_any_latin" : { 59 | "type" : "icu_tokenizer", 60 | "filter" : [ "my_icu_transformer_any_latin" ] 61 | }, 62 | "my_icu_tokenizer_nfd" : { 63 | "type" : "icu_tokenizer", 64 | "filter" : [ "my_icu_transformer_nfd" ] 65 | }, 66 | "my_icu_tokenizer_rules" : { 67 | "type" : "icu_tokenizer", 68 | "filter" : [ "my_icu_transformer_rules" ] 69 | } 70 | } 71 | } 72 | } 73 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "date_detection": false, 3 | "properties": { 4 | "text": { 5 | "type": "text", 6 | "analyzer": "my_analyzer" 7 | } 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "index": { 3 | "analysis": { 4 | "tokenizer": { 5 | "my_hyphen_icu_tokenizer" : { 6 | "type" : "icu_tokenizer", 7 | "rulefiles" : "Latn:icu/Latin-dont-break-on-hyphens.rbbi" 8 | } 9 | }, 10 | "analyzer": { 11 | "default": { 12 
| "type": "keyword" 13 | }, 14 | "my_analyzer" : { 15 | "type" : "custom", 16 | "tokenizer" : "my_hyphen_icu_tokenizer" 17 | } 18 | } 19 | } 20 | } 21 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/sortform/sortform.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis": { 4 | "analyzer" : { 5 | "german_phonebook_with_sortform" : { 6 | "type" : "sortform", 7 | "language" : "de", 8 | "country" : "DE", 9 | "strength" : "quaternary", 10 | "alternate" : "shifted", 11 | "rules" : "& ae , a\u0308 & AE , A\u0308 & oe , o\u0308 & OE , O\u0308 & ue , u\u0308 & UE , u\u0308 & ss , \u00df", 12 | "filter" : [ 13 | "sortform" 14 | ], 15 | "char_filter" : [] 16 | } 17 | } 18 | } 19 | } 20 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/worddelimiter/worddelimiter.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "filter" : { 5 | "wd" : { 6 | "type" : "worddelimiter2", 7 | "generate_word_parts" : true, 8 | "generate_number_parts" : true, 9 | "catenate_all" : true, 10 | "split_on_case_change" : true, 11 | "split_on_numerics" : true, 12 | "stem_english_possessive" : true 13 | } 14 | } 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/base64-2-decoded.txt: -------------------------------------------------------------------------------- 1 | God Save the Queen (alternatively God Save the King) -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/base64-2-mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "someType" : { 3 | "properties": { 4 | "content": { 5 | "type": "text", 6 | "fields" : { 7 | "language": { 8 | "type": "langdetect", 9 | "binary": true 10 | } 11 | } 12 | } 13 | } 14 | } 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/base64-2.txt: -------------------------------------------------------------------------------- 1 | R29kIFNhdmUgdGhlIFF1ZWVuIChhbHRlcm5hdGl2ZWx5IEdvZCBTYXZlIHRoZSBLaW5nKQ== 2 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/base64-decoded.txt: -------------------------------------------------------------------------------- 1 | This is a very simple text in plain english 2 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/base64-mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "someType" : { 3 | "properties" : { 4 | "someField":{ 5 | "type" : "langdetect", 6 | "languages": [ "en", "fr", "de", "it", "es" ], 7 | "binary" : true 8 | } 9 | } 10 | } 11 | } 12 | -------------------------------------------------------------------------------- 
/src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/base64.txt: -------------------------------------------------------------------------------- 1 | VGhpcyBpcyBhIHZlcnkgc2ltcGxlIHRleHQgaW4gcGxhaW4gZW5nbGlzaAo= 2 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/chinese.txt: -------------------------------------------------------------------------------- 1 | 位于美国首都华盛顿都会圈的希望中文学校5日晚举办活动庆祝建立20周年。从中国大陆留学生为子女学中文而自发建立的学习班,到学生规模在全美名列前茅的中文学校,这个平台的发展也折射出美国的中文教育热度逐步提升。 2 | 希望中文学校是大华盛顿地区最大中文学校,现有7个校区逾4000名学生,规模在美国东部数一数二。不过,见证了希望中文学校20年发展的人们起初根本无法想象这个小小的中文教育平台能发展到今日之规模。 -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/english.txt: -------------------------------------------------------------------------------- 1 | This is a very small example of a text -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/german.txt: -------------------------------------------------------------------------------- 1 | Das ist ein kleiner Text als Beispiel -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/japanese.txt: -------------------------------------------------------------------------------- 1 | 1冊6,000円(雑費送料含む)で頒布いたしますので、ご希望の方は、氏名・送り先住所・電話番号・希望冊数をご記入頂き、書面(E-Mailも可)でお送りください。お支払いは郵送する折に振込用紙を同封しますので、その用紙にてお振込みをお願いいたします。 2 | 3 | ご注文お待ちしております。 -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/korean.txt: -------------------------------------------------------------------------------- 1 | 20조에 육박하는 사교육 시장을 자랑하는 대한민국. 정교사로 학교 강단에 서는 교육인력뿐 아니라 사교육 분야에서 지식을 전달하는 전문 교육 인력의 질도 나날이 제고되고 있다. 2 | 3 | 전문성을 가진 인력의 필요성 증가를 직시해 양질의 영어교사를 배출하겠다는 목적으로 서강대학교 외국어교육원이 특별한 강사 양성 과정을 마련해 영어 교육자로의 길을 모색하고 있는 많은 이들의 관심을 받고 있다. 4 | 5 | 서강대학교 영어교육원은 오는 5월 31일까지 ‘어린이 영어전문가 과정’ 수강생을 모집한다. 6월 24일부터 8월 16일까지 두 달여에 걸쳐 진행되는 이 과정은 온라인 선행학습에 오프라인 학습 8주가 더해진 후 TKT 본 시험응시로 마무리 된다. 
6 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/mapping-to-fields.json: -------------------------------------------------------------------------------- 1 | { 2 | "someType" : { 3 | "properties" : { 4 | "someField":{ 5 | "type" : "langdetect", 6 | "languages" : [ "de", "en", "fr", "nl", "it" ], 7 | "language_to" : { 8 | "de": "german_field", 9 | "en": "english_field" 10 | } 11 | }, 12 | "german_field" : { 13 | "analyzer" : "german", 14 | "type": "text" 15 | }, 16 | "english_field" : { 17 | "analyzer" : "english", 18 | "type" : "text" 19 | } 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "someType" : { 3 | "properties" : { 4 | "someField":{ 5 | "type" : "langdetect", 6 | "languages" : [ "de", "en", "fr", "nl", "it" ], 7 | "map" : { 8 | "de" : "Deutsch" 9 | } 10 | } 11 | } 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "index" : { 3 | "analysis": { 4 | "analyzer": { 5 | "default": { 6 | "type": "standard" 7 | } 8 | } 9 | } 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/short-text-mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "someType" : { 3 | "properties" : { 4 | "someField" : { 5 | "type" : "langdetect", 6 | "profile" : "shorttext" 7 | } 8 | } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/simple-mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "someType" : { 3 | "properties" : { 4 | "someField": { 5 | "type" : "langdetect", 6 | "languages" : [ "de", "en", "fr", "nl", "it" ] 7 | } 8 | } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/doc-simple-document.json: -------------------------------------------------------------------------------- 1 | { 2 | "dc" : { 3 | "creator": "first author name" 4 | }, 5 | "author" : { 6 | "authorID": "1" 7 | } 8 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/doc-simple-mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "doc": { 3 | "properties": { 4 | "author": { 5 | "properties": { 6 | "authorName": { 7 | "type": "text" 8 | }, 9 | "authorID": { 10 | "type": "ref", 11 | "ref_index": "ref", 12 | "ref_type": "ref", 13 | "ref_fields": [ 14 | "author" 15 | ], 16 | "copy_to": [ 17 | "dc.creator" 18 | ] 19 | } 20 | } 21 | }, 22 | "dc": { 23 | "properties": { 24 | "creator": { 25 | "type": "text" 26 | } 27 | } 28 | } 29 | } 30 | } 31 | } 32 | 
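
doc-simple-mapping.json above declares authorID as a ref field whose ref_fields values are copied into the dc.creator target. A conceptual sketch of that copy step in plain Java; this is a hypothetical illustration of the configured behavior, not the plugin's actual internals:

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;

public class RefCopySketch { // hypothetical illustration, not plugin code
    // For each configured ref_field, read its values from the referenced
    // document and append them to every copy_to target of the indexed document.
    static void copyRefFields(Map<String, List<String>> referencedDoc,
                              List<String> refFields,
                              List<String> copyTo,
                              Map<String, List<String>> targetDoc) {
        for (String refField : refFields) {
            List<String> values = referencedDoc.getOrDefault(refField, Collections.emptyList());
            for (String target : copyTo) {
                targetDoc.computeIfAbsent(target, k -> new ArrayList<>()).addAll(values);
            }
        }
    }
}

With doc-simple-document.json above (authorID "1") and the ref-simple-document.json fixture further below, this is the step that would make the referenced "author" value searchable under dc.creator alongside "first author name".
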
-------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/doc-simple-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "index": { 3 | "analysis": { 4 | "analyzer": { 5 | "default": { 6 | "type": "standard" 7 | } 8 | } 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/gnd-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "index": { 3 | "mapping" : { 4 | "total_fields" : { 5 | "limit": 10000 6 | } 7 | }, 8 | "analysis": { 9 | "analyzer": { 10 | "default": { 11 | "type": "keyword" 12 | } 13 | } 14 | } 15 | } 16 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/ref-doc-book.json: -------------------------------------------------------------------------------- 1 | { 2 | "title" : "A title", 3 | "authorID" : "1", 4 | "dc" : { 5 | "creator" : "A creator" 6 | }, 7 | "bib" : { 8 | "contributor" : "A contributor" 9 | } 10 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/ref-mapping-authorities.json: -------------------------------------------------------------------------------- 1 | { 2 | "docs" : { 3 | "properties" : { 4 | "authorID": { 5 | "type" : "ref", 6 | "ref_index" : "authorities", 7 | "ref_type" : "persons", 8 | "ref_fields" : [ "author" ], 9 | "copy_to" : [ 10 | "dc.creator", 11 | "bib.contributor" 12 | ] 13 | }, 14 | "dc" : { 15 | "properties" : { 16 | "creator" : { 17 | "type" : "text" 18 | } 19 | } 20 | }, 21 | "bib" : { 22 | "properties" : { 23 | "contributor" : { 24 | "type" : "text" 25 | } 26 | } 27 | } 28 | } 29 | } 30 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/ref-mapping-books-test.json: -------------------------------------------------------------------------------- 1 | { 2 | "test" : { 3 | "properties" : { 4 | "authorID": { 5 | "type" : "ref", 6 | "ref_index" : "authorities", 7 | "ref_type" : "persons", 8 | "ref_fields" : [ "author" ], 9 | "copy_to" : [ 10 | "dc.creator", 11 | "bib.contributor" 12 | ] 13 | }, 14 | "dc" : { 15 | "properties" : { 16 | "creator" : { 17 | "type" : "text" 18 | } 19 | } 20 | }, 21 | "bib" : { 22 | "properties" : { 23 | "contributor" : { 24 | "type" : "text" 25 | } 26 | } 27 | } 28 | } 29 | } 30 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/ref-mapping-from-id.json: -------------------------------------------------------------------------------- 1 | { 2 | "docs" : { 3 | "properties" : { 4 | "ref" : { 5 | "type" : "text" 6 | }, 7 | "authorID": { 8 | "type" : "ref", 9 | "ref_index" : "authorities", 10 | "ref_type" : "persons", 11 | "ref_fields" : [ "author" ], 12 | "copy_to" : [ 13 | "ref" 14 | ] 15 | } 16 | } 17 | } 18 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/ref-mapping-nested.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "nested" : { 3 | "properties": { 4 | "person": { 5 | "properties": { 6 | "authorName": { 7 | "type": "text" 8 | }, 9 | "authorID": { 10 | "type": "ref", 11 | "ref_index": "authorities", 12 | "ref_type": "persons", 13 | "ref_fields": [ 14 | "author" 15 | ], 16 | "copy_to": [ 17 | "dc.creator", 18 | "bib.contributor" 19 | ] 20 | } 21 | } 22 | }, 23 | "dc": { 24 | "properties": { 25 | "creator": { 26 | "type": "text" 27 | } 28 | } 29 | }, 30 | "bib": { 31 | "properties": { 32 | "contributor": { 33 | "type": "text" 34 | } 35 | } 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/ref-mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "some_type" : { 3 | "properties" : { 4 | "someField": { 5 | "type": "ref", 6 | "ref_index": "test", 7 | "ref_type": "test", 8 | "ref_fields": [ 9 | "myfield" 10 | ], 11 | "copy_to": [ 12 | "ref" 13 | ] 14 | }, 15 | "ref" : { 16 | "type" : "text" 17 | } 18 | } 19 | } 20 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/ref-simple-document.json: -------------------------------------------------------------------------------- 1 | { 2 | "author" : "second author name" 3 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/ref-simple-mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "ref" : { 3 | "properties": { 4 | "author": { 5 | "type": "text", 6 | "store" : true 7 | } 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/ref-simple-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "index": { 3 | "analysis": { 4 | "analyzer": { 5 | "default": { 6 | "type": "keyword" 7 | } 8 | } 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/title-document-1.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "u", 3 | "boost": "0.1", 4 | "xbib": [ 5 | { 6 | "uid": "(DE-605)HT007215476" 7 | }, 8 | { 9 | "uid": "(DE-605)008427902" 10 | }, 11 | { 12 | "identifier": [ 13 | "DE-61", 14 | "DE-385" 15 | ] 16 | } 17 | ], 18 | "RecordIdentifier": { 19 | "identifierForTheRecord": "(DE-605)HT007215476" 20 | }, 21 | "RecordIdentifierSuper": { 22 | "recordIdentifierSuper": "(DE-605)HT007215468" 23 | }, 24 | "RecordCodes": [ 25 | "Autopsie", 26 | "MAB-Zeichenvorrat", 27 | "Unicode / ISO 10646 (UTF 8)", 28 | "RAK-WB" 29 | ], 30 | "Language": { 31 | "languageSource": "ger", 32 | "language": "Deutsch" 33 | }, 34 | "VolumeDesignation": { 35 | "volumeDesignation": "3" 36 | }, 37 | "SortableVolumeDesignation": { 38 | "volumeDesignation": "3" 39 | }, 40 | "Person": [ 41 | { 42 | "personName": "Tucholsky, Kurt", 43 | "personBio": "1890-1935", 44 | "personIdentifier": "(DE-588)11862444X", 45 | "identifierGND": "11862444X" 46 | }, 47 | { 48 | "personName": 
"Gerold-Tucholsky, Mary", 49 | "personRole": "[Hrsg.]", 50 | "personIdentifier": "(DE-588)188272283", 51 | "identifierGND": "188272283" 52 | } 53 | ], 54 | "TitleStatement": [ 55 | { 56 | "titleMain": "1921 - 1924" 57 | }, 58 | { 59 | "titleMain": "Gesammelte Werke" 60 | } 61 | ], 62 | "TitleAddendum": { 63 | "title": "in 10 Bänden" 64 | }, 65 | "CreatorStatement": { 66 | "creatorStatement": "Kurt Tucholsky. Hrsg. von Mary Gerold-Tucholsky ..." 67 | }, 68 | "Edition": { 69 | "edition": "182. - 201. Tsd." 70 | }, 71 | "PublicationPlace": { 72 | "printingPlace": "Reinbek bei Hamburg" 73 | }, 74 | "PublisherName": { 75 | "printerName": "Rowohlt" 76 | }, 77 | "DateProper": { 78 | "date": "1995" 79 | }, 80 | "Extent": { 81 | "extent": "534 S." 82 | }, 83 | "RecordSystemNumber": { 84 | "systemNumber": "(DE-605)008427902" 85 | }, 86 | "dc": [ 87 | { 88 | "type": "keine Angabe" 89 | }, 90 | { 91 | "format": "keine Angabe" 92 | }, 93 | { 94 | "date": 1995 95 | }, 96 | { 97 | "language": "Deutsch" 98 | } 99 | ], 100 | "collection": "hbz Verbundkatalog" 101 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/standardnumber/mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "someType" : { 3 | "properties" : { 4 | "someField":{ 5 | "type" : "standardnumber", 6 | "standardnumbers" : "isbn" 7 | } 8 | } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/query/decompound/decompound_query.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index": { 4 | "number_of_shards": 1, 5 | "number_of_replicas": 0, 6 | "analysis": { 7 | "filter": { 8 | "decomp":{ 9 | "type" : "decompound", 10 | "use_payload": true, 11 | "use_cache": true 12 | } 13 | }, 14 | "analyzer": { 15 | "decomp": { 16 | "type": "custom", 17 | "tokenizer" : "standard", 18 | "filter" : [ 19 | "decomp", 20 | "lowercase" 21 | ] 22 | }, 23 | "lowercase": { 24 | "type": "custom", 25 | "tokenizer" : "standard", 26 | "filter" : [ 27 | "lowercase" 28 | ] 29 | } 30 | } 31 | } 32 | } 33 | }, 34 | "mappings": { 35 | "_doc": { 36 | "properties": { 37 | "text": { 38 | "type": "text", 39 | "analyzer": "decomp", 40 | "search_analyzer": "lowercase" 41 | } 42 | } 43 | } 44 | } 45 | } -------------------------------------------------------------------------------- /src/test/resources/rest-api-spec/test/analysis_icu/10_basic.yml: -------------------------------------------------------------------------------- 1 | # Integration tests for ICU analysis components 2 | # 3 | "Tokenizer": 4 | - do: 5 | indices.analyze: 6 | body: 7 | text: Foo Bar 8 | tokenizer: icu_tokenizer 9 | - length: { tokens: 2 } 10 | - match: { tokens.0.token: Foo } 11 | - match: { tokens.1.token: Bar } 12 | --- 13 | "Normalization filter": 14 | - do: 15 | indices.analyze: 16 | body: 17 | filter: [icu_normalizer] 18 | text: Foo Bar Ruß 19 | tokenizer: keyword 20 | - length: { tokens: 1 } 21 | - match: { tokens.0.token: foo bar russ } 22 | --- 23 | "Normalization charfilter": 24 | - do: 25 | indices.analyze: 26 | body: 27 | char_filter: [icu_normalizer] 28 | text: Foo Bar Ruß 29 | tokenizer: keyword 30 | - length: { tokens: 1 } 31 | - match: { tokens.0.token: foo bar russ } 32 | --- 33 | "Folding filter": 34 | - do: 35 | indices.analyze: 36 | body: 37 | filter: 
[icu_folding] 38 | text: Foo Bar résumé 39 | tokenizer: keyword 40 | - length: { tokens: 1 } 41 | - match: { tokens.0.token: foo bar resume } 42 | --- 43 | "Normalization with a UnicodeSet Filter": 44 | - do: 45 | indices.create: 46 | index: test 47 | body: 48 | settings: 49 | index: 50 | analysis: 51 | char_filter: 52 | charfilter_icu_normalizer: 53 | type: icu_normalizer 54 | unicode_set_filter: "[^ß]" 55 | filter: 56 | tokenfilter_icu_normalizer: 57 | type: icu_normalizer 58 | unicode_set_filter: "[^ßB]" 59 | tokenfilter_icu_folding: 60 | type: icu_folding 61 | unicode_set_filter: "[^â]" 62 | - do: 63 | indices.analyze: 64 | index: test 65 | body: 66 | char_filter: ["charfilter_icu_normalizer"] 67 | tokenizer: keyword 68 | text: charfilter Föo Bâr Ruß 69 | - length: { tokens: 1 } 70 | - match: { tokens.0.token: charfilter föo bâr ruß } 71 | - do: 72 | indices.analyze: 73 | index: test 74 | body: 75 | tokenizer: keyword 76 | filter: ["tokenfilter_icu_normalizer"] 77 | text: tokenfilter Föo Bâr Ruß 78 | - length: { tokens: 1 } 79 | - match: { tokens.0.token: tokenfilter föo Bâr ruß } 80 | - do: 81 | indices.analyze: 82 | index: test 83 | body: 84 | tokenizer: keyword 85 | filter: ["tokenfilter_icu_folding"] 86 | text: icufolding Föo Bâr Ruß 87 | - length: { tokens: 1 } 88 | - match: { tokens.0.token: icufolding foo bâr russ } 89 | -------------------------------------------------------------------------------- /src/test/resources/rest-api-spec/test/analysis_icu/20_search.yml: -------------------------------------------------------------------------------- 1 | # Integration tests for ICU analysis component 2 | # 3 | --- 4 | "Index ICU content": 5 | - do: 6 | indices.create: 7 | index: test 8 | body: 9 | settings: 10 | index: 11 | analysis: 12 | analyzer: 13 | my_analyzer: 14 | type: icu_collation 15 | filter: ["standard", "lowercase"] 16 | language: en 17 | strength: primary 18 | mappings: 19 | type: 20 | properties: 21 | text: 22 | type: text 23 | analyzer: my_analyzer 24 | 25 | - do: 26 | index: 27 | index: test 28 | type: type 29 | id: 1 30 | body: { "text": "Bâton enflammé" } 31 | - do: 32 | indices.refresh: {} 33 | 34 | - do: 35 | search: 36 | index: test 37 | body: 38 | query: 39 | match: 40 | text: baton enflamme 41 | - match: { hits.total: 1 } 42 | --------------------------------------------------------------------------------
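
The last YAML test above indexes "Bâton enflammé" with a primary-strength icu_collation analyzer and expects the query "baton enflamme" to match. The equivalence it relies on can be checked with ICU4J directly; a standalone sketch with a hypothetical class name:

import com.ibm.icu.text.Collator;
import com.ibm.icu.util.ULocale;

public class PrimaryStrengthExample { // hypothetical example class, not in this repository
    public static void main(String[] args) {
        // Primary strength ignores case and diacritic differences, so the
        // two strings below compare as equal (compare() returns 0).
        Collator collator = Collator.getInstance(new ULocale("en"));
        collator.setStrength(Collator.PRIMARY);
        System.out.println(collator.compare("Bâton enflammé", "baton enflamme"));
    }
}
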