├── .gitignore ├── .travis.yml ├── CREDITS.txt ├── LICENSE.txt ├── NOTICE.txt ├── README.adoc ├── bin └── langdetect.sh ├── build.gradle ├── config └── checkstyle │ └── checkstyle.xml ├── gradle.properties ├── gradle ├── ext.gradle ├── publish.gradle ├── sourcequality.gradle └── wrapper │ ├── gradle-wrapper.jar │ └── gradle-wrapper.properties ├── gradlew ├── gradlew.bat ├── licenses ├── icu4j-62.1.jar.sha1 ├── icu4j-LICENSE.txt ├── icu4j-NOTICE.txt ├── standardnumber-1.0.1.jar.sha1 ├── standardnumber-LICENSE.txt └── standardnumber-NOTICE.txt ├── settings.gradle └── src ├── docs └── asciidoc │ ├── css │ └── foundation.css │ ├── hyphen.adoc │ ├── icu.adoc │ ├── langdetect.adoc │ └── standardnumbers.adoc ├── main ├── java │ └── org │ │ └── xbib │ │ └── elasticsearch │ │ └── plugin │ │ └── bundle │ │ ├── BundlePlugin.java │ │ ├── action │ │ ├── isbnformat │ │ │ ├── ISBNFormatAction.java │ │ │ ├── ISBNFormatRequest.java │ │ │ ├── ISBNFormatRequestBuilder.java │ │ │ ├── ISBNFormatResponse.java │ │ │ ├── TransportISBNFormatAction.java │ │ │ └── package-info.java │ │ └── langdetect │ │ │ ├── LangdetectAction.java │ │ │ ├── LangdetectRequest.java │ │ │ ├── LangdetectRequestBuilder.java │ │ │ ├── LangdetectResponse.java │ │ │ ├── TransportLangdetectAction.java │ │ │ └── package-info.java │ │ ├── common │ │ ├── decompound │ │ │ ├── fst │ │ │ │ ├── FstDecompounder.java │ │ │ │ └── package-info.java │ │ │ └── patricia │ │ │ │ ├── CompactPatriciaTrie.java │ │ │ │ ├── Decompounder.java │ │ │ │ ├── LFUCache.java │ │ │ │ ├── Node.java │ │ │ │ └── package-info.java │ │ ├── fsa │ │ │ ├── ConstantArcSizeFSA.java │ │ │ ├── Dictionary.java │ │ │ ├── FSA.java │ │ │ ├── FSABuilder.java │ │ │ ├── FSAFinalStatesIterator.java │ │ │ ├── FSAFlags.java │ │ │ ├── FSATraversal.java │ │ │ ├── MatchResult.java │ │ │ └── StateVisitor.java │ │ ├── fst │ │ │ └── FstCompiler.java │ │ ├── langdetect │ │ │ ├── LangProfile.java │ │ │ ├── LangdetectService.java │ │ │ ├── Language.java │ │ │ ├── LanguageDetectionException.java │ │ │ ├── NGram.java │ │ │ └── package-info.java │ │ ├── reference │ │ │ ├── ReferenceService.java │ │ │ └── package-info.java │ │ └── standardnumber │ │ │ ├── StandardnumberService.java │ │ │ └── package-info.java │ │ ├── index │ │ ├── analysis │ │ │ ├── autophrase │ │ │ │ ├── AutoPhrasingTokenFilter.java │ │ │ │ ├── AutoPhrasingTokenFilterFactory.java │ │ │ │ └── package-info.java │ │ │ ├── baseform │ │ │ │ ├── BaseformTokenFilter.java │ │ │ │ ├── BaseformTokenFilterFactory.java │ │ │ │ └── package-info.java │ │ │ ├── concat │ │ │ │ ├── ConcatTokenFilter.java │ │ │ │ ├── ConcatTokenFilterFactory.java │ │ │ │ ├── PairTokenFilter.java │ │ │ │ ├── PairTokenFilterFactory.java │ │ │ │ └── package-info.java │ │ │ ├── decompound │ │ │ │ ├── fst │ │ │ │ │ ├── FstDecompoundTokenFilter.java │ │ │ │ │ ├── FstDecompoundTokenFilterFactory.java │ │ │ │ │ └── package-info.java │ │ │ │ └── patricia │ │ │ │ │ ├── DecompoundTokenFilter.java │ │ │ │ │ ├── DecompoundTokenFilterFactory.java │ │ │ │ │ └── package-info.java │ │ │ ├── german │ │ │ │ ├── GermanNormalizationFilterFactory.java │ │ │ │ └── package-info.java │ │ │ ├── hyphen │ │ │ │ ├── HyphenAnalyzer.java │ │ │ │ ├── HyphenAnalyzerProvider.java │ │ │ │ ├── HyphenTokenFilter.java │ │ │ │ ├── HyphenTokenFilterFactory.java │ │ │ │ ├── HyphenTokenizer.java │ │ │ │ ├── HyphenTokenizerFactory.java │ │ │ │ └── package-info.java │ │ │ ├── icu │ │ │ │ ├── IcuCollationAttributeFactory.java │ │ │ │ ├── IcuCollationKeyAnalyzer.java │ │ │ │ ├── IcuCollationKeyAnalyzerProvider.java 
│ │ │ │ ├── IcuCollationTokenizerFactory.java │ │ │ │ ├── IcuFoldingCharFilterFactory.java │ │ │ │ ├── IcuFoldingTokenFilterFactory.java │ │ │ │ ├── IcuNormalizerCharFilter.java │ │ │ │ ├── IcuNormalizerCharFilterFactory.java │ │ │ │ ├── IcuNormalizerFilter.java │ │ │ │ ├── IcuNormalizerTokenFilterFactory.java │ │ │ │ ├── IcuNumberFormatTokenFilter.java │ │ │ │ ├── IcuNumberFormatTokenFilterFactory.java │ │ │ │ ├── IcuTransformTokenFilter.java │ │ │ │ ├── IcuTransformTokenFilterFactory.java │ │ │ │ ├── IndexableBinaryStringTools.java │ │ │ │ ├── package-info.java │ │ │ │ ├── segmentation │ │ │ │ │ ├── BreakIteratorWrapper.java │ │ │ │ │ ├── CharArrayIterator.java │ │ │ │ │ ├── CompositeBreakIterator.java │ │ │ │ │ ├── DefaultIcuTokenizerConfig.java │ │ │ │ │ ├── IcuTokenizer.java │ │ │ │ │ ├── IcuTokenizerConfig.java │ │ │ │ │ ├── IcuTokenizerFactory.java │ │ │ │ │ ├── ScriptIterator.java │ │ │ │ │ └── package-info.java │ │ │ │ ├── tokenattributes │ │ │ │ │ ├── ScriptAttribute.java │ │ │ │ │ ├── ScriptAttributeImpl.java │ │ │ │ │ └── package-info.java │ │ │ │ └── tools │ │ │ │ │ ├── RBBIRuleCompiler.java │ │ │ │ │ ├── UTR30DataFileGenerator.java │ │ │ │ │ └── package-info.java │ │ │ ├── lemmatize │ │ │ │ ├── LemmatizeTokenFilter.java │ │ │ │ └── LemmatizeTokenFilterFactory.java │ │ │ ├── naturalsort │ │ │ │ ├── NaturalSortKeyAnalyzer.java │ │ │ │ ├── NaturalSortKeyAnalyzerProvider.java │ │ │ │ ├── NaturalSortKeyAttributeFactory.java │ │ │ │ ├── NaturalSortKeyAttributeImpl.java │ │ │ │ ├── NaturalSortKeyTokenizerFactory.java │ │ │ │ └── package-info.java │ │ │ ├── sortform │ │ │ │ ├── SortformAnalyzerProvider.java │ │ │ │ ├── SortformTokenFilter.java │ │ │ │ └── SortformTokenFilterFactory.java │ │ │ ├── standardnumber │ │ │ │ ├── StandardnumberAnalyzer.java │ │ │ │ ├── StandardnumberAnalyzerProvider.java │ │ │ │ ├── StandardnumberTokenFilter.java │ │ │ │ └── StandardnumberTokenFilterFactory.java │ │ │ ├── symbolname │ │ │ │ ├── SymbolnameTokenFilter.java │ │ │ │ ├── SymbolnameTokenFilterFactory.java │ │ │ │ └── package-info.java │ │ │ ├── worddelimiter │ │ │ │ ├── WordDelimiterFilter.java │ │ │ │ ├── WordDelimiterFilter2.java │ │ │ │ ├── WordDelimiterFilter2Factory.java │ │ │ │ ├── WordDelimiterFilterFactory.java │ │ │ │ ├── WordDelimiterFlags.java │ │ │ │ ├── WordDelimiterIterator.java │ │ │ │ └── package-info.java │ │ │ └── year │ │ │ │ ├── GregorianYearTokenFilter.java │ │ │ │ └── GregorianYearTokenFilterFactory.java │ │ └── mapper │ │ │ ├── icu │ │ │ └── IcuCollationKeyFieldMapper.java │ │ │ ├── langdetect │ │ │ └── LangdetectMapper.java │ │ │ ├── reference │ │ │ ├── ReferenceMapper.java │ │ │ ├── ReferenceMapperModule.java │ │ │ └── ReferenceMapperTypeParser.java │ │ │ └── standardnumber │ │ │ ├── StandardnumberMapper.java │ │ │ ├── StandardnumberMapperModule.java │ │ │ └── StandardnumberMapperTypeParser.java │ │ ├── package-info.java │ │ ├── query │ │ └── decompound │ │ │ ├── CustomSpanPayloadCheckQuery.java │ │ │ ├── ExactPhraseQueryBuilder.java │ │ │ └── QueryTransformer.java │ │ └── rest │ │ └── action │ │ ├── isbnformat │ │ ├── RestISBNFormatterAction.java │ │ └── package-info.java │ │ └── langdetect │ │ └── RestLangdetectAction.java ├── jflex │ └── HyphenTokenizer.jflex ├── plugin-metadata │ └── plugin-security.policy └── resources │ └── org │ └── xbib │ └── elasticsearch │ └── plugin │ └── bundle │ ├── common │ └── langdetect │ │ ├── af │ │ ├── ar │ │ ├── bg │ │ ├── bn │ │ ├── cs │ │ ├── da │ │ ├── de │ │ ├── el │ │ ├── en │ │ ├── es │ │ ├── et │ │ ├── fa │ │ ├── fi │ │ ├── fr │ 
│ ├── gu │ │ ├── he │ │ ├── hi │ │ ├── hr │ │ ├── hu │ │ ├── id │ │ ├── it │ │ ├── ja │ │ ├── kn │ │ ├── ko │ │ ├── language.json │ │ ├── lt │ │ ├── lv │ │ ├── mk │ │ ├── ml │ │ ├── mr │ │ ├── ne │ │ ├── nl │ │ ├── no │ │ ├── pa │ │ ├── pl │ │ ├── pt │ │ ├── ro │ │ ├── ru │ │ ├── shorttext │ │ ├── bg │ │ ├── bn │ │ ├── cs │ │ ├── da │ │ ├── de │ │ ├── en │ │ ├── es │ │ ├── fa │ │ ├── fi │ │ ├── fr │ │ ├── gu │ │ ├── hi │ │ ├── hr │ │ ├── hu │ │ ├── id │ │ ├── it │ │ ├── lt │ │ ├── lv │ │ ├── mk │ │ ├── nl │ │ ├── no │ │ ├── pa │ │ ├── pl │ │ ├── pt │ │ ├── ro │ │ ├── sv │ │ ├── ta │ │ ├── te │ │ ├── tr │ │ ├── uk │ │ ├── ur │ │ └── vi │ │ ├── sk │ │ ├── sl │ │ ├── so │ │ ├── sq │ │ ├── sv │ │ ├── sw │ │ ├── ta │ │ ├── te │ │ ├── th │ │ ├── tl │ │ ├── tr │ │ ├── uk │ │ ├── ur │ │ ├── vi │ │ ├── zh-cn │ │ └── zh-tw │ ├── icu │ ├── KeywordTokenizer.brk │ ├── Latin-break-only-on-whitespace.brk │ ├── Latin-dont-break-on-hyphens.brk │ └── folding │ │ ├── BasicFoldings.txt │ │ ├── DiacriticFolding.txt │ │ ├── DingbatFolding.txt │ │ ├── HanRadicalFolding.txt │ │ ├── NativeDigitFolding.txt │ │ ├── nfc.txt │ │ ├── nfkc.txt │ │ └── nfkc_cf.txt │ └── index │ └── analysis │ ├── baseform │ ├── de-lemma-utf8.txt │ └── en-lemma-utf8.txt │ ├── decompound │ ├── fst │ │ └── words.fst │ └── patricia │ │ ├── grfExt.tree │ │ ├── kompVHic.tree │ │ └── kompVVic.tree │ └── icu │ ├── segmentation │ ├── Default.brk │ ├── Default.rbbi │ ├── KeywordTokenizer.rbbi │ ├── Latin-break-only-on-whitespace.rbbi │ ├── Latin-dont-break-on-hyphens.rbbi │ ├── MyanmarSyllable.brk │ └── MyanmarSyllable.rbbi │ └── utr30.nrm └── test ├── java └── org │ └── xbib │ └── elasticsearch │ └── plugin │ └── bundle │ └── test │ ├── MultiMap.java │ ├── TreeMultiMap.java │ ├── common │ └── decompound │ │ └── patricia │ │ ├── DecompounderTest.java │ │ └── LFUCacheTest.java │ ├── index │ ├── analysis │ │ ├── autophrase │ │ │ └── AutoPhrasingTokenFilterTests.java │ │ ├── baseform │ │ │ ├── BaseformTokenFilterTests.java │ │ │ └── DictionaryTest.java │ │ ├── concat │ │ │ └── ConcatTokenFilterTests.java │ │ ├── decompound │ │ │ ├── fst │ │ │ │ └── FstDecompoundTokenFilterTests.java │ │ │ └── patricia │ │ │ │ └── DecompoundTokenFilterTests.java │ │ ├── german │ │ │ ├── GermanNormalizationTests.java │ │ │ └── UnstemmedGermanNormalizationTests.java │ │ ├── hyphen │ │ │ └── HyphenTokenizerTests.java │ │ ├── icu │ │ │ ├── IcuAnalysisTests.java │ │ │ ├── IcuClientYamlTestSuiteIT.java │ │ │ ├── IcuCollationAnalyzerTests.java │ │ │ ├── IcuCollationKeyAnalyzerTests.java │ │ │ ├── IcuFoldingFilterTests.java │ │ │ ├── IcuNormalizeCharTests.java │ │ │ ├── IcuNormalizerFilterTests.java │ │ │ ├── IcuNumberFormatTests.java │ │ │ ├── IcuTokenizerTests.java │ │ │ ├── IcuTransformFilterTests.java │ │ │ ├── segmentation │ │ │ │ ├── CJKBigramFilterTests.java │ │ │ │ ├── CharArrayIteratorTests.java │ │ │ │ ├── IcuTokenizerCJKTests.java │ │ │ │ ├── IcuTokenizerFactoryTests.java │ │ │ │ ├── MyanmarSyllableTests.java │ │ │ │ └── SegmentationIcuTokenizerTests.java │ │ │ └── tools │ │ │ │ ├── RBBIRuleCompilerTest.java │ │ │ │ └── UTR30DataFileGeneratorTest.java │ │ ├── lemmatize │ │ │ ├── LemmatizeSearchTests.java │ │ │ └── LemmatizeTokenFilterTests.java │ │ ├── naturalsort │ │ │ └── NaturalSortKeyTests.java │ │ ├── sortform │ │ │ └── SortFormTests.java │ │ ├── symbolname │ │ │ └── SymbolnameTokenFilterTests.java │ │ └── worddelimiter │ │ │ └── WordDelimiterFilter2Tests.java │ └── mapper │ │ ├── langdetect │ │ ├── DetectLanguageTests.java │ │ ├── DetectorTests.java │ │ ├── 
LangDetectActionTests.java │ │ ├── LangDetectBinaryTests.java │ │ ├── LangDetectChineseTests.java │ │ ├── LangDetectGermanTests.java │ │ ├── LangProfileTests.java │ │ ├── LangdetectMappingTests.java │ │ ├── LanguageTests.java │ │ ├── NGramTests.java │ │ └── SimpleDetectorTests.java │ │ ├── reference │ │ ├── GNDReferenceMappingTests.java │ │ ├── ReferenceMappingTests.java │ │ └── SimpleReferenceMappingTests.java │ │ └── standardnumber │ │ └── StandardnumberMappingTests.java │ └── query │ └── decompound │ └── DecompoundQueryTests.java └── resources ├── log4j2.xml ├── org └── xbib │ └── elasticsearch │ └── plugin │ └── bundle │ └── test │ ├── index │ ├── analysis │ │ ├── concat │ │ │ └── concat_analysis.json │ │ ├── decompound │ │ │ ├── fst │ │ │ │ └── decompound_analysis.json │ │ │ └── patricia │ │ │ │ ├── decompound_analysis.json │ │ │ │ └── keywords_analysis.json │ │ ├── document.json │ │ ├── expansion │ │ │ └── expansion_analysis.json │ │ ├── german │ │ │ ├── german_normalization_analysis.json │ │ │ └── unstemmed.json │ │ ├── hyphen │ │ │ ├── custom_hyphen_tokenizer.json │ │ │ ├── hyphen_analyzer.json │ │ │ ├── hyphen_tokenizer.json │ │ │ └── hyphen_tokenizer_without_subwords.json │ │ ├── icu │ │ │ ├── icu_collation.json │ │ │ ├── icu_folding.json │ │ │ ├── icu_normalize.json │ │ │ ├── icu_numberformat.json │ │ │ ├── icu_tokenizer.json │ │ │ └── icu_transform.json │ │ ├── mapping.json │ │ ├── settings.json │ │ ├── sortform │ │ │ └── sortform.json │ │ └── worddelimiter │ │ │ └── worddelimiter.json │ └── mapper │ │ ├── langdetect │ │ ├── base64-2-decoded.txt │ │ ├── base64-2-mapping.json │ │ ├── base64-2.txt │ │ ├── base64-decoded.txt │ │ ├── base64-mapping.json │ │ ├── base64.txt │ │ ├── chinese.txt │ │ ├── english.txt │ │ ├── german.txt │ │ ├── japanese.txt │ │ ├── korean.txt │ │ ├── mapping-to-fields.json │ │ ├── mapping.json │ │ ├── settings.json │ │ ├── short-text-mapping.json │ │ └── simple-mapping.json │ │ ├── reference │ │ ├── doc-simple-document.json │ │ ├── doc-simple-mapping.json │ │ ├── doc-simple-settings.json │ │ ├── gnd-document.json │ │ ├── gnd-mapping.json │ │ ├── gnd-settings.json │ │ ├── ref-doc-book.json │ │ ├── ref-mapping-authorities.json │ │ ├── ref-mapping-books-test.json │ │ ├── ref-mapping-from-id.json │ │ ├── ref-mapping-nested.json │ │ ├── ref-mapping.json │ │ ├── ref-simple-document.json │ │ ├── ref-simple-mapping.json │ │ ├── ref-simple-settings.json │ │ ├── title-document-1.json │ │ ├── title-document-2.json │ │ ├── title-mapping.json │ │ └── title-settings.json │ │ └── standardnumber │ │ └── mapping.json │ └── query │ └── decompound │ └── decompound_query.json └── rest-api-spec └── test └── analysis_icu ├── 10_basic.yml └── 20_search.yml /.gitignore: -------------------------------------------------------------------------------- 1 | /data 2 | /work 3 | /logs 4 | /.idea 5 | /target 6 | .DS_Store 7 | *.iml 8 | /.settings 9 | /.classpath 10 | /.project 11 | /.gradle 12 | /build 13 | /plugins 14 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | language: java 3 | jdk: 4 | - oraclejdk9 5 | 6 | cache: 7 | directories: 8 | - $HOME/.m2 9 | -------------------------------------------------------------------------------- /CREDITS.txt: -------------------------------------------------------------------------------- 1 | The plugin bundle wouldn't be possible without the hard work of many authors 2 | who generously published 
their work under an open source license. 3 | 4 | This file should contain all the credits to them. If you miss a credit, please 5 | notify me about it and it will be added as soon as possible. 6 | 7 | The ICU analysis is heavily based on Apache Lucene ICU 8 | 9 | https://github.com/apache/lucene-solr/tree/master/lucene/analysis/icu 10 | 11 | The AutoPhrasingTokenFilter is derived from 12 | 13 | https://github.com/lucidworks/auto-phrase-tokenfilter 14 | 15 | The ConcatTokenFilter is authored by Sujit Pal and was taken from 16 | 17 | http://sujitpal.blogspot.de/2011/07/lucene-token-concatenating-tokenfilter_30.html 18 | 19 | The Decompound token filter is a reworked implementation of the 20 | link:http://wortschatz.uni-leipzig.de/~cbiemann/software/toolbox/Baseforms%20Tool.htm[Baseforms Tool] 21 | found in the http://wortschatz.uni-leipzig.de/~cbiemann/software/toolbox/index.htm[ASV toolbox] 22 | of http://asv.informatik.uni-leipzig.de/staff/Chris_Biemann[Chris Biemann], 23 | Automatische Sprachverarbeitung of Leipzig University. 24 | 25 | The FSA in package org.xbib.elasticsearch.common.fsa which provides the dictionary structure for 26 | the baseform tokenizer is a derived version of 27 | 28 | https://github.com/morfologik/morfologik-stemming/tree/master/morfologik-fsa/src/main/java/morfologik/fsa 29 | 30 | Thanks to GBI-Genios Deutsche Wirtschaftsdatenbank GmbH for adding the caching functionality and the "Exact phrase matches". 31 | The implementation of an exact phrase match query can ignore/skip decompounded tokens while matching phrases. 32 | The LFU cache for the Patricia Decompounder was inspired by the use of a ConcurrentHashMap cache 33 | in the original pull request: https://github.com/jprante/elasticsearch-analysis-decompound/pull/54/ 34 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/NOTICE.txt -------------------------------------------------------------------------------- /bin/langdetect.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | curl -XDELETE 'localhost:9200/test' 4 | 5 | curl -XPUT 'localhost:9200/test' 6 | 7 | curl -XPOST 'localhost:9200/test/article/_mapping' -d ' 8 | { 9 | "article" : { 10 | "properties" : { 11 | "content" : { "type" : "langdetect" } 12 | } 13 | } 14 | } 15 | ' 16 | 17 | curl -XPUT 'localhost:9200/test/article/1' -d ' 18 | { 19 | "title" : "Some title", 20 | "content" : "Oh, say can you see by the dawn`s early light, What so proudly we hailed at the twilight`s last gleaming?" 21 | } 22 | ' 23 | 24 | curl -XPUT 'localhost:9200/test/article/2' -d ' 25 | { 26 | "title" : "Ein Titel", 27 | "content" : "Einigkeit und Recht und Freiheit für das deutsche Vaterland!" 28 | } 29 | ' 30 | 31 | curl -XPUT 'localhost:9200/test/article/3' -d ' 32 | { 33 | "title" : "Un titre", 34 | "content" : "Allons enfants de la Patrie, Le jour de gloire est arrivé!"
35 | } 36 | ' 37 | 38 | curl -XGET 'localhost:9200/test/_refresh' 39 | 40 | curl -XPOST 'localhost:9200/test/article/_search' -d ' 41 | { 42 | "query" : { 43 | "term" : { 44 | "content" : "eng" 45 | } 46 | } 47 | } 48 | ' 49 | curl -XPOST 'localhost:9200/test/_search' -d ' 50 | { 51 | "query" : { 52 | "term" : { 53 | "content" : "ger" 54 | } 55 | } 56 | } 57 | ' 58 | 59 | curl -XPOST 'localhost:9200/test/_search' -d ' 60 | { 61 | "query" : { 62 | "term" : { 63 | "content" : "fre" 64 | } 65 | } 66 | } 67 | ' 68 | -------------------------------------------------------------------------------- /gradle.properties: -------------------------------------------------------------------------------- 1 | group = org.xbib.elasticsearch.plugin 2 | name = elasticsearch-plugin-bundle 3 | version = 6.3.2.3 4 | 5 | elasticsearch.version = 6.3.2 6 | lucene.version = 7.3.1 7 | 8 | icu4j.version = 62.1 9 | log4j.version = 2.11.0 10 | jackson.version = 2.8.11 11 | standardnumber.version = 1.0.1 12 | junit.version = 4.12 13 | wagon.version = 3.0.0 14 | spatial4j.version = 0.7 15 | jts.version = 1.15.1 16 | jna.version = 4.5.1 17 | checkstyle.version = 8.13 18 | 19 | org.gradle.warning.mode = all -------------------------------------------------------------------------------- /gradle/ext.gradle: -------------------------------------------------------------------------------- 1 | ext { 2 | pluginName = 'bundle' 3 | pluginClassname = 'org.xbib.elasticsearch.plugin.bundle.BundlePlugin' 4 | pluginDescription = 'A bundle of plugins for Elasticsearch' 5 | user = 'jprante' 6 | name = 'elasticsearch-plugin-bundle' 7 | scmUrl = 'https://github.com/' + user + '/' + name 8 | scmConnection = 'scm:git:git://github.com/' + user + '/' + name + '.git' 9 | scmDeveloperConnection = 'scm:git:git://github.com/' + user + '/' + name + '.git' 10 | } 11 | -------------------------------------------------------------------------------- /gradle/publish.gradle: -------------------------------------------------------------------------------- 1 | 2 | task xbibUpload(type: Upload) { 3 | group = 'publish' 4 | configuration = configurations.archives 5 | uploadDescriptor = true 6 | repositories { 7 | if (project.hasProperty('xbibUsername')) { 8 | mavenDeployer { 9 | configuration = configurations.wagon 10 | repository(url: uri(project.property('xbibUrl'))) { 11 | authentication(userName: xbibUsername, privateKey: xbibPrivateKey) 12 | } 13 | } 14 | } 15 | } 16 | } 17 | 18 | task sonatypeUpload(type: Upload) { 19 | group = 'publish' 20 | configuration = configurations.archives 21 | uploadDescriptor = true 22 | repositories { 23 | if (project.hasProperty('ossrhUsername')) { 24 | mavenDeployer { 25 | beforeDeployment { MavenDeployment deployment -> signing.signPom(deployment) } 26 | repository(url: uri(ossrhReleaseUrl)) { 27 | authentication(userName: ossrhUsername, password: ossrhPassword) 28 | } 29 | snapshotRepository(url: uri(ossrhSnapshotUrl)) { 30 | authentication(userName: ossrhUsername, password: ossrhPassword) 31 | } 32 | pom.project { 33 | groupId project.group 34 | artifactId project.name 35 | version project.version 36 | name project.name 37 | description pluginDescription 38 | packaging 'jar' 39 | inceptionYear '2012' 40 | url scmUrl 41 | organization { 42 | name 'xbib' 43 | url 'http://xbib.org' 44 | } 45 | developers { 46 | developer { 47 | id user 48 | name 'Jörg Prante' 49 | email 'joergprante@gmail.com' 50 | url 'https://github.com/jprante' 51 | } 52 | } 53 | scm { 54 | url scmUrl 55 | connection scmConnection 56 | 
developerConnection scmDeveloperConnection 57 | } 58 | licenses { 59 | license { 60 | name 'Affero GNU Public License Version 3' 61 | url 'http://www.gnu.org/licenses/agpl-3.0.html' 62 | } 63 | } 64 | } 65 | } 66 | } 67 | } 68 | } 69 | 70 | nexusStaging { 71 | packageGroup = "org.xbib" 72 | } 73 | -------------------------------------------------------------------------------- /gradle/sourcequality.gradle: -------------------------------------------------------------------------------- 1 | 2 | sonarqube { 3 | properties { 4 | property "sonar.projectName", "${project.group} ${project.name}" 5 | property "sonar.sourceEncoding", "UTF-8" 6 | property "sonar.tests", "src/test/java" 7 | property "sonar.scm.provider", "git" 8 | property "sonar.java.coveragePlugin", "jacoco" 9 | property "sonar.junit.reportsPath", "build/test-results/test/" 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/gradle/wrapper/gradle-wrapper.jar -------------------------------------------------------------------------------- /gradle/wrapper/gradle-wrapper.properties: -------------------------------------------------------------------------------- 1 | #Fri Mar 15 22:26:04 CET 2019 2 | distributionBase=GRADLE_USER_HOME 3 | distributionPath=wrapper/dists 4 | zipStoreBase=GRADLE_USER_HOME 5 | zipStorePath=wrapper/dists 6 | distributionUrl=https\://services.gradle.org/distributions/gradle-4.10.3-all.zip 7 | -------------------------------------------------------------------------------- /gradlew.bat: -------------------------------------------------------------------------------- 1 | @if "%DEBUG%" == "" @echo off 2 | @rem ########################################################################## 3 | @rem 4 | @rem Gradle startup script for Windows 5 | @rem 6 | @rem ########################################################################## 7 | 8 | @rem Set local scope for the variables with windows NT shell 9 | if "%OS%"=="Windows_NT" setlocal 10 | 11 | set DIRNAME=%~dp0 12 | if "%DIRNAME%" == "" set DIRNAME=. 13 | set APP_BASE_NAME=%~n0 14 | set APP_HOME=%DIRNAME% 15 | 16 | @rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 17 | set DEFAULT_JVM_OPTS="-Xmx64m" 18 | 19 | @rem Find java.exe 20 | if defined JAVA_HOME goto findJavaFromJavaHome 21 | 22 | set JAVA_EXE=java.exe 23 | %JAVA_EXE% -version >NUL 2>&1 24 | if "%ERRORLEVEL%" == "0" goto init 25 | 26 | echo. 27 | echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 28 | echo. 29 | echo Please set the JAVA_HOME variable in your environment to match the 30 | echo location of your Java installation. 31 | 32 | goto fail 33 | 34 | :findJavaFromJavaHome 35 | set JAVA_HOME=%JAVA_HOME:"=% 36 | set JAVA_EXE=%JAVA_HOME%/bin/java.exe 37 | 38 | if exist "%JAVA_EXE%" goto init 39 | 40 | echo. 41 | echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 42 | echo. 43 | echo Please set the JAVA_HOME variable in your environment to match the 44 | echo location of your Java installation. 45 | 46 | goto fail 47 | 48 | :init 49 | @rem Get command-line arguments, handling Windows variants 50 | 51 | if not "%OS%" == "Windows_NT" goto win9xME_args 52 | 53 | :win9xME_args 54 | @rem Slurp the command line arguments. 
55 | set CMD_LINE_ARGS= 56 | set _SKIP=2 57 | 58 | :win9xME_args_slurp 59 | if "x%~1" == "x" goto execute 60 | 61 | set CMD_LINE_ARGS=%* 62 | 63 | :execute 64 | @rem Setup the command line 65 | 66 | set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar 67 | 68 | @rem Execute Gradle 69 | "%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% 70 | 71 | :end 72 | @rem End local scope for the variables with windows NT shell 73 | if "%ERRORLEVEL%"=="0" goto mainEnd 74 | 75 | :fail 76 | rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of 77 | rem the _cmd.exe /c_ return code! 78 | if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 79 | exit /b 1 80 | 81 | :mainEnd 82 | if "%OS%"=="Windows_NT" endlocal 83 | 84 | :omega 85 | -------------------------------------------------------------------------------- /licenses/icu4j-62.1.jar.sha1: -------------------------------------------------------------------------------- 1 | 7a4d00d5ec5febd252a6182e8b6e87a0a9821f81 -------------------------------------------------------------------------------- /licenses/icu4j-LICENSE.txt: -------------------------------------------------------------------------------- 1 | UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE 2 | 3 | Unicode Data Files include all data files under the directories 4 | http://www.unicode.org/Public/, http://www.unicode.org/reports/, 5 | http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and 6 | http://www.unicode.org/utility/trac/browser/. 7 | 8 | Unicode Data Files do not include PDF online code charts under the 9 | directory http://www.unicode.org/Public/. 10 | 11 | Software includes any source code published in the Unicode Standard 12 | or under the directories 13 | http://www.unicode.org/Public/, http://www.unicode.org/reports/, 14 | http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and 15 | http://www.unicode.org/utility/trac/browser/. 16 | 17 | NOTICE TO USER: Carefully read the following legal agreement. 18 | BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S 19 | DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), 20 | YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE 21 | TERMS AND CONDITIONS OF THIS AGREEMENT. 22 | IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE 23 | THE DATA FILES OR SOFTWARE. 24 | 25 | COPYRIGHT AND PERMISSION NOTICE 26 | 27 | Copyright © 1991-2016 Unicode, Inc. All rights reserved. 28 | Distributed under the Terms of Use in http://www.unicode.org/copyright.html. 29 | 30 | Permission is hereby granted, free of charge, to any person obtaining 31 | a copy of the Unicode data files and any associated documentation 32 | (the "Data Files") or Unicode software and any associated documentation 33 | (the "Software") to deal in the Data Files or Software 34 | without restriction, including without limitation the rights to use, 35 | copy, modify, merge, publish, distribute, and/or sell copies of 36 | the Data Files or Software, and to permit persons to whom the Data Files 37 | or Software are furnished to do so, provided that either 38 | (a) this copyright and permission notice appear with all copies 39 | of the Data Files or Software, or 40 | (b) this copyright and permission notice appear in associated 41 | Documentation. 
42 | 43 | THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF 44 | ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE 45 | WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 46 | NONINFRINGEMENT OF THIRD PARTY RIGHTS. 47 | IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS 48 | NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL 49 | DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, 50 | DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER 51 | TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR 52 | PERFORMANCE OF THE DATA FILES OR SOFTWARE. 53 | 54 | Except as contained in this notice, the name of a copyright holder 55 | shall not be used in advertising or otherwise to promote the sale, 56 | use or other dealings in these Data Files or Software without prior 57 | written authorization of the copyright holder. -------------------------------------------------------------------------------- /licenses/icu4j-NOTICE.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/licenses/icu4j-NOTICE.txt -------------------------------------------------------------------------------- /licenses/standardnumber-1.0.1.jar.sha1: -------------------------------------------------------------------------------- 1 | 9d1cf31cbc87cc9cdfd505fd30d3598da4eee700 -------------------------------------------------------------------------------- /licenses/standardnumber-NOTICE.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/licenses/standardnumber-NOTICE.txt -------------------------------------------------------------------------------- /settings.gradle: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/settings.gradle -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/action/isbnformat/ISBNFormatAction.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.action.isbnformat; 2 | 3 | import org.elasticsearch.action.Action; 4 | import org.elasticsearch.client.ElasticsearchClient; 5 | 6 | /** 7 | * ISBN format action. 
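* 
* A hedged usage sketch, not taken from the original sources: it shows how a caller
* could invoke this action through its request builder, assuming an
* {@code ElasticsearchClient} instance named {@code client}.
* <pre>
* ISBNFormatResponse response = ISBNFormatAction.INSTANCE
*         .newRequestBuilder(client)
*         .setValue("978-3-16-148410-0")   // any ISBN-like value to format
*         .execute()
*         .actionGet();
* </pre>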
8 | */ 9 | public class ISBNFormatAction extends Action<ISBNFormatRequest, ISBNFormatResponse, ISBNFormatRequestBuilder> { 10 | 11 | public static final String NAME = "isbnformat"; 12 | 13 | public static final ISBNFormatAction INSTANCE = new ISBNFormatAction(); 14 | 15 | private ISBNFormatAction() { 16 | super(NAME); 17 | } 18 | 19 | @Override 20 | public ISBNFormatRequestBuilder newRequestBuilder(ElasticsearchClient client) { 21 | return new ISBNFormatRequestBuilder(client); 22 | } 23 | 24 | @Override 25 | public ISBNFormatResponse newResponse() { 26 | return new ISBNFormatResponse(); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/action/isbnformat/ISBNFormatRequest.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.action.isbnformat; 2 | 3 | import org.elasticsearch.action.ActionRequest; 4 | import org.elasticsearch.action.ActionRequestValidationException; 5 | import org.elasticsearch.common.io.stream.StreamInput; 6 | import org.elasticsearch.common.io.stream.StreamOutput; 7 | 8 | import java.io.IOException; 9 | 10 | import static org.elasticsearch.action.ValidateActions.addValidationError; 11 | 12 | /** 13 | * ISBN format request. 14 | */ 15 | public class ISBNFormatRequest extends ActionRequest { 16 | 17 | private String value; 18 | 19 | @Override 20 | public ActionRequestValidationException validate() { 21 | ActionRequestValidationException validationException = null; 22 | if (value == null) { 23 | validationException = addValidationError("value is missing", null); 24 | } 25 | return validationException; 26 | } 27 | 28 | public String getValue() { 29 | return value; 30 | } 31 | 32 | public ISBNFormatRequest setValue(String value) { 33 | this.value = value; 34 | return this; 35 | } 36 | 37 | @Override 38 | public void readFrom(StreamInput in) throws IOException { 39 | super.readFrom(in); 40 | value = in.readString(); 41 | } 42 | 43 | @Override 44 | public void writeTo(StreamOutput out) throws IOException { 45 | super.writeTo(out); 46 | out.writeString(value); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/action/isbnformat/ISBNFormatRequestBuilder.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.action.isbnformat; 2 | 3 | import org.elasticsearch.action.ActionRequestBuilder; 4 | import org.elasticsearch.client.ElasticsearchClient; 5 | 6 | /** 7 | * ISBN format request builder.
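* 
* Illustrative sketch only, not from the original sources (assumes a {@code client}
* variable; {@code get()} is inherited from {@code ActionRequestBuilder} and blocks
* for the response):
* <pre>
* ISBNFormatResponse response = new ISBNFormatRequestBuilder(client)
*         .setValue("3-16-148410-X")
*         .get();
* </pre>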
8 | */ 9 | public class ISBNFormatRequestBuilder 10 | extends ActionRequestBuilder<ISBNFormatRequest, ISBNFormatResponse, ISBNFormatRequestBuilder> { 11 | 12 | public ISBNFormatRequestBuilder(ElasticsearchClient client) { 13 | super(client, ISBNFormatAction.INSTANCE, new ISBNFormatRequest()); 14 | } 15 | 16 | public ISBNFormatRequestBuilder setValue(String string) { 17 | request.setValue(string); 18 | return this; 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/action/isbnformat/ISBNFormatResponse.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.action.isbnformat; 2 | 3 | import org.elasticsearch.action.ActionResponse; 4 | import org.elasticsearch.common.xcontent.StatusToXContentObject; 5 | import org.elasticsearch.common.xcontent.ToXContent; 6 | import org.elasticsearch.common.xcontent.XContentBuilder; 7 | import org.elasticsearch.rest.RestStatus; 8 | 9 | import java.io.IOException; 10 | 11 | import static org.elasticsearch.rest.RestStatus.OK; 12 | 13 | /** 14 | * ISBN format response. 15 | */ 16 | public class ISBNFormatResponse extends ActionResponse implements StatusToXContentObject { 17 | 18 | private String isbn10; 19 | 20 | private String isbn10Formatted; 21 | 22 | private String isbn13; 23 | 24 | private String isbn13Formatted; 25 | 26 | private String invalid; 27 | 28 | public ISBNFormatResponse setIsbn10(String value) { 29 | this.isbn10 = value; 30 | return this; 31 | } 32 | 33 | public ISBNFormatResponse setIsbn10Formatted(String value) { 34 | this.isbn10Formatted = value; 35 | return this; 36 | } 37 | 38 | public ISBNFormatResponse setIsbn13(String value) { 39 | this.isbn13 = value; 40 | return this; 41 | } 42 | 43 | public ISBNFormatResponse setIsbn13Formatted(String value) { 44 | this.isbn13Formatted = value; 45 | return this; 46 | } 47 | 48 | public ISBNFormatResponse setInvalid(String value) { 49 | this.invalid = value; 50 | return this; 51 | } 52 | 53 | @Override 54 | public XContentBuilder toXContent(XContentBuilder builder, ToXContent.Params params) throws IOException { 55 | builder.startObject() 56 | .startObject("result") 57 | .field("isbn10", isbn10) 58 | .field("isbn10formatted", isbn10Formatted) 59 | .field("isbn13", isbn13) 60 | .field("isbn13formatted", isbn13Formatted) 61 | .field("invalid", invalid) 62 | .endObject() 63 | .endObject(); 64 | return builder; 65 | } 66 | 67 | @Override 68 | public RestStatus status() { 69 | return OK; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/action/isbnformat/TransportISBNFormatAction.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.action.isbnformat; 2 | 3 | import org.elasticsearch.action.ActionListener; 4 | import org.elasticsearch.action.support.ActionFilters; 5 | import org.elasticsearch.action.support.TransportAction; 6 | import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver; 7 | import org.elasticsearch.common.inject.Inject; 8 | import org.elasticsearch.common.settings.Settings; 9 | import org.elasticsearch.threadpool.ThreadPool; 10 | import org.elasticsearch.transport.TransportService; 11 | import org.xbib.elasticsearch.plugin.bundle.common.standardnumber.StandardnumberService; 12 | 13 | /** 14 | * Transport action for ISBN format action.
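* Hands the request value to the {@code StandardnumberService}; when the service rejects
* the value with an {@code IllegalArgumentException}, the response reports it as invalid
* instead of failing the request (see {@code doExecute} below).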
15 | */ 16 | public class TransportISBNFormatAction extends TransportAction<ISBNFormatRequest, ISBNFormatResponse> { 17 | 18 | private final StandardnumberService standardnumberService; 19 | 20 | @Inject 21 | public TransportISBNFormatAction(Settings settings, ThreadPool threadPool, 22 | ActionFilters actionFilters, 23 | IndexNameExpressionResolver indexNameExpressionResolver, 24 | TransportService transportService, 25 | StandardnumberService standardnumberService) { 26 | super(settings, ISBNFormatAction.NAME, threadPool, actionFilters, indexNameExpressionResolver, 27 | transportService.getTaskManager()); 28 | this.standardnumberService = standardnumberService; 29 | } 30 | 31 | @Override 32 | protected void doExecute(ISBNFormatRequest request, ActionListener<ISBNFormatResponse> listener) { 33 | ISBNFormatResponse response = new ISBNFormatResponse(); 34 | try { 35 | standardnumberService.handle(request.getValue(), response); 36 | } catch (IllegalArgumentException e) { 37 | logger.debug(e.getMessage(), e); 38 | response.setInvalid(request.getValue()); 39 | } 40 | listener.onResponse(response); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/action/isbnformat/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for ISBN formatter action. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.action.isbnformat; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/action/langdetect/LangdetectAction.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.action.langdetect; 2 | 3 | import org.elasticsearch.action.Action; 4 | import org.elasticsearch.client.ElasticsearchClient; 5 | 6 | /** 7 | * Language detection action. 8 | */ 9 | public class LangdetectAction extends Action<LangdetectRequest, LangdetectResponse, LangdetectRequestBuilder> { 10 | 11 | public static final String NAME = "langdetect"; 12 | 13 | public static final LangdetectAction INSTANCE = new LangdetectAction(); 14 | 15 | private LangdetectAction() { 16 | super(NAME); 17 | } 18 | 19 | @Override 20 | public LangdetectRequestBuilder newRequestBuilder(ElasticsearchClient client) { 21 | return new LangdetectRequestBuilder(client); 22 | } 23 | 24 | @Override 25 | public LangdetectResponse newResponse() { 26 | return new LangdetectResponse(); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/action/langdetect/LangdetectRequest.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.action.langdetect; 2 | 3 | import org.elasticsearch.action.ActionRequest; 4 | import org.elasticsearch.action.ActionRequestValidationException; 5 | import org.elasticsearch.common.io.stream.StreamInput; 6 | import org.elasticsearch.common.io.stream.StreamOutput; 7 | 8 | import java.io.IOException; 9 | 10 | import static org.elasticsearch.action.ValidateActions.addValidationError; 11 | 12 | /** 13 | * Language detection request.
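* 
* An illustrative sketch, not from the original sources; the "shorttext" profile name
* matches the bundled resource directory of that name.
* <pre>
* LangdetectRequest request = new LangdetectRequest()
*         .setText("Einigkeit und Recht und Freiheit")
*         .setProfile("shorttext");
* </pre>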
14 | */ 15 | public class LangdetectRequest extends ActionRequest { 16 | 17 | private String profile; 18 | 19 | private String text; 20 | 21 | @Override 22 | public ActionRequestValidationException validate() { 23 | ActionRequestValidationException validationException = null; 24 | if (text == null) { 25 | validationException = addValidationError("text is missing", null); 26 | } 27 | return validationException; 28 | } 29 | 30 | public String getProfile() { 31 | return profile; 32 | } 33 | 34 | public LangdetectRequest setProfile(String profile) { 35 | this.profile = profile; 36 | return this; 37 | } 38 | 39 | public String getText() { 40 | return text; 41 | } 42 | 43 | public LangdetectRequest setText(String text) { 44 | this.text = text; 45 | return this; 46 | } 47 | 48 | @Override 49 | public void readFrom(StreamInput in) throws IOException { 50 | super.readFrom(in); 51 | text = in.readString(); 52 | profile = in.readOptionalString(); 53 | } 54 | 55 | @Override 56 | public void writeTo(StreamOutput out) throws IOException { 57 | super.writeTo(out); 58 | out.writeString(text); 59 | out.writeOptionalString(profile); 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/action/langdetect/LangdetectRequestBuilder.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.action.langdetect; 2 | 3 | import org.elasticsearch.action.ActionRequestBuilder; 4 | import org.elasticsearch.client.ElasticsearchClient; 5 | 6 | /** 7 | * Language detection request builder. 8 | */ 9 | public class LangdetectRequestBuilder extends ActionRequestBuilder<LangdetectRequest, LangdetectResponse, LangdetectRequestBuilder> { 10 | 11 | public LangdetectRequestBuilder(ElasticsearchClient client) { 12 | super(client, LangdetectAction.INSTANCE, new LangdetectRequest()); 13 | } 14 | 15 | public LangdetectRequestBuilder setProfile(String string) { 16 | request.setProfile(string); 17 | return this; 18 | } 19 | 20 | public LangdetectRequestBuilder setText(String string) { 21 | request.setText(string); 22 | return this; 23 | } 24 | 25 | } 26 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/action/langdetect/LangdetectResponse.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.action.langdetect; 2 | 3 | import org.elasticsearch.action.ActionResponse; 4 | import org.elasticsearch.common.Strings; 5 | import org.elasticsearch.common.xcontent.StatusToXContentObject; 6 | import org.elasticsearch.common.xcontent.ToXContent; 7 | import org.elasticsearch.common.xcontent.XContentBuilder; 8 | import org.elasticsearch.rest.RestStatus; 9 | import org.xbib.elasticsearch.plugin.bundle.common.langdetect.Language; 10 | 11 | import java.io.IOException; 12 | import java.util.ArrayList; 13 | import java.util.List; 14 | 15 | import static org.elasticsearch.rest.RestStatus.OK; 16 | 17 | /** 18 | * Language detection response.
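* 
* Rendered through {@code toXContent}, the body of a response looks roughly like the
* following (values are invented for illustration):
* <pre>
* "profile" : "shorttext",
* "languages" : [ { "language" : "de", "probability" : 0.99 } ]
* </pre>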
19 | */ 20 | public class LangdetectResponse extends ActionResponse implements StatusToXContentObject { 21 | 22 | private String profile; 23 | 24 | private List<Language> languages = new ArrayList<>(); 25 | 26 | public String getProfile() { 27 | return profile; 28 | } 29 | 30 | public LangdetectResponse setProfile(String profile) { 31 | this.profile = profile; 32 | return this; 33 | } 34 | 35 | public List<Language> getLanguages() { 36 | return languages; 37 | } 38 | 39 | public LangdetectResponse setLanguages(List<Language> languages) { 40 | this.languages = languages; 41 | return this; 42 | } 43 | 44 | @Override 45 | public XContentBuilder toXContent(XContentBuilder builder, ToXContent.Params params) throws IOException { 46 | if (!Strings.isNullOrEmpty(profile)) { 47 | builder.field("profile", profile); 48 | } 49 | builder.startArray("languages"); 50 | for (Language lang : languages) { 51 | builder.startObject().field("language", lang.getLanguage()) 52 | .field("probability", lang.getProbability()).endObject(); 53 | } 54 | builder.endArray(); 55 | return builder; 56 | } 57 | 58 | @Override 59 | public RestStatus status() { 60 | return OK; 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/action/langdetect/TransportLangdetectAction.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.action.langdetect; 2 | 3 | import org.elasticsearch.action.ActionListener; 4 | import org.elasticsearch.action.support.ActionFilters; 5 | import org.elasticsearch.action.support.TransportAction; 6 | import org.elasticsearch.cluster.metadata.IndexNameExpressionResolver; 7 | import org.elasticsearch.common.inject.Inject; 8 | import org.elasticsearch.common.settings.Settings; 9 | import org.elasticsearch.threadpool.ThreadPool; 10 | import org.elasticsearch.transport.TransportService; 11 | import org.xbib.elasticsearch.plugin.bundle.common.langdetect.LangdetectService; 12 | import org.xbib.elasticsearch.plugin.bundle.common.langdetect.Language; 13 | 14 | import java.util.HashMap; 15 | import java.util.List; 16 | import java.util.Map; 17 | 18 | /** 19 | * Transport action for language detection.
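* Holds one {@code LangdetectService} per profile name in a static map; the default
* profile is registered at construction time and other profiles are created lazily on
* first use (see {@code doExecute} below).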
20 | */ 21 | public class TransportLangdetectAction extends TransportAction<LangdetectRequest, LangdetectResponse> { 22 | 23 | private static final Map<String, LangdetectService> services = new HashMap<>(); 24 | 25 | @Inject 26 | public TransportLangdetectAction(Settings settings, ThreadPool threadPool, 27 | ActionFilters actionFilters, 28 | IndexNameExpressionResolver indexNameExpressionResolver, 29 | TransportService transportService) { 30 | super(settings, LangdetectAction.NAME, threadPool, actionFilters, indexNameExpressionResolver, transportService.getTaskManager()); 31 | services.put("", new LangdetectService(settings)); 32 | } 33 | 34 | @Override 35 | protected void doExecute(LangdetectRequest request, ActionListener<LangdetectResponse> listener) { 36 | String profile = request.getProfile(); 37 | if (profile == null) { 38 | profile = ""; 39 | } 40 | if (!services.containsKey(profile)) { 41 | services.put(profile, new LangdetectService(settings, profile)); 42 | } 43 | List<Language> langs = services.get(profile).detectAll(request.getText()); 44 | listener.onResponse(new LangdetectResponse().setLanguages(langs).setProfile(request.getProfile())); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/action/langdetect/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for language detection action. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.action.langdetect; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/decompound/fst/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for Finite-State-Transducer based decompounder. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.common.decompound.fst; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/decompound/patricia/Node.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.common.decompound.patricia; 2 | 3 | import java.util.ArrayList; 4 | import java.util.List; 5 | 6 | /** 7 | * Node.
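* A node of the compact Patricia trie backing the decompounder: it carries a content
* fragment, a position, the list of classes attached to the fragment, and child nodes.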
8 | */ 9 | class Node { 10 | 11 | private String content; 12 | 13 | private int pos; 14 | 15 | private List<String> classes; 16 | 17 | private List<Node> children; 18 | 19 | Node() { 20 | this.content = ""; 21 | this.classes = new ArrayList<>(); 22 | this.children = new ArrayList<>(); 23 | } 24 | 25 | Node(String content) { 26 | this.content = content; 27 | this.classes = new ArrayList<>(); 28 | this.children = new ArrayList<>(); 29 | } 30 | 31 | public void setContent(String content) { 32 | this.content = content; 33 | } 34 | 35 | public String getContent() { 36 | return content; 37 | } 38 | 39 | public void setPos(int pos) { 40 | this.pos = pos; 41 | } 42 | 43 | public int getPos() { 44 | return pos; 45 | } 46 | 47 | public Node classes(List<String> classes) { 48 | this.classes = classes; 49 | return this; 50 | } 51 | 52 | public List<String> classes() { 53 | return classes; 54 | } 55 | 56 | public Node children(List<Node> children) { 57 | this.children = children; 58 | return this; 59 | } 60 | 61 | public List<Node> children() { 62 | return children; 63 | } 64 | 65 | @Override 66 | public String toString() { 67 | return "[" + content + ',' + classes + ']'; 68 | } 69 | } -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/decompound/patricia/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for Patricia-Trie based decompounder. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.common.decompound.patricia; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/fsa/FSAFlags.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.common.fsa; 2 | 3 | import java.util.Set; 4 | 5 | /** 6 | * FSA automaton flags. Where applicable, flags follow Daciuk's fsa package. 7 | */ 8 | public enum FSAFlags { 9 | /** 10 | * Daciuk: flexible FSA encoding. 11 | */ 12 | FLEXIBLE(1), 13 | 14 | /** 15 | * Daciuk: stop bit in use. 16 | */ 17 | STOPBIT(1 << 1), 18 | 19 | /** 20 | * Daciuk: next bit in use. 21 | */ 22 | NEXTBIT(1 << 2), 23 | 24 | /** 25 | * Daciuk: tails compression. 26 | */ 27 | TAILS(1 << 3), 28 | 29 | /* 30 | * These flags are outside of byte range (never occur in Daciuk's FSA). 31 | */ 32 | 33 | /** 34 | * The FSA contains right-language count numbers on states. 35 | * 36 | * @see FSA#getRightLanguageCount(int) 37 | */ 38 | NUMBERS(1 << 8), 39 | 40 | /** 41 | * The FSA supports legacy built-in separator and filler characters (Daciuk's FSA package 42 | * compatibility). 43 | */ 44 | SEPARATORS(1 << 9); 45 | 46 | /** 47 | * Bit mask for the corresponding flag. 48 | */ 49 | public final int bits; 50 | 51 | FSAFlags(int bits) { 52 | this.bits = bits; 53 | } 54 | 55 | /** 56 | * @param flag flag 57 | * @param flags flags 58 | * @return true if the corresponding flag is set in the bit set.
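* (for illustration, not in the original source: {@code isSet(FLEXIBLE.bits | NUMBERS.bits, NUMBERS)}
* yields {@code true}, while {@code isSet(FLEXIBLE.bits, NUMBERS)} yields {@code false})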
59 | */ 60 | public static boolean isSet(int flags, FSAFlags flag) { 61 | return (flags & flag.bits) != 0; 62 | } 63 | 64 | public static short asShort(Set<FSAFlags> flags) { 65 | short value = 0; 66 | for (FSAFlags f : flags) { 67 | value |= f.bits; 68 | } 69 | return value; 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/fsa/MatchResult.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.common.fsa; 2 | 3 | /** 4 | * A matching result returned from {@link FSATraversal}. 5 | * 6 | * @see FSATraversal 7 | */ 8 | public final class MatchResult { 9 | /** 10 | * The automaton has exactly one match for the input sequence. 11 | */ 12 | public static final int EXACT_MATCH = 0; 13 | 14 | /** 15 | * The automaton has no match for the input sequence. 16 | */ 17 | public static final int NO_MATCH = -1; 18 | 19 | /** 20 | * The automaton contains a prefix of the input sequence. That is: 21 | * one of the input sequences used to build the automaton is a 22 | * prefix of the input sequence that is shorter than the sequence. 23 | * {@link MatchResult#index} will contain an index of the 24 | * first character of the input sequence not present in the 25 | * dictionary. 26 | */ 27 | public static final int AUTOMATON_HAS_PREFIX = -3; 28 | 29 | /** 30 | * The sequence is a prefix of at least one sequence in the automaton. 31 | * {@link MatchResult#node} returns the node from which all sequences 32 | * with the given prefix start in the automaton. 33 | */ 34 | public static final int SEQUENCE_IS_A_PREFIX = -4; 35 | 36 | /** 37 | * One of the match kind constants defined in this class. 38 | * 39 | * @see #NO_MATCH 40 | * @see #EXACT_MATCH 41 | * @see #AUTOMATON_HAS_PREFIX 42 | * @see #SEQUENCE_IS_A_PREFIX 43 | */ 44 | private int kind; 45 | 46 | /** 47 | * Input sequence's index, interpretation depends on {@link #kind}. 48 | */ 49 | private int index; 50 | 51 | /** 52 | * Automaton node, interpretation depends on the {@link #kind}. 53 | */ 54 | private int node; 55 | 56 | /** 57 | * Constructor. 58 | */ 59 | public MatchResult() { 60 | reset(NO_MATCH, 0, 0); 61 | } 62 | 63 | /** 64 | * Reset. 65 | * @param kind kind 66 | * @param index index 67 | * @param node node 68 | */ 69 | void reset(int kind, int index, int node) { 70 | this.kind = kind; 71 | this.index = index; 72 | this.node = node; 73 | } 74 | 75 | public int getKind() { 76 | return kind; 77 | } 78 | 79 | public int getIndex() { 80 | return index; 81 | } 82 | 83 | public int getNode() { 84 | return node; 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/fsa/StateVisitor.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.common.fsa; 2 | 3 | /** 4 | * State visitor.
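* Implementations receive each visited state number; judging from the boolean return
* type (an assumption, not stated in the original source), returning {@code false}
* from {@code accept} stops the traversal early.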
5 | * 6 | * @see FSA#visitInPostOrder(StateVisitor) 7 | * @see FSA#visitInPreOrder(StateVisitor) 8 | */ 9 | 10 | @FunctionalInterface 11 | public interface StateVisitor { 12 | 13 | boolean accept(int state); 14 | } 15 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/langdetect/LangProfile.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.common.langdetect; 2 | 3 | import org.elasticsearch.common.xcontent.XContentHelper; 4 | import org.elasticsearch.common.xcontent.json.JsonXContent; 5 | 6 | import java.io.IOException; 7 | import java.io.InputStream; 8 | import java.util.ArrayList; 9 | import java.util.HashMap; 10 | import java.util.List; 11 | import java.util.Map; 12 | 13 | /** 14 | * Language profile. 15 | */ 16 | public class LangProfile { 17 | 18 | private String name; 19 | 20 | private Map<String, Integer> freq; 21 | 22 | private List<Integer> nWords; 23 | 24 | public LangProfile() { 25 | this.freq = new HashMap<>(); 26 | this.nWords = new ArrayList<>(NGram.N_GRAM); 27 | for (int i = 0; i < NGram.N_GRAM; i++) { 28 | nWords.add(0); 29 | } 30 | } 31 | 32 | public void add(String gram) { 33 | if (name == null || gram == null) { 34 | return; 35 | } 36 | int len = gram.length(); 37 | if (len < 1 || len > NGram.N_GRAM) { 38 | return; 39 | } 40 | nWords.set(len - 1, nWords.get(len - 1) + 1); 41 | if (freq.containsKey(gram)) { 42 | freq.put(gram, freq.get(gram) + 1); 43 | } else { 44 | freq.put(gram, 1); 45 | } 46 | } 47 | 48 | public String getName() { 49 | return name; 50 | } 51 | 52 | public void setName(String name) { 53 | this.name = name; 54 | } 55 | 56 | public List<Integer> getNWords() { 57 | return nWords; 58 | } 59 | 60 | public Map<String, Integer> getFreq() { 61 | return freq; 62 | } 63 | 64 | public void setFreq(Map<String, Integer> freq) { 65 | this.freq = freq; 66 | } 67 | 68 | @SuppressWarnings("unchecked") 69 | public void read(InputStream input) throws IOException { 70 | Map<String, Object> map = XContentHelper.convertToMap(JsonXContent.jsonXContent, input, true); 71 | freq = (Map<String, Integer>) map.get("freq"); 72 | name = (String) map.get("name"); 73 | nWords = (List<Integer>) map.get("n_words"); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/langdetect/Language.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.common.langdetect; 2 | 3 | import org.elasticsearch.common.io.stream.StreamInput; 4 | import org.elasticsearch.common.io.stream.StreamOutput; 5 | import org.elasticsearch.common.io.stream.Streamable; 6 | 7 | import java.io.IOException; 8 | 9 | /** 10 | * Language.
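* A single detection result: a language code such as "de" or "en", as used by the
* bundled profiles, paired with the probability the detector assigned to it.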
11 | */ 12 | public class Language implements Streamable { 13 | 14 | private String lang; 15 | 16 | private double prob; 17 | 18 | public Language(String lang, double prob) { 19 | this.lang = lang; 20 | this.prob = prob; 21 | } 22 | 23 | public String getLanguage() { 24 | return lang; 25 | } 26 | 27 | public double getProbability() { 28 | return prob; 29 | } 30 | 31 | @Override 32 | public void readFrom(StreamInput in) throws IOException { 33 | this.lang = in.readString(); 34 | this.prob = in.readDouble(); 35 | } 36 | 37 | @Override 38 | public void writeTo(StreamOutput out) throws IOException { 39 | out.writeString(lang); 40 | out.writeDouble(prob); 41 | } 42 | 43 | @Override 44 | public String toString() { 45 | return lang + " (prob=" + prob + ")"; 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/langdetect/LanguageDetectionException.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.common.langdetect; 2 | 3 | import java.io.IOException; 4 | 5 | /** 6 | * Language detection exception. 7 | */ 8 | public class LanguageDetectionException extends IOException { 9 | 10 | public LanguageDetectionException(String message) { 11 | super(message); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/langdetect/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for language detection implementation. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.common.langdetect; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/reference/ReferenceService.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.common.reference; 2 | 3 | import org.elasticsearch.client.Client; 4 | import org.elasticsearch.common.component.AbstractLifecycleComponent; 5 | import org.elasticsearch.common.inject.Inject; 6 | import org.elasticsearch.common.inject.Injector; 7 | import org.elasticsearch.common.settings.Settings; 8 | import org.xbib.elasticsearch.plugin.bundle.index.mapper.reference.ReferenceMapperTypeParser; 9 | 10 | /** 11 | * Reference service. 
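* On start, this component fetches the node {@code Client} from the injector and hands
* it to the {@code ReferenceMapperTypeParser} (see {@code doStart} below); presumably
* the reference mapper uses that client to fetch referenced documents.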
12 | */ 13 | public class ReferenceService extends AbstractLifecycleComponent { 14 | 15 | private final Injector injector; 16 | 17 | @Inject 18 | public ReferenceService(Settings settings, Injector injector) { 19 | super(settings); 20 | this.injector = injector; 21 | } 22 | 23 | @Override 24 | protected void doStart() { 25 | // get the client from the injector 26 | Client client = injector.getInstance(Client.class); 27 | // copy the client to the mapper type parser 28 | ReferenceMapperTypeParser referenceMapperTypeParser = injector.getInstance(ReferenceMapperTypeParser.class); 29 | referenceMapperTypeParser.setClient(client); 30 | } 31 | 32 | @Override 33 | protected void doStop() { 34 | // nothing to stop 35 | } 36 | 37 | @Override 38 | protected void doClose() { 39 | // nothing to close 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/reference/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for reference mapper implementation. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.common.reference; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/common/standardnumber/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for standard number implementation. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.common.standardnumber; -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/autophrase/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for auto phrase token filter. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.autophrase; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/baseform/BaseformTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.baseform; 2 | 3 | import org.apache.lucene.analysis.TokenStream; 4 | import org.elasticsearch.ElasticsearchException; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.IndexSettings; 8 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 9 | import org.xbib.elasticsearch.plugin.bundle.common.fsa.Dictionary; 10 | 11 | import java.io.IOException; 12 | import java.io.InputStreamReader; 13 | import java.nio.charset.StandardCharsets; 14 | 15 | /** 16 | * Base form token filter factory. 
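 * <p>
 * A minimal settings sketch for this filter (the filter type name "baseform"
 * is an assumption; "language" and "respect_keywords" are the keys this
 * factory actually reads, with a German lemma dictionary as the default):
 * <pre>{@code
 * "filter" : {
 *   "my_baseform" : {
 *     "type" : "baseform",
 *     "language" : "de",
 *     "respect_keywords" : true
 *   }
 * }
 * }</pre>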
17 | */ 18 | public class BaseformTokenFilterFactory extends AbstractTokenFilterFactory { 19 | 20 | private final boolean respectKeywords; 21 | 22 | private final Dictionary dictionary; 23 | 24 | public BaseformTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { 25 | super(indexSettings, name, settings); 26 | this.respectKeywords = settings.getAsBoolean("respect_keywords", false); 27 | this.dictionary = createDictionary(settings); 28 | } 29 | 30 | @Override 31 | public TokenStream create(TokenStream tokenStream) { 32 | return new BaseformTokenFilter(tokenStream, dictionary, respectKeywords); 33 | } 34 | 35 | private Dictionary createDictionary(Settings settings) { 36 | try { 37 | String lang = settings.get("language", "de"); 38 | String path = lang + "-lemma-utf8.txt"; 39 | return new Dictionary().loadLines(new InputStreamReader(getClass().getResourceAsStream(path), StandardCharsets.UTF_8)); 40 | } catch (IOException e) { 41 | throw new ElasticsearchException("resources in settings not found: " + settings, e); 42 | } 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/baseform/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for baseform token filter. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.baseform; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/concat/ConcatTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.concat; 2 | 3 | import org.apache.lucene.analysis.TokenStream; 4 | import org.elasticsearch.common.settings.Settings; 5 | import org.elasticsearch.env.Environment; 6 | import org.elasticsearch.index.IndexSettings; 7 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 8 | 9 | /** 10 | * Concat token filter factory. 11 | */ 12 | public class ConcatTokenFilterFactory extends AbstractTokenFilterFactory { 13 | 14 | public ConcatTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { 15 | super(indexSettings, name, settings); 16 | } 17 | 18 | @Override 19 | public TokenStream create(TokenStream tokenStream) { 20 | return new ConcatTokenFilter(tokenStream); 21 | 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/concat/PairTokenFilter.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.concat; 2 | 3 | import org.apache.lucene.analysis.TokenFilter; 4 | import org.apache.lucene.analysis.TokenStream; 5 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 6 | 7 | import java.io.IOException; 8 | import java.util.ArrayDeque; 9 | import java.util.Deque; 10 | import java.util.LinkedList; 11 | import java.util.Map; 12 | import java.util.Queue; 13 | 14 | /** 15 | * Pair token filter. 
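 * <p>
 * Joins configured token pairs into a single token, e.g. "New" followed by
 * "York" becomes "New York". Instances are created by {@link PairTokenFilterFactory},
 * which reads the "pairs" settings object; a sketch (the filter type name
 * "pair" is an assumption):
 * <pre>{@code
 * "filter" : {
 *   "my_pairs" : {
 *     "type" : "pair",
 *     "pairs" : { "New" : "York" }
 *   }
 * }
 * }</pre>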
16 | */
17 | public final class PairTokenFilter extends TokenFilter {
18 |
19 |     private final CharTermAttribute termAttr;
20 |
21 |     private final Map<String, String> pairs;
22 |
23 |     private final Queue<String> queue;
24 |
25 |     protected PairTokenFilter(TokenStream input, Map<String, String> pairs) {
26 |         super(input);
27 |         this.termAttr = addAttribute(CharTermAttribute.class);
28 |         this.pairs = pairs;
29 |         this.queue = new LinkedList<>();
30 |     }
31 |
32 |     @Override
33 |     public boolean incrementToken() throws IOException {
34 |         if (!queue.isEmpty()) {
35 |             termAttr.setEmpty().append(queue.poll()); // clear the previous term before emitting a queued token
36 |             return true;
37 |         }
38 |         if (!input.incrementToken()) {
39 |             return false;
40 |         }
41 |         Deque<String> stack = new ArrayDeque<>();
42 |         while (pairs.containsKey(termAttr.toString())) {
43 |             String term = termAttr.toString();
44 |             stack.push(term);
45 |             if (!input.incrementToken()) {
46 |                 break;
47 |             }
48 |             String next = termAttr.toString();
49 |             if (pairs.get(term).equals(next)) {
50 |                 stack.pop();
51 |                 stack.push(term + " " + next);
52 |                 break;
53 |             } else if (!pairs.containsKey(next)) {
54 |                 stack.push(next);
55 |             }
56 |         }
57 |         for (String term : stack) {
58 |             queue.add(term);
59 |         }
60 |         if (!queue.isEmpty()) {
61 |             termAttr.setEmpty().append(queue.poll());
62 |         }
63 |         return true;
64 |     }
65 |
66 |     @Override
67 |     public boolean equals(Object object) {
68 |         return object instanceof PairTokenFilter &&
69 |                 pairs.equals(((PairTokenFilter) object).pairs);
70 |     }
71 |
72 |     @Override
73 |     public int hashCode() {
74 |         return pairs.hashCode();
75 |     }
76 | }
77 |
--------------------------------------------------------------------------------
/src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/concat/PairTokenFilterFactory.java:
--------------------------------------------------------------------------------
1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.concat;
2 |
3 | import org.apache.lucene.analysis.TokenStream;
4 | import org.elasticsearch.common.settings.Settings;
5 | import org.elasticsearch.env.Environment;
6 | import org.elasticsearch.index.IndexSettings;
7 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
8 |
9 | import java.util.LinkedHashMap;
10 | import java.util.Map;
11 |
12 | /**
13 |  * Pair token filter factory.
14 |  */
15 | public class PairTokenFilterFactory extends AbstractTokenFilterFactory {
16 |
17 |     private final Map<String, String> pairs;
18 |
19 |     public PairTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
20 |         super(indexSettings, name, settings);
21 |         this.pairs = new LinkedHashMap<>();
22 |         Settings pairsSettings = settings.getAsSettings("pairs");
23 |         for (String key : pairsSettings.keySet()) {
24 |             pairs.put(key, pairsSettings.get(key));
25 |         }
26 |     }
27 |
28 |     @Override
29 |     public TokenStream create(TokenStream tokenStream) {
30 |         return new PairTokenFilter(tokenStream, pairs);
31 |     }
32 | }
33 |
--------------------------------------------------------------------------------
/src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/concat/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * Classes for concat token filter.
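 * <p>
 * A sketch of wiring the concat filter into an analyzer (the filter name
 * "concat" is an assumption based on this package's naming):
 * <pre>{@code
 * "analyzer" : {
 *   "my_concat" : {
 *     "tokenizer" : "standard",
 *     "filter" : [ "concat" ]
 *   }
 * }
 * }</pre>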
3 |  */
4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.concat;
5 |
--------------------------------------------------------------------------------
/src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/decompound/fst/FstDecompoundTokenFilterFactory.java:
--------------------------------------------------------------------------------
1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.decompound.fst;
2 |
3 | import org.apache.lucene.analysis.TokenStream;
4 | import org.elasticsearch.common.settings.Settings;
5 | import org.elasticsearch.env.Environment;
6 | import org.elasticsearch.index.IndexSettings;
7 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory;
8 | import org.xbib.elasticsearch.plugin.bundle.common.decompound.fst.FstDecompounder;
9 |
10 | import java.io.IOException;
11 | import java.util.List;
12 |
13 | /**
14 |  * Finite state decompound token filter factory.
15 |  */
16 | public class FstDecompoundTokenFilterFactory extends AbstractTokenFilterFactory {
17 |
18 |     private final FstDecompounder decompounder;
19 |
20 |     private final boolean respectKeywords;
21 |
22 |     private final boolean subwordsOnly;
23 |
24 |     public FstDecompoundTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name,
25 |                                            Settings settings) {
26 |         super(indexSettings, name, settings);
27 |         this.decompounder = createDecompounder(settings);
28 |         this.respectKeywords = settings.getAsBoolean("respect_keywords", false);
29 |         this.subwordsOnly = settings.getAsBoolean("subwords_only", false);
30 |     }
31 |
32 |     @Override
33 |     public TokenStream create(TokenStream tokenStream) {
34 |         return new FstDecompoundTokenFilter(tokenStream, decompounder, respectKeywords, subwordsOnly);
35 |     }
36 |
37 |     private FstDecompounder createDecompounder(Settings settings) {
38 |         try {
39 |             String words = settings.get("fst", "words.fst");
40 |             List<String> glueMorphs = settings.getAsList("glue_morphs");
41 |             return new FstDecompounder(getClass().getResourceAsStream(words), glueMorphs);
42 |         } catch (IOException e) {
43 |             throw new IllegalArgumentException("fst decompounder resources in settings not found: " + settings, e);
44 |         }
45 |     }
46 | }
47 |
--------------------------------------------------------------------------------
/src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/decompound/fst/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * Finite-state-transducer (FST) based decompound token filter.
3 |  */
4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.decompound.fst;
5 |
--------------------------------------------------------------------------------
/src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/decompound/patricia/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * Patricia-Trie based decompounder.
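 * <p>
 * A settings sketch (the filter type name "decompound" is an assumption, and
 * the keys shown mirror those of the FST variant above):
 * <pre>{@code
 * "filter" : {
 *   "my_decompound" : {
 *     "type" : "decompound",
 *     "respect_keywords" : true,
 *     "subwords_only" : false
 *   }
 * }
 * }</pre>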
3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.decompound.patricia; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/german/GermanNormalizationFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.german; 2 | 3 | import org.apache.lucene.analysis.TokenStream; 4 | import org.apache.lucene.analysis.de.GermanNormalizationFilter; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.IndexSettings; 8 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 9 | 10 | /** 11 | * German normalization filter factory. 12 | */ 13 | public class GermanNormalizationFilterFactory extends AbstractTokenFilterFactory { 14 | 15 | public GermanNormalizationFilterFactory(IndexSettings indexSettings, Environment environment, String name, 16 | Settings settings) { 17 | super(indexSettings, name, settings); 18 | } 19 | 20 | @Override 21 | public TokenStream create(TokenStream tokenStream) { 22 | return new GermanNormalizationFilter(tokenStream); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/german/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * German normalization filter. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.german; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/hyphen/HyphenAnalyzer.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.hyphen; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.apache.lucene.analysis.Tokenizer; 5 | 6 | /** 7 | * Hyphen analyzer. 8 | */ 9 | public class HyphenAnalyzer extends Analyzer { 10 | 11 | private final HyphenTokenizerFactory tokenizerFactory; 12 | 13 | public HyphenAnalyzer(HyphenTokenizerFactory tokenizerFactory) { 14 | this.tokenizerFactory = tokenizerFactory; 15 | } 16 | 17 | @Override 18 | protected TokenStreamComponents createComponents(String fieldName) { 19 | Tokenizer tokenizer = tokenizerFactory.create(); 20 | return new TokenStreamComponents(tokenizer, tokenizer); 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/hyphen/HyphenTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.hyphen; 2 | 3 | import org.apache.lucene.analysis.TokenStream; 4 | import org.elasticsearch.common.settings.Settings; 5 | import org.elasticsearch.env.Environment; 6 | import org.elasticsearch.index.IndexSettings; 7 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 8 | 9 | /** 10 | * Hyphen token filter factory. 
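 * <p>
 * A minimal settings sketch (the filter type name "hyphen" is an assumption;
 * "hyphens", "subwords" and "respect_keywords" are the keys this factory reads):
 * <pre>{@code
 * "filter" : {
 *   "my_hyphen" : {
 *     "type" : "hyphen",
 *     "subwords" : true,
 *     "respect_keywords" : false
 *   }
 * }
 * }</pre>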
11 | */ 12 | public class HyphenTokenFilterFactory extends AbstractTokenFilterFactory { 13 | 14 | private final char[] hyphenchars; 15 | 16 | private final boolean subwords; 17 | 18 | private final boolean respectKeywords; 19 | 20 | public HyphenTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, 21 | Settings settings) { 22 | super(indexSettings, name, settings); 23 | this.hyphenchars = settings.get("hyphens") != null ? settings.get("hyphens").toCharArray() : HyphenTokenFilter.HYPHEN; 24 | this.subwords = settings.getAsBoolean("subwords", true); 25 | this.respectKeywords = settings.getAsBoolean("respect_keywords", false); 26 | } 27 | 28 | @Override 29 | public TokenStream create(TokenStream tokenStream) { 30 | return new HyphenTokenFilter(tokenStream, hyphenchars, subwords, respectKeywords); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/hyphen/HyphenTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.hyphen; 2 | 3 | import org.apache.lucene.analysis.Tokenizer; 4 | import org.apache.lucene.analysis.standard.StandardAnalyzer; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.IndexSettings; 8 | import org.elasticsearch.index.analysis.AbstractTokenizerFactory; 9 | 10 | /** 11 | * Hyphen tokenizer factory. 12 | */ 13 | public class HyphenTokenizerFactory extends AbstractTokenizerFactory { 14 | 15 | private final Integer maxTokenLength; 16 | 17 | public HyphenTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, 18 | Settings settings) { 19 | super(indexSettings, name, settings); 20 | this.maxTokenLength = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH); 21 | } 22 | 23 | @Override 24 | public Tokenizer create() { 25 | return new HyphenTokenizer(maxTokenLength); 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/hyphen/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for hyphen analysis. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.hyphen; -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/IcuCollationKeyAnalyzer.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu; 2 | 3 | import com.ibm.icu.text.Collator; 4 | import org.apache.lucene.analysis.Analyzer; 5 | import org.apache.lucene.analysis.core.KeywordTokenizer; 6 | import org.apache.lucene.collation.CollationKeyAnalyzer; 7 | 8 | /** 9 | * Configures a {@link KeywordTokenizer} with an {@link IcuCollationAttributeFactory}. 10 | *

11 |  * Converts the token into its {@link com.ibm.icu.text.CollationKey} and
12 |  * then encodes the CollationKey directly.
13 |  * </p>
14 |  * <p>
15 |  * WARNING: Make sure you use exactly the same Collator at
16 |  * index and query time -- CollationKeys are only comparable when produced by
17 |  * the same Collator. {@link com.ibm.icu.text.RuleBasedCollator}s are
18 |  * independently versioned, so it is safe to search against stored
19 |  * CollationKeys if the following are exactly the same (best practice is
20 |  * to store this information with the index and check that they remain the
21 |  * same at query time):
22 |  * </p>
23 |  * <ol>
24 |  * <li>
25 |  * Collator version - see {@link Collator#getVersion()}
26 |  * </li>
27 |  * <li>
28 |  * The collation strength used - see {@link Collator#setStrength(int)}
29 |  * </li>
30 |  * </ol>
31 |  * <p>
32 |  * CollationKeys generated by ICU Collators are not compatible with those
33 |  * generated by java.text.Collators. Specifically, if you use
34 |  * ICUCollationKeyAnalyzer to generate index terms, do not use
35 |  * {@link CollationKeyAnalyzer} on the query side, or vice versa.
36 |  * </p>
37 |  * <p>
38 |  * ICUCollationKeyAnalyzer is significantly faster and generates significantly
39 |  * shorter keys than CollationKeyAnalyzer. See
40 |  * <a href="http://site.icu-project.org/charts/collation-icu4j-sun">
41 |  * http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
42 |  * generation timing and key length comparisons between ICU4J and
43 |  * java.text.Collator over several languages.
44 |  * </p>
45 | */ 46 | public final class IcuCollationKeyAnalyzer extends Analyzer { 47 | 48 | private final IcuCollationAttributeFactory factory; 49 | 50 | public IcuCollationKeyAnalyzer(Collator collator) { 51 | this.factory = new IcuCollationAttributeFactory(collator); 52 | } 53 | 54 | @Override 55 | protected TokenStreamComponents createComponents(String fieldName) { 56 | KeywordTokenizer tokenizer = new KeywordTokenizer(factory, KeywordTokenizer.DEFAULT_BUFFER_SIZE); 57 | return new TokenStreamComponents(tokenizer, tokenizer); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/IcuCollationTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu; 2 | 3 | import com.ibm.icu.text.Collator; 4 | import org.apache.lucene.analysis.Tokenizer; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.IndexSettings; 8 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.segmentation.IcuTokenizer; 9 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.segmentation.IcuTokenizerFactory; 10 | 11 | /** 12 | * This {@link IcuTokenizer} uses an ICU @{@link Collator} as a char attribute factory. 13 | */ 14 | public class IcuCollationTokenizerFactory extends IcuTokenizerFactory { 15 | 16 | private final IcuCollationAttributeFactory factory; 17 | 18 | public IcuCollationTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, 19 | Settings settings) { 20 | super(indexSettings, environment, name, settings); 21 | this.factory = new IcuCollationAttributeFactory(IcuCollationKeyAnalyzerProvider.createCollator(settings)); 22 | } 23 | 24 | @Override 25 | public Tokenizer create() { 26 | return new IcuTokenizer(factory, config); 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/IcuFoldingCharFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu; 2 | 3 | import org.elasticsearch.common.settings.Settings; 4 | import org.elasticsearch.env.Environment; 5 | import org.elasticsearch.index.IndexSettings; 6 | 7 | import java.io.InputStream; 8 | 9 | /** 10 | * Applies foldings from UTR#30 Character Foldings. 11 | * Can be filtered to handle certain characters in a specified way. 12 | * See http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html 13 | * E.g national chars that should be retained, like unicodeSetFilter : "[^åäöÅÄÖ]". 
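 * <p>
 * Note that this factory reads the "unicode_set_filter" key. A settings sketch
 * (the char filter type name "icu_folding" is an assumption):
 * <pre>{@code
 * "char_filter" : {
 *   "my_folding" : {
 *     "type" : "icu_folding",
 *     "unicode_set_filter" : "[^åäöÅÄÖ]"
 *   }
 * }
 * }</pre>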
14 | */ 15 | public class IcuFoldingCharFilterFactory extends IcuNormalizerCharFilterFactory { 16 | 17 | public IcuFoldingCharFilterFactory(IndexSettings indexSettings, Environment environment, String name, 18 | Settings settings) { 19 | super(indexSettings, environment, name, settings); 20 | } 21 | 22 | @Override 23 | protected String getNormalizationName(Settings settings) { 24 | return settings.get("normalization_name", "utr30"); 25 | } 26 | 27 | @Override 28 | protected InputStream getNormalizationResource(Settings settings) { 29 | InputStream inputStream = null; 30 | if ("utr30".equals(getNormalizationName(settings))) { 31 | inputStream = getClass().getResourceAsStream("utr30.nrm"); 32 | } 33 | return inputStream; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/IcuFoldingTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu; 2 | 3 | import org.elasticsearch.common.settings.Settings; 4 | import org.elasticsearch.env.Environment; 5 | import org.elasticsearch.index.IndexSettings; 6 | 7 | import java.io.InputStream; 8 | 9 | /** 10 | * Applies foldings from UTR#30 Character Foldings. 11 | * Can be filtered to handle certain characters in a specified way. 12 | * See http://icu-project.org/apiref/icu4j/com/ibm/icu/text/UnicodeSet.html 13 | * E.g national chars that should be retained, like unicode_set_filter : "[^åäöÅÄÖ]". 14 | */ 15 | public class IcuFoldingTokenFilterFactory extends IcuNormalizerTokenFilterFactory { 16 | 17 | public IcuFoldingTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, 18 | Settings settings) { 19 | super(indexSettings, environment, name, settings); 20 | } 21 | 22 | @Override 23 | public Object getMultiTermComponent() { 24 | return this; 25 | } 26 | 27 | @Override 28 | protected String getNormalizationName(Settings settings) { 29 | return settings.get("normalization_name", "utr30"); 30 | } 31 | 32 | @Override 33 | protected InputStream getNormalizationResource(Settings settings) { 34 | InputStream inputStream = null; 35 | if ("utr30".equals(getNormalizationName(settings))) { 36 | inputStream = getClass().getResourceAsStream("utr30.nrm"); 37 | } 38 | return inputStream; 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/IcuNormalizerCharFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu; 2 | 3 | import com.ibm.icu.text.FilteredNormalizer2; 4 | import com.ibm.icu.text.Normalizer2; 5 | import com.ibm.icu.text.UnicodeSet; 6 | import org.elasticsearch.common.settings.Settings; 7 | import org.elasticsearch.env.Environment; 8 | import org.elasticsearch.index.IndexSettings; 9 | import org.elasticsearch.index.analysis.AbstractCharFilterFactory; 10 | import org.elasticsearch.index.analysis.MultiTermAwareComponent; 11 | 12 | import java.io.InputStream; 13 | import java.io.Reader; 14 | 15 | /** 16 | * ICU normalizer char filter factory. 
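 * <p>
 * Reads "normalization_name" (default "nfkc_cf"), "normalization_mode"
 * (default "compose") and "unicode_set_filter". A settings sketch (the char
 * filter type name "icu_normalizer" is an assumption):
 * <pre>{@code
 * "char_filter" : {
 *   "my_normalizer" : {
 *     "type" : "icu_normalizer",
 *     "normalization_name" : "nfkc_cf",
 *     "normalization_mode" : "decompose"
 *   }
 * }
 * }</pre>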
17 | */ 18 | public class IcuNormalizerCharFilterFactory extends AbstractCharFilterFactory implements MultiTermAwareComponent { 19 | 20 | private final Normalizer2 normalizer; 21 | 22 | public IcuNormalizerCharFilterFactory(IndexSettings indexSettings, Environment environment, String name, 23 | Settings settings) { 24 | super(indexSettings, name); 25 | Normalizer2 base = Normalizer2.getInstance(getNormalizationResource(settings), 26 | getNormalizationName(settings), getNormalizationMode(settings)); 27 | String unicodeSetFilter = settings.get("unicode_set_filter"); 28 | this.normalizer = unicodeSetFilter != null ? 29 | new FilteredNormalizer2(base, new UnicodeSet(unicodeSetFilter).freeze()) : base; 30 | } 31 | 32 | @Override 33 | public Reader create(Reader reader) { 34 | return new IcuNormalizerCharFilter(reader, normalizer); 35 | } 36 | 37 | @Override 38 | public Object getMultiTermComponent() { 39 | return this; 40 | } 41 | 42 | protected InputStream getNormalizationResource(Settings settings) { 43 | InputStream inputStream = null; 44 | if ("utr30".equals(getNormalizationName(settings))) { 45 | inputStream = getClass().getResourceAsStream("utr30.nrm"); 46 | } 47 | return inputStream; 48 | } 49 | 50 | protected String getNormalizationName(Settings settings) { 51 | return settings.get("normalization_name", "nfkc_cf"); 52 | } 53 | 54 | protected Normalizer2.Mode getNormalizationMode(Settings settings) { 55 | Normalizer2.Mode normalizationMode; 56 | switch (settings.get("normalization_mode", "compose")) { 57 | case "compose_contiguous": 58 | normalizationMode = Normalizer2.Mode.COMPOSE_CONTIGUOUS; 59 | break; 60 | case "decompose": 61 | normalizationMode = Normalizer2.Mode.DECOMPOSE; 62 | break; 63 | case "fcd": 64 | normalizationMode = Normalizer2.Mode.FCD; 65 | break; 66 | default: 67 | normalizationMode = Normalizer2.Mode.COMPOSE; 68 | break; 69 | } 70 | return normalizationMode; 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/IcuNormalizerFilter.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu; 2 | 3 | import com.ibm.icu.text.Normalizer; 4 | import com.ibm.icu.text.Normalizer2; 5 | import org.apache.lucene.analysis.TokenFilter; 6 | import org.apache.lucene.analysis.TokenStream; 7 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 8 | 9 | import java.io.IOException; 10 | 11 | /** 12 | * Normalize token text with ICU {@link Normalizer2}. 13 | *

14 |  * With this filter, you can normalize text in the following ways:
15 |  * <ul>
16 |  *   <li>NFKC Normalization, Case Folding, and removing Ignorables (the default)</li>
17 |  *   <li>Using a standard Normalization mode (NFC, NFD, NFKC, NFKD)</li>
18 |  *   <li>Based on rules from a custom normalization mapping.</li>
19 |  * </ul>
20 |  * <p>
21 |  * If you use the defaults, this filter is a simple way to standardize Unicode text
22 |  * in a language-independent way for search:
23 |  * <ul>
24 |  *   <li>The case folding that it does can be seen as a replacement for
25 |  *   LowerCaseFilter: For example, it handles cases such as the Greek sigma, so that
26 |  *   "Μάϊος" and "ΜΆΪΟΣ" will match correctly.</li>
27 |  *   <li>The normalization will standardize different forms of the same
28 |  *   character in Unicode. For example, CJK full-width numbers will be standardized
29 |  *   to their ASCII forms.</li>
30 |  *   <li>Ignorables such as Zero-Width Joiner and Variation Selectors are removed.
31 |  *   These are typically modifier characters that affect display.</li>
32 |  * </ul>
33 | * 34 | * @see Normalizer2 35 | * @see com.ibm.icu.text.FilteredNormalizer2 36 | */ 37 | public class IcuNormalizerFilter extends TokenFilter { 38 | 39 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 40 | 41 | private final Normalizer2 normalizer; 42 | 43 | private final StringBuilder buffer = new StringBuilder(); 44 | 45 | /** 46 | * Create a new Normalizer2Filter with the specified Normalizer2. 47 | * 48 | * @param input stream 49 | * @param normalizer normalizer to use 50 | */ 51 | public IcuNormalizerFilter(TokenStream input, Normalizer2 normalizer) { 52 | super(input); 53 | this.normalizer = normalizer; 54 | } 55 | 56 | @Override 57 | public final boolean incrementToken() throws IOException { 58 | if (input.incrementToken()) { 59 | if (normalizer.quickCheck(termAtt) != Normalizer.YES) { 60 | buffer.setLength(0); 61 | normalizer.normalize(termAtt, buffer); 62 | termAtt.setEmpty().append(buffer); 63 | } 64 | return true; 65 | } else { 66 | return false; 67 | } 68 | } 69 | 70 | @Override 71 | public boolean equals(Object object) { 72 | return object instanceof IcuNormalizerFilter; 73 | } 74 | 75 | @Override 76 | public int hashCode() { 77 | return 0; 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/IcuNormalizerTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu; 2 | 3 | import com.ibm.icu.text.FilteredNormalizer2; 4 | import com.ibm.icu.text.Normalizer2; 5 | import com.ibm.icu.text.UnicodeSet; 6 | import org.apache.lucene.analysis.TokenStream; 7 | import org.elasticsearch.common.settings.Settings; 8 | import org.elasticsearch.env.Environment; 9 | import org.elasticsearch.index.IndexSettings; 10 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 11 | import org.elasticsearch.index.analysis.MultiTermAwareComponent; 12 | 13 | import java.io.InputStream; 14 | 15 | /** 16 | * Uses the {@link IcuNormalizerFilter} to normalize tokens. 17 | * 18 | * The name can be used to provide the type of normalization to perform, 19 | * the mode can be used to provide the mode of normalization. 20 | */ 21 | public class IcuNormalizerTokenFilterFactory extends AbstractTokenFilterFactory implements MultiTermAwareComponent { 22 | 23 | private final Normalizer2 normalizer; 24 | 25 | public IcuNormalizerTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, 26 | Settings settings) { 27 | super(indexSettings, name, settings); 28 | 29 | Normalizer2 base = Normalizer2.getInstance(getNormalizationResource(settings), 30 | getNormalizationName(settings), getNormalizationMode(settings)); 31 | 32 | String unicodeSetFilter = settings.get("unicode_set_filter"); 33 | this.normalizer = unicodeSetFilter != null ? 
34 | new FilteredNormalizer2(base, new UnicodeSet(unicodeSetFilter).freeze()) : base; 35 | } 36 | 37 | @Override 38 | public TokenStream create(TokenStream tokenStream) { 39 | return new IcuNormalizerFilter(tokenStream, normalizer); 40 | } 41 | 42 | @Override 43 | public Object getMultiTermComponent() { 44 | return this; 45 | } 46 | 47 | protected InputStream getNormalizationResource(Settings settings) { 48 | InputStream inputStream = null; 49 | if ("utr30".equals(getNormalizationName(settings))) { 50 | inputStream = getClass().getResourceAsStream("utr30.nrm"); 51 | } 52 | return inputStream; 53 | } 54 | 55 | protected String getNormalizationName(Settings settings) { 56 | return settings.get("normalization_name", "nfkc_cf"); 57 | } 58 | 59 | protected Normalizer2.Mode getNormalizationMode(Settings settings) { 60 | Normalizer2.Mode normalizationMode; 61 | switch (settings.get("normalization_mode", "compose")) { 62 | case "compose_contiguous": 63 | normalizationMode = Normalizer2.Mode.COMPOSE_CONTIGUOUS; 64 | break; 65 | case "decompose": 66 | normalizationMode = Normalizer2.Mode.DECOMPOSE; 67 | break; 68 | case "fcd": 69 | normalizationMode = Normalizer2.Mode.FCD; 70 | break; 71 | default: 72 | normalizationMode = Normalizer2.Mode.COMPOSE; 73 | break; 74 | } 75 | return normalizationMode; 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/IcuNumberFormatTokenFilter.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu; 2 | 3 | import com.ibm.icu.text.NumberFormat; 4 | import org.apache.lucene.analysis.TokenFilter; 5 | import org.apache.lucene.analysis.TokenStream; 6 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 7 | import org.apache.lucene.analysis.tokenattributes.TypeAttribute; 8 | 9 | import java.io.IOException; 10 | import java.text.ParsePosition; 11 | 12 | /** 13 | * ICU number format token filter. 
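 * <p>
 * Parses a spelled-out number token and re-spells it in canonical form,
 * stripping soft hyphens and spaces. A construction sketch using ICU's
 * rule-based spellout format (the German input is illustrative):
 * <pre>{@code
 * NumberFormat spellout = new RuleBasedNumberFormat(ULocale.GERMAN, RuleBasedNumberFormat.SPELLOUT);
 * TokenStream stream = new IcuNumberFormatTokenFilter(input, spellout);
 * // "zehn-tausend" is parsed to 10000 and re-spelled as "zehntausend"
 * }</pre>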
14 | */ 15 | public final class IcuNumberFormatTokenFilter extends TokenFilter { 16 | 17 | private final NumberFormat numberFormat; 18 | 19 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 20 | private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); 21 | 22 | public IcuNumberFormatTokenFilter(TokenStream input, NumberFormat numberFormat) { 23 | super(input); 24 | this.numberFormat = numberFormat; 25 | } 26 | 27 | @Override 28 | public boolean incrementToken() throws IOException { 29 | if (!input.incrementToken()) { 30 | return false; 31 | } else { 32 | String s = termAtt.toString(); 33 | ParsePosition parsePosition = new ParsePosition(0); 34 | Number result = numberFormat.parse(s, parsePosition); 35 | if (parsePosition.getIndex() > 0) { 36 | // zehn-tausend -> zehntausend 37 | // one hundred thousand -> onehundredthousand 38 | s = numberFormat.format(result).replaceAll("[\u00AD\u0020]", ""); 39 | } 40 | termAtt.setEmpty().append(s); 41 | typeAtt.setType(""); 42 | return true; 43 | } 44 | } 45 | 46 | @Override 47 | public boolean equals(Object object) { 48 | return object instanceof IcuNumberFormatTokenFilter && 49 | numberFormat.equals(((IcuNumberFormatTokenFilter) object).numberFormat); 50 | } 51 | 52 | @Override 53 | public int hashCode() { 54 | return numberFormat.hashCode(); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/IcuNumberFormatTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu; 2 | 3 | import com.ibm.icu.text.RuleBasedNumberFormat; 4 | import com.ibm.icu.util.ULocale; 5 | import org.apache.lucene.analysis.TokenStream; 6 | import org.elasticsearch.common.settings.Settings; 7 | import org.elasticsearch.env.Environment; 8 | import org.elasticsearch.index.IndexSettings; 9 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 10 | 11 | import java.util.Locale; 12 | 13 | /** 14 | * ICU number format token filter factory. 15 | */ 16 | public class IcuNumberFormatTokenFilterFactory extends AbstractTokenFilterFactory { 17 | 18 | private final ULocale locale; 19 | 20 | private final int format; 21 | 22 | private final boolean lenient; 23 | 24 | private final boolean grouping; 25 | 26 | public IcuNumberFormatTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, 27 | Settings settings) { 28 | super(indexSettings, name, settings); 29 | this.locale = settings.get("locale") != null ? 
new ULocale(settings.get("locale")) : ULocale.getDefault(); 30 | String formatStr = settings.get("format", "SPELLOUT"); 31 | switch (formatStr.toUpperCase(Locale.ROOT)) { 32 | case "DURATION": 33 | format = RuleBasedNumberFormat.DURATION; 34 | break; 35 | case "NUMBERING_SYSTEM": 36 | format = RuleBasedNumberFormat.NUMBERING_SYSTEM; 37 | break; 38 | case "NUMBERSTYLE": 39 | format = RuleBasedNumberFormat.NUMBERSTYLE; 40 | break; 41 | case "ORDINAL": 42 | format = RuleBasedNumberFormat.ORDINAL; 43 | break; 44 | case "SPELLOUT": 45 | default: 46 | format = RuleBasedNumberFormat.SPELLOUT; 47 | break; 48 | } 49 | // RBNF parsing is incredibly slow when lenient is enabled but the only method to parse compound number words 50 | this.lenient = settings.getAsBoolean("lenient", true); 51 | this.grouping = settings.getAsBoolean("grouping", true); 52 | } 53 | 54 | @Override 55 | public TokenStream create(TokenStream tokenStream) { 56 | // create a new number format instance for each token stream 57 | RuleBasedNumberFormat ruleBasedNumberFormat = new RuleBasedNumberFormat(locale, format); 58 | ruleBasedNumberFormat.setLenientParseMode(lenient); 59 | ruleBasedNumberFormat.setGroupingUsed(grouping); 60 | return new IcuNumberFormatTokenFilter(tokenStream, ruleBasedNumberFormat); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/IcuTransformTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu; 2 | 3 | import com.ibm.icu.text.Transliterator; 4 | import com.ibm.icu.text.UnicodeSet; 5 | import org.apache.lucene.analysis.TokenStream; 6 | import org.elasticsearch.common.settings.Settings; 7 | import org.elasticsearch.env.Environment; 8 | import org.elasticsearch.index.IndexSettings; 9 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 10 | 11 | /** 12 | * ICU transform token filter factory. 13 | */ 14 | public class IcuTransformTokenFilterFactory extends AbstractTokenFilterFactory { 15 | 16 | private final Transliterator transliterator; 17 | 18 | public IcuTransformTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, 19 | Settings settings) { 20 | super(indexSettings, name, settings); 21 | String id = settings.get("id", "Null"); 22 | String direction = settings.get("dir", "forward"); 23 | int dir = "forward".equals(direction) ? Transliterator.FORWARD : Transliterator.REVERSE; 24 | String rules = settings.get("rules"); 25 | this.transliterator = rules != null ? 26 | Transliterator.createFromRules(id, rules, dir) : 27 | Transliterator.getInstance(id, dir); 28 | String unicodeSetFilter = settings.get("unicodeSetFilter"); 29 | if (unicodeSetFilter != null) { 30 | transliterator.setFilter(new UnicodeSet(unicodeSetFilter).freeze()); 31 | } 32 | } 33 | 34 | @Override 35 | public TokenStream create(TokenStream tokenStream) { 36 | return new IcuTransformTokenFilter(tokenStream, transliterator); 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for Elasticsearch analysis by International Components for Unicode. 
3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/segmentation/CharArrayIterator.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.segmentation; 2 | 3 | import java.text.CharacterIterator; 4 | 5 | /** 6 | * Wraps a char[] as CharacterIterator for processing with a BreakIterator. 7 | */ 8 | public final class CharArrayIterator implements CharacterIterator { 9 | 10 | private char[] array; 11 | 12 | private int start; 13 | 14 | private int index; 15 | 16 | private int length; 17 | 18 | private int limit; 19 | 20 | public char[] getText() { 21 | return array; 22 | } 23 | 24 | public int getStart() { 25 | return start; 26 | } 27 | 28 | public int getLength() { 29 | return length; 30 | } 31 | 32 | /** 33 | * Set a new region of text to be examined by this iterator. 34 | * 35 | * @param array text buffer to examine 36 | * @param start offset into buffer 37 | * @param length maximum length to examine 38 | */ 39 | public void setText(final char[] array, int start, int length) { 40 | this.array = array; 41 | this.start = start; 42 | this.index = start; 43 | this.length = length; 44 | this.limit = start + length; 45 | } 46 | 47 | @Override 48 | public char current() { 49 | return (index == limit) ? DONE : array[index]; 50 | } 51 | 52 | @Override 53 | public char first() { 54 | index = start; 55 | return current(); 56 | } 57 | 58 | @Override 59 | public int getBeginIndex() { 60 | return 0; 61 | } 62 | 63 | @Override 64 | public int getEndIndex() { 65 | return length; 66 | } 67 | 68 | @Override 69 | public int getIndex() { 70 | return index - start; 71 | } 72 | 73 | @Override 74 | public char last() { 75 | index = (limit == start) ? limit : limit - 1; 76 | return current(); 77 | } 78 | 79 | @Override 80 | public char next() { 81 | if (++index >= limit) { 82 | index = limit; 83 | return DONE; 84 | } else { 85 | return current(); 86 | } 87 | } 88 | 89 | @Override 90 | public char previous() { 91 | if (--index < start) { 92 | index = start; 93 | return DONE; 94 | } else { 95 | return current(); 96 | } 97 | } 98 | 99 | @Override 100 | public char setIndex(int position) { 101 | if (position < getBeginIndex() || position > getEndIndex()) { 102 | throw new IllegalArgumentException("Illegal Position: " + position); 103 | } 104 | index = start + position; 105 | return current(); 106 | } 107 | 108 | @Override 109 | public CharArrayIterator clone() { 110 | CharArrayIterator clone = new CharArrayIterator(); 111 | clone.setText(array, start, length); 112 | clone.index = index; 113 | return clone; 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/segmentation/IcuTokenizerConfig.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.segmentation; 2 | 3 | import com.ibm.icu.text.BreakIterator; 4 | 5 | /** 6 | * Class that allows for tailored Unicode Text Segmentation on 7 | * a per-writing system basis. 8 | */ 9 | public interface IcuTokenizerConfig { 10 | 11 | /** 12 | * Return a breakiterator capable of processing a given script. 
13 | * 14 | * @param script script 15 | * @return iterator 16 | */ 17 | BreakIterator getBreakIterator(int script); 18 | 19 | /** 20 | * Return a token type value for a given script and BreakIterator 21 | * rule status. 22 | * 23 | * @param script script 24 | * @param ruleStatus rule status 25 | * @return type 26 | */ 27 | String getType(int script, int ruleStatus); 28 | 29 | /** 30 | * @return true if Han, Hiragana, and Katakana scripts should all be returned as Japanese 31 | */ 32 | boolean combineCJ(); 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/segmentation/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for text segmentation with International Components for Unicode. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.segmentation; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/tokenattributes/ScriptAttribute.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.tokenattributes; 2 | 3 | import org.apache.lucene.util.Attribute; 4 | 5 | /** 6 | * This attribute stores the UTR #24 script value for a token of text. 7 | */ 8 | public interface ScriptAttribute extends Attribute { 9 | /** 10 | * Get the numeric code for this script value. 11 | * This is the constant value from {@link com.ibm.icu.lang.UScript}. 12 | * 13 | * @return numeric code 14 | */ 15 | int getCode(); 16 | 17 | /** 18 | * Set the numeric code for this script value. 19 | * This is the constant value from {@link com.ibm.icu.lang.UScript}. 20 | * 21 | * @param code numeric code 22 | */ 23 | void setCode(int code); 24 | 25 | /** 26 | * Get the full name. 27 | * 28 | * @return UTR #24 full name. 29 | */ 30 | String getName(); 31 | 32 | /** 33 | * Get the abbreviated name. 34 | * 35 | * @return UTR #24 abbreviated name. 36 | */ 37 | String getShortName(); 38 | } 39 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/tokenattributes/ScriptAttributeImpl.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.tokenattributes; 2 | 3 | import com.ibm.icu.lang.UScript; 4 | import org.apache.lucene.util.AttributeImpl; 5 | import org.apache.lucene.util.AttributeReflector; 6 | 7 | /** 8 | * Implementation of {@link ScriptAttribute} that stores the script as an integer. 
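 * <p>
 * A consumption sketch: Lucene resolves this implementation from the
 * {@link ScriptAttribute} interface by its naming convention:
 * <pre>{@code
 * ScriptAttribute scriptAtt = tokenStream.addAttribute(ScriptAttribute.class);
 * tokenStream.reset();
 * while (tokenStream.incrementToken()) {
 *     String script = scriptAtt.getName(); // e.g. "Latin"
 * }
 * }</pre>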
9 |  */
10 | public class ScriptAttributeImpl extends AttributeImpl implements ScriptAttribute, Cloneable {
11 |     private int code = UScript.COMMON;
12 |
13 |     public ScriptAttributeImpl() {}
14 |
15 |     @Override
16 |     public int getCode() {
17 |         return code;
18 |     }
19 |
20 |     @Override
21 |     public void setCode(int code) {
22 |         this.code = code;
23 |     }
24 |
25 |     @Override
26 |     public String getName() {
27 |         return UScript.getName(code);
28 |     }
29 |
30 |     @Override
31 |     public String getShortName() {
32 |         return UScript.getShortName(code);
33 |     }
34 |
35 |     @Override
36 |     public void clear() {
37 |         code = UScript.COMMON;
38 |     }
39 |
40 |     @Override
41 |     public void copyTo(AttributeImpl target) {
42 |         ScriptAttribute t = (ScriptAttribute) target;
43 |         t.setCode(code);
44 |     }
45 |
46 |     @Override
47 |     public boolean equals(Object other) {
48 |         return this == other || other instanceof ScriptAttributeImpl &&
49 |                 ((ScriptAttributeImpl) other).code == code;
50 |     }
51 |
52 |     @Override
53 |     public ScriptAttributeImpl clone() {
54 |         ScriptAttributeImpl attribute = (ScriptAttributeImpl) super.clone();
55 |         attribute.code = this.code;
56 |         return attribute;
57 |     }
58 |
59 |     @Override
60 |     public int hashCode() {
61 |         return code;
62 |     }
63 |
64 |     @Override
65 |     public void reflectWith(AttributeReflector reflector) {
66 |         String name = code == UScript.JAPANESE ? "Chinese/Japanese" : getName();
67 |         reflector.reflect(ScriptAttribute.class, "script", name);
68 |     }
69 | }
70 |
--------------------------------------------------------------------------------
/src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/tokenattributes/package-info.java:
--------------------------------------------------------------------------------
1 | /**
2 |  * Classes for token attributes of International Components for Unicode.
3 |  */
4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.tokenattributes;
5 |
--------------------------------------------------------------------------------
/src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/tools/RBBIRuleCompiler.java:
--------------------------------------------------------------------------------
1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.tools;
2 |
3 | import com.ibm.icu.text.RuleBasedBreakIterator;
4 | import org.apache.logging.log4j.LogManager;
5 | import org.apache.logging.log4j.Logger;
6 |
7 | import java.io.BufferedReader;
8 | import java.io.IOException;
9 | import java.io.InputStream;
10 | import java.io.InputStreamReader;
11 | import java.io.OutputStream;
12 | import java.nio.charset.StandardCharsets;
13 | import java.nio.file.Files;
14 | import java.nio.file.Path;
15 |
16 | /**
17 |  * Utility to convert RuleBasedBreakIterator (.rbbi) files into binary compiled form (.brk).
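 * <p>
 * A usage sketch (the file names are hypothetical):
 * <pre>{@code
 * RBBIRuleCompiler compiler = new RBBIRuleCompiler();
 * compiler.compile(Paths.get("MyRules.rbbi"), Paths.get("MyRules.brk"));
 * }</pre>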
18 | */ 19 | public class RBBIRuleCompiler { 20 | 21 | private static final Logger logger = LogManager.getLogger(RBBIRuleCompiler.class.getName()); 22 | 23 | public void compile(Path inputPath, Path outputPath) throws IOException { 24 | compile(Files.newInputStream(inputPath), Files.newOutputStream(outputPath)); 25 | } 26 | 27 | public void compile(InputStream inputStream, OutputStream outputStream) throws IOException { 28 | String rules = getRules(inputStream); 29 | try (OutputStream os = outputStream) { 30 | new RuleBasedBreakIterator(rules); 31 | RuleBasedBreakIterator.compileRules(rules, os); 32 | } catch (IllegalArgumentException e) { 33 | logger.error(e.getMessage(), e); 34 | } 35 | } 36 | 37 | private String getRules(InputStream inputStream) throws IOException { 38 | StringBuilder rules = new StringBuilder(); 39 | try (BufferedReader bufferedReader = 40 | new BufferedReader(new InputStreamReader(inputStream, StandardCharsets.UTF_8))) { 41 | String line; 42 | while ((line = bufferedReader.readLine()) != null) { 43 | if (!line.startsWith("#")) { 44 | rules.append(line); 45 | rules.append('\n'); 46 | } 47 | } 48 | } 49 | return rules.toString(); 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/tools/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Classes for ICU tools. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.tools; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/lemmatize/LemmatizeTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.lemmatize; 2 | 3 | import org.apache.lucene.analysis.TokenStream; 4 | import org.elasticsearch.ElasticsearchException; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.IndexSettings; 8 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 9 | import org.xbib.elasticsearch.plugin.bundle.common.fsa.Dictionary; 10 | 11 | import java.io.InputStream; 12 | import java.io.InputStreamReader; 13 | import java.io.Reader; 14 | import java.nio.charset.StandardCharsets; 15 | import java.util.zip.GZIPInputStream; 16 | 17 | /** 18 | * Lemmatize token filter factory. 
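 * <p>
 * A settings sketch (the filter type name "lemmatize" is an assumption;
 * "language", "resource", "lemma_only" and "respect_keywords" are the keys
 * this factory reads):
 * <pre>{@code
 * "filter" : {
 *   "my_lemmatizer" : {
 *     "type" : "lemmatize",
 *     "language" : "en",
 *     "lemma_only" : true
 *   }
 * }
 * }</pre>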
19 | */ 20 | public class LemmatizeTokenFilterFactory extends AbstractTokenFilterFactory { 21 | 22 | private final Dictionary dictionary; 23 | 24 | private final boolean respectKeywords; 25 | 26 | private final boolean lemmaOnly; 27 | 28 | public LemmatizeTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) { 29 | super(indexSettings, name, settings); 30 | this.respectKeywords = settings.getAsBoolean("respect_keywords", false); 31 | this.lemmaOnly = settings.getAsBoolean("lemma_only", true); 32 | this.dictionary = createDictionary(settings); 33 | } 34 | 35 | @Override 36 | public TokenStream create(TokenStream tokenStream) { 37 | return new LemmatizeTokenFilter(tokenStream, dictionary, respectKeywords, lemmaOnly); 38 | } 39 | 40 | private Dictionary createDictionary(Settings settings) { 41 | String language = settings.get("language", "en"); 42 | try { 43 | String resource = settings.get("resource", "lemmatization-" + language + ".fsa.gz"); 44 | if (resource.endsWith(".fsa") || resource.endsWith("fsa.gz")) { 45 | // FSA 46 | InputStream inputStream = getClass().getResourceAsStream(resource); 47 | if (resource.endsWith(".gz")) { 48 | inputStream = new GZIPInputStream(inputStream); 49 | } 50 | Dictionary dictionary = new Dictionary().loadFSA(inputStream); 51 | inputStream.close(); 52 | return dictionary; 53 | } else { 54 | // Text 55 | InputStream inputStream = getClass().getResourceAsStream(resource); 56 | if (resource.endsWith(".gz")) { 57 | inputStream = new GZIPInputStream(inputStream); 58 | } 59 | Reader reader = new InputStreamReader(inputStream, StandardCharsets.UTF_8); 60 | Dictionary dictionary = new Dictionary().loadLinesReverse(reader); 61 | reader.close(); 62 | return dictionary; 63 | } 64 | } catch (Exception e) { 65 | throw new ElasticsearchException("resources for language " + language + 66 | " in settings not found: " + settings, e); 67 | } 68 | } 69 | } 70 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/naturalsort/NaturalSortKeyAnalyzer.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.naturalsort; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.apache.lucene.analysis.core.KeywordTokenizer; 5 | 6 | import java.text.Collator; 7 | 8 | /** 9 | * Natural sort key analyzer. 
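 * <p>
 * A construction sketch (the argument values are illustrative; digits and
 * maxTokens control the zero-padding of embedded digit runs, see
 * NaturalSortKeyAttributeImpl):
 * <pre>{@code
 * Analyzer analyzer = new NaturalSortKeyAnalyzer(
 *         Collator.getInstance(Locale.GERMAN), KeywordTokenizer.DEFAULT_BUFFER_SIZE, 1, 2);
 * }</pre>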
10 | */ 11 | public class NaturalSortKeyAnalyzer extends Analyzer { 12 | 13 | private final NaturalSortKeyAttributeFactory factory; 14 | 15 | private final int bufferSize; 16 | 17 | public NaturalSortKeyAnalyzer(Collator collator, int bufferSize, int digits, int maxtoken) { 18 | this.factory = new NaturalSortKeyAttributeFactory(collator, digits, maxtoken); 19 | this.bufferSize = bufferSize; 20 | } 21 | 22 | @Override 23 | protected TokenStreamComponents createComponents(String fieldName) { 24 | KeywordTokenizer tokenizer = new KeywordTokenizer(factory, bufferSize); 25 | return new TokenStreamComponents(tokenizer, tokenizer); 26 | } 27 | 28 | } 29 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/naturalsort/NaturalSortKeyAnalyzerProvider.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.naturalsort; 2 | 3 | import org.apache.lucene.analysis.core.KeywordTokenizer; 4 | import org.elasticsearch.common.settings.Settings; 5 | import org.elasticsearch.env.Environment; 6 | import org.elasticsearch.index.IndexSettings; 7 | import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider; 8 | 9 | import java.text.Collator; 10 | import java.util.Locale; 11 | 12 | /** 13 | * Natural sort key analyzer provider. 14 | */ 15 | public class NaturalSortKeyAnalyzerProvider extends AbstractIndexAnalyzerProvider { 16 | 17 | private final Collator collator; 18 | 19 | private final int digits; 20 | 21 | private final int maxTokens; 22 | 23 | private final int bufferSize; 24 | 25 | public NaturalSortKeyAnalyzerProvider(IndexSettings indexSettings, Environment environment, String name, 26 | Settings settings) { 27 | super(indexSettings, name, settings); 28 | this.collator = createCollator(settings); 29 | this.digits = settings.getAsInt("digits", 1); 30 | this.maxTokens = settings.getAsInt("maxTokens", 2); 31 | this.bufferSize = settings.getAsInt("bufferSize", KeywordTokenizer.DEFAULT_BUFFER_SIZE); 32 | } 33 | 34 | protected static Collator createCollator(Settings settings) { 35 | return Collator.getInstance(new Locale(settings.get("locale", Locale.getDefault().toString()))); 36 | } 37 | 38 | @Override 39 | public NaturalSortKeyAnalyzer get() { 40 | return new NaturalSortKeyAnalyzer(collator, bufferSize, digits, maxTokens); 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/naturalsort/NaturalSortKeyAttributeFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.naturalsort; 2 | 3 | import org.apache.lucene.analysis.TokenStream; 4 | import org.apache.lucene.util.AttributeFactory; 5 | 6 | import java.text.Collator; 7 | 8 | /** 9 | * Natural sort key attribute factory. 
10 | */ 11 | public class NaturalSortKeyAttributeFactory 12 | extends AttributeFactory.StaticImplementationAttributeFactory<NaturalSortKeyAttributeImpl> { 13 | 14 | private final Collator collator; 15 | 16 | private final int digits; 17 | 18 | private final int maxTokens; 19 | 20 | public NaturalSortKeyAttributeFactory(Collator collator, int digits, int maxTokens) { 21 | this(TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, collator, digits, maxTokens); 22 | } 23 | 24 | public NaturalSortKeyAttributeFactory(AttributeFactory delegate, Collator collator, int digits, int maxTokens) { 25 | super(delegate, NaturalSortKeyAttributeImpl.class); 26 | this.collator = collator; 27 | this.digits = digits; 28 | this.maxTokens = maxTokens; 29 | } 30 | 31 | @Override 32 | protected NaturalSortKeyAttributeImpl createInstance() { 33 | return new NaturalSortKeyAttributeImpl(collator, digits, maxTokens); 34 | } 35 | 36 | @Override 37 | public boolean equals(Object object) { 38 | return object instanceof NaturalSortKeyAttributeFactory && 39 | collator.equals(((NaturalSortKeyAttributeFactory)object).collator) && 40 | Integer.compare(digits, ((NaturalSortKeyAttributeFactory)object).digits) == 0 && 41 | Integer.compare(maxTokens, ((NaturalSortKeyAttributeFactory)object).maxTokens) == 0; 42 | } 43 | 44 | @Override 45 | public int hashCode() { 46 | return collator.hashCode() ^ Integer.hashCode(digits) ^ Integer.hashCode(maxTokens); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/naturalsort/NaturalSortKeyAttributeImpl.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.naturalsort; 2 | 3 | import org.apache.lucene.analysis.tokenattributes.CharTermAttributeImpl; 4 | import org.apache.lucene.util.BytesRef; 5 | 6 | import java.text.Collator; 7 | import java.util.Locale; 8 | import java.util.regex.Matcher; 9 | import java.util.regex.Pattern; 10 | 11 | /** 12 | * Natural sort key attribute implementation.
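 * <p>Worked example of the length-prefix rewrite performed by {@code natural(String)} with digits = 1: every digit run gets the count of its digits prepended, so {@code "file2"} becomes {@code "file12"} and {@code "file10"} becomes {@code "file210"}; under the collation key comparison, {@code "file12"} then sorts before {@code "file210"}, i.e. 2 before 10, as a human reader expects.</p>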
13 | */ 14 | public class NaturalSortKeyAttributeImpl extends CharTermAttributeImpl { 15 | 16 | private static final Pattern numberPattern = Pattern.compile("(\\+|\\-)?([0-9]+)"); 17 | 18 | private final Collator collator; 19 | 20 | private final int digits; 21 | 22 | private final int maxTokens; 23 | 24 | public NaturalSortKeyAttributeImpl(Collator collator, int digits, int maxTokens) { 25 | this.collator = collator; 26 | this.digits = digits; 27 | this.maxTokens = maxTokens; 28 | } 29 | 30 | @Override 31 | public BytesRef getBytesRef() { 32 | byte[] collationKey = collator.getCollationKey(natural(toString())).toByteArray(); 33 | final BytesRef ref = this.builder.get(); 34 | ref.bytes = collationKey; 35 | ref.offset = 0; 36 | ref.length = collationKey.length; 37 | return ref; 38 | } 39 | 40 | private String natural(String s) { 41 | StringBuffer sb = new StringBuffer(); 42 | Matcher m = numberPattern.matcher(s); 43 | int foundTokens = 0; 44 | while (m.find()) { 45 | int len = m.group(2).length(); 46 | String fmt = "%0" + digits + "d"; 47 | String repl = String.format(Locale.ROOT, fmt, len) + m.group(); 48 | m.appendReplacement(sb, repl); 49 | foundTokens++; 50 | if (foundTokens >= maxTokens) { 51 | break; 52 | } 53 | } 54 | m.appendTail(sb); 55 | return sb.toString(); 56 | } 57 | 58 | @Override 59 | public boolean equals(Object object) { 60 | return object instanceof NaturalSortKeyAttributeImpl && 61 | collator.equals(((NaturalSortKeyAttributeImpl)object).collator) && 62 | Integer.compare(digits, ((NaturalSortKeyAttributeImpl)object).digits) == 0 && 63 | Integer.compare(maxTokens, ((NaturalSortKeyAttributeImpl)object).maxTokens) == 0; 64 | } 65 | 66 | @Override 67 | public int hashCode() { 68 | return collator.hashCode() ^ Integer.hashCode(digits) ^ Integer.hashCode(maxTokens); 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/naturalsort/NaturalSortKeyTokenizerFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.naturalsort; 2 | 3 | import org.apache.lucene.analysis.Tokenizer; 4 | import org.apache.lucene.analysis.core.KeywordTokenizer; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.env.Environment; 7 | import org.elasticsearch.index.IndexSettings; 8 | import org.elasticsearch.index.analysis.AbstractTokenizerFactory; 9 | 10 | import java.text.Collator; 11 | 12 | /** 13 | * Natural sort key tokenizer factory. 
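 * <p>Hypothetical configuration sketch in the style of the bundled tests; the tokenizer name {@code naturalsort} is an assumption about how BundlePlugin registers this factory, while the setting keys ({@code locale}, {@code digits}, {@code maxTokens}, {@code bufferSize}) are read verbatim below:</p> <pre>{@code Settings settings = Settings.builder() .put("index.analysis.tokenizer.natsort.type", "naturalsort") // name assumed .put("index.analysis.tokenizer.natsort.locale", "de") .put("index.analysis.tokenizer.natsort.digits", 1) .put("index.analysis.tokenizer.natsort.maxTokens", 2) .build(); }</pre>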
14 | */ 15 | public class NaturalSortKeyTokenizerFactory extends AbstractTokenizerFactory { 16 | 17 | private final NaturalSortKeyAttributeFactory factory; 18 | 19 | private final int bufferSize; 20 | 21 | public NaturalSortKeyTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, 22 | Settings settings) { 23 | super(indexSettings, name, settings); 24 | Collator collator = NaturalSortKeyAnalyzerProvider.createCollator(settings); 25 | int digits = settings.getAsInt("digits", 1); 26 | int maxTokens = settings.getAsInt("maxTokens", 2); 27 | this.factory = new NaturalSortKeyAttributeFactory(collator, digits, maxTokens); 28 | this.bufferSize = settings.getAsInt("bufferSize", KeywordTokenizer.DEFAULT_BUFFER_SIZE); 29 | } 30 | 31 | @Override 32 | public Tokenizer create() { 33 | return new KeywordTokenizer(factory, bufferSize); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/naturalsort/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Natural sort. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.naturalsort; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/sortform/SortformTokenFilter.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.sortform; 2 | 3 | import org.apache.lucene.analysis.TokenFilter; 4 | import org.apache.lucene.analysis.TokenStream; 5 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 6 | 7 | import java.io.IOException; 8 | import java.util.regex.Pattern; 9 | 10 | /** 11 | * Sort form token filter. 
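 * <p>Example of the effect: bracketed non-sort markers of the kind used in library cataloging are stripped before sorting, so a term like {@code <<Der>> Name} is reduced to {@code Name}; each of the five marker pairs below is removed the same way.</p>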
12 | */ 13 | public class SortformTokenFilter extends TokenFilter { 14 | 15 | private static final Pattern[] patterns = { 16 | Pattern.compile("\\s*<<.*?>>\\s*"), 17 | Pattern.compile("\\s*<.*?>\\s*"), 18 | Pattern.compile("\\s*\u0098.*?\u009C\\s*"), 19 | Pattern.compile("\\s*\u02BE.*?\u02BB\\s*"), 20 | Pattern.compile("\\s*\u00AC.*?\u00AC\\s*") 21 | }; 22 | private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 23 | 24 | protected SortformTokenFilter(TokenStream input) { 25 | super(input); 26 | } 27 | 28 | @Override 29 | public final boolean incrementToken() throws IOException { 30 | if (!input.incrementToken()) { 31 | return false; 32 | } else { 33 | String s = termAtt.toString(); 34 | for (Pattern pattern : patterns) { 35 | s = pattern.matcher(s).replaceAll(""); 36 | } 37 | termAtt.setEmpty().append(s); 38 | return true; 39 | } 40 | } 41 | 42 | @Override 43 | public boolean equals(Object object) { 44 | return object instanceof SortformTokenFilter; 45 | } 46 | 47 | @Override 48 | public int hashCode() { 49 | return 0; 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/sortform/SortformTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.sortform; 2 | 3 | import org.apache.lucene.analysis.TokenStream; 4 | import org.elasticsearch.common.settings.Settings; 5 | import org.elasticsearch.env.Environment; 6 | import org.elasticsearch.index.IndexSettings; 7 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 8 | 9 | /** 10 | * Sort form token filter factory. 11 | */ 12 | public class SortformTokenFilterFactory extends AbstractTokenFilterFactory { 13 | 14 | public SortformTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, 15 | Settings settings) { 16 | super(indexSettings, name, settings); 17 | } 18 | 19 | @Override 20 | public TokenStream create(TokenStream tokenStream) { 21 | return new SortformTokenFilter(tokenStream); 22 | } 23 | 24 | } 25 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/standardnumber/StandardnumberAnalyzer.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.standardnumber; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.apache.lucene.analysis.TokenStream; 5 | import org.apache.lucene.analysis.Tokenizer; 6 | import org.elasticsearch.index.analysis.TokenFilterFactory; 7 | import org.elasticsearch.index.analysis.TokenizerFactory; 8 | 9 | import java.util.Collections; 10 | 11 | /** 12 | * Standard number analyzer. 
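 * <p>Composition note: this analyzer simply chains the configured tokenizer (a whitespace tokenizer when built by the provider below) into the standard number token filter, which detects and normalizes identifiers such as ISBNs via the StandardnumberService; no further filters are applied.</p>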
13 | */ 14 | public class StandardnumberAnalyzer extends Analyzer { 15 | 16 | private final TokenizerFactory tokenizerFactory; 17 | private final StandardnumberTokenFilterFactory stdnumTokenFilterFactory; 18 | 19 | public StandardnumberAnalyzer(TokenizerFactory tokenizerFactory, 20 | StandardnumberTokenFilterFactory stdnumTokenFilterFactory) { 21 | this.tokenizerFactory = tokenizerFactory; 22 | this.stdnumTokenFilterFactory = stdnumTokenFilterFactory; 23 | } 24 | 25 | @Override 26 | protected TokenStreamComponents createComponents(String fieldName) { 27 | Tokenizer tokenizer = tokenizerFactory.create(); 28 | TokenStream tokenStream = tokenizer; 29 | for (TokenFilterFactory tokenFilter : Collections.singletonList(stdnumTokenFilterFactory)) { 30 | tokenStream = tokenFilter.create(tokenStream); 31 | } 32 | return new TokenStreamComponents(tokenizer, tokenStream); 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/standardnumber/StandardnumberAnalyzerProvider.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.standardnumber; 2 | 3 | import org.elasticsearch.common.settings.Settings; 4 | import org.elasticsearch.env.Environment; 5 | import org.elasticsearch.index.IndexSettings; 6 | import org.elasticsearch.index.analysis.AbstractIndexAnalyzerProvider; 7 | import org.elasticsearch.index.analysis.WhitespaceTokenizerFactory; 8 | import org.xbib.elasticsearch.plugin.bundle.index.mapper.standardnumber.StandardnumberMapper; 9 | 10 | /** 11 | * Standard number analyzer provider. 12 | */ 13 | public class StandardnumberAnalyzerProvider extends AbstractIndexAnalyzerProvider<StandardnumberAnalyzer> { 14 | 15 | private final StandardnumberAnalyzer analyzer; 16 | 17 | public StandardnumberAnalyzerProvider(IndexSettings indexSettings, Environment environment, String name, 18 | Settings settings, StandardnumberMapper.TypeParser standardNumberTypeParser) { 19 | super(indexSettings, name, settings); 20 | WhitespaceTokenizerFactory tokenizerFactory = 21 | new WhitespaceTokenizerFactory(indexSettings, environment, name, settings); 22 | StandardnumberTokenFilterFactory stdnumTokenFilterFactory = 23 | new StandardnumberTokenFilterFactory(indexSettings, environment, name, settings, standardNumberTypeParser); 24 | this.analyzer = new StandardnumberAnalyzer(tokenizerFactory, stdnumTokenFilterFactory); 25 | } 26 | 27 | @Override 28 | public StandardnumberAnalyzer get() { 29 | return this.analyzer; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/standardnumber/StandardnumberTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.standardnumber; 2 | 3 | import org.apache.lucene.analysis.TokenStream; 4 | import org.elasticsearch.common.settings.Settings; 5 | import org.elasticsearch.env.Environment; 6 | import org.elasticsearch.index.IndexSettings; 7 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 8 | import org.xbib.elasticsearch.plugin.bundle.index.mapper.standardnumber.StandardnumberMapper; 9 | import org.xbib.elasticsearch.plugin.bundle.common.standardnumber.StandardnumberService; 10 | 11 | /** 12 | * Standard number token filter factory.
13 | */ 14 | public class StandardnumberTokenFilterFactory extends AbstractTokenFilterFactory { 15 | 16 | private final Settings settings; 17 | 18 | private final StandardnumberService standardnumberService; 19 | 20 | public StandardnumberTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, 21 | Settings settings, StandardnumberMapper.TypeParser standardNumberTypeParser) { 22 | super(indexSettings, name, settings); 23 | this.settings = settings; 24 | this.standardnumberService = new StandardnumberService(settings); 25 | this.standardnumberService.setStandardNumberTypeParser(standardNumberTypeParser); 26 | } 27 | 28 | @Override 29 | public TokenStream create(TokenStream tokenStream) { 30 | return new StandardnumberTokenFilter(tokenStream, standardnumberService, settings); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/symbolname/SymbolnameTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.symbolname; 2 | 3 | import org.apache.lucene.analysis.TokenStream; 4 | import org.elasticsearch.common.settings.Settings; 5 | import org.elasticsearch.env.Environment; 6 | import org.elasticsearch.index.IndexSettings; 7 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 8 | 9 | /** 10 | * Symbol name token filter factory. 11 | */ 12 | public class SymbolnameTokenFilterFactory extends AbstractTokenFilterFactory { 13 | 14 | public SymbolnameTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, 15 | Settings settings) { 16 | super(indexSettings, name, settings); 17 | } 18 | 19 | @Override 20 | public TokenStream create(TokenStream tokenStream) { 21 | return new SymbolnameTokenFilter(tokenStream); 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/symbolname/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Symbol name token filter. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.symbolname; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/worddelimiter/WordDelimiterFlags.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.worddelimiter; 2 | 3 | /** 4 | * Flags for {@link WordDelimiterFilter}. 
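 * <p>The configuration constants below are bit flags meant to be OR-ed into a single int, e.g. (a usage sketch, not a shipped default): <pre>{@code int flags = WordDelimiterFlags.GENERATE_WORD_PARTS | WordDelimiterFlags.GENERATE_NUMBER_PARTS | WordDelimiterFlags.SPLIT_ON_CASE_CHANGE; }</pre> The LOWER/UPPER/DIGIT/SUBWORD_DELIM constants, by contrast, are character-class masks used internally by the filter, not configuration flags.</p>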
5 | */ 6 | public interface WordDelimiterFlags { 7 | int LOWER = 0x01; 8 | 9 | int UPPER = 0x02; 10 | 11 | int DIGIT = 0x04; 12 | 13 | int SUBWORD_DELIM = 0x08; 14 | 15 | int ALPHA = 0x03; 16 | 17 | int ALPHANUM = 0x07; 18 | 19 | /** 20 | * Causes parts of words to be generated: 21 | * "PowerShot" => "Power" "Shot" 22 | */ 23 | int GENERATE_WORD_PARTS = 1; 24 | 25 | /** 26 | * Causes number subwords to be generated: 27 | * "500-42" => "500" "42" 28 | */ 29 | int GENERATE_NUMBER_PARTS = 2; 30 | 31 | /** 32 | * Causes maximum runs of word parts to be catenated: 33 | * "wi-fi" => "wifi" 34 | */ 35 | int CATENATE_WORDS = 4; 36 | 37 | /** 38 | * Causes maximum runs of number parts to be catenated: 39 | * "500-42" => "50042" 40 | */ 41 | int CATENATE_NUMBERS = 8; 42 | 43 | /** 44 | * Causes all subword parts to be catenated: 45 | * "wi-fi-4000" => "wifi4000" 46 | */ 47 | int CATENATE_ALL = 16; 48 | 49 | /** 50 | * Causes original words to be preserved and added to the subword list (defaults to false): 51 | * "500-42" => "500" "42" "500-42" 52 | */ 53 | int PRESERVE_ORIGINAL = 32; 54 | 55 | /** 56 | * If not set, causes case changes to be ignored (subwords will only be generated 57 | * given SUBWORD_DELIM tokens) 58 | */ 59 | int SPLIT_ON_CASE_CHANGE = 64; 60 | 61 | /** 62 | * If not set, causes numeric changes to be ignored (subwords will only be generated 63 | * given SUBWORD_DELIM tokens). 64 | */ 65 | int SPLIT_ON_NUMERICS = 128; 66 | 67 | /** 68 | * Causes trailing "'s" to be removed for each subword 69 | * "O'Neil's" => "O", "Neil" 70 | */ 71 | int STEM_ENGLISH_POSSESSIVE = 256; 72 | 73 | /** 74 | * Causes all parts to share the same position. 75 | * The default is off and causes each intermediate part to take its own position. 76 | */ 77 | int ALL_PARTS_AT_SAME_POSITION = 512; 78 | 79 | } 80 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/worddelimiter/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Word delimiter filter. 3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.worddelimiter; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/year/GregorianYearTokenFilter.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.year; 2 | 3 | import org.apache.lucene.analysis.TokenFilter; 4 | import org.apache.lucene.analysis.TokenStream; 5 | import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; 6 | 7 | import java.io.IOException; 8 | import java.util.regex.Matcher; 9 | import java.util.regex.Pattern; 10 | 11 | /** 12 | * Gregorian year token filter.
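 * <p>In brief: a token that is exactly a four-digit number, such as {@code 1999}, is kept as the year; any other token is replaced by the configured default year ({@code 0000} unless the {@code default_year} setting overrides it, see the factory below).</p>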
13 | */ 14 | public class GregorianYearTokenFilter extends TokenFilter { 15 | 16 | private static final Pattern pattern = Pattern.compile("(\\d{4})"); 17 | 18 | protected final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); 19 | 20 | private final String defaultYear; 21 | 22 | protected GregorianYearTokenFilter(TokenStream input, String defaultYear) { 23 | super(input); 24 | this.defaultYear = defaultYear; 25 | } 26 | 27 | @Override 28 | public final boolean incrementToken() throws IOException { 29 | if (!input.incrementToken()) { 30 | return false; 31 | } else { 32 | String s = termAtt.toString(); 33 | Matcher m = pattern.matcher(s); 34 | termAtt.setEmpty(); 35 | if (m.matches()) { 36 | // the token is exactly a four-digit year: keep it 37 | termAtt.append(m.group(1)); 38 | } else { 39 | // otherwise emit the configured default year 40 | termAtt.append(defaultYear); 41 | } 42 | return true; 43 | } 44 | } 45 | 46 | @Override 47 | public boolean equals(Object object) { 48 | return object instanceof GregorianYearTokenFilter && 49 | defaultYear.equals(((GregorianYearTokenFilter)object).defaultYear); 50 | } 51 | 52 | @Override 53 | public int hashCode() { 54 | return defaultYear.hashCode(); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/analysis/year/GregorianYearTokenFilterFactory.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.analysis.year; 2 | 3 | import org.apache.lucene.analysis.TokenStream; 4 | import org.elasticsearch.common.settings.Settings; 5 | import org.elasticsearch.env.Environment; 6 | import org.elasticsearch.index.IndexSettings; 7 | import org.elasticsearch.index.analysis.AbstractTokenFilterFactory; 8 | 9 | /** 10 | * Gregorian year token filter factory. 11 | */ 12 | public class GregorianYearTokenFilterFactory extends AbstractTokenFilterFactory { 13 | 14 | private final String defaultYear; 15 | 16 | public GregorianYearTokenFilterFactory(IndexSettings indexSettings, Environment environment, String name, 17 | Settings settings) { 18 | super(indexSettings, name, settings); 19 | defaultYear = settings.get("default_year", "0000"); 20 | } 21 | 22 | @Override 23 | public TokenStream create(TokenStream tokenStream) { 24 | return new GregorianYearTokenFilter(tokenStream, defaultYear); 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/mapper/reference/ReferenceMapperModule.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.mapper.reference; 2 | 3 | import org.elasticsearch.common.inject.AbstractModule; 4 | import org.xbib.elasticsearch.plugin.bundle.common.reference.ReferenceService; 5 | 6 | /** 7 | * Reference field mapper module.
8 | */ 9 | public class ReferenceMapperModule extends AbstractModule { 10 | 11 | private final ReferenceMapperTypeParser typeParser; 12 | 13 | public ReferenceMapperModule(ReferenceMapperTypeParser typeParser) { 14 | this.typeParser = typeParser; 15 | } 16 | 17 | @Override 18 | protected void configure() { 19 | bind(ReferenceService.class).asEagerSingleton(); 20 | bind(ReferenceMapperTypeParser.class).toInstance(typeParser); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/mapper/reference/ReferenceMapperTypeParser.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.mapper.reference; 2 | 3 | /** 4 | * Reference field mapper type parser. 5 | */ 6 | public final class ReferenceMapperTypeParser extends ReferenceMapper.TypeParser { 7 | } 8 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/mapper/standardnumber/StandardnumberMapperModule.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.mapper.standardnumber; 2 | 3 | import org.elasticsearch.common.inject.AbstractModule; 4 | import org.xbib.elasticsearch.plugin.bundle.common.standardnumber.StandardnumberService; 5 | 6 | /** 7 | * Standard number field mapper module. 8 | */ 9 | public class StandardnumberMapperModule extends AbstractModule { 10 | 11 | private final StandardnumberMapperTypeParser typeParser; 12 | 13 | public StandardnumberMapperModule(StandardnumberMapperTypeParser typeParser) { 14 | this.typeParser = typeParser; 15 | } 16 | 17 | @Override 18 | protected void configure() { 19 | bind(StandardnumberService.class).asEagerSingleton(); 20 | bind(StandardnumberMapperTypeParser.class).toInstance(typeParser); 21 | } 22 | 23 | } 24 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/index/mapper/standardnumber/StandardnumberMapperTypeParser.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.index.mapper.standardnumber; 2 | 3 | /** 4 | * Standard number field mapper type parser. 5 | */ 6 | public final class StandardnumberMapperTypeParser extends StandardnumberMapper.TypeParser { 7 | } 8 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Plugin bundle for Elasticsearch. 
3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle; 5 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/rest/action/isbnformat/RestISBNFormatterAction.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.rest.action.isbnformat; 2 | 3 | import org.elasticsearch.client.node.NodeClient; 4 | import org.elasticsearch.common.inject.Inject; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.rest.BaseRestHandler; 7 | import org.elasticsearch.rest.RestController; 8 | import org.elasticsearch.rest.RestRequest; 9 | import org.elasticsearch.rest.action.RestStatusToXContentListener; 10 | import org.xbib.elasticsearch.plugin.bundle.action.isbnformat.ISBNFormatAction; 11 | import org.xbib.elasticsearch.plugin.bundle.action.isbnformat.ISBNFormatRequest; 12 | 13 | import java.io.IOException; 14 | 15 | import static org.elasticsearch.rest.RestRequest.Method.GET; 16 | 17 | /** 18 | * REST ISBN format action. 19 | */ 20 | public class RestISBNFormatterAction extends BaseRestHandler { 21 | 22 | @Inject 23 | public RestISBNFormatterAction(Settings settings, RestController controller) { 24 | super(settings); 25 | controller.registerHandler(GET, "/_isbn", this); 26 | controller.registerHandler(GET, "/_isbn/{value}", this); 27 | } 28 | 29 | @Override 30 | public String getName() { 31 | return "ISBN"; 32 | } 33 | 34 | @Override 35 | protected RestChannelConsumer prepareRequest(RestRequest request, NodeClient client) throws IOException { 36 | final String value = request.param("value"); 37 | final ISBNFormatRequest isbnFormatRequest = new ISBNFormatRequest().setValue(value); 38 | return channel -> client.execute(ISBNFormatAction.INSTANCE, isbnFormatRequest, 39 | new RestStatusToXContentListener<>(channel)); 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/main/java/org/xbib/elasticsearch/plugin/bundle/rest/action/isbnformat/package-info.java: -------------------------------------------------------------------------------- 1 | /** 2 | * Rest action for ISBN format. 
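 * <p>Usage sketch: the handler above registers the routes /_isbn and /_isbn/{value}, so a formatted ISBN can be requested with, for example, GET /_isbn/9783161484100; the response shape is defined by ISBNFormatResponse and is not shown here.</p>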
3 | */ 4 | package org.xbib.elasticsearch.plugin.bundle.rest.action.isbnformat; 5 | -------------------------------------------------------------------------------- /src/main/plugin-metadata/plugin-security.policy: -------------------------------------------------------------------------------- 1 | grant codeBase "${codebase.elasticsearch-plugin-bundle}" { 2 | permission java.io.FilePermission "*", "read"; 3 | permission java.lang.RuntimePermission "accessDeclaredMembers"; 4 | permission java.lang.RuntimePermission "getClassLoader"; 5 | permission java.lang.reflect.ReflectPermission "suppressAccessChecks"; 6 | }; 7 | 8 | grant codeBase "${codebase.icu4j}" { 9 | permission java.io.FilePermission "*", "read"; 10 | permission java.lang.RuntimePermission "accessDeclaredMembers"; 11 | permission java.lang.RuntimePermission "getClassLoader"; 12 | permission java.lang.reflect.ReflectPermission "suppressAccessChecks"; 13 | }; 14 | -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/icu/KeywordTokenizer.brk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/src/main/resources/org/xbib/elasticsearch/plugin/bundle/icu/KeywordTokenizer.brk -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/icu/Latin-break-only-on-whitespace.brk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/src/main/resources/org/xbib/elasticsearch/plugin/bundle/icu/Latin-break-only-on-whitespace.brk -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/icu/Latin-dont-break-on-hyphens.brk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/src/main/resources/org/xbib/elasticsearch/plugin/bundle/icu/Latin-dont-break-on-hyphens.brk -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/icu/folding/DingbatFolding.txt: -------------------------------------------------------------------------------- 1 | # Copyright 2001-2010 Unicode, Inc. 2 | # 3 | # Disclaimer 4 | # 5 | # This source code is provided as is by Unicode, Inc. No claims are 6 | # made as to fitness for any particular purpose. No warranties of any 7 | # kind are expressed or implied. The recipient agrees to determine 8 | # applicability of information provided. If this file has been 9 | # purchased on magnetic or optical media from Unicode, Inc., the 10 | # sole remedy for any claim will be exchange of defective media 11 | # within 90 days of receipt. 12 | # 13 | # Limitations on Rights to Redistribute This Code 14 | # 15 | # Unicode, Inc. hereby grants the right to freely use the information 16 | # supplied in this file in the creation of products supporting the 17 | # Unicode Standard, and to make copies of this file in any form 18 | # for internal or external distribution as long as this notice 19 | # remains attached. 
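# Each non-comment line below is a mapping "SOURCE>REPLACEMENT" in hex code points: for example, "24EB>0031 0031" folds U+24EB (NEGATIVE CIRCLED NUMBER ELEVEN) to the ASCII digits "11".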
20 | 21 | ### Custom Normalization mappings for UTR#30 22 | ### (http://www.unicode.org/reports/tr30/tr30-4.html) 23 | ### 24 | ### Created from Unicode 5.2 UCD 25 | ### 26 | 27 | #### WARNING #### 28 | #### Rule: lines direct content generation. 29 | #### All non-comments will be REMOVED when this file's contents 30 | #### are generated by 'ant gen-utr30-data-files'. 31 | #### Use "# Rule: verbatim" to keep non-comments up until 32 | #### the next "# Rule:" line. 33 | #### WARNING #### 34 | 35 | # Folds dingbats and other adorned forms 36 | # Generated from ASCIIFoldingFilter 37 | # Rule: verbatim 38 | 24EB>0031 0031 39 | 24EC>0031 0032 40 | 24ED>0031 0033 41 | 24EE>0031 0034 42 | 24EF>0031 0035 43 | 24F0>0031 0036 44 | 24F1>0031 0037 45 | 24F2>0031 0038 46 | 24F3>0031 0039 47 | 24F4>0032 0030 48 | 24F5>0031 49 | 24F6>0032 50 | 24F7>0033 51 | 24F8>0034 52 | 24F9>0035 53 | 24FA>0036 54 | 24FB>0037 55 | 24FC>0038 56 | 24FD>0039 57 | 24FE>0031 0030 58 | 24FF>0030 59 | 275B>0027 60 | 275C>0027 61 | 275D>0022 62 | 275E>0022 63 | 2768>0028 64 | 2769>0029 65 | 276A>0028 66 | 276B>0029 67 | 276C>003C 68 | 276D>003E 69 | 276E>0022 70 | 276F>0022 71 | 2770>003C 72 | 2771>003E 73 | 2772>005B 74 | 2773>005D 75 | 2774>007B 76 | 2775>007D 77 | 2776>0031 78 | 2777>0032 79 | 2778>0033 80 | 2779>0034 81 | 277A>0035 82 | 277B>0036 83 | 277C>0037 84 | 277D>0038 85 | 277E>0039 86 | 277F>0031 0030 87 | 2780>0031 88 | 2781>0032 89 | 2782>0033 90 | 2783>0034 91 | 2784>0035 92 | 2785>0036 93 | 2786>0037 94 | 2787>0038 95 | 2788>0039 96 | 2789>0031 0030 97 | 278A>0031 98 | 278B>0032 99 | 278C>0033 100 | 278D>0034 101 | 278E>0035 102 | 278F>0036 103 | 2790>0037 104 | 2791>0038 105 | 2792>0039 106 | 2793>0031 0030 107 | -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/decompound/fst/words.fst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/decompound/fst/words.fst -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/decompound/patricia/grfExt.tree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/decompound/patricia/grfExt.tree -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/decompound/patricia/kompVHic.tree: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/decompound/patricia/kompVHic.tree -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/decompound/patricia/kompVVic.tree: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/decompound/patricia/kompVVic.tree -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/segmentation/Default.brk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/segmentation/Default.brk -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/segmentation/KeywordTokenizer.rbbi: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # RBBI Keyword tokenizer: keep everything as a single token. 18 | 19 | # Apply rule status {200}=RBBI.WORD_LETTER, which is mapped 20 | # to <ALPHANUM> token type by DefaultICUTokenizerConfig. 21 | .+ {200}; -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/segmentation/Latin-break-only-on-whitespace.rbbi: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License.
16 | # 17 | # Break only on whitespace; assign token type from set { <ALPHANUM>, <NUM>, <OTHER> } 18 | # 19 | 20 | !!forward; 21 | 22 | $Whitespace = [\p{Whitespace}]; 23 | $NonWhitespace = [\P{Whitespace}]; 24 | $Letter = [\p{Letter}]; 25 | $Number = [\p{Number}]; 26 | 27 | # Default rule status is {0}=RBBI.WORD_NONE => not tokenized by ICUTokenizer 28 | $Whitespace; 29 | 30 | # Assign rule status {200}=RBBI.WORD_LETTER when the token contains a letter char 31 | # Mapped to <ALPHANUM> token type by DefaultICUTokenizerConfig 32 | $NonWhitespace* $Letter $NonWhitespace* {200}; 33 | 34 | # Assign rule status {100}=RBBI.WORD_NUM when the token contains a numeric char 35 | # Mapped to <NUM> token type by DefaultICUTokenizerConfig 36 | $NonWhitespace* $Number $NonWhitespace* {100}; 37 | 38 | # Assign rule status {1} (no RBBI equivalent) when the token contains neither a letter nor a numeric char 39 | # Mapped to <OTHER> token type by DefaultICUTokenizerConfig 40 | $NonWhitespace+ {1}; 41 | -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/segmentation/MyanmarSyllable.brk: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/segmentation/MyanmarSyllable.brk -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/segmentation/MyanmarSyllable.rbbi: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # 18 | # Parses Myanmar text, with syllable as token. 19 | # 20 | 21 | $Cons = [[:Other_Letter:]&[:Myanmar:]]; 22 | $Virama = [\u1039]; 23 | $Asat = [\u103A]; 24 | 25 | $WordJoin = [:Line_Break=Word_Joiner:]; 26 | 27 | # 28 | # default numerical definitions 29 | # 30 | $Extend = [\p{Word_Break = Extend}]; 31 | $Format = [\p{Word_Break = Format}]; 32 | $MidNumLet = [\p{Word_Break = MidNumLet}]; 33 | $MidNum = [\p{Word_Break = MidNum}]; 34 | $Numeric = [\p{Word_Break = Numeric}]; 35 | $ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; 36 | $MidNumLetEx = $MidNumLet ($Extend | $Format)*; 37 | $MidNumEx = $MidNum ($Extend | $Format)*; 38 | $NumericEx = $Numeric ($Extend | $Format)*; 39 | $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; 40 | 41 | $ConsEx = $Cons ($Extend | $Format)*; 42 | $AsatEx = $Cons $Asat ($Virama $ConsEx)? ($Extend | $Format)*; 43 | $MyanmarSyllableEx = $ConsEx ($Virama $ConsEx)?
($AsatEx)*; 44 | $MyanmarJoinedSyllableEx = $MyanmarSyllableEx ($WordJoin $MyanmarSyllableEx)*; 45 | 46 | !!forward; 47 | $MyanmarJoinedSyllableEx {200}; 48 | 49 | # default numeric rules 50 | $NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100}; -------------------------------------------------------------------------------- /src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/utr30.nrm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jprante/elasticsearch-plugin-bundle/164387f1356162acaec5642126e6d64098ef19f3/src/main/resources/org/xbib/elasticsearch/plugin/bundle/index/analysis/icu/utr30.nrm -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/MultiMap.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test; 2 | 3 | import java.util.Collection; 4 | import java.util.Set; 5 | 6 | public interface MultiMap<K, V> { 7 | 8 | void clear(); 9 | 10 | int size(); 11 | 12 | boolean isEmpty(); 13 | 14 | boolean containsKey(K key); 15 | 16 | Collection<V> get(K key); 17 | 18 | Set<K> keySet(); 19 | 20 | Collection<? extends Collection<V>> values(); 21 | 22 | Collection<V> put(K key, V value); 23 | 24 | Collection<V> remove(K key); 25 | 26 | Collection<V> remove(K key, V value); 27 | 28 | void putAll(K key, Collection<V> values); 29 | 30 | } 31 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/TreeMultiMap.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test; 2 | 3 | import java.util.Collection; 4 | import java.util.LinkedHashSet; 5 | import java.util.Map; 6 | import java.util.Set; 7 | import java.util.TreeMap; 8 | import java.util.TreeSet; 9 | 10 | public class TreeMultiMap<K, V> implements MultiMap<K, V> { 11 | 12 | private final Map<K, Set<V>> map = new TreeMap<>(); 13 | 14 | @Override 15 | public int size() { 16 | return map.size(); 17 | } 18 | 19 | @Override 20 | public void clear() { 21 | map.clear(); 22 | } 23 | 24 | @Override 25 | public boolean isEmpty() { 26 | return map.isEmpty(); 27 | } 28 | 29 | @Override 30 | public boolean containsKey(K key) { 31 | return map.containsKey(key); 32 | } 33 | 34 | @Override 35 | public Set<K> keySet() { 36 | return map.keySet(); 37 | } 38 | 39 | @Override 40 | public Collection<Set<V>> values() { 41 | return map.values(); 42 | } 43 | 44 | @Override 45 | public Collection<V> put(K key, V value) { 46 | Set<V> set = map.get(key); 47 | if (set == null) { 48 | set = new TreeSet<>(); 49 | } 50 | set.add(value); 51 | return map.put(key, set); 52 | } 53 | 54 | @Override 55 | public void putAll(K key, Collection<V> values) { 56 | Set<V> set = map.get(key); 57 | if (set == null) { 58 | set = new LinkedHashSet<>(); 59 | map.put(key, set); 60 | } 61 | set.addAll(values); 62 | } 63 | 64 | @Override 65 | public Collection<V> get(K key) { 66 | return map.get(key); 67 | } 68 | 69 | @Override 70 | public Set<V> remove(K key) { 71 | return map.remove(key); 72 | } 73 | 74 | @Override 75 | public Set<V> remove(K key, V value) { 76 | Set<V> set = map.get(key); 77 | if (set != null) { 78 | set.remove(value); 79 | } 80 | return set; 81 | } 82 | 83 | @Override 84 | public boolean equals(Object obj) { 85 | return obj instanceof TreeMultiMap && map.equals(((TreeMultiMap<?, ?>) obj).map); 86 | } 87 | 88 |
@Override 89 | public int hashCode() { 90 | return map.hashCode(); 91 | } 92 | 93 | @Override 94 | public String toString() { 95 | return map.toString(); 96 | } 97 | } 98 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/common/decompound/patricia/LFUCacheTest.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.common.decompound.patricia; 2 | 3 | import com.carrotsearch.randomizedtesting.annotations.SuppressForbidden; 4 | import org.junit.Test; 5 | import org.xbib.elasticsearch.plugin.bundle.common.decompound.patricia.LFUCache; 6 | 7 | import static org.junit.Assert.assertEquals; 8 | 9 | public class LFUCacheTest { 10 | 11 | @SuppressForbidden(value = "execute this to test LFU cache") 12 | @Test 13 | public void testCache() { 14 | LFUCache<Integer, Integer> cache = new LFUCache<>(100, 0.90f); 15 | for (int i = 0; i < 500; i++) { 16 | cache.computeIfAbsent(i, f -> f % 2); 17 | } 18 | assertEquals(50, cache.size()); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/baseform/DictionaryTest.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.analysis.baseform; 2 | 3 | import org.apache.lucene.util.SuppressForbidden; 4 | import org.xbib.elasticsearch.plugin.bundle.common.fsa.Dictionary; 5 | 6 | import java.io.BufferedReader; 7 | import java.io.IOException; 8 | import java.io.InputStreamReader; 9 | import java.nio.charset.CharacterCodingException; 10 | import java.nio.charset.StandardCharsets; 11 | 12 | /** 13 | * Dictionary tests. 14 | */ 15 | public class DictionaryTest { 16 | 17 | @SuppressForbidden(reason = "accessing local resources from classpath") 18 | public void testVerifyDE() throws IOException { 19 | Dictionary dictionary = new Dictionary(); 20 | InputStreamReader reader = new InputStreamReader(getClass().getResourceAsStream("de-lemma-utf8.txt"), 21 | StandardCharsets.UTF_8); 22 | dictionary.loadLines(reader); 23 | reader.close(); 24 | BufferedReader br = new BufferedReader(new InputStreamReader(getClass().getResourceAsStream("de-lemma-utf8.txt"), 25 | StandardCharsets.UTF_8)); 26 | String line; 27 | while ((line = br.readLine()) != null) { 28 | if (!line.startsWith("#")) { 29 | if (!check(line, dictionary)) { 30 | break; 31 | } 32 | } 33 | } 34 | br.close(); 35 | } 36 | 37 | private boolean check(String line, Dictionary dictionary) throws CharacterCodingException { 38 | int pos = line.indexOf("\t"); 39 | String word = pos > 0 ?
line.substring(0, pos) : line; 40 | try { 41 | CharSequence baseform = dictionary.lookup(word); 42 | } catch (StackOverflowError e) { 43 | // if stack overflow error occurs, we have faulty entries 44 | return false; 45 | } 46 | return true; 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/concat/ConcatTokenFilterTests.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.analysis.concat; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.elasticsearch.common.settings.Settings; 5 | import org.elasticsearch.index.Index; 6 | import org.elasticsearch.test.ESTestCase; 7 | import org.elasticsearch.test.ESTokenStreamTestCase; 8 | import org.xbib.elasticsearch.plugin.bundle.BundlePlugin; 9 | 10 | /** 11 | * Concat token filter tests. 12 | */ 13 | public class ConcatTokenFilterTests extends ESTokenStreamTestCase { 14 | 15 | public void testConcat() throws Exception { 16 | String source = "Das ist ein Schlüsselwort, ein Bindestrichwort"; 17 | String[] expected = { 18 | "Das ist ein Schlüsselwort ein Bindestrichwort" 19 | }; 20 | String resource = "concat_analysis.json"; 21 | Settings settings = Settings.builder() 22 | .loadFromStream(resource, getClass().getResourceAsStream(resource), true) 23 | .build(); 24 | ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"), 25 | settings, 26 | new BundlePlugin(Settings.EMPTY)); 27 | Analyzer analyzer = analysis.indexAnalyzers.get("concat"); 28 | assertNotNull(analyzer); 29 | assertTokenStreamContents(analyzer.tokenStream("test-field", source), expected); 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/decompound/fst/FstDecompoundTokenFilterTests.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.analysis.decompound.fst; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.elasticsearch.analysis.common.CommonAnalysisPlugin; 5 | import org.elasticsearch.common.settings.Settings; 6 | import org.elasticsearch.index.Index; 7 | import org.elasticsearch.test.ESTestCase; 8 | import org.elasticsearch.test.ESTokenStreamTestCase; 9 | import org.xbib.elasticsearch.plugin.bundle.BundlePlugin; 10 | 11 | /** 12 | * Finite state transducer decompound token filter tests. 
13 | */ 14 | public class FstDecompoundTokenFilterTests extends ESTokenStreamTestCase { 15 | 16 | public void testDecompound() throws Exception { 17 | 18 | String source = "Die Jahresfeier der Rechtsanwaltskanzleien auf dem Donaudampfschiff hat viel Ökosteuer gekostet"; 19 | 20 | String[] expected = { 21 | "Die", 22 | "Jahresfeier", 23 | "jahres", 24 | "feier", 25 | "der", 26 | "Rechtsanwaltskanzleien", 27 | "rechts", 28 | "anwalts", 29 | "kanzleien", 30 | "auf", 31 | "dem", 32 | "Donaudampfschiff", 33 | "donau", 34 | "dampf", 35 | "schiff", 36 | "hat", 37 | "viel", 38 | "Ökosteuer", 39 | "ökos", 40 | "teuer", 41 | "gekostet" 42 | }; 43 | 44 | Settings settings = Settings.builder() 45 | .put("index.analysis.analyzer.myanalyzer.type", "custom") 46 | .put("index.analysis.analyzer.myanalyzer.tokenizer", "standard") 47 | .put("index.analysis.analyzer.myanalyzer.filter.0", "fst_decompound") 48 | .put("index.analysis.analyzer.myanalyzer.filter.1", "unique") 49 | .build(); 50 | ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"), 51 | settings, 52 | new BundlePlugin(Settings.EMPTY), new CommonAnalysisPlugin()); 53 | Analyzer myanalyzer = analysis.indexAnalyzers.get("myanalyzer"); 54 | assertAnalyzesTo(myanalyzer, source, expected); 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/german/GermanNormalizationTests.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.analysis.german; 2 | 3 | import org.apache.lucene.analysis.Tokenizer; 4 | import org.elasticsearch.Version; 5 | import org.elasticsearch.cluster.metadata.IndexMetaData; 6 | import org.elasticsearch.common.settings.Settings; 7 | import org.elasticsearch.index.Index; 8 | import org.elasticsearch.index.analysis.TokenFilterFactory; 9 | import org.elasticsearch.test.ESTestCase; 10 | import org.elasticsearch.test.ESTokenStreamTestCase; 11 | import org.xbib.elasticsearch.plugin.bundle.BundlePlugin; 12 | 13 | import java.io.IOException; 14 | import java.io.StringReader; 15 | 16 | /** 17 | * German normalization tests. 
18 | */ 19 | public class GermanNormalizationTests extends ESTokenStreamTestCase { 20 | 21 | public void testGerman1() throws IOException { 22 | 23 | String source = "Ein schöner Tag in Köln im Café an der Straßenecke"; 24 | 25 | String[] expected = { 26 | "Ein", 27 | "schoner", 28 | "Tag", 29 | "in", 30 | "Koln", 31 | "im", 32 | "Café", 33 | "an", 34 | "der", 35 | "Strassenecke" 36 | }; 37 | String resource = "german_normalization_analysis.json"; 38 | Settings settings = Settings.builder() 39 | .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT) 40 | .put("path.home", System.getProperty("path.home")) 41 | .loadFromStream(resource, getClass().getResourceAsStream(resource), true) 42 | .build(); 43 | ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"), 44 | settings, 45 | new BundlePlugin(Settings.EMPTY)); 46 | 47 | TokenFilterFactory tokenFilter = analysis.tokenFilter.get("umlaut"); 48 | Tokenizer tokenizer = analysis.tokenizer.get("standard").create(); 49 | tokenizer.setReader(new StringReader(source)); 50 | assertTokenStreamContents(tokenFilter.create(tokenizer), expected); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/IcuAnalysisTests.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.analysis.icu; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.elasticsearch.common.settings.Settings; 5 | import org.elasticsearch.index.Index; 6 | import org.elasticsearch.index.analysis.CharFilterFactory; 7 | import org.elasticsearch.index.analysis.NamedAnalyzer; 8 | import org.elasticsearch.index.analysis.TokenFilterFactory; 9 | import org.elasticsearch.index.analysis.TokenizerFactory; 10 | import org.elasticsearch.test.ESTestCase; 11 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.segmentation.IcuTokenizerFactory; 12 | import org.xbib.elasticsearch.plugin.bundle.BundlePlugin; 13 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.IcuFoldingTokenFilterFactory; 14 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.IcuNormalizerCharFilterFactory; 15 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.IcuNormalizerTokenFilterFactory; 16 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.IcuTransformTokenFilterFactory; 17 | 18 | import java.io.IOException; 19 | 20 | import static org.hamcrest.CoreMatchers.instanceOf; 21 | 22 | /** 23 | * ICU analysis tests 24 | */ 25 | public class IcuAnalysisTests extends ESTestCase { 26 | 27 | public void testDefaultsIcuAnalysis() throws IOException { 28 | 29 | TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), Settings.EMPTY, 30 | new BundlePlugin(Settings.EMPTY)); 31 | 32 | CharFilterFactory charFilterFactory = analysis.charFilter.get("icu_normalizer"); 33 | assertThat(charFilterFactory, instanceOf(IcuNormalizerCharFilterFactory.class)); 34 | 35 | TokenizerFactory tf = analysis.tokenizer.get("icu_tokenizer"); 36 | assertThat(tf, instanceOf(IcuTokenizerFactory.class)); 37 | 38 | TokenFilterFactory filterFactory = analysis.tokenFilter.get("icu_normalizer"); 39 | assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class)); 40 | 41 | filterFactory = analysis.tokenFilter.get("icu_folding"); 42 | assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class)); 43 | 44 | 
filterFactory = analysis.tokenFilter.get("icu_transform"); 45 | assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class)); 46 | 47 | Analyzer analyzer = analysis.indexAnalyzers.get("icu_collation"); 48 | assertThat(analyzer, instanceOf(NamedAnalyzer.class)); 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/IcuClientYamlTestSuiteIT.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.analysis.icu; 2 | 3 | import com.carrotsearch.randomizedtesting.annotations.Name; 4 | import com.carrotsearch.randomizedtesting.annotations.ParametersFactory; 5 | 6 | import org.elasticsearch.test.rest.yaml.ClientYamlTestCandidate; 7 | import org.elasticsearch.test.rest.yaml.ESClientYamlSuiteTestCase; 8 | 9 | public class IcuClientYamlTestSuiteIT extends ESClientYamlSuiteTestCase { 10 | 11 | public IcuClientYamlTestSuiteIT(@Name("yaml") ClientYamlTestCandidate testCandidate) { 12 | super(testCandidate); 13 | } 14 | 15 | @ParametersFactory 16 | public static Iterable<Object[]> parameters() throws Exception { 17 | return ESClientYamlSuiteTestCase.createParameters(); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/IcuNormalizerFilterTests.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.analysis.icu; 2 | 3 | import com.ibm.icu.text.Normalizer2; 4 | import org.apache.lucene.analysis.Analyzer; 5 | import org.apache.lucene.analysis.MockTokenizer; 6 | import org.apache.lucene.analysis.Tokenizer; 7 | import org.apache.lucene.analysis.core.KeywordTokenizer; 8 | import org.elasticsearch.test.ESTokenStreamTestCase; 9 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.IcuNormalizerFilter; 10 | 11 | /** 12 | * ICU normalizer filter tests.
13 | */ 14 | public class IcuNormalizerFilterTests extends ESTokenStreamTestCase { 15 | 16 | public void testDefaults() throws Exception { 17 | Analyzer a = new Analyzer() { 18 | @Override 19 | public TokenStreamComponents createComponents(String fieldName) { 20 | Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); 21 | return new TokenStreamComponents(tokenizer, 22 | new IcuNormalizerFilter(tokenizer, 23 | Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE))); 24 | } 25 | }; 26 | assertAnalyzesTo(a, "This is a test", new String[] { "this", "is", "a", "test" }); 27 | assertAnalyzesTo(a, "Ruß", new String[] { "russ" }); 28 | assertAnalyzesTo(a, "ΜΆΪΟΣ", new String[] { "μάϊοσ" }); 29 | assertAnalyzesTo(a, "Μάϊος", new String[] { "μάϊοσ" }); 30 | assertAnalyzesTo(a, "𐐖", new String[] { "𐐾" }); 31 | assertAnalyzesTo(a, "ﴳﴺﰧ", new String[] { "طمطمطم" }); 32 | assertAnalyzesTo(a, "क्‍ष", new String[] { "क्ष" }); 33 | a.close(); 34 | } 35 | 36 | public void testAlternate() throws Exception { 37 | Analyzer a = new Analyzer() { 38 | @Override 39 | public TokenStreamComponents createComponents(String fieldName) { 40 | Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false); 41 | return new TokenStreamComponents(tokenizer, new IcuNormalizerFilter( 42 | tokenizer, 43 | Normalizer2.getInstance(null, "nfc", Normalizer2.Mode.DECOMPOSE))); 44 | } 45 | }; 46 | assertAnalyzesTo(a, "\u00E9", new String[] { "\u0065\u0301" }); 47 | a.close(); 48 | } 49 | 50 | public void testEmptyTerm() throws Exception { 51 | Analyzer a = new Analyzer() { 52 | @Override 53 | protected TokenStreamComponents createComponents(String fieldName) { 54 | Tokenizer tokenizer = new KeywordTokenizer(); 55 | return new TokenStreamComponents(tokenizer, 56 | new IcuNormalizerFilter(tokenizer, 57 | Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE))); 58 | } 59 | }; 60 | checkOneTerm(a, "", ""); 61 | a.close(); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/segmentation/CharArrayIteratorTests.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.analysis.icu.segmentation; 2 | 3 | import org.elasticsearch.test.ESTestCase; 4 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.segmentation.CharArrayIterator; 5 | 6 | import java.text.CharacterIterator; 7 | 8 | /** 9 | * Char array iterator tests. 
10 | */ 11 | public class CharArrayIteratorTests extends ESTestCase { 12 | 13 | public void testBasicUsage() { 14 | CharArrayIterator ci = new CharArrayIterator(); 15 | ci.setText("testing".toCharArray(), 0, "testing".length()); 16 | assertEquals(0, ci.getBeginIndex()); 17 | assertEquals(7, ci.getEndIndex()); 18 | assertEquals(0, ci.getIndex()); 19 | assertEquals('t', ci.current()); 20 | assertEquals('e', ci.next()); 21 | assertEquals('g', ci.last()); 22 | assertEquals('n', ci.previous()); 23 | assertEquals('t', ci.first()); 24 | assertEquals(CharacterIterator.DONE, ci.previous()); 25 | } 26 | 27 | public void testFirst() { 28 | CharArrayIterator ci = new CharArrayIterator(); 29 | ci.setText("testing".toCharArray(), 0, "testing".length()); 30 | ci.next(); 31 | assertEquals('t', ci.first()); 32 | assertEquals(ci.getBeginIndex(), ci.getIndex()); 33 | ci.setText(new char[] {}, 0, 0); 34 | assertEquals(CharacterIterator.DONE, ci.first()); 35 | } 36 | 37 | public void testLast() { 38 | CharArrayIterator ci = new CharArrayIterator(); 39 | ci.setText("testing".toCharArray(), 0, "testing".length()); 40 | assertEquals('g', ci.last()); 41 | assertEquals(ci.getIndex(), ci.getEndIndex() - 1); 42 | ci.setText(new char[] {}, 0, 0); 43 | assertEquals(CharacterIterator.DONE, ci.last()); 44 | assertEquals(ci.getEndIndex(), ci.getIndex()); 45 | } 46 | 47 | public void testCurrent() { 48 | CharArrayIterator ci = new CharArrayIterator(); 49 | ci.setText("testing".toCharArray(), 0, "testing".length()); 50 | assertEquals('t', ci.current()); 51 | ci.last(); 52 | ci.next(); 53 | assertEquals(CharacterIterator.DONE, ci.current()); 54 | } 55 | 56 | public void testNext() { 57 | CharArrayIterator ci = new CharArrayIterator(); 58 | ci.setText("te".toCharArray(), 0, 2); 59 | assertEquals('e', ci.next()); 60 | assertEquals(1, ci.getIndex()); 61 | assertEquals(CharacterIterator.DONE, ci.next()); 62 | assertEquals(ci.getEndIndex(), ci.getIndex()); 63 | } 64 | 65 | /*public void testSetIndex() { 66 | CharArrayIterator ci = new CharArrayIterator(); 67 | ci.setText("test".toCharArray(), 0, "test".length()); 68 | ci.setIndex(5); 69 | }*/ 70 | 71 | public void testClone() { 72 | char text[] = "testing".toCharArray(); 73 | CharArrayIterator ci = new CharArrayIterator(); 74 | ci.setText(text, 0, text.length); 75 | ci.next(); 76 | CharArrayIterator ci2 = ci.clone(); 77 | assertEquals(ci.getIndex(), ci2.getIndex()); 78 | assertEquals(ci.next(), ci2.next()); 79 | assertEquals(ci.last(), ci2.last()); 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/segmentation/IcuTokenizerCJKTests.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.analysis.icu.segmentation; 2 | 3 | import org.apache.lucene.analysis.Analyzer; 4 | import org.apache.lucene.util.AttributeFactory; 5 | import org.elasticsearch.test.ESTokenStreamTestCase; 6 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.segmentation.DefaultIcuTokenizerConfig; 7 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.segmentation.IcuTokenizer; 8 | 9 | /** 10 | * ICU tokenizer CJK tests. 
11 | */ 12 | public class IcuTokenizerCJKTests extends ESTokenStreamTestCase { 13 | 14 | private static Analyzer create() { 15 | return new Analyzer() { 16 | @Override 17 | protected TokenStreamComponents createComponents(String fieldName) { 18 | return new TokenStreamComponents(new IcuTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY, 19 | new DefaultIcuTokenizerConfig(true, true))); 20 | } 21 | }; 22 | } 23 | 24 | public static void destroyAnalyzer(Analyzer a) { 25 | a.close(); 26 | } 27 | 28 | public void testSimpleChinese() throws Exception { 29 | Analyzer a = create(); 30 | assertAnalyzesTo(a, "我购买了道具和服装。", 31 | new String[] { "我", "购买", "了", "道具", "和", "服装" } 32 | ); 33 | destroyAnalyzer(a); 34 | } 35 | 36 | public void testChineseNumerics() throws Exception { 37 | Analyzer a = create(); 38 | assertAnalyzesTo(a, "9483", new String[] { "9483" }); 39 | assertAnalyzesTo(a, "院內分機9483。", 40 | new String[] { "院", "內", "分機", "9483" }); 41 | assertAnalyzesTo(a, "院內分機9483。", 42 | new String[] { "院", "內", "分機", "9483" }); 43 | destroyAnalyzer(a); 44 | } 45 | 46 | public void testSimpleJapanese() throws Exception { 47 | Analyzer a = create(); 48 | assertAnalyzesTo(a, "それはまだ実験段階にあります", 49 | new String[] { "それ", "は", "まだ", "実験", "段階", "に", "あり", "ます" } 50 | ); 51 | destroyAnalyzer(a); 52 | } 53 | 54 | public void testJapaneseTypes() throws Exception { 55 | Analyzer a = create(); 56 | assertAnalyzesTo(a, "仮名遣い カタカナ", 57 | new String[] { "仮名遣い", "カタカナ" }, 58 | new String[] { "<IDEOGRAPHIC>", "<KATAKANA>" }); 59 | destroyAnalyzer(a); 60 | } 61 | 62 | public void testKorean() throws Exception { 63 | Analyzer a = create(); 64 | // Korean words 65 | assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"}); 66 | destroyAnalyzer(a); 67 | } 68 | 69 | public void testKoreanTypes() throws Exception { 70 | Analyzer a = create(); 71 | assertAnalyzesTo(a, "훈민정음", new String[] { "훈민정음" }, new String[] { "<HANGUL>" }); 72 | destroyAnalyzer(a); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/tools/RBBIRuleCompilerTest.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.analysis.icu.tools; 2 | 3 | import com.carrotsearch.randomizedtesting.annotations.SuppressForbidden; 4 | import org.junit.Test; 5 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.tools.RBBIRuleCompiler; 6 | 7 | import java.io.IOException; 8 | import java.io.InputStream; 9 | import java.io.OutputStream; 10 | import java.nio.file.Files; 11 | import java.nio.file.Paths; 12 | 13 | /** 14 | * RBBI rule compiler test. 
15 | */ 16 | public class RBBIRuleCompilerTest { 17 | 18 | @SuppressForbidden(value = "execute this test to create brk files") 19 | @Test 20 | public void testRBBICompile() throws IOException { 21 | RBBIRuleCompiler rbbiRuleCompiler = new RBBIRuleCompiler(); 22 | String[] rbbis = { 23 | "/icu/Default.rbbi", 24 | "/icu/KeywordTokenizer.rbbi", 25 | "/icu/Latin-break-only-on-whitespace.rbbi", 26 | "/icu/Latin-dont-break-on-hyphens.rbbi", 27 | "/icu/MyanmarSyllable.rbbi" 28 | }; 29 | for (String rbbi : rbbis) { 30 | InputStream inputStream = getClass().getResourceAsStream(rbbi); 31 | int pos1 = rbbi.lastIndexOf("/"); 32 | int pos2 = rbbi.lastIndexOf(".rbbi"); 33 | String basename = rbbi.substring(pos1, pos2); 34 | System.err.println(basename); 35 | OutputStream outputStream = Files.newOutputStream(Paths.get("build" + basename + ".brk")); 36 | rbbiRuleCompiler.compile(inputStream, outputStream); 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/tools/UTR30DataFileGeneratorTest.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.analysis.icu.tools; 2 | 3 | import com.carrotsearch.randomizedtesting.annotations.SuppressForbidden; 4 | import org.junit.Test; 5 | import org.xbib.elasticsearch.plugin.bundle.index.analysis.icu.tools.UTR30DataFileGenerator; 6 | 7 | /** 8 | * UTR-30 data file generator test. 9 | */ 10 | public class UTR30DataFileGeneratorTest { 11 | 12 | @SuppressForbidden(value = "execute this test to download utr30 files") 13 | @Test 14 | public void generateUTR30() throws Exception { 15 | UTR30DataFileGenerator utr30DataFileGenerator = new UTR30DataFileGenerator(); 16 | utr30DataFileGenerator.execute("release-62-1", "build/"); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/DetectLanguageTests.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.mapper.langdetect; 2 | 3 | import org.elasticsearch.common.io.Streams; 4 | import org.elasticsearch.test.ESTestCase; 5 | import org.xbib.elasticsearch.plugin.bundle.common.langdetect.LangdetectService; 6 | 7 | import java.io.InputStreamReader; 8 | import java.io.Reader; 9 | import java.io.StringWriter; 10 | import java.io.Writer; 11 | import java.nio.charset.StandardCharsets; 12 | 13 | public class DetectLanguageTests extends ESTestCase { 14 | 15 | public void testEnglish() throws Exception { 16 | testLanguage("english.txt", "en"); 17 | } 18 | 19 | public void testChinese() throws Exception { 20 | testLanguage("chinese.txt", "zh-cn"); 21 | } 22 | 23 | public void testJapanese() throws Exception { 24 | testLanguage("japanese.txt", "ja"); 25 | } 26 | 27 | public void testKorean() throws Exception { 28 | testLanguage("korean.txt", "ko"); 29 | } 30 | 31 | private void testLanguage(String path, String lang) throws Exception { 32 | Reader reader = new InputStreamReader(getClass().getResourceAsStream(path), StandardCharsets.UTF_8); 33 | Writer writer = new StringWriter(); 34 | Streams.copy(reader, writer); 35 | reader.close(); 36 | writer.close(); 37 | LangdetectService detect = new LangdetectService(); 38 | assertEquals(lang, detect.detectAll(writer.toString()).get(0).getLanguage()); 39 | } 40 | 41 | }
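
The langdetect tests above and below exercise LangdetectService directly. A minimal standalone sketch of that API, based only on the calls the tests make (no-arg constructor, detectAll(), Language.getLanguage() and getProbability()); the class name LangdetectExample is hypothetical and not part of the repository:

import org.xbib.elasticsearch.plugin.bundle.common.langdetect.LangdetectService;
import org.xbib.elasticsearch.plugin.bundle.common.langdetect.Language;

public class LangdetectExample { // hypothetical example class, not in this repository
    public static void main(String[] args) throws Exception {
        // The no-arg constructor loads the bundled default language profiles,
        // as in DetectLanguageTests above and SimpleDetectorTests below.
        LangdetectService service = new LangdetectService();
        // detectAll() returns candidate languages; the tests read the best match at index 0.
        for (Language language : service.detectAll("This is a very small test")) {
            System.out.println(language.getLanguage() + " " + language.getProbability());
        }
    }
}
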
42 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/DetectorTests.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.mapper.langdetect; 2 | 3 | import org.elasticsearch.common.settings.Settings; 4 | import org.elasticsearch.test.ESTestCase; 5 | 6 | import org.xbib.elasticsearch.plugin.bundle.common.langdetect.LangProfile; 7 | import org.xbib.elasticsearch.plugin.bundle.common.langdetect.LangdetectService; 8 | 9 | /** 10 | * Detector test. 11 | */ 12 | public class DetectorTests extends ESTestCase { 13 | 14 | private static final String TRAINING_EN = "a a a b b c c d e"; 15 | 16 | private static final String TRAINING_FR = "a b b c c c d d d"; 17 | 18 | private static final String TRAINING_JA = "\u3042 \u3042 \u3042 \u3044 \u3046 \u3048 \u3048"; 19 | 20 | public static LangdetectService create() throws Exception { 21 | LangdetectService detect = new LangdetectService(Settings.EMPTY); 22 | LangProfile profile_en = new LangProfile(); 23 | profile_en.setName("en_test"); 24 | for (String w : TRAINING_EN.split(" ")) { 25 | profile_en.add(w); 26 | } 27 | detect.addProfile(profile_en, 0, 3); 28 | LangProfile profile_fr = new LangProfile(); 29 | profile_fr.setName("fr_test"); 30 | for (String w : TRAINING_FR.split(" ")) { 31 | profile_fr.add(w); 32 | } 33 | detect.addProfile(profile_fr, 1, 3); 34 | LangProfile profile_ja = new LangProfile(); 35 | profile_ja.setName("ja_test"); 36 | for (String w : TRAINING_JA.split(" ")) { 37 | profile_ja.add(w); 38 | } 39 | detect.addProfile(profile_ja, 2, 3); 40 | return detect; 41 | } 42 | 43 | public void testDetector1() throws Exception { 44 | LangdetectService detect = create(); 45 | assertEquals(detect.detectAll("a").get(0).getLanguage(), "en_test"); 46 | } 47 | 48 | public void testDetector2() throws Exception { 49 | LangdetectService detect = create(); 50 | assertEquals(detect.detectAll("b d").get(0).getLanguage(), "fr_test"); 51 | } 52 | 53 | public void testDetector3() throws Exception { 54 | LangdetectService detect = create(); 55 | assertEquals(detect.detectAll("d e").get(0).getLanguage(), "en_test"); 56 | } 57 | 58 | public void testDetector4() throws Exception { 59 | LangdetectService detect = create(); 60 | assertEquals(detect.detectAll("\u3042\u3042\u3042\u3042a").get(0).getLanguage(), "ja_test"); 61 | } 62 | 63 | public void testPunctuation() throws Exception { 64 | LangdetectService detect = create(); 65 | assertTrue(detect.detectAll("...").isEmpty()); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/LangProfileTests.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.mapper.langdetect; 2 | 3 | import org.elasticsearch.test.ESTestCase; 4 | import org.xbib.elasticsearch.plugin.bundle.common.langdetect.LangProfile; 5 | 6 | public class LangProfileTests extends ESTestCase { 7 | 8 | public final void testLangProfile() { 9 | LangProfile profile = new LangProfile(); 10 | assertEquals(profile.getName(), null); 11 | } 12 | 13 | public final void testLangProfileStringInt() { 14 | LangProfile profile = new LangProfile(); 15 | profile.setName("en"); 16 | assertEquals(profile.getName(), "en"); 17 | } 18 | 19 | public final void 
testAdd() { 20 | LangProfile profile = new LangProfile(); 21 | profile.setName("en"); 22 | profile.add("a"); 23 | assertEquals((int) profile.getFreq().get("a"), 1); 24 | profile.add("a"); 25 | assertEquals((int) profile.getFreq().get("a"), 2); 26 | //profile.omitLessFreq(); 27 | } 28 | 29 | public final void testAddIllegally1() { 30 | LangProfile profile = new LangProfile(); 31 | profile.add("a"); 32 | assertEquals(profile.getFreq().get("a"), null); 33 | } 34 | 35 | public final void testAddIllegally2() { 36 | LangProfile profile = new LangProfile(); 37 | profile.setName("en"); 38 | profile.add("a"); 39 | profile.add(""); 40 | profile.add("abcd"); 41 | assertEquals((int) profile.getFreq().get("a"), 1); 42 | assertEquals(profile.getFreq().get(""), null); 43 | assertEquals(profile.getFreq().get("abcd"), null); 44 | } 45 | 46 | public final void testOmitLessFreq() { 47 | LangProfile profile = new LangProfile(); 48 | profile.setName("en"); 49 | String[] grams = "a b c \u3042 \u3044 \u3046 \u3048 \u304a \u304b \u304c \u304d \u304e \u304f".split(" "); 50 | for (int i = 0; i < 5; ++i) { 51 | for (String g : grams) { 52 | profile.add(g); 53 | } 54 | } 55 | profile.add("\u3050"); 56 | 57 | assertEquals((int) profile.getFreq().get("a"), 5); 58 | assertEquals((int) profile.getFreq().get("\u3042"), 5); 59 | assertEquals((int) profile.getFreq().get("\u3050"), 1); 60 | //profile.omitLessFreq(); 61 | //assertEquals(profile.freq.get("a"), null); 62 | //assertEquals((int) profile.freq.get("\u3042"), 5); 63 | //assertEquals(profile.freq.get("\u3050"), null); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/LanguageTests.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.mapper.langdetect; 2 | 3 | import org.elasticsearch.test.ESTestCase; 4 | import org.xbib.elasticsearch.plugin.bundle.common.langdetect.Language; 5 | 6 | public class LanguageTests extends ESTestCase { 7 | 8 | public final void testLanguage() { 9 | Language lang = new Language(null, 0); 10 | assertEquals(lang.getLanguage(), null); 11 | assertEquals(lang.getProbability(), 0.0, 0.0001); 12 | assertEquals(lang.getLanguage(), null); 13 | 14 | Language lang2 = new Language("en", 1.0); 15 | assertEquals(lang2.getLanguage(), "en"); 16 | assertEquals(lang2.getProbability(), 1.0, 0.0001); 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /src/test/java/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/SimpleDetectorTests.java: -------------------------------------------------------------------------------- 1 | package org.xbib.elasticsearch.plugin.bundle.test.index.mapper.langdetect; 2 | 3 | import org.elasticsearch.test.ESTestCase; 4 | import org.xbib.elasticsearch.plugin.bundle.common.langdetect.LangdetectService; 5 | 6 | public class SimpleDetectorTests extends ESTestCase { 7 | 8 | public void testDetector() throws Exception { 9 | LangdetectService detect = new LangdetectService(); 10 | assertEquals("de", detect.detectAll("Das kann deutsch sein").get(0).getLanguage()); 11 | assertEquals("en", detect.detectAll("This is a very small test").get(0).getLanguage()); 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/test/resources/log4j2.xml: 
-------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/concat/concat_analysis.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "analyzer" : { 5 | "concat" : { 6 | "type" : "custom", 7 | "tokenizer" : "standard", 8 | "filter" : [ "concat" ] 9 | } 10 | } 11 | } 12 | } 13 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/decompound/fst/decompound_analysis.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "filter":{ 5 | "decomp":{ 6 | "type":"fst_decompound" 7 | } 8 | }, 9 | "analyzer" : { 10 | "decomp" : { 11 | "type": "custom", 12 | "filter" : ["decomp", "unique" ] 13 | } 14 | } 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/decompound/patricia/decompound_analysis.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "filter":{ 5 | "decomp":{ 6 | "type":"decompound" 7 | } 8 | }, 9 | "tokenizer" : { 10 | "decomp" : { 11 | "type":"standard", 12 | "filter" : "decomp" 13 | } 14 | } 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/decompound/patricia/keywords_analysis.json: -------------------------------------------------------------------------------- 1 | { 2 | "index": { 3 | "analysis": { 4 | "analyzer": { 5 | "decompounding_default": { 6 | "tokenizer": "decomp", 7 | "filter": [ 8 | "keywords", 9 | "decomp" 10 | ], 11 | "type": "custom" 12 | }, 13 | "with_keywords": { 14 | "tokenizer": "decomp", 15 | "filter": [ 16 | "keywords", 17 | "decomp_with_keywords" 18 | ], 19 | "type": "custom" 20 | }, 21 | "with_keywords_disabled": { 22 | "tokenizer": "decomp", 23 | "filter": [ 24 | "keywords", 25 | "decomp_with_keywords_disabled" 26 | ], 27 | "type": "custom" 28 | }, 29 | "with_subwords_only": { 30 | "tokenizer": "decomp", 31 | "filter": [ 32 | "decomp_subwords_only" 33 | ], 34 | "type": "custom" 35 | } 36 | }, 37 | "filter": { 38 | "keywords": { 39 | "type": "keyword_marker", 40 | "keywords": [ 41 | "Schlüsselwort" 42 | ] 43 | }, 44 | "decomp": { 45 | "type": "decompound" 46 | }, 47 | "decomp_with_keywords": { 48 | "type": "decompound", 49 | "respect_keywords": true 50 | }, 51 | "decomp_with_keywords_disabled": { 52 | "type": "decompound", 53 | "respect_keywords": false 54 | }, 55 | "decomp_subwords_only" : { 56 | "type" : "decompound", 57 | "subwords_only" : true 58 | } 59 | }, 60 | "tokenizer": { 61 | "decomp": { 62 | "type": "standard", 63 | "filter": "decomp" 64 | } 65 | } 66 | } 67 | } 68 | } 69 | 70 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/document.json: -------------------------------------------------------------------------------- 1 | { 2 | "text" : "Hello World" 3 | }
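
The analysis fixtures above (concat_analysis.json, the two decompound_analysis.json variants, keywords_analysis.json) are consumed by the unit tests through the settings-loading pattern seen in GermanNormalizationTests. A condensed sketch of that pattern; the test class name is hypothetical, and concat_analysis.json stands in for any of the fixtures:

import org.elasticsearch.Version;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.analysis.TokenFilterFactory;
import org.elasticsearch.test.ESTestCase;
import org.xbib.elasticsearch.plugin.bundle.BundlePlugin;

public class ConcatFixtureExample extends ESTestCase { // hypothetical test class
    public void testLoadFixture() throws Exception {
        // Load a JSON fixture from the classpath into index settings,
        // then build the analysis components with the plugin registered.
        String resource = "concat_analysis.json";
        Settings settings = Settings.builder()
                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
                .put("path.home", System.getProperty("path.home"))
                .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
                .build();
        TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"),
                settings, new BundlePlugin(Settings.EMPTY));
        // The plugin registers the "concat" filter referenced by the fixture.
        TokenFilterFactory concat = analysis.tokenFilter.get("concat");
        assertNotNull(concat);
    }
}
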
-------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/expansion/expansion_analysis.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "analyzer" : { 5 | "expansion" : { 6 | "type": "custom", 7 | "filter" : ["expansion", "unique" ] 8 | } 9 | } 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/german/german_normalization_analysis.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "filter":{ 5 | "umlaut":{ 6 | "type":"german_normalize" 7 | } 8 | }, 9 | "tokenizer" : { 10 | "umlaut" : { 11 | "type":"standard", 12 | "filter" : "umlaut" 13 | } 14 | } 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/hyphen/custom_hyphen_tokenizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "tokenizer" : { 5 | "my_hyphen_tokenizer" : { 6 | "type" : "hyphen", 7 | "hyphens": "." 8 | } 9 | } 10 | } 11 | } 12 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/hyphen/hyphen_analyzer.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "analyzer" : { 5 | "my_hyphen_analyzer" : { 6 | "type" : "hyphen" 7 | } 8 | } 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/hyphen/hyphen_tokenizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "tokenizer" : { 5 | "my_icu_tokenizer" : { 6 | "type" : "icu_tokenizer", 7 | "rulefiles" : "Latn:Latin-dont-break-on-hyphens.rbbi" 8 | }, 9 | "my_hyphen_tokenizer" : { 10 | "type" : "hyphen" 11 | } 12 | } 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/hyphen/hyphen_tokenizer_without_subwords.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "tokenizer" : { 5 | "my_hyphen_tokenizer" : { 6 | "type" : "hyphen" 7 | } 8 | }, 9 | "filter" : { 10 | "my_hyphen_tokenfilter" : { 11 | "type" : "hyphen", 12 | "subwords" : false 13 | } 14 | } 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/icu_collation.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "analyzer" : { 5 | "icu_german_collate" : { 6 | "type" : "icu_collation", 7 | "language" : "de", 8 | "country" : "DE", 9 | "strength" : "primary", 10 | "rules" : "& ae , a\u0308 & AE , A\u0308& oe , o\u0308 & OE , O\u0308& ue , u\u0308 & UE , u\u0308" 11 | }, 12 | "icu_german_collate_without_punct" : { 13 | "type" : "icu_collation", 14 | "language" : "de", 15 | 
"country" : "DE", 16 | "strength" : "quaternary", 17 | "alternate" : "shifted", 18 | "rules" : "& ae , a\u0308 & AE , A\u0308& oe , o\u0308 & OE , O\u0308& ue , u\u0308 & UE , u\u0308" 19 | }, 20 | "german_phonebook" : { 21 | "type" : "icu_collation", 22 | "locale" : "de@collation=phonebook", 23 | "strength" : "primary" 24 | }, 25 | "reorder" : { 26 | "type" : "icu_collation", 27 | "rules" : "& ae , a\u0308 & AE , A\u0308& oe , o\u0308 & OE , O\u0308& ue , u\u0308 & UE , u\u0308", 28 | "strength" : "tertiary", 29 | "reorder" : [ "Latn", "digit", "punctuation", "symbol", "currency", "others", "space"] 30 | } 31 | } 32 | } 33 | } 34 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/icu_folding.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "char_filter" : { 5 | "my_icu_folder" : { 6 | "type" : "icu_folding" 7 | } 8 | }, 9 | "tokenizer" : { 10 | "my_icu_tokenizer" : { 11 | "type" : "icu_tokenizer" 12 | } 13 | }, 14 | "filter" : { 15 | "my_icu_folder_filter" : { 16 | "type" : "icu_folding", 17 | "normalization_name" : "utr30" 18 | }, 19 | "my_icu_folder_filter_with_exceptions" : { 20 | "type" : "icu_folding", 21 | "normalization_name" : "utr30", 22 | "unicode_set_filter" : "[^åäöÅÄÖ]" 23 | } 24 | }, 25 | "analyzer" : { 26 | "my_icu_analyzer" : { 27 | "type" : "custom", 28 | "tokenizer" : "my_icu_tokenizer", 29 | "filter" : [ "my_icu_folder_filter" ] 30 | }, 31 | "my_icu_analyzer_with_exceptions" : { 32 | "type" : "custom", 33 | "tokenizer" : "my_icu_tokenizer", 34 | "filter" : [ "my_icu_folder_filter_with_exceptions" ] 35 | } 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/icu_normalize.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "char_filter" : { 5 | "my_icu_normalizer" : { 6 | "type" : "icu_normalizer", 7 | "normalization_name" : "utr30" 8 | }, 9 | "my_icu_normalizer_with_exceptions" : { 10 | "type" : "icu_normalizer", 11 | "normalization_name" : "utr30", 12 | "unicode_set_filter" : "[^åäöÅÄÖ]" 13 | } 14 | }, 15 | "tokenizer" : { 16 | "my_icu_tokenizer" : { 17 | "type" : "icu_tokenizer" 18 | } 19 | }, 20 | "analyzer" : { 21 | "my_icu_analyzer" : { 22 | "type" : "custom", 23 | "tokenizer" : "my_icu_tokenizer", 24 | "char_filter" : "my_icu_normalizer" 25 | }, 26 | "my_icu_analyzer_with_exceptions" : { 27 | "type" : "custom", 28 | "tokenizer" : "my_icu_tokenizer", 29 | "char_filter" : [ "my_icu_normalizer_with_exceptions" ] 30 | } 31 | } 32 | } 33 | } 34 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/icu_numberformat.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "filter" : { 5 | "spellout_de" : { 6 | "type" : "icu_numberformat", 7 | "locale" : "de", 8 | "format" : "spellout" 9 | }, 10 | "spellout_en" : { 11 | "type" : "icu_numberformat", 12 | "locale" : "en_US", 13 | "format" : "spellout" 14 | } 15 | }, 16 | "tokenizer" : { 17 | "my_tokenizer" : { 18 | "type" : "icu_tokenizer", 19 | "filter" : "spellout_de" 20 | } 21 | } 22 | } 23 | } 24 | } 
-------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/icu_tokenizer.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "tokenizer" : { 5 | "my_icu_tokenizer" : { 6 | "type" : "icu_tokenizer" 7 | }, 8 | "my_hyphen_icu_tokenizer" : { 9 | "type" : "icu_tokenizer", 10 | "rulefiles" : "Latn:Latin-dont-break-on-hyphens.rbbi" 11 | } 12 | } 13 | } 14 | } 15 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/icu/icu_transform.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "filter" : { 5 | "my_icu_transformer_ch" : { 6 | "type" : "icu_transform", 7 | "id" : "Traditional-Simplified" 8 | }, 9 | "my_icu_transformer_han" : { 10 | "type" : "icu_transform", 11 | "id" : "Han-Latin" 12 | }, 13 | "my_icu_transformer_katakana" : { 14 | "type" : "icu_transform", 15 | "id" : "Katakana-Hiragana" 16 | }, 17 | "my_icu_transformer_cyr" : { 18 | "type" : "icu_transform", 19 | "id" : "Cyrillic-Latin" 20 | }, 21 | "my_icu_transformer_cyr_reverse" : { 22 | "type" : "icu_transform", 23 | "id" : "Cyrillic-Latin", 24 | "dir" : "reverse" 25 | }, 26 | "my_icu_transformer_any_latin" : { 27 | "type" : "icu_transform", 28 | "id" : "Any-Latin" 29 | }, 30 | "my_icu_transformer_nfd" : { 31 | "type" : "icu_transform", 32 | "id" : "NFD; [:Nonspacing Mark:] Remove" 33 | }, 34 | "my_icu_transformer_rules" : { 35 | "type" : "icu_transform", 36 | "id" : "test", 37 | "dir" : "forward", 38 | "rules" : "a > b; b > c;" 39 | } 40 | }, 41 | "tokenizer" : { 42 | "my_icu_tokenizer_ch" : { 43 | "type" : "icu_tokenizer", 44 | "filter" : [ "my_icu_transformer_ch" ] 45 | }, 46 | "my_icu_tokenizer_han" : { 47 | "type" : "icu_tokenizer", 48 | "filter" : [ "my_icu_transformer_han" ] 49 | }, 50 | "my_icu_tokenizer_katakana" : { 51 | "type" : "icu_tokenizer", 52 | "filter" : [ "my_icu_transformer_katakana" ] 53 | }, 54 | "my_icu_tokenizer_cyr" : { 55 | "type" : "icu_tokenizer", 56 | "filter" : [ "my_icu_transformer_cyr" ] 57 | }, 58 | "my_icu_tokenizer_any_latin" : { 59 | "type" : "icu_tokenizer", 60 | "filter" : [ "my_icu_transformer_any_latin" ] 61 | }, 62 | "my_icu_tokenizer_nfd" : { 63 | "type" : "icu_tokenizer", 64 | "filter" : [ "my_icu_transformer_nfd" ] 65 | }, 66 | "my_icu_tokenizer_rules" : { 67 | "type" : "icu_tokenizer", 68 | "filter" : [ "my_icu_transformer_rules" ] 69 | } 70 | } 71 | } 72 | } 73 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "date_detection": false, 3 | "properties": { 4 | "text": { 5 | "type": "text", 6 | "analyzer": "my_analyzer" 7 | } 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "index": { 3 | "analysis": { 4 | "tokenizer": { 5 | "my_hyphen_icu_tokenizer" : { 6 | "type" : "icu_tokenizer", 7 | "rulefiles" : "Latn:icu/Latin-dont-break-on-hyphens.rbbi" 8 | } 9 | }, 10 | "analyzer": { 11 | "default": { 12 
| "type": "keyword" 13 | }, 14 | "my_analyzer" : { 15 | "type" : "custom", 16 | "tokenizer" : "my_hyphen_icu_tokenizer" 17 | } 18 | } 19 | } 20 | } 21 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/sortform/sortform.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis": { 4 | "analyzer" : { 5 | "german_phonebook_with_sortform" : { 6 | "type" : "sortform", 7 | "language" : "de", 8 | "country" : "DE", 9 | "strength" : "quaternary", 10 | "alternate" : "shifted", 11 | "rules" : "& ae , a\u0308 & AE , A\u0308 & oe , o\u0308 & OE , O\u0308 & ue , u\u0308 & UE , u\u0308 & ss , \u00df", 12 | "filter" : [ 13 | "sortform" 14 | ], 15 | "char_filter" : [] 16 | } 17 | } 18 | } 19 | } 20 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/analysis/worddelimiter/worddelimiter.json: -------------------------------------------------------------------------------- 1 | { 2 | "index":{ 3 | "analysis":{ 4 | "filter" : { 5 | "wd" : { 6 | "type" : "worddelimiter2", 7 | "generate_word_parts" : true, 8 | "generate_number_parts" : true, 9 | "catenate_all" : true, 10 | "split_on_case_change" : true, 11 | "split_on_numerics" : true, 12 | "stem_english_possessive" : true 13 | } 14 | } 15 | } 16 | } 17 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/base64-2-decoded.txt: -------------------------------------------------------------------------------- 1 | God Save the Queen (alternatively God Save the King) -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/base64-2-mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "someType" : { 3 | "properties": { 4 | "content": { 5 | "type": "text", 6 | "fields" : { 7 | "language": { 8 | "type": "langdetect", 9 | "binary": true 10 | } 11 | } 12 | } 13 | } 14 | } 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/base64-2.txt: -------------------------------------------------------------------------------- 1 | R29kIFNhdmUgdGhlIFF1ZWVuIChhbHRlcm5hdGl2ZWx5IEdvZCBTYXZlIHRoZSBLaW5nKQ== 2 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/base64-decoded.txt: -------------------------------------------------------------------------------- 1 | This is a very simple text in plain english 2 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/base64-mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "someType" : { 3 | "properties" : { 4 | "someField":{ 5 | "type" : "langdetect", 6 | "languages": [ "en", "fr", "de", "it", "es" ], 7 | "binary" : true 8 | } 9 | } 10 | } 11 | } 12 | -------------------------------------------------------------------------------- 
/src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/base64.txt: -------------------------------------------------------------------------------- 1 | VGhpcyBpcyBhIHZlcnkgc2ltcGxlIHRleHQgaW4gcGxhaW4gZW5nbGlzaAo= 2 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/chinese.txt: -------------------------------------------------------------------------------- 1 | 位于美国首都华盛顿都会圈的希望中文学校5日晚举办活动庆祝建立20周年。从中国大陆留学生为子女学中文而自发建立的学习班,到学生规模在全美名列前茅的中文学校,这个平台的发展也折射出美国的中文教育热度逐步提升。 2 | 希望中文学校是大华盛顿地区最大中文学校,现有7个校区逾4000名学生,规模在美国东部数一数二。不过,见证了希望中文学校20年发展的人们起初根本无法想象这个小小的中文教育平台能发展到今日之规模。 -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/english.txt: -------------------------------------------------------------------------------- 1 | This is a very small example of a text -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/german.txt: -------------------------------------------------------------------------------- 1 | Das ist ein kleiner Text als Beispiel -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/japanese.txt: -------------------------------------------------------------------------------- 1 | 1冊6,000円(雑費送料含む)で頒布いたしますので、ご希望の方は、氏名・送り先住所・電話番号・希望冊数をご記入頂き、書面(E-Mailも可)でお送りください。お支払いは郵送する折に振込用紙を同封しますので、その用紙にてお振込みをお願いいたします。 2 | 3 | ご注文お待ちしております。 -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/korean.txt: -------------------------------------------------------------------------------- 1 | 20조에 육박하는 사교육 시장을 자랑하는 대한민국. 정교사로 학교 강단에 서는 교육인력뿐 아니라 사교육 분야에서 지식을 전달하는 전문 교육 인력의 질도 나날이 제고되고 있다. 2 | 3 | 전문성을 가진 인력의 필요성 증가를 직시해 양질의 영어교사를 배출하겠다는 목적으로 서강대학교 외국어교육원이 특별한 강사 양성 과정을 마련해 영어 교육자로의 길을 모색하고 있는 많은 이들의 관심을 받고 있다. 4 | 5 | 서강대학교 영어교육원은 오는 5월 31일까지 ‘어린이 영어전문가 과정’ 수강생을 모집한다. 6월 24일부터 8월 16일까지 두 달여에 걸쳐 진행되는 이 과정은 온라인 선행학습에 오프라인 학습 8주가 더해진 후 TKT 본 시험응시로 마무리 된다. 
6 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/mapping-to-fields.json: -------------------------------------------------------------------------------- 1 | { 2 | "someType" : { 3 | "properties" : { 4 | "someField":{ 5 | "type" : "langdetect", 6 | "languages" : [ "de", "en", "fr", "nl", "it" ], 7 | "language_to" : { 8 | "de": "german_field", 9 | "en": "english_field" 10 | } 11 | }, 12 | "german_field" : { 13 | "analyzer" : "german", 14 | "type": "text" 15 | }, 16 | "english_field" : { 17 | "analyzer" : "english", 18 | "type" : "text" 19 | } 20 | } 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "someType" : { 3 | "properties" : { 4 | "someField":{ 5 | "type" : "langdetect", 6 | "languages" : [ "de", "en", "fr", "nl", "it" ], 7 | "map" : { 8 | "de" : "Deutsch" 9 | } 10 | } 11 | } 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "index" : { 3 | "analysis": { 4 | "analyzer": { 5 | "default": { 6 | "type": "standard" 7 | } 8 | } 9 | } 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/short-text-mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "someType" : { 3 | "properties" : { 4 | "someField" : { 5 | "type" : "langdetect", 6 | "profile" : "shorttext" 7 | } 8 | } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/langdetect/simple-mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "someType" : { 3 | "properties" : { 4 | "someField": { 5 | "type" : "langdetect", 6 | "languages" : [ "de", "en", "fr", "nl", "it" ] 7 | } 8 | } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/doc-simple-document.json: -------------------------------------------------------------------------------- 1 | { 2 | "dc" : { 3 | "creator": "first author name" 4 | }, 5 | "author" : { 6 | "authorID": "1" 7 | } 8 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/doc-simple-mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "doc": { 3 | "properties": { 4 | "author": { 5 | "properties": { 6 | "authorName": { 7 | "type": "text" 8 | }, 9 | "authorID": { 10 | "type": "ref", 11 | "ref_index": "ref", 12 | "ref_type": "ref", 13 | "ref_fields": [ 14 | "author" 15 | ], 16 | "copy_to": [ 17 | "dc.creator" 18 | ] 19 | } 20 | } 21 | }, 22 | "dc": { 23 | "properties": { 24 | "creator": { 25 | "type": "text" 26 | } 27 | } 28 | } 29 | } 30 | } 31 | } 32 | 
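
doc-simple-mapping.json above declares authorID as a ref field whose ref_fields values are copied into the dc.creator target. A conceptual sketch of that copy step in plain Java; this is a hypothetical illustration of the configured behavior, not the plugin's actual internals:

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;

public class RefCopySketch { // hypothetical illustration, not plugin code
    // For each configured ref_field, read its values from the referenced
    // document and append them to every copy_to target of the indexed document.
    static void copyRefFields(Map<String, List<String>> referencedDoc,
                              List<String> refFields,
                              List<String> copyTo,
                              Map<String, List<String>> targetDoc) {
        for (String refField : refFields) {
            List<String> values = referencedDoc.getOrDefault(refField, Collections.emptyList());
            for (String target : copyTo) {
                targetDoc.computeIfAbsent(target, k -> new ArrayList<>()).addAll(values);
            }
        }
    }
}

With doc-simple-document.json above (authorID "1") and the ref-simple-document.json fixture further below, this is the step that would make the referenced "author" value searchable under dc.creator alongside "first author name".
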
-------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/doc-simple-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "index": { 3 | "analysis": { 4 | "analyzer": { 5 | "default": { 6 | "type": "standard" 7 | } 8 | } 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/gnd-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "index": { 3 | "mapping" : { 4 | "total_fields" : { 5 | "limit": 10000 6 | } 7 | }, 8 | "analysis": { 9 | "analyzer": { 10 | "default": { 11 | "type": "keyword" 12 | } 13 | } 14 | } 15 | } 16 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/ref-doc-book.json: -------------------------------------------------------------------------------- 1 | { 2 | "title" : "A title", 3 | "authorID" : "1", 4 | "dc" : { 5 | "creator" : "A creator" 6 | }, 7 | "bib" : { 8 | "contributor" : "A contributor" 9 | } 10 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/ref-mapping-authorities.json: -------------------------------------------------------------------------------- 1 | { 2 | "docs" : { 3 | "properties" : { 4 | "authorID": { 5 | "type" : "ref", 6 | "ref_index" : "authorities", 7 | "ref_type" : "persons", 8 | "ref_fields" : [ "author" ], 9 | "copy_to" : [ 10 | "dc.creator", 11 | "bib.contributor" 12 | ] 13 | }, 14 | "dc" : { 15 | "properties" : { 16 | "creator" : { 17 | "type" : "text" 18 | } 19 | } 20 | }, 21 | "bib" : { 22 | "properties" : { 23 | "contributor" : { 24 | "type" : "text" 25 | } 26 | } 27 | } 28 | } 29 | } 30 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/ref-mapping-books-test.json: -------------------------------------------------------------------------------- 1 | { 2 | "test" : { 3 | "properties" : { 4 | "authorID": { 5 | "type" : "ref", 6 | "ref_index" : "authorities", 7 | "ref_type" : "persons", 8 | "ref_fields" : [ "author" ], 9 | "copy_to" : [ 10 | "dc.creator", 11 | "bib.contributor" 12 | ] 13 | }, 14 | "dc" : { 15 | "properties" : { 16 | "creator" : { 17 | "type" : "text" 18 | } 19 | } 20 | }, 21 | "bib" : { 22 | "properties" : { 23 | "contributor" : { 24 | "type" : "text" 25 | } 26 | } 27 | } 28 | } 29 | } 30 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/ref-mapping-from-id.json: -------------------------------------------------------------------------------- 1 | { 2 | "docs" : { 3 | "properties" : { 4 | "ref" : { 5 | "type" : "text" 6 | }, 7 | "authorID": { 8 | "type" : "ref", 9 | "ref_index" : "authorities", 10 | "ref_type" : "persons", 11 | "ref_fields" : [ "author" ], 12 | "copy_to" : [ 13 | "ref" 14 | ] 15 | } 16 | } 17 | } 18 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/ref-mapping-nested.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "nested" : { 3 | "properties": { 4 | "person": { 5 | "properties": { 6 | "authorName": { 7 | "type": "text" 8 | }, 9 | "authorID": { 10 | "type": "ref", 11 | "ref_index": "authorities", 12 | "ref_type": "persons", 13 | "ref_fields": [ 14 | "author" 15 | ], 16 | "copy_to": [ 17 | "dc.creator", 18 | "bib.contributor" 19 | ] 20 | } 21 | } 22 | }, 23 | "dc": { 24 | "properties": { 25 | "creator": { 26 | "type": "text" 27 | } 28 | } 29 | }, 30 | "bib": { 31 | "properties": { 32 | "contributor": { 33 | "type": "text" 34 | } 35 | } 36 | } 37 | } 38 | } 39 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/ref-mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "some_type" : { 3 | "properties" : { 4 | "someField": { 5 | "type": "ref", 6 | "ref_index": "test", 7 | "ref_type": "test", 8 | "ref_fields": [ 9 | "myfield" 10 | ], 11 | "copy_to": [ 12 | "ref" 13 | ] 14 | }, 15 | "ref" : { 16 | "type" : "text" 17 | } 18 | } 19 | } 20 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/ref-simple-document.json: -------------------------------------------------------------------------------- 1 | { 2 | "author" : "second author name" 3 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/ref-simple-mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "ref" : { 3 | "properties": { 4 | "author": { 5 | "type": "text", 6 | "store" : true 7 | } 8 | } 9 | } 10 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/ref-simple-settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "index": { 3 | "analysis": { 4 | "analyzer": { 5 | "default": { 6 | "type": "keyword" 7 | } 8 | } 9 | } 10 | } 11 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/reference/title-document-1.json: -------------------------------------------------------------------------------- 1 | { 2 | "type": "u", 3 | "boost": "0.1", 4 | "xbib": [ 5 | { 6 | "uid": "(DE-605)HT007215476" 7 | }, 8 | { 9 | "uid": "(DE-605)008427902" 10 | }, 11 | { 12 | "identifier": [ 13 | "DE-61", 14 | "DE-385" 15 | ] 16 | } 17 | ], 18 | "RecordIdentifier": { 19 | "identifierForTheRecord": "(DE-605)HT007215476" 20 | }, 21 | "RecordIdentifierSuper": { 22 | "recordIdentifierSuper": "(DE-605)HT007215468" 23 | }, 24 | "RecordCodes": [ 25 | "Autopsie", 26 | "MAB-Zeichenvorrat", 27 | "Unicode / ISO 10646 (UTF 8)", 28 | "RAK-WB" 29 | ], 30 | "Language": { 31 | "languageSource": "ger", 32 | "language": "Deutsch" 33 | }, 34 | "VolumeDesignation": { 35 | "volumeDesignation": "3" 36 | }, 37 | "SortableVolumeDesignation": { 38 | "volumeDesignation": "3" 39 | }, 40 | "Person": [ 41 | { 42 | "personName": "Tucholsky, Kurt", 43 | "personBio": "1890-1935", 44 | "personIdentifier": "(DE-588)11862444X", 45 | "identifierGND": "11862444X" 46 | }, 47 | { 48 | "personName": 
"Gerold-Tucholsky, Mary", 49 | "personRole": "[Hrsg.]", 50 | "personIdentifier": "(DE-588)188272283", 51 | "identifierGND": "188272283" 52 | } 53 | ], 54 | "TitleStatement": [ 55 | { 56 | "titleMain": "1921 - 1924" 57 | }, 58 | { 59 | "titleMain": "Gesammelte Werke" 60 | } 61 | ], 62 | "TitleAddendum": { 63 | "title": "in 10 Bänden" 64 | }, 65 | "CreatorStatement": { 66 | "creatorStatement": "Kurt Tucholsky. Hrsg. von Mary Gerold-Tucholsky ..." 67 | }, 68 | "Edition": { 69 | "edition": "182. - 201. Tsd." 70 | }, 71 | "PublicationPlace": { 72 | "printingPlace": "Reinbek bei Hamburg" 73 | }, 74 | "PublisherName": { 75 | "printerName": "Rowohlt" 76 | }, 77 | "DateProper": { 78 | "date": "1995" 79 | }, 80 | "Extent": { 81 | "extent": "534 S." 82 | }, 83 | "RecordSystemNumber": { 84 | "systemNumber": "(DE-605)008427902" 85 | }, 86 | "dc": [ 87 | { 88 | "type": "keine Angabe" 89 | }, 90 | { 91 | "format": "keine Angabe" 92 | }, 93 | { 94 | "date": 1995 95 | }, 96 | { 97 | "language": "Deutsch" 98 | } 99 | ], 100 | "collection": "hbz Verbundkatalog" 101 | } -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/index/mapper/standardnumber/mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "someType" : { 3 | "properties" : { 4 | "someField":{ 5 | "type" : "standardnumber", 6 | "standardnumbers" : "isbn" 7 | } 8 | } 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /src/test/resources/org/xbib/elasticsearch/plugin/bundle/test/query/decompound/decompound_query.json: -------------------------------------------------------------------------------- 1 | { 2 | "settings": { 3 | "index": { 4 | "number_of_shards": 1, 5 | "number_of_replicas": 0, 6 | "analysis": { 7 | "filter": { 8 | "decomp":{ 9 | "type" : "decompound", 10 | "use_payload": true, 11 | "use_cache": true 12 | } 13 | }, 14 | "analyzer": { 15 | "decomp": { 16 | "type": "custom", 17 | "tokenizer" : "standard", 18 | "filter" : [ 19 | "decomp", 20 | "lowercase" 21 | ] 22 | }, 23 | "lowercase": { 24 | "type": "custom", 25 | "tokenizer" : "standard", 26 | "filter" : [ 27 | "lowercase" 28 | ] 29 | } 30 | } 31 | } 32 | } 33 | }, 34 | "mappings": { 35 | "_doc": { 36 | "properties": { 37 | "text": { 38 | "type": "text", 39 | "analyzer": "decomp", 40 | "search_analyzer": "lowercase" 41 | } 42 | } 43 | } 44 | } 45 | } -------------------------------------------------------------------------------- /src/test/resources/rest-api-spec/test/analysis_icu/10_basic.yml: -------------------------------------------------------------------------------- 1 | # Integration tests for ICU analysis components 2 | # 3 | "Tokenizer": 4 | - do: 5 | indices.analyze: 6 | body: 7 | text: Foo Bar 8 | tokenizer: icu_tokenizer 9 | - length: { tokens: 2 } 10 | - match: { tokens.0.token: Foo } 11 | - match: { tokens.1.token: Bar } 12 | --- 13 | "Normalization filter": 14 | - do: 15 | indices.analyze: 16 | body: 17 | filter: [icu_normalizer] 18 | text: Foo Bar Ruß 19 | tokenizer: keyword 20 | - length: { tokens: 1 } 21 | - match: { tokens.0.token: foo bar russ } 22 | --- 23 | "Normalization charfilter": 24 | - do: 25 | indices.analyze: 26 | body: 27 | char_filter: [icu_normalizer] 28 | text: Foo Bar Ruß 29 | tokenizer: keyword 30 | - length: { tokens: 1 } 31 | - match: { tokens.0.token: foo bar russ } 32 | --- 33 | "Folding filter": 34 | - do: 35 | indices.analyze: 36 | body: 37 | filter: 
[icu_folding] 38 | text: Foo Bar résumé 39 | tokenizer: keyword 40 | - length: { tokens: 1 } 41 | - match: { tokens.0.token: foo bar resume } 42 | --- 43 | "Normalization with a UnicodeSet Filter": 44 | - do: 45 | indices.create: 46 | index: test 47 | body: 48 | settings: 49 | index: 50 | analysis: 51 | char_filter: 52 | charfilter_icu_normalizer: 53 | type: icu_normalizer 54 | unicode_set_filter: "[^ß]" 55 | filter: 56 | tokenfilter_icu_normalizer: 57 | type: icu_normalizer 58 | unicode_set_filter: "[^ßB]" 59 | tokenfilter_icu_folding: 60 | type: icu_folding 61 | unicode_set_filter: "[^â]" 62 | - do: 63 | indices.analyze: 64 | index: test 65 | body: 66 | char_filter: ["charfilter_icu_normalizer"] 67 | tokenizer: keyword 68 | text: charfilter Föo Bâr Ruß 69 | - length: { tokens: 1 } 70 | - match: { tokens.0.token: charfilter föo bâr ruß } 71 | - do: 72 | indices.analyze: 73 | index: test 74 | body: 75 | tokenizer: keyword 76 | filter: ["tokenfilter_icu_normalizer"] 77 | text: tokenfilter Föo Bâr Ruß 78 | - length: { tokens: 1 } 79 | - match: { tokens.0.token: tokenfilter föo Bâr ruß } 80 | - do: 81 | indices.analyze: 82 | index: test 83 | body: 84 | tokenizer: keyword 85 | filter: ["tokenfilter_icu_folding"] 86 | text: icufolding Föo Bâr Ruß 87 | - length: { tokens: 1 } 88 | - match: { tokens.0.token: icufolding foo bâr russ } 89 | -------------------------------------------------------------------------------- /src/test/resources/rest-api-spec/test/analysis_icu/20_search.yml: -------------------------------------------------------------------------------- 1 | # Integration tests for ICU analysis component 2 | # 3 | --- 4 | "Index ICU content": 5 | - do: 6 | indices.create: 7 | index: test 8 | body: 9 | settings: 10 | index: 11 | analysis: 12 | analyzer: 13 | my_analyzer: 14 | type: icu_collation 15 | filter: ["standard", "lowercase"] 16 | language: en 17 | strength: primary 18 | mappings: 19 | type: 20 | properties: 21 | text: 22 | type: text 23 | analyzer: my_analyzer 24 | 25 | - do: 26 | index: 27 | index: test 28 | type: type 29 | id: 1 30 | body: { "text": "Bâton enflammé" } 31 | - do: 32 | indices.refresh: {} 33 | 34 | - do: 35 | search: 36 | index: test 37 | body: 38 | query: 39 | match: 40 | text: baton enflamme 41 | - match: { hits.total: 1 } 42 | --------------------------------------------------------------------------------
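
The last YAML test above indexes "Bâton enflammé" with a primary-strength icu_collation analyzer and expects the query "baton enflamme" to match. The equivalence it relies on can be checked with ICU4J directly; a standalone sketch with a hypothetical class name:

import com.ibm.icu.text.Collator;
import com.ibm.icu.util.ULocale;

public class PrimaryStrengthExample { // hypothetical example class, not in this repository
    public static void main(String[] args) {
        // Primary strength ignores case and diacritic differences, so the
        // two strings below compare as equal (compare() returns 0).
        Collator collator = Collator.getInstance(new ULocale("en"));
        collator.setStrength(Collator.PRIMARY);
        System.out.println(collator.compare("Bâton enflammé", "baton enflamme"));
    }
}
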