├── .github └── workflows │ ├── lint.yml │ └── tests.yml ├── .gitignore ├── .golangci.yml ├── AUTHORS ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── analysis ├── analyzer │ ├── keyword.go │ ├── simple.go │ ├── standard.go │ ├── standard_test.go │ └── web.go ├── char │ ├── asciifolding.go │ ├── asciifolding_test.go │ ├── html.go │ ├── regexp.go │ ├── regexp_test.go │ └── zerowidthnonjoiner.go ├── freq.go ├── freq_test.go ├── lang │ ├── ar │ │ ├── analyzer_ar.go │ │ ├── analyzer_ar_test.go │ │ ├── arabic_normalize.go │ │ ├── arabic_normalize_test.go │ │ ├── stemmer_ar.go │ │ ├── stemmer_ar_test.go │ │ ├── stop_filter_ar.go │ │ └── stop_words_ar.go │ ├── bg │ │ ├── stop_filter_bg.go │ │ └── stop_words_bg.go │ ├── ca │ │ ├── articles_ca.go │ │ ├── elision_ca.go │ │ ├── elision_ca_test.go │ │ ├── stop_filter_ca.go │ │ └── stop_words_ca.go │ ├── cjk │ │ ├── analyzer_cjk.go │ │ ├── analyzer_cjk_test.go │ │ ├── cjk_bigram.go │ │ ├── cjk_bigram_test.go │ │ ├── cjk_width.go │ │ └── cjk_width_test.go │ ├── ckb │ │ ├── analyzer_ckb.go │ │ ├── analyzer_ckb_test.go │ │ ├── sorani_normalize.go │ │ ├── sorani_normalize_test.go │ │ ├── sorani_stemmer_filter.go │ │ ├── sorani_stemmer_filter_test.go │ │ ├── stop_filter_ckb.go │ │ └── stop_words_ckb.go │ ├── cs │ │ ├── stop_filter_cs.go │ │ └── stop_words_cs.go │ ├── da │ │ ├── analyzer_da.go │ │ ├── analyzer_da_test.go │ │ ├── stemmer_da.go │ │ ├── stop_filter_da.go │ │ └── stop_words_da.go │ ├── de │ │ ├── analyzer_de.go │ │ ├── analyzer_de_test.go │ │ ├── german_normalize.go │ │ ├── german_normalize_test.go │ │ ├── light_stemmer_de.go │ │ ├── stemmer_de_snowball.go │ │ ├── stemmer_de_test.go │ │ ├── stop_filter_de.go │ │ └── stop_words_de.go │ ├── el │ │ ├── stop_filter_el.go │ │ └── stop_words_el.go │ ├── en │ │ ├── analyzer_en.go │ │ ├── analyzer_en_test.go │ │ ├── possessive_filter_en.go │ │ ├── possessive_filter_en_test.go │ │ ├── stemmer_en_snowball.go │ │ ├── stemmer_en_test.go │ │ ├── stop_filter_en.go │ │ └── 
stop_words_en.go │ ├── es │ │ ├── analyzer_es.go │ │ ├── analyzer_es_test.go │ │ ├── light_stemmer_es.go │ │ ├── stemmer_es_snowball.go │ │ ├── stemmer_es_snowball_test.go │ │ ├── stop_filter_es.go │ │ └── stop_words_es.go │ ├── eu │ │ ├── stop_filter_eu.go │ │ └── stop_words_eu.go │ ├── fa │ │ ├── analyzer_fa.go │ │ ├── analyzer_fa_test.go │ │ ├── persian_normalize.go │ │ ├── persian_normalize_test.go │ │ ├── stop_filter_fa.go │ │ └── stop_words_fa.go │ ├── fi │ │ ├── analyzer_fi.go │ │ ├── analyzer_fi_test.go │ │ ├── stemmer_fi.go │ │ ├── stop_filter_fi.go │ │ └── stop_words_fi.go │ ├── fr │ │ ├── analyzer_fr.go │ │ ├── analyzer_fr_test.go │ │ ├── articles_fr.go │ │ ├── elision_fr.go │ │ ├── elision_fr_test.go │ │ ├── light_stemmer_fr.go │ │ ├── light_stemmer_fr_test.go │ │ ├── minimal_stemmer_fr.go │ │ ├── minimal_stemmer_fr_test.go │ │ ├── stemmer_fr_snowball.go │ │ ├── stemmer_fr_snowball_test.go │ │ ├── stop_filter_fr.go │ │ └── stop_words_fr.go │ ├── ga │ │ ├── articles_ga.go │ │ ├── elision_ga.go │ │ ├── elision_ga_test.go │ │ ├── stop_filter_ga.go │ │ └── stop_words_ga.go │ ├── gl │ │ ├── stop_filter_gl.go │ │ └── stop_words_gl.go │ ├── hi │ │ ├── analyzer_hi.go │ │ ├── analyzer_hi_test.go │ │ ├── hindi_normalize.go │ │ ├── hindi_normalize_test.go │ │ ├── hindi_stemmer_filter.go │ │ ├── hindi_stemmer_filter_test.go │ │ ├── stop_filter_hi.go │ │ └── stop_words_hi.go │ ├── hu │ │ ├── analyzer_hu.go │ │ ├── analyzer_hu_test.go │ │ ├── stemmer_hu.go │ │ ├── stop_filter_hu.go │ │ └── stop_words_hu.go │ ├── hy │ │ ├── stop_filter_hy.go │ │ └── stop_words_hy.go │ ├── id │ │ ├── stop_filter_id.go │ │ └── stop_words_id.go │ ├── in │ │ ├── indic_normalize.go │ │ ├── indic_normalize_test.go │ │ └── scripts.go │ ├── it │ │ ├── analyzer_it.go │ │ ├── analyzer_it_test.go │ │ ├── articles_it.go │ │ ├── elision_it.go │ │ ├── elision_it_test.go │ │ ├── light_stemmer_it.go │ │ ├── light_stemmer_it_test.go │ │ ├── stemmer_it_snowball.go │ │ ├── stemmer_it_snowball_test.go │ 
│ ├── stop_filter_it.go │ │ └── stop_words_it.go │ ├── nl │ │ ├── analyzer_nl.go │ │ ├── analyzer_nl_test.go │ │ ├── stemmer_nl.go │ │ ├── stop_filter_nl.go │ │ └── stop_words_nl.go │ ├── no │ │ ├── analyzer_no.go │ │ ├── analyzer_no_test.go │ │ ├── stemmer_no.go │ │ ├── stop_filter_no.go │ │ └── stop_words_no.go │ ├── pt │ │ ├── analyzer_pt.go │ │ ├── analyzer_pt_test.go │ │ ├── light_stemmer_pt.go │ │ ├── light_stemmer_pt_test.go │ │ ├── stop_filter_pt.go │ │ └── stop_words_pt.go │ ├── ro │ │ ├── analyzer_ro.go │ │ ├── analyzer_ro_test.go │ │ ├── stemmer_ro.go │ │ ├── stop_filter_ro.go │ │ └── stop_words_ro.go │ ├── ru │ │ ├── analyzer_ru.go │ │ ├── analyzer_ru_test.go │ │ ├── stemmer_ru.go │ │ ├── stemmer_ru_test.go │ │ ├── stop_filter_ru.go │ │ └── stop_words_ru.go │ ├── sv │ │ ├── analyzer_sv.go │ │ ├── analyzer_sv_test.go │ │ ├── stemmer_sv.go │ │ ├── stop_filter_sv.go │ │ └── stop_words_sv.go │ └── tr │ │ ├── analyzer_tr.go │ │ ├── analyzer_tr_test.go │ │ ├── stemmer_tr.go │ │ ├── stop_filter_tr.go │ │ └── stop_words_tr.go ├── size.go ├── test_words.txt ├── token │ ├── apostrophe.go │ ├── apostrophe_test.go │ ├── camelcase.go │ ├── camelcase_parser.go │ ├── camelcase_states.go │ ├── camelcase_test.go │ ├── dict.go │ ├── dict_test.go │ ├── edgengram.go │ ├── edgengram_test.go │ ├── elision.go │ ├── elision_test.go │ ├── keyword.go │ ├── keyword_test.go │ ├── length.go │ ├── length_test.go │ ├── lowercase.go │ ├── lowercase_test.go │ ├── ngram.go │ ├── ngram_test.go │ ├── porter.go │ ├── porter_test.go │ ├── reverse.go │ ├── reverse_test.go │ ├── shingle.go │ ├── shingle_test.go │ ├── stop.go │ ├── stop_test.go │ ├── truncate.go │ ├── truncate_test.go │ ├── unicodenorm.go │ ├── unicodenorm_test.go │ ├── unique.go │ └── unique_test.go ├── tokenizer │ ├── character.go │ ├── character_test.go │ ├── exception.go │ ├── exception_test.go │ ├── letter.go │ ├── regexp.go │ ├── regexp_test.go │ ├── single.go │ ├── single_test.go │ ├── unicode.go │ ├── unicode_test.go │ 
├── web.go │ ├── web_test.go │ ├── whitespace.go │ └── whitespace_test.go ├── tokenmap.go ├── tokenmap_test.go ├── type.go ├── util.go └── util_test.go ├── batch.go ├── cmd └── bluge │ ├── cmd │ ├── list.go │ ├── root.go │ └── snapshot.go │ └── main.go ├── config.go ├── doc.go ├── docs └── bluge.png ├── document.go ├── field.go ├── field_test.go ├── go.mod ├── go.sum ├── index ├── batch.go ├── communication.go ├── config.go ├── count.go ├── deletion.go ├── deletion_test.go ├── dictionary.go ├── dictionary_test.go ├── directory.go ├── directory_fs.go ├── directory_fs_nix.go ├── directory_fs_windows.go ├── directory_mem.go ├── docstub_test.go ├── empty.go ├── event.go ├── event_test.go ├── introducer.go ├── lock │ ├── lock.go │ ├── lock_nix.go │ ├── lock_test.go │ └── lock_windows.go ├── merge.go ├── merge_test.go ├── mergeplan │ ├── merge_plan.go │ ├── merge_plan_test.go │ └── sort.go ├── optimize.go ├── persister.go ├── postings.go ├── postings_all.go ├── segment.go ├── segment_plugin.go ├── sizes.go ├── snapshot.go ├── snapshot_test.go ├── stats.go ├── unadorned.go ├── writer.go ├── writer_offline.go └── writer_test.go ├── index_test.go ├── multisearch.go ├── multisearch_test.go ├── numeric ├── bin.go ├── bin_test.go ├── float.go ├── float_test.go ├── geo │ ├── README.md │ ├── benchmark_geohash_test.go │ ├── geo.go │ ├── geo_dist.go │ ├── geo_dist_test.go │ ├── geo_test.go │ ├── geohash.go │ ├── geohash_test.go │ ├── parse.go │ ├── parse_test.go │ ├── sloppy.go │ ├── sloppy_test.go │ └── versus_test.go ├── prefix_coded.go └── prefix_coded_test.go ├── query.go ├── reader.go ├── search.go ├── search ├── aggregations.go ├── aggregations │ ├── aggregtation_test.go │ ├── cardinality.go │ ├── count.go │ ├── duration.go │ ├── filter.go │ ├── metric.go │ ├── percentiles.go │ ├── range.go │ ├── range_date.go │ └── terms.go ├── collector.go ├── collector │ ├── all.go │ ├── all_test.go │ ├── bench_test.go │ ├── heap.go │ ├── iterator.go │ ├── search_test.go │ ├── size.go │ 
├── slice.go │ ├── topn.go │ └── topn_test.go ├── explanation.go ├── highlight │ ├── format_ansi.go │ ├── format_html.go │ ├── format_html_test.go │ ├── fragment_scorer_simple.go │ ├── fragment_scorer_simple_test.go │ ├── fragment_simple.go │ ├── fragment_simple_test.go │ ├── highlighter.go │ ├── highlighter_ansi.go │ ├── highlighter_html.go │ ├── highlighter_simple.go │ ├── highlighter_simple_test.go │ ├── term_locations.go │ └── term_locations_test.go ├── pool.go ├── pool_test.go ├── search.go ├── search_test.go ├── searcher │ ├── base_test.go │ ├── docstub_test.go │ ├── ordered_searchers_list.go │ ├── search_boolean.go │ ├── search_boolean_test.go │ ├── search_conjunction.go │ ├── search_conjunction_test.go │ ├── search_disjunction.go │ ├── search_disjunction_heap.go │ ├── search_disjunction_slice.go │ ├── search_disjunction_test.go │ ├── search_filter.go │ ├── search_fuzzy.go │ ├── search_fuzzy_test.go │ ├── search_geoboundingbox.go │ ├── search_geoboundingbox_test.go │ ├── search_geopointdistance.go │ ├── search_geopointdistance_test.go │ ├── search_geopolygon.go │ ├── search_geopolygon_test.go │ ├── search_match_all.go │ ├── search_match_all_test.go │ ├── search_match_none.go │ ├── search_match_none_test.go │ ├── search_multi_term.go │ ├── search_numeric_range.go │ ├── search_numeric_range_test.go │ ├── search_phrase.go │ ├── search_phrase_test.go │ ├── search_regexp.go │ ├── search_regexp_test.go │ ├── search_term.go │ ├── search_term_prefix.go │ ├── search_term_range.go │ ├── search_term_range_test.go │ ├── search_term_test.go │ ├── size.go │ └── stub_test.go ├── similarity │ ├── bm25.go │ ├── composite.go │ └── constant.go ├── size.go ├── sort.go ├── source.go ├── util.go └── util_test.go ├── search_test.go ├── size.go ├── test ├── aggregations_test.go ├── basic_test.go ├── fosdem_test.go ├── geo_test.go ├── integration.go ├── integration_test.go ├── phrase_test.go └── sort_test.go ├── writer.go ├── writer_offline.go └── writer_offline_test.go 
/.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - master 5 | pull_request: 6 | name: Lint 7 | jobs: 8 | golangci: 9 | name: lint 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v2 13 | - name: golangci-lint 14 | uses: golangci/golangci-lint-action@v2 15 | with: 16 | version: v1.45.2 17 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - master 5 | pull_request: 6 | name: Tests 7 | jobs: 8 | test: 9 | strategy: 10 | matrix: 11 | go-version: [1.15.x, 1.16.x] 12 | platform: [ubuntu-latest, macos-latest, windows-latest] 13 | runs-on: ${{ matrix.platform }} 14 | steps: 15 | - name: Install Go 16 | uses: actions/setup-go@v1 17 | with: 18 | go-version: ${{ matrix.go-version }} 19 | - name: Checkout code 20 | uses: actions/checkout@v2 21 | - name: Test 22 | run: | 23 | go version 24 | go test -race ./... 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | #* 2 | *.sublime-* 3 | *~ 4 | .#* 5 | .project 6 | .settings 7 | **/.idea/ 8 | **/*.iml 9 | .DS_Store 10 | query_string.y.go.tmp 11 | /analysis/token_filters/cld2/cld2-read-only 12 | /analysis/token_filters/cld2/libcld2_full.a 13 | /cmd/bluge/bluge 14 | vendor/** 15 | !vendor/manifest 16 | /y.output 17 | /search/query/y.output 18 | *.test 19 | tags 20 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | # This is the official list of Bluge authors for copyright purposes. 
2 | # 3 | # Names should be added to this file as one of 4 | # Organization's name 5 | # Individual's name 6 | # Individual's name 7 | # 8 | # Please keep the list sorted. 9 | 10 | Marty Schoch 11 | Michael Schuett 12 | Akshay Shekher 13 | Sergio Rubio 14 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to Bluge 2 | 3 | Bluge is an open source project. 4 | 5 | Thank you for your contribution, we appreciate your help! 6 | 7 | ## Contributing code 8 | 9 | Portions of existing code are copyright Couchbase, Inc. 10 | 11 | All new contributions should be copyright The Bluge Authors. New contributors should add an appropriate entry to the AUTHORS file at the root of the repository. All contributions must be distributed under the Apache License found in the LICENSE file. 12 | -------------------------------------------------------------------------------- /analysis/analyzer/keyword.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package analyzer 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | "github.com/blugelabs/bluge/analysis/tokenizer" 20 | ) 21 | 22 | func NewKeywordAnalyzer() *analysis.Analyzer { 23 | return &analysis.Analyzer{ 24 | Tokenizer: tokenizer.NewSingleTokenTokenizer(), 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /analysis/analyzer/simple.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package analyzer 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | "github.com/blugelabs/bluge/analysis/token" 20 | "github.com/blugelabs/bluge/analysis/tokenizer" 21 | ) 22 | 23 | func NewSimpleAnalyzer() *analysis.Analyzer { 24 | return &analysis.Analyzer{ 25 | Tokenizer: tokenizer.NewLetterTokenizer(), 26 | TokenFilters: []analysis.TokenFilter{ 27 | token.NewLowerCaseFilter(), 28 | }, 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /analysis/analyzer/standard.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package analyzer 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | "github.com/blugelabs/bluge/analysis/token" 20 | "github.com/blugelabs/bluge/analysis/tokenizer" 21 | ) 22 | 23 | func NewStandardAnalyzer() *analysis.Analyzer { 24 | return &analysis.Analyzer{ 25 | Tokenizer: tokenizer.NewUnicodeTokenizer(), 26 | TokenFilters: []analysis.TokenFilter{ 27 | token.NewLowerCaseFilter(), 28 | }, 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /analysis/analyzer/web.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package analyzer 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | "github.com/blugelabs/bluge/analysis/lang/en" 20 | "github.com/blugelabs/bluge/analysis/token" 21 | "github.com/blugelabs/bluge/analysis/tokenizer" 22 | ) 23 | 24 | func NewWebAnalyzer() *analysis.Analyzer { 25 | return &analysis.Analyzer{ 26 | Tokenizer: tokenizer.NewWebTokenizer(), 27 | TokenFilters: []analysis.TokenFilter{ 28 | token.NewLowerCaseFilter(), 29 | en.StopWordsFilter(), 30 | }, 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /analysis/char/html.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package char 16 | 17 | import ( 18 | "regexp" 19 | ) 20 | 21 | var htmlCharFilterRegexp = regexp.MustCompile(`\s]+))?)+\s*|\s*)/?>`) 22 | 23 | func NewHTMLCharFilter() *RegexpCharFilter { 24 | return NewRegexpCharFilter(htmlCharFilterRegexp, []byte(" ")) 25 | } 26 | -------------------------------------------------------------------------------- /analysis/char/regexp.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
// From /analysis/char/regexp.go (package char).

// RegexpCharFilter rewrites input by replacing every match of a regular
// expression with a fixed replacement.
type RegexpCharFilter struct {
	r           *regexp.Regexp // pattern whose matches are replaced
	replacement []byte         // passed to ReplaceAll as the replacement
}

// NewRegexpCharFilter returns a RegexpCharFilter holding r and replacement.
// Both arguments are stored as given; nothing is copied or validated.
func NewRegexpCharFilter(r *regexp.Regexp, replacement []byte) *RegexpCharFilter {
	filter := new(RegexpCharFilter)
	filter.r = r
	filter.replacement = replacement
	return filter
}

// Filter returns the result of regexp.Regexp.ReplaceAll applied to input with
// the stored replacement, so "$" sequences in the replacement are expanded the
// way that method documents.
func (f *RegexpCharFilter) Filter(input []byte) []byte {
	filtered := f.r.ReplaceAll(input, f.replacement)
	return filtered
}

// Next file in this digest: /analysis/char/zerowidthnonjoiner.go.
14 | 15 | package char 16 | 17 | import ( 18 | "regexp" 19 | ) 20 | 21 | var zeroWidthNonJoinerRegexp = regexp.MustCompile(`\x{200C}`) 22 | 23 | func NewZeroWidthNonJoinerCharFilter() *RegexpCharFilter { 24 | return NewRegexpCharFilter(zeroWidthNonJoinerRegexp, []byte(" ")) 25 | } 26 | -------------------------------------------------------------------------------- /analysis/lang/ar/analyzer_ar.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package ar 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | "github.com/blugelabs/bluge/analysis/token" 20 | "github.com/blugelabs/bluge/analysis/tokenizer" 21 | "golang.org/x/text/unicode/norm" 22 | ) 23 | 24 | func Analyzer() *analysis.Analyzer { 25 | return &analysis.Analyzer{ 26 | Tokenizer: tokenizer.NewUnicodeTokenizer(), 27 | TokenFilters: []analysis.TokenFilter{ 28 | token.NewLowerCaseFilter(), 29 | token.NewUnicodeNormalizeFilter(norm.NFKC), 30 | StopWordsFilter(), 31 | NormalizeFilter(), 32 | StemmerFilter(), 33 | }, 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /analysis/lang/ar/stop_filter_ar.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package ar 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/bg/stop_filter_bg.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package bg 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/ca/articles_ca.go: -------------------------------------------------------------------------------- 1 | package ca 2 | 3 | import ( 4 | "github.com/blugelabs/bluge/analysis" 5 | ) 6 | 7 | const ArticlesName = "articles_ca" 8 | 9 | // this content was obtained from: 10 | // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis 11 | 12 | var CatalanArticles = []byte(` 13 | d 14 | l 15 | m 16 | n 17 | s 18 | t 19 | `) 20 | 21 | func Articles() analysis.TokenMap { 22 | rv := analysis.NewTokenMap() 23 | rv.LoadBytes(CatalanArticles) 24 | return rv 25 | } 26 | -------------------------------------------------------------------------------- /analysis/lang/ca/elision_ca.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package ca 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func ElisionFilter() *token.ElisionFilter { 22 | return token.NewElisionFilter(Articles()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/ca/elision_ca_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package ca 16 | 17 | import ( 18 | "reflect" 19 | "testing" 20 | 21 | "github.com/blugelabs/bluge/analysis" 22 | ) 23 | 24 | func TestFrenchElision(t *testing.T) { 25 | tests := []struct { 26 | input analysis.TokenStream 27 | output analysis.TokenStream 28 | }{ 29 | { 30 | input: analysis.TokenStream{ 31 | &analysis.Token{ 32 | Term: []byte("l'Institut"), 33 | }, 34 | &analysis.Token{ 35 | Term: []byte("d'Estudis"), 36 | }, 37 | }, 38 | output: analysis.TokenStream{ 39 | &analysis.Token{ 40 | Term: []byte("Institut"), 41 | }, 42 | &analysis.Token{ 43 | Term: []byte("Estudis"), 44 | }, 45 | }, 46 | }, 47 | } 48 | 49 | elisionFilter := ElisionFilter() 50 | for _, test := range tests { 51 | actual := elisionFilter.Filter(test.input) 52 | if !reflect.DeepEqual(actual, test.output) { 53 | t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term) 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /analysis/lang/ca/stop_filter_ca.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package ca 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/cjk/analyzer_cjk.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package cjk 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | "github.com/blugelabs/bluge/analysis/token" 20 | "github.com/blugelabs/bluge/analysis/tokenizer" 21 | ) 22 | 23 | func Analyzer() *analysis.Analyzer { 24 | return &analysis.Analyzer{ 25 | Tokenizer: tokenizer.NewUnicodeTokenizer(), 26 | TokenFilters: []analysis.TokenFilter{ 27 | NewWidthFilter(), 28 | token.NewLowerCaseFilter(), 29 | NewBigramFilter(false), 30 | }, 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /analysis/lang/ckb/analyzer_ckb.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package ckb 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | "github.com/blugelabs/bluge/analysis/token" 20 | "github.com/blugelabs/bluge/analysis/tokenizer" 21 | ) 22 | 23 | func Analyzer() *analysis.Analyzer { 24 | return &analysis.Analyzer{ 25 | Tokenizer: tokenizer.NewUnicodeTokenizer(), 26 | TokenFilters: []analysis.TokenFilter{ 27 | NormalizeFilter(), 28 | token.NewLowerCaseFilter(), 29 | StopWordsFilter(), 30 | StemmerFilter(), 31 | }, 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /analysis/lang/ckb/analyzer_ckb_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package ckb 16 | 17 | import ( 18 | "reflect" 19 | "testing" 20 | 21 | "github.com/blugelabs/bluge/analysis" 22 | ) 23 | 24 | func TestSoraniAnalyzer(t *testing.T) { 25 | tests := []struct { 26 | input []byte 27 | output analysis.TokenStream 28 | }{ 29 | // stop word removal 30 | { 31 | input: []byte("ئەم پیاوە"), 32 | output: analysis.TokenStream{ 33 | &analysis.Token{ 34 | Term: []byte("پیاو"), 35 | PositionIncr: 2, 36 | Start: 7, 37 | End: 17, 38 | }, 39 | }, 40 | }, 41 | { 42 | input: []byte("پیاوە"), 43 | output: analysis.TokenStream{ 44 | &analysis.Token{ 45 | Term: []byte("پیاو"), 46 | PositionIncr: 1, 47 | Start: 0, 48 | End: 10, 49 | }, 50 | }, 51 | }, 52 | { 53 | input: []byte("پیاو"), 54 | output: analysis.TokenStream{ 55 | &analysis.Token{ 56 | Term: []byte("پیاو"), 57 | PositionIncr: 1, 58 | Start: 0, 59 | End: 8, 60 | }, 61 | }, 62 | }, 63 | } 64 | 65 | analyzer := Analyzer() 66 | for _, test := range tests { 67 | actual := analyzer.Analyze(test.input) 68 | if !reflect.DeepEqual(actual, test.output) { 69 | t.Errorf("expected %v, got %v", test.output, actual) 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /analysis/lang/ckb/stop_filter_ckb.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package ckb 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/cs/stop_filter_cs.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package cs 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/da/analyzer_da.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package da 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | "github.com/blugelabs/bluge/analysis/token" 20 | "github.com/blugelabs/bluge/analysis/tokenizer" 21 | ) 22 | 23 | func Analyzer() *analysis.Analyzer { 24 | return &analysis.Analyzer{ 25 | Tokenizer: tokenizer.NewUnicodeTokenizer(), 26 | TokenFilters: []analysis.TokenFilter{ 27 | token.NewLowerCaseFilter(), 28 | StopWordsFilter(), 29 | StemmerFilter(), 30 | }, 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /analysis/lang/da/analyzer_da_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package da 16 | 17 | import ( 18 | "reflect" 19 | "testing" 20 | 21 | "github.com/blugelabs/bluge/analysis" 22 | ) 23 | 24 | func TestDanishAnalyzer(t *testing.T) { 25 | tests := []struct { 26 | input []byte 27 | output analysis.TokenStream 28 | }{ 29 | // stemming 30 | { 31 | input: []byte("undersøg"), 32 | output: analysis.TokenStream{ 33 | &analysis.Token{ 34 | Term: []byte("undersøg"), 35 | PositionIncr: 1, 36 | Start: 0, 37 | End: 9, 38 | }, 39 | }, 40 | }, 41 | { 42 | input: []byte("undersøgelse"), 43 | output: analysis.TokenStream{ 44 | &analysis.Token{ 45 | Term: []byte("undersøg"), 46 | PositionIncr: 1, 47 | Start: 0, 48 | End: 13, 49 | }, 50 | }, 51 | }, 52 | // stop word 53 | { 54 | input: []byte("på"), 55 | output: analysis.TokenStream{}, 56 | }, 57 | } 58 | 59 | analyzer := Analyzer() 60 | for _, test := range tests { 61 | actual := analyzer.Analyze(test.input) 62 | if !reflect.DeepEqual(actual, test.output) { 63 | t.Errorf("expected %v, got %v", test.output, actual) 64 | } 65 | } 66 | } 67 | -------------------------------------------------------------------------------- /analysis/lang/da/stemmer_da.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package da 16 | 17 | import ( 18 | "github.com/blevesearch/snowballstem" 19 | "github.com/blevesearch/snowballstem/danish" 20 | "github.com/blugelabs/bluge/analysis" 21 | ) 22 | 23 | type DanishStemmerFilter struct { 24 | } 25 | 26 | func StemmerFilter() *DanishStemmerFilter { 27 | return &DanishStemmerFilter{} 28 | } 29 | 30 | func (s *DanishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 31 | for _, token := range input { 32 | env := snowballstem.NewEnv(string(token.Term)) 33 | danish.Stem(env) 34 | token.Term = []byte(env.Current()) 35 | } 36 | return input 37 | } 38 | -------------------------------------------------------------------------------- /analysis/lang/da/stop_filter_da.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package da 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/de/analyzer_de.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package de 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | "github.com/blugelabs/bluge/analysis/token" 20 | "github.com/blugelabs/bluge/analysis/tokenizer" 21 | ) 22 | 23 | func Analyzer() *analysis.Analyzer { 24 | return &analysis.Analyzer{ 25 | Tokenizer: tokenizer.NewUnicodeTokenizer(), 26 | TokenFilters: []analysis.TokenFilter{ 27 | token.NewLowerCaseFilter(), 28 | StopWordsFilter(), 29 | NormalizeFilter(), 30 | LightStemmerFilter(), 31 | }, 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /analysis/lang/de/stemmer_de_snowball.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package de 16 | 17 | import ( 18 | "github.com/blevesearch/snowballstem" 19 | "github.com/blevesearch/snowballstem/german" 20 | "github.com/blugelabs/bluge/analysis" 21 | ) 22 | 23 | type GermanStemmerFilter struct { 24 | } 25 | 26 | func StemmerFilter() *GermanStemmerFilter { 27 | return &GermanStemmerFilter{} 28 | } 29 | 30 | func (s *GermanStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 31 | for _, token := range input { 32 | env := snowballstem.NewEnv(string(token.Term)) 33 | german.Stem(env) 34 | token.Term = []byte(env.Current()) 35 | } 36 | return input 37 | } 38 | -------------------------------------------------------------------------------- /analysis/lang/de/stop_filter_de.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package de 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/el/stop_filter_el.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package el 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/el/stop_words_el.go: -------------------------------------------------------------------------------- 1 | package el 2 | 3 | import ( 4 | "github.com/blugelabs/bluge/analysis" 5 | ) 6 | 7 | // this content was obtained from: 8 | // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/ 9 | // ` was changed to ' to allow for literal string 10 | 11 | var StopWordsBytes = []byte(`# Lucene Greek Stopwords list 12 | # Note: by default this file is used after GreekLowerCaseFilter, 13 | # so when modifying this file use 'σ' instead of 'ς' 14 | ο 15 | η 16 | το 17 | οι 18 | τα 19 | του 20 | τησ 21 | των 22 | τον 23 | την 24 | και 25 | κι 26 | κ 27 | ειμαι 28 | εισαι 29 | ειναι 30 | ειμαστε 31 | ειστε 32 | στο 33 | στον 34 | στη 35 | στην 36 | μα 37 | αλλα 38 | απο 39 | για 40 | προσ 41 | με 42 | σε 43 | ωσ 44 | παρα 45 | αντι 46 | κατα 47 | μετα 48 | θα 49 | να 50 | δε 51 | δεν 52 | μη 53 | μην 54 | επι 55 | ενω 56 | εαν 57 | αν 58 | τοτε 59 | που 60 | πωσ 61 | ποιοσ 62 | ποια 63 | ποιο 64 | ποιοι 65 | ποιεσ 66 | ποιων 67 | ποιουσ 68 | αυτοσ 69 | αυτη 70 
| αυτο 71 | αυτοι 72 | αυτων 73 | αυτουσ 74 | αυτεσ 75 | αυτα 76 | εκεινοσ 77 | εκεινη 78 | εκεινο 79 | εκεινοι 80 | εκεινεσ 81 | εκεινα 82 | εκεινων 83 | εκεινουσ 84 | οπωσ 85 | ομωσ 86 | ισωσ 87 | οσο 88 | οτι 89 | `) 90 | 91 | func StopWords() analysis.TokenMap { 92 | rv := analysis.NewTokenMap() 93 | rv.LoadBytes(StopWordsBytes) 94 | return rv 95 | } 96 | -------------------------------------------------------------------------------- /analysis/lang/en/analyzer_en.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package en implements an analyzer with reasonable defaults for processing 16 | // English text. 17 | // 18 | // It strips possessive suffixes ('s), transforms tokens to lower case, 19 | // removes stopwords from a built-in list, and applies porter stemming. 20 | // 21 | // The built-in stopwords list is defined in EnglishStopWords. 
22 | package en 23 | 24 | import ( 25 | "github.com/blugelabs/bluge/analysis" 26 | "github.com/blugelabs/bluge/analysis/token" 27 | "github.com/blugelabs/bluge/analysis/tokenizer" 28 | ) 29 | 30 | const AnalyzerName = "en" 31 | 32 | func NewAnalyzer() *analysis.Analyzer { 33 | return &analysis.Analyzer{ 34 | Tokenizer: tokenizer.NewUnicodeTokenizer(), 35 | TokenFilters: []analysis.TokenFilter{ 36 | NewPossessiveFilter(), 37 | token.NewLowerCaseFilter(), 38 | StopWordsFilter(), 39 | StemmerFilter(), 40 | }, 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /analysis/lang/en/possessive_filter_en.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package en 16 | 17 | import ( 18 | "unicode/utf8" 19 | 20 | "github.com/blugelabs/bluge/analysis" 21 | ) 22 | 23 | const rightSingleQuotationMark = '’' 24 | const apostrophe = '\'' 25 | const fullWidthApostrophe = ''' 26 | 27 | // PossessiveFilter implements a TokenFilter which 28 | // strips the English possessive suffix ('s) from tokens. 29 | // It handle a variety of apostrophe types, is case-insensitive 30 | // and doesn't distinguish between possessive and contraction. 
31 | // (ie "She's So Rad" becomes "She So Rad") 32 | type PossessiveFilter struct { 33 | } 34 | 35 | func NewPossessiveFilter() *PossessiveFilter { 36 | return &PossessiveFilter{} 37 | } 38 | 39 | func (s *PossessiveFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 40 | for _, token := range input { 41 | lastRune, lastRuneSize := utf8.DecodeLastRune(token.Term) 42 | if lastRune == 's' || lastRune == 'S' { 43 | nextLastRune, nextLastRuneSize := utf8.DecodeLastRune(token.Term[:len(token.Term)-lastRuneSize]) 44 | if nextLastRune == rightSingleQuotationMark || 45 | nextLastRune == apostrophe || 46 | nextLastRune == fullWidthApostrophe { 47 | token.Term = token.Term[:len(token.Term)-lastRuneSize-nextLastRuneSize] 48 | } 49 | } 50 | } 51 | return input 52 | } 53 | -------------------------------------------------------------------------------- /analysis/lang/en/stemmer_en_snowball.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package en 16 | 17 | import ( 18 | "github.com/blevesearch/snowballstem" 19 | "github.com/blevesearch/snowballstem/english" 20 | "github.com/blugelabs/bluge/analysis" 21 | ) 22 | 23 | type EnglishStemmerFilter struct { 24 | } 25 | 26 | func StemmerFilter() *EnglishStemmerFilter { 27 | return &EnglishStemmerFilter{} 28 | } 29 | 30 | func (s *EnglishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 31 | for _, token := range input { 32 | env := snowballstem.NewEnv(string(token.Term)) 33 | english.Stem(env) 34 | token.Term = []byte(env.Current()) 35 | } 36 | return input 37 | } 38 | -------------------------------------------------------------------------------- /analysis/lang/en/stemmer_en_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package en 16 | 17 | import ( 18 | "reflect" 19 | "testing" 20 | 21 | "github.com/blugelabs/bluge/analysis" 22 | ) 23 | 24 | func TestSnowballEnglishStemmer(t *testing.T) { 25 | tests := []struct { 26 | input analysis.TokenStream 27 | output analysis.TokenStream 28 | }{ 29 | { 30 | input: analysis.TokenStream{ 31 | &analysis.Token{ 32 | Term: []byte("enjoy"), 33 | }, 34 | }, 35 | output: analysis.TokenStream{ 36 | &analysis.Token{ 37 | Term: []byte("enjoy"), 38 | }, 39 | }, 40 | }, 41 | { 42 | input: analysis.TokenStream{ 43 | &analysis.Token{ 44 | Term: []byte("enjoyed"), 45 | }, 46 | }, 47 | output: analysis.TokenStream{ 48 | &analysis.Token{ 49 | Term: []byte("enjoy"), 50 | }, 51 | }, 52 | }, 53 | { 54 | input: analysis.TokenStream{ 55 | &analysis.Token{ 56 | Term: []byte("enjoyable"), 57 | }, 58 | }, 59 | output: analysis.TokenStream{ 60 | &analysis.Token{ 61 | Term: []byte("enjoy"), 62 | }, 63 | }, 64 | }, 65 | } 66 | 67 | filter := StemmerFilter() 68 | for _, test := range tests { 69 | actual := filter.Filter(test.input) 70 | if !reflect.DeepEqual(actual, test.output) { 71 | t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term) 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /analysis/lang/en/stop_filter_en.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package en 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/es/analyzer_es.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package es 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | "github.com/blugelabs/bluge/analysis/token" 20 | "github.com/blugelabs/bluge/analysis/tokenizer" 21 | ) 22 | 23 | func Analyzer() *analysis.Analyzer { 24 | return &analysis.Analyzer{ 25 | Tokenizer: tokenizer.NewUnicodeTokenizer(), 26 | TokenFilters: []analysis.TokenFilter{ 27 | token.NewLowerCaseFilter(), 28 | StopWordsFilter(), 29 | LightStemmerFilter(), 30 | }, 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /analysis/lang/es/stemmer_es_snowball.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package es 16 | 17 | import ( 18 | "github.com/blevesearch/snowballstem" 19 | "github.com/blevesearch/snowballstem/spanish" 20 | "github.com/blugelabs/bluge/analysis" 21 | ) 22 | 23 | type SpanishStemmerFilter struct { 24 | } 25 | 26 | func StemmerFilter() *SpanishStemmerFilter { 27 | return &SpanishStemmerFilter{} 28 | } 29 | 30 | func (s *SpanishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 31 | for _, token := range input { 32 | env := snowballstem.NewEnv(string(token.Term)) 33 | spanish.Stem(env) 34 | token.Term = []byte(env.Current()) 35 | } 36 | return input 37 | } 38 | -------------------------------------------------------------------------------- /analysis/lang/es/stop_filter_es.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package es 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/eu/stop_filter_eu.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package eu 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/eu/stop_words_eu.go: -------------------------------------------------------------------------------- 1 | package eu 2 | 3 | import ( 4 | "github.com/blugelabs/bluge/analysis" 5 | ) 6 | 7 | // this content was obtained from: 8 | // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/ 9 | // ` was changed to ' to allow for literal string 10 | 11 | var StopWordsBytes = []byte(`# example set of basque stopwords 12 | al 13 | anitz 14 | arabera 15 | asko 16 | baina 17 | bat 18 | batean 19 | batek 20 | bati 21 | batzuei 22 | batzuek 23 | batzuetan 24 | batzuk 25 | bera 26 | beraiek 27 | berau 28 | berauek 29 | bere 30 | berori 31 | beroriek 32 | beste 33 | bezala 34 | da 35 | dago 36 | dira 37 | ditu 38 | du 39 | dute 40 | edo 41 | egin 42 | ere 43 | eta 44 | eurak 45 | ez 46 | gainera 47 | gu 48 | gutxi 49 | guzti 50 | haiei 51 | haiek 52 | haietan 53 | hainbeste 54 | hala 55 | han 56 | handik 57 | hango 58 | hara 59 | hari 60 | hark 61 | hartan 62 | hau 63 | hauei 64 | hauek 65 | hauetan 66 | hemen 67 | hemendik 68 | hemengo 69 | hi 70 | hona 71 | honek 72 | honela 73 | honetan 74 | honi 75 | hor 76 | hori 77 | horiei 78 | horiek 79 | horietan 80 | horko 81 | horra 82 | horrek 83 | horrela 84 | horretan 85 | horri 86 | hortik 87 | hura 88 | izan 89 | ni 90 | noiz 91 | nola 92 | non 93 | nondik 94 | nongo 95 | nor 96 | nora 97 | ze 98 | zein 99 | zen 100 | zenbait 101 | zenbat 102 | zer 103 | zergatik 104 | ziren 105 | zituen 106 | zu 107 | zuek 108 | zuen 109 | zuten 110 | `) 111 | 112 | func StopWords() analysis.TokenMap { 113 | rv := analysis.NewTokenMap() 114 | rv.LoadBytes(StopWordsBytes) 115 | return rv 116 | } 117 | 
-------------------------------------------------------------------------------- /analysis/lang/fa/analyzer_fa.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package fa 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | "github.com/blugelabs/bluge/analysis/char" 20 | "github.com/blugelabs/bluge/analysis/lang/ar" 21 | "github.com/blugelabs/bluge/analysis/token" 22 | "github.com/blugelabs/bluge/analysis/tokenizer" 23 | ) 24 | 25 | func Analyzer() *analysis.Analyzer { 26 | return &analysis.Analyzer{ 27 | CharFilters: []analysis.CharFilter{ 28 | char.NewZeroWidthNonJoinerCharFilter(), 29 | }, 30 | Tokenizer: tokenizer.NewUnicodeTokenizer(), 31 | TokenFilters: []analysis.TokenFilter{ 32 | token.NewLowerCaseFilter(), 33 | ar.NormalizeFilter(), 34 | NormalizeFilter(), 35 | StopWordsFilter(), 36 | }, 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /analysis/lang/fa/persian_normalize.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package fa 16 | 17 | import ( 18 | "bytes" 19 | 20 | "github.com/blugelabs/bluge/analysis" 21 | ) 22 | 23 | const ( 24 | Yeh = '\u064A' 25 | FarsiYeh = '\u06CC' 26 | YehBarree = '\u06D2' 27 | Keheh = '\u06A9' 28 | Kaf = '\u0643' 29 | HamzaAbove = '\u0654' 30 | HehYeh = '\u06C0' 31 | HehGoal = '\u06C1' 32 | Heh = '\u0647' 33 | ) 34 | 35 | type PersianNormalizeFilter struct { 36 | } 37 | 38 | func NormalizeFilter() *PersianNormalizeFilter { 39 | return &PersianNormalizeFilter{} 40 | } 41 | 42 | func (s *PersianNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 43 | for _, token := range input { 44 | term := normalize(token.Term) 45 | token.Term = term 46 | } 47 | return input 48 | } 49 | 50 | func normalize(input []byte) []byte { 51 | runes := bytes.Runes(input) 52 | for i := 0; i < len(runes); i++ { 53 | switch runes[i] { 54 | case FarsiYeh, YehBarree: 55 | runes[i] = Yeh 56 | case Keheh: 57 | runes[i] = Kaf 58 | case HehYeh, HehGoal: 59 | runes[i] = Heh 60 | case HamzaAbove: // necessary for HEH + HAMZA 61 | runes = analysis.DeleteRune(runes, i) 62 | i-- 63 | } 64 | } 65 | return analysis.BuildTermFromRunes(runes) 66 | } 67 | -------------------------------------------------------------------------------- /analysis/lang/fa/stop_filter_fa.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package fa 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/fi/analyzer_fi.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package fi 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | "github.com/blugelabs/bluge/analysis/token" 20 | "github.com/blugelabs/bluge/analysis/tokenizer" 21 | ) 22 | 23 | func Analyzer() *analysis.Analyzer { 24 | return &analysis.Analyzer{ 25 | Tokenizer: tokenizer.NewUnicodeTokenizer(), 26 | TokenFilters: []analysis.TokenFilter{ 27 | token.NewLowerCaseFilter(), 28 | StopWordsFilter(), 29 | StemmerFilter(), 30 | }, 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /analysis/lang/fi/analyzer_fi_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package fi 16 | 17 | import ( 18 | "reflect" 19 | "testing" 20 | 21 | "github.com/blugelabs/bluge/analysis" 22 | ) 23 | 24 | func TestFinishAnalyzer(t *testing.T) { 25 | tests := []struct { 26 | input []byte 27 | output analysis.TokenStream 28 | }{ 29 | // stemming 30 | { 31 | input: []byte("edeltäjiinsä"), 32 | output: analysis.TokenStream{ 33 | &analysis.Token{ 34 | Term: []byte("edeltäj"), 35 | }, 36 | }, 37 | }, 38 | { 39 | input: []byte("edeltäjistään"), 40 | output: analysis.TokenStream{ 41 | &analysis.Token{ 42 | Term: []byte("edeltäj"), 43 | }, 44 | }, 45 | }, 46 | // stop word 47 | { 48 | input: []byte("olla"), 49 | output: analysis.TokenStream{}, 50 | }, 51 | } 52 | 53 | analyzer := Analyzer() 54 | for _, test := range tests { 55 | actual := analyzer.Analyze(test.input) 56 | if len(actual) != len(test.output) { 57 | t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) 58 | } 59 | for i, tok := range actual { 60 | if !reflect.DeepEqual(tok.Term, test.output[i].Term) { 61 | t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) 62 | } 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /analysis/lang/fi/stemmer_fi.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package fi 16 | 17 | import ( 18 | "github.com/blevesearch/snowballstem" 19 | "github.com/blevesearch/snowballstem/finnish" 20 | "github.com/blugelabs/bluge/analysis" 21 | ) 22 | 23 | type FinnishStemmerFilter struct { 24 | } 25 | 26 | func StemmerFilter() *FinnishStemmerFilter { 27 | return &FinnishStemmerFilter{} 28 | } 29 | 30 | func (s *FinnishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 31 | for _, token := range input { 32 | env := snowballstem.NewEnv(string(token.Term)) 33 | finnish.Stem(env) 34 | token.Term = []byte(env.Current()) 35 | } 36 | return input 37 | } 38 | -------------------------------------------------------------------------------- /analysis/lang/fi/stop_filter_fi.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package fi 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/fr/analyzer_fr.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package fr 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | "github.com/blugelabs/bluge/analysis/token" 20 | "github.com/blugelabs/bluge/analysis/tokenizer" 21 | ) 22 | 23 | func Analyzer() *analysis.Analyzer { 24 | return &analysis.Analyzer{ 25 | Tokenizer: tokenizer.NewUnicodeTokenizer(), 26 | TokenFilters: []analysis.TokenFilter{ 27 | token.NewLowerCaseFilter(), 28 | ElisionFilter(), 29 | StopWordsFilter(), 30 | LightStemmerFilter(), 31 | }, 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /analysis/lang/fr/articles_fr.go: -------------------------------------------------------------------------------- 1 | package fr 2 | 3 | import ( 4 | "github.com/blugelabs/bluge/analysis" 5 | ) 6 | 7 | // this content was obtained from: 8 | // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis 9 | 10 | var FrenchArticles = []byte(` 11 | l 12 | m 13 | t 14 | qu 15 | n 16 | s 17 | j 18 | d 19 | c 20 | jusqu 21 | quoiqu 22 | lorsqu 23 | puisqu 24 | `) 25 | 26 | func Articles() analysis.TokenMap { 27 | rv := analysis.NewTokenMap() 28 | rv.LoadBytes(FrenchArticles) 29 | return rv 30 | } 31 | -------------------------------------------------------------------------------- /analysis/lang/fr/elision_fr.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package fr 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func ElisionFilter() *token.ElisionFilter { 22 | return token.NewElisionFilter(Articles()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/fr/elision_fr_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package fr 16 | 17 | import ( 18 | "reflect" 19 | "testing" 20 | 21 | "github.com/blugelabs/bluge/analysis" 22 | ) 23 | 24 | func TestFrenchElision(t *testing.T) { 25 | tests := []struct { 26 | input analysis.TokenStream 27 | output analysis.TokenStream 28 | }{ 29 | { 30 | input: analysis.TokenStream{ 31 | &analysis.Token{ 32 | Term: []byte("l'avion"), 33 | }, 34 | }, 35 | output: analysis.TokenStream{ 36 | &analysis.Token{ 37 | Term: []byte("avion"), 38 | }, 39 | }, 40 | }, 41 | } 42 | 43 | elisionFilter := ElisionFilter() 44 | for _, test := range tests { 45 | actual := elisionFilter.Filter(test.input) 46 | if !reflect.DeepEqual(actual, test.output) { 47 | t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term) 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /analysis/lang/fr/stemmer_fr_snowball.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package fr 16 | 17 | import ( 18 | "github.com/blevesearch/snowballstem" 19 | "github.com/blevesearch/snowballstem/french" 20 | "github.com/blugelabs/bluge/analysis" 21 | ) 22 | 23 | type FrenchStemmerFilter struct { 24 | } 25 | 26 | func StemmerFilter() *FrenchStemmerFilter { 27 | return &FrenchStemmerFilter{} 28 | } 29 | 30 | func (s *FrenchStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 31 | for _, token := range input { 32 | env := snowballstem.NewEnv(string(token.Term)) 33 | french.Stem(env) 34 | token.Term = []byte(env.Current()) 35 | } 36 | return input 37 | } 38 | -------------------------------------------------------------------------------- /analysis/lang/fr/stop_filter_fr.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package fr 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/ga/articles_ga.go: -------------------------------------------------------------------------------- 1 | package ga 2 | 3 | import ( 4 | "github.com/blugelabs/bluge/analysis" 5 | ) 6 | 7 | // this content was obtained from: 8 | // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis 9 | 10 | var IrishArticles = []byte(` 11 | d 12 | m 13 | b 14 | `) 15 | 16 | func Articles() analysis.TokenMap { 17 | rv := analysis.NewTokenMap() 18 | rv.LoadBytes(IrishArticles) 19 | return rv 20 | } 21 | -------------------------------------------------------------------------------- /analysis/lang/ga/elision_ga.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package ga 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func ElisionFilter() *token.ElisionFilter { 22 | return token.NewElisionFilter(Articles()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/ga/elision_ga_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package ga 16 | 17 | import ( 18 | "reflect" 19 | "testing" 20 | 21 | "github.com/blugelabs/bluge/analysis" 22 | ) 23 | 24 | func TestFrenchElision(t *testing.T) { 25 | tests := []struct { 26 | input analysis.TokenStream 27 | output analysis.TokenStream 28 | }{ 29 | { 30 | input: analysis.TokenStream{ 31 | &analysis.Token{ 32 | Term: []byte("b'fhearr"), 33 | }, 34 | }, 35 | output: analysis.TokenStream{ 36 | &analysis.Token{ 37 | Term: []byte("fhearr"), 38 | }, 39 | }, 40 | }, 41 | } 42 | 43 | elisionFilter := ElisionFilter() 44 | for _, test := range tests { 45 | actual := elisionFilter.Filter(test.input) 46 | if !reflect.DeepEqual(actual, test.output) { 47 | t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term) 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /analysis/lang/ga/stop_filter_ga.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package ga 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/ga/stop_words_ga.go: -------------------------------------------------------------------------------- 1 | package ga 2 | 3 | import ( 4 | "github.com/blugelabs/bluge/analysis" 5 | ) 6 | 7 | // this content was obtained from: 8 | // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ 9 | // ` was changed to ' to allow for literal string 10 | 11 | var StopWordsBytes = []byte(` 12 | a 13 | ach 14 | ag 15 | agus 16 | an 17 | aon 18 | ar 19 | arna 20 | as 21 | b' 22 | ba 23 | beirt 24 | bhúr 25 | caoga 26 | ceathair 27 | ceathrar 28 | chomh 29 | chtó 30 | chuig 31 | chun 32 | cois 33 | céad 34 | cúig 35 | cúigear 36 | d' 37 | daichead 38 | dar 39 | de 40 | deich 41 | deichniúr 42 | den 43 | dhá 44 | do 45 | don 46 | dtí 47 | dá 48 | dár 49 | dó 50 | faoi 51 | faoin 52 | faoina 53 | faoinár 54 | fara 55 | fiche 56 | gach 57 | gan 58 | go 59 | gur 60 | haon 61 | hocht 62 | i 63 | iad 64 | idir 65 | in 66 | ina 67 | ins 68 | inár 69 | is 70 | le 71 | leis 72 | lena 73 | lenár 74 | m' 75 | mar 76 | mo 77 | mé 78 | na 79 | nach 80 | naoi 81 | naonúr 82 | ná 83 | ní 84 | níor 85 | nó 86 | nócha 87 | ocht 88 | ochtar 89 | os 90 | roimh 91 | sa 92 | seacht 93 | seachtar 94 | seachtó 95 | seasca 96 | seisear 97 | siad 98 | sibh 99 | sinn 100 | sna 101 | sé 102 | sí 103 | tar 104 | thar 105 | thú 106 | triúr 107 | trí 108 | trína 109 | trínár 110 | tríocha 111 | tú 112 | um 113 | ár 114 | é 115 | éis 116 | í 117 | ó 118 | ón 119 | óna 120 | ónár 121 | `) 122 | 123 | func StopWords() analysis.TokenMap { 124 | rv := analysis.NewTokenMap() 125 | rv.LoadBytes(StopWordsBytes) 126 | return rv 127 | } 128 | 
-------------------------------------------------------------------------------- /analysis/lang/gl/stop_filter_gl.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package gl 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/hi/analyzer_hi.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package hi 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | "github.com/blugelabs/bluge/analysis/lang/in" 20 | "github.com/blugelabs/bluge/analysis/token" 21 | "github.com/blugelabs/bluge/analysis/tokenizer" 22 | ) 23 | 24 | func Analyzer() *analysis.Analyzer { 25 | return &analysis.Analyzer{ 26 | Tokenizer: tokenizer.NewUnicodeTokenizer(), 27 | TokenFilters: []analysis.TokenFilter{ 28 | token.NewLowerCaseFilter(), 29 | in.NormalizeFilter(), 30 | NormalizeFilter(), 31 | StopWordsFilter(), 32 | StemmerFilter(), 33 | }, 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /analysis/lang/hi/analyzer_hi_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package hi 16 | 17 | import ( 18 | "reflect" 19 | "testing" 20 | 21 | "github.com/blugelabs/bluge/analysis" 22 | ) 23 | 24 | func TestHindiAnalyzer(t *testing.T) { 25 | tests := []struct { 26 | input []byte 27 | output analysis.TokenStream 28 | }{ 29 | // two ways to write 'hindi' itself 30 | { 31 | input: []byte("हिन्दी"), 32 | output: analysis.TokenStream{ 33 | &analysis.Token{ 34 | Term: []byte("हिंद"), 35 | PositionIncr: 1, 36 | Start: 0, 37 | End: 18, 38 | }, 39 | }, 40 | }, 41 | { 42 | input: []byte("हिंदी"), 43 | output: analysis.TokenStream{ 44 | &analysis.Token{ 45 | Term: []byte("हिंद"), 46 | PositionIncr: 1, 47 | Start: 0, 48 | End: 15, 49 | }, 50 | }, 51 | }, 52 | } 53 | 54 | analyzer := Analyzer() 55 | for _, test := range tests { 56 | actual := analyzer.Analyze(test.input) 57 | if !reflect.DeepEqual(actual, test.output) { 58 | t.Errorf("expected %v, got %v", test.output, actual) 59 | } 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /analysis/lang/hi/stop_filter_hi.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package hi 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/hu/analyzer_hu.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package hu 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | "github.com/blugelabs/bluge/analysis/token" 20 | "github.com/blugelabs/bluge/analysis/tokenizer" 21 | ) 22 | 23 | func Analyzer() *analysis.Analyzer { 24 | return &analysis.Analyzer{ 25 | Tokenizer: tokenizer.NewUnicodeTokenizer(), 26 | TokenFilters: []analysis.TokenFilter{ 27 | token.NewLowerCaseFilter(), 28 | StopWordsFilter(), 29 | StemmerFilter(), 30 | }, 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /analysis/lang/hu/analyzer_hu_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package hu 16 | 17 | import ( 18 | "reflect" 19 | "testing" 20 | 21 | "github.com/blugelabs/bluge/analysis" 22 | ) 23 | 24 | func TestHungarianAnalyzer(t *testing.T) { 25 | tests := []struct { 26 | input []byte 27 | output analysis.TokenStream 28 | }{ 29 | // stemming 30 | { 31 | input: []byte("babakocsi"), 32 | output: analysis.TokenStream{ 33 | &analysis.Token{ 34 | Term: []byte("babakocs"), 35 | }, 36 | }, 37 | }, 38 | { 39 | input: []byte("babakocsijáért"), 40 | output: analysis.TokenStream{ 41 | &analysis.Token{ 42 | Term: []byte("babakocs"), 43 | }, 44 | }, 45 | }, 46 | // stop word 47 | { 48 | input: []byte("által"), 49 | output: analysis.TokenStream{}, 50 | }, 51 | } 52 | 53 | analyzer := Analyzer() 54 | for _, test := range tests { 55 | actual := analyzer.Analyze(test.input) 56 | if len(actual) != len(test.output) { 57 | t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) 58 | } 59 | for i, tok := range actual { 60 | if !reflect.DeepEqual(tok.Term, test.output[i].Term) { 61 | t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) 62 | } 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /analysis/lang/hu/stemmer_hu.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package hu 16 | 17 | import ( 18 | "github.com/blevesearch/snowballstem" 19 | "github.com/blevesearch/snowballstem/hungarian" 20 | "github.com/blugelabs/bluge/analysis" 21 | ) 22 | 23 | type HungarianStemmerFilter struct { 24 | } 25 | 26 | func StemmerFilter() *HungarianStemmerFilter { 27 | return &HungarianStemmerFilter{} 28 | } 29 | 30 | func (s *HungarianStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 31 | for _, token := range input { 32 | env := snowballstem.NewEnv(string(token.Term)) 33 | hungarian.Stem(env) 34 | token.Term = []byte(env.Current()) 35 | } 36 | return input 37 | } 38 | -------------------------------------------------------------------------------- /analysis/lang/hu/stop_filter_hu.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package hu 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/hy/stop_filter_hy.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package hy 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/hy/stop_words_hy.go: -------------------------------------------------------------------------------- 1 | package hy 2 | 3 | import ( 4 | "github.com/blugelabs/bluge/analysis" 5 | ) 6 | 7 | // this content was obtained from: 8 | // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/ 9 | // ` was changed to ' to allow for literal string 10 | 11 | var StopWordsBytes = []byte(`# example set of Armenian stopwords. 
12 | այդ 13 | այլ 14 | այն 15 | այս 16 | դու 17 | դուք 18 | եմ 19 | են 20 | ենք 21 | ես 22 | եք 23 | է 24 | էի 25 | էին 26 | էինք 27 | էիր 28 | էիք 29 | էր 30 | ըստ 31 | թ 32 | ի 33 | ին 34 | իսկ 35 | իր 36 | կամ 37 | համար 38 | հետ 39 | հետո 40 | մենք 41 | մեջ 42 | մի 43 | ն 44 | նա 45 | նաև 46 | նրա 47 | նրանք 48 | որ 49 | որը 50 | որոնք 51 | որպես 52 | ու 53 | ում 54 | պիտի 55 | վրա 56 | և 57 | `) 58 | 59 | func StopWords() analysis.TokenMap { 60 | rv := analysis.NewTokenMap() 61 | rv.LoadBytes(StopWordsBytes) 62 | return rv 63 | } 64 | -------------------------------------------------------------------------------- /analysis/lang/id/stop_filter_id.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package id 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/in/indic_normalize.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package in 16 | 17 | import ( 18 | "bytes" 19 | 20 | "github.com/blugelabs/bluge/analysis" 21 | ) 22 | 23 | type IndicNormalizeFilter struct { 24 | } 25 | 26 | func NormalizeFilter() *IndicNormalizeFilter { 27 | return &IndicNormalizeFilter{} 28 | } 29 | 30 | func (s *IndicNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 31 | for _, token := range input { 32 | runes := bytes.Runes(token.Term) 33 | runes = normalize(runes) 34 | token.Term = analysis.BuildTermFromRunes(runes) 35 | } 36 | return input 37 | } 38 | -------------------------------------------------------------------------------- /analysis/lang/it/analyzer_it.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package it 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | "github.com/blugelabs/bluge/analysis/token" 20 | "github.com/blugelabs/bluge/analysis/tokenizer" 21 | ) 22 | 23 | func Analyzer() *analysis.Analyzer { 24 | return &analysis.Analyzer{ 25 | Tokenizer: tokenizer.NewUnicodeTokenizer(), 26 | TokenFilters: []analysis.TokenFilter{ 27 | token.NewLowerCaseFilter(), 28 | ElisionFilter(), 29 | StopWordsFilter(), 30 | LightStemmerFilter(), 31 | }, 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /analysis/lang/it/articles_it.go: -------------------------------------------------------------------------------- 1 | package it 2 | 3 | import ( 4 | "github.com/blugelabs/bluge/analysis" 5 | ) 6 | 7 | // this content was obtained from: 8 | // lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis 9 | 10 | var ItalianArticles = []byte(` 11 | c 12 | l 13 | all 14 | dall 15 | dell 16 | nell 17 | sull 18 | coll 19 | pell 20 | gl 21 | agl 22 | dagl 23 | degl 24 | negl 25 | sugl 26 | un 27 | m 28 | t 29 | s 30 | v 31 | d 32 | `) 33 | 34 | func Articles() analysis.TokenMap { 35 | rv := analysis.NewTokenMap() 36 | rv.LoadBytes(ItalianArticles) 37 | return rv 38 | } 39 | -------------------------------------------------------------------------------- /analysis/lang/it/elision_it.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package it 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func ElisionFilter() *token.ElisionFilter { 22 | return token.NewElisionFilter(Articles()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/it/elision_it_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package it 16 | 17 | import ( 18 | "reflect" 19 | "testing" 20 | 21 | "github.com/blugelabs/bluge/analysis" 22 | ) 23 | 24 | func TestItalianElision(t *testing.T) { 25 | tests := []struct { 26 | input analysis.TokenStream 27 | output analysis.TokenStream 28 | }{ 29 | { 30 | input: analysis.TokenStream{ 31 | &analysis.Token{ 32 | Term: []byte("dell'Italia"), 33 | }, 34 | }, 35 | output: analysis.TokenStream{ 36 | &analysis.Token{ 37 | Term: []byte("Italia"), 38 | }, 39 | }, 40 | }, 41 | } 42 | 43 | elisionFilter := ElisionFilter() 44 | for _, test := range tests { 45 | actual := elisionFilter.Filter(test.input) 46 | if !reflect.DeepEqual(actual, test.output) { 47 | t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term) 48 | } 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /analysis/lang/it/light_stemmer_it_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package it 16 | 17 | import ( 18 | "reflect" 19 | "testing" 20 | 21 | "github.com/blugelabs/bluge/analysis" 22 | ) 23 | 24 | func TestItalianLightStemmer(t *testing.T) { 25 | tests := []struct { 26 | input analysis.TokenStream 27 | output analysis.TokenStream 28 | }{ 29 | { 30 | input: analysis.TokenStream{ 31 | &analysis.Token{ 32 | Term: []byte("ragazzo"), 33 | }, 34 | }, 35 | output: analysis.TokenStream{ 36 | &analysis.Token{ 37 | Term: []byte("ragazz"), 38 | }, 39 | }, 40 | }, 41 | { 42 | input: analysis.TokenStream{ 43 | &analysis.Token{ 44 | Term: []byte("ragazzi"), 45 | }, 46 | }, 47 | output: analysis.TokenStream{ 48 | &analysis.Token{ 49 | Term: []byte("ragazz"), 50 | }, 51 | }, 52 | }, 53 | } 54 | 55 | filter := LightStemmerFilter() 56 | for _, test := range tests { 57 | actual := filter.Filter(test.input) 58 | if !reflect.DeepEqual(actual, test.output) { 59 | t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term) 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /analysis/lang/it/stemmer_it_snowball.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package it 16 | 17 | import ( 18 | "github.com/blevesearch/snowballstem" 19 | "github.com/blevesearch/snowballstem/italian" 20 | "github.com/blugelabs/bluge/analysis" 21 | ) 22 | 23 | type ItalianStemmerFilter struct { 24 | } 25 | 26 | func StemmerFilter() *ItalianStemmerFilter { 27 | return &ItalianStemmerFilter{} 28 | } 29 | 30 | func (s *ItalianStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 31 | for _, token := range input { 32 | env := snowballstem.NewEnv(string(token.Term)) 33 | italian.Stem(env) 34 | token.Term = []byte(env.Current()) 35 | } 36 | return input 37 | } 38 | -------------------------------------------------------------------------------- /analysis/lang/it/stop_filter_it.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package it 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/nl/analyzer_nl.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package nl 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | "github.com/blugelabs/bluge/analysis/token" 20 | "github.com/blugelabs/bluge/analysis/tokenizer" 21 | ) 22 | 23 | func Analyzer() *analysis.Analyzer { 24 | return &analysis.Analyzer{ 25 | Tokenizer: tokenizer.NewUnicodeTokenizer(), 26 | TokenFilters: []analysis.TokenFilter{ 27 | token.NewLowerCaseFilter(), 28 | StopWordsFilter(), 29 | StemmerFilter(), 30 | }, 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /analysis/lang/nl/analyzer_nl_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package nl 16 | 17 | import ( 18 | "reflect" 19 | "testing" 20 | 21 | "github.com/blugelabs/bluge/analysis" 22 | ) 23 | 24 | func TestDutchAnalyzer(t *testing.T) { 25 | tests := []struct { 26 | input []byte 27 | output analysis.TokenStream 28 | }{ 29 | // stemming 30 | { 31 | input: []byte("lichamelijk"), 32 | output: analysis.TokenStream{ 33 | &analysis.Token{ 34 | Term: []byte("licham"), 35 | }, 36 | }, 37 | }, 38 | { 39 | input: []byte("lichamelijke"), 40 | output: analysis.TokenStream{ 41 | &analysis.Token{ 42 | Term: []byte("licham"), 43 | }, 44 | }, 45 | }, 46 | // stop word 47 | { 48 | input: []byte("van"), 49 | output: analysis.TokenStream{}, 50 | }, 51 | } 52 | 53 | analyzer := Analyzer() 54 | for _, test := range tests { 55 | actual := analyzer.Analyze(test.input) 56 | if len(actual) != len(test.output) { 57 | t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) 58 | } 59 | for i, tok := range actual { 60 | if !reflect.DeepEqual(tok.Term, test.output[i].Term) { 61 | t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) 62 | } 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /analysis/lang/nl/stemmer_nl.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package nl 16 | 17 | import ( 18 | "github.com/blevesearch/snowballstem" 19 | "github.com/blevesearch/snowballstem/dutch" 20 | "github.com/blugelabs/bluge/analysis" 21 | ) 22 | 23 | type DutchStemmerFilter struct { 24 | } 25 | 26 | func StemmerFilter() *DutchStemmerFilter { 27 | return &DutchStemmerFilter{} 28 | } 29 | 30 | func (s *DutchStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 31 | for _, token := range input { 32 | env := snowballstem.NewEnv(string(token.Term)) 33 | dutch.Stem(env) 34 | token.Term = []byte(env.Current()) 35 | } 36 | return input 37 | } 38 | -------------------------------------------------------------------------------- /analysis/lang/nl/stop_filter_nl.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package nl 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/no/analyzer_no.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package no 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | "github.com/blugelabs/bluge/analysis/token" 20 | "github.com/blugelabs/bluge/analysis/tokenizer" 21 | ) 22 | 23 | func Analyzer() *analysis.Analyzer { 24 | return &analysis.Analyzer{ 25 | Tokenizer: tokenizer.NewUnicodeTokenizer(), 26 | TokenFilters: []analysis.TokenFilter{ 27 | token.NewLowerCaseFilter(), 28 | StopWordsFilter(), 29 | StemmerFilter(), 30 | }, 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /analysis/lang/no/analyzer_no_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package no 16 | 17 | import ( 18 | "reflect" 19 | "testing" 20 | 21 | "github.com/blugelabs/bluge/analysis" 22 | ) 23 | 24 | func TestNorwegianAnalyzer(t *testing.T) { 25 | tests := []struct { 26 | input []byte 27 | output analysis.TokenStream 28 | }{ 29 | // stemming 30 | { 31 | input: []byte("havnedistriktene"), 32 | output: analysis.TokenStream{ 33 | &analysis.Token{ 34 | Term: []byte("havnedistrikt"), 35 | }, 36 | }, 37 | }, 38 | { 39 | input: []byte("havnedistrikter"), 40 | output: analysis.TokenStream{ 41 | &analysis.Token{ 42 | Term: []byte("havnedistrikt"), 43 | }, 44 | }, 45 | }, 46 | // stop word 47 | { 48 | input: []byte("det"), 49 | output: analysis.TokenStream{}, 50 | }, 51 | } 52 | 53 | analyzer := Analyzer() 54 | for _, test := range tests { 55 | actual := analyzer.Analyze(test.input) 56 | if len(actual) != len(test.output) { 57 | t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) 58 | } 59 | for i, tok := range actual { 60 | if !reflect.DeepEqual(tok.Term, test.output[i].Term) { 61 | t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) 62 | } 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /analysis/lang/no/stemmer_no.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package no 16 | 17 | import ( 18 | "github.com/blevesearch/snowballstem" 19 | "github.com/blevesearch/snowballstem/norwegian" 20 | "github.com/blugelabs/bluge/analysis" 21 | ) 22 | 23 | type NorwegianStemmerFilter struct { 24 | } 25 | 26 | func StemmerFilter() *NorwegianStemmerFilter { 27 | return &NorwegianStemmerFilter{} 28 | } 29 | 30 | func (s *NorwegianStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 31 | for _, token := range input { 32 | env := snowballstem.NewEnv(string(token.Term)) 33 | norwegian.Stem(env) 34 | token.Term = []byte(env.Current()) 35 | } 36 | return input 37 | } 38 | -------------------------------------------------------------------------------- /analysis/lang/no/stop_filter_no.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package no 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/pt/analyzer_pt.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package pt 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | "github.com/blugelabs/bluge/analysis/token" 20 | "github.com/blugelabs/bluge/analysis/tokenizer" 21 | ) 22 | 23 | func Analyzer() *analysis.Analyzer { 24 | return &analysis.Analyzer{ 25 | Tokenizer: tokenizer.NewUnicodeTokenizer(), 26 | TokenFilters: []analysis.TokenFilter{ 27 | token.NewLowerCaseFilter(), 28 | StopWordsFilter(), 29 | LightStemmerFilter(), 30 | }, 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /analysis/lang/pt/analyzer_pt_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package pt 16 | 17 | import ( 18 | "reflect" 19 | "testing" 20 | 21 | "github.com/blugelabs/bluge/analysis" 22 | ) 23 | 24 | func TestPortugueseAnalyzer(t *testing.T) { 25 | tests := []struct { 26 | input []byte 27 | output analysis.TokenStream 28 | }{ 29 | // stemming 30 | { 31 | input: []byte("quilométricas"), 32 | output: analysis.TokenStream{ 33 | &analysis.Token{ 34 | Term: []byte("quilometric"), 35 | }, 36 | }, 37 | }, 38 | { 39 | input: []byte("quilométricos"), 40 | output: analysis.TokenStream{ 41 | &analysis.Token{ 42 | Term: []byte("quilometric"), 43 | }, 44 | }, 45 | }, 46 | // stop word 47 | { 48 | input: []byte("não"), 49 | output: analysis.TokenStream{}, 50 | }, 51 | } 52 | 53 | analyzer := Analyzer() 54 | for _, test := range tests { 55 | actual := analyzer.Analyze(test.input) 56 | if len(actual) != len(test.output) { 57 | t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) 58 | } 59 | for i, tok := range actual { 60 | if !reflect.DeepEqual(tok.Term, test.output[i].Term) { 61 | t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) 62 | } 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /analysis/lang/pt/stop_filter_pt.go: -------------------------------------------------------------------------------- 1 | // 
Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package pt 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/ro/analyzer_ro.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package ro 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | "github.com/blugelabs/bluge/analysis/token" 20 | "github.com/blugelabs/bluge/analysis/tokenizer" 21 | ) 22 | 23 | func Analyzer() *analysis.Analyzer { 24 | return &analysis.Analyzer{ 25 | Tokenizer: tokenizer.NewUnicodeTokenizer(), 26 | TokenFilters: []analysis.TokenFilter{ 27 | token.NewLowerCaseFilter(), 28 | StopWordsFilter(), 29 | StemmerFilter(), 30 | }, 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /analysis/lang/ro/analyzer_ro_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package ro 16 | 17 | import ( 18 | "reflect" 19 | "testing" 20 | 21 | "github.com/blugelabs/bluge/analysis" 22 | ) 23 | 24 | func TestRomanianAnalyzer(t *testing.T) { 25 | tests := []struct { 26 | input []byte 27 | output analysis.TokenStream 28 | }{ 29 | // stemming 30 | { 31 | input: []byte("absenţa"), 32 | output: analysis.TokenStream{ 33 | &analysis.Token{ 34 | Term: []byte("absenţ"), 35 | }, 36 | }, 37 | }, 38 | { 39 | input: []byte("absenţi"), 40 | output: analysis.TokenStream{ 41 | &analysis.Token{ 42 | Term: []byte("absenţ"), 43 | }, 44 | }, 45 | }, 46 | // stop word 47 | { 48 | input: []byte("îl"), 49 | output: analysis.TokenStream{}, 50 | }, 51 | } 52 | 53 | analyzer := Analyzer() 54 | for _, test := range tests { 55 | actual := analyzer.Analyze(test.input) 56 | if len(actual) != len(test.output) { 57 | t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) 58 | } 59 | for i, tok := range actual { 60 | if !reflect.DeepEqual(tok.Term, test.output[i].Term) { 61 | t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) 62 | } 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /analysis/lang/ro/stemmer_ro.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package ro 16 | 17 | import ( 18 | "github.com/blevesearch/snowballstem" 19 | "github.com/blevesearch/snowballstem/romanian" 20 | "github.com/blugelabs/bluge/analysis" 21 | ) 22 | 23 | type RomanianStemmerFilter struct { 24 | } 25 | 26 | func StemmerFilter() *RomanianStemmerFilter { 27 | return &RomanianStemmerFilter{} 28 | } 29 | 30 | func (s *RomanianStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 31 | for _, token := range input { 32 | env := snowballstem.NewEnv(string(token.Term)) 33 | romanian.Stem(env) 34 | token.Term = []byte(env.Current()) 35 | } 36 | return input 37 | } 38 | -------------------------------------------------------------------------------- /analysis/lang/ro/stop_filter_ro.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package ro 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/ru/analyzer_ru.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package ru 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | "github.com/blugelabs/bluge/analysis/token" 20 | "github.com/blugelabs/bluge/analysis/tokenizer" 21 | ) 22 | 23 | func Analyzer() *analysis.Analyzer { 24 | return &analysis.Analyzer{ 25 | Tokenizer: tokenizer.NewUnicodeTokenizer(), 26 | TokenFilters: []analysis.TokenFilter{ 27 | token.NewLowerCaseFilter(), 28 | StopWordsFilter(), 29 | StemmerFilter(), 30 | }, 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /analysis/lang/ru/stemmer_ru.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package ru 16 | 17 | import ( 18 | "github.com/blevesearch/snowballstem" 19 | "github.com/blevesearch/snowballstem/russian" 20 | "github.com/blugelabs/bluge/analysis" 21 | ) 22 | 23 | type RussianStemmerFilter struct { 24 | } 25 | 26 | func StemmerFilter() *RussianStemmerFilter { 27 | return &RussianStemmerFilter{} 28 | } 29 | 30 | func (s *RussianStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 31 | for _, token := range input { 32 | env := snowballstem.NewEnv(string(token.Term)) 33 | russian.Stem(env) 34 | token.Term = []byte(env.Current()) 35 | } 36 | return input 37 | } 38 | -------------------------------------------------------------------------------- /analysis/lang/ru/stemmer_ru_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package ru 16 | 17 | import ( 18 | "reflect" 19 | "testing" 20 | 21 | "github.com/blugelabs/bluge/analysis" 22 | ) 23 | 24 | func TestSnowballRussianStemmer(t *testing.T) { 25 | tests := []struct { 26 | input analysis.TokenStream 27 | output analysis.TokenStream 28 | }{ 29 | { 30 | input: analysis.TokenStream{ 31 | &analysis.Token{ 32 | Term: []byte("актеров"), 33 | }, 34 | }, 35 | output: analysis.TokenStream{ 36 | &analysis.Token{ 37 | Term: []byte("актер"), 38 | }, 39 | }, 40 | }, 41 | { 42 | input: analysis.TokenStream{ 43 | &analysis.Token{ 44 | Term: []byte("километров"), 45 | }, 46 | }, 47 | output: analysis.TokenStream{ 48 | &analysis.Token{ 49 | Term: []byte("километр"), 50 | }, 51 | }, 52 | }, 53 | } 54 | 55 | filter := StemmerFilter() 56 | for _, test := range tests { 57 | actual := filter.Filter(test.input) 58 | if !reflect.DeepEqual(actual, test.output) { 59 | t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term) 60 | } 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /analysis/lang/ru/stop_filter_ru.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package ru 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/sv/analyzer_sv.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package sv 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | "github.com/blugelabs/bluge/analysis/token" 20 | "github.com/blugelabs/bluge/analysis/tokenizer" 21 | ) 22 | 23 | func Analyzer() *analysis.Analyzer { 24 | return &analysis.Analyzer{ 25 | Tokenizer: tokenizer.NewUnicodeTokenizer(), 26 | TokenFilters: []analysis.TokenFilter{ 27 | token.NewLowerCaseFilter(), 28 | StopWordsFilter(), 29 | StemmerFilter(), 30 | }, 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /analysis/lang/sv/analyzer_sv_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package sv 16 | 17 | import ( 18 | "reflect" 19 | "testing" 20 | 21 | "github.com/blugelabs/bluge/analysis" 22 | ) 23 | 24 | func TestSwedishAnalyzer(t *testing.T) { 25 | tests := []struct { 26 | input []byte 27 | output analysis.TokenStream 28 | }{ 29 | // stemming 30 | { 31 | input: []byte("jaktkarlarne"), 32 | output: analysis.TokenStream{ 33 | &analysis.Token{ 34 | Term: []byte("jaktkarl"), 35 | }, 36 | }, 37 | }, 38 | { 39 | input: []byte("jaktkarlens"), 40 | output: analysis.TokenStream{ 41 | &analysis.Token{ 42 | Term: []byte("jaktkarl"), 43 | }, 44 | }, 45 | }, 46 | // stop word 47 | { 48 | input: []byte("och"), 49 | output: analysis.TokenStream{}, 50 | }, 51 | } 52 | 53 | analyzer := Analyzer() 54 | for _, test := range tests { 55 | actual := analyzer.Analyze(test.input) 56 | if len(actual) != len(test.output) { 57 | t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) 58 | } 59 | for i, tok := range actual { 60 | if !reflect.DeepEqual(tok.Term, test.output[i].Term) { 61 | t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) 62 | } 63 | } 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /analysis/lang/sv/stemmer_sv.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package sv 16 | 17 | import ( 18 | "github.com/blevesearch/snowballstem" 19 | "github.com/blevesearch/snowballstem/swedish" 20 | "github.com/blugelabs/bluge/analysis" 21 | ) 22 | 23 | type SwedishStemmerFilter struct { 24 | } 25 | 26 | func StemmerFilter() *SwedishStemmerFilter { 27 | return &SwedishStemmerFilter{} 28 | } 29 | 30 | func (s *SwedishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 31 | for _, token := range input { 32 | env := snowballstem.NewEnv(string(token.Term)) 33 | swedish.Stem(env) 34 | token.Term = []byte(env.Current()) 35 | } 36 | return input 37 | } 38 | -------------------------------------------------------------------------------- /analysis/lang/sv/stop_filter_sv.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package sv 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/lang/tr/analyzer_tr.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package tr 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | "github.com/blugelabs/bluge/analysis/token" 20 | "github.com/blugelabs/bluge/analysis/tokenizer" 21 | ) 22 | 23 | func Analyzer() *analysis.Analyzer { 24 | return &analysis.Analyzer{ 25 | Tokenizer: tokenizer.NewUnicodeTokenizer(), 26 | TokenFilters: []analysis.TokenFilter{ 27 | token.NewApostropheFilter(), 28 | token.NewLowerCaseFilter(), 29 | StopWordsFilter(), 30 | StemmerFilter(), 31 | }, 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /analysis/lang/tr/stemmer_tr.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package tr 16 | 17 | import ( 18 | "github.com/blevesearch/snowballstem" 19 | "github.com/blevesearch/snowballstem/turkish" 20 | "github.com/blugelabs/bluge/analysis" 21 | ) 22 | 23 | type TurkishStemmerFilter struct { 24 | } 25 | 26 | func StemmerFilter() *TurkishStemmerFilter { 27 | return &TurkishStemmerFilter{} 28 | } 29 | 30 | func (s *TurkishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 31 | for _, token := range input { 32 | env := snowballstem.NewEnv(string(token.Term)) 33 | turkish.Stem(env) 34 | token.Term = []byte(env.Current()) 35 | } 36 | return input 37 | } 38 | -------------------------------------------------------------------------------- /analysis/lang/tr/stop_filter_tr.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package tr 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis/token" 19 | ) 20 | 21 | func StopWordsFilter() *token.StopTokensFilter { 22 | return token.NewStopTokensFilter(StopWords()) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/size.go: -------------------------------------------------------------------------------- 1 | package analysis 2 | 3 | import "reflect" 4 | 5 | var sizeOfMap int 6 | var sizeOfPtr int 7 | var sizeOfString int 8 | 9 | func init() { 10 | var m map[int]int 11 | sizeOfMap = int(reflect.TypeOf(m).Size()) 12 | var ptr *int 13 | sizeOfPtr = int(reflect.TypeOf(ptr).Size()) 14 | var str string 15 | sizeOfString = int(reflect.TypeOf(str).Size()) 16 | } 17 | -------------------------------------------------------------------------------- /analysis/test_words.txt: -------------------------------------------------------------------------------- 1 | # full line comment 2 | marty 3 | steve # trailing comment 4 | | different format of comment 5 | dustin 6 | siri | different style trailing comment 7 | multiple words with different whitespace -------------------------------------------------------------------------------- /analysis/token/apostrophe.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package token 16 | 17 | import ( 18 | "bytes" 19 | 20 | "github.com/blugelabs/bluge/analysis" 21 | ) 22 | 23 | const Apostrophes = string(Apostrophe) + string(RightSingleQuotationMark) 24 | 25 | type ApostropheFilter struct{} 26 | 27 | func NewApostropheFilter() *ApostropheFilter { 28 | return &ApostropheFilter{} 29 | } 30 | 31 | func (s *ApostropheFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 32 | for _, token := range input { 33 | firstApostrophe := bytes.IndexAny(token.Term, Apostrophes) 34 | if firstApostrophe >= 0 { 35 | // found an apostrophe 36 | token.Term = token.Term[0:firstApostrophe] 37 | } 38 | } 39 | 40 | return input 41 | } 42 | -------------------------------------------------------------------------------- /analysis/token/elision.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package token 16 | 17 | import ( 18 | "unicode/utf8" 19 | 20 | "github.com/blugelabs/bluge/analysis" 21 | ) 22 | 23 | const RightSingleQuotationMark = '’' 24 | const Apostrophe = '\'' 25 | 26 | type ElisionFilter struct { 27 | articles analysis.TokenMap 28 | } 29 | 30 | func NewElisionFilter(articles analysis.TokenMap) *ElisionFilter { 31 | return &ElisionFilter{ 32 | articles: articles, 33 | } 34 | } 35 | 36 | func (s *ElisionFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 37 | for _, token := range input { 38 | term := token.Term 39 | for i := 0; i < len(term); { 40 | r, size := utf8.DecodeRune(term[i:]) 41 | if r == Apostrophe || r == RightSingleQuotationMark { 42 | // see if the prefix matches one of the articles 43 | prefix := term[0:i] 44 | _, articleMatch := s.articles[string(prefix)] 45 | if articleMatch { 46 | token.Term = term[i+size:] 47 | break 48 | } 49 | } 50 | i += size 51 | } 52 | } 53 | return input 54 | } 55 | -------------------------------------------------------------------------------- /analysis/token/keyword.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package token 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | ) 20 | 21 | type KeyWordMarkerFilter struct { 22 | keyWords analysis.TokenMap 23 | } 24 | 25 | func NewKeyWordMarkerFilter(keyWords analysis.TokenMap) *KeyWordMarkerFilter { 26 | return &KeyWordMarkerFilter{ 27 | keyWords: keyWords, 28 | } 29 | } 30 | 31 | func (f *KeyWordMarkerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 32 | for _, token := range input { 33 | _, isKeyWord := f.keyWords[string(token.Term)] 34 | if isKeyWord { 35 | token.KeyWord = true 36 | } 37 | } 38 | return input 39 | } 40 | -------------------------------------------------------------------------------- /analysis/token/length.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package token 16 | 17 | import ( 18 | "unicode/utf8" 19 | 20 | "github.com/blugelabs/bluge/analysis" 21 | ) 22 | 23 | type LengthFilter struct { 24 | min int 25 | max int 26 | } 27 | 28 | func NewLengthFilter(min, max int) *LengthFilter { 29 | return &LengthFilter{ 30 | min: min, 31 | max: max, 32 | } 33 | } 34 | 35 | func (f *LengthFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 36 | rv := make(analysis.TokenStream, 0, len(input)) 37 | 38 | var skipped int 39 | for _, token := range input { 40 | wordLen := utf8.RuneCount(token.Term) 41 | if f.min > 0 && f.min > wordLen { 42 | skipped += token.PositionIncr 43 | continue 44 | } 45 | if f.max > 0 && f.max < wordLen { 46 | skipped += token.PositionIncr 47 | continue 48 | } 49 | if skipped > 0 { 50 | token.PositionIncr += skipped 51 | skipped = 0 52 | } 53 | rv = append(rv, token) 54 | } 55 | 56 | return rv 57 | } 58 | -------------------------------------------------------------------------------- /analysis/token/porter.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package token 16 | 17 | import ( 18 | "bytes" 19 | 20 | "github.com/blevesearch/go-porterstemmer" 21 | "github.com/blugelabs/bluge/analysis" 22 | ) 23 | 24 | type PorterStemmer struct{} 25 | 26 | func NewPorterStemmer() *PorterStemmer { 27 | return &PorterStemmer{} 28 | } 29 | 30 | func (s *PorterStemmer) Filter(input analysis.TokenStream) analysis.TokenStream { 31 | for _, token := range input { 32 | // if it is not a protected keyword, stem it 33 | if !token.KeyWord { 34 | termRunes := bytes.Runes(token.Term) 35 | stemmedRunes := porterstemmer.StemWithoutLowerCasing(termRunes) 36 | token.Term = analysis.BuildTermFromRunes(stemmedRunes) 37 | } 38 | } 39 | return input 40 | } 41 | -------------------------------------------------------------------------------- /analysis/token/reverse.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
// reverse returns a newly allocated copy of s with its characters in
// reverse order. A "character" here is one rune followed by any run of
// combining marks (Unicode categories Mn, Me, Mc), so a mark stays attached
// to, and after, the rune it modifies. The input slice is not modified.
//
// Widths are taken from decoding s directly. A byte that is not part of a
// valid UTF-8 sequence is treated as a one-byte character of its own, so
// arbitrary input is reversed without indexing past either buffer.
func reverse(s []byte) []byte {
	output := make([]byte, len(s))
	cursorOut := len(s)
	for cursorIn := 0; cursorIn < len(s); {
		// width of the base rune; an invalid byte decodes with width 1
		_, wid := utf8.DecodeRune(s[cursorIn:])
		// extend the group over any combining marks that follow
		for cursorIn+wid < len(s) {
			r, size := utf8.DecodeRune(s[cursorIn+wid:])
			if !unicode.In(r, unicode.Mn, unicode.Me, unicode.Mc) {
				break
			}
			wid += size
		}
		// the group keeps its internal byte order; only its position flips
		copy(output[cursorOut-wid:cursorOut], s[cursorIn:cursorIn+wid])
		cursorIn += wid
		cursorOut -= wid
	}

	return output
}
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // Package stop implements a TokenFilter removing tokens found in 16 | // a TokenMap. 17 | // 18 | // It constructor takes the following arguments: 19 | // 20 | // "stop_token_map" (string): the name of the token map identifying tokens to 21 | // remove. 22 | package token 23 | 24 | import ( 25 | "github.com/blugelabs/bluge/analysis" 26 | ) 27 | 28 | type StopTokensFilter struct { 29 | stopTokens analysis.TokenMap 30 | } 31 | 32 | func NewStopTokensFilter(stopTokens analysis.TokenMap) *StopTokensFilter { 33 | return &StopTokensFilter{ 34 | stopTokens: stopTokens, 35 | } 36 | } 37 | 38 | func (f *StopTokensFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 39 | var j, skipped int 40 | for _, token := range input { 41 | _, isStopToken := f.stopTokens[string(token.Term)] 42 | if !isStopToken { 43 | token.PositionIncr += skipped 44 | skipped = 0 45 | input[j] = token 46 | j++ 47 | } else { 48 | skipped += token.PositionIncr 49 | } 50 | } 51 | 52 | return input[:j] 53 | } 54 | -------------------------------------------------------------------------------- /analysis/token/truncate.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package token 16 | 17 | import ( 18 | "unicode/utf8" 19 | 20 | "github.com/blugelabs/bluge/analysis" 21 | ) 22 | 23 | type TruncateTokenFilter struct { 24 | length int 25 | } 26 | 27 | func NewTruncateTokenFilter(length int) *TruncateTokenFilter { 28 | return &TruncateTokenFilter{ 29 | length: length, 30 | } 31 | } 32 | 33 | func (s *TruncateTokenFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 34 | for _, token := range input { 35 | wordLen := utf8.RuneCount(token.Term) 36 | if wordLen > s.length { 37 | token.Term = analysis.TruncateRunes(token.Term, wordLen-s.length) 38 | } 39 | } 40 | return input 41 | } 42 | -------------------------------------------------------------------------------- /analysis/token/unicodenorm.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package token 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | "golang.org/x/text/unicode/norm" 20 | ) 21 | 22 | type UnicodeNormalizeFilter struct { 23 | form norm.Form 24 | } 25 | 26 | func NewUnicodeNormalizeFilter(form norm.Form) *UnicodeNormalizeFilter { 27 | return &UnicodeNormalizeFilter{ 28 | form: form, 29 | } 30 | } 31 | 32 | func (s *UnicodeNormalizeFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 33 | for _, token := range input { 34 | token.Term = s.form.Bytes(token.Term) 35 | } 36 | return input 37 | } 38 | -------------------------------------------------------------------------------- /analysis/token/unique.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package token 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | ) 20 | 21 | const initialMapFactor = 4 22 | 23 | // UniqueTermFilter retains only the tokens which mark the first occurrence of 24 | // a term. Tokens whose term appears in a preceding token are dropped. 
25 | type UniqueTermFilter struct{} 26 | 27 | func NewUniqueTermFilter() *UniqueTermFilter { 28 | return &UniqueTermFilter{} 29 | } 30 | 31 | func (f *UniqueTermFilter) Filter(input analysis.TokenStream) analysis.TokenStream { 32 | encounteredTerms := make(map[string]struct{}, len(input)/initialMapFactor) 33 | var j, skipped int 34 | for _, token := range input { 35 | term := string(token.Term) 36 | if _, ok := encounteredTerms[term]; ok { 37 | skipped += token.PositionIncr 38 | continue 39 | } 40 | token.PositionIncr += skipped 41 | skipped = 0 42 | encounteredTerms[term] = struct{}{} 43 | input[j] = token 44 | j++ 45 | } 46 | return input[:j] 47 | } 48 | -------------------------------------------------------------------------------- /analysis/tokenizer/letter.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package tokenizer 16 | 17 | import ( 18 | "unicode" 19 | ) 20 | 21 | func NewLetterTokenizer() *CharacterTokenizer { 22 | return NewCharacterTokenizer(unicode.IsLetter) 23 | } 24 | -------------------------------------------------------------------------------- /analysis/tokenizer/regexp.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package tokenizer 16 | 17 | import ( 18 | "regexp" 19 | "strconv" 20 | 21 | "github.com/blugelabs/bluge/analysis" 22 | ) 23 | 24 | var IdeographRegexp = regexp.MustCompile(`\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}`) 25 | 26 | type RegexpTokenizer struct { 27 | r *regexp.Regexp 28 | } 29 | 30 | func NewRegexpTokenizer(r *regexp.Regexp) *RegexpTokenizer { 31 | return &RegexpTokenizer{ 32 | r: r, 33 | } 34 | } 35 | 36 | func (rt *RegexpTokenizer) Tokenize(input []byte) analysis.TokenStream { 37 | matches := rt.r.FindAllIndex(input, -1) 38 | rv := make(analysis.TokenStream, 0, len(matches)) 39 | for _, match := range matches { 40 | matchBytes := input[match[0]:match[1]] 41 | if match[1]-match[0] > 0 { 42 | token := analysis.Token{ 43 | Term: matchBytes, 44 | Start: match[0], 45 | End: match[1], 46 | PositionIncr: 1, 47 | Type: detectTokenType(matchBytes), 48 | } 49 | rv = append(rv, &token) 50 | } 51 | } 52 | return rv 53 | } 54 | 55 | func detectTokenType(termBytes []byte) analysis.TokenType { 56 | if IdeographRegexp.Match(termBytes) { 57 | return analysis.Ideographic 58 | } 59 | _, err := strconv.ParseFloat(string(termBytes), 64) 60 | if err == nil { 61 | return analysis.Numeric 62 | } 63 | return analysis.AlphaNumeric 64 | } 65 | -------------------------------------------------------------------------------- /analysis/tokenizer/single.go: 
-------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package tokenizer 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/analysis" 19 | ) 20 | 21 | type SingleTokenTokenizer struct{} 22 | 23 | func NewSingleTokenTokenizer() *SingleTokenTokenizer { 24 | return &SingleTokenTokenizer{} 25 | } 26 | 27 | func (t *SingleTokenTokenizer) Tokenize(input []byte) analysis.TokenStream { 28 | return MakeTokenStream(input) 29 | } 30 | 31 | func MakeToken(input []byte) *analysis.Token { 32 | return &analysis.Token{ 33 | Term: input, 34 | PositionIncr: 1, 35 | Start: 0, 36 | End: len(input), 37 | Type: analysis.AlphaNumeric, 38 | } 39 | } 40 | 41 | func MakeTokenStream(input []byte) analysis.TokenStream { 42 | return analysis.TokenStream{ 43 | MakeToken(input), 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /analysis/tokenizer/web.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
// email is the pattern used to recognise e-mail addresses.
// NOTE(review): the character classes are lowercase-only and this pattern
// carries no (?i) flag of its own — confirm whether addresses containing
// uppercase letters are meant to match.
var email = `(?:[a-z0-9!#$%&'*+/=?^_` + "`" + `{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_` + "`" + `{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])`

// url is the pattern used to recognise URLs; it starts with the (?i)
// case-insensitivity flag.
var url = `(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s` + "`" + `!()\[\]{};:'".,<>?«»“”‘’]))`

// twitterHandle matches "@" followed by 1 to 15 ASCII letters, digits or
// underscores.
var twitterHandle = `@([a-zA-Z0-9_]){1,15}`

// twitterHashtag matches "#" followed by one or more ASCII letters, digits
// or underscores.
var twitterHashtag = `#([a-zA-Z0-9_])+`

// exceptions lists the patterns in the order they are joined into
// exceptionsRegexp.
var exceptions = []string{email, url, twitterHandle, twitterHashtag}

// exceptionsRegexp is the alternation of all exception patterns, compiled
// once at package initialisation.
var exceptionsRegexp = regexp.MustCompile(strings.Join(exceptions, "|"))

// NewWebTokenizer returns an ExceptionsTokenizer built from exceptionsRegexp
// and a unicode tokenizer.
// NOTE(review): presumably text matching an exception pattern is emitted as
// a single token and the rest goes to the unicode tokenizer — confirm in
// ExceptionsTokenizer.
func NewWebTokenizer() *ExceptionsTokenizer {
	return NewExceptionsTokenizer(exceptionsRegexp, NewUnicodeTokenizer())
}
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package tokenizer 16 | 17 | import ( 18 | "unicode" 19 | ) 20 | 21 | func NewWhitespaceTokenizer() *CharacterTokenizer { 22 | return NewCharacterTokenizer(notSpace) 23 | } 24 | 25 | func notSpace(r rune) bool { 26 | return !unicode.IsSpace(r) 27 | } 28 | -------------------------------------------------------------------------------- /analysis/tokenmap_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2014 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package analysis 16 | 17 | import ( 18 | "reflect" 19 | "testing" 20 | ) 21 | 22 | func TestTokenMapLoadFile(t *testing.T) { 23 | tokenMap := NewTokenMap() 24 | err := tokenMap.LoadFile("test_words.txt") 25 | if err != nil { 26 | t.Fatal(err) 27 | } 28 | 29 | expectedTokens := NewTokenMap() 30 | expectedTokens.AddToken("marty") 31 | expectedTokens.AddToken("steve") 32 | expectedTokens.AddToken("dustin") 33 | expectedTokens.AddToken("siri") 34 | expectedTokens.AddToken("multiple") 35 | expectedTokens.AddToken("words") 36 | expectedTokens.AddToken("with") 37 | expectedTokens.AddToken("different") 38 | expectedTokens.AddToken("whitespace") 39 | 40 | if !reflect.DeepEqual(tokenMap, expectedTokens) { 41 | t.Errorf("expected %#v, got %#v", expectedTokens, tokenMap) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /batch.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 The Bluge Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package bluge 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/index" 19 | ) 20 | 21 | const _idField = "_id" 22 | 23 | type Identifier string 24 | 25 | func (i Identifier) Field() string { 26 | return _idField 27 | } 28 | 29 | func (i Identifier) Term() []byte { 30 | return []byte(i) 31 | } 32 | 33 | // NewBatch creates a new empty batch. 
34 | func NewBatch() *index.Batch { 35 | return index.NewBatch() 36 | } 37 | -------------------------------------------------------------------------------- /cmd/bluge/cmd/list.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 The Bluge Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package cmd 16 | 17 | import ( 18 | "fmt" 19 | 20 | "github.com/blugelabs/bluge/index" 21 | 22 | "github.com/spf13/cobra" 23 | ) 24 | 25 | var listCmd = &cobra.Command{ 26 | Use: "list [path]", 27 | Short: "lists the contents of the bluge index", 28 | Long: `The list command will list the contents of the Bluge index.`, 29 | RunE: func(cmd *cobra.Command, args []string) error { 30 | 31 | if len(args) < 1 { 32 | return fmt.Errorf("must specify path to index") 33 | } 34 | 35 | dir := index.NewFileSystemDirectory(args[0]) 36 | 37 | snapshotIDs, err := dir.List(index.ItemKindSnapshot) 38 | if err != nil { 39 | return fmt.Errorf("error listing snapshots: %v", err) 40 | } 41 | for _, snapshotID := range snapshotIDs { 42 | fmt.Printf("snapshot: %d\n", snapshotID) 43 | } 44 | 45 | return nil 46 | }, 47 | } 48 | 49 | func init() { 50 | RootCmd.AddCommand(listCmd) 51 | } 52 | -------------------------------------------------------------------------------- /cmd/bluge/cmd/root.go: -------------------------------------------------------------------------------- 1 | // 
Copyright (c) 2020 The Bluge Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package cmd 16 | 17 | import ( 18 | "fmt" 19 | "os" 20 | 21 | "github.com/spf13/cobra" 22 | ) 23 | 24 | // RootCmd represents the base command when called without any subcommands 25 | var RootCmd = &cobra.Command{ 26 | Use: "bluge", 27 | Short: "command-line tool to interact with a bluge index", 28 | Long: `Bluge is a command-line tool to interact with a bluge index.`, 29 | PersistentPreRunE: func(cmd *cobra.Command, args []string) error { 30 | return nil 31 | }, 32 | PersistentPostRunE: func(cmd *cobra.Command, args []string) error { 33 | return nil 34 | }, 35 | } 36 | 37 | // Execute adds all child commands to the root command sets flags appropriately. 38 | // This is called by main.main(). It only needs to happen once to the rootCmd. 39 | func Execute() { 40 | if err := RootCmd.Execute(); err != nil { 41 | fmt.Println(err) 42 | os.Exit(-1) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /cmd/bluge/main.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 The Bluge Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package main 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/cmd/bluge/cmd" 19 | ) 20 | 21 | func main() { 22 | cmd.Execute() 23 | } 24 | -------------------------------------------------------------------------------- /docs/bluge.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/blugelabs/bluge/57414197005148539c5dc5db8ab581594969df79/docs/bluge.png -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/blugelabs/bluge 2 | 3 | go 1.13 4 | 5 | require ( 6 | github.com/RoaringBitmap/roaring v0.9.4 7 | github.com/axiomhq/hyperloglog v0.0.0-20191112132149-a4c4c47bc57f 8 | github.com/bits-and-blooms/bitset v1.2.0 9 | github.com/blevesearch/go-porterstemmer v1.0.3 10 | github.com/blevesearch/mmap-go v1.0.4 11 | github.com/blevesearch/segment v0.9.0 12 | github.com/blevesearch/snowballstem v0.9.0 13 | github.com/blevesearch/vellum v1.0.7 14 | github.com/blugelabs/bluge_segment_api v0.2.0 15 | github.com/blugelabs/ice v1.0.0 16 | github.com/blugelabs/ice/v2 v2.0.1 17 | github.com/caio/go-tdigest v3.1.0+incompatible 18 | github.com/leesper/go_rng v0.0.0-20190531154944-a612b043e353 // indirect 19 | github.com/spf13/cobra v0.0.5 20 | golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a 21 | golang.org/x/text v0.3.0 22 | gonum.org/v1/gonum v0.7.0 // indirect 23 | ) 24 | 
-------------------------------------------------------------------------------- /index/batch.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 The Bluge Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package index 16 | 17 | import segment "github.com/blugelabs/bluge_segment_api" 18 | 19 | type Batch struct { 20 | documents []segment.Document 21 | ids []segment.Term 22 | persistedCallback func(error) 23 | } 24 | 25 | func NewBatch() *Batch { 26 | return &Batch{} 27 | } 28 | 29 | func (b *Batch) Insert(doc segment.Document) { 30 | b.documents = append(b.documents, doc) 31 | } 32 | 33 | func (b *Batch) Update(id segment.Term, doc segment.Document) { 34 | b.documents = append(b.documents, doc) 35 | b.ids = append(b.ids, id) 36 | } 37 | 38 | func (b *Batch) Delete(id segment.Term) { 39 | b.ids = append(b.ids, id) 40 | } 41 | 42 | func (b *Batch) Reset() { 43 | b.documents = b.documents[:0] 44 | b.ids = b.ids[:0] 45 | b.persistedCallback = nil 46 | } 47 | 48 | func (b *Batch) SetPersistedCallback(f func(error)) { 49 | b.persistedCallback = f 50 | } 51 | 52 | func (b *Batch) PersistedCallback() func(error) { 53 | return b.persistedCallback 54 | } 55 | -------------------------------------------------------------------------------- /index/communication.go: -------------------------------------------------------------------------------- 1 | // 
Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package index 16 | 17 | import segment "github.com/blugelabs/bluge_segment_api" 18 | 19 | type notificationChan chan struct{} 20 | 21 | type epochWatcher struct { 22 | epoch uint64 23 | notifyCh notificationChan 24 | } 25 | 26 | type epochWatchers []*epochWatcher 27 | 28 | func (e *epochWatchers) Add(watcher *epochWatcher) { 29 | *e = append(*e, watcher) 30 | } 31 | 32 | func (e *epochWatchers) NotifySatisfiedWatchers(epoch uint64) { 33 | var epochWatchersNext epochWatchers 34 | for _, w := range *e { 35 | if w.epoch < epoch { 36 | close(w.notifyCh) 37 | } else { 38 | epochWatchersNext.Add(w) 39 | } 40 | } 41 | *e = epochWatchersNext 42 | } 43 | 44 | type watcherChan chan *epochWatcher 45 | 46 | func (w watcherChan) NotifyUsAfter(epoch uint64, closeCh chan struct{}) (*epochWatcher, error) { 47 | ew := &epochWatcher{ 48 | epoch: epoch, 49 | notifyCh: make(notificationChan, 1), 50 | } 51 | select { 52 | case <-closeCh: 53 | return nil, segment.ErrClosed 54 | case w <- ew: 55 | } 56 | return ew, nil 57 | } 58 | -------------------------------------------------------------------------------- /index/deletion_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 The Bluge Authors. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package index 16 | 17 | import ( 18 | "fmt" 19 | "reflect" 20 | "testing" 21 | ) 22 | 23 | func TestDeletableEpochs(t *testing.T) { 24 | tests := []struct { 25 | name string 26 | n int 27 | knownEpochs []uint64 28 | deletableEpochs []uint64 29 | }{ 30 | { 31 | name: "empty", 32 | n: 1, 33 | knownEpochs: nil, 34 | deletableEpochs: nil, 35 | }, 36 | { 37 | name: "one", 38 | n: 1, 39 | knownEpochs: []uint64{1}, 40 | deletableEpochs: nil, 41 | }, 42 | { 43 | name: "many", 44 | n: 1, 45 | knownEpochs: []uint64{1, 2, 3, 4}, 46 | deletableEpochs: []uint64{1, 2, 3}, 47 | }, 48 | } 49 | 50 | for _, test := range tests { 51 | test := test 52 | t.Run(fmt.Sprintf("%s-%d", test.name, test.n), func(t *testing.T) { 53 | policy := NewKeepNLatestDeletionPolicy(test.n) 54 | for _, epoch := range test.knownEpochs { 55 | policy.Commit(&Snapshot{epoch: epoch}) 56 | } 57 | if !reflect.DeepEqual(policy.deletableEpochs, test.deletableEpochs) { 58 | t.Errorf("expected deletable: %#v, got %#v", test.deletableEpochs, policy.deletableEpochs) 59 | } 60 | }) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /index/directory_fs_nix.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 The Bluge Authors. 
// remove deletes the on-disk file for the item identified by kind and id.
//
// The file is first opened through d.openExclusive; the returned handle is
// closed by the deferred call, i.e. only after os.Remove has returned. Any
// error from closing the handle is deliberately ignored.
//
// NOTE(review): presumably the exclusive open prevents removing a file that
// another holder still has locked — confirm against openExclusive.
// NOTE(review): the os.O_CREATE flag suggests a missing file is created and
// then removed instead of producing an error — confirm this is intended.
func (d *FileSystemDirectory) remove(kind string, id uint64) error {
	segmentPath := filepath.Join(d.path, d.fileName(kind, id))
	segmentFile, err := d.openExclusive(segmentPath, os.O_CREATE|os.O_RDWR, d.newFilePerm)
	if err != nil {
		return err
	}
	defer func() {
		_ = segmentFile.Close()
	}()

	return os.Remove(segmentPath)
}
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package index 16 | 17 | import ( 18 | "os" 19 | "path/filepath" 20 | ) 21 | 22 | func (d *FileSystemDirectory) remove(kind string, id uint64) error { 23 | segmentPath := filepath.Join(d.path, d.fileName(kind, id)) 24 | return os.Remove(segmentPath) 25 | } 26 | -------------------------------------------------------------------------------- /index/empty.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package index 16 | 17 | import segment "github.com/blugelabs/bluge_segment_api" 18 | 19 | type emptyPostingsIterator struct{} 20 | 21 | func (e *emptyPostingsIterator) Next() (segment.Posting, error) { 22 | return nil, nil 23 | } 24 | 25 | func (e *emptyPostingsIterator) Advance(uint64) (segment.Posting, error) { 26 | return nil, nil 27 | } 28 | 29 | func (e *emptyPostingsIterator) Size() int { 30 | return 0 31 | } 32 | 33 | func (e *emptyPostingsIterator) Empty() bool { 34 | return true 35 | } 36 | 37 | func (e *emptyPostingsIterator) Count() uint64 { 38 | return 0 39 | } 40 | 41 | func (e *emptyPostingsIterator) Close() error { 42 | return nil 43 | } 44 | 45 | var anEmptyPostingsIterator = &emptyPostingsIterator{} 46 | -------------------------------------------------------------------------------- /index/event.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package index 16 | 17 | import "time" 18 | 19 | // Event represents the information provided in an OnEvent() callback. 
type Event struct {
	// Kind is one of the EventKind* constants below.
	Kind int
	// NOTE(review): presumably the Writer that produced the event — confirm
	// where events are fired.
	Chill *Writer
	// NOTE(review): presumably the time the reported operation took —
	// confirm where events are fired.
	Duration time.Duration
}

// Kinds of index events. The numeric values are spelled out explicitly.
const (
	EventKindCloseStart                 = 1 // when the index has started to close
	EventKindClose                      = 2 // when the index has been fully closed
	EventKindMergerProgress             = 3 // when the index has completed a round of merge operations
	EventKindPersisterProgress          = 4 // when the index has completed a round of persistence operations
	EventKindBatchIntroductionStart     = 5 // when the index has started to introduce a new batch
	EventKindBatchIntroduction          = 6 // when the index has finished introducing a batch
	EventKindMergeTaskIntroductionStart = 7 // when the index has started to introduce a merge
	EventKindMergeTaskIntroduction      = 8 // when the index has finished introducing a merge

)
14 | 15 | package index 16 | 17 | import ( 18 | "testing" 19 | ) 20 | 21 | func TestEventBatchIntroductionStart(t *testing.T) { 22 | testConfig, cleanup := CreateConfig("TestEventBatchIntroductionStart") 23 | defer func() { 24 | err := cleanup() 25 | if err != nil { 26 | t.Fatal(err) 27 | } 28 | }() 29 | 30 | var count int 31 | testConfig.EventCallback = func(e Event) { 32 | if e.Kind == EventKindBatchIntroductionStart { 33 | count++ 34 | } 35 | } 36 | 37 | idx, err := OpenWriter(testConfig) 38 | if err != nil { 39 | t.Fatal(err) 40 | } 41 | 42 | doc := &FakeDocument{ 43 | NewFakeField("_id", "1", true, false, false), 44 | NewFakeField("name", "test", false, false, true), 45 | } 46 | 47 | b := NewBatch() 48 | b.Update(testIdentifier("1"), doc) 49 | err = idx.Batch(b) 50 | if err != nil { 51 | t.Errorf("Error updating index: %v", err) 52 | } 53 | 54 | defer func() { 55 | err := idx.Close() 56 | if err != nil { 57 | t.Fatal(err) 58 | } 59 | }() 60 | 61 | if count != 1 { 62 | t.Fatalf("expected to see 1 batch introduction event event, saw %d", count) 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /index/lock/lock.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 The Bluge Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package lock 16 | 17 | import ( 18 | "os" 19 | ) 20 | 21 | type LockedFile interface { 22 | File() *os.File 23 | Exclusive() bool 24 | Close() error 25 | } 26 | 27 | type DefaultLockedFile struct { 28 | f *os.File 29 | exclusive bool 30 | } 31 | 32 | func OpenExclusive(path string, flag int, perm os.FileMode) (LockedFile, error) { 33 | return open(path, flag, perm, true) 34 | } 35 | 36 | func OpenShared(path string, flag int, perm os.FileMode) (LockedFile, error) { 37 | return open(path, flag, perm, false) 38 | } 39 | 40 | func (e *DefaultLockedFile) File() *os.File { 41 | return e.f 42 | } 43 | 44 | func (e *DefaultLockedFile) Exclusive() bool { 45 | return e.exclusive 46 | } 47 | 48 | func (e *DefaultLockedFile) Close() error { 49 | err := e.unlock() 50 | err2 := e.f.Close() 51 | if err2 != nil && err == nil { 52 | err = err2 53 | } 54 | return err 55 | } 56 | -------------------------------------------------------------------------------- /index/lock/lock_nix.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 The Bluge Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | //go:build darwin || dragonfly || freebsd || linux || netbsd || openbsd 16 | // +build darwin dragonfly freebsd linux netbsd openbsd 17 | 18 | package lock 19 | 20 | import ( 21 | "os" 22 | 23 | "golang.org/x/sys/unix" 24 | ) 25 | 26 | func open(path string, flag int, perm os.FileMode, exclusive bool) (LockedFile, error) { 27 | f, err := os.OpenFile(path, flag, perm) 28 | if err != nil { 29 | return nil, err 30 | } 31 | how := unix.LOCK_SH | unix.LOCK_NB 32 | if exclusive { 33 | how = unix.LOCK_EX | unix.LOCK_NB 34 | } 35 | err = unix.Flock(int(f.Fd()), how) 36 | if err != nil { 37 | _ = f.Close() 38 | return nil, err 39 | } 40 | return &DefaultLockedFile{ 41 | f: f, 42 | exclusive: exclusive, 43 | }, nil 44 | } 45 | 46 | func (e *DefaultLockedFile) unlock() error { 47 | return nil 48 | } 49 | -------------------------------------------------------------------------------- /index/lock/lock_windows.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 The Bluge Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package lock 16 | 17 | import ( 18 | "os" 19 | 20 | "golang.org/x/sys/windows" 21 | ) 22 | 23 | func open(path string, flag int, perm os.FileMode, exclusive bool) (LockedFile, error) { 24 | f, err := os.OpenFile(path, flag, perm) 25 | if err != nil { 26 | return nil, err 27 | } 28 | 29 | lockFlags := uint32(windows.LOCKFILE_FAIL_IMMEDIATELY) 30 | if exclusive { 31 | lockFlags |= uint32(windows.LOCKFILE_EXCLUSIVE_LOCK) 32 | } 33 | 34 | err = windows.LockFileEx(windows.Handle(f.Fd()), lockFlags, 0, 1, 0, &windows.Overlapped{}) 35 | if err != nil { 36 | _ = f.Close() 37 | return nil, err 38 | } 39 | 40 | return &DefaultLockedFile{ 41 | f: f, 42 | exclusive: exclusive, 43 | }, nil 44 | } 45 | 46 | func (e *DefaultLockedFile) unlock() error { 47 | return windows.UnlockFileEx(windows.Handle(e.f.Fd()), 0, 1, 0, &windows.Overlapped{}) 48 | } 49 | -------------------------------------------------------------------------------- /index/mergeplan/sort.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package mergeplan 16 | 17 | type byLiveSizeDescending []Segment 18 | 19 | func (a byLiveSizeDescending) Len() int { return len(a) } 20 | 21 | func (a byLiveSizeDescending) Swap(i, j int) { a[i], a[j] = a[j], a[i] } 22 | 23 | func (a byLiveSizeDescending) Less(i, j int) bool { 24 | if a[i].LiveSize() != a[j].LiveSize() { 25 | return a[i].LiveSize() > a[j].LiveSize() 26 | } 27 | return a[i].ID() < a[j].ID() 28 | } 29 | -------------------------------------------------------------------------------- /index/sizes.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package index 16 | 17 | import ( 18 | "reflect" 19 | ) 20 | 21 | func init() { 22 | var i int 23 | sizeOfInt = int(reflect.TypeOf(i).Size()) 24 | var ptr *int 25 | sizeOfPtr = int(reflect.TypeOf(ptr).Size()) 26 | 27 | var pi postingsIterator 28 | reflectStaticSizeIndexSnapshotTermFieldReader = int(reflect.TypeOf(pi).Size()) 29 | var pia postingsIteratorAll 30 | reflectStaticSizeIndexSnapshotDocIDReader = int(reflect.TypeOf(pia).Size()) 31 | var is interface{} = Snapshot{} 32 | reflectStaticSizeIndexSnapshot = int(reflect.TypeOf(is).Size()) 33 | var pib unadornedPostingsIteratorBitmap 34 | reflectStaticSizeUnadornedPostingsIteratorBitmap = int(reflect.TypeOf(pib).Size()) 35 | var pi1h unadornedPostingsIterator1Hit 36 | reflectStaticSizeUnadornedPostingsIterator1Hit = int(reflect.TypeOf(pi1h).Size()) 37 | var up unadornedPosting 38 | reflectStaticSizeUnadornedPosting = int(reflect.TypeOf(up).Size()) 39 | } 40 | 41 | var sizeOfInt int 42 | var sizeOfPtr int 43 | 44 | var reflectStaticSizeIndexSnapshotTermFieldReader int 45 | var reflectStaticSizeIndexSnapshotDocIDReader int 46 | var reflectStaticSizeIndexSnapshot int 47 | var reflectStaticSizeUnadornedPostingsIteratorBitmap int 48 | var reflectStaticSizeUnadornedPostingsIterator1Hit int 49 | var reflectStaticSizeUnadornedPosting int 50 | -------------------------------------------------------------------------------- /numeric/bin_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package numeric 16 | 17 | import "testing" 18 | 19 | func TestInterleaveDeinterleave(t *testing.T) { 20 | tests := []struct { 21 | v1 uint64 22 | v2 uint64 23 | }{ 24 | {0, 0}, 25 | {1, 1}, 26 | {27, 39}, 27 | {1<<32 - 1, 1<<32 - 1}, // largest that should still work 28 | } 29 | 30 | for _, test := range tests { 31 | i := Interleave(test.v1, test.v2) 32 | gotv1 := Deinterleave(i) 33 | gotv2 := Deinterleave(i >> 1) 34 | if gotv1 != test.v1 { 35 | t.Errorf("expected v1: %d, got %d, interleaved was %x", test.v1, gotv1, i) 36 | } 37 | if gotv2 != test.v2 { 38 | t.Errorf("expected v2: %d, got %d, interleaved was %x", test.v2, gotv2, i) 39 | } 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /numeric/float.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
// Float64ToInt64 reinterprets the IEEE-754 bits of f as an int64 and, when
// that integer is negative (sign bit set), flips every bit except the sign
// bit. Int64ToFloat64 reverses the transformation.
func Float64ToInt64(f float64) int64 {
	// magnitudeMask covers all bits except the sign bit (0x7fffffffffffffff).
	const magnitudeMask = math.MaxInt64

	bits := int64(math.Float64bits(f))
	if bits >= 0 {
		return bits
	}
	return bits ^ magnitudeMask
}

// Int64ToFloat64 is the inverse of Float64ToInt64: for negative i it flips
// every bit except the sign bit, then reinterprets the bits as a float64.
func Int64ToFloat64(i int64) float64 {
	// magnitudeMask covers all bits except the sign bit (0x7fffffffffffffff).
	const magnitudeMask = math.MaxInt64

	bits := i
	if bits < 0 {
		bits ^= magnitudeMask
	}
	return math.Float64frombits(uint64(bits))
}
14 | 15 | package numeric 16 | 17 | import ( 18 | "testing" 19 | ) 20 | 21 | // test that the float/sortable int operations work both ways 22 | // and that the corresponding integers sort the same as 23 | // the original floats would have 24 | func TestSortabledFloat64ToInt64(t *testing.T) { 25 | tests := []struct { 26 | input float64 27 | }{ 28 | { 29 | input: -4640094584139352638, 30 | }, 31 | { 32 | input: -167.42, 33 | }, 34 | { 35 | input: -1.11, 36 | }, 37 | { 38 | input: 0, 39 | }, 40 | { 41 | input: 3.14, 42 | }, 43 | { 44 | input: 167.42, 45 | }, 46 | } 47 | 48 | var lastInt64 *int64 49 | for _, test := range tests { 50 | actual := Float64ToInt64(test.input) 51 | if lastInt64 != nil { 52 | // check that this float is greater than the last one 53 | if actual <= *lastInt64 { 54 | t.Errorf("expected greater than prev, this: %d, last %d", actual, *lastInt64) 55 | } 56 | } 57 | lastInt64 = &actual 58 | convertedBack := Int64ToFloat64(actual) 59 | // assert that we got back what we started with 60 | if convertedBack != test.input { 61 | t.Errorf("expected %f, got %f", test.input, convertedBack) 62 | } 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /numeric/geo/README.md: -------------------------------------------------------------------------------- 1 | # geo support in blube 2 | 3 | First, all of this geo code is a Go adaptation of the [Lucene 5.3.2 sandbox geo support](https://lucene.apache.org/core/5_3_2/sandbox/org/apache/lucene/util/package-summary.html). 4 | 5 | ## Notes 6 | 7 | - All of the APIs will use float64 for lon/lat values. 8 | - When describing a point in function arguments or return values, we always use the order lon, lat. 9 | - High level APIs will use TopLeft and BottomRight to describe bounding boxes. This may not map cleanly to min/max lon/lat when crossing the dateline. The lower level APIs will use min/max lon/lat and require the higher-level code to split boxes accordingly. 
10 | -------------------------------------------------------------------------------- /numeric/geo/benchmark_geohash_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2019 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package geo 16 | 17 | import ( 18 | "testing" 19 | ) 20 | 21 | func BenchmarkGeoHashLen5NewDecode(b *testing.B) { 22 | b.ResetTimer() 23 | hash := "d3hn3" 24 | for i := 0; i < b.N; i++ { 25 | _, _ = DecodeGeoHash(hash) 26 | } 27 | } 28 | 29 | func BenchmarkGeoHashLen6NewDecode(b *testing.B) { 30 | b.ResetTimer() 31 | hash := "u4pruy" 32 | for i := 0; i < b.N; i++ { 33 | _, _ = DecodeGeoHash(hash) 34 | } 35 | } 36 | 37 | func BenchmarkGeoHashLen7NewDecode(b *testing.B) { 38 | b.ResetTimer() 39 | hash := "u4pruyd" 40 | for i := 0; i < b.N; i++ { 41 | _, _ = DecodeGeoHash(hash) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /search/aggregations/cardinality.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 The Bluge Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package aggregations 16 | 17 | import ( 18 | "github.com/axiomhq/hyperloglog" 19 | "github.com/blugelabs/bluge/search" 20 | ) 21 | 22 | type CardinalityMetric struct { 23 | src search.TextValuesSource 24 | } 25 | 26 | func Cardinality(src search.TextValuesSource) *CardinalityMetric { 27 | return &CardinalityMetric{ 28 | src: src, 29 | } 30 | } 31 | 32 | func (c *CardinalityMetric) Fields() []string { 33 | return c.src.Fields() 34 | } 35 | 36 | func (c *CardinalityMetric) Calculator() search.Calculator { 37 | rv := &CardinalityCalculator{ 38 | src: c.src, 39 | sketch: hyperloglog.New16(), 40 | } 41 | return rv 42 | } 43 | 44 | type CardinalityCalculator struct { 45 | src search.TextValuesSource 46 | sketch *hyperloglog.Sketch 47 | } 48 | 49 | func (c *CardinalityCalculator) Value() float64 { 50 | return float64(c.sketch.Estimate()) 51 | } 52 | 53 | func (c *CardinalityCalculator) Consume(d *search.DocumentMatch) { 54 | for _, val := range c.src.Values(d) { 55 | c.sketch.Insert(val) 56 | } 57 | } 58 | 59 | func (c *CardinalityCalculator) Merge(other search.Calculator) { 60 | if other, ok := other.(*CardinalityCalculator); ok { 61 | _ = c.sketch.Merge(other.sketch) 62 | } 63 | } 64 | 65 | func (c *CardinalityCalculator) Finish() { 66 | 67 | } 68 | -------------------------------------------------------------------------------- /search/aggregations/count.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 The Bluge Authors. 
2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package aggregations 16 | 17 | import "github.com/blugelabs/bluge/search" 18 | 19 | var staticCount = []float64{1} 20 | 21 | type countingSource struct{} 22 | 23 | func (*countingSource) Fields() []string { 24 | return nil 25 | } 26 | 27 | func (*countingSource) Numbers(_ *search.DocumentMatch) []float64 { 28 | return staticCount 29 | } 30 | 31 | var countSource = &countingSource{} 32 | 33 | func CountMatches() *SingleValueMetric { 34 | return Sum(countSource) 35 | } 36 | -------------------------------------------------------------------------------- /search/aggregations/duration.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 The Bluge Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package aggregations 16 | 17 | import ( 18 | "time" 19 | 20 | "github.com/blugelabs/bluge/search" 21 | ) 22 | 23 | type DurationMetric struct{} 24 | 25 | func Duration() *DurationMetric { 26 | return &DurationMetric{} 27 | } 28 | 29 | func (d *DurationMetric) Fields() []string { 30 | return nil 31 | } 32 | 33 | func (d *DurationMetric) Calculator() search.Calculator { 34 | return &DurationCalculator{ 35 | origin: time.Now(), 36 | } 37 | } 38 | 39 | type DurationCalculator struct { 40 | origin time.Time 41 | since time.Duration 42 | } 43 | 44 | func (d *DurationCalculator) Consume(*search.DocumentMatch) {} 45 | 46 | func (d *DurationCalculator) Finish() { 47 | d.since = time.Since(d.origin) 48 | } 49 | 50 | func (d *DurationCalculator) Merge(other search.Calculator) {} 51 | 52 | func (d *DurationCalculator) Duration() time.Duration { 53 | return d.since 54 | } 55 | -------------------------------------------------------------------------------- /search/collector.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
// Collector turns a Collectible into a DocumentMatchIterator, taking the
// requested Aggregations along.
type Collector interface {
	// Collect reads from the given Collectible and returns an iterator over
	// the collected matches, or an error.
	Collect(context.Context, Aggregations, Collectible) (DocumentMatchIterator, error)

	// NOTE(review): presumably the collector's own memory footprint in
	// bytes — confirm against the implementations.
	Size() int
	// NOTE(review): presumably the size of the storage the collector keeps
	// behind its results — confirm against the implementations.
	BackingSize() int
}

// Collectible is a source of document matches that a Collector can read.
type Collectible interface {
	// Next returns the next match. The stub searcher in the collector
	// package's tests signals exhaustion with a nil match and a nil error.
	Next(ctx *Context) (*DocumentMatch, error)
	// NOTE(review): presumably how many DocumentMatch instances this source
	// needs from the pool — confirm.
	DocumentMatchPoolSize() int
	// Close releases the source.
	Close() error
}

// DocumentMatchIterator walks collected matches and exposes the
// aggregation results.
type DocumentMatchIterator interface {
	// Next returns the next match. TopNIterator in the collector package
	// signals exhaustion with a nil match and a nil error.
	Next() (*DocumentMatch, error)
	// Aggregations returns the bucket holding the aggregation results.
	Aggregations() *Bucket
}
14 | 15 | package collector 16 | 17 | import ( 18 | "context" 19 | "testing" 20 | 21 | "github.com/blugelabs/bluge/search" 22 | "github.com/blugelabs/bluge/search/aggregations" 23 | ) 24 | 25 | func TestAllCollector(t *testing.T) { 26 | matches := makeMatches(99, 11) 27 | searcher := &stubSearcher{ 28 | matches: matches, 29 | } 30 | 31 | aggs := make(search.Aggregations) 32 | aggs.Add("count", aggregations.CountMatches()) 33 | 34 | collector := NewAllCollector() 35 | dmi, err := collector.Collect(context.Background(), aggs, searcher) 36 | if err != nil { 37 | t.Fatal(err) 38 | } 39 | 40 | var count uint64 41 | next, err := dmi.Next() 42 | for err == nil && next != nil { 43 | count++ 44 | 45 | // test that we can see aggregations while iterating with this collector 46 | if dmi.Aggregations().Count() != count { 47 | t.Errorf("expected aggregations count to match running count, %d != %d", 48 | count, dmi.Aggregations().Count()) 49 | } 50 | 51 | next, err = dmi.Next() 52 | } 53 | if err != nil { 54 | t.Fatalf("error iterator matches: %v", err) 55 | } 56 | 57 | if count != 99 { 58 | t.Errorf("expected to see 99 hits, saw: %d", count) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /search/collector/iterator.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 The Bluge Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package collector 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/search" 19 | ) 20 | 21 | type TopNIterator struct { 22 | results search.DocumentMatchCollection 23 | bucket *search.Bucket 24 | index int 25 | err error 26 | } 27 | 28 | func (i *TopNIterator) Next() (*search.DocumentMatch, error) { 29 | if i.err != nil { 30 | return nil, i.err 31 | } 32 | if i.index < len(i.results) { 33 | rv := i.results[i.index] 34 | i.index++ 35 | return rv, nil 36 | } 37 | return nil, nil 38 | } 39 | 40 | func (i *TopNIterator) Aggregations() *search.Bucket { 41 | return i.bucket 42 | } 43 | -------------------------------------------------------------------------------- /search/collector/search_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package collector 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/search" 19 | ) 20 | 21 | type stubSearcher struct { 22 | index int 23 | matches []*search.DocumentMatch 24 | } 25 | 26 | func (ss *stubSearcher) Next(ctx *search.Context) (*search.DocumentMatch, error) { 27 | if ss.index < len(ss.matches) { 28 | rv := ctx.DocumentMatchPool.Get() 29 | rv.Number = ss.matches[ss.index].Number 30 | rv.Score = ss.matches[ss.index].Score 31 | ss.index++ 32 | return rv, nil 33 | } 34 | return nil, nil 35 | } 36 | 37 | func (ss *stubSearcher) DocumentMatchPoolSize() int { 38 | return 0 39 | } 40 | 41 | func (ss *stubSearcher) Close() error { 42 | return nil 43 | } 44 | -------------------------------------------------------------------------------- /search/collector/size.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 The Bluge Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package collector 16 | 17 | import ( 18 | "reflect" 19 | ) 20 | 21 | func init() { 22 | var ptr *int 23 | sizeOfPtr = int(reflect.TypeOf(ptr).Size()) 24 | var str string 25 | sizeOfString = int(reflect.TypeOf(str).Size()) 26 | var coll TopNCollector 27 | reflectStaticSizeTopNCollector = int(reflect.TypeOf(coll).Size()) 28 | } 29 | 30 | var sizeOfPtr int 31 | var sizeOfString int 32 | var reflectStaticSizeTopNCollector int 33 | -------------------------------------------------------------------------------- /search/explanation.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package search 16 | 17 | import ( 18 | "encoding/json" 19 | "fmt" 20 | ) 21 | 22 | type Explanation struct { 23 | Value float64 `json:"value"` 24 | Message string `json:"message"` 25 | Children []*Explanation `json:"children,omitempty"` 26 | } 27 | 28 | func NewExplanation(value float64, msg string, children ...*Explanation) *Explanation { 29 | return &Explanation{ 30 | Value: value, 31 | Message: msg, 32 | Children: children, 33 | } 34 | } 35 | 36 | func (e *Explanation) String() string { 37 | js, err := json.MarshalIndent(e, "", " ") 38 | if err != nil { 39 | return fmt.Sprintf("error serializing explanation to json: %v", err) 40 | } 41 | return string(js) 42 | } 43 | 44 | func (e *Explanation) Size() int { 45 | sizeInBytes := reflectStaticSizeExplanation + sizeOfPtr + 46 | len(e.Message) 47 | 48 | for _, entry := range e.Children { 49 | sizeInBytes += entry.Size() 50 | } 51 | 52 | return sizeInBytes 53 | } 54 | -------------------------------------------------------------------------------- /search/highlight/fragment_scorer_simple.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package highlight 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/search" 19 | ) 20 | 21 | // FragmentScorer will score fragments by how many 22 | // unique terms occur in the fragment with no regard for 23 | // any boost values used in the original query 24 | type SimpleFragmentScorer struct { 25 | tlm search.TermLocationMap 26 | } 27 | 28 | func NewFragmentScorer(tlm search.TermLocationMap) *SimpleFragmentScorer { 29 | return &SimpleFragmentScorer{ 30 | tlm: tlm, 31 | } 32 | } 33 | 34 | func (s *SimpleFragmentScorer) Score(f *Fragment) { 35 | score := 0.0 36 | for _, locations := range s.tlm { 37 | for _, location := range locations { 38 | if location.Start >= f.Start && location.End <= f.End { 39 | score += 1.0 40 | // once we find a term in the fragment 41 | // don't care about additional matches 42 | break 43 | } 44 | } 45 | } 46 | f.Score = score 47 | } 48 | -------------------------------------------------------------------------------- /search/highlight/highlighter.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package highlight 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/search" 19 | ) 20 | 21 | type Fragment struct { 22 | Orig []byte 23 | Start int 24 | End int 25 | Score float64 26 | Index int // used by heap 27 | } 28 | 29 | func (f *Fragment) Overlaps(other *Fragment) bool { 30 | if other.Start >= f.Start && other.Start < f.End { 31 | return true 32 | } else if f.Start >= other.Start && f.Start < other.End { 33 | return true 34 | } 35 | return false 36 | } 37 | 38 | type Fragmenter interface { 39 | Fragment([]byte, TermLocations) []*Fragment 40 | } 41 | 42 | type FragmentFormatter interface { 43 | Format(f *Fragment, orderedTermLocations TermLocations) string 44 | } 45 | 46 | type FragmentScorer interface { 47 | Score(f *Fragment) float64 48 | } 49 | 50 | type Highlighter interface { 51 | BestFragment(tlm search.TermLocationMap, orig []byte) string 52 | BestFragments(tlm search.TermLocationMap, orig []byte, num int) []string 53 | } 54 | -------------------------------------------------------------------------------- /search/highlight/highlighter_ansi.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package highlight 16 | 17 | func NewANSIHighlighter() *SimpleHighlighter { 18 | fragmenter := NewSimpleFragmenter() 19 | formatter := NewANSIFragmentFormatter() 20 | return NewSimpleHighlighter(fragmenter, formatter, DefaultSeparator) 21 | } 22 | 23 | func NewANSIHighlighterColor(color string) *SimpleHighlighter { 24 | fragmenter := NewSimpleFragmenter() 25 | formatter := NewANSIFragmentFormatterColor(color) 26 | return NewSimpleHighlighter(fragmenter, formatter, DefaultSeparator) 27 | } 28 | -------------------------------------------------------------------------------- /search/highlight/highlighter_html.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package highlight 16 | 17 | func NewHTMLHighlighter() *SimpleHighlighter { 18 | fragmenter := NewSimpleFragmenter() 19 | formatter := NewHTMLFragmentFormatter() 20 | return NewSimpleHighlighter(fragmenter, formatter, DefaultSeparator) 21 | } 22 | 23 | func NewHTMLHighlighterTags(before, after string) *SimpleHighlighter { 24 | fragmenter := NewSimpleFragmenter() 25 | formatter := NewHTMLFragmentFormatterTags(before, after) 26 | return NewSimpleHighlighter(fragmenter, formatter, DefaultSeparator) 27 | } 28 | -------------------------------------------------------------------------------- /search/search_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package search 16 | 17 | import ( 18 | "reflect" 19 | "testing" 20 | ) 21 | 22 | func TestLocationsDedupe(t *testing.T) { 23 | a := &Location{} 24 | b := &Location{Pos: 1} 25 | c := &Location{Pos: 2} 26 | 27 | tests := []struct { 28 | input Locations 29 | expect Locations 30 | }{ 31 | {Locations{}, Locations{}}, 32 | {Locations{a}, Locations{a}}, 33 | {Locations{a, b, c}, Locations{a, b, c}}, 34 | {Locations{a, a}, Locations{a}}, 35 | {Locations{a, a, a}, Locations{a}}, 36 | {Locations{a, b}, Locations{a, b}}, 37 | {Locations{b, a}, Locations{a, b}}, 38 | {Locations{c, b, a, c, b, a, c, b, a}, Locations{a, b, c}}, 39 | } 40 | 41 | for testi, test := range tests { 42 | res := test.input.Dedupe() 43 | if !reflect.DeepEqual(res, test.expect) { 44 | t.Errorf("testi: %d, test: %+v, res: %+v", testi, test, res) 45 | } 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /search/searcher/ordered_searchers_list.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package searcher 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/search" 19 | ) 20 | 21 | type OrderedSearcherList []search.Searcher 22 | 23 | // sort.Interface 24 | 25 | func (otrl OrderedSearcherList) Len() int { 26 | return len(otrl) 27 | } 28 | 29 | func (otrl OrderedSearcherList) Less(i, j int) bool { 30 | return otrl[i].Count() < otrl[j].Count() 31 | } 32 | 33 | func (otrl OrderedSearcherList) Swap(i, j int) { 34 | otrl[i], otrl[j] = otrl[j], otrl[i] 35 | } 36 | -------------------------------------------------------------------------------- /search/searcher/search_match_none.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package searcher 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/search" 19 | ) 20 | 21 | type MatchNoneSearcher struct{} 22 | 23 | func NewMatchNoneSearcher(indexReader search.Reader, options search.SearcherOptions) (*MatchNoneSearcher, error) { 24 | return &MatchNoneSearcher{}, nil 25 | } 26 | 27 | func (s *MatchNoneSearcher) Size() int { 28 | return reflectStaticSizeMatchNoneSearcher + sizeOfPtr 29 | } 30 | 31 | func (s *MatchNoneSearcher) Count() uint64 { 32 | return uint64(0) 33 | } 34 | 35 | func (s *MatchNoneSearcher) Weight() float64 { 36 | return 0 37 | } 38 | 39 | func (s *MatchNoneSearcher) SetQueryNorm(_ float64) {} 40 | 41 | func (s *MatchNoneSearcher) Next(ctx *search.Context) (*search.DocumentMatch, error) { 42 | return nil, nil 43 | } 44 | 45 | func (s *MatchNoneSearcher) Advance(ctx *search.Context, number uint64) (*search.DocumentMatch, error) { 46 | return nil, nil 47 | } 48 | 49 | func (s *MatchNoneSearcher) Close() error { 50 | return nil 51 | } 52 | 53 | func (s *MatchNoneSearcher) Min() int { 54 | return 0 55 | } 56 | 57 | func (s *MatchNoneSearcher) DocumentMatchPoolSize() int { 58 | return 0 59 | } 60 | -------------------------------------------------------------------------------- /search/searcher/search_numeric_range_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | package searcher 16 | 17 | import ( 18 | "reflect" 19 | "testing" 20 | 21 | "github.com/blugelabs/bluge/numeric" 22 | ) 23 | 24 | func TestSplitRange(t *testing.T) { 25 | min := numeric.Float64ToInt64(1.0) 26 | max := numeric.Float64ToInt64(5.0) 27 | ranges := splitInt64Range(min, max, 4) 28 | enumerated := ranges.Enumerate(nil) 29 | if len(enumerated) != 135 { 30 | t.Errorf("expected 135 terms, got %d", len(enumerated)) 31 | } 32 | } 33 | 34 | func TestIncrementBytes(t *testing.T) { 35 | tests := []struct { 36 | in []byte 37 | out []byte 38 | }{ 39 | { 40 | in: []byte{0}, 41 | out: []byte{1}, 42 | }, 43 | { 44 | in: []byte{0, 0}, 45 | out: []byte{0, 1}, 46 | }, 47 | { 48 | in: []byte{0, 255}, 49 | out: []byte{1, 0}, 50 | }, 51 | } 52 | 53 | for _, test := range tests { 54 | actual := incrementBytes(test.in) 55 | if !reflect.DeepEqual(actual, test.out) { 56 | t.Errorf("expected %#v, got %#v", test.out, actual) 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /search/searcher/search_term_prefix.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Couchbase, Inc. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package searcher 16 | 17 | import ( 18 | "github.com/blugelabs/bluge/search" 19 | ) 20 | 21 | func NewTermPrefixSearcher(indexReader search.Reader, prefix, field string, 22 | boost float64, scorer search.Scorer, compScorer search.CompositeScorer, 23 | options search.SearcherOptions) (search.Searcher, error) { 24 | // find the terms with this prefix 25 | kBeg := []byte(prefix) 26 | kEnd := incrementBytes(kBeg) 27 | fieldDict, err := indexReader.DictionaryIterator(field, nil, kBeg, kEnd) 28 | if err != nil { 29 | return nil, err 30 | } 31 | defer func() { 32 | if cerr := fieldDict.Close(); cerr != nil && err == nil { 33 | err = cerr 34 | } 35 | }() 36 | 37 | var terms []string 38 | tfd, err := fieldDict.Next() 39 | for err == nil && tfd != nil { 40 | terms = append(terms, tfd.Term()) 41 | if tooManyClauses(len(terms)) { 42 | return nil, tooManyClausesErr(field, len(terms)) 43 | } 44 | tfd, err = fieldDict.Next() 45 | } 46 | if err != nil { 47 | return nil, err 48 | } 49 | 50 | return NewMultiTermSearcher(indexReader, terms, field, boost, scorer, compScorer, options, true) 51 | } 52 | -------------------------------------------------------------------------------- /search/similarity/constant.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 The Bluge Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package similarity 16 | 17 | import "github.com/blugelabs/bluge/search" 18 | 19 | type ConstantScorer float64 20 | 21 | func (c ConstantScorer) Score(_ int, _ float64) float64 { 22 | return float64(c) 23 | } 24 | 25 | func (c ConstantScorer) Explain(_ int, _ float64) *search.Explanation { 26 | return search.NewExplanation(float64(c), "constant") 27 | } 28 | 29 | func (c ConstantScorer) ScoreComposite(_ []*search.DocumentMatch) float64 { 30 | return float64(c) 31 | } 32 | func (c ConstantScorer) ExplainComposite(_ []*search.DocumentMatch) *search.Explanation { 33 | return search.NewExplanation(float64(c), "constant") 34 | } 35 | -------------------------------------------------------------------------------- /search/size.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 The Bluge Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package search 16 | 17 | import ( 18 | "reflect" 19 | ) 20 | 21 | func init() { 22 | var ptr *int 23 | sizeOfPtr = int(reflect.TypeOf(ptr).Size()) 24 | var str string 25 | sizeOfString = int(reflect.TypeOf(str).Size()) 26 | var slice []int 27 | sizeOfSlice = int(reflect.TypeOf(slice).Size()) 28 | var e Explanation 29 | reflectStaticSizeExplanation = int(reflect.TypeOf(e).Size()) 30 | var dm DocumentMatch 31 | reflectStaticSizeDocumentMatch = int(reflect.TypeOf(dm).Size()) 32 | var sc Context 33 | reflectStaticSizeSearchContext = int(reflect.TypeOf(sc).Size()) 34 | var l Location 35 | reflectStaticSizeLocation = int(reflect.TypeOf(l).Size()) 36 | var dmp DocumentMatchPool 37 | reflectStaticSizeDocumentMatchPool = int(reflect.TypeOf(dmp).Size()) 38 | } 39 | 40 | var sizeOfPtr int 41 | var sizeOfString int 42 | var sizeOfSlice int 43 | 44 | var reflectStaticSizeExplanation int 45 | var reflectStaticSizeDocumentMatch int 46 | var reflectStaticSizeSearchContext int 47 | var reflectStaticSizeLocation int 48 | var reflectStaticSizeDocumentMatchPool int 49 | -------------------------------------------------------------------------------- /size.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 The Bluge Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package bluge 16 | 17 | import ( 18 | "reflect" 19 | 20 | "github.com/blugelabs/bluge/search" 21 | ) 22 | 23 | var documentMatchEmptySize int 24 | var searchContextEmptySize int 25 | var reflectStaticSizeBaseField int 26 | var sizeOfSlice int 27 | var sizeOfString int 28 | var sizeOfPtr int 29 | var sizeOfBool int 30 | 31 | func init() { 32 | var dm search.DocumentMatch 33 | documentMatchEmptySize = dm.Size() 34 | var sc search.Context 35 | searchContextEmptySize = sc.Size() 36 | var f TermField 37 | reflectStaticSizeBaseField = int(reflect.TypeOf(f).Size()) 38 | var slice []int 39 | sizeOfSlice = int(reflect.TypeOf(slice).Size()) 40 | var str string 41 | sizeOfString = int(reflect.TypeOf(str).Size()) 42 | var ptr *int 43 | sizeOfPtr = int(reflect.TypeOf(ptr).Size()) 44 | var b bool 45 | sizeOfBool = int(reflect.TypeOf(b).Size()) 46 | } 47 | -------------------------------------------------------------------------------- /test/integration.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 The Bluge Authors. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
14 | 15 | package test 16 | 17 | import ( 18 | "testing" 19 | 20 | "github.com/blugelabs/bluge/search" 21 | "github.com/blugelabs/bluge/search/highlight" 22 | 23 | "github.com/blugelabs/bluge" 24 | ) 25 | 26 | type match struct { 27 | Number uint64 28 | Score float64 29 | SortValue [][]byte 30 | Fields map[string][][]byte 31 | ExpectHighlights []*ExpectHighlight 32 | Locations search.FieldTermLocationMap 33 | } 34 | 35 | func newIDMatches(ids ...string) []*match { 36 | result := []*match{} 37 | 38 | for _, id := range ids { 39 | result = append(result, &match{ 40 | Fields: map[string][][]byte{ 41 | "_id": {[]byte(id)}, 42 | }}) 43 | } 44 | return result 45 | } 46 | 47 | type ExpectHighlight struct { 48 | Highlighter highlight.Highlighter 49 | Field string 50 | Result string 51 | } 52 | 53 | type RequestVerify struct { 54 | Comment string 55 | Request bluge.SearchRequest 56 | Aggregations search.Aggregations 57 | ExpectTotal int 58 | ExpectMatches []*match 59 | VerifyAggregations func(t *testing.T, bucket *search.Bucket) 60 | } 61 | 62 | type IntegrationTest struct { 63 | Name string 64 | DataLoad func(writer *bluge.Writer) error 65 | Tests func() []*RequestVerify 66 | } 67 | --------------------------------------------------------------------------------