├── .github ├── scripts │ └── python │ │ └── update_version.py └── workflows │ ├── publish-python.yaml │ └── run-tests.yaml ├── .gitignore ├── LICENSE ├── README.md ├── assets └── comparison.png ├── bm25s ├── __init__.py ├── hf.py ├── numba │ ├── __init__.py │ ├── retrieve_utils.py │ └── selection.py ├── scoring.py ├── selection.py ├── stopwords.py ├── tokenization.py ├── utils │ ├── __init__.py │ ├── beir.py │ ├── benchmark.py │ ├── corpus.py │ └── json_functions.py └── version.py ├── examples ├── evaluate_on_beir.py ├── index_and_retrieve_with_numba.py ├── index_nq.py ├── index_to_hf.py ├── index_with_metadata.py ├── nltk_stemmer.py ├── retrieve_from_hf.py ├── retrieve_nq.py ├── retrieve_nq_with_batching.py ├── retrieve_with_numba_advanced.py ├── retrieve_with_numba_hf.py ├── save_and_reload_end_to_end.py ├── tokenize_multiprocess.py └── tokenizer_class.py ├── setup.py └── tests ├── README.md ├── __init__.py ├── comparison ├── test_bm25_pt.py ├── test_bm25s_indexing.py ├── test_jsonl_corpus.py ├── test_rank_bm25.py ├── test_rank_bm25l.py ├── test_rank_bm25plus.py └── test_utils_corpus.py ├── comparison_full ├── test_bm25_pt.py └── test_rank_bm25.py ├── core ├── test_allow_empty.py ├── test_core_coverage.py ├── test_corpus.py ├── test_hf_utils.py ├── test_init_utils.py ├── test_json_functions.py ├── test_retrieve.py ├── test_save_load.py ├── test_scoring.py ├── test_selection.py ├── test_tokenization_extended.py ├── test_tokenizer.py ├── test_tokenizer_misc.py ├── test_topk.py ├── test_utils_corpus.py └── test_vocab_dict.py ├── data └── nfcorpus.txt ├── numba ├── test_numba_backend_retrieve.py └── test_topk_numba.py ├── requirements-comparison.txt ├── requirements-core.txt └── stopwords └── test_stopwords.py /.github/scripts/python/update_version.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/.github/scripts/python/update_version.py 
-------------------------------------------------------------------------------- /.github/workflows/publish-python.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/.github/workflows/publish-python.yaml -------------------------------------------------------------------------------- /.github/workflows/run-tests.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/.github/workflows/run-tests.yaml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/README.md -------------------------------------------------------------------------------- /assets/comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/assets/comparison.png -------------------------------------------------------------------------------- /bm25s/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/bm25s/__init__.py -------------------------------------------------------------------------------- /bm25s/hf.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/xhluca/bm25s/HEAD/bm25s/hf.py -------------------------------------------------------------------------------- /bm25s/numba/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /bm25s/numba/retrieve_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/bm25s/numba/retrieve_utils.py -------------------------------------------------------------------------------- /bm25s/numba/selection.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/bm25s/numba/selection.py -------------------------------------------------------------------------------- /bm25s/scoring.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/bm25s/scoring.py -------------------------------------------------------------------------------- /bm25s/selection.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/bm25s/selection.py -------------------------------------------------------------------------------- /bm25s/stopwords.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/bm25s/stopwords.py -------------------------------------------------------------------------------- /bm25s/tokenization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/bm25s/tokenization.py -------------------------------------------------------------------------------- /bm25s/utils/__init__.py: 
-------------------------------------------------------------------------------- 1 | from . import benchmark, beir, corpus, json_functions -------------------------------------------------------------------------------- /bm25s/utils/beir.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/bm25s/utils/beir.py -------------------------------------------------------------------------------- /bm25s/utils/benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/bm25s/utils/benchmark.py -------------------------------------------------------------------------------- /bm25s/utils/corpus.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/bm25s/utils/corpus.py -------------------------------------------------------------------------------- /bm25s/utils/json_functions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/bm25s/utils/json_functions.py -------------------------------------------------------------------------------- /bm25s/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.1dev0" 2 | -------------------------------------------------------------------------------- /examples/evaluate_on_beir.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/examples/evaluate_on_beir.py -------------------------------------------------------------------------------- /examples/index_and_retrieve_with_numba.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/examples/index_and_retrieve_with_numba.py 
-------------------------------------------------------------------------------- /examples/index_nq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/examples/index_nq.py -------------------------------------------------------------------------------- /examples/index_to_hf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/examples/index_to_hf.py -------------------------------------------------------------------------------- /examples/index_with_metadata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/examples/index_with_metadata.py -------------------------------------------------------------------------------- /examples/nltk_stemmer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/examples/nltk_stemmer.py -------------------------------------------------------------------------------- /examples/retrieve_from_hf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/examples/retrieve_from_hf.py -------------------------------------------------------------------------------- /examples/retrieve_nq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/examples/retrieve_nq.py -------------------------------------------------------------------------------- /examples/retrieve_nq_with_batching.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/examples/retrieve_nq_with_batching.py -------------------------------------------------------------------------------- 
/examples/retrieve_with_numba_advanced.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/examples/retrieve_with_numba_advanced.py -------------------------------------------------------------------------------- /examples/retrieve_with_numba_hf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/examples/retrieve_with_numba_hf.py -------------------------------------------------------------------------------- /examples/save_and_reload_end_to_end.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/examples/save_and_reload_end_to_end.py -------------------------------------------------------------------------------- /examples/tokenize_multiprocess.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/examples/tokenize_multiprocess.py -------------------------------------------------------------------------------- /examples/tokenizer_class.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/examples/tokenizer_class.py -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/setup.py -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/README.md -------------------------------------------------------------------------------- /tests/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/__init__.py -------------------------------------------------------------------------------- /tests/comparison/test_bm25_pt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/comparison/test_bm25_pt.py -------------------------------------------------------------------------------- /tests/comparison/test_bm25s_indexing.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/comparison/test_bm25s_indexing.py -------------------------------------------------------------------------------- /tests/comparison/test_jsonl_corpus.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/comparison/test_jsonl_corpus.py -------------------------------------------------------------------------------- /tests/comparison/test_rank_bm25.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/comparison/test_rank_bm25.py -------------------------------------------------------------------------------- /tests/comparison/test_rank_bm25l.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/comparison/test_rank_bm25l.py -------------------------------------------------------------------------------- /tests/comparison/test_rank_bm25plus.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/comparison/test_rank_bm25plus.py -------------------------------------------------------------------------------- 
/tests/comparison/test_utils_corpus.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/comparison/test_utils_corpus.py -------------------------------------------------------------------------------- /tests/comparison_full/test_bm25_pt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/comparison_full/test_bm25_pt.py -------------------------------------------------------------------------------- /tests/comparison_full/test_rank_bm25.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/comparison_full/test_rank_bm25.py -------------------------------------------------------------------------------- /tests/core/test_allow_empty.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/core/test_allow_empty.py -------------------------------------------------------------------------------- /tests/core/test_core_coverage.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/core/test_core_coverage.py -------------------------------------------------------------------------------- /tests/core/test_corpus.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/core/test_corpus.py -------------------------------------------------------------------------------- /tests/core/test_hf_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/core/test_hf_utils.py -------------------------------------------------------------------------------- 
/tests/core/test_init_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/core/test_init_utils.py -------------------------------------------------------------------------------- /tests/core/test_json_functions.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/core/test_json_functions.py -------------------------------------------------------------------------------- /tests/core/test_retrieve.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/core/test_retrieve.py -------------------------------------------------------------------------------- /tests/core/test_save_load.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/core/test_save_load.py -------------------------------------------------------------------------------- /tests/core/test_scoring.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/core/test_scoring.py -------------------------------------------------------------------------------- /tests/core/test_selection.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/core/test_selection.py -------------------------------------------------------------------------------- /tests/core/test_tokenization_extended.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/core/test_tokenization_extended.py -------------------------------------------------------------------------------- /tests/core/test_tokenizer.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/core/test_tokenizer.py -------------------------------------------------------------------------------- /tests/core/test_tokenizer_misc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/core/test_tokenizer_misc.py -------------------------------------------------------------------------------- /tests/core/test_topk.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/core/test_topk.py -------------------------------------------------------------------------------- /tests/core/test_utils_corpus.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/core/test_utils_corpus.py -------------------------------------------------------------------------------- /tests/core/test_vocab_dict.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/core/test_vocab_dict.py -------------------------------------------------------------------------------- /tests/data/nfcorpus.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/data/nfcorpus.txt -------------------------------------------------------------------------------- /tests/numba/test_numba_backend_retrieve.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/numba/test_numba_backend_retrieve.py -------------------------------------------------------------------------------- /tests/numba/test_topk_numba.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/numba/test_topk_numba.py -------------------------------------------------------------------------------- /tests/requirements-comparison.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xhluca/bm25s/HEAD/tests/requirements-comparison.txt -------------------------------------------------------------------------------- /tests/requirements-core.txt: -------------------------------------------------------------------------------- 1 | -e .[full] -------------------------------------------------------------------------------- /tests/stopwords/test_stopwords.py: -------------------------------------------------------------------------------- 1 | """ 2 | Testing for stopwords needs to be defined. 3 | """ --------------------------------------------------------------------------------