├── .gitattributes ├── .gitignore ├── .readthedocs.yaml ├── LICENSE ├── README.md ├── docs ├── Makefile ├── conf.py ├── index.rst ├── make.bat ├── requirements.in ├── requirements.txt └── sources │ ├── _static │ └── custom.css │ ├── guides │ ├── bi-phase-ivfpq.md │ ├── configs.md │ ├── data.md │ ├── index.rst │ ├── lecard.md │ └── quick-start.md │ ├── models │ ├── BaseModel.rst │ └── index.rst │ ├── scripts │ ├── index.rst │ ├── negative.rst │ └── preprocess.rst │ └── utils │ ├── index.rst │ ├── my_index.rst │ └── util.rst └── src ├── anserini └── target │ ├── anserini-0.15.1-SNAPSHOT-fatjar.jar │ ├── anserini-0.15.1-SNAPSHOT.jar │ └── appassembler │ ├── bin │ ├── ApproximateNearestNeighborEval │ ├── ApproximateNearestNeighborEval.bat │ ├── ApproximateNearestNeighborSearch │ ├── ApproximateNearestNeighborSearch.bat │ ├── DumpAnalyzedQueries │ ├── DumpAnalyzedQueries.bat │ ├── ExtractAverageDocumentLength │ ├── ExtractAverageDocumentLength.bat │ ├── ExtractDocumentLengths │ ├── ExtractDocumentLengths.bat │ ├── ExtractNorms │ ├── ExtractNorms.bat │ ├── ExtractTopDfTerms │ ├── ExtractTopDfTerms.bat │ ├── FeatureExtractorCli │ ├── FeatureExtractorCli.bat │ ├── IndexCollection │ ├── IndexCollection.bat │ ├── IndexReaderUtils │ ├── IndexReaderUtils.bat │ ├── IndexVectorCollection │ ├── IndexVectorCollection.bat │ ├── IndexVectors │ ├── IndexVectors.bat │ ├── SearchCollection │ ├── SearchCollection.bat │ ├── SearchMsmarco │ ├── SearchMsmarco.bat │ ├── SearchVectorCollection │ ├── SearchVectorCollection.bat │ ├── SimpleSearcher │ ├── SimpleSearcher.bat │ ├── SimpleTweetSearcher │ └── SimpleTweetSearcher.bat │ └── repo │ ├── annotations-java5-23.0.0.jar │ ├── anserini-0.15.1-SNAPSHOT.jar │ ├── ant-1.10.11.jar │ ├── ant-launcher-1.10.11.jar │ ├── api-0.18.0.jar │ ├── args4j-2.33.jar │ ├── cbor-0.8.jar │ ├── checker-qual-3.8.0.jar │ ├── commons-compress-1.21.jar │ ├── commons-csv-1.8.jar │ ├── commons-io-2.5.jar │ ├── commons-lang3-3.5.jar │ ├── commons-pool2-2.11.1.jar │ ├── commons-text-1.9.jar │ ├── error_prone_annotations-2.5.1.jar │ ├── failureaccess-1.0.1.jar │ ├── fastutil-8.5.8.jar │ ├── gson-2.9.0.jar │ ├── guava-30.1.1-jre.jar │ ├── j2objc-annotations-1.3.jar │ ├── jackson-annotations-2.12.5.jar │ ├── jackson-core-2.12.5.jar │ ├── jackson-databind-2.12.6.1.jar │ ├── jackson-dataformat-yaml-2.12.5.jar │ ├── jackson-datatype-jdk8-2.12.5.jar │ ├── jbibtex-1.0.19.jar │ ├── jna-5.11.0.jar │ ├── jsoup-1.15.3.jar │ ├── jsr305-2.0.1.jar │ ├── listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar │ ├── log4j-api-2.17.1.jar │ ├── log4j-core-2.17.1.jar │ ├── lucene-analysis-common-9.3.0.jar │ ├── lucene-analysis-kuromoji-9.3.0.jar │ ├── lucene-analysis-morfologik-9.3.0.jar │ ├── lucene-backward-codecs-9.3.0.jar │ ├── lucene-codecs-9.3.0.jar │ ├── lucene-core-9.3.0.jar │ ├── lucene-queries-9.3.0.jar │ ├── lucene-queryparser-9.3.0.jar │ ├── lucene-sandbox-9.3.0.jar │ ├── mockito-all-1.10.19.jar │ ├── morfologik-fsa-2.1.9.jar │ ├── morfologik-polish-2.1.9.jar │ ├── morfologik-stemming-2.1.9.jar │ ├── morfologik-ukrainian-search-4.9.1.jar │ ├── slf4j-api-1.7.32.jar │ ├── slf4j-simple-1.7.32.jar │ ├── snakeyaml-1.27.jar │ ├── tokenizers-0.18.0.jar │ ├── trec-car-tools-java-19.jar │ ├── twitter-text-2.0.10.jar │ ├── wikiclean-1.1.jar │ └── xz-1.9.jar ├── data └── config │ ├── _default.yaml │ ├── _example.yaml │ ├── ance.yaml │ ├── ar2.yaml │ ├── autotsg.yaml │ ├── base │ ├── LECARD.yaml │ ├── MS300k-unseen.yaml │ ├── MS300k.yaml │ ├── MS600k.yaml │ ├── MSMARCO-passage.yaml │ ├── NQ-50k-seen.yaml │ ├── NQ-50k-unseen.yaml │ ├── NQ-open.yaml │ ├── NQ320k-seen.yaml │ ├── NQ320k-unseen.yaml │ ├── NQ320k.yaml │ ├── Rand300k-filter.yaml │ ├── Top300k-filter.yaml │ └── _default.yaml │ ├── bivfpq-nq.yaml │ ├── bivfpq.yaml │ ├── bm25.yaml │ ├── bow.yaml │ ├── coil.yaml │ ├── colbert.yaml │ ├── crossenc.yaml │ ├── deepimpact.yaml │ ├── deepspeed.json │ ├── distillvq.yaml │ ├── dpr.yaml │ ├── dsi.yaml │ ├── dsiqg.yaml │ ├── extra │ └── code.yaml │ ├── genre.yaml │ ├── index │ ├── _default.yaml │ ├── bm25.yaml │ ├── faiss.yaml │ ├── fm.yaml │ ├── impact-tok.yaml │ ├── impact-word.yaml │ ├── impact.yaml │ ├── invhit.yaml │ ├── invvec.yaml │ ├── none.yaml │ ├── trie.yaml │ └── wordset.yaml │ ├── ivf.yaml │ ├── mode │ ├── _eval.yaml │ ├── cluster.yaml │ ├── code.yaml │ ├── deploy.yaml │ ├── encode-query.yaml │ ├── encode-text.yaml │ ├── encode.yaml │ ├── eval.yaml │ ├── index.yaml │ ├── migrate.yaml │ ├── script.yaml │ └── train.yaml │ ├── model │ ├── _default.yaml │ ├── dense.yaml │ ├── generative.yaml │ ├── ranker.yaml │ └── sparse.yaml │ ├── rankt5.yaml │ ├── retromae.yaml │ ├── script │ ├── _default.yaml │ ├── doct5.yaml │ ├── download.yaml │ ├── eval.yaml │ ├── negative.yaml │ ├── preprocess.yaml │ └── ttest.yaml │ ├── seal.yaml │ ├── sequer.yaml │ ├── sparta.yaml │ ├── spladev2.yaml │ ├── tokivf.yaml │ ├── topivf.yaml │ ├── unicoil.yaml │ └── uniretriever.yaml ├── models ├── AR2.py ├── AutoModel.py ├── AutoTSG.py ├── BM25.py ├── BOW.py ├── BaseModel.py ├── COIL.py ├── ColBERT.py ├── CrossEnc.py ├── DPR.py ├── DSI.py ├── DeepImpact.py ├── IVF.py ├── KeyRank.py ├── RankT5.py ├── SEAL.py ├── SPARTA.py ├── SPLADE.py ├── UniCOIL.py ├── UniRetriever.py └── VQ.py ├── notebooks └── data.ipynb ├── run.py ├── scripts ├── doct5.py ├── download.py ├── eval.py ├── evalnq.py ├── negative.py ├── openai.py ├── preprocess.py └── ttest.py └── utils ├── __init__.py ├── data.py ├── index.py ├── static.py ├── trainer.py └── util.py /.gitattributes: -------------------------------------------------------------------------------- 1 | anserini-*.jar filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | cache 2 | *.pyc 3 | *.log 4 | __pycache__ 5 | test*.py 6 | tmp*.py 7 | tmp*.sh 8 | discussions 9 | reviews 10 | backup 11 | ppts 12 | 13 | # Byte-compiled / optimized / DLL files 14 | __pycache__/ 15 | *.py[cod] 16 | *$py.class 17 | 18 | # C extensions 19 | *.so 20 | 21 | # auto-generated Sphinx api docs 22 | /docs/generated 23 | 24 | # Distribution / packaging 25 | .Python 26 | build/ 27 | develop-eggs/ 28 | dist/ 29 | downloads/ 30 | eggs/ 31 | .eggs/ 32 | lib/ 33 | lib64/ 34 | parts/ 35 | sdist/ 36 | var/ 37 | wheels/ 38 | pip-wheel-metadata/ 39 | share/python-wheels/ 40 | *.egg-info/ 41 | .installed.cfg 42 | *.egg 43 | MANIFEST 44 | 45 | # PyInstaller 46 | # Usually these files are written by a python script from a template 47 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 48 | *.manifest 49 | *.spec 50 | 51 | # Installer logs 52 | pip-log.txt 53 | pip-delete-this-directory.txt 54 | 55 | # Unit test / coverage reports 56 | htmlcov/ 57 | .tox/ 58 | .nox/ 59 | .coverage 60 | .coverage.* 61 | .cache 62 | nosetests.xml 63 | coverage.xml 64 | *.cover 65 | *.py,cover 66 | .hypothesis/ 67 | .pytest_cache/ 68 | 69 | # Translations 70 | *.mo 71 | *.pot 72 | 73 | # Django stuff: 74 | *.log 75 | local_settings.py 76 | db.sqlite3 77 | db.sqlite3-journal 78 | 79 | # Flask stuff: 80 | instance/ 81 | .webassets-cache 82 | 83 | # Scrapy stuff: 84 | .scrapy 85 | 86 | # Sphinx documentation 87 | docs/_build/ 88 | 89 | # Jupyter Notebook 90 | .ipynb_checkpoints 91 | 92 | # IPython 93 | profile_default/ 94 | ipython_config.py 95 | 96 | # pyenv 97 | .python-version 98 | 99 | # pipenv 100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 103 | # install all needed dependencies. 104 | #Pipfile.lock 105 | 106 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 107 | __pypackages__/ 108 | 109 | # Celery stuff 110 | celerybeat-schedule 111 | celerybeat.pid 112 | 113 | # SageMath parsed files 114 | *.sage.py 115 | 116 | # Environments 117 | .env 118 | .venv 119 | env/ 120 | venv/ 121 | ENV/ 122 | env.bak/ 123 | venv.bak/ 124 | 125 | # Spyder project settings 126 | .spyderproject 127 | .spyproject 128 | 129 | # Rope project settings 130 | .ropeproject 131 | 132 | # mkdocs documentation 133 | /site 134 | 135 | # mypy 136 | .mypy_cache/ 137 | .dmypy.json 138 | dmypy.json 139 | 140 | # Pyre type checker 141 | .pyre/ 142 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the version of Python and other tools you might need 9 | build: 10 | os: ubuntu-20.04 11 | tools: 12 | python: "3.9" 13 | 14 | # Build documentation in the docs/ directory with Sphinx 15 | sphinx: 16 | configuration: docs/conf.py 17 | 18 | # Optionally declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - requirements: docs/requirements.txt 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [year] [fullname] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Adon 2 | 3 | **This repo is deprecated.** 4 | 5 | Adon is an all-in-one python framework for **Ad**-h**o**c I**n**formation Retrieval. It is highlighted for the following properties: 6 | 7 | 1. **All-in-one.** Adon efficiently implements the entire ad-hoc retrieval pipeline: including the data loading, model training, encoding, indexing, and evaluation. Based on the this, Adon implements various neural retrieval/reranking models (Sparse, Dense, the newly proposed Generative, and the cross-encoder). One can easily train/evaluate a model with one line of command. 8 | 2. **Elastic.** The components of Adon are carefully designed to be highly extendable and independent with one another. You can focus on developing a specific part (e.g. encoding model) without inspecting details about the others (e.g. implementation of ANN indexes). You can also develop your own model within the Adon's framework by only adding a configuration file and a model's implementation. 9 | 10 | Please refer to [the docs](http://Adon.readthedocs.io/) for the detailed introduction and user guides. 11 | 12 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | 13 | # this is necessary for the local sphinx to find code from another directory 14 | # however in the 15 | import pathlib 16 | import sys 17 | # add the src folder to the code directory 18 | sys.path.insert(0, (pathlib.Path(__file__).parents[1].resolve() / "src").as_posix()) 19 | 20 | # -- Project information ----------------------------------------------------- 21 | 22 | project = "Adon" 23 | copyright = "2022, namespace-Pt" 24 | author = "namespace-Pt" 25 | 26 | extensions = [ 27 | "sphinx.ext.duration", 28 | "sphinx.ext.doctest", 29 | # for generating documents from docstrings 30 | "sphinx.ext.autodoc", 31 | "sphinx.ext.autosummary", 32 | "sphinx.ext.intersphinx", 33 | # for google docstring 34 | "sphinx.ext.napoleon", 35 | # for markdown parsing 36 | "myst_parser", 37 | # for parsing markdown emoji 38 | # "sphinxemoji.sphinxemoji", 39 | ] 40 | 41 | intersphinx_mapping = { 42 | "rtd": ("https://docs.readthedocs.io/en/stable/", None), 43 | "python": ("https://docs.python.org/3/", None), 44 | "sphinx": ("https://www.sphinx-doc.org/en/master/", None), 45 | } 46 | intersphinx_disabled_domains = ["std"] 47 | 48 | templates_path = ["_templates"] 49 | 50 | # -- Options for EPUB output 51 | epub_show_urls = "footnote" 52 | 53 | # List of patterns, relative to source directory, that match files and 54 | # directories to ignore when looking for source files. 55 | # This pattern also affects html_static_path and html_extra_path. 56 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "discussions", "reviews", "backup", "ppts"] 57 | 58 | # -- Options for HTML output ------------------------------------------------- 59 | 60 | # The theme to use for HTML and HTML Help pages. See the documentation for 61 | # a list of builtin themes. 62 | # 63 | html_theme = "sphinx_rtd_theme" 64 | 65 | # Add any paths that contain custom static files (such as style sheets) here, 66 | # relative to this directory. They are copied after the builtin static files, 67 | # so a file named "default.css" will overwrite the builtin "default.css". 68 | html_static_path = ["sources/_static"] 69 | html_css_files = ['custom.css'] 70 | 71 | # mock these packages exist 72 | autodoc_mock_imports = ["faiss", "torch", "torch_scatter", "transformers", "pandas", "omegaconf", "hydra", "psutil", "tqdm"] 73 | 74 | # do not change the order of classes or functions in the module 75 | autodoc_member_order = 'bysource' 76 | 77 | # add numbered references 78 | numfig = True -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to Adon's documentation! 2 | ========================================= 3 | 4 | .. note:: 5 | 6 | This project is under active development. The docs are not complete. 7 | 8 | 9 | Adon is an all-in-one python framework for Ad-hoc Information Retrieval. It is highlighted for the following properties: 10 | 11 | 1. **All-in-one.** Adon efficiently implements the entire ad-hoc retrieval pipeline: including the data loading, model training, encoding, indexing, and evaluation. Based on the this, Adon implements various neural retrieval/reranking models (Sparse, Dense, the newly proposed Generative, and the cross-encoder). One can easily train/evaluate a model with one line of command. 12 | 2. **Elastic.** The components of Adon are carefully designed to be highly extendable and independent with one another. You can focus on developing a specific part (e.g. encoding model) without inspecting details about the others (e.g. implementation of ANN indexes). You can also develop your own model within the Adon's framework by only adding a configuration file and a model's implementation. 13 | 14 | Before you start, make sure you installed the dependencies: 15 | 16 | .. code-block:: console 17 | 18 | export CUDA=11.6 19 | conda create -n adon python=3.9.12 20 | conda install pytorch==1.12.1 cudatoolkit=$CUDA -c conda-forge -c pytorch 21 | conda install faiss-gpu==1.7.2 -c conda-forge 22 | pip install torch_scatter -f https://data.pyg.org/whl/torch-1.12.0+$CUDA.html 23 | pip install transformers==4.21.3 hydra-core==1.2.0 notebook ipywidgets psutil 24 | 25 | .. note:: 26 | 27 | Make sure the cudatoolkit version matches your machine! 28 | 29 | Then clone the repository: 30 | 31 | .. code-block:: console 32 | 33 | git lfs install 34 | git clone https://github.com/namespace-Pt/Adon 35 | cd Adon 36 | 37 | Now, let's start the journey with :doc:`sources/guides/quick-start`. 38 | 39 | Contents 40 | -------- 41 | .. toctree:: 42 | :maxdepth: 2 43 | 44 | Home 45 | sources/guides/index 46 | sources/utils/index 47 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.in: -------------------------------------------------------------------------------- 1 | sphinx==4.4.0 2 | myst_parser==0.18.1 3 | docutils==0.16 4 | numpy==1.23.5 5 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile with Python 3.9 3 | # by the following command: 4 | # 5 | # pip-compile '.\requirements.in' 6 | # 7 | --index-url https://pypi.tuna.tsinghua.edu.cn/simple 8 | 9 | alabaster==0.7.12 10 | # via sphinx 11 | babel==2.10.3 12 | # via sphinx 13 | certifi==2022.6.15 14 | # via requests 15 | charset-normalizer==2.1.0 16 | # via requests 17 | colorama==0.4.6 18 | # via sphinx 19 | docutils==0.16 20 | # via 21 | # -r .\requirements.in 22 | # myst-parser 23 | # sphinx 24 | idna==3.3 25 | # via requests 26 | imagesize==1.4.1 27 | # via sphinx 28 | importlib-metadata==5.0.0 29 | # via sphinx 30 | jinja2==3.1.2 31 | # via 32 | # myst-parser 33 | # sphinx 34 | markdown-it-py==2.1.0 35 | # via 36 | # mdit-py-plugins 37 | # myst-parser 38 | markupsafe==2.1.1 39 | # via jinja2 40 | mdit-py-plugins==0.3.1 41 | # via myst-parser 42 | mdurl==0.1.2 43 | # via markdown-it-py 44 | myst-parser==0.18.1 45 | # via -r .\requirements.in 46 | numpy==1.23.5 47 | # via -r .\requirements.in 48 | packaging==21.3 49 | # via sphinx 50 | pygments==2.12.0 51 | # via sphinx 52 | pyparsing==3.0.9 53 | # via packaging 54 | pytz==2022.1 55 | # via babel 56 | pyyaml==6.0 57 | # via myst-parser 58 | requests==2.28.1 59 | # via sphinx 60 | snowballstemmer==2.2.0 61 | # via sphinx 62 | sphinx==4.4.0 63 | # via 64 | # -r .\requirements.in 65 | # myst-parser 66 | sphinxcontrib-applehelp==1.0.2 67 | # via sphinx 68 | sphinxcontrib-devhelp==1.0.2 69 | # via sphinx 70 | sphinxcontrib-htmlhelp==2.0.0 71 | # via sphinx 72 | sphinxcontrib-jsmath==1.0.1 73 | # via sphinx 74 | sphinxcontrib-qthelp==1.0.3 75 | # via sphinx 76 | sphinxcontrib-serializinghtml==1.1.5 77 | # via sphinx 78 | typing-extensions==4.4.0 79 | # via myst-parser 80 | urllib3==1.26.9 81 | # via requests 82 | zipp==3.10.0 83 | # via importlib-metadata 84 | -------------------------------------------------------------------------------- /docs/sources/_static/custom.css: -------------------------------------------------------------------------------- 1 | .wy-nav-content { 2 | max-width: 90% !important; 3 | } -------------------------------------------------------------------------------- /docs/sources/guides/bi-phase-ivfpq.md: -------------------------------------------------------------------------------- 1 | # Bi-Phase IVFPQ 2 | This tutorial explains how to reproduce our paper [Bi-Phase IVFPQ](https://arxiv.org/abs/2210.05521) on MSMARCO passage collection. 3 | 4 | ## Reproducing From Checkpoint 5 | 1. Make sure you finished the data processing steps in {doc}`data`. Then you should download the checkpoint and the necessary index files on [OneDrive](https://1drv.ms/u/s!Aipk4vd2SBrtwAuOh3NWVBlh-1AE?e=cX0nXJ). The uncompressed files would look like: 6 | ``` 7 | Bi-Phase-IVFPQ-MSMARCO 8 | ├── ckpts 9 | │ ├── DistillVQ_d-RetroMAE 10 | │ │ └── best 11 | │ ├── TokIVF 12 | │ │ └── best 13 | │ └── TopIVF 14 | │ └── best 15 | └── index 16 | └── RetroMAE 17 | └── faiss 18 | ├── IVF10000,PQ64x8 19 | └── OPQ96,PQ96x8 20 | ``` 21 | Move the `ckpts/*` to `src/data/cache/MSMARCO-passage/ckpts/`. Move the `index/*` to `src/data/cache/MSMARCO-passage/index/`. 22 | 23 | 2. Since Bi-IVFPQ is a general IVFPQ framework. It relies on off-the-shelf embeddings to work. Here we use the [distilled RetroMAE](https://arxiv.org/abs/2205.12035) as the embedding model. 24 | We encode all documents and queries using RetroMAE and save the resulted embeddings: 25 | ```bash 26 | # uses four gpus 27 | torchrun --nproc_per_node=4 run.py RetroMAE ++mode=eval ++plm=retromae_distill ++save_encode 28 | ``` 29 | The resulted file will be stored at `src/data/cache/MSMARCO-passage/encode/RetroMAE/`. The evaluation defaults to use the `Flat` index that scans the database for each query. The metrics should be similar to: 30 | |MRR@10|Recall@10|Recall@100|Recall@1000| 31 | |:-:|:-:|:-:|:-:| 32 | |0.4155|0.708|0.9268|0.9876| 33 | 3. **Prepare PQ module.** 34 | ```bash 35 | python run.py DistillVQ_d-RetroMAE ++mode=eval ++save_index 36 | ``` 37 | This evaluates the performance of exaustive PQ with 96 subvectors, whose metrics should be similar to: 38 | |MRR@10|Recall@10|Recall@100|Recall@1000| 39 | |:-:|:-:|:-:|:-:| 40 | 0.3993|0.6846|0.9207|0.9845| 41 | 42 | 4. **Prepare Topic IVF.** 43 | ```bash 44 | python run.py TopIVF ++mode=eval 45 | ``` 46 | This evaluates the performance of topic-phase IVF followed by PQ verification when selecting `20` topics for each query. The metrics should be 47 | |MRR@10|Recall@10|Recall@100|Recall@1000| 48 | |:-:|:-:|:-:|:-:| 49 | 0.355|0.5947|0.7917|0.8423|19900| 50 | 51 | 5. **Prepare Term IVF.** 52 | ```bash 53 | # you should use more gpus than TopIVF because TokIVF involves a BERT and hence heavier 54 | torchrun --nproc_per_node=4 run.py TokIVF ++mode=eval ++save_encode 55 | ``` 56 | - If you encounter memory issues when building the inverted index, please run the above command with `++index_shard=64 ++index_thread=5` (increase shard number and decrease parallel process number). The default values are specified at `src/data/config/index/invhit.yaml`. 57 | 58 | This evaluates the performance of term-phase IVF followed by PQ verification when selecting `3` terms for each document. The metrics should be 59 | |MRR@10|Recall@10|Recall@100|Recall@1000| 60 | |:-:|:-:|:-:|:-:| 61 | |0.395|0.6741|0.8864|0.9315| 62 | 63 | 6. **Chain Them Together.** 64 | ```bash 65 | torchrun --nproc_per_node=4 run.py BIVFPQ 66 | ``` 67 | This evaluates Bi-phase IVFPQ under its default settings: **3 terms** and **1 topic** for each document to index, **all included terms** and **20 topics** for each query to search. The results should be: 68 | |MRR@10|Recall@10|Recall@100|Recall@1000| 69 | |:-:|:-:|:-:|:-:| 70 | |0.3986|0.6829|0.9144|0.9742| 71 | 72 | You can easily try different configurations: 73 | ```bash 74 | # index 5 terms for each document 75 | torchrun --nproc_per_node=4 run.py BIVFPQ ++x_text_gate_k=5 76 | # search 10 topics for each query 77 | torchrun --nproc_per_node=4 run.py BIVFPQ ++y_query_gate_k=10 78 | ``` 79 | You can inspect `src/data/config/BIVFPQ.yaml` for more details. -------------------------------------------------------------------------------- /docs/sources/guides/configs.md: -------------------------------------------------------------------------------- 1 | # Configs 2 | -------------------------------------------------------------------------------- /docs/sources/guides/index.rst: -------------------------------------------------------------------------------- 1 | Guides 2 | ======= 3 | 4 | In this section, you'll learn the basic features of Uni-Retriever. 5 | 6 | .. toctree:: 7 | :glob: 8 | 9 | data 10 | configs 11 | quick-start 12 | bi-phase-ivfpq -------------------------------------------------------------------------------- /docs/sources/guides/lecard.md: -------------------------------------------------------------------------------- 1 | # LeCaRD类案检索 2 | 3 | 本文档涵盖了如何在中文预料——[LeCaRD类案检索数据集](https://github.com/myx666/LeCaRD)上训练基本的Dense Retriever(DPR)。 4 | 5 | 6 | 7 | ```bash 8 | python run.py DPR base=LECARD ++plm=bert-chinese ++batch_size=4 9 | 10 | python run.py CrossEnc base=LECARD ++query_length=256 11 | ``` 12 | -------------------------------------------------------------------------------- /docs/sources/models/BaseModel.rst: -------------------------------------------------------------------------------- 1 | BaseModel 2 | ========= 3 | 4 | BaseModel 5 | --------- 6 | 7 | .. autoclass:: models.BaseModel.BaseModel 8 | :members: 9 | :private-members: 10 | :special-members: __init__ 11 | 12 | BaseSparseModel 13 | --------------- 14 | 15 | .. autoclass:: models.BaseModel.BaseSparseModel 16 | :members: 17 | :private-members: 18 | :special-members: __init__ 19 | 20 | BaseDenseModel 21 | -------------- 22 | 23 | .. autoclass:: models.BaseModel.BaseDenseModel 24 | :members: 25 | :private-members: 26 | :special-members: __init__ 27 | 28 | BaseGenerativeModel 29 | ------------------- 30 | 31 | .. autoclass:: models.BaseModel.BaseGenerativeModel 32 | :members: 33 | :private-members: 34 | :special-members: __init__ 35 | -------------------------------------------------------------------------------- /docs/sources/models/index.rst: -------------------------------------------------------------------------------- 1 | Models 2 | ====== 3 | 4 | .. toctree:: 5 | :glob: 6 | 7 | * -------------------------------------------------------------------------------- /docs/sources/scripts/index.rst: -------------------------------------------------------------------------------- 1 | Scripts 2 | ======= 3 | 4 | .. toctree:: 5 | :glob: 6 | 7 | preprocess 8 | negative -------------------------------------------------------------------------------- /docs/sources/scripts/negative.rst: -------------------------------------------------------------------------------- 1 | negative.py 2 | =========== 3 | 4 | .. automodule:: scripts.negative 5 | :members: 6 | :private-members: -------------------------------------------------------------------------------- /docs/sources/scripts/preprocess.rst: -------------------------------------------------------------------------------- 1 | preprocess.py 2 | ============= 3 | 4 | .. automodule:: scripts.preprocess 5 | :members: 6 | :private-members: -------------------------------------------------------------------------------- /docs/sources/utils/index.rst: -------------------------------------------------------------------------------- 1 | Utils 2 | ===== 3 | 4 | .. toctree:: 5 | :glob: 6 | 7 | * -------------------------------------------------------------------------------- /docs/sources/utils/my_index.rst: -------------------------------------------------------------------------------- 1 | Index 2 | ===== 3 | 4 | .. automodule:: utils.index 5 | :members: 6 | :special-members: __init__ 7 | :private-members: -------------------------------------------------------------------------------- /docs/sources/utils/util.rst: -------------------------------------------------------------------------------- 1 | Util 2 | ==== 3 | 4 | .. automodule:: utils.util 5 | :members: 6 | :private-members: 7 | :special-members: __init__ 8 | -------------------------------------------------------------------------------- /src/anserini/target/anserini-0.15.1-SNAPSHOT-fatjar.jar: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:451a0a1878c9351a5f1b9c0c2a48c96149e89dd19d353e21ad571b16e345a4fe 3 | size 135226095 4 | -------------------------------------------------------------------------------- /src/anserini/target/anserini-0.15.1-SNAPSHOT.jar: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:69c94ec169d31284d484ba1eaaaaf0efdc21258a3e10643474132fc35b07d618 3 | size 64258109 4 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/bin/ApproximateNearestNeighborEval.bat: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Copyright 2001-2006 The Apache Software Foundation. 3 | @REM 4 | @REM Licensed under the Apache License, Version 2.0 (the "License"); 5 | @REM you may not use this file except in compliance with the License. 6 | @REM You may obtain a copy of the License at 7 | @REM 8 | @REM http://www.apache.org/licenses/LICENSE-2.0 9 | @REM 10 | @REM Unless required by applicable law or agreed to in writing, software 11 | @REM distributed under the License is distributed on an "AS IS" BASIS, 12 | @REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @REM See the License for the specific language governing permissions and 14 | @REM limitations under the License. 15 | @REM ---------------------------------------------------------------------------- 16 | @REM 17 | @REM Copyright (c) 2001-2006 The Apache Software Foundation. All rights 18 | @REM reserved. 19 | 20 | @echo off 21 | 22 | set ERROR_CODE=0 23 | 24 | :init 25 | @REM Decide how to startup depending on the version of windows 26 | 27 | @REM -- Win98ME 28 | if NOT "%OS%"=="Windows_NT" goto Win9xArg 29 | 30 | @REM set local scope for the variables with windows NT shell 31 | if "%OS%"=="Windows_NT" @setlocal 32 | 33 | @REM -- 4NT shell 34 | if "%eval[2+2]" == "4" goto 4NTArgs 35 | 36 | @REM -- Regular WinNT shell 37 | set CMD_LINE_ARGS=%* 38 | goto WinNTGetScriptDir 39 | 40 | @REM The 4NT Shell from jp software 41 | :4NTArgs 42 | set CMD_LINE_ARGS=%$ 43 | goto WinNTGetScriptDir 44 | 45 | :Win9xArg 46 | @REM Slurp the command line arguments. This loop allows for an unlimited number 47 | @REM of arguments (up to the command line limit, anyway). 48 | set CMD_LINE_ARGS= 49 | :Win9xApp 50 | if %1a==a goto Win9xGetScriptDir 51 | set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1 52 | shift 53 | goto Win9xApp 54 | 55 | :Win9xGetScriptDir 56 | set SAVEDIR=%CD% 57 | %0\ 58 | cd %0\..\.. 59 | set BASEDIR=%CD% 60 | cd %SAVEDIR% 61 | set SAVE_DIR= 62 | goto repoSetup 63 | 64 | :WinNTGetScriptDir 65 | for %%i in ("%~dp0..") do set "BASEDIR=%%~fi" 66 | 67 | :repoSetup 68 | set REPO= 69 | 70 | 71 | if "%JAVACMD%"=="" set JAVACMD=java 72 | 73 | if "%REPO%"=="" set REPO=%BASEDIR%\repo 74 | 75 | set CLASSPATH="%BASEDIR%"\etc;"%REPO%"\* 76 | 77 | set ENDORSED_DIR= 78 | if NOT "%ENDORSED_DIR%" == "" set CLASSPATH="%BASEDIR%"\%ENDORSED_DIR%\*;%CLASSPATH% 79 | 80 | if NOT "%CLASSPATH_PREFIX%" == "" set CLASSPATH=%CLASSPATH_PREFIX%;%CLASSPATH% 81 | 82 | @REM Reaching here means variables are defined and arguments have been captured 83 | :endInit 84 | 85 | %JAVACMD% %JAVA_OPTS% -Xms512M -Xmx31G -classpath %CLASSPATH% -Dapp.name="ApproximateNearestNeighborEval" -Dapp.repo="%REPO%" -Dapp.home="%BASEDIR%" -Dbasedir="%BASEDIR%" io.anserini.ann.ApproximateNearestNeighborEval %CMD_LINE_ARGS% 86 | if %ERRORLEVEL% NEQ 0 goto error 87 | goto end 88 | 89 | :error 90 | if "%OS%"=="Windows_NT" @endlocal 91 | set ERROR_CODE=%ERRORLEVEL% 92 | 93 | :end 94 | @REM set local scope for the variables with windows NT shell 95 | if "%OS%"=="Windows_NT" goto endNT 96 | 97 | @REM For old DOS remove the set variables from ENV - we assume they were not set 98 | @REM before we started - at least we don't leave any baggage around 99 | set CMD_LINE_ARGS= 100 | goto postExec 101 | 102 | :endNT 103 | @REM If error code is set to 1 then the endlocal was done already in :error. 104 | if %ERROR_CODE% EQU 0 @endlocal 105 | 106 | 107 | :postExec 108 | 109 | if "%FORCE_EXIT_ON_ERROR%" == "on" ( 110 | if %ERROR_CODE% NEQ 0 exit %ERROR_CODE% 111 | ) 112 | 113 | exit /B %ERROR_CODE% 114 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/bin/ApproximateNearestNeighborSearch.bat: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Copyright 2001-2006 The Apache Software Foundation. 3 | @REM 4 | @REM Licensed under the Apache License, Version 2.0 (the "License"); 5 | @REM you may not use this file except in compliance with the License. 6 | @REM You may obtain a copy of the License at 7 | @REM 8 | @REM http://www.apache.org/licenses/LICENSE-2.0 9 | @REM 10 | @REM Unless required by applicable law or agreed to in writing, software 11 | @REM distributed under the License is distributed on an "AS IS" BASIS, 12 | @REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @REM See the License for the specific language governing permissions and 14 | @REM limitations under the License. 15 | @REM ---------------------------------------------------------------------------- 16 | @REM 17 | @REM Copyright (c) 2001-2006 The Apache Software Foundation. All rights 18 | @REM reserved. 19 | 20 | @echo off 21 | 22 | set ERROR_CODE=0 23 | 24 | :init 25 | @REM Decide how to startup depending on the version of windows 26 | 27 | @REM -- Win98ME 28 | if NOT "%OS%"=="Windows_NT" goto Win9xArg 29 | 30 | @REM set local scope for the variables with windows NT shell 31 | if "%OS%"=="Windows_NT" @setlocal 32 | 33 | @REM -- 4NT shell 34 | if "%eval[2+2]" == "4" goto 4NTArgs 35 | 36 | @REM -- Regular WinNT shell 37 | set CMD_LINE_ARGS=%* 38 | goto WinNTGetScriptDir 39 | 40 | @REM The 4NT Shell from jp software 41 | :4NTArgs 42 | set CMD_LINE_ARGS=%$ 43 | goto WinNTGetScriptDir 44 | 45 | :Win9xArg 46 | @REM Slurp the command line arguments. This loop allows for an unlimited number 47 | @REM of arguments (up to the command line limit, anyway). 48 | set CMD_LINE_ARGS= 49 | :Win9xApp 50 | if %1a==a goto Win9xGetScriptDir 51 | set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1 52 | shift 53 | goto Win9xApp 54 | 55 | :Win9xGetScriptDir 56 | set SAVEDIR=%CD% 57 | %0\ 58 | cd %0\..\.. 59 | set BASEDIR=%CD% 60 | cd %SAVEDIR% 61 | set SAVE_DIR= 62 | goto repoSetup 63 | 64 | :WinNTGetScriptDir 65 | for %%i in ("%~dp0..") do set "BASEDIR=%%~fi" 66 | 67 | :repoSetup 68 | set REPO= 69 | 70 | 71 | if "%JAVACMD%"=="" set JAVACMD=java 72 | 73 | if "%REPO%"=="" set REPO=%BASEDIR%\repo 74 | 75 | set CLASSPATH="%BASEDIR%"\etc;"%REPO%"\* 76 | 77 | set ENDORSED_DIR= 78 | if NOT "%ENDORSED_DIR%" == "" set CLASSPATH="%BASEDIR%"\%ENDORSED_DIR%\*;%CLASSPATH% 79 | 80 | if NOT "%CLASSPATH_PREFIX%" == "" set CLASSPATH=%CLASSPATH_PREFIX%;%CLASSPATH% 81 | 82 | @REM Reaching here means variables are defined and arguments have been captured 83 | :endInit 84 | 85 | %JAVACMD% %JAVA_OPTS% -Xms512M -Xmx31G -classpath %CLASSPATH% -Dapp.name="ApproximateNearestNeighborSearch" -Dapp.repo="%REPO%" -Dapp.home="%BASEDIR%" -Dbasedir="%BASEDIR%" io.anserini.ann.ApproximateNearestNeighborSearch %CMD_LINE_ARGS% 86 | if %ERRORLEVEL% NEQ 0 goto error 87 | goto end 88 | 89 | :error 90 | if "%OS%"=="Windows_NT" @endlocal 91 | set ERROR_CODE=%ERRORLEVEL% 92 | 93 | :end 94 | @REM set local scope for the variables with windows NT shell 95 | if "%OS%"=="Windows_NT" goto endNT 96 | 97 | @REM For old DOS remove the set variables from ENV - we assume they were not set 98 | @REM before we started - at least we don't leave any baggage around 99 | set CMD_LINE_ARGS= 100 | goto postExec 101 | 102 | :endNT 103 | @REM If error code is set to 1 then the endlocal was done already in :error. 104 | if %ERROR_CODE% EQU 0 @endlocal 105 | 106 | 107 | :postExec 108 | 109 | if "%FORCE_EXIT_ON_ERROR%" == "on" ( 110 | if %ERROR_CODE% NEQ 0 exit %ERROR_CODE% 111 | ) 112 | 113 | exit /B %ERROR_CODE% 114 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/bin/DumpAnalyzedQueries.bat: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Copyright 2001-2006 The Apache Software Foundation. 3 | @REM 4 | @REM Licensed under the Apache License, Version 2.0 (the "License"); 5 | @REM you may not use this file except in compliance with the License. 6 | @REM You may obtain a copy of the License at 7 | @REM 8 | @REM http://www.apache.org/licenses/LICENSE-2.0 9 | @REM 10 | @REM Unless required by applicable law or agreed to in writing, software 11 | @REM distributed under the License is distributed on an "AS IS" BASIS, 12 | @REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @REM See the License for the specific language governing permissions and 14 | @REM limitations under the License. 15 | @REM ---------------------------------------------------------------------------- 16 | @REM 17 | @REM Copyright (c) 2001-2006 The Apache Software Foundation. All rights 18 | @REM reserved. 19 | 20 | @echo off 21 | 22 | set ERROR_CODE=0 23 | 24 | :init 25 | @REM Decide how to startup depending on the version of windows 26 | 27 | @REM -- Win98ME 28 | if NOT "%OS%"=="Windows_NT" goto Win9xArg 29 | 30 | @REM set local scope for the variables with windows NT shell 31 | if "%OS%"=="Windows_NT" @setlocal 32 | 33 | @REM -- 4NT shell 34 | if "%eval[2+2]" == "4" goto 4NTArgs 35 | 36 | @REM -- Regular WinNT shell 37 | set CMD_LINE_ARGS=%* 38 | goto WinNTGetScriptDir 39 | 40 | @REM The 4NT Shell from jp software 41 | :4NTArgs 42 | set CMD_LINE_ARGS=%$ 43 | goto WinNTGetScriptDir 44 | 45 | :Win9xArg 46 | @REM Slurp the command line arguments. This loop allows for an unlimited number 47 | @REM of arguments (up to the command line limit, anyway). 48 | set CMD_LINE_ARGS= 49 | :Win9xApp 50 | if %1a==a goto Win9xGetScriptDir 51 | set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1 52 | shift 53 | goto Win9xApp 54 | 55 | :Win9xGetScriptDir 56 | set SAVEDIR=%CD% 57 | %0\ 58 | cd %0\..\.. 59 | set BASEDIR=%CD% 60 | cd %SAVEDIR% 61 | set SAVE_DIR= 62 | goto repoSetup 63 | 64 | :WinNTGetScriptDir 65 | for %%i in ("%~dp0..") do set "BASEDIR=%%~fi" 66 | 67 | :repoSetup 68 | set REPO= 69 | 70 | 71 | if "%JAVACMD%"=="" set JAVACMD=java 72 | 73 | if "%REPO%"=="" set REPO=%BASEDIR%\repo 74 | 75 | set CLASSPATH="%BASEDIR%"\etc;"%REPO%"\* 76 | 77 | set ENDORSED_DIR= 78 | if NOT "%ENDORSED_DIR%" == "" set CLASSPATH="%BASEDIR%"\%ENDORSED_DIR%\*;%CLASSPATH% 79 | 80 | if NOT "%CLASSPATH_PREFIX%" == "" set CLASSPATH=%CLASSPATH_PREFIX%;%CLASSPATH% 81 | 82 | @REM Reaching here means variables are defined and arguments have been captured 83 | :endInit 84 | 85 | %JAVACMD% %JAVA_OPTS% -Xms512M -Xmx31G -classpath %CLASSPATH% -Dapp.name="DumpAnalyzedQueries" -Dapp.repo="%REPO%" -Dapp.home="%BASEDIR%" -Dbasedir="%BASEDIR%" io.anserini.util.DumpAnalyzedQueries %CMD_LINE_ARGS% 86 | if %ERRORLEVEL% NEQ 0 goto error 87 | goto end 88 | 89 | :error 90 | if "%OS%"=="Windows_NT" @endlocal 91 | set ERROR_CODE=%ERRORLEVEL% 92 | 93 | :end 94 | @REM set local scope for the variables with windows NT shell 95 | if "%OS%"=="Windows_NT" goto endNT 96 | 97 | @REM For old DOS remove the set variables from ENV - we assume they were not set 98 | @REM before we started - at least we don't leave any baggage around 99 | set CMD_LINE_ARGS= 100 | goto postExec 101 | 102 | :endNT 103 | @REM If error code is set to 1 then the endlocal was done already in :error. 104 | if %ERROR_CODE% EQU 0 @endlocal 105 | 106 | 107 | :postExec 108 | 109 | if "%FORCE_EXIT_ON_ERROR%" == "on" ( 110 | if %ERROR_CODE% NEQ 0 exit %ERROR_CODE% 111 | ) 112 | 113 | exit /B %ERROR_CODE% 114 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/bin/ExtractAverageDocumentLength.bat: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Copyright 2001-2006 The Apache Software Foundation. 3 | @REM 4 | @REM Licensed under the Apache License, Version 2.0 (the "License"); 5 | @REM you may not use this file except in compliance with the License. 6 | @REM You may obtain a copy of the License at 7 | @REM 8 | @REM http://www.apache.org/licenses/LICENSE-2.0 9 | @REM 10 | @REM Unless required by applicable law or agreed to in writing, software 11 | @REM distributed under the License is distributed on an "AS IS" BASIS, 12 | @REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @REM See the License for the specific language governing permissions and 14 | @REM limitations under the License. 15 | @REM ---------------------------------------------------------------------------- 16 | @REM 17 | @REM Copyright (c) 2001-2006 The Apache Software Foundation. All rights 18 | @REM reserved. 19 | 20 | @echo off 21 | 22 | set ERROR_CODE=0 23 | 24 | :init 25 | @REM Decide how to startup depending on the version of windows 26 | 27 | @REM -- Win98ME 28 | if NOT "%OS%"=="Windows_NT" goto Win9xArg 29 | 30 | @REM set local scope for the variables with windows NT shell 31 | if "%OS%"=="Windows_NT" @setlocal 32 | 33 | @REM -- 4NT shell 34 | if "%eval[2+2]" == "4" goto 4NTArgs 35 | 36 | @REM -- Regular WinNT shell 37 | set CMD_LINE_ARGS=%* 38 | goto WinNTGetScriptDir 39 | 40 | @REM The 4NT Shell from jp software 41 | :4NTArgs 42 | set CMD_LINE_ARGS=%$ 43 | goto WinNTGetScriptDir 44 | 45 | :Win9xArg 46 | @REM Slurp the command line arguments. This loop allows for an unlimited number 47 | @REM of arguments (up to the command line limit, anyway). 48 | set CMD_LINE_ARGS= 49 | :Win9xApp 50 | if %1a==a goto Win9xGetScriptDir 51 | set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1 52 | shift 53 | goto Win9xApp 54 | 55 | :Win9xGetScriptDir 56 | set SAVEDIR=%CD% 57 | %0\ 58 | cd %0\..\.. 59 | set BASEDIR=%CD% 60 | cd %SAVEDIR% 61 | set SAVE_DIR= 62 | goto repoSetup 63 | 64 | :WinNTGetScriptDir 65 | for %%i in ("%~dp0..") do set "BASEDIR=%%~fi" 66 | 67 | :repoSetup 68 | set REPO= 69 | 70 | 71 | if "%JAVACMD%"=="" set JAVACMD=java 72 | 73 | if "%REPO%"=="" set REPO=%BASEDIR%\repo 74 | 75 | set CLASSPATH="%BASEDIR%"\etc;"%REPO%"\* 76 | 77 | set ENDORSED_DIR= 78 | if NOT "%ENDORSED_DIR%" == "" set CLASSPATH="%BASEDIR%"\%ENDORSED_DIR%\*;%CLASSPATH% 79 | 80 | if NOT "%CLASSPATH_PREFIX%" == "" set CLASSPATH=%CLASSPATH_PREFIX%;%CLASSPATH% 81 | 82 | @REM Reaching here means variables are defined and arguments have been captured 83 | :endInit 84 | 85 | %JAVACMD% %JAVA_OPTS% -Xms512M -Xmx31G -classpath %CLASSPATH% -Dapp.name="ExtractAverageDocumentLength" -Dapp.repo="%REPO%" -Dapp.home="%BASEDIR%" -Dbasedir="%BASEDIR%" io.anserini.util.ExtractAverageDocumentLength %CMD_LINE_ARGS% 86 | if %ERRORLEVEL% NEQ 0 goto error 87 | goto end 88 | 89 | :error 90 | if "%OS%"=="Windows_NT" @endlocal 91 | set ERROR_CODE=%ERRORLEVEL% 92 | 93 | :end 94 | @REM set local scope for the variables with windows NT shell 95 | if "%OS%"=="Windows_NT" goto endNT 96 | 97 | @REM For old DOS remove the set variables from ENV - we assume they were not set 98 | @REM before we started - at least we don't leave any baggage around 99 | set CMD_LINE_ARGS= 100 | goto postExec 101 | 102 | :endNT 103 | @REM If error code is set to 1 then the endlocal was done already in :error. 104 | if %ERROR_CODE% EQU 0 @endlocal 105 | 106 | 107 | :postExec 108 | 109 | if "%FORCE_EXIT_ON_ERROR%" == "on" ( 110 | if %ERROR_CODE% NEQ 0 exit %ERROR_CODE% 111 | ) 112 | 113 | exit /B %ERROR_CODE% 114 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/bin/ExtractDocumentLengths.bat: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Copyright 2001-2006 The Apache Software Foundation. 3 | @REM 4 | @REM Licensed under the Apache License, Version 2.0 (the "License"); 5 | @REM you may not use this file except in compliance with the License. 6 | @REM You may obtain a copy of the License at 7 | @REM 8 | @REM http://www.apache.org/licenses/LICENSE-2.0 9 | @REM 10 | @REM Unless required by applicable law or agreed to in writing, software 11 | @REM distributed under the License is distributed on an "AS IS" BASIS, 12 | @REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @REM See the License for the specific language governing permissions and 14 | @REM limitations under the License. 15 | @REM ---------------------------------------------------------------------------- 16 | @REM 17 | @REM Copyright (c) 2001-2006 The Apache Software Foundation. All rights 18 | @REM reserved. 19 | 20 | @echo off 21 | 22 | set ERROR_CODE=0 23 | 24 | :init 25 | @REM Decide how to startup depending on the version of windows 26 | 27 | @REM -- Win98ME 28 | if NOT "%OS%"=="Windows_NT" goto Win9xArg 29 | 30 | @REM set local scope for the variables with windows NT shell 31 | if "%OS%"=="Windows_NT" @setlocal 32 | 33 | @REM -- 4NT shell 34 | if "%eval[2+2]" == "4" goto 4NTArgs 35 | 36 | @REM -- Regular WinNT shell 37 | set CMD_LINE_ARGS=%* 38 | goto WinNTGetScriptDir 39 | 40 | @REM The 4NT Shell from jp software 41 | :4NTArgs 42 | set CMD_LINE_ARGS=%$ 43 | goto WinNTGetScriptDir 44 | 45 | :Win9xArg 46 | @REM Slurp the command line arguments. This loop allows for an unlimited number 47 | @REM of arguments (up to the command line limit, anyway). 48 | set CMD_LINE_ARGS= 49 | :Win9xApp 50 | if %1a==a goto Win9xGetScriptDir 51 | set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1 52 | shift 53 | goto Win9xApp 54 | 55 | :Win9xGetScriptDir 56 | set SAVEDIR=%CD% 57 | %0\ 58 | cd %0\..\.. 59 | set BASEDIR=%CD% 60 | cd %SAVEDIR% 61 | set SAVE_DIR= 62 | goto repoSetup 63 | 64 | :WinNTGetScriptDir 65 | for %%i in ("%~dp0..") do set "BASEDIR=%%~fi" 66 | 67 | :repoSetup 68 | set REPO= 69 | 70 | 71 | if "%JAVACMD%"=="" set JAVACMD=java 72 | 73 | if "%REPO%"=="" set REPO=%BASEDIR%\repo 74 | 75 | set CLASSPATH="%BASEDIR%"\etc;"%REPO%"\* 76 | 77 | set ENDORSED_DIR= 78 | if NOT "%ENDORSED_DIR%" == "" set CLASSPATH="%BASEDIR%"\%ENDORSED_DIR%\*;%CLASSPATH% 79 | 80 | if NOT "%CLASSPATH_PREFIX%" == "" set CLASSPATH=%CLASSPATH_PREFIX%;%CLASSPATH% 81 | 82 | @REM Reaching here means variables are defined and arguments have been captured 83 | :endInit 84 | 85 | %JAVACMD% %JAVA_OPTS% -Xms512M -Xmx31G -classpath %CLASSPATH% -Dapp.name="ExtractDocumentLengths" -Dapp.repo="%REPO%" -Dapp.home="%BASEDIR%" -Dbasedir="%BASEDIR%" io.anserini.util.ExtractDocumentLengths %CMD_LINE_ARGS% 86 | if %ERRORLEVEL% NEQ 0 goto error 87 | goto end 88 | 89 | :error 90 | if "%OS%"=="Windows_NT" @endlocal 91 | set ERROR_CODE=%ERRORLEVEL% 92 | 93 | :end 94 | @REM set local scope for the variables with windows NT shell 95 | if "%OS%"=="Windows_NT" goto endNT 96 | 97 | @REM For old DOS remove the set variables from ENV - we assume they were not set 98 | @REM before we started - at least we don't leave any baggage around 99 | set CMD_LINE_ARGS= 100 | goto postExec 101 | 102 | :endNT 103 | @REM If error code is set to 1 then the endlocal was done already in :error. 104 | if %ERROR_CODE% EQU 0 @endlocal 105 | 106 | 107 | :postExec 108 | 109 | if "%FORCE_EXIT_ON_ERROR%" == "on" ( 110 | if %ERROR_CODE% NEQ 0 exit %ERROR_CODE% 111 | ) 112 | 113 | exit /B %ERROR_CODE% 114 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/bin/ExtractNorms: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2001-2006 The Apache Software Foundation. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ---------------------------------------------------------------------------- 17 | # 18 | # Copyright (c) 2001-2006 The Apache Software Foundation. All rights 19 | # reserved. 20 | 21 | 22 | # resolve links - $0 may be a softlink 23 | PRG="$0" 24 | 25 | while [ -h "$PRG" ]; do 26 | ls=`ls -ld "$PRG"` 27 | link=`expr "$ls" : '.*-> \(.*\)$'` 28 | if expr "$link" : '/.*' > /dev/null; then 29 | PRG="$link" 30 | else 31 | PRG=`dirname "$PRG"`/"$link" 32 | fi 33 | done 34 | 35 | PRGDIR=`dirname "$PRG"` 36 | BASEDIR=`cd "$PRGDIR/.." >/dev/null; pwd` 37 | 38 | # Reset the REPO variable. If you need to influence this use the environment setup file. 39 | REPO= 40 | 41 | 42 | # OS specific support. $var _must_ be set to either true or false. 43 | cygwin=false; 44 | darwin=false; 45 | case "`uname`" in 46 | CYGWIN*) cygwin=true ;; 47 | Darwin*) darwin=true 48 | if [ -z "$JAVA_VERSION" ] ; then 49 | JAVA_VERSION="CurrentJDK" 50 | else 51 | echo "Using Java version: $JAVA_VERSION" 52 | fi 53 | if [ -z "$JAVA_HOME" ]; then 54 | if [ -x "/usr/libexec/java_home" ]; then 55 | JAVA_HOME=`/usr/libexec/java_home` 56 | else 57 | JAVA_HOME=/System/Library/Frameworks/JavaVM.framework/Versions/${JAVA_VERSION}/Home 58 | fi 59 | fi 60 | ;; 61 | esac 62 | 63 | if [ -z "$JAVA_HOME" ] ; then 64 | if [ -r /etc/gentoo-release ] ; then 65 | JAVA_HOME=`java-config --jre-home` 66 | fi 67 | fi 68 | 69 | # For Cygwin, ensure paths are in UNIX format before anything is touched 70 | if $cygwin ; then 71 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"` 72 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --unix "$CLASSPATH"` 73 | fi 74 | 75 | # If a specific java binary isn't specified search for the standard 'java' binary 76 | if [ -z "$JAVACMD" ] ; then 77 | if [ -n "$JAVA_HOME" ] ; then 78 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 79 | # IBM's JDK on AIX uses strange locations for the executables 80 | JAVACMD="$JAVA_HOME/jre/sh/java" 81 | else 82 | JAVACMD="$JAVA_HOME/bin/java" 83 | fi 84 | else 85 | JAVACMD=`which java` 86 | fi 87 | fi 88 | 89 | if [ ! -x "$JAVACMD" ] ; then 90 | echo "Error: JAVA_HOME is not defined correctly." 1>&2 91 | echo " We cannot execute $JAVACMD" 1>&2 92 | exit 1 93 | fi 94 | 95 | if [ -z "$REPO" ] 96 | then 97 | REPO="$BASEDIR"/repo 98 | fi 99 | 100 | CLASSPATH="$BASEDIR"/etc:"$REPO"/* 101 | 102 | ENDORSED_DIR= 103 | if [ -n "$ENDORSED_DIR" ] ; then 104 | CLASSPATH=$BASEDIR/$ENDORSED_DIR/*:$CLASSPATH 105 | fi 106 | 107 | if [ -n "$CLASSPATH_PREFIX" ] ; then 108 | CLASSPATH=$CLASSPATH_PREFIX:$CLASSPATH 109 | fi 110 | 111 | # For Cygwin, switch paths to Windows format before running java 112 | if $cygwin; then 113 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --windows "$CLASSPATH"` 114 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"` 115 | [ -n "$HOME" ] && HOME=`cygpath --path --windows "$HOME"` 116 | [ -n "$BASEDIR" ] && BASEDIR=`cygpath --path --windows "$BASEDIR"` 117 | [ -n "$REPO" ] && REPO=`cygpath --path --windows "$REPO"` 118 | fi 119 | 120 | exec "$JAVACMD" $JAVA_OPTS -Xms512M -Xmx31G \ 121 | -classpath "$CLASSPATH" \ 122 | -Dapp.name="ExtractNorms" \ 123 | -Dapp.pid="$$" \ 124 | -Dapp.repo="$REPO" \ 125 | -Dapp.home="$BASEDIR" \ 126 | -Dbasedir="$BASEDIR" \ 127 | io.anserini.util.ExtractNorms \ 128 | "$@" 129 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/bin/ExtractNorms.bat: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Copyright 2001-2006 The Apache Software Foundation. 3 | @REM 4 | @REM Licensed under the Apache License, Version 2.0 (the "License"); 5 | @REM you may not use this file except in compliance with the License. 6 | @REM You may obtain a copy of the License at 7 | @REM 8 | @REM http://www.apache.org/licenses/LICENSE-2.0 9 | @REM 10 | @REM Unless required by applicable law or agreed to in writing, software 11 | @REM distributed under the License is distributed on an "AS IS" BASIS, 12 | @REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @REM See the License for the specific language governing permissions and 14 | @REM limitations under the License. 15 | @REM ---------------------------------------------------------------------------- 16 | @REM 17 | @REM Copyright (c) 2001-2006 The Apache Software Foundation. All rights 18 | @REM reserved. 19 | 20 | @echo off 21 | 22 | set ERROR_CODE=0 23 | 24 | :init 25 | @REM Decide how to startup depending on the version of windows 26 | 27 | @REM -- Win98ME 28 | if NOT "%OS%"=="Windows_NT" goto Win9xArg 29 | 30 | @REM set local scope for the variables with windows NT shell 31 | if "%OS%"=="Windows_NT" @setlocal 32 | 33 | @REM -- 4NT shell 34 | if "%eval[2+2]" == "4" goto 4NTArgs 35 | 36 | @REM -- Regular WinNT shell 37 | set CMD_LINE_ARGS=%* 38 | goto WinNTGetScriptDir 39 | 40 | @REM The 4NT Shell from jp software 41 | :4NTArgs 42 | set CMD_LINE_ARGS=%$ 43 | goto WinNTGetScriptDir 44 | 45 | :Win9xArg 46 | @REM Slurp the command line arguments. This loop allows for an unlimited number 47 | @REM of arguments (up to the command line limit, anyway). 48 | set CMD_LINE_ARGS= 49 | :Win9xApp 50 | if %1a==a goto Win9xGetScriptDir 51 | set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1 52 | shift 53 | goto Win9xApp 54 | 55 | :Win9xGetScriptDir 56 | set SAVEDIR=%CD% 57 | %0\ 58 | cd %0\..\.. 59 | set BASEDIR=%CD% 60 | cd %SAVEDIR% 61 | set SAVE_DIR= 62 | goto repoSetup 63 | 64 | :WinNTGetScriptDir 65 | for %%i in ("%~dp0..") do set "BASEDIR=%%~fi" 66 | 67 | :repoSetup 68 | set REPO= 69 | 70 | 71 | if "%JAVACMD%"=="" set JAVACMD=java 72 | 73 | if "%REPO%"=="" set REPO=%BASEDIR%\repo 74 | 75 | set CLASSPATH="%BASEDIR%"\etc;"%REPO%"\* 76 | 77 | set ENDORSED_DIR= 78 | if NOT "%ENDORSED_DIR%" == "" set CLASSPATH="%BASEDIR%"\%ENDORSED_DIR%\*;%CLASSPATH% 79 | 80 | if NOT "%CLASSPATH_PREFIX%" == "" set CLASSPATH=%CLASSPATH_PREFIX%;%CLASSPATH% 81 | 82 | @REM Reaching here means variables are defined and arguments have been captured 83 | :endInit 84 | 85 | %JAVACMD% %JAVA_OPTS% -Xms512M -Xmx31G -classpath %CLASSPATH% -Dapp.name="ExtractNorms" -Dapp.repo="%REPO%" -Dapp.home="%BASEDIR%" -Dbasedir="%BASEDIR%" io.anserini.util.ExtractNorms %CMD_LINE_ARGS% 86 | if %ERRORLEVEL% NEQ 0 goto error 87 | goto end 88 | 89 | :error 90 | if "%OS%"=="Windows_NT" @endlocal 91 | set ERROR_CODE=%ERRORLEVEL% 92 | 93 | :end 94 | @REM set local scope for the variables with windows NT shell 95 | if "%OS%"=="Windows_NT" goto endNT 96 | 97 | @REM For old DOS remove the set variables from ENV - we assume they were not set 98 | @REM before we started - at least we don't leave any baggage around 99 | set CMD_LINE_ARGS= 100 | goto postExec 101 | 102 | :endNT 103 | @REM If error code is set to 1 then the endlocal was done already in :error. 104 | if %ERROR_CODE% EQU 0 @endlocal 105 | 106 | 107 | :postExec 108 | 109 | if "%FORCE_EXIT_ON_ERROR%" == "on" ( 110 | if %ERROR_CODE% NEQ 0 exit %ERROR_CODE% 111 | ) 112 | 113 | exit /B %ERROR_CODE% 114 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/bin/ExtractTopDfTerms: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2001-2006 The Apache Software Foundation. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ---------------------------------------------------------------------------- 17 | # 18 | # Copyright (c) 2001-2006 The Apache Software Foundation. All rights 19 | # reserved. 20 | 21 | 22 | # resolve links - $0 may be a softlink 23 | PRG="$0" 24 | 25 | while [ -h "$PRG" ]; do 26 | ls=`ls -ld "$PRG"` 27 | link=`expr "$ls" : '.*-> \(.*\)$'` 28 | if expr "$link" : '/.*' > /dev/null; then 29 | PRG="$link" 30 | else 31 | PRG=`dirname "$PRG"`/"$link" 32 | fi 33 | done 34 | 35 | PRGDIR=`dirname "$PRG"` 36 | BASEDIR=`cd "$PRGDIR/.." >/dev/null; pwd` 37 | 38 | # Reset the REPO variable. If you need to influence this use the environment setup file. 39 | REPO= 40 | 41 | 42 | # OS specific support. $var _must_ be set to either true or false. 43 | cygwin=false; 44 | darwin=false; 45 | case "`uname`" in 46 | CYGWIN*) cygwin=true ;; 47 | Darwin*) darwin=true 48 | if [ -z "$JAVA_VERSION" ] ; then 49 | JAVA_VERSION="CurrentJDK" 50 | else 51 | echo "Using Java version: $JAVA_VERSION" 52 | fi 53 | if [ -z "$JAVA_HOME" ]; then 54 | if [ -x "/usr/libexec/java_home" ]; then 55 | JAVA_HOME=`/usr/libexec/java_home` 56 | else 57 | JAVA_HOME=/System/Library/Frameworks/JavaVM.framework/Versions/${JAVA_VERSION}/Home 58 | fi 59 | fi 60 | ;; 61 | esac 62 | 63 | if [ -z "$JAVA_HOME" ] ; then 64 | if [ -r /etc/gentoo-release ] ; then 65 | JAVA_HOME=`java-config --jre-home` 66 | fi 67 | fi 68 | 69 | # For Cygwin, ensure paths are in UNIX format before anything is touched 70 | if $cygwin ; then 71 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"` 72 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --unix "$CLASSPATH"` 73 | fi 74 | 75 | # If a specific java binary isn't specified search for the standard 'java' binary 76 | if [ -z "$JAVACMD" ] ; then 77 | if [ -n "$JAVA_HOME" ] ; then 78 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 79 | # IBM's JDK on AIX uses strange locations for the executables 80 | JAVACMD="$JAVA_HOME/jre/sh/java" 81 | else 82 | JAVACMD="$JAVA_HOME/bin/java" 83 | fi 84 | else 85 | JAVACMD=`which java` 86 | fi 87 | fi 88 | 89 | if [ ! -x "$JAVACMD" ] ; then 90 | echo "Error: JAVA_HOME is not defined correctly." 1>&2 91 | echo " We cannot execute $JAVACMD" 1>&2 92 | exit 1 93 | fi 94 | 95 | if [ -z "$REPO" ] 96 | then 97 | REPO="$BASEDIR"/repo 98 | fi 99 | 100 | CLASSPATH="$BASEDIR"/etc:"$REPO"/* 101 | 102 | ENDORSED_DIR= 103 | if [ -n "$ENDORSED_DIR" ] ; then 104 | CLASSPATH=$BASEDIR/$ENDORSED_DIR/*:$CLASSPATH 105 | fi 106 | 107 | if [ -n "$CLASSPATH_PREFIX" ] ; then 108 | CLASSPATH=$CLASSPATH_PREFIX:$CLASSPATH 109 | fi 110 | 111 | # For Cygwin, switch paths to Windows format before running java 112 | if $cygwin; then 113 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --windows "$CLASSPATH"` 114 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"` 115 | [ -n "$HOME" ] && HOME=`cygpath --path --windows "$HOME"` 116 | [ -n "$BASEDIR" ] && BASEDIR=`cygpath --path --windows "$BASEDIR"` 117 | [ -n "$REPO" ] && REPO=`cygpath --path --windows "$REPO"` 118 | fi 119 | 120 | exec "$JAVACMD" $JAVA_OPTS -Xms512M -Xmx31G \ 121 | -classpath "$CLASSPATH" \ 122 | -Dapp.name="ExtractTopDfTerms" \ 123 | -Dapp.pid="$$" \ 124 | -Dapp.repo="$REPO" \ 125 | -Dapp.home="$BASEDIR" \ 126 | -Dbasedir="$BASEDIR" \ 127 | io.anserini.util.ExtractTopDfTerms \ 128 | "$@" 129 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/bin/ExtractTopDfTerms.bat: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Copyright 2001-2006 The Apache Software Foundation. 3 | @REM 4 | @REM Licensed under the Apache License, Version 2.0 (the "License"); 5 | @REM you may not use this file except in compliance with the License. 6 | @REM You may obtain a copy of the License at 7 | @REM 8 | @REM http://www.apache.org/licenses/LICENSE-2.0 9 | @REM 10 | @REM Unless required by applicable law or agreed to in writing, software 11 | @REM distributed under the License is distributed on an "AS IS" BASIS, 12 | @REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @REM See the License for the specific language governing permissions and 14 | @REM limitations under the License. 15 | @REM ---------------------------------------------------------------------------- 16 | @REM 17 | @REM Copyright (c) 2001-2006 The Apache Software Foundation. All rights 18 | @REM reserved. 19 | 20 | @echo off 21 | 22 | set ERROR_CODE=0 23 | 24 | :init 25 | @REM Decide how to startup depending on the version of windows 26 | 27 | @REM -- Win98ME 28 | if NOT "%OS%"=="Windows_NT" goto Win9xArg 29 | 30 | @REM set local scope for the variables with windows NT shell 31 | if "%OS%"=="Windows_NT" @setlocal 32 | 33 | @REM -- 4NT shell 34 | if "%eval[2+2]" == "4" goto 4NTArgs 35 | 36 | @REM -- Regular WinNT shell 37 | set CMD_LINE_ARGS=%* 38 | goto WinNTGetScriptDir 39 | 40 | @REM The 4NT Shell from jp software 41 | :4NTArgs 42 | set CMD_LINE_ARGS=%$ 43 | goto WinNTGetScriptDir 44 | 45 | :Win9xArg 46 | @REM Slurp the command line arguments. This loop allows for an unlimited number 47 | @REM of arguments (up to the command line limit, anyway). 48 | set CMD_LINE_ARGS= 49 | :Win9xApp 50 | if %1a==a goto Win9xGetScriptDir 51 | set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1 52 | shift 53 | goto Win9xApp 54 | 55 | :Win9xGetScriptDir 56 | set SAVEDIR=%CD% 57 | %0\ 58 | cd %0\..\.. 59 | set BASEDIR=%CD% 60 | cd %SAVEDIR% 61 | set SAVE_DIR= 62 | goto repoSetup 63 | 64 | :WinNTGetScriptDir 65 | for %%i in ("%~dp0..") do set "BASEDIR=%%~fi" 66 | 67 | :repoSetup 68 | set REPO= 69 | 70 | 71 | if "%JAVACMD%"=="" set JAVACMD=java 72 | 73 | if "%REPO%"=="" set REPO=%BASEDIR%\repo 74 | 75 | set CLASSPATH="%BASEDIR%"\etc;"%REPO%"\* 76 | 77 | set ENDORSED_DIR= 78 | if NOT "%ENDORSED_DIR%" == "" set CLASSPATH="%BASEDIR%"\%ENDORSED_DIR%\*;%CLASSPATH% 79 | 80 | if NOT "%CLASSPATH_PREFIX%" == "" set CLASSPATH=%CLASSPATH_PREFIX%;%CLASSPATH% 81 | 82 | @REM Reaching here means variables are defined and arguments have been captured 83 | :endInit 84 | 85 | %JAVACMD% %JAVA_OPTS% -Xms512M -Xmx31G -classpath %CLASSPATH% -Dapp.name="ExtractTopDfTerms" -Dapp.repo="%REPO%" -Dapp.home="%BASEDIR%" -Dbasedir="%BASEDIR%" io.anserini.util.ExtractTopDfTerms %CMD_LINE_ARGS% 86 | if %ERRORLEVEL% NEQ 0 goto error 87 | goto end 88 | 89 | :error 90 | if "%OS%"=="Windows_NT" @endlocal 91 | set ERROR_CODE=%ERRORLEVEL% 92 | 93 | :end 94 | @REM set local scope for the variables with windows NT shell 95 | if "%OS%"=="Windows_NT" goto endNT 96 | 97 | @REM For old DOS remove the set variables from ENV - we assume they were not set 98 | @REM before we started - at least we don't leave any baggage around 99 | set CMD_LINE_ARGS= 100 | goto postExec 101 | 102 | :endNT 103 | @REM If error code is set to 1 then the endlocal was done already in :error. 104 | if %ERROR_CODE% EQU 0 @endlocal 105 | 106 | 107 | :postExec 108 | 109 | if "%FORCE_EXIT_ON_ERROR%" == "on" ( 110 | if %ERROR_CODE% NEQ 0 exit %ERROR_CODE% 111 | ) 112 | 113 | exit /B %ERROR_CODE% 114 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/bin/FeatureExtractorCli.bat: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Copyright 2001-2006 The Apache Software Foundation. 3 | @REM 4 | @REM Licensed under the Apache License, Version 2.0 (the "License"); 5 | @REM you may not use this file except in compliance with the License. 6 | @REM You may obtain a copy of the License at 7 | @REM 8 | @REM http://www.apache.org/licenses/LICENSE-2.0 9 | @REM 10 | @REM Unless required by applicable law or agreed to in writing, software 11 | @REM distributed under the License is distributed on an "AS IS" BASIS, 12 | @REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @REM See the License for the specific language governing permissions and 14 | @REM limitations under the License. 15 | @REM ---------------------------------------------------------------------------- 16 | @REM 17 | @REM Copyright (c) 2001-2006 The Apache Software Foundation. All rights 18 | @REM reserved. 19 | 20 | @echo off 21 | 22 | set ERROR_CODE=0 23 | 24 | :init 25 | @REM Decide how to startup depending on the version of windows 26 | 27 | @REM -- Win98ME 28 | if NOT "%OS%"=="Windows_NT" goto Win9xArg 29 | 30 | @REM set local scope for the variables with windows NT shell 31 | if "%OS%"=="Windows_NT" @setlocal 32 | 33 | @REM -- 4NT shell 34 | if "%eval[2+2]" == "4" goto 4NTArgs 35 | 36 | @REM -- Regular WinNT shell 37 | set CMD_LINE_ARGS=%* 38 | goto WinNTGetScriptDir 39 | 40 | @REM The 4NT Shell from jp software 41 | :4NTArgs 42 | set CMD_LINE_ARGS=%$ 43 | goto WinNTGetScriptDir 44 | 45 | :Win9xArg 46 | @REM Slurp the command line arguments. This loop allows for an unlimited number 47 | @REM of arguments (up to the command line limit, anyway). 48 | set CMD_LINE_ARGS= 49 | :Win9xApp 50 | if %1a==a goto Win9xGetScriptDir 51 | set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1 52 | shift 53 | goto Win9xApp 54 | 55 | :Win9xGetScriptDir 56 | set SAVEDIR=%CD% 57 | %0\ 58 | cd %0\..\.. 59 | set BASEDIR=%CD% 60 | cd %SAVEDIR% 61 | set SAVE_DIR= 62 | goto repoSetup 63 | 64 | :WinNTGetScriptDir 65 | for %%i in ("%~dp0..") do set "BASEDIR=%%~fi" 66 | 67 | :repoSetup 68 | set REPO= 69 | 70 | 71 | if "%JAVACMD%"=="" set JAVACMD=java 72 | 73 | if "%REPO%"=="" set REPO=%BASEDIR%\repo 74 | 75 | set CLASSPATH="%BASEDIR%"\etc;"%REPO%"\* 76 | 77 | set ENDORSED_DIR= 78 | if NOT "%ENDORSED_DIR%" == "" set CLASSPATH="%BASEDIR%"\%ENDORSED_DIR%\*;%CLASSPATH% 79 | 80 | if NOT "%CLASSPATH_PREFIX%" == "" set CLASSPATH=%CLASSPATH_PREFIX%;%CLASSPATH% 81 | 82 | @REM Reaching here means variables are defined and arguments have been captured 83 | :endInit 84 | 85 | %JAVACMD% %JAVA_OPTS% -Xms512M -Xmx31G -classpath %CLASSPATH% -Dapp.name="FeatureExtractorCli" -Dapp.repo="%REPO%" -Dapp.home="%BASEDIR%" -Dbasedir="%BASEDIR%" io.anserini.ltr.FeatureExtractorCli %CMD_LINE_ARGS% 86 | if %ERRORLEVEL% NEQ 0 goto error 87 | goto end 88 | 89 | :error 90 | if "%OS%"=="Windows_NT" @endlocal 91 | set ERROR_CODE=%ERRORLEVEL% 92 | 93 | :end 94 | @REM set local scope for the variables with windows NT shell 95 | if "%OS%"=="Windows_NT" goto endNT 96 | 97 | @REM For old DOS remove the set variables from ENV - we assume they were not set 98 | @REM before we started - at least we don't leave any baggage around 99 | set CMD_LINE_ARGS= 100 | goto postExec 101 | 102 | :endNT 103 | @REM If error code is set to 1 then the endlocal was done already in :error. 104 | if %ERROR_CODE% EQU 0 @endlocal 105 | 106 | 107 | :postExec 108 | 109 | if "%FORCE_EXIT_ON_ERROR%" == "on" ( 110 | if %ERROR_CODE% NEQ 0 exit %ERROR_CODE% 111 | ) 112 | 113 | exit /B %ERROR_CODE% 114 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/bin/IndexCollection: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2001-2006 The Apache Software Foundation. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ---------------------------------------------------------------------------- 17 | # 18 | # Copyright (c) 2001-2006 The Apache Software Foundation. All rights 19 | # reserved. 20 | 21 | 22 | # resolve links - $0 may be a softlink 23 | PRG="$0" 24 | 25 | while [ -h "$PRG" ]; do 26 | ls=`ls -ld "$PRG"` 27 | link=`expr "$ls" : '.*-> \(.*\)$'` 28 | if expr "$link" : '/.*' > /dev/null; then 29 | PRG="$link" 30 | else 31 | PRG=`dirname "$PRG"`/"$link" 32 | fi 33 | done 34 | 35 | PRGDIR=`dirname "$PRG"` 36 | BASEDIR=`cd "$PRGDIR/.." >/dev/null; pwd` 37 | 38 | # Reset the REPO variable. If you need to influence this use the environment setup file. 39 | REPO= 40 | 41 | 42 | # OS specific support. $var _must_ be set to either true or false. 43 | cygwin=false; 44 | darwin=false; 45 | case "`uname`" in 46 | CYGWIN*) cygwin=true ;; 47 | Darwin*) darwin=true 48 | if [ -z "$JAVA_VERSION" ] ; then 49 | JAVA_VERSION="CurrentJDK" 50 | else 51 | echo "Using Java version: $JAVA_VERSION" 52 | fi 53 | if [ -z "$JAVA_HOME" ]; then 54 | if [ -x "/usr/libexec/java_home" ]; then 55 | JAVA_HOME=`/usr/libexec/java_home` 56 | else 57 | JAVA_HOME=/System/Library/Frameworks/JavaVM.framework/Versions/${JAVA_VERSION}/Home 58 | fi 59 | fi 60 | ;; 61 | esac 62 | 63 | if [ -z "$JAVA_HOME" ] ; then 64 | if [ -r /etc/gentoo-release ] ; then 65 | JAVA_HOME=`java-config --jre-home` 66 | fi 67 | fi 68 | 69 | # For Cygwin, ensure paths are in UNIX format before anything is touched 70 | if $cygwin ; then 71 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"` 72 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --unix "$CLASSPATH"` 73 | fi 74 | 75 | # If a specific java binary isn't specified search for the standard 'java' binary 76 | if [ -z "$JAVACMD" ] ; then 77 | if [ -n "$JAVA_HOME" ] ; then 78 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 79 | # IBM's JDK on AIX uses strange locations for the executables 80 | JAVACMD="$JAVA_HOME/jre/sh/java" 81 | else 82 | JAVACMD="$JAVA_HOME/bin/java" 83 | fi 84 | else 85 | JAVACMD=`which java` 86 | fi 87 | fi 88 | 89 | if [ ! -x "$JAVACMD" ] ; then 90 | echo "Error: JAVA_HOME is not defined correctly." 1>&2 91 | echo " We cannot execute $JAVACMD" 1>&2 92 | exit 1 93 | fi 94 | 95 | if [ -z "$REPO" ] 96 | then 97 | REPO="$BASEDIR"/repo 98 | fi 99 | 100 | CLASSPATH="$BASEDIR"/etc:"$REPO"/* 101 | 102 | ENDORSED_DIR= 103 | if [ -n "$ENDORSED_DIR" ] ; then 104 | CLASSPATH=$BASEDIR/$ENDORSED_DIR/*:$CLASSPATH 105 | fi 106 | 107 | if [ -n "$CLASSPATH_PREFIX" ] ; then 108 | CLASSPATH=$CLASSPATH_PREFIX:$CLASSPATH 109 | fi 110 | 111 | # For Cygwin, switch paths to Windows format before running java 112 | if $cygwin; then 113 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --windows "$CLASSPATH"` 114 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"` 115 | [ -n "$HOME" ] && HOME=`cygpath --path --windows "$HOME"` 116 | [ -n "$BASEDIR" ] && BASEDIR=`cygpath --path --windows "$BASEDIR"` 117 | [ -n "$REPO" ] && REPO=`cygpath --path --windows "$REPO"` 118 | fi 119 | 120 | exec "$JAVACMD" $JAVA_OPTS -Xms512M -Xmx31G \ 121 | -classpath "$CLASSPATH" \ 122 | -Dapp.name="IndexCollection" \ 123 | -Dapp.pid="$$" \ 124 | -Dapp.repo="$REPO" \ 125 | -Dapp.home="$BASEDIR" \ 126 | -Dbasedir="$BASEDIR" \ 127 | io.anserini.index.IndexCollection \ 128 | "$@" 129 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/bin/IndexCollection.bat: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Copyright 2001-2006 The Apache Software Foundation. 3 | @REM 4 | @REM Licensed under the Apache License, Version 2.0 (the "License"); 5 | @REM you may not use this file except in compliance with the License. 6 | @REM You may obtain a copy of the License at 7 | @REM 8 | @REM http://www.apache.org/licenses/LICENSE-2.0 9 | @REM 10 | @REM Unless required by applicable law or agreed to in writing, software 11 | @REM distributed under the License is distributed on an "AS IS" BASIS, 12 | @REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @REM See the License for the specific language governing permissions and 14 | @REM limitations under the License. 15 | @REM ---------------------------------------------------------------------------- 16 | @REM 17 | @REM Copyright (c) 2001-2006 The Apache Software Foundation. All rights 18 | @REM reserved. 19 | 20 | @echo off 21 | 22 | set ERROR_CODE=0 23 | 24 | :init 25 | @REM Decide how to startup depending on the version of windows 26 | 27 | @REM -- Win98ME 28 | if NOT "%OS%"=="Windows_NT" goto Win9xArg 29 | 30 | @REM set local scope for the variables with windows NT shell 31 | if "%OS%"=="Windows_NT" @setlocal 32 | 33 | @REM -- 4NT shell 34 | if "%eval[2+2]" == "4" goto 4NTArgs 35 | 36 | @REM -- Regular WinNT shell 37 | set CMD_LINE_ARGS=%* 38 | goto WinNTGetScriptDir 39 | 40 | @REM The 4NT Shell from jp software 41 | :4NTArgs 42 | set CMD_LINE_ARGS=%$ 43 | goto WinNTGetScriptDir 44 | 45 | :Win9xArg 46 | @REM Slurp the command line arguments. This loop allows for an unlimited number 47 | @REM of arguments (up to the command line limit, anyway). 48 | set CMD_LINE_ARGS= 49 | :Win9xApp 50 | if %1a==a goto Win9xGetScriptDir 51 | set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1 52 | shift 53 | goto Win9xApp 54 | 55 | :Win9xGetScriptDir 56 | set SAVEDIR=%CD% 57 | %0\ 58 | cd %0\..\.. 59 | set BASEDIR=%CD% 60 | cd %SAVEDIR% 61 | set SAVE_DIR= 62 | goto repoSetup 63 | 64 | :WinNTGetScriptDir 65 | for %%i in ("%~dp0..") do set "BASEDIR=%%~fi" 66 | 67 | :repoSetup 68 | set REPO= 69 | 70 | 71 | if "%JAVACMD%"=="" set JAVACMD=java 72 | 73 | if "%REPO%"=="" set REPO=%BASEDIR%\repo 74 | 75 | set CLASSPATH="%BASEDIR%"\etc;"%REPO%"\* 76 | 77 | set ENDORSED_DIR= 78 | if NOT "%ENDORSED_DIR%" == "" set CLASSPATH="%BASEDIR%"\%ENDORSED_DIR%\*;%CLASSPATH% 79 | 80 | if NOT "%CLASSPATH_PREFIX%" == "" set CLASSPATH=%CLASSPATH_PREFIX%;%CLASSPATH% 81 | 82 | @REM Reaching here means variables are defined and arguments have been captured 83 | :endInit 84 | 85 | %JAVACMD% %JAVA_OPTS% -Xms512M -Xmx31G -classpath %CLASSPATH% -Dapp.name="IndexCollection" -Dapp.repo="%REPO%" -Dapp.home="%BASEDIR%" -Dbasedir="%BASEDIR%" io.anserini.index.IndexCollection %CMD_LINE_ARGS% 86 | if %ERRORLEVEL% NEQ 0 goto error 87 | goto end 88 | 89 | :error 90 | if "%OS%"=="Windows_NT" @endlocal 91 | set ERROR_CODE=%ERRORLEVEL% 92 | 93 | :end 94 | @REM set local scope for the variables with windows NT shell 95 | if "%OS%"=="Windows_NT" goto endNT 96 | 97 | @REM For old DOS remove the set variables from ENV - we assume they were not set 98 | @REM before we started - at least we don't leave any baggage around 99 | set CMD_LINE_ARGS= 100 | goto postExec 101 | 102 | :endNT 103 | @REM If error code is set to 1 then the endlocal was done already in :error. 104 | if %ERROR_CODE% EQU 0 @endlocal 105 | 106 | 107 | :postExec 108 | 109 | if "%FORCE_EXIT_ON_ERROR%" == "on" ( 110 | if %ERROR_CODE% NEQ 0 exit %ERROR_CODE% 111 | ) 112 | 113 | exit /B %ERROR_CODE% 114 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/bin/IndexReaderUtils: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2001-2006 The Apache Software Foundation. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ---------------------------------------------------------------------------- 17 | # 18 | # Copyright (c) 2001-2006 The Apache Software Foundation. All rights 19 | # reserved. 20 | 21 | 22 | # resolve links - $0 may be a softlink 23 | PRG="$0" 24 | 25 | while [ -h "$PRG" ]; do 26 | ls=`ls -ld "$PRG"` 27 | link=`expr "$ls" : '.*-> \(.*\)$'` 28 | if expr "$link" : '/.*' > /dev/null; then 29 | PRG="$link" 30 | else 31 | PRG=`dirname "$PRG"`/"$link" 32 | fi 33 | done 34 | 35 | PRGDIR=`dirname "$PRG"` 36 | BASEDIR=`cd "$PRGDIR/.." >/dev/null; pwd` 37 | 38 | # Reset the REPO variable. If you need to influence this use the environment setup file. 39 | REPO= 40 | 41 | 42 | # OS specific support. $var _must_ be set to either true or false. 43 | cygwin=false; 44 | darwin=false; 45 | case "`uname`" in 46 | CYGWIN*) cygwin=true ;; 47 | Darwin*) darwin=true 48 | if [ -z "$JAVA_VERSION" ] ; then 49 | JAVA_VERSION="CurrentJDK" 50 | else 51 | echo "Using Java version: $JAVA_VERSION" 52 | fi 53 | if [ -z "$JAVA_HOME" ]; then 54 | if [ -x "/usr/libexec/java_home" ]; then 55 | JAVA_HOME=`/usr/libexec/java_home` 56 | else 57 | JAVA_HOME=/System/Library/Frameworks/JavaVM.framework/Versions/${JAVA_VERSION}/Home 58 | fi 59 | fi 60 | ;; 61 | esac 62 | 63 | if [ -z "$JAVA_HOME" ] ; then 64 | if [ -r /etc/gentoo-release ] ; then 65 | JAVA_HOME=`java-config --jre-home` 66 | fi 67 | fi 68 | 69 | # For Cygwin, ensure paths are in UNIX format before anything is touched 70 | if $cygwin ; then 71 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"` 72 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --unix "$CLASSPATH"` 73 | fi 74 | 75 | # If a specific java binary isn't specified search for the standard 'java' binary 76 | if [ -z "$JAVACMD" ] ; then 77 | if [ -n "$JAVA_HOME" ] ; then 78 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 79 | # IBM's JDK on AIX uses strange locations for the executables 80 | JAVACMD="$JAVA_HOME/jre/sh/java" 81 | else 82 | JAVACMD="$JAVA_HOME/bin/java" 83 | fi 84 | else 85 | JAVACMD=`which java` 86 | fi 87 | fi 88 | 89 | if [ ! -x "$JAVACMD" ] ; then 90 | echo "Error: JAVA_HOME is not defined correctly." 1>&2 91 | echo " We cannot execute $JAVACMD" 1>&2 92 | exit 1 93 | fi 94 | 95 | if [ -z "$REPO" ] 96 | then 97 | REPO="$BASEDIR"/repo 98 | fi 99 | 100 | CLASSPATH="$BASEDIR"/etc:"$REPO"/* 101 | 102 | ENDORSED_DIR= 103 | if [ -n "$ENDORSED_DIR" ] ; then 104 | CLASSPATH=$BASEDIR/$ENDORSED_DIR/*:$CLASSPATH 105 | fi 106 | 107 | if [ -n "$CLASSPATH_PREFIX" ] ; then 108 | CLASSPATH=$CLASSPATH_PREFIX:$CLASSPATH 109 | fi 110 | 111 | # For Cygwin, switch paths to Windows format before running java 112 | if $cygwin; then 113 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --windows "$CLASSPATH"` 114 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"` 115 | [ -n "$HOME" ] && HOME=`cygpath --path --windows "$HOME"` 116 | [ -n "$BASEDIR" ] && BASEDIR=`cygpath --path --windows "$BASEDIR"` 117 | [ -n "$REPO" ] && REPO=`cygpath --path --windows "$REPO"` 118 | fi 119 | 120 | exec "$JAVACMD" $JAVA_OPTS -Xms512M -Xmx31G \ 121 | -classpath "$CLASSPATH" \ 122 | -Dapp.name="IndexReaderUtils" \ 123 | -Dapp.pid="$$" \ 124 | -Dapp.repo="$REPO" \ 125 | -Dapp.home="$BASEDIR" \ 126 | -Dbasedir="$BASEDIR" \ 127 | io.anserini.index.IndexReaderUtils \ 128 | "$@" 129 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/bin/IndexReaderUtils.bat: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Copyright 2001-2006 The Apache Software Foundation. 3 | @REM 4 | @REM Licensed under the Apache License, Version 2.0 (the "License"); 5 | @REM you may not use this file except in compliance with the License. 6 | @REM You may obtain a copy of the License at 7 | @REM 8 | @REM http://www.apache.org/licenses/LICENSE-2.0 9 | @REM 10 | @REM Unless required by applicable law or agreed to in writing, software 11 | @REM distributed under the License is distributed on an "AS IS" BASIS, 12 | @REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @REM See the License for the specific language governing permissions and 14 | @REM limitations under the License. 15 | @REM ---------------------------------------------------------------------------- 16 | @REM 17 | @REM Copyright (c) 2001-2006 The Apache Software Foundation. All rights 18 | @REM reserved. 19 | 20 | @echo off 21 | 22 | set ERROR_CODE=0 23 | 24 | :init 25 | @REM Decide how to startup depending on the version of windows 26 | 27 | @REM -- Win98ME 28 | if NOT "%OS%"=="Windows_NT" goto Win9xArg 29 | 30 | @REM set local scope for the variables with windows NT shell 31 | if "%OS%"=="Windows_NT" @setlocal 32 | 33 | @REM -- 4NT shell 34 | if "%eval[2+2]" == "4" goto 4NTArgs 35 | 36 | @REM -- Regular WinNT shell 37 | set CMD_LINE_ARGS=%* 38 | goto WinNTGetScriptDir 39 | 40 | @REM The 4NT Shell from jp software 41 | :4NTArgs 42 | set CMD_LINE_ARGS=%$ 43 | goto WinNTGetScriptDir 44 | 45 | :Win9xArg 46 | @REM Slurp the command line arguments. This loop allows for an unlimited number 47 | @REM of arguments (up to the command line limit, anyway). 48 | set CMD_LINE_ARGS= 49 | :Win9xApp 50 | if %1a==a goto Win9xGetScriptDir 51 | set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1 52 | shift 53 | goto Win9xApp 54 | 55 | :Win9xGetScriptDir 56 | set SAVEDIR=%CD% 57 | %0\ 58 | cd %0\..\.. 59 | set BASEDIR=%CD% 60 | cd %SAVEDIR% 61 | set SAVE_DIR= 62 | goto repoSetup 63 | 64 | :WinNTGetScriptDir 65 | for %%i in ("%~dp0..") do set "BASEDIR=%%~fi" 66 | 67 | :repoSetup 68 | set REPO= 69 | 70 | 71 | if "%JAVACMD%"=="" set JAVACMD=java 72 | 73 | if "%REPO%"=="" set REPO=%BASEDIR%\repo 74 | 75 | set CLASSPATH="%BASEDIR%"\etc;"%REPO%"\* 76 | 77 | set ENDORSED_DIR= 78 | if NOT "%ENDORSED_DIR%" == "" set CLASSPATH="%BASEDIR%"\%ENDORSED_DIR%\*;%CLASSPATH% 79 | 80 | if NOT "%CLASSPATH_PREFIX%" == "" set CLASSPATH=%CLASSPATH_PREFIX%;%CLASSPATH% 81 | 82 | @REM Reaching here means variables are defined and arguments have been captured 83 | :endInit 84 | 85 | %JAVACMD% %JAVA_OPTS% -Xms512M -Xmx31G -classpath %CLASSPATH% -Dapp.name="IndexReaderUtils" -Dapp.repo="%REPO%" -Dapp.home="%BASEDIR%" -Dbasedir="%BASEDIR%" io.anserini.index.IndexReaderUtils %CMD_LINE_ARGS% 86 | if %ERRORLEVEL% NEQ 0 goto error 87 | goto end 88 | 89 | :error 90 | if "%OS%"=="Windows_NT" @endlocal 91 | set ERROR_CODE=%ERRORLEVEL% 92 | 93 | :end 94 | @REM set local scope for the variables with windows NT shell 95 | if "%OS%"=="Windows_NT" goto endNT 96 | 97 | @REM For old DOS remove the set variables from ENV - we assume they were not set 98 | @REM before we started - at least we don't leave any baggage around 99 | set CMD_LINE_ARGS= 100 | goto postExec 101 | 102 | :endNT 103 | @REM If error code is set to 1 then the endlocal was done already in :error. 104 | if %ERROR_CODE% EQU 0 @endlocal 105 | 106 | 107 | :postExec 108 | 109 | if "%FORCE_EXIT_ON_ERROR%" == "on" ( 110 | if %ERROR_CODE% NEQ 0 exit %ERROR_CODE% 111 | ) 112 | 113 | exit /B %ERROR_CODE% 114 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/bin/IndexVectorCollection.bat: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Copyright 2001-2006 The Apache Software Foundation. 3 | @REM 4 | @REM Licensed under the Apache License, Version 2.0 (the "License"); 5 | @REM you may not use this file except in compliance with the License. 6 | @REM You may obtain a copy of the License at 7 | @REM 8 | @REM http://www.apache.org/licenses/LICENSE-2.0 9 | @REM 10 | @REM Unless required by applicable law or agreed to in writing, software 11 | @REM distributed under the License is distributed on an "AS IS" BASIS, 12 | @REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @REM See the License for the specific language governing permissions and 14 | @REM limitations under the License. 15 | @REM ---------------------------------------------------------------------------- 16 | @REM 17 | @REM Copyright (c) 2001-2006 The Apache Software Foundation. All rights 18 | @REM reserved. 19 | 20 | @echo off 21 | 22 | set ERROR_CODE=0 23 | 24 | :init 25 | @REM Decide how to startup depending on the version of windows 26 | 27 | @REM -- Win98ME 28 | if NOT "%OS%"=="Windows_NT" goto Win9xArg 29 | 30 | @REM set local scope for the variables with windows NT shell 31 | if "%OS%"=="Windows_NT" @setlocal 32 | 33 | @REM -- 4NT shell 34 | if "%eval[2+2]" == "4" goto 4NTArgs 35 | 36 | @REM -- Regular WinNT shell 37 | set CMD_LINE_ARGS=%* 38 | goto WinNTGetScriptDir 39 | 40 | @REM The 4NT Shell from jp software 41 | :4NTArgs 42 | set CMD_LINE_ARGS=%$ 43 | goto WinNTGetScriptDir 44 | 45 | :Win9xArg 46 | @REM Slurp the command line arguments. This loop allows for an unlimited number 47 | @REM of arguments (up to the command line limit, anyway). 48 | set CMD_LINE_ARGS= 49 | :Win9xApp 50 | if %1a==a goto Win9xGetScriptDir 51 | set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1 52 | shift 53 | goto Win9xApp 54 | 55 | :Win9xGetScriptDir 56 | set SAVEDIR=%CD% 57 | %0\ 58 | cd %0\..\.. 59 | set BASEDIR=%CD% 60 | cd %SAVEDIR% 61 | set SAVE_DIR= 62 | goto repoSetup 63 | 64 | :WinNTGetScriptDir 65 | for %%i in ("%~dp0..") do set "BASEDIR=%%~fi" 66 | 67 | :repoSetup 68 | set REPO= 69 | 70 | 71 | if "%JAVACMD%"=="" set JAVACMD=java 72 | 73 | if "%REPO%"=="" set REPO=%BASEDIR%\repo 74 | 75 | set CLASSPATH="%BASEDIR%"\etc;"%REPO%"\* 76 | 77 | set ENDORSED_DIR= 78 | if NOT "%ENDORSED_DIR%" == "" set CLASSPATH="%BASEDIR%"\%ENDORSED_DIR%\*;%CLASSPATH% 79 | 80 | if NOT "%CLASSPATH_PREFIX%" == "" set CLASSPATH=%CLASSPATH_PREFIX%;%CLASSPATH% 81 | 82 | @REM Reaching here means variables are defined and arguments have been captured 83 | :endInit 84 | 85 | %JAVACMD% %JAVA_OPTS% -Xms512M -Xmx31G -classpath %CLASSPATH% -Dapp.name="IndexVectorCollection" -Dapp.repo="%REPO%" -Dapp.home="%BASEDIR%" -Dbasedir="%BASEDIR%" io.anserini.index.IndexVectorCollection %CMD_LINE_ARGS% 86 | if %ERRORLEVEL% NEQ 0 goto error 87 | goto end 88 | 89 | :error 90 | if "%OS%"=="Windows_NT" @endlocal 91 | set ERROR_CODE=%ERRORLEVEL% 92 | 93 | :end 94 | @REM set local scope for the variables with windows NT shell 95 | if "%OS%"=="Windows_NT" goto endNT 96 | 97 | @REM For old DOS remove the set variables from ENV - we assume they were not set 98 | @REM before we started - at least we don't leave any baggage around 99 | set CMD_LINE_ARGS= 100 | goto postExec 101 | 102 | :endNT 103 | @REM If error code is set to 1 then the endlocal was done already in :error. 104 | if %ERROR_CODE% EQU 0 @endlocal 105 | 106 | 107 | :postExec 108 | 109 | if "%FORCE_EXIT_ON_ERROR%" == "on" ( 110 | if %ERROR_CODE% NEQ 0 exit %ERROR_CODE% 111 | ) 112 | 113 | exit /B %ERROR_CODE% 114 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/bin/IndexVectors: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2001-2006 The Apache Software Foundation. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ---------------------------------------------------------------------------- 17 | # 18 | # Copyright (c) 2001-2006 The Apache Software Foundation. All rights 19 | # reserved. 20 | 21 | 22 | # resolve links - $0 may be a softlink 23 | PRG="$0" 24 | 25 | while [ -h "$PRG" ]; do 26 | ls=`ls -ld "$PRG"` 27 | link=`expr "$ls" : '.*-> \(.*\)$'` 28 | if expr "$link" : '/.*' > /dev/null; then 29 | PRG="$link" 30 | else 31 | PRG=`dirname "$PRG"`/"$link" 32 | fi 33 | done 34 | 35 | PRGDIR=`dirname "$PRG"` 36 | BASEDIR=`cd "$PRGDIR/.." >/dev/null; pwd` 37 | 38 | # Reset the REPO variable. If you need to influence this use the environment setup file. 39 | REPO= 40 | 41 | 42 | # OS specific support. $var _must_ be set to either true or false. 43 | cygwin=false; 44 | darwin=false; 45 | case "`uname`" in 46 | CYGWIN*) cygwin=true ;; 47 | Darwin*) darwin=true 48 | if [ -z "$JAVA_VERSION" ] ; then 49 | JAVA_VERSION="CurrentJDK" 50 | else 51 | echo "Using Java version: $JAVA_VERSION" 52 | fi 53 | if [ -z "$JAVA_HOME" ]; then 54 | if [ -x "/usr/libexec/java_home" ]; then 55 | JAVA_HOME=`/usr/libexec/java_home` 56 | else 57 | JAVA_HOME=/System/Library/Frameworks/JavaVM.framework/Versions/${JAVA_VERSION}/Home 58 | fi 59 | fi 60 | ;; 61 | esac 62 | 63 | if [ -z "$JAVA_HOME" ] ; then 64 | if [ -r /etc/gentoo-release ] ; then 65 | JAVA_HOME=`java-config --jre-home` 66 | fi 67 | fi 68 | 69 | # For Cygwin, ensure paths are in UNIX format before anything is touched 70 | if $cygwin ; then 71 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"` 72 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --unix "$CLASSPATH"` 73 | fi 74 | 75 | # If a specific java binary isn't specified search for the standard 'java' binary 76 | if [ -z "$JAVACMD" ] ; then 77 | if [ -n "$JAVA_HOME" ] ; then 78 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 79 | # IBM's JDK on AIX uses strange locations for the executables 80 | JAVACMD="$JAVA_HOME/jre/sh/java" 81 | else 82 | JAVACMD="$JAVA_HOME/bin/java" 83 | fi 84 | else 85 | JAVACMD=`which java` 86 | fi 87 | fi 88 | 89 | if [ ! -x "$JAVACMD" ] ; then 90 | echo "Error: JAVA_HOME is not defined correctly." 1>&2 91 | echo " We cannot execute $JAVACMD" 1>&2 92 | exit 1 93 | fi 94 | 95 | if [ -z "$REPO" ] 96 | then 97 | REPO="$BASEDIR"/repo 98 | fi 99 | 100 | CLASSPATH="$BASEDIR"/etc:"$REPO"/* 101 | 102 | ENDORSED_DIR= 103 | if [ -n "$ENDORSED_DIR" ] ; then 104 | CLASSPATH=$BASEDIR/$ENDORSED_DIR/*:$CLASSPATH 105 | fi 106 | 107 | if [ -n "$CLASSPATH_PREFIX" ] ; then 108 | CLASSPATH=$CLASSPATH_PREFIX:$CLASSPATH 109 | fi 110 | 111 | # For Cygwin, switch paths to Windows format before running java 112 | if $cygwin; then 113 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --windows "$CLASSPATH"` 114 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"` 115 | [ -n "$HOME" ] && HOME=`cygpath --path --windows "$HOME"` 116 | [ -n "$BASEDIR" ] && BASEDIR=`cygpath --path --windows "$BASEDIR"` 117 | [ -n "$REPO" ] && REPO=`cygpath --path --windows "$REPO"` 118 | fi 119 | 120 | exec "$JAVACMD" $JAVA_OPTS -Xms512M -Xmx31G \ 121 | -classpath "$CLASSPATH" \ 122 | -Dapp.name="IndexVectors" \ 123 | -Dapp.pid="$$" \ 124 | -Dapp.repo="$REPO" \ 125 | -Dapp.home="$BASEDIR" \ 126 | -Dbasedir="$BASEDIR" \ 127 | io.anserini.ann.IndexVectors \ 128 | "$@" 129 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/bin/IndexVectors.bat: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Copyright 2001-2006 The Apache Software Foundation. 3 | @REM 4 | @REM Licensed under the Apache License, Version 2.0 (the "License"); 5 | @REM you may not use this file except in compliance with the License. 6 | @REM You may obtain a copy of the License at 7 | @REM 8 | @REM http://www.apache.org/licenses/LICENSE-2.0 9 | @REM 10 | @REM Unless required by applicable law or agreed to in writing, software 11 | @REM distributed under the License is distributed on an "AS IS" BASIS, 12 | @REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @REM See the License for the specific language governing permissions and 14 | @REM limitations under the License. 15 | @REM ---------------------------------------------------------------------------- 16 | @REM 17 | @REM Copyright (c) 2001-2006 The Apache Software Foundation. All rights 18 | @REM reserved. 19 | 20 | @echo off 21 | 22 | set ERROR_CODE=0 23 | 24 | :init 25 | @REM Decide how to startup depending on the version of windows 26 | 27 | @REM -- Win98ME 28 | if NOT "%OS%"=="Windows_NT" goto Win9xArg 29 | 30 | @REM set local scope for the variables with windows NT shell 31 | if "%OS%"=="Windows_NT" @setlocal 32 | 33 | @REM -- 4NT shell 34 | if "%eval[2+2]" == "4" goto 4NTArgs 35 | 36 | @REM -- Regular WinNT shell 37 | set CMD_LINE_ARGS=%* 38 | goto WinNTGetScriptDir 39 | 40 | @REM The 4NT Shell from jp software 41 | :4NTArgs 42 | set CMD_LINE_ARGS=%$ 43 | goto WinNTGetScriptDir 44 | 45 | :Win9xArg 46 | @REM Slurp the command line arguments. This loop allows for an unlimited number 47 | @REM of arguments (up to the command line limit, anyway). 48 | set CMD_LINE_ARGS= 49 | :Win9xApp 50 | if %1a==a goto Win9xGetScriptDir 51 | set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1 52 | shift 53 | goto Win9xApp 54 | 55 | :Win9xGetScriptDir 56 | set SAVEDIR=%CD% 57 | %0\ 58 | cd %0\..\.. 59 | set BASEDIR=%CD% 60 | cd %SAVEDIR% 61 | set SAVE_DIR= 62 | goto repoSetup 63 | 64 | :WinNTGetScriptDir 65 | for %%i in ("%~dp0..") do set "BASEDIR=%%~fi" 66 | 67 | :repoSetup 68 | set REPO= 69 | 70 | 71 | if "%JAVACMD%"=="" set JAVACMD=java 72 | 73 | if "%REPO%"=="" set REPO=%BASEDIR%\repo 74 | 75 | set CLASSPATH="%BASEDIR%"\etc;"%REPO%"\* 76 | 77 | set ENDORSED_DIR= 78 | if NOT "%ENDORSED_DIR%" == "" set CLASSPATH="%BASEDIR%"\%ENDORSED_DIR%\*;%CLASSPATH% 79 | 80 | if NOT "%CLASSPATH_PREFIX%" == "" set CLASSPATH=%CLASSPATH_PREFIX%;%CLASSPATH% 81 | 82 | @REM Reaching here means variables are defined and arguments have been captured 83 | :endInit 84 | 85 | %JAVACMD% %JAVA_OPTS% -Xms512M -Xmx31G -classpath %CLASSPATH% -Dapp.name="IndexVectors" -Dapp.repo="%REPO%" -Dapp.home="%BASEDIR%" -Dbasedir="%BASEDIR%" io.anserini.ann.IndexVectors %CMD_LINE_ARGS% 86 | if %ERRORLEVEL% NEQ 0 goto error 87 | goto end 88 | 89 | :error 90 | if "%OS%"=="Windows_NT" @endlocal 91 | set ERROR_CODE=%ERRORLEVEL% 92 | 93 | :end 94 | @REM set local scope for the variables with windows NT shell 95 | if "%OS%"=="Windows_NT" goto endNT 96 | 97 | @REM For old DOS remove the set variables from ENV - we assume they were not set 98 | @REM before we started - at least we don't leave any baggage around 99 | set CMD_LINE_ARGS= 100 | goto postExec 101 | 102 | :endNT 103 | @REM If error code is set to 1 then the endlocal was done already in :error. 104 | if %ERROR_CODE% EQU 0 @endlocal 105 | 106 | 107 | :postExec 108 | 109 | if "%FORCE_EXIT_ON_ERROR%" == "on" ( 110 | if %ERROR_CODE% NEQ 0 exit %ERROR_CODE% 111 | ) 112 | 113 | exit /B %ERROR_CODE% 114 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/bin/SearchCollection.bat: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Copyright 2001-2006 The Apache Software Foundation. 3 | @REM 4 | @REM Licensed under the Apache License, Version 2.0 (the "License"); 5 | @REM you may not use this file except in compliance with the License. 6 | @REM You may obtain a copy of the License at 7 | @REM 8 | @REM http://www.apache.org/licenses/LICENSE-2.0 9 | @REM 10 | @REM Unless required by applicable law or agreed to in writing, software 11 | @REM distributed under the License is distributed on an "AS IS" BASIS, 12 | @REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @REM See the License for the specific language governing permissions and 14 | @REM limitations under the License. 15 | @REM ---------------------------------------------------------------------------- 16 | @REM 17 | @REM Copyright (c) 2001-2006 The Apache Software Foundation. All rights 18 | @REM reserved. 19 | 20 | @echo off 21 | 22 | set ERROR_CODE=0 23 | 24 | :init 25 | @REM Decide how to startup depending on the version of windows 26 | 27 | @REM -- Win98ME 28 | if NOT "%OS%"=="Windows_NT" goto Win9xArg 29 | 30 | @REM set local scope for the variables with windows NT shell 31 | if "%OS%"=="Windows_NT" @setlocal 32 | 33 | @REM -- 4NT shell 34 | if "%eval[2+2]" == "4" goto 4NTArgs 35 | 36 | @REM -- Regular WinNT shell 37 | set CMD_LINE_ARGS=%* 38 | goto WinNTGetScriptDir 39 | 40 | @REM The 4NT Shell from jp software 41 | :4NTArgs 42 | set CMD_LINE_ARGS=%$ 43 | goto WinNTGetScriptDir 44 | 45 | :Win9xArg 46 | @REM Slurp the command line arguments. This loop allows for an unlimited number 47 | @REM of arguments (up to the command line limit, anyway). 48 | set CMD_LINE_ARGS= 49 | :Win9xApp 50 | if %1a==a goto Win9xGetScriptDir 51 | set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1 52 | shift 53 | goto Win9xApp 54 | 55 | :Win9xGetScriptDir 56 | set SAVEDIR=%CD% 57 | %0\ 58 | cd %0\..\.. 59 | set BASEDIR=%CD% 60 | cd %SAVEDIR% 61 | set SAVE_DIR= 62 | goto repoSetup 63 | 64 | :WinNTGetScriptDir 65 | for %%i in ("%~dp0..") do set "BASEDIR=%%~fi" 66 | 67 | :repoSetup 68 | set REPO= 69 | 70 | 71 | if "%JAVACMD%"=="" set JAVACMD=java 72 | 73 | if "%REPO%"=="" set REPO=%BASEDIR%\repo 74 | 75 | set CLASSPATH="%BASEDIR%"\etc;"%REPO%"\* 76 | 77 | set ENDORSED_DIR= 78 | if NOT "%ENDORSED_DIR%" == "" set CLASSPATH="%BASEDIR%"\%ENDORSED_DIR%\*;%CLASSPATH% 79 | 80 | if NOT "%CLASSPATH_PREFIX%" == "" set CLASSPATH=%CLASSPATH_PREFIX%;%CLASSPATH% 81 | 82 | @REM Reaching here means variables are defined and arguments have been captured 83 | :endInit 84 | 85 | %JAVACMD% %JAVA_OPTS% -Xms512M -Xmx31G -classpath %CLASSPATH% -Dapp.name="SearchCollection" -Dapp.repo="%REPO%" -Dapp.home="%BASEDIR%" -Dbasedir="%BASEDIR%" io.anserini.search.SearchCollection %CMD_LINE_ARGS% 86 | if %ERRORLEVEL% NEQ 0 goto error 87 | goto end 88 | 89 | :error 90 | if "%OS%"=="Windows_NT" @endlocal 91 | set ERROR_CODE=%ERRORLEVEL% 92 | 93 | :end 94 | @REM set local scope for the variables with windows NT shell 95 | if "%OS%"=="Windows_NT" goto endNT 96 | 97 | @REM For old DOS remove the set variables from ENV - we assume they were not set 98 | @REM before we started - at least we don't leave any baggage around 99 | set CMD_LINE_ARGS= 100 | goto postExec 101 | 102 | :endNT 103 | @REM If error code is set to 1 then the endlocal was done already in :error. 104 | if %ERROR_CODE% EQU 0 @endlocal 105 | 106 | 107 | :postExec 108 | 109 | if "%FORCE_EXIT_ON_ERROR%" == "on" ( 110 | if %ERROR_CODE% NEQ 0 exit %ERROR_CODE% 111 | ) 112 | 113 | exit /B %ERROR_CODE% 114 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/bin/SearchMsmarco: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2001-2006 The Apache Software Foundation. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ---------------------------------------------------------------------------- 17 | # 18 | # Copyright (c) 2001-2006 The Apache Software Foundation. All rights 19 | # reserved. 20 | 21 | 22 | # resolve links - $0 may be a softlink 23 | PRG="$0" 24 | 25 | while [ -h "$PRG" ]; do 26 | ls=`ls -ld "$PRG"` 27 | link=`expr "$ls" : '.*-> \(.*\)$'` 28 | if expr "$link" : '/.*' > /dev/null; then 29 | PRG="$link" 30 | else 31 | PRG=`dirname "$PRG"`/"$link" 32 | fi 33 | done 34 | 35 | PRGDIR=`dirname "$PRG"` 36 | BASEDIR=`cd "$PRGDIR/.." >/dev/null; pwd` 37 | 38 | # Reset the REPO variable. If you need to influence this use the environment setup file. 39 | REPO= 40 | 41 | 42 | # OS specific support. $var _must_ be set to either true or false. 43 | cygwin=false; 44 | darwin=false; 45 | case "`uname`" in 46 | CYGWIN*) cygwin=true ;; 47 | Darwin*) darwin=true 48 | if [ -z "$JAVA_VERSION" ] ; then 49 | JAVA_VERSION="CurrentJDK" 50 | else 51 | echo "Using Java version: $JAVA_VERSION" 52 | fi 53 | if [ -z "$JAVA_HOME" ]; then 54 | if [ -x "/usr/libexec/java_home" ]; then 55 | JAVA_HOME=`/usr/libexec/java_home` 56 | else 57 | JAVA_HOME=/System/Library/Frameworks/JavaVM.framework/Versions/${JAVA_VERSION}/Home 58 | fi 59 | fi 60 | ;; 61 | esac 62 | 63 | if [ -z "$JAVA_HOME" ] ; then 64 | if [ -r /etc/gentoo-release ] ; then 65 | JAVA_HOME=`java-config --jre-home` 66 | fi 67 | fi 68 | 69 | # For Cygwin, ensure paths are in UNIX format before anything is touched 70 | if $cygwin ; then 71 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"` 72 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --unix "$CLASSPATH"` 73 | fi 74 | 75 | # If a specific java binary isn't specified search for the standard 'java' binary 76 | if [ -z "$JAVACMD" ] ; then 77 | if [ -n "$JAVA_HOME" ] ; then 78 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 79 | # IBM's JDK on AIX uses strange locations for the executables 80 | JAVACMD="$JAVA_HOME/jre/sh/java" 81 | else 82 | JAVACMD="$JAVA_HOME/bin/java" 83 | fi 84 | else 85 | JAVACMD=`which java` 86 | fi 87 | fi 88 | 89 | if [ ! -x "$JAVACMD" ] ; then 90 | echo "Error: JAVA_HOME is not defined correctly." 1>&2 91 | echo " We cannot execute $JAVACMD" 1>&2 92 | exit 1 93 | fi 94 | 95 | if [ -z "$REPO" ] 96 | then 97 | REPO="$BASEDIR"/repo 98 | fi 99 | 100 | CLASSPATH="$BASEDIR"/etc:"$REPO"/* 101 | 102 | ENDORSED_DIR= 103 | if [ -n "$ENDORSED_DIR" ] ; then 104 | CLASSPATH=$BASEDIR/$ENDORSED_DIR/*:$CLASSPATH 105 | fi 106 | 107 | if [ -n "$CLASSPATH_PREFIX" ] ; then 108 | CLASSPATH=$CLASSPATH_PREFIX:$CLASSPATH 109 | fi 110 | 111 | # For Cygwin, switch paths to Windows format before running java 112 | if $cygwin; then 113 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --windows "$CLASSPATH"` 114 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"` 115 | [ -n "$HOME" ] && HOME=`cygpath --path --windows "$HOME"` 116 | [ -n "$BASEDIR" ] && BASEDIR=`cygpath --path --windows "$BASEDIR"` 117 | [ -n "$REPO" ] && REPO=`cygpath --path --windows "$REPO"` 118 | fi 119 | 120 | exec "$JAVACMD" $JAVA_OPTS -Xms512M -Xmx31G \ 121 | -classpath "$CLASSPATH" \ 122 | -Dapp.name="SearchMsmarco" \ 123 | -Dapp.pid="$$" \ 124 | -Dapp.repo="$REPO" \ 125 | -Dapp.home="$BASEDIR" \ 126 | -Dbasedir="$BASEDIR" \ 127 | io.anserini.search.SearchMsmarco \ 128 | "$@" 129 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/bin/SearchMsmarco.bat: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Copyright 2001-2006 The Apache Software Foundation. 3 | @REM 4 | @REM Licensed under the Apache License, Version 2.0 (the "License"); 5 | @REM you may not use this file except in compliance with the License. 6 | @REM You may obtain a copy of the License at 7 | @REM 8 | @REM http://www.apache.org/licenses/LICENSE-2.0 9 | @REM 10 | @REM Unless required by applicable law or agreed to in writing, software 11 | @REM distributed under the License is distributed on an "AS IS" BASIS, 12 | @REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @REM See the License for the specific language governing permissions and 14 | @REM limitations under the License. 15 | @REM ---------------------------------------------------------------------------- 16 | @REM 17 | @REM Copyright (c) 2001-2006 The Apache Software Foundation. All rights 18 | @REM reserved. 19 | 20 | @echo off 21 | 22 | set ERROR_CODE=0 23 | 24 | :init 25 | @REM Decide how to startup depending on the version of windows 26 | 27 | @REM -- Win98ME 28 | if NOT "%OS%"=="Windows_NT" goto Win9xArg 29 | 30 | @REM set local scope for the variables with windows NT shell 31 | if "%OS%"=="Windows_NT" @setlocal 32 | 33 | @REM -- 4NT shell 34 | if "%eval[2+2]" == "4" goto 4NTArgs 35 | 36 | @REM -- Regular WinNT shell 37 | set CMD_LINE_ARGS=%* 38 | goto WinNTGetScriptDir 39 | 40 | @REM The 4NT Shell from jp software 41 | :4NTArgs 42 | set CMD_LINE_ARGS=%$ 43 | goto WinNTGetScriptDir 44 | 45 | :Win9xArg 46 | @REM Slurp the command line arguments. This loop allows for an unlimited number 47 | @REM of arguments (up to the command line limit, anyway). 48 | set CMD_LINE_ARGS= 49 | :Win9xApp 50 | if %1a==a goto Win9xGetScriptDir 51 | set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1 52 | shift 53 | goto Win9xApp 54 | 55 | :Win9xGetScriptDir 56 | set SAVEDIR=%CD% 57 | %0\ 58 | cd %0\..\.. 59 | set BASEDIR=%CD% 60 | cd %SAVEDIR% 61 | set SAVE_DIR= 62 | goto repoSetup 63 | 64 | :WinNTGetScriptDir 65 | for %%i in ("%~dp0..") do set "BASEDIR=%%~fi" 66 | 67 | :repoSetup 68 | set REPO= 69 | 70 | 71 | if "%JAVACMD%"=="" set JAVACMD=java 72 | 73 | if "%REPO%"=="" set REPO=%BASEDIR%\repo 74 | 75 | set CLASSPATH="%BASEDIR%"\etc;"%REPO%"\* 76 | 77 | set ENDORSED_DIR= 78 | if NOT "%ENDORSED_DIR%" == "" set CLASSPATH="%BASEDIR%"\%ENDORSED_DIR%\*;%CLASSPATH% 79 | 80 | if NOT "%CLASSPATH_PREFIX%" == "" set CLASSPATH=%CLASSPATH_PREFIX%;%CLASSPATH% 81 | 82 | @REM Reaching here means variables are defined and arguments have been captured 83 | :endInit 84 | 85 | %JAVACMD% %JAVA_OPTS% -Xms512M -Xmx31G -classpath %CLASSPATH% -Dapp.name="SearchMsmarco" -Dapp.repo="%REPO%" -Dapp.home="%BASEDIR%" -Dbasedir="%BASEDIR%" io.anserini.search.SearchMsmarco %CMD_LINE_ARGS% 86 | if %ERRORLEVEL% NEQ 0 goto error 87 | goto end 88 | 89 | :error 90 | if "%OS%"=="Windows_NT" @endlocal 91 | set ERROR_CODE=%ERRORLEVEL% 92 | 93 | :end 94 | @REM set local scope for the variables with windows NT shell 95 | if "%OS%"=="Windows_NT" goto endNT 96 | 97 | @REM For old DOS remove the set variables from ENV - we assume they were not set 98 | @REM before we started - at least we don't leave any baggage around 99 | set CMD_LINE_ARGS= 100 | goto postExec 101 | 102 | :endNT 103 | @REM If error code is set to 1 then the endlocal was done already in :error. 104 | if %ERROR_CODE% EQU 0 @endlocal 105 | 106 | 107 | :postExec 108 | 109 | if "%FORCE_EXIT_ON_ERROR%" == "on" ( 110 | if %ERROR_CODE% NEQ 0 exit %ERROR_CODE% 111 | ) 112 | 113 | exit /B %ERROR_CODE% 114 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/bin/SearchVectorCollection.bat: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Copyright 2001-2006 The Apache Software Foundation. 3 | @REM 4 | @REM Licensed under the Apache License, Version 2.0 (the "License"); 5 | @REM you may not use this file except in compliance with the License. 6 | @REM You may obtain a copy of the License at 7 | @REM 8 | @REM http://www.apache.org/licenses/LICENSE-2.0 9 | @REM 10 | @REM Unless required by applicable law or agreed to in writing, software 11 | @REM distributed under the License is distributed on an "AS IS" BASIS, 12 | @REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @REM See the License for the specific language governing permissions and 14 | @REM limitations under the License. 15 | @REM ---------------------------------------------------------------------------- 16 | @REM 17 | @REM Copyright (c) 2001-2006 The Apache Software Foundation. All rights 18 | @REM reserved. 19 | 20 | @echo off 21 | 22 | set ERROR_CODE=0 23 | 24 | :init 25 | @REM Decide how to startup depending on the version of windows 26 | 27 | @REM -- Win98ME 28 | if NOT "%OS%"=="Windows_NT" goto Win9xArg 29 | 30 | @REM set local scope for the variables with windows NT shell 31 | if "%OS%"=="Windows_NT" @setlocal 32 | 33 | @REM -- 4NT shell 34 | if "%eval[2+2]" == "4" goto 4NTArgs 35 | 36 | @REM -- Regular WinNT shell 37 | set CMD_LINE_ARGS=%* 38 | goto WinNTGetScriptDir 39 | 40 | @REM The 4NT Shell from jp software 41 | :4NTArgs 42 | set CMD_LINE_ARGS=%$ 43 | goto WinNTGetScriptDir 44 | 45 | :Win9xArg 46 | @REM Slurp the command line arguments. This loop allows for an unlimited number 47 | @REM of arguments (up to the command line limit, anyway). 48 | set CMD_LINE_ARGS= 49 | :Win9xApp 50 | if %1a==a goto Win9xGetScriptDir 51 | set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1 52 | shift 53 | goto Win9xApp 54 | 55 | :Win9xGetScriptDir 56 | set SAVEDIR=%CD% 57 | %0\ 58 | cd %0\..\.. 59 | set BASEDIR=%CD% 60 | cd %SAVEDIR% 61 | set SAVE_DIR= 62 | goto repoSetup 63 | 64 | :WinNTGetScriptDir 65 | for %%i in ("%~dp0..") do set "BASEDIR=%%~fi" 66 | 67 | :repoSetup 68 | set REPO= 69 | 70 | 71 | if "%JAVACMD%"=="" set JAVACMD=java 72 | 73 | if "%REPO%"=="" set REPO=%BASEDIR%\repo 74 | 75 | set CLASSPATH="%BASEDIR%"\etc;"%REPO%"\* 76 | 77 | set ENDORSED_DIR= 78 | if NOT "%ENDORSED_DIR%" == "" set CLASSPATH="%BASEDIR%"\%ENDORSED_DIR%\*;%CLASSPATH% 79 | 80 | if NOT "%CLASSPATH_PREFIX%" == "" set CLASSPATH=%CLASSPATH_PREFIX%;%CLASSPATH% 81 | 82 | @REM Reaching here means variables are defined and arguments have been captured 83 | :endInit 84 | 85 | %JAVACMD% %JAVA_OPTS% -Xms512M -Xmx31G -classpath %CLASSPATH% -Dapp.name="SearchVectorCollection" -Dapp.repo="%REPO%" -Dapp.home="%BASEDIR%" -Dbasedir="%BASEDIR%" io.anserini.search.SearchVectorCollection %CMD_LINE_ARGS% 86 | if %ERRORLEVEL% NEQ 0 goto error 87 | goto end 88 | 89 | :error 90 | if "%OS%"=="Windows_NT" @endlocal 91 | set ERROR_CODE=%ERRORLEVEL% 92 | 93 | :end 94 | @REM set local scope for the variables with windows NT shell 95 | if "%OS%"=="Windows_NT" goto endNT 96 | 97 | @REM For old DOS remove the set variables from ENV - we assume they were not set 98 | @REM before we started - at least we don't leave any baggage around 99 | set CMD_LINE_ARGS= 100 | goto postExec 101 | 102 | :endNT 103 | @REM If error code is set to 1 then the endlocal was done already in :error. 104 | if %ERROR_CODE% EQU 0 @endlocal 105 | 106 | 107 | :postExec 108 | 109 | if "%FORCE_EXIT_ON_ERROR%" == "on" ( 110 | if %ERROR_CODE% NEQ 0 exit %ERROR_CODE% 111 | ) 112 | 113 | exit /B %ERROR_CODE% 114 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/bin/SimpleSearcher: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | # ---------------------------------------------------------------------------- 3 | # Copyright 2001-2006 The Apache Software Foundation. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # ---------------------------------------------------------------------------- 17 | # 18 | # Copyright (c) 2001-2006 The Apache Software Foundation. All rights 19 | # reserved. 20 | 21 | 22 | # resolve links - $0 may be a softlink 23 | PRG="$0" 24 | 25 | while [ -h "$PRG" ]; do 26 | ls=`ls -ld "$PRG"` 27 | link=`expr "$ls" : '.*-> \(.*\)$'` 28 | if expr "$link" : '/.*' > /dev/null; then 29 | PRG="$link" 30 | else 31 | PRG=`dirname "$PRG"`/"$link" 32 | fi 33 | done 34 | 35 | PRGDIR=`dirname "$PRG"` 36 | BASEDIR=`cd "$PRGDIR/.." >/dev/null; pwd` 37 | 38 | # Reset the REPO variable. If you need to influence this use the environment setup file. 39 | REPO= 40 | 41 | 42 | # OS specific support. $var _must_ be set to either true or false. 43 | cygwin=false; 44 | darwin=false; 45 | case "`uname`" in 46 | CYGWIN*) cygwin=true ;; 47 | Darwin*) darwin=true 48 | if [ -z "$JAVA_VERSION" ] ; then 49 | JAVA_VERSION="CurrentJDK" 50 | else 51 | echo "Using Java version: $JAVA_VERSION" 52 | fi 53 | if [ -z "$JAVA_HOME" ]; then 54 | if [ -x "/usr/libexec/java_home" ]; then 55 | JAVA_HOME=`/usr/libexec/java_home` 56 | else 57 | JAVA_HOME=/System/Library/Frameworks/JavaVM.framework/Versions/${JAVA_VERSION}/Home 58 | fi 59 | fi 60 | ;; 61 | esac 62 | 63 | if [ -z "$JAVA_HOME" ] ; then 64 | if [ -r /etc/gentoo-release ] ; then 65 | JAVA_HOME=`java-config --jre-home` 66 | fi 67 | fi 68 | 69 | # For Cygwin, ensure paths are in UNIX format before anything is touched 70 | if $cygwin ; then 71 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --unix "$JAVA_HOME"` 72 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --unix "$CLASSPATH"` 73 | fi 74 | 75 | # If a specific java binary isn't specified search for the standard 'java' binary 76 | if [ -z "$JAVACMD" ] ; then 77 | if [ -n "$JAVA_HOME" ] ; then 78 | if [ -x "$JAVA_HOME/jre/sh/java" ] ; then 79 | # IBM's JDK on AIX uses strange locations for the executables 80 | JAVACMD="$JAVA_HOME/jre/sh/java" 81 | else 82 | JAVACMD="$JAVA_HOME/bin/java" 83 | fi 84 | else 85 | JAVACMD=`which java` 86 | fi 87 | fi 88 | 89 | if [ ! -x "$JAVACMD" ] ; then 90 | echo "Error: JAVA_HOME is not defined correctly." 1>&2 91 | echo " We cannot execute $JAVACMD" 1>&2 92 | exit 1 93 | fi 94 | 95 | if [ -z "$REPO" ] 96 | then 97 | REPO="$BASEDIR"/repo 98 | fi 99 | 100 | CLASSPATH="$BASEDIR"/etc:"$REPO"/* 101 | 102 | ENDORSED_DIR= 103 | if [ -n "$ENDORSED_DIR" ] ; then 104 | CLASSPATH=$BASEDIR/$ENDORSED_DIR/*:$CLASSPATH 105 | fi 106 | 107 | if [ -n "$CLASSPATH_PREFIX" ] ; then 108 | CLASSPATH=$CLASSPATH_PREFIX:$CLASSPATH 109 | fi 110 | 111 | # For Cygwin, switch paths to Windows format before running java 112 | if $cygwin; then 113 | [ -n "$CLASSPATH" ] && CLASSPATH=`cygpath --path --windows "$CLASSPATH"` 114 | [ -n "$JAVA_HOME" ] && JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"` 115 | [ -n "$HOME" ] && HOME=`cygpath --path --windows "$HOME"` 116 | [ -n "$BASEDIR" ] && BASEDIR=`cygpath --path --windows "$BASEDIR"` 117 | [ -n "$REPO" ] && REPO=`cygpath --path --windows "$REPO"` 118 | fi 119 | 120 | exec "$JAVACMD" $JAVA_OPTS -Xms512M -Xmx31G \ 121 | -classpath "$CLASSPATH" \ 122 | -Dapp.name="SimpleSearcher" \ 123 | -Dapp.pid="$$" \ 124 | -Dapp.repo="$REPO" \ 125 | -Dapp.home="$BASEDIR" \ 126 | -Dbasedir="$BASEDIR" \ 127 | io.anserini.search.SimpleSearcher \ 128 | "$@" 129 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/bin/SimpleSearcher.bat: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Copyright 2001-2006 The Apache Software Foundation. 3 | @REM 4 | @REM Licensed under the Apache License, Version 2.0 (the "License"); 5 | @REM you may not use this file except in compliance with the License. 6 | @REM You may obtain a copy of the License at 7 | @REM 8 | @REM http://www.apache.org/licenses/LICENSE-2.0 9 | @REM 10 | @REM Unless required by applicable law or agreed to in writing, software 11 | @REM distributed under the License is distributed on an "AS IS" BASIS, 12 | @REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @REM See the License for the specific language governing permissions and 14 | @REM limitations under the License. 15 | @REM ---------------------------------------------------------------------------- 16 | @REM 17 | @REM Copyright (c) 2001-2006 The Apache Software Foundation. All rights 18 | @REM reserved. 19 | 20 | @echo off 21 | 22 | set ERROR_CODE=0 23 | 24 | :init 25 | @REM Decide how to startup depending on the version of windows 26 | 27 | @REM -- Win98ME 28 | if NOT "%OS%"=="Windows_NT" goto Win9xArg 29 | 30 | @REM set local scope for the variables with windows NT shell 31 | if "%OS%"=="Windows_NT" @setlocal 32 | 33 | @REM -- 4NT shell 34 | if "%eval[2+2]" == "4" goto 4NTArgs 35 | 36 | @REM -- Regular WinNT shell 37 | set CMD_LINE_ARGS=%* 38 | goto WinNTGetScriptDir 39 | 40 | @REM The 4NT Shell from jp software 41 | :4NTArgs 42 | set CMD_LINE_ARGS=%$ 43 | goto WinNTGetScriptDir 44 | 45 | :Win9xArg 46 | @REM Slurp the command line arguments. This loop allows for an unlimited number 47 | @REM of arguments (up to the command line limit, anyway). 48 | set CMD_LINE_ARGS= 49 | :Win9xApp 50 | if %1a==a goto Win9xGetScriptDir 51 | set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1 52 | shift 53 | goto Win9xApp 54 | 55 | :Win9xGetScriptDir 56 | set SAVEDIR=%CD% 57 | %0\ 58 | cd %0\..\.. 59 | set BASEDIR=%CD% 60 | cd %SAVEDIR% 61 | set SAVE_DIR= 62 | goto repoSetup 63 | 64 | :WinNTGetScriptDir 65 | for %%i in ("%~dp0..") do set "BASEDIR=%%~fi" 66 | 67 | :repoSetup 68 | set REPO= 69 | 70 | 71 | if "%JAVACMD%"=="" set JAVACMD=java 72 | 73 | if "%REPO%"=="" set REPO=%BASEDIR%\repo 74 | 75 | set CLASSPATH="%BASEDIR%"\etc;"%REPO%"\* 76 | 77 | set ENDORSED_DIR= 78 | if NOT "%ENDORSED_DIR%" == "" set CLASSPATH="%BASEDIR%"\%ENDORSED_DIR%\*;%CLASSPATH% 79 | 80 | if NOT "%CLASSPATH_PREFIX%" == "" set CLASSPATH=%CLASSPATH_PREFIX%;%CLASSPATH% 81 | 82 | @REM Reaching here means variables are defined and arguments have been captured 83 | :endInit 84 | 85 | %JAVACMD% %JAVA_OPTS% -Xms512M -Xmx31G -classpath %CLASSPATH% -Dapp.name="SimpleSearcher" -Dapp.repo="%REPO%" -Dapp.home="%BASEDIR%" -Dbasedir="%BASEDIR%" io.anserini.search.SimpleSearcher %CMD_LINE_ARGS% 86 | if %ERRORLEVEL% NEQ 0 goto error 87 | goto end 88 | 89 | :error 90 | if "%OS%"=="Windows_NT" @endlocal 91 | set ERROR_CODE=%ERRORLEVEL% 92 | 93 | :end 94 | @REM set local scope for the variables with windows NT shell 95 | if "%OS%"=="Windows_NT" goto endNT 96 | 97 | @REM For old DOS remove the set variables from ENV - we assume they were not set 98 | @REM before we started - at least we don't leave any baggage around 99 | set CMD_LINE_ARGS= 100 | goto postExec 101 | 102 | :endNT 103 | @REM If error code is set to 1 then the endlocal was done already in :error. 104 | if %ERROR_CODE% EQU 0 @endlocal 105 | 106 | 107 | :postExec 108 | 109 | if "%FORCE_EXIT_ON_ERROR%" == "on" ( 110 | if %ERROR_CODE% NEQ 0 exit %ERROR_CODE% 111 | ) 112 | 113 | exit /B %ERROR_CODE% 114 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/bin/SimpleTweetSearcher.bat: -------------------------------------------------------------------------------- 1 | @REM ---------------------------------------------------------------------------- 2 | @REM Copyright 2001-2006 The Apache Software Foundation. 3 | @REM 4 | @REM Licensed under the Apache License, Version 2.0 (the "License"); 5 | @REM you may not use this file except in compliance with the License. 6 | @REM You may obtain a copy of the License at 7 | @REM 8 | @REM http://www.apache.org/licenses/LICENSE-2.0 9 | @REM 10 | @REM Unless required by applicable law or agreed to in writing, software 11 | @REM distributed under the License is distributed on an "AS IS" BASIS, 12 | @REM WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | @REM See the License for the specific language governing permissions and 14 | @REM limitations under the License. 15 | @REM ---------------------------------------------------------------------------- 16 | @REM 17 | @REM Copyright (c) 2001-2006 The Apache Software Foundation. All rights 18 | @REM reserved. 19 | 20 | @echo off 21 | 22 | set ERROR_CODE=0 23 | 24 | :init 25 | @REM Decide how to startup depending on the version of windows 26 | 27 | @REM -- Win98ME 28 | if NOT "%OS%"=="Windows_NT" goto Win9xArg 29 | 30 | @REM set local scope for the variables with windows NT shell 31 | if "%OS%"=="Windows_NT" @setlocal 32 | 33 | @REM -- 4NT shell 34 | if "%eval[2+2]" == "4" goto 4NTArgs 35 | 36 | @REM -- Regular WinNT shell 37 | set CMD_LINE_ARGS=%* 38 | goto WinNTGetScriptDir 39 | 40 | @REM The 4NT Shell from jp software 41 | :4NTArgs 42 | set CMD_LINE_ARGS=%$ 43 | goto WinNTGetScriptDir 44 | 45 | :Win9xArg 46 | @REM Slurp the command line arguments. This loop allows for an unlimited number 47 | @REM of arguments (up to the command line limit, anyway). 48 | set CMD_LINE_ARGS= 49 | :Win9xApp 50 | if %1a==a goto Win9xGetScriptDir 51 | set CMD_LINE_ARGS=%CMD_LINE_ARGS% %1 52 | shift 53 | goto Win9xApp 54 | 55 | :Win9xGetScriptDir 56 | set SAVEDIR=%CD% 57 | %0\ 58 | cd %0\..\.. 59 | set BASEDIR=%CD% 60 | cd %SAVEDIR% 61 | set SAVE_DIR= 62 | goto repoSetup 63 | 64 | :WinNTGetScriptDir 65 | for %%i in ("%~dp0..") do set "BASEDIR=%%~fi" 66 | 67 | :repoSetup 68 | set REPO= 69 | 70 | 71 | if "%JAVACMD%"=="" set JAVACMD=java 72 | 73 | if "%REPO%"=="" set REPO=%BASEDIR%\repo 74 | 75 | set CLASSPATH="%BASEDIR%"\etc;"%REPO%"\* 76 | 77 | set ENDORSED_DIR= 78 | if NOT "%ENDORSED_DIR%" == "" set CLASSPATH="%BASEDIR%"\%ENDORSED_DIR%\*;%CLASSPATH% 79 | 80 | if NOT "%CLASSPATH_PREFIX%" == "" set CLASSPATH=%CLASSPATH_PREFIX%;%CLASSPATH% 81 | 82 | @REM Reaching here means variables are defined and arguments have been captured 83 | :endInit 84 | 85 | %JAVACMD% %JAVA_OPTS% -Xms512M -Xmx31G -classpath %CLASSPATH% -Dapp.name="SimpleTweetSearcher" -Dapp.repo="%REPO%" -Dapp.home="%BASEDIR%" -Dbasedir="%BASEDIR%" io.anserini.search.SimpleTweetSearcher %CMD_LINE_ARGS% 86 | if %ERRORLEVEL% NEQ 0 goto error 87 | goto end 88 | 89 | :error 90 | if "%OS%"=="Windows_NT" @endlocal 91 | set ERROR_CODE=%ERRORLEVEL% 92 | 93 | :end 94 | @REM set local scope for the variables with windows NT shell 95 | if "%OS%"=="Windows_NT" goto endNT 96 | 97 | @REM For old DOS remove the set variables from ENV - we assume they were not set 98 | @REM before we started - at least we don't leave any baggage around 99 | set CMD_LINE_ARGS= 100 | goto postExec 101 | 102 | :endNT 103 | @REM If error code is set to 1 then the endlocal was done already in :error. 104 | if %ERROR_CODE% EQU 0 @endlocal 105 | 106 | 107 | :postExec 108 | 109 | if "%FORCE_EXIT_ON_ERROR%" == "on" ( 110 | if %ERROR_CODE% NEQ 0 exit %ERROR_CODE% 111 | ) 112 | 113 | exit /B %ERROR_CODE% 114 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/annotations-java5-23.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/annotations-java5-23.0.0.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/anserini-0.15.1-SNAPSHOT.jar: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:69c94ec169d31284d484ba1eaaaaf0efdc21258a3e10643474132fc35b07d618 3 | size 64258109 4 | -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/ant-1.10.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/ant-1.10.11.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/ant-launcher-1.10.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/ant-launcher-1.10.11.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/api-0.18.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/api-0.18.0.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/args4j-2.33.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/args4j-2.33.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/cbor-0.8.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/cbor-0.8.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/checker-qual-3.8.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/checker-qual-3.8.0.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/commons-compress-1.21.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/commons-compress-1.21.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/commons-csv-1.8.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/commons-csv-1.8.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/commons-io-2.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/commons-io-2.5.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/commons-lang3-3.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/commons-lang3-3.5.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/commons-pool2-2.11.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/commons-pool2-2.11.1.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/commons-text-1.9.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/commons-text-1.9.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/error_prone_annotations-2.5.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/error_prone_annotations-2.5.1.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/failureaccess-1.0.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/failureaccess-1.0.1.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/fastutil-8.5.8.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/fastutil-8.5.8.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/gson-2.9.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/gson-2.9.0.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/guava-30.1.1-jre.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/guava-30.1.1-jre.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/j2objc-annotations-1.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/j2objc-annotations-1.3.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/jackson-annotations-2.12.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/jackson-annotations-2.12.5.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/jackson-core-2.12.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/jackson-core-2.12.5.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/jackson-databind-2.12.6.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/jackson-databind-2.12.6.1.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/jackson-dataformat-yaml-2.12.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/jackson-dataformat-yaml-2.12.5.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/jackson-datatype-jdk8-2.12.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/jackson-datatype-jdk8-2.12.5.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/jbibtex-1.0.19.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/jbibtex-1.0.19.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/jna-5.11.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/jna-5.11.0.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/jsoup-1.15.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/jsoup-1.15.3.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/jsr305-2.0.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/jsr305-2.0.1.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/listenablefuture-9999.0-empty-to-avoid-conflict-with-guava.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/log4j-api-2.17.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/log4j-api-2.17.1.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/log4j-core-2.17.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/log4j-core-2.17.1.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/lucene-analysis-common-9.3.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/lucene-analysis-common-9.3.0.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/lucene-analysis-kuromoji-9.3.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/lucene-analysis-kuromoji-9.3.0.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/lucene-analysis-morfologik-9.3.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/lucene-analysis-morfologik-9.3.0.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/lucene-backward-codecs-9.3.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/lucene-backward-codecs-9.3.0.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/lucene-codecs-9.3.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/lucene-codecs-9.3.0.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/lucene-core-9.3.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/lucene-core-9.3.0.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/lucene-queries-9.3.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/lucene-queries-9.3.0.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/lucene-queryparser-9.3.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/lucene-queryparser-9.3.0.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/lucene-sandbox-9.3.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/lucene-sandbox-9.3.0.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/mockito-all-1.10.19.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/mockito-all-1.10.19.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/morfologik-fsa-2.1.9.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/morfologik-fsa-2.1.9.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/morfologik-polish-2.1.9.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/morfologik-polish-2.1.9.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/morfologik-stemming-2.1.9.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/morfologik-stemming-2.1.9.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/morfologik-ukrainian-search-4.9.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/morfologik-ukrainian-search-4.9.1.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/slf4j-api-1.7.32.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/slf4j-api-1.7.32.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/slf4j-simple-1.7.32.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/slf4j-simple-1.7.32.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/snakeyaml-1.27.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/snakeyaml-1.27.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/tokenizers-0.18.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/tokenizers-0.18.0.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/trec-car-tools-java-19.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/trec-car-tools-java-19.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/twitter-text-2.0.10.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/twitter-text-2.0.10.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/wikiclean-1.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/wikiclean-1.1.jar -------------------------------------------------------------------------------- /src/anserini/target/appassembler/repo/xz-1.9.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/anserini/target/appassembler/repo/xz-1.9.jar -------------------------------------------------------------------------------- /src/data/config/_default.yaml: -------------------------------------------------------------------------------- 1 | # disable the hydra outputs 2 | defaults: 3 | - _self_ 4 | # override package to be imported from the folder 5 | - override /hydra/hydra_logging@_group_: none 6 | - override /hydra/job_logging@_group_: none 7 | 8 | hydra: 9 | output_subdir: null 10 | run: 11 | dir: . -------------------------------------------------------------------------------- /src/data/config/_example.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: MSMARCO-passage 5 | - mode: train 6 | - model: sparse 7 | - index: invvec 8 | # add _self_ here so that the following arguments can be rewritten 9 | - _self_ 10 | 11 | base: 12 | eval_batch_size: 2 13 | num_worker: 0 14 | 15 | train: 16 | batch_size: 2 17 | neg_type: random 18 | -------------------------------------------------------------------------------- /src/data/config/ance.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: MSMARCO-passage 5 | - model: dense 6 | - index: faiss 7 | - mode: train 8 | # add _self_ here so that the following arguments can be rewritten 9 | - _self_ 10 | 11 | base: 12 | plm: bert 13 | 14 | model: 15 | model_type: dpr 16 | 17 | index: 18 | index_type: Flat 19 | 20 | train: 21 | nneg: 7 22 | neg_type: DPR 23 | learning_rate: 3e-5 24 | scheduler: linear 25 | 26 | -------------------------------------------------------------------------------- /src/data/config/ar2.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: MSMARCO-passage 5 | - model: dense 6 | - index: faiss 7 | - mode: eval 8 | # add _self_ here so that the following arguments can be rewritten 9 | - _self_ 10 | 11 | base: 12 | text_col: [1, 2] 13 | 14 | model: 15 | model_type: ar2 16 | untie_encoder: true 17 | 18 | index: 19 | index_type: Flat 20 | 21 | eval: 22 | eval_posting_length: true 23 | 24 | -------------------------------------------------------------------------------- /src/data/config/autotsg.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: NQ320k 5 | - model: generative 6 | - index: wordset 7 | - mode: train 8 | - extra: code 9 | # add _self_ here so that the following arguments can be rewritten 10 | - _self_ 11 | 12 | base: 13 | plm: t5 14 | parallel: query 15 | 16 | model: 17 | model_type: autotsg 18 | nbeam: 10 19 | 20 | train: 21 | # only query-pos pair 22 | neg_type: none 23 | epoch: 50 24 | batch_size: 400 25 | bf16: true 26 | 27 | learning_rate: 1e-3 28 | scheduler: linear 29 | eval_delay: 20e 30 | early_stop_patience: 5 31 | main_metric: MRR@10 32 | 33 | code: 34 | code_type: words_comma_plus_stem 35 | code_length: 26 36 | code_sep: "," 37 | reduce_code: min 38 | permute_code: 0 39 | -------------------------------------------------------------------------------- /src/data/config/base/LECARD.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | 4 | dataset: LECARD 5 | text_length: 512 6 | query_length: 512 7 | 8 | max_text_length: 512 9 | max_query_length: 512 10 | 11 | plm: bert-chinese 12 | 13 | text_col: [1] 14 | text_col_sep: "sep" 15 | 16 | eval_metric: [mrr, map, precision, ndcg, recall] 17 | eval_metric_cutoff: [5, 10, 20, 30] -------------------------------------------------------------------------------- /src/data/config/base/MS300k-unseen.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | 4 | dataset: MS300k-unseen 5 | text_length: 512 6 | query_length: 64 7 | 8 | max_text_length: 512 9 | max_query_length: 64 10 | 11 | plm: t5 12 | 13 | # title + abstract 14 | text_col: [1, 2, 3] 15 | text_col_sep: " " 16 | -------------------------------------------------------------------------------- /src/data/config/base/MS300k.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | 4 | dataset: MS300k 5 | text_length: 512 6 | query_length: 64 7 | 8 | max_text_length: 512 9 | max_query_length: 64 10 | 11 | plm: t5 12 | 13 | # title + abstract 14 | text_col: [1, 2, 3] 15 | text_col_sep: " " 16 | -------------------------------------------------------------------------------- /src/data/config/base/MS600k.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | 4 | dataset: MS600k 5 | text_length: 512 6 | query_length: 64 7 | 8 | max_text_length: 512 9 | max_query_length: 64 10 | 11 | plm: t5 12 | 13 | # title + abstract 14 | text_col: [1, 2, 3] 15 | text_col_sep: " " 16 | -------------------------------------------------------------------------------- /src/data/config/base/MSMARCO-passage.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | 4 | dataset: MSMARCO-passage 5 | text_length: 128 6 | query_length: 32 7 | 8 | max_text_length: 256 9 | max_query_length: 64 10 | 11 | plm: bert 12 | 13 | text_col: [2] 14 | text_col_sep: "sep" 15 | -------------------------------------------------------------------------------- /src/data/config/base/NQ-50k-seen.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | 4 | dataset: NQ-50k-seen 5 | text_length: 512 6 | query_length: 32 7 | 8 | max_text_length: 512 9 | max_query_length: 64 10 | 11 | plm: t5 12 | 13 | # title + abstract + content 14 | text_col: [1, 2, 3] 15 | # t5 by default has no sep token 16 | text_col_sep: " " 17 | -------------------------------------------------------------------------------- /src/data/config/base/NQ-50k-unseen.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | 4 | dataset: NQ-50k-unseen 5 | text_length: 512 6 | query_length: 32 7 | 8 | max_text_length: 512 9 | max_query_length: 64 10 | 11 | plm: t5 12 | 13 | # title + abstract + content 14 | text_col: [1, 2, 3] 15 | # t5 by default has no sep token 16 | text_col_sep: " " 17 | -------------------------------------------------------------------------------- /src/data/config/base/NQ-open.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | 4 | dataset: NQ-open 5 | text_length: 128 6 | query_length: 32 7 | 8 | max_text_length: 256 9 | max_query_length: 64 10 | 11 | plm: bert 12 | 13 | text_col: [1, 2] 14 | text_col_sep: sep 15 | -------------------------------------------------------------------------------- /src/data/config/base/NQ320k-seen.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | 4 | dataset: NQ320k-seen 5 | text_length: 512 6 | query_length: 32 7 | 8 | max_text_length: 512 9 | max_query_length: 64 10 | 11 | plm: t5 12 | 13 | # title + abstract + content 14 | text_col: [1, 2, 3] 15 | # t5 by default has no sep token 16 | text_col_sep: " " 17 | -------------------------------------------------------------------------------- /src/data/config/base/NQ320k-unseen.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | 4 | dataset: NQ320k-unseen 5 | text_length: 512 6 | query_length: 32 7 | 8 | max_text_length: 512 9 | max_query_length: 64 10 | 11 | plm: t5 12 | 13 | # title + abstract + content 14 | text_col: [1, 2, 3] 15 | # t5 by default has no sep token 16 | text_col_sep: " " 17 | -------------------------------------------------------------------------------- /src/data/config/base/NQ320k.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | 4 | dataset: NQ320k 5 | text_length: 512 6 | query_length: 32 7 | 8 | max_text_length: 512 9 | max_query_length: 64 10 | 11 | plm: t5 12 | 13 | # title + abstract + content 14 | text_col: [1, 2, 3] 15 | # t5 by default has no sep token 16 | text_col_sep: " " 17 | -------------------------------------------------------------------------------- /src/data/config/base/Rand300k-filter.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | 4 | dataset: Rand300k-filter 5 | text_length: 512 6 | query_length: 64 7 | 8 | max_text_length: 512 9 | max_query_length: 64 10 | 11 | plm: t5 12 | 13 | # title + abstract 14 | text_col: [1, 2, 3] 15 | text_col_sep: " " 16 | -------------------------------------------------------------------------------- /src/data/config/base/Top300k-filter.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | 4 | dataset: Top300k-filter 5 | text_length: 512 6 | query_length: 64 7 | 8 | max_text_length: 512 9 | max_query_length: 64 10 | 11 | plm: t5 12 | 13 | # title + abstract 14 | text_col: [1, 2, 3] 15 | text_col_sep: " " 16 | -------------------------------------------------------------------------------- /src/data/config/base/_default.yaml: -------------------------------------------------------------------------------- 1 | # the root directory of the raw data 2 | data_root: /share/project/peitian/Data/Adon 3 | plm_root: /share/project/peitian/Data/huggingface-PLMs 4 | seed: 42 5 | 6 | # the device to run the model or script 7 | device: 0 8 | 9 | text_type: default 10 | data_format: memmap 11 | num_worker: 2 12 | 13 | # the batch size fed to the loader_eval 14 | eval_batch_size: 100 15 | # the dataset to evaluate the model or run commands 16 | eval_set: dev 17 | # the mode to evaluate the model: retrieve or rerank 18 | eval_mode: retrieve 19 | # use the debug mode (will train 2 steps and encode 10 steps) 20 | debug: false 21 | 22 | # when using distributed training/evaluating, we can choose to split text 23 | # or query across processes 24 | parallel: text 25 | -------------------------------------------------------------------------------- /src/data/config/bivfpq-nq.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: NQ-open 5 | - model: _default 6 | - mode: eval 7 | 8 | model: 9 | model_type: uniretriever 10 | 11 | return_embedding: true 12 | embedding_src: AR2 13 | 14 | x_model: TokIVF 15 | x_index_type: invhit 16 | x_hits: 0 17 | x_load_encode: true 18 | x_text_gate_k: 3 19 | x_load_ckpt: best 20 | x_posting_prune: 0.996 21 | 22 | y_model: TopIVF 23 | y_index_type: invhit 24 | y_hits: 0 25 | y_load_encode: false 26 | y_query_gate_k: 20 27 | y_load_ckpt: best 28 | 29 | x_eval_flops: false 30 | y_eval_flops: false 31 | x_eval_posting_length: true 32 | y_eval_posting_length: true 33 | 34 | eval: 35 | verifier_type: pq 36 | verifier_src: DistillVQ_d-AR2 37 | verifier_index: OPQ96,PQ96x8 38 | -------------------------------------------------------------------------------- /src/data/config/bivfpq.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: MSMARCO-passage 5 | - model: _default 6 | - mode: eval 7 | 8 | base: 9 | text_col: [1, 2] 10 | 11 | model: 12 | return_embedding: true 13 | embedding_src: RetroMAE 14 | 15 | model_type: uniretriever 16 | 17 | x_model: TokIVF 18 | x_index_type: invhit 19 | x_hits: 0 20 | x_load_encode: true 21 | x_text_gate_k: 3 22 | x_load_ckpt: best 23 | x_posting_prune: 0.996 24 | 25 | y_model: TopIVF 26 | y_index_type: invhit 27 | y_hits: 0 28 | y_load_encode: false 29 | y_query_gate_k: 20 30 | y_load_ckpt: best 31 | 32 | x_eval_flops: false 33 | y_eval_flops: false 34 | x_eval_posting_length: true 35 | y_eval_posting_length: true 36 | 37 | eval: 38 | verifier_type: pq 39 | verifier_src: DistillVQ_d-RetroMAE 40 | verifier_index: OPQ96,PQ96x8 41 | -------------------------------------------------------------------------------- /src/data/config/bm25.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: MSMARCO-passage 5 | - model: sparse 6 | - index: bm25 7 | - mode: eval 8 | # add _self_ here so that the following arguments can be rewritten 9 | - _self_ 10 | 11 | base: 12 | device: cpu 13 | plm: bert 14 | eval_batch_size: 1 15 | 16 | model: 17 | model_type: bm25 18 | 19 | index: 20 | pretokenize: false 21 | -------------------------------------------------------------------------------- /src/data/config/bow.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: NQ 5 | - model: generative 6 | - index: wordset 7 | - mode: train 8 | - extra: code 9 | # add _self_ here so that the following arguments can be rewritten 10 | - _self_ 11 | 12 | base: 13 | plm: t5 14 | parallel: query 15 | 16 | model: 17 | model_type: bow 18 | nbeam: 10 19 | 20 | train: 21 | # only query-pos pair 22 | neg_type: none 23 | epoch: 50 24 | batch_size: 400 25 | bf16: true 26 | 27 | learning_rate: 1e-3 28 | scheduler: linear 29 | eval_delay: 20e 30 | early_stop_patience: 5 31 | main_metric: MRR@10 32 | 33 | code: 34 | code_type: words_comma_plus_stem 35 | code_length: 26 36 | code_sep: "," 37 | reduce_code: min 38 | permute_code: 0 39 | -------------------------------------------------------------------------------- /src/data/config/coil.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: MSMARCO-passage 5 | - model: sparse 6 | - index: invvec 7 | - mode: train 8 | # add _self_ here so that the following arguments can be rewritten 9 | - _self_ 10 | 11 | model: 12 | model_type: coil 13 | return_special_mask: true 14 | token_dim: 32 15 | 16 | train: 17 | nneg: 7 18 | 19 | eval: 20 | eval_posting_length: true 21 | -------------------------------------------------------------------------------- /src/data/config/colbert.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: MSMARCO-passage 5 | - model: ranker 6 | - mode: train 7 | # add _self_ here so that the following arguments can be rewritten 8 | - _self_ 9 | 10 | base: 11 | plm: bert 12 | 13 | model: 14 | model_type: colbert 15 | token_dim: 128 16 | 17 | train: 18 | learning_rate: 3e-5 19 | 20 | eval: 21 | eval_mode: rerank 22 | -------------------------------------------------------------------------------- /src/data/config/crossenc.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: MSMARCO-passage 5 | - model: ranker 6 | - train: neg 7 | - mode: train 8 | # add _self_ here so that the following arguments can be rewritten 9 | - _self_ 10 | 11 | base: 12 | plm: bert 13 | 14 | model: 15 | model_type: crossenc 16 | 17 | train: 18 | batch_size: 16 19 | learning_rate: 3e-5 20 | nneg: 7 21 | -------------------------------------------------------------------------------- /src/data/config/deepimpact.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: MSMARCO-passage 5 | - model: sparse 6 | - index: impact 7 | - mode: train 8 | # add _self_ here so that the following arguments can be rewritten 9 | - _self_ 10 | 11 | base: 12 | plm: bert 13 | 14 | model: 15 | model_type: deepimpact 16 | return_special_mask: true 17 | 18 | index: 19 | granularity: word 20 | 21 | train: 22 | nneg: 7 23 | max_grad_norm: 2.0 24 | -------------------------------------------------------------------------------- /src/data/config/deepspeed.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | 11 | "zero_optimization": { 12 | "stage": 2, 13 | "allgather_partitions": true, 14 | "allgather_bucket_size": 5e8, 15 | "overlap_comm": true, 16 | "reduce_scatter": true, 17 | "reduce_bucket_size": 5e8, 18 | "contiguous_gradients": true 19 | }, 20 | 21 | "gradient_accumulation_steps": "auto", 22 | "gradient_clipping": "auto", 23 | "steps_per_print": 2000, 24 | "train_batch_size": "auto", 25 | "train_micro_batch_size_per_gpu": "auto", 26 | "wall_clock_breakdown": false, 27 | "checkpoint-activations": true, 28 | "checkpoint-num-layers": 1, 29 | "partition-activations": true, 30 | "synchronize-each-layer": true 31 | } 32 | -------------------------------------------------------------------------------- /src/data/config/distillvq.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: MSMARCO-passage 5 | - model: dense 6 | - index: faiss 7 | - mode: train 8 | # add _self_ here so that the following arguments can be rewritten 9 | - _self_ 10 | 11 | model: 12 | model_type: distillvq 13 | 14 | return_embedding: true 15 | embedding_src: RetroMAE 16 | 17 | # dynamically update ivf assignments 18 | train_ivf_assign: false 19 | # dynamically update pq assignments 20 | train_pq_assign: false 21 | # train query encoder together with the index 22 | train_encoder: false 23 | # freeze pq centroids, only update IVF centroids 24 | freeze_pq: false 25 | 26 | index: 27 | index_type: OPQ96,PQ96x8 28 | 29 | train: 30 | distill_src: RetroMAE 31 | enable_distill: bi 32 | 33 | epoch: 50 34 | batch_size: 128 35 | nneg: 31 36 | 37 | learning_rate: 1e-5 38 | learning_rate_pq: 1e-4 39 | learning_rate_ivf: 1e-4 40 | scheduler: linear 41 | 42 | eval: 43 | eval_posting_length: true 44 | -------------------------------------------------------------------------------- /src/data/config/dpr.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: MSMARCO-passage 5 | - model: dense 6 | - index: faiss 7 | - mode: train 8 | # add _self_ here so that the following arguments can be rewritten 9 | - _self_ 10 | 11 | base: 12 | plm: bert 13 | 14 | model: 15 | model_type: dpr 16 | 17 | index: 18 | index_type: Flat 19 | 20 | train: 21 | nneg: 7 22 | learning_rate: 3e-5 23 | scheduler: linear 24 | -------------------------------------------------------------------------------- /src/data/config/dsi.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: NQ320k 5 | - model: generative 6 | - index: trie 7 | - mode: train 8 | - extra: code 9 | # add _self_ here so that the following arguments can be rewritten 10 | - _self_ 11 | 12 | base: 13 | plm: t5 14 | parallel: query 15 | 16 | model: 17 | model_type: dsi 18 | nbeam: 10 19 | 20 | train: 21 | train_set: [train, doc] 22 | # only query-pos pair 23 | neg_type: none 24 | epoch: 80 25 | bf16: true 26 | 27 | learning_rate: 1e-3 28 | scheduler: linear 29 | batch_size: 400 30 | eval_delay: 40e 31 | early_stop_patience: 0 32 | 33 | main_metric: MRR@10 34 | 35 | code: 36 | code_type: ANCE_hier 37 | code_length: 10 38 | -------------------------------------------------------------------------------- /src/data/config/dsiqg.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: NQ320k 5 | - model: generative 6 | - index: trie 7 | - mode: train 8 | - extra: code 9 | # add _self_ here so that the following arguments can be rewritten 10 | - _self_ 11 | 12 | base: 13 | plm: t5 14 | parallel: query 15 | 16 | model: 17 | model_type: dsiqg 18 | nbeam: 10 19 | 20 | train: 21 | bf16: true 22 | # only query-pos pair 23 | neg_type: none 24 | epoch: 80 25 | 26 | learning_rate: 1e-3 27 | scheduler: linear 28 | batch_size: 400 29 | eval_delay: 20e 30 | early_stop_patience: 10 31 | 32 | main_metric: MRR@10 33 | 34 | train_set: [train, doct5] 35 | 36 | code: 37 | code_type: id 38 | code_length: 8 39 | -------------------------------------------------------------------------------- /src/data/config/extra/code.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_.code 2 | 3 | code_type: none 4 | code_length: 0 5 | code_tokenizer: t5 6 | code_sep: " " 7 | code_src: none 8 | return_code: true 9 | return_query_code: false 10 | -------------------------------------------------------------------------------- /src/data/config/genre.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: NQ320k 5 | - model: generative 6 | - index: trie 7 | - mode: train 8 | - extra: code 9 | # add _self_ here so that the following arguments can be rewritten 10 | - _self_ 11 | 12 | base: 13 | plm: t5 14 | parallel: query 15 | 16 | model: 17 | model_type: genre 18 | nbeam: 10 19 | 20 | train: 21 | bf16: true 22 | # only query-pos pair 23 | neg_type: none 24 | 25 | epoch: 80 26 | eval_delay: 10e 27 | 28 | learning_rate: 1e-3 29 | scheduler: linear 30 | batch_size: 400 31 | main_metric: MRR@10 32 | 33 | 34 | code: 35 | code_type: title 36 | code_length: 26 37 | 38 | -------------------------------------------------------------------------------- /src/data/config/index/_default.yaml: -------------------------------------------------------------------------------- 1 | index_type: none 2 | 3 | # load the existing index 4 | load_index: false 5 | save_index: false 6 | -------------------------------------------------------------------------------- /src/data/config/index/bm25.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | - _self_ 4 | 5 | index_type: bm25 6 | 7 | load_collection: false 8 | load_index: false 9 | 10 | index_thread: 32 11 | language: eng 12 | 13 | # k1 and b used in bm25 14 | k1: 0.82 15 | b: 0.68 16 | pretokenize: false 17 | granularity: word 18 | 19 | -------------------------------------------------------------------------------- /src/data/config/index/faiss.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | - _self_ 4 | 5 | index_type: flat 6 | 7 | nprobe: 1 8 | by_residual: true 9 | hnswef: 1000 10 | -------------------------------------------------------------------------------- /src/data/config/index/fm.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | - _self_ 4 | 5 | index_type: fm 6 | load_index: true 7 | load_collection: true 8 | index_thread: 32 9 | -------------------------------------------------------------------------------- /src/data/config/index/impact-tok.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | - _self_ 4 | 5 | index_type: impact-tok 6 | 7 | load_collection: false 8 | quantize_bit: 8 9 | language: eng 10 | 11 | index_thread: 32 12 | -------------------------------------------------------------------------------- /src/data/config/index/impact-word.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | - _self_ 4 | 5 | index_type: impact-word 6 | 7 | load_collection: false 8 | quantize_bit: 8 9 | language: eng 10 | 11 | reduce: max 12 | 13 | index_thread: 32 14 | -------------------------------------------------------------------------------- /src/data/config/index/impact.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | - _self_ 4 | 5 | index_type: impact 6 | 7 | load_collection: false 8 | load_index: false 9 | 10 | index_thread: 32 11 | language: eng 12 | 13 | quantize_bit: 8 14 | granularity: token 15 | reduce: max 16 | -------------------------------------------------------------------------------- /src/data/config/index/invhit.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | - _self_ 4 | 5 | index_type: invhit 6 | 7 | # the fraction or number of items per posting 8 | posting_prune: 0. 9 | index_shard: 32 10 | index_thread: 10 11 | load_index: true 12 | save_index: true 13 | -------------------------------------------------------------------------------- /src/data/config/index/invvec.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | - _self_ 4 | 5 | index_type: invvec 6 | 7 | # the fraction or number of items per posting 8 | posting_prune: 0. 9 | index_shard: 32 10 | index_thread: 10 11 | load_index: true 12 | save_index: true 13 | -------------------------------------------------------------------------------- /src/data/config/index/none.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/namespace-Pt/Adon/b9914c77085fc3a48b2370312d8e8ba986ca6ca1/src/data/config/index/none.yaml -------------------------------------------------------------------------------- /src/data/config/index/trie.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | - _self_ 4 | 5 | index_type: trie 6 | load_index: true 7 | save_index: true 8 | -------------------------------------------------------------------------------- /src/data/config/index/wordset.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | - _self_ 4 | 5 | index_type: wordset 6 | index_thread: 10 7 | index_shard: 32 8 | 9 | load_index: true 10 | save_index: true 11 | 12 | # early stop when decoding? (specifically designed for wordset index) 13 | wordset_early_stop: true 14 | # at which step to enable early stop 15 | early_stop_start_len: 0 16 | -------------------------------------------------------------------------------- /src/data/config/ivf.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: MSMARCO-passage 5 | - model: sparse 6 | - index: invhit 7 | - mode: eval 8 | # add _self_ here so that the following arguments can be rewritten 9 | - _self_ 10 | 11 | model: 12 | model_type: ivf 13 | 14 | query_gate_k: 20 15 | 16 | vq_src: RetroMAE 17 | vq_index: IVF10000,PQ64x8 18 | embedding_src: RetroMAE 19 | return_embedding: true 20 | 21 | load_ckpt: none 22 | 23 | eval: 24 | hits: 0 25 | eval_posting_length: true 26 | 27 | verifier_type: pq 28 | verifier_src: DistillVQ_d-RetroMAE 29 | verifier_index: OPQ96,PQ96x8 30 | -------------------------------------------------------------------------------- /src/data/config/mode/_eval.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_.eval 2 | 3 | # evaluation metrics, seperated by colon 4 | eval_metric: [mrr, recall] 5 | # the cutoff for each evaluation metric 6 | eval_metric_cutoff: [1,5,10,100,1000] 7 | # the cutoff for retrieval result 8 | hits: 1000 9 | # evaluate flops? 10 | eval_flops: false 11 | # evaluate posting length in inverted indexes? 12 | eval_posting_length: false 13 | 14 | # the post verifier 15 | verifier_type: none 16 | # the source of verifier 17 | verifier_src: none 18 | # the (pq) index used 19 | verifier_index: none 20 | # the final hits 21 | verifier_hits: 1000 22 | -------------------------------------------------------------------------------- /src/data/config/mode/cluster.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | mode: cluster 3 | # the dense embedding can be used for clusterring 4 | cluster_type: hier-l2 5 | # the number of clusters 6 | ncluster: 10 7 | # the number of leaf node in hierarchical clusterring 8 | nleaf: 100 9 | -------------------------------------------------------------------------------- /src/data/config/mode/code.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | mode: code -------------------------------------------------------------------------------- /src/data/config/mode/deploy.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | mode: deploy 3 | -------------------------------------------------------------------------------- /src/data/config/mode/encode-query.yaml: -------------------------------------------------------------------------------- 1 | mode: encode-query -------------------------------------------------------------------------------- /src/data/config/mode/encode-text.yaml: -------------------------------------------------------------------------------- 1 | mode: encode-text -------------------------------------------------------------------------------- /src/data/config/mode/encode.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_.infer 2 | 3 | mode: encode 4 | do_text: true 5 | do_query: true -------------------------------------------------------------------------------- /src/data/config/mode/eval.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_.eval 2 | defaults: 3 | - _eval 4 | 5 | mode: eval 6 | -------------------------------------------------------------------------------- /src/data/config/mode/index.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | defaults: 3 | - _eval 4 | 5 | mode: index 6 | -------------------------------------------------------------------------------- /src/data/config/mode/migrate.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | mode: migrate -------------------------------------------------------------------------------- /src/data/config/mode/script.yaml: -------------------------------------------------------------------------------- 1 | mode: script -------------------------------------------------------------------------------- /src/data/config/mode/train.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_.train 2 | defaults: 3 | - _eval # load the configs 4 | 5 | mode: train 6 | 7 | # default to use negative 8 | loader_train: neg 9 | # query set for training 10 | train_set: [train] 11 | 12 | epoch: 20 13 | # the total batch size 14 | batch_size: 128 15 | # mixed precision 16 | fp16: false 17 | bf16: false 18 | # gradient accumulation 19 | grad_accum_step: 1 20 | # Stop training when the evaluation results is inferior to the best one for ? times. 21 | early_stop_patience: 5 22 | # clip grad 23 | max_grad_norm: 0 24 | # maximum steps for training 25 | max_step: 0 26 | # wandb 27 | report_to: none 28 | # deepspeed configuration file path 29 | deepspeed: null 30 | 31 | learning_rate: 3e-6 32 | adam_beta1: 0.9 33 | adam_beta2: 0.999 34 | adam_epsilon: 1e-8 35 | weight_decay: 0.01 36 | scheduler: constant 37 | warmup_ratio: 0.1 38 | warmup_step: 0 39 | 40 | main_metric: Recall@10 41 | # interval of testing the model performance 42 | eval_step: 1e 43 | # donot test the model performance before eval_delay steps 44 | eval_delay: 0 45 | # if true, save the model after validation 46 | # otherwise, only store the ever-best performance model 47 | save_at_eval: false 48 | 49 | 50 | # how many hard negatives to use? 51 | nneg: 1 52 | # what kind of hard negatives to use? 53 | neg_type: BM25 54 | # use inbatch negative? 55 | enable_inbatch_negative: true 56 | # gather all the embeddings across processes in distributed training? 57 | enable_all_gather: true 58 | # distillation 59 | enable_distill: false 60 | # distill from which model? 61 | distill_src: none 62 | -------------------------------------------------------------------------------- /src/data/config/model/_default.yaml: -------------------------------------------------------------------------------- 1 | # model class 2 | model_type: null 3 | 4 | # the checkpoint to load 5 | load_ckpt: null 6 | # the checkpoint path to save 7 | save_ckpt: best 8 | 9 | # load the encoded cache 10 | load_encode: false 11 | # save the encoded result 12 | save_encode: false 13 | load_text_encode: false 14 | load_query_encode: false 15 | 16 | # load the existing retrieval result 17 | load_result: false 18 | # save the model after main function 19 | save_model: false 20 | # save the retrieval result together with the score of each retrieved document 21 | save_score: false 22 | # file name for the retrieval results 23 | save_res: retrieval_result 24 | 25 | -------------------------------------------------------------------------------- /src/data/config/model/dense.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | 4 | # metric used for dense retrieval 5 | dense_metric: ip 6 | # separate query and document encoder? 7 | untie_encoder: false 8 | -------------------------------------------------------------------------------- /src/data/config/model/generative.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | 4 | # the beams when decoding 5 | nbeam: 10 6 | # how to measure relevance, generative probability or eos hidden states 7 | rank_type: prob 8 | # stop if the threshold has been reduced to 9 | beam_trsd: 0 10 | # how many steps to examine threshold 11 | trsd_start_len: 0 12 | # sample instead of topk? 13 | decode_do_sample: false 14 | decode_do_greedy: false 15 | decode_renorm_logit: false 16 | 17 | sample_topk: null 18 | sample_topp: null 19 | sample_typicalp: null 20 | sample_tau: null 21 | -------------------------------------------------------------------------------- /src/data/config/model/ranker.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | 4 | # concate query and text as sentence pair 5 | return_pair: true 6 | -------------------------------------------------------------------------------- /src/data/config/model/sparse.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | - _default 3 | 4 | # the number of tokens to keep in text 5 | text_gate_k: 0 6 | # the number of tokens to keep in query 7 | query_gate_k: 0 8 | 9 | # return attention mask for the eos/sep token 10 | return_special_mask: false 11 | # return the attention mask for the first occurance of a token in a piece of text 12 | return_first_mask: false 13 | # separate query and document encoder? 14 | untie_encoder: false 15 | -------------------------------------------------------------------------------- /src/data/config/rankt5.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: MSMARCO-passage 5 | - model: ranker 6 | - mode: train 7 | # add _self_ here so that the following arguments can be rewritten 8 | - _self_ 9 | 10 | base: 11 | plm: t5 12 | 13 | model: 14 | model_type: rankt5 15 | ranking_token: 32089 # 16 | 17 | query_prefix: "Query:" 18 | text_prefix: "Text:" 19 | 20 | train: 21 | batch_size: 32 22 | learning_rate: 1e-4 23 | nneg: 7 24 | -------------------------------------------------------------------------------- /src/data/config/retromae.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: MSMARCO-passage 5 | - model: dense 6 | - index: faiss 7 | - mode: eval 8 | # add _self_ here so that the following arguments can be rewritten 9 | - _self_ 10 | 11 | base: 12 | plm: retromae_distill 13 | text_col: [1, 2] 14 | text_length: 140 15 | 16 | model: 17 | model_type: dpr 18 | 19 | index: 20 | index_type: Flat 21 | 22 | train: 23 | nneg: 15 24 | learning_rate: 2e-5 25 | scheduler: linear 26 | 27 | eval: 28 | eval_posting_length: true -------------------------------------------------------------------------------- /src/data/config/script/_default.yaml: -------------------------------------------------------------------------------- 1 | defaults: 2 | # essential to relocate the package 3 | - /_default 4 | 5 | mode: script -------------------------------------------------------------------------------- /src/data/config/script/doct5.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | # very essential to put the package directive so that the following config parameters are situated at the root layer 3 | 4 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 5 | defaults: 6 | - _default 7 | # add group package so the default list can be overriden from cli by name 8 | - /base@_group_: NQ 9 | - _self_ 10 | 11 | base: 12 | plm: doct5 13 | eval_batch_size: 50 14 | 15 | # sometimes we want to tokenize the generated queries with another plm and save the results 16 | dest_plm: t5 17 | # how many queries to generate for each document 18 | query_per_doc: 10 19 | # load previously stored memmap file? 20 | load_encode: false 21 | # how many threads to use? 22 | tokenize_thread: 32 23 | -------------------------------------------------------------------------------- /src/data/config/script/download.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | # very essential to put the package directive so that the following config parameters are situated at the root layer 3 | 4 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 5 | defaults: 6 | - _default 7 | # add group package so the default list can be overriden from cli by name 8 | - /base@_group_: MSMARCO-passage 9 | -------------------------------------------------------------------------------- /src/data/config/script/eval.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | # very essential to put the package directive so that the following config parameters are situated at the root layer 3 | 4 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 5 | defaults: 6 | - _default 7 | # add group package so the default list can be overriden from cli by name 8 | - /base@_group_: MSMARCO-passage 9 | - /mode@_here_: 10 | - _eval 11 | - _self_ 12 | 13 | src: ??? 14 | -------------------------------------------------------------------------------- /src/data/config/script/negative.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | # very essential to put the package directive so that the following config parameters are situated at the root layer 3 | 4 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 5 | defaults: 6 | - _default 7 | # add group package so the default list can be overriden from cli by name 8 | - /base@_group_: MSMARCO-passage 9 | - _self_ 10 | 11 | query_set: [train] 12 | neg_type: BM25 13 | save_name: default 14 | hits: 200 -------------------------------------------------------------------------------- /src/data/config/script/preprocess.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | # very essential to put the package directive so that the following config parameters are situated at the root layer 3 | 4 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 5 | defaults: 6 | - _default 7 | # add group package so the default list can be overriden from cli by name 8 | - /base@_group_: MSMARCO-passage 9 | - _self_ 10 | 11 | do_text: true 12 | do_query: true 13 | query_set: [train, dev] 14 | pretokenize: true 15 | tokenize_thread: 32 16 | -------------------------------------------------------------------------------- /src/data/config/script/ttest.yaml: -------------------------------------------------------------------------------- 1 | # @package _global_ 2 | # very essential to put the package directive so that the following config parameters are situated at the root layer 3 | 4 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 5 | defaults: 6 | - _default 7 | # add group package so the default list can be overriden from cli by name 8 | - /base@_group_: MSMARCO-passage 9 | - /mode@_group_: script 10 | - _self_ 11 | 12 | x_model: ??? 13 | y_model: ??? 14 | ttest_metric: [MRR@10, Recall@10] 15 | -------------------------------------------------------------------------------- /src/data/config/seal.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: NQ320k 5 | - model: generative 6 | - index: fm 7 | - mode: eval 8 | - extra: code 9 | # add _self_ here so that the following arguments can be rewritten 10 | - _self_ 11 | 12 | base: 13 | plm: bart 14 | parallel: query 15 | eval_batch_size: 1 16 | 17 | model: 18 | model_type: seal 19 | nbeam: 10 20 | 21 | train: 22 | # only query-pos pair 23 | neg_type: none 24 | epoch: 80 25 | max_grad_norm: 0.1 26 | 27 | learning_rate: 1e-3 28 | scheduler: linear 29 | batch_size: 400 30 | eval_delay: 40e 31 | 32 | main_metric: MRR@10 33 | 34 | code: 35 | code_type: seal 36 | code_length: 10 37 | return_code: false 38 | # return_query_code: true 39 | -------------------------------------------------------------------------------- /src/data/config/sequer.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: NQ 5 | - model: ranker 6 | - index: trie 7 | - mode: train 8 | - extra: code 9 | # add _self_ here so that the following arguments can be rewritten 10 | - _self_ 11 | 12 | base: 13 | plm: t5 14 | parallel: query 15 | 16 | model: 17 | model_type: sequer 18 | train_scheme: contra 19 | # how to rank when evaluating 20 | rank_type: eos 21 | # beam size 22 | nbeam: 10 23 | 24 | index: 25 | # threshold for relaxed beam search 26 | beam_trsd: 0 27 | trsd_start_len: 3 28 | 29 | train: 30 | epoch: 50 31 | batch_size: 64 32 | learning_rate: 3e-5 33 | neg_type: BM25 34 | nneg: 23 35 | main_metric: MRR@10 36 | return_prefix_mask: true 37 | bf16: true 38 | 39 | eval: 40 | eval_mode: rerank 41 | cand_type: BM25 42 | ncand: 100 43 | 44 | code: 45 | code_type: words_comma_plus_stem 46 | code_length: 26 47 | code_sep: "," 48 | -------------------------------------------------------------------------------- /src/data/config/sparta.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: MSMARCO-passage 5 | - model: sparse 6 | - index: invvec 7 | - mode: train 8 | # add _self_ here so that the following arguments can be rewritten 9 | - _self_ 10 | 11 | model: 12 | model_type: sparta 13 | text_decode_k: 200 14 | 15 | index: 16 | return_first_mask: false 17 | load_index: false 18 | save_index: false 19 | 20 | train: 21 | nneg: 7 22 | learning_rate: 3e-5 23 | 24 | eval: 25 | eval_posting_length: true 26 | eval_flops: true 27 | -------------------------------------------------------------------------------- /src/data/config/spladev2.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: MSMARCO-passage 5 | - model: sparse 6 | - index: impact 7 | - mode: train 8 | # add _self_ here so that the following arguments can be rewritten 9 | - _self_ 10 | 11 | model: 12 | model_type: spladev2 13 | save_encode: true 14 | 15 | text_decode_k: 128 16 | query_decode_k: 128 17 | text_lambda: 1e-2 18 | query_lambda: 1e-2 19 | 20 | 21 | index: 22 | return_first_mask: false 23 | load_index: false 24 | save_index: false 25 | 26 | train: 27 | 28 | learning_rate: 2e-5 29 | scheduler: linear 30 | batch_size: 64 31 | nneg: 7 32 | lambda_warmup_step: 0 33 | eval_step: 5e 34 | 35 | eval: 36 | eval_posting_length: true 37 | eval_flops: true 38 | -------------------------------------------------------------------------------- /src/data/config/tokivf.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: MSMARCO-passage 5 | - model: sparse 6 | - index: invhit 7 | - mode: train 8 | # add _self_ here so that the following arguments can be rewritten 9 | - _self_ 10 | 11 | base: 12 | plm: retromae_msmarco 13 | text_col: [1, 2] 14 | 15 | model: 16 | model_type: tokivf 17 | return_special_mask: true 18 | # how many token postings to scan 19 | text_gate_k: 3 20 | 21 | index: 22 | # what percentile of the inverted lists are kept 23 | posting_prune: 0.996 24 | 25 | train: 26 | nneg: 7 27 | enable_distill: bi 28 | distill_src: RetroMAE 29 | 30 | eval: 31 | eval_posting_length: true 32 | verifier_type: pq 33 | verifier_src: DistillVQ_d-RetroMAE 34 | verifier_index: OPQ96,PQ96x8 35 | -------------------------------------------------------------------------------- /src/data/config/topivf.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: MSMARCO-passage 5 | - model: sparse 6 | - index: invhit 7 | - mode: train 8 | # add _self_ here so that the following arguments can be rewritten 9 | - _self_ 10 | 11 | base: 12 | eval_batch_size: 500 13 | 14 | model: 15 | model_type: topivf 16 | 17 | query_gate_k: 20 18 | 19 | vq_src: RetroMAE 20 | vq_index: IVF10000,PQ64x8 21 | 22 | embedding_src: RetroMAE 23 | return_embedding: true 24 | 25 | load_ckpt: none 26 | 27 | enable_commit_loss: true 28 | 29 | train: 30 | epoch: 50 31 | 32 | learning_rate: 1e-4 33 | scheduler: linear 34 | 35 | eval: 36 | hits: 0 37 | eval_posting_length: true 38 | verifier_type: pq 39 | verifier_src: DistillVQ_d-RetroMAE 40 | verifier_index: OPQ96,PQ96x8 41 | -------------------------------------------------------------------------------- /src/data/config/unicoil.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: MSMARCO-passage 5 | - model: sparse 6 | - index: invvec 7 | - mode: train 8 | # add _self_ here so that the following arguments can be rewritten 9 | - _self_ 10 | 11 | base: 12 | plm: bert 13 | 14 | model: 15 | model_type: unicoil 16 | return_special_mask: true 17 | return_first_mask: true 18 | 19 | train: 20 | nneg: 7 21 | 22 | eval: 23 | eval_posting_length: true 24 | eval_flops: true 25 | -------------------------------------------------------------------------------- /src/data/config/uniretriever.yaml: -------------------------------------------------------------------------------- 1 | # load the default lists, whose parameters can be changed by referencing its namespace in the following 2 | defaults: 3 | - _default 4 | - base: MSMARCO-passage 5 | - model: _default 6 | - mode: eval 7 | # add _self_ here so that the following arguments can be rewritten 8 | - _self_ 9 | 10 | mode: 11 | verifier_type: flat 12 | verifier_src: AR2 13 | # verifier_index: OPQ96,PQ96x8 14 | 15 | eval: 16 | model_type: uniretriever 17 | load_index: true 18 | 19 | x_model: BM25 20 | x_index_type: bm25 21 | x_hits: 1000 22 | 23 | y_model: AR2 24 | y_index_type: IVF10000,PQ64x8 25 | y_hits: 1000 26 | 27 | x_load_encode: true 28 | y_load_encode: true 29 | x_load_index: true 30 | y_load_index: true 31 | 32 | x_load_ckpt: best 33 | y_load_ckpt: best 34 | 35 | x_verifier_type: none 36 | y_verifier_type: none 37 | 38 | x_eval_posting_length: true 39 | y_eval_posting_length: true 40 | -------------------------------------------------------------------------------- /src/models/AR2.py: -------------------------------------------------------------------------------- 1 | from .DPR import DPR 2 | from utils.util import load_pickle 3 | 4 | 5 | class AR2(DPR): 6 | def __init__(self, config): 7 | super().__init__(config) 8 | 9 | 10 | def forward(self, x): 11 | raise NotImplementedError("AR2 training not implemented!") 12 | -------------------------------------------------------------------------------- /src/models/AutoModel.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from .BaseModel import BaseModel 4 | 5 | from .AR2 import AR2 6 | from .AutoTSG import AutoTSG 7 | from .BM25 import BM25 8 | from .COIL import COIL 9 | from .ColBERT import ColBERT 10 | from .DSI import DSI, GENRE, DSIQG 11 | from .DeepImpact import DeepImpact 12 | from .DPR import DPR 13 | from .IVF import IVF, TopIVF, TokIVF 14 | from .SPARTA import SPARTA 15 | from .SPLADE import SPLADEv2 16 | from .SEAL import SEAL 17 | from .RankT5 import RankT5 18 | from .CrossEnc import CrossEncoder 19 | from .UniCOIL import UniCOIL 20 | from .UniRetriever import UniRetriever 21 | from .VQ import DistillVQ 22 | 23 | MODEL_MAP = { 24 | "ar2": AR2, 25 | "bm25": BM25, 26 | "autotsg": AutoTSG, 27 | "coil": COIL, 28 | "colbert": ColBERT, 29 | "crossenc": CrossEncoder, 30 | "deepimpact": DeepImpact, 31 | "distillvq": DistillVQ, 32 | "dpr": DPR, 33 | "dsi": DSI, 34 | "dsiqg": DSIQG, 35 | "genre": GENRE, 36 | "ivf": IVF, 37 | "rankt5": RankT5, 38 | "sparta": SPARTA, 39 | "spladev2": SPLADEv2, 40 | "seal": SEAL, 41 | "topivf": TopIVF, 42 | "tokivf": TokIVF, 43 | "unicoil": UniCOIL, 44 | "uniretriever": UniRetriever 45 | } 46 | 47 | 48 | class AutoModel(BaseModel): 49 | @classmethod 50 | def from_pretrained(cls, ckpt_path, **kwargs): 51 | state_dict = torch.load(ckpt_path, map_location="cpu") 52 | 53 | config = state_dict["config"] 54 | model_name_current = os.path.abspath(ckpt_path).split(os.sep)[-2] 55 | model_name_ckpt = config.name 56 | model_type = model_name_current.split("_")[0].lower() 57 | 58 | # override model name 59 | config.update(**kwargs, name=model_name_current) 60 | # re-initialize the config so the distributed information is properly set 61 | config.__post_init__() 62 | 63 | try: 64 | model = MODEL_MAP[model_type](config).to(config.device) 65 | except KeyError: 66 | raise NotImplementedError(f"Model {model_type} not implemented!") 67 | if model_name_ckpt != model_name_current: 68 | model.logger.warning(f"model name in the checkpoint is {model_name_ckpt}, while it's {model_name_current} now!") 69 | 70 | model.logger.info(f"loading model from {ckpt_path} with checkpoint config...") 71 | model.load_state_dict(state_dict["model"]) 72 | model.metrics = state_dict["metrics"] 73 | 74 | model.eval() 75 | return model 76 | -------------------------------------------------------------------------------- /src/models/ColBERT.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from transformers import AutoModel 4 | from .BaseModel import BaseModel 5 | 6 | 7 | 8 | class ColBERT(BaseModel): 9 | def __init__(self, config): 10 | super().__init__(config) 11 | 12 | self._set_encoder() 13 | 14 | self.tokenProject = nn.Linear(self.textEncoder.config.hidden_size, config.token_dim) 15 | 16 | 17 | def _encode_text(self, **kwargs): 18 | for k, v in kwargs.items(): 19 | # B, 1+N, L -> B * (1+N), L 20 | if v.dim() == 3: 21 | kwargs[k] = v.view(-1, v.shape[-1]) 22 | 23 | token_all_embedding = self.textEncoder(**kwargs)[0] 24 | token_embedding = self.tokenProject(token_all_embedding) 25 | return token_embedding 26 | 27 | 28 | def _encode_query(self, **kwargs): 29 | token_all_embedding = self.queryEncoder(**kwargs)[0] 30 | token_embedding = self.tokenProject(token_all_embedding) 31 | return token_embedding 32 | 33 | 34 | def forward(self, x): 35 | x = self._move_to_device(x) 36 | 37 | query_token_embedding = self._encode_query(**x["query"]) # B, LQ, D 38 | text_token_embedding = self._encode_text(**x["text"]) # B*(1+N), LS, D 39 | 40 | if self.config.is_distributed and self.config.enable_all_gather: 41 | query_token_embedding = self._gather_tensors(query_token_embedding) 42 | text_token_embedding = self._gather_tensors(text_token_embedding) 43 | 44 | query_text_score = torch.einsum('qin,tjn->qitj', query_token_embedding, text_token_embedding) 45 | query_text_score = query_text_score.max(dim=-1)[0] # B, LQ, B*(1+N) 46 | score = query_text_score.sum(dim=1) # B, B*(1+N) 47 | 48 | B = score.shape[0] 49 | if self.config.enable_inbatch_negative: 50 | label = torch.arange(B, device=self.config.device) 51 | label = label * (text_token_embedding.shape[0] // query_token_embedding.shape[0]) 52 | else: 53 | label = torch.zeros(B, dtype=torch.long, device=self.config.device) 54 | score = score.view(B, B, -1)[range(B), range(B)] # B, 1+N 55 | 56 | loss = self._compute_loss(score, label, self._compute_teacher_score(x)) 57 | return loss 58 | 59 | 60 | def rerank_step(self, x): 61 | """ 62 | given a query and a sequence, output the sequence's score 63 | """ 64 | x = self._move_to_device(x) 65 | query_token_embedding = self._encode_query(**x["query"]) # B, LQ, D 66 | text_token_embedding = self._encode_text(**x["text"]) # B, LS, D 67 | 68 | query_text_score = query_token_embedding.matmul(text_token_embedding.transpose(-1,-2)) 69 | score = query_text_score.max(dim=-1)[0].sum(dim=-1) # B 70 | return score 71 | 72 | 73 | def retrieve(self, manager, loaders): 74 | self.logger.error("currently we do not support retrieval with ColBERT, instead we evaluate it by reranking task") 75 | raise 76 | 77 | -------------------------------------------------------------------------------- /src/models/CrossEnc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from transformers import AutoModelForSequenceClassification 4 | from .BaseModel import BaseModel 5 | 6 | 7 | 8 | class CrossEncoder(BaseModel): 9 | def __init__(self, config): 10 | super().__init__(config) 11 | 12 | self.plm = AutoModelForSequenceClassification.from_pretrained(config.plm_dir, num_labels=1) 13 | # self.plm.pooler = None 14 | if config.code_size > 0: 15 | self.plm.resize_token_embeddings(config.vocab_size + config.code_size) 16 | 17 | 18 | def _compute_score(self, **kwargs): 19 | """ concate the query and the input text; 20 | Args: 21 | query_token_id: B, LQ 22 | text_token_id: B, LS 23 | Returns: 24 | tensor of [B] 25 | """ 26 | for k, v in kwargs.items(): 27 | # B, 1+N, L -> B * (1+N), L 28 | if v.dim() == 3: 29 | kwargs[k] = v.view(-1, v.shape[-1]) 30 | 31 | score = self.plm(**kwargs).logits.squeeze(-1) 32 | return score 33 | 34 | 35 | def rerank_step(self, x): 36 | x = self._move_to_device(x) 37 | if "text_code" in x: 38 | # concate query and text code as inputs 39 | query_token_id = x["query"]["input_ids"] # B, L 40 | query_attn_mask = x["query"]["attention_mask"] 41 | 42 | text_code = x["text_code"] # B, 1+N, LC or B, LC 43 | if text_code.dim() == 3: 44 | text_code = text_code.flatten(0, 1) # B*(1+N) or B, LC 45 | 46 | M, L = text_code.shape[0] // query_token_id.shape[0], query_token_id.shape[-1] 47 | 48 | pair_token_id = torch.zeros((text_code.shape[0], text_code.shape[-1] + query_token_id.shape[-1] - 1), device=text_code.device) 49 | pair_token_id[:, :L] = query_token_id.repeat_interleave(M, 0) 50 | # remove the leading 0 51 | pair_token_id[:, L:] = text_code[:, 1:] 52 | 53 | pair_attn_mask = torch.zeros_like(pair_token_id) 54 | pair_attn_mask[:, :L] = query_attn_mask.repeat_interleave(M, 0) 55 | pair_attn_mask[:, L:] = (text_code != -1).float() 56 | 57 | pair = { 58 | "input_ids": pair_token_id, 59 | "attention_mask": pair_attn_mask 60 | } 61 | if "token_type_ids" in x["query"]: 62 | pair_type_id = torch.zeros_like(pair_attn_mask) 63 | pair_type_id[:, L:] = 1 64 | pair["token_type_ids"] = pair_type_id 65 | else: 66 | pair = x["pair"] 67 | 68 | score = self._compute_score(**pair) # B or B*(1+N) 69 | return score 70 | 71 | 72 | def forward(self, x): 73 | pair = x["pair"] 74 | score = self.rerank_step(x) # B*(1+N) 75 | 76 | if pair["input_ids"].dim() == 3: 77 | # use cross entropy loss 78 | score = score.view(x["pair"]["input_ids"].shape[0], -1) 79 | label = torch.zeros(score.shape[0], dtype=torch.long, device=self.config.device) 80 | loss = F.cross_entropy(score, label) 81 | 82 | elif pair["input_ids"].dim() == 2: 83 | label = x["label"] 84 | loss = F.binary_cross_entropy(torch.sigmoid(score), label) 85 | 86 | return loss 87 | 88 | 89 | -------------------------------------------------------------------------------- /src/models/DPR.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | import torch.nn.functional as F 5 | from transformers import AutoModel, AutoTokenizer 6 | from .BaseModel import BaseDenseModel 7 | 8 | 9 | 10 | class DPR(BaseDenseModel): 11 | """ 12 | The basic dense retriever. `Paper `_. 13 | """ 14 | def __init__(self, config): 15 | super().__init__(config) 16 | 17 | self._set_encoder() 18 | self._output_dim = self.textEncoder.config.hidden_size 19 | 20 | 21 | def _encode_text(self, **kwargs): 22 | """ 23 | encode tokens with bert 24 | """ 25 | for k, v in kwargs.items(): 26 | # B, 1+N, L -> B * (1+N), L 27 | if v.dim() == 3: 28 | kwargs[k] = v.view(-1, v.shape[-1]) 29 | 30 | embedding = self.textEncoder(**kwargs)[0][:, 0] 31 | if self.config.dense_metric == "cos": 32 | embedding = F.normalize(embedding, dim=-1) 33 | return embedding 34 | 35 | 36 | def _encode_query(self, **kwargs): 37 | embedding = self.queryEncoder(**kwargs)[0][:, 0] 38 | if self.config.dense_metric == "cos": 39 | embedding = F.normalize(embedding, dim=-1) 40 | return embedding 41 | 42 | 43 | def forward(self, x): 44 | x = self._move_to_device(x) 45 | query_embedding = self._encode_query(**x["query"]) # B, D 46 | text_embedding = self._encode_text(**x["text"]) # *, D 47 | 48 | if self.config.is_distributed and self.config.enable_all_gather: 49 | query_embedding = self._gather_tensors(query_embedding) 50 | text_embedding = self._gather_tensors(text_embedding) 51 | 52 | if self.config.dense_metric == "ip": 53 | score = query_embedding.matmul(text_embedding.transpose(-1,-2)) # B, B*(1+N) 54 | elif self.config.dense_metric == "cos": 55 | score = self._cos_sim(query_embedding, text_embedding) 56 | elif self.config.dense_metric == "l2": 57 | score = self._l2_sim(query_embedding, text_embedding) 58 | else: 59 | raise NotImplementedError 60 | 61 | B = query_embedding.size(0) 62 | # in batch negative 63 | if self.config.enable_inbatch_negative: 64 | label = torch.arange(B, device=self.config.device) 65 | label = label * (text_embedding.shape[0] // query_embedding.shape[0]) 66 | else: 67 | label = torch.zeros(B, dtype=torch.long, device=self.config.device) 68 | score = score.view(B, B, -1)[range(B), range(B)] # B, 1+N 69 | 70 | loss = self._compute_loss(score, label, self._compute_teacher_score(x)) 71 | return loss 72 | 73 | 74 | def rerank_step(self, x): 75 | """ 76 | given a query and a sequence, output the sequence's score 77 | """ 78 | query_embedding = self._encode_query(**x["query"]) # B, D 79 | text_embedding = self._encode_text(**x["text"]) # B, D 80 | B = query_embedding.size(0) 81 | score = query_embedding.matmul(text_embedding.transpose(-1, -2))[range(B), range(B)] 82 | return score 83 | 84 | 85 | def deploy(self): 86 | deploy_dir = os.path.join(self.config.cache_root, "deploy", self.name) 87 | os.makedirs(deploy_dir, exist_ok=True) 88 | 89 | AutoTokenizer.from_pretrained(self.config.plm_dir).save_pretrained(deploy_dir) 90 | if self.config.untie_encoder: 91 | self.queryEncoder.save_pretrained(os.path.join(deploy_dir, "query")) 92 | self.textEncoder.save_pretrained(os.path.join(deploy_dir, "text")) 93 | else: 94 | self.logger.info(f"saving plm model and tokenizer at {deploy_dir}...") 95 | self.plm.save_pretrained(deploy_dir) -------------------------------------------------------------------------------- /src/models/DeepImpact.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from .UniCOIL import UniCOIL 3 | from .BaseModel import BaseSparseModel 4 | 5 | 6 | 7 | class DeepImpact(UniCOIL): 8 | def __init__(self, config): 9 | """ 10 | `DeepImpact model `_ 11 | """ 12 | super().__init__(config) 13 | 14 | 15 | def encode_query_step(self, x): 16 | """ 17 | not contextualized 18 | """ 19 | return BaseSparseModel.encode_query_step(self, x) 20 | -------------------------------------------------------------------------------- /src/models/KeyRank.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .BaseModel import BaseModel 3 | 4 | class KeyRank(BaseModel): 5 | """ 6 | Select keywords from the document for ranking, using REINFORCE policy gradient. 7 | """ 8 | def __init__(self, config): 9 | super().__init__(config) 10 | 11 | 12 | def forward(self, x): 13 | pass 14 | 15 | 16 | def rerank_step(self, x): 17 | pass 18 | -------------------------------------------------------------------------------- /src/models/RankT5.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from transformers import T5ForConditionalGeneration 4 | from .BaseModel import BaseModel 5 | 6 | 7 | 8 | class RankT5(BaseModel): 9 | def __init__(self, config): 10 | super().__init__(config) 11 | self.plm = T5ForConditionalGeneration.from_pretrained(config.plm_dir) 12 | 13 | 14 | def _compute_score(self, **kwargs): 15 | """ concate the query and the input text; 16 | Args: 17 | query_token_id: B, LQ 18 | text_token_id: B, LS 19 | Returns: 20 | tensor of [B] 21 | """ 22 | for k, v in kwargs.items(): 23 | # B, 1+N, L -> B * (1+N), L 24 | if v.dim() == 3: 25 | kwargs[k] = v.view(-1, v.shape[-1]) 26 | 27 | batch_size = kwargs["input_ids"].shape[0] 28 | score = self.plm(**kwargs, decoder_input_ids=torch.zeros((batch_size, 1), dtype=torch.long, device=self.config.device)).logits[:, 0, self.config.ranking_token] 29 | return score 30 | 31 | 32 | def rerank_step(self, x): 33 | x = self._move_to_device(x) 34 | pair = x["pair"] 35 | score = self._compute_score(**pair) 36 | return score 37 | 38 | 39 | def forward(self, x): 40 | pair = x["pair"] 41 | score = self.rerank_step(x) # B*(1+N) 42 | 43 | if pair["input_ids"].dim() == 3: 44 | # use cross entropy loss 45 | score = score.view(x["pair"]["input_ids"].shape[0], -1) 46 | label = torch.zeros(score.shape[0], dtype=torch.long, device=self.config.device) 47 | loss = F.cross_entropy(score, label) 48 | 49 | elif pair["input_ids"].dim() == 2: 50 | label = x["label"] 51 | loss = F.binary_cross_entropy(torch.sigmoid(score), label) 52 | 53 | return loss 54 | 55 | 56 | -------------------------------------------------------------------------------- /src/models/SPARTA.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | import torch.nn.functional as F 5 | import torch.distributed as dist 6 | from tqdm import tqdm 7 | from transformers import AutoModel, AutoTokenizer 8 | from .BaseModel import BaseSparseModel 9 | from utils.util import BaseOutput 10 | 11 | 12 | class SPARTA(BaseSparseModel): 13 | def __init__(self, config): 14 | super().__init__(config) 15 | 16 | self.plm = AutoModel.from_pretrained(config.plm_dir) 17 | self.plm.pooler = None 18 | 19 | self._skip_special_tokens = True 20 | self._text_length = self.config.text_decode_k 21 | 22 | 23 | def _encode_text(self, **kwargs): 24 | for k, v in kwargs.items(): 25 | # B, 1+N, L -> B * (1+N), L 26 | if v.dim() == 3: 27 | kwargs[k] = v.view(-1, v.shape[-1]) 28 | 29 | token_embedding = self.plm(**kwargs)[0] # B, L, D 30 | return token_embedding 31 | 32 | 33 | def _encode_query(self, token_id): 34 | return self.plm.embeddings.word_embeddings(token_id) 35 | 36 | 37 | def forward(self, x): 38 | x = self._move_to_device(x) 39 | 40 | query_token_embedding = self._encode_query(x["query"]["input_ids"]) # B, L, D 41 | text_token_embedding = self._encode_text(**x["text"]) # B*(1+N), L, D 42 | 43 | if self.config.is_distributed and self.config.enable_all_gather: 44 | query_token_embedding = self._gather_tensors(query_token_embedding) 45 | text_token_embedding = self._gather_tensors(text_token_embedding) 46 | 47 | query_text_score = torch.einsum('qin,tjn->qitj', query_token_embedding, text_token_embedding) 48 | query_text_score = query_text_score.max(dim=-1)[0] # B, LQ, B*(1+N) 49 | query_text_score = torch.log(torch.relu(query_text_score) + 1) 50 | score = query_text_score.sum(dim=1) # B, B*(1+N) 51 | 52 | B = score.shape[0] 53 | if self.config.enable_inbatch_negative: 54 | label = torch.arange(B, device=self.config.device) 55 | label = label * (text_token_embedding.shape[0] // query_token_embedding.shape[0]) 56 | else: 57 | label = torch.zeros(B, dtype=torch.long, device=self.config.device) 58 | score = score.view(B, B, -1)[range(B), range(B)] # B, 1+N 59 | 60 | loss = self._compute_loss(score, label, self._compute_teacher_score(x)) 61 | return loss 62 | 63 | 64 | def encode_text_step(self, x): 65 | """ 66 | Pre-compute interactions of all possible tokens with each text token, keep the most matching text token; then only index the topk decoded tokens (top k important tokens in the sense that they will contribute most to the final text score) 67 | """ 68 | text = self._move_to_device(x["text"]) 69 | text_token_embedding = self._encode_text(**text) # B, L, D 70 | vocab_embedding = self.plm.embeddings.word_embeddings.weight # V, D 71 | text_token_embedding = torch.einsum("vd,...ld->...lv", vocab_embedding, text_token_embedding) # B, L, V 72 | text_embedding = torch.log(torch.relu(text_token_embedding.max(1)[0]) + 1) # B, V 73 | 74 | text_token_id, text_token_weight = text_embedding.topk(k=self._text_length) 75 | return text_token_id.cpu().numpy(), text_token_weight.unsqueeze(-1).cpu().numpy() 76 | 77 | -------------------------------------------------------------------------------- /src/models/UniRetriever.py: -------------------------------------------------------------------------------- 1 | import os 2 | from collections import defaultdict 3 | from .BaseModel import BaseModel 4 | from utils.util import load_pickle 5 | 6 | 7 | class UniRetriever(BaseModel): 8 | def __init__(self, config): 9 | from .AutoModel import AutoModel as AM 10 | super().__init__(config) 11 | 12 | if config.x_model != "none": 13 | additional_kwargs = { 14 | "data_root": config.data_root, 15 | "plm_root": config.plm_root, 16 | "text_col": config.text_col, 17 | "device": config.get("x_device", config.device), 18 | "verifier_type": config.verifier_type, 19 | "verifier_src": config.verifier_src, 20 | "verifier_index": config.verifier_index 21 | } 22 | for k,v in config.items(): 23 | if k.startswith("x_") and k != "x_model": 24 | additional_kwargs[k[2:]] = v 25 | 26 | XModel = AM.from_pretrained(os.path.join(config.cache_root, "ckpts", config.x_model, config.x_load_ckpt), **additional_kwargs) 27 | else: 28 | XModel = None 29 | 30 | if config.y_model != "none": 31 | additional_kwargs = { 32 | "data_root": config.data_root, 33 | "plm_root": config.plm_root, 34 | "text_col": config.text_col, 35 | "device": config.get("y_device", config.device), 36 | "verifier_type": config.verifier_type, 37 | "verifier_src": config.verifier_src, 38 | "verifier_index": config.verifier_index 39 | } 40 | for k,v in config.items(): 41 | if k.startswith("y_") and k != "y_model": 42 | additional_kwargs[k[2:]] = v 43 | 44 | YModel = AM.from_pretrained(os.path.join(config.cache_root, "ckpts", config.y_model, config.y_load_ckpt), **additional_kwargs) 45 | 46 | else: 47 | YModel = None 48 | 49 | self.XModel = XModel 50 | self.YModel = YModel 51 | 52 | 53 | def retrieve(self, loaders): 54 | """ retrieve by index 55 | 56 | Args: 57 | encode_query: if true, compute query embedding before retrieving 58 | """ 59 | if self.XModel is not None: 60 | x_retrieval_result = self.XModel.retrieve(loaders) 61 | self.metrics.update({f"X {k}": v for k, v in self.XModel.metrics.items() if k in ["Posting_List_Length"]}) 62 | else: 63 | x_retrieval_result = {} 64 | 65 | if self.YModel is not None: 66 | y_retrieval_result = self.YModel.retrieve(loaders) 67 | self.metrics.update({f"Y {k}": v for k, v in self.YModel.metrics.items() if k in ["Posting_List_Length"]}) 68 | else: 69 | y_retrieval_result = {} 70 | 71 | try: 72 | posting_length = self.metrics["X Posting_List_Length"] + self.metrics["Y Posting_List_Length"] 73 | flops = round((posting_length) * 48 / len(loaders["text"].dataset), 2) 74 | self.metrics.update({"Posting_List_length": posting_length, "FLOPs": flops}) 75 | except: 76 | pass 77 | 78 | if self.config.get("save_intm_result"): 79 | self.XModel._gather_retrieval_result( 80 | x_retrieval_result, 81 | retrieval_result_path=os.path.join(self.retrieve_dir, "x_retrieval_result.pkl") 82 | ) 83 | self.YModel._gather_retrieval_result( 84 | y_retrieval_result, 85 | retrieval_result_path=os.path.join(self.retrieve_dir, "y_retrieval_result.pkl") 86 | ) 87 | 88 | loader_query = loaders["query"] 89 | retrieval_result = {} 90 | for qidx in range(loader_query.sampler.start, loader_query.sampler.end): 91 | res = dict(x_retrieval_result.get(qidx, [])) 92 | res.update(dict(y_retrieval_result.get(qidx, []))) 93 | sorted_res = sorted(res.items(), key=lambda x: x[1], reverse=True)[:self.config.hits] 94 | retrieval_result[qidx] = sorted_res 95 | 96 | return retrieval_result 97 | -------------------------------------------------------------------------------- /src/notebooks/data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import sys\n", 11 | "if sys.path[-1] != \"../\":\n", 12 | " sys.path.append(\"../\")\n", 13 | " os.chdir(\"../\")\n", 14 | "\n", 15 | "import numpy as np\n", 16 | "from collections import defaultdict\n", 17 | "from transformers import AutoModel, AutoTokenizer\n", 18 | "\n", 19 | "import torch\n", 20 | "import torch.nn.functional as F\n", 21 | "from models.AutoModel import AutoModel as AM\n", 22 | "from utils.util import *\n", 23 | "from utils.index import *\n", 24 | "from utils.data import *\n", 25 | "\n", 26 | "from hydra import initialize, compose\n", 27 | "\n", 28 | "config = Config()\n", 29 | "with initialize(version_base=None, config_path=\"../data/config/\"):\n", 30 | " overrides = (\n", 31 | " ).split(\" \")\n", 32 | " hydra_config = compose(config_name=\"_example\", overrides=overrides)\n", 33 | " config._from_hydra(hydra_config)\n", 34 | "\n", 35 | "loaders = prepare_data(config)\n", 36 | "loader_text = loaders[\"text\"]\n", 37 | "loader_query = loaders[\"query\"]\n", 38 | "# loader_rerank = loaders[\"rerank\"]\n", 39 | "loader_train = prepare_train_data(config, loader_text.dataset, return_dataloader=True)\n", 40 | "\n", 41 | "text_dataset = loader_text.dataset\n", 42 | "query_dataset = loader_query.dataset\n", 43 | "train_dataset = loader_train.dataset\n", 44 | "\n", 45 | "X1 = iter(loader_train)\n", 46 | "X2 = iter(loader_text)\n", 47 | "X3 = iter(loader_query)\n", 48 | "# X4 = iter(loader_rerank)\n", 49 | "\n", 50 | "x = next(X1)\n", 51 | "x2 = next(X2)\n", 52 | "x3 = next(X3)\n", 53 | "# x4 = next(X4)" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "t = AutoTokenizer.from_pretrained(config.plm_dir)\n", 63 | "# m = AutoModel.from_pretrained(config.plm_dir)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "t.decode(x[\"query\"][\"input_ids\"][1]), t.batch_decode(x[\"text\"][\"input_ids\"][1])\n", 73 | "\n", 74 | "# idx = 1000000\n", 75 | "# t.decode(train_dataset[idx][\"query\"][\"input_ids\"]), t.batch_decode(train_dataset[idx][\"text\"][\"input_ids\"])" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [] 84 | } 85 | ], 86 | "metadata": { 87 | "kernelspec": { 88 | "display_name": "Python 3.9.12", 89 | "language": "python", 90 | "name": "python3" 91 | }, 92 | "language_info": { 93 | "codemirror_mode": { 94 | "name": "ipython", 95 | "version": 3 96 | }, 97 | "file_extension": ".py", 98 | "mimetype": "text/x-python", 99 | "name": "python", 100 | "nbconvert_exporter": "python", 101 | "pygments_lexer": "ipython3", 102 | "version": "3.9.12" 103 | }, 104 | "orig_nbformat": 4, 105 | "vscode": { 106 | "interpreter": { 107 | "hash": "778a5a6b0df35a46498564cf16af2e5ec016022ef7dc9d5934de67fcb1f6bfb9" 108 | } 109 | } 110 | }, 111 | "nbformat": 4, 112 | "nbformat_minor": 2 113 | } 114 | -------------------------------------------------------------------------------- /src/run.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import hydra 3 | from omegaconf import OmegaConf 4 | from utils.util import Config 5 | from utils.data import prepare_data 6 | from models.AutoModel import MODEL_MAP 7 | 8 | name = None 9 | 10 | @hydra.main(version_base=None, config_path="data/config/") 11 | def get_config(hydra_config: OmegaConf): 12 | config._from_hydra(hydra_config) 13 | config.name = name 14 | 15 | 16 | def main(config:Config): 17 | """ train/dev/test the model (in distributed) 18 | 19 | Args: 20 | rank: current process id 21 | world_size: total gpus 22 | """ 23 | loaders = prepare_data(config) 24 | model = MODEL_MAP[config.model_type](config).to(config.device) 25 | 26 | if config.mode == "train": 27 | from utils.trainer import train 28 | model.load() 29 | train(model, loaders) 30 | 31 | elif config.mode == "eval": 32 | model.load() 33 | model.evaluate(loaders) 34 | 35 | elif config.mode == "encode": 36 | model.load() 37 | model.encode(loaders) 38 | 39 | elif config.mode == "cluster": 40 | model.load() 41 | model.cluster(loaders) 42 | 43 | elif config.mode == "code": 44 | model.load() 45 | model.generate_code(loaders) 46 | 47 | elif config.mode == "migrate": 48 | from utils.util import load_from_previous 49 | if config.is_main_proc: 50 | path = f"{config.cache_root}/ckpts/{model.name}/{config.load_ckpt}" 51 | load_from_previous(model, path) 52 | model.save() 53 | 54 | elif config.mode == "deploy": 55 | model.load() 56 | model.deploy() 57 | 58 | elif config.mode == "index": 59 | model.load() 60 | model.index(loaders) 61 | 62 | else: 63 | raise ValueError(f"Invalid mode {config.mode}!") 64 | 65 | if config.save_model: 66 | model.save() 67 | 68 | 69 | 70 | if __name__ == "__main__": 71 | # get the model full name 72 | name = sys.argv.pop(1) 73 | # parse the config_name, which is the first part in the list split by _ 74 | config_name = name.split("_")[0].lower() 75 | # add the parsed config_name back to the sys.argv so that hydra can use it 76 | sys.argv.insert(1, config_name) 77 | sys.argv.insert(1, "--config-name") 78 | 79 | # manually action="store_true" because hydra doesn't support it 80 | for i, arg in enumerate(sys.argv): 81 | if i > 2 and "=" not in arg: 82 | sys.argv[i] += "=true" 83 | 84 | config = Config() 85 | get_config() 86 | 87 | main(config) 88 | -------------------------------------------------------------------------------- /src/scripts/download.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from utils.util import Config 3 | 4 | import hydra 5 | from pathlib import Path 6 | from omegaconf import OmegaConf 7 | @hydra.main(version_base=None, config_path="../data/config/", config_name=f"script/{Path(__file__).stem}") 8 | def get_config(hydra_config: OmegaConf): 9 | config._from_hydra(hydra_config) 10 | 11 | 12 | if __name__ == "__main__": 13 | # manually action="store_true" because hydra doesn't support it 14 | for i, arg in enumerate(sys.argv): 15 | if "=" not in arg: 16 | sys.argv[i] += "=true" 17 | 18 | config = Config() 19 | get_config() 20 | -------------------------------------------------------------------------------- /src/scripts/eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from utils.util import compute_metrics, load_pickle, Config 4 | 5 | import hydra 6 | from pathlib import Path 7 | from omegaconf import OmegaConf 8 | @hydra.main(version_base=None, config_path="../data/config/", config_name=f"script/{Path(__file__).stem}") 9 | def get_config(hydra_config: OmegaConf): 10 | config._from_hydra(hydra_config) 11 | 12 | 13 | if __name__ == "__main__": 14 | # manually action="store_true" because hydra doesn't support it 15 | for i, arg in enumerate(sys.argv): 16 | if "=" not in arg: 17 | sys.argv[i] += "=true" 18 | 19 | config = Config() 20 | get_config() 21 | 22 | if os.path.exists(config.src): 23 | path = config.src 24 | elif os.path.exists(os.path.join(config.cache_root, config.eval_mode, config.src, config.eval_set, "retrieval_result.pkl")): 25 | path = os.path.join(config.cache_root, config.eval_mode, config.src, config.eval_set, "retrieval_result.pkl") 26 | else: 27 | raise FileNotFoundError 28 | 29 | retrieval_result = load_pickle(path) 30 | 31 | ground_truth = load_pickle(os.path.join(config.cache_root, "dataset", "query", config.eval_set, "positives.pkl")) 32 | metrics = compute_metrics(retrieval_result, ground_truth, metrics=config.eval_metric, cutoffs=config.eval_metric_cutoff) 33 | print() 34 | print(metrics) -------------------------------------------------------------------------------- /src/scripts/negative.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generate negatives from the ``retrieval_result`` returned by :func:`models.BaseModel.BaseModel.retrieve` over ``train`` set. 3 | """ 4 | import sys 5 | import numpy as np 6 | from tqdm import tqdm 7 | from collections import defaultdict 8 | from utils.util import load_pickle, save_pickle, Config 9 | 10 | import hydra 11 | from pathlib import Path 12 | from omegaconf import OmegaConf 13 | @hydra.main(version_base=None, config_path="../data/config/", config_name=f"script/{Path(__file__).stem}") 14 | def get_config(hydra_config: OmegaConf): 15 | config._from_hydra(hydra_config) 16 | 17 | 18 | if __name__ == "__main__": 19 | # manually action="store_true" because hydra doesn't support it 20 | for i, arg in enumerate(sys.argv): 21 | if "=" not in arg: 22 | sys.argv[i] += "=true" 23 | 24 | config = Config() 25 | get_config() 26 | 27 | for query_set in config.query_set: 28 | positives = load_pickle(f"{config.cache_root}/dataset/query/{query_set}/positives.pkl") 29 | 30 | retrieval_result = load_pickle(f"{config.cache_root}/retrieve/{config.neg_type}/{query_set}/retrieval_result.pkl") 31 | hard_negatives = defaultdict(list) 32 | for k,v in tqdm(retrieval_result.items(), desc="Collecting Negatives", ncols=100): 33 | for i, x in enumerate(v[:config.hits]): 34 | if x in positives[k]: 35 | continue 36 | hard_negatives[k].append(x) 37 | 38 | nnegs = np.array([len(x) for x in hard_negatives.values()]) 39 | print(f"the collected query number is {len(hard_negatives)}, whose negative number is MEAN: {np.round(nnegs.mean(), 1)}, MAX: {nnegs.max()}, MIN: {nnegs.min()}") 40 | 41 | if config.save_name != "default": 42 | save_name = config.save_name 43 | else: 44 | save_name = config.neg_type 45 | save_path = f"{config.cache_root}/dataset/query/{query_set}/negatives_{save_name}.pkl" 46 | save_pickle(dict(hard_negatives), save_path) 47 | print(f"saved negatives at {save_path}") -------------------------------------------------------------------------------- /src/scripts/ttest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import scipy.stats as stats 4 | from utils.util import load_pickle, compute_metrics, Config 5 | 6 | import hydra 7 | from pathlib import Path 8 | from omegaconf import OmegaConf 9 | @hydra.main(version_base=None, config_path="../data/config/", config_name=f"script/{Path(__file__).stem}") 10 | def get_config(hydra_config: OmegaConf): 11 | config._from_hydra(hydra_config) 12 | 13 | 14 | if __name__ == "__main__": 15 | # manually action="store_true" because hydra doesn't support it 16 | for i, arg in enumerate(sys.argv): 17 | if "=" not in arg: 18 | sys.argv[i] += "=true" 19 | 20 | config = Config() 21 | get_config() 22 | 23 | if os.path.exists(config.x_model): 24 | x_path = config.x_model 25 | elif os.path.exists(os.path.join(config.cache_root, config.eval_mode, config.x_model, config.eval_set, "retrieval_result.pkl")): 26 | x_path = os.path.join(config.cache_root, config.eval_mode, config.x_model, config.eval_set, "retrieval_result.pkl") 27 | else: 28 | raise FileNotFoundError 29 | 30 | if os.path.exists(config.y_model): 31 | y_path = config.y_model 32 | elif os.path.exists(os.path.join(config.cache_root, config.eval_mode, config.y_model, config.eval_set, "retrieval_result.pkl")): 33 | y_path = os.path.join(config.cache_root, config.eval_mode, config.y_model, config.eval_set, "retrieval_result.pkl") 34 | else: 35 | raise FileNotFoundError 36 | 37 | print(x_path, y_path) 38 | 39 | x_retrieval_result = load_pickle(x_path) 40 | y_retrieval_result = load_pickle(y_path) 41 | 42 | ground_truth = load_pickle(os.path.join(config.cache_root, "dataset", config.eval_set, "positives.pkl")) 43 | 44 | all_metrics = set() 45 | cutoffs = set() 46 | for metric in config.ttest_metric: 47 | if "@" in metric: 48 | metric_body, cutoff = metric.split("@") 49 | all_metrics.add(metric_body.lower()) 50 | cutoffs.add(int(cutoff)) 51 | else: 52 | all_metrics.add(metric.lower()) 53 | 54 | all_metrics = list(all_metrics) 55 | cutoffs = list(cutoffs) 56 | 57 | x_metrics_per_query = compute_metrics(x_retrieval_result, ground_truth, metrics=all_metrics, cutoffs=cutoffs, return_each_query=True) 58 | y_metrics_per_query = compute_metrics(y_retrieval_result, ground_truth, metrics=all_metrics, cutoffs=cutoffs, return_each_query=True) 59 | 60 | print("*" * 10 + f" {config.x_model} (X) v.s. {config.y_model} (Y) " + "*" * 10) 61 | for metric in config.ttest_metric: 62 | print(f"the p of {metric}: {' '*(20 - len(metric))}{stats.ttest_rel(x_metrics_per_query[metric], y_metrics_per_query[metric]).pvalue}") 63 | -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # very essential that we import faiss before torch on zhiyuan machine 2 | import faiss 3 | import torch 4 | 5 | import logging 6 | logging.basicConfig(level=logging.INFO, format="[%(asctime)s] %(levelname)s (%(name)s) %(message)s") 7 | logging.getLogger("faiss.loader").setLevel(logging.ERROR) 8 | logging.getLogger("torch.distributed.distributed_c10d").setLevel(logging.WARNING) 9 | 10 | import transformers 11 | # prevent warning of transformers 12 | transformers.logging.set_verbosity_error() 13 | 14 | import os 15 | os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true' 16 | -------------------------------------------------------------------------------- /src/utils/static.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from torch.utils.data import DataLoader 4 | from typing import * 5 | 6 | DEVICE = Union[int,Literal["cpu"]] 7 | 8 | RETRIEVAL_MAPPING = Union[dict[int, list[int]], dict[int, list[tuple[int,float]]]] 9 | ID_MAPPING = dict[str, int] 10 | 11 | DENSE_METRIC = Literal["ip", "cos", "l2"] 12 | DATA_FORMAT = Literal["memmap", "raw"] 13 | 14 | TENSOR = torch.Tensor 15 | LOADERS = dict[str,DataLoader] 16 | NN_MODULE = torch.nn.Module 17 | INDICES = Union[np.ndarray,list,torch.Tensor] 18 | 19 | PLM_MAP = { 20 | "bert": { 21 | # different model may share the same tokenizer, so we can load the same tokenized data for them 22 | "tokenizer": "bert", 23 | "load_name": "bert-base-uncased" 24 | }, 25 | "distilbert": { 26 | "tokenizer": "bert", 27 | "load_name": "distilbert-base-uncased", 28 | }, 29 | "ernie": { 30 | "tokenizer": "bert", 31 | "load_name": "nghuyong/ernie-2.0-en" 32 | }, 33 | "bert-chinese": { 34 | "tokenizer": "bert-chinese", 35 | "load_name": "bert-base-chinese" 36 | }, 37 | "bert-xingshi": { 38 | "tokenizer": "bert-xingshi", 39 | "load_name": "null" 40 | }, 41 | "t5-small": { 42 | "tokenizer": "t5", 43 | "load_name": "t5-small" 44 | }, 45 | "t5": { 46 | "tokenizer": "t5", 47 | "load_name": "t5-base" 48 | }, 49 | "t5-large": { 50 | "tokenizer": "t5", 51 | "load_name": "t5-large" 52 | }, 53 | "doct5": { 54 | "tokenizer": "t5", 55 | "load_name": "castorini/doc2query-t5-base-msmarco" 56 | }, 57 | "distilsplade": { 58 | "tokenizer": "bert", 59 | "load_name": "null" 60 | }, 61 | "splade": { 62 | "tokenizer": "bert", 63 | "load_name": "null" 64 | }, 65 | "bart": { 66 | "tokenizer": "bart", 67 | "load_name": "facebook/bart-base" 68 | }, 69 | "bart-large": { 70 | "tokenizer": "bart", 71 | "load_name": "facebook/bart-large" 72 | }, 73 | "retromae": { 74 | "tokenizer": "bert", 75 | "load_name": "Shitao/RetroMAE" 76 | }, 77 | "retromae_msmarco": { 78 | "tokenizer": "bert", 79 | "load_name": "Shitao/RetroMAE_MSMARCO" 80 | }, 81 | "retromae_distill": { 82 | "tokenizer": "bert", 83 | "load_name": "Shitao/RetroMAE_MSMARCO_distill" 84 | }, 85 | "deberta": { 86 | "tokenizer": "deberta", 87 | "load_name": "microsoft/deberta-base" 88 | }, 89 | "keyt5": { 90 | "tokenizer": "t5", 91 | "load_name": "snrspeaks/KeyPhraseTransformer" 92 | }, 93 | "seal": { 94 | "tokenizer": "bart", 95 | "load_name": "tuner007/pegasus_paraphrase" 96 | }, 97 | } --------------------------------------------------------------------------------