├── .gitignore
├── LICENSE
├── README.md
├── docs
    ├── dr-msmarco-passage.md
    ├── rr-msmarco-passage.md
    └── scale-t5-weights.md
├── pyproject.toml
├── scripts
    ├── evaluate.py
    ├── gtr
    │   └── convert_sbert_ckpt.ipynb
    ├── kilt-dpr
    │   ├── convert_to_evaluation.py
    │   └── convert_trec_to_provenance.py
    ├── msmarco
    │   ├── build_hn.py
    │   └── build_train.py
    ├── nq-dpr
    │   └── build_train.py
    ├── scale_t5_weights.py
    └── split_embeddings.py
├── setup.py
├── src
    └── openmatch
    │   ├── __init__.py
    │   ├── arguments.py
    │   ├── dataset
    │       ├── __init__.py
    │       ├── beir_dataset.py
    │       ├── data_collator.py
    │       ├── inference_dataset.py
    │       └── train_dataset.py
    │   ├── driver
    │       ├── build_index.py
    │       ├── rerank.py
    │       ├── retrieve.py
    │       ├── retrieve_beir.py
    │       ├── successive_retrieve.py
    │       ├── train_dr.py
    │       └── train_rr.py
    │   ├── loss.py
    │   ├── modeling
    │       ├── __init__.py
    │       ├── dense_retrieval_model.py
    │       ├── linear.py
    │       └── reranking_model.py
    │   ├── retriever
    │       ├── __init__.py
    │       ├── dense_retriever.py
    │       └── reranker.py
    │   ├── trainer
    │       ├── __init__.py
    │       ├── dense_trainer.py
    │       └── reranker_trainer.py
    │   └── utils.py
└── v1
    ├── Contrastive_Supervision_Synthesis
        ├── bm25_retriever
        │   ├── bin
        │   │   ├── ApproximateNearestNeighborEval
        │   │   ├── ApproximateNearestNeighborEval.bat
        │   │   ├── ApproximateNearestNeighborSearch
        │   │   ├── ApproximateNearestNeighborSearch.bat
        │   │   ├── DumpAnalyzedQueries
        │   │   ├── DumpAnalyzedQueries.bat
        │   │   ├── ExtractAverageDocumentLength
        │   │   ├── ExtractAverageDocumentLength.bat
        │   │   ├── ExtractDocumentLengths
        │   │   ├── ExtractDocumentLengths.bat
        │   │   ├── ExtractNorms
        │   │   ├── ExtractNorms.bat
        │   │   ├── FeatureExtractorCli
        │   │   ├── FeatureExtractorCli.bat
        │   │   ├── IndexCollection
        │   │   ├── IndexCollection.bat
        │   │   ├── IndexUtils
        │   │   ├── IndexUtils.bat
        │   │   ├── IndexVectors
        │   │   ├── IndexVectors.bat
        │   │   ├── SearchCollection
        │   │   ├── SearchCollection.bat
        │   │   ├── SearchElastic
        │   │   ├── SearchElastic.bat
        │   │   ├── SearchMsmarco
        │   │   ├── SearchMsmarco.bat
        │   │   ├── SearchSolr
        │   │   └── SearchSolr.bat
        │   ├── build_index.sh
        │   ├── repo
        │   │   ├── HdrHistogram-2.1.9.jar
        │   │   ├── aggs-matrix-stats-client-7.0.0.jar
        │   │   ├── annotations-java5-19.0.0.jar
        │   │   ├── anserini-0.7.3-SNAPSHOT.jar
        │   │   ├── anserini-fastutil-6.5.6.jar
        │   │   ├── ant-1.9.1.jar
        │   │   ├── ant-launcher-1.9.1.jar
        │   │   ├── args4j-2.32.jar
        │   │   ├── cbor-0.7.jar
        │   │   ├── commons-codec-1.11.jar
        │   │   ├── commons-compress-1.18.jar
        │   │   ├── commons-io-2.5.jar
        │   │   ├── commons-lang3-3.5.jar
        │   │   ├── commons-logging-1.1.3.jar
        │   │   ├── commons-math3-3.6.1.jar
        │   │   ├── commons-pool2-2.6.0.jar
        │   │   ├── compiler-0.9.3.jar
        │   │   ├── elasticsearch-7.0.0.jar
        │   │   ├── elasticsearch-cli-7.0.0.jar
        │   │   ├── elasticsearch-core-7.0.0.jar
        │   │   ├── elasticsearch-geo-7.0.0.jar
        │   │   ├── elasticsearch-rest-client-7.0.0.jar
        │   │   ├── elasticsearch-rest-high-level-client-7.0.0.jar
        │   │   ├── elasticsearch-secure-sm-7.0.0.jar
        │   │   ├── elasticsearch-x-content-7.0.0.jar
        │   │   ├── guava-18.0.jar
        │   │   ├── hppc-0.7.1.jar
        │   │   ├── http2-client-9.4.19.v20190610.jar
        │   │   ├── http2-common-9.4.19.v20190610.jar
        │   │   ├── http2-hpack-9.4.19.v20190610.jar
        │   │   ├── http2-http-client-transport-9.4.19.v20190610.jar
        │   │   ├── httpasyncclient-4.1.4.jar
        │   │   ├── httpclient-4.5.6.jar
        │   │   ├── httpcore-4.4.10.jar
        │   │   ├── httpcore-nio-4.4.11.jar
        │   │   ├── httpmime-4.5.6.jar
        │   │   ├── jackson-annotations-2.10.0.pr1.jar
        │   │   ├── jackson-core-2.10.0.pr1.jar
        │   │   ├── jackson-databind-2.10.0.pr1.jar
        │   │   ├── jackson-dataformat-cbor-2.8.11.jar
        │   │   ├── jackson-dataformat-smile-2.8.11.jar
        │   │   ├── jackson-dataformat-yaml-2.10.0.pr1.jar
        │   │   ├── jackson-datatype-jdk8-2.10.0.pr1.jar
        │   │   ├── jcl-over-slf4j-1.7.24.jar
        │   │   ├── jetty-alpn-client-9.4.19.v20190610.jar
        │   │   ├── jetty-alpn-java-client-9.4.19.v20190610.jar
        │   │   ├── jetty-client-9.4.19.v20190610.jar
        │   │   ├── jetty-http-9.4.19.v20190610.jar
        │   │   ├── jetty-io-9.4.19.v20190610.jar
        │   │   ├── jetty-util-9.4.19.v20190610.jar
        │   │   ├── jna-4.5.1.jar
        │   │   ├── joda-time-2.10.1.jar
        │   │   ├── jopt-simple-5.0.2.jar
        │   │   ├── jsoup-1.8.3.jar
        │   │   ├── jsr305-2.0.1.jar
        │   │   ├── lang-mustache-client-7.0.0.jar
        │   │   ├── log4j-api-2.12.1.jar
        │   │   ├── log4j-core-2.12.1.jar
        │   │   ├── lucene-analyzers-common-8.0.0.jar
        │   │   ├── lucene-backward-codecs-8.0.0.jar
        │   │   ├── lucene-core-8.3.0.jar
        │   │   ├── lucene-grouping-8.0.0.jar
        │   │   ├── lucene-highlighter-8.0.0.jar
        │   │   ├── lucene-join-8.0.0.jar
        │   │   ├── lucene-memory-8.0.0.jar
        │   │   ├── lucene-misc-8.0.0.jar
        │   │   ├── lucene-queries-8.0.0.jar
        │   │   ├── lucene-queryparser-8.0.0.jar
        │   │   ├── lucene-sandbox-8.0.0.jar
        │   │   ├── lucene-spatial-8.0.0.jar
        │   │   ├── lucene-spatial-extras-8.0.0.jar
        │   │   ├── lucene-spatial3d-8.0.0.jar
        │   │   ├── lucene-suggest-8.0.0.jar
        │   │   ├── mockito-all-1.10.19.jar
        │   │   ├── netty-buffer-4.1.29.Final.jar
        │   │   ├── netty-codec-4.1.29.Final.jar
        │   │   ├── netty-common-4.1.29.Final.jar
        │   │   ├── netty-handler-4.1.29.Final.jar
        │   │   ├── netty-resolver-4.1.29.Final.jar
        │   │   ├── netty-transport-4.1.29.Final.jar
        │   │   ├── netty-transport-native-epoll-4.1.29.Final.jar
        │   │   ├── netty-transport-native-unix-common-4.1.29.Final.jar
        │   │   ├── parent-join-client-7.0.0.jar
        │   │   ├── rank-eval-client-7.0.0.jar
        │   │   ├── sesame-model-4.1.2.jar
        │   │   ├── sesame-rio-api-4.1.2.jar
        │   │   ├── sesame-rio-datatypes-4.1.2.jar
        │   │   ├── sesame-rio-languages-4.1.2.jar
        │   │   ├── sesame-rio-ntriples-4.1.2.jar
        │   │   ├── sesame-util-4.1.2.jar
        │   │   ├── slf4j-api-1.7.24.jar
        │   │   ├── slf4j-simple-1.7.29.jar
        │   │   ├── snakeyaml-1.24.jar
        │   │   ├── solr-solrj-8.3.0.jar
        │   │   ├── stax2-api-3.1.4.jar
        │   │   ├── t-digest-3.2.jar
        │   │   ├── trec-car-tools-java-13.jar
        │   │   ├── twitter-text-2.0.10.jar
        │   │   ├── wdtk-datamodel-0.10.0.jar
        │   │   ├── wdtk-dumpfiles-0.10.0.jar
        │   │   ├── wdtk-storage-0.10.0.jar
        │   │   ├── wdtk-util-0.10.0.jar
        │   │   ├── wikiclean-1.1.jar
        │   │   ├── woodstox-core-asl-4.4.1.jar
        │   │   ├── xz-1.5.jar
        │   │   ├── zookeeper-3.5.5.jar
        │   │   └── zookeeper-jute-3.5.5.jar
        │   └── retrieve.sh
        ├── contrastqg
        │   ├── __init__.py
        │   ├── dataloaders
        │   │   ├── __init__.py
        │   │   ├── generate_loader.py
        │   │   ├── loader_utils.py
        │   │   └── t5_utils.py
        │   └── transformers
        │   │   ├── __init__.py
        │   │   ├── activations.py
        │   │   ├── benchmark
        │   │       ├── __init__.py
        │   │       ├── benchmark.py
        │   │       ├── benchmark_args.py
        │   │       ├── benchmark_args_utils.py
        │   │       └── benchmark_utils.py
        │   │   ├── commands
        │   │       ├── __init__.py
        │   │       ├── convert.py
        │   │       ├── download.py
        │   │       ├── env.py
        │   │       ├── run.py
        │   │       ├── serving.py
        │   │       ├── train.py
        │   │       ├── transformers_cli.py
        │   │       └── user.py
        │   │   ├── configuration_albert.py
        │   │   ├── configuration_auto.py
        │   │   ├── configuration_bart.py
        │   │   ├── configuration_bert.py
        │   │   ├── configuration_camembert.py
        │   │   ├── configuration_ctrl.py
        │   │   ├── configuration_distilbert.py
        │   │   ├── configuration_electra.py
        │   │   ├── configuration_encoder_decoder.py
        │   │   ├── configuration_flaubert.py
        │   │   ├── configuration_gpt2.py
        │   │   ├── configuration_longformer.py
        │   │   ├── configuration_marian.py
        │   │   ├── configuration_mmbt.py
        │   │   ├── configuration_openai.py
        │   │   ├── configuration_reformer.py
        │   │   ├── configuration_roberta.py
        │   │   ├── configuration_t5.py
        │   │   ├── configuration_transfo_xl.py
        │   │   ├── configuration_utils.py
        │   │   ├── configuration_xlm.py
        │   │   ├── configuration_xlm_roberta.py
        │   │   ├── configuration_xlnet.py
        │   │   ├── convert_albert_original_tf_checkpoint_to_pytorch.py
        │   │   ├── convert_bart_original_pytorch_checkpoint_to_pytorch.py
        │   │   ├── convert_bert_original_tf_checkpoint_to_pytorch.py
        │   │   ├── convert_bert_pytorch_checkpoint_to_original_tf.py
        │   │   ├── convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py
        │   │   ├── convert_electra_original_tf_checkpoint_to_pytorch.py
        │   │   ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py
        │   │   ├── convert_graph_to_onnx.py
        │   │   ├── convert_longformer_original_pytorch_lightning_to_pytorch.py
        │   │   ├── convert_marian_to_pytorch.py
        │   │   ├── convert_openai_original_tf_checkpoint_to_pytorch.py
        │   │   ├── convert_pytorch_checkpoint_to_tf2.py
        │   │   ├── convert_reformer_trax_checkpoint_to_pytorch.py
        │   │   ├── convert_roberta_original_pytorch_checkpoint_to_pytorch.py
        │   │   ├── convert_t5_original_tf_checkpoint_to_pytorch.py
        │   │   ├── convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
        │   │   ├── convert_xlm_original_pytorch_checkpoint_to_pytorch.py
        │   │   ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py
        │   │   ├── data
        │   │       ├── __init__.py
        │   │       ├── data_collator.py
        │   │       ├── datasets
        │   │       │   ├── __init__.py
        │   │       │   ├── glue.py
        │   │       │   └── language_modeling.py
        │   │       ├── metrics
        │   │       │   ├── __init__.py
        │   │       │   └── squad_metrics.py
        │   │       └── processors
        │   │       │   ├── __init__.py
        │   │       │   ├── glue.py
        │   │       │   ├── squad.py
        │   │       │   ├── utils.py
        │   │       │   └── xnli.py
        │   │   ├── file_utils.py
        │   │   ├── hf_api.py
        │   │   ├── hf_argparser.py
        │   │   ├── modelcard.py
        │   │   ├── modeling_albert.py
        │   │   ├── modeling_auto.py
        │   │   ├── modeling_bart.py
        │   │   ├── modeling_bert.py
        │   │   ├── modeling_camembert.py
        │   │   ├── modeling_ctrl.py
        │   │   ├── modeling_distilbert.py
        │   │   ├── modeling_electra.py
        │   │   ├── modeling_encoder_decoder.py
        │   │   ├── modeling_flaubert.py
        │   │   ├── modeling_gpt2.py
        │   │   ├── modeling_longformer.py
        │   │   ├── modeling_marian.py
        │   │   ├── modeling_mmbt.py
        │   │   ├── modeling_openai.py
        │   │   ├── modeling_reformer.py
        │   │   ├── modeling_roberta.py
        │   │   ├── modeling_t5.py
        │   │   ├── modeling_tf_albert.py
        │   │   ├── modeling_tf_auto.py
        │   │   ├── modeling_tf_bert.py
        │   │   ├── modeling_tf_camembert.py
        │   │   ├── modeling_tf_ctrl.py
        │   │   ├── modeling_tf_distilbert.py
        │   │   ├── modeling_tf_electra.py
        │   │   ├── modeling_tf_flaubert.py
        │   │   ├── modeling_tf_gpt2.py
        │   │   ├── modeling_tf_openai.py
        │   │   ├── modeling_tf_pytorch_utils.py
        │   │   ├── modeling_tf_roberta.py
        │   │   ├── modeling_tf_t5.py
        │   │   ├── modeling_tf_transfo_xl.py
        │   │   ├── modeling_tf_transfo_xl_utilities.py
        │   │   ├── modeling_tf_utils.py
        │   │   ├── modeling_tf_xlm.py
        │   │   ├── modeling_tf_xlm_roberta.py
        │   │   ├── modeling_tf_xlnet.py
        │   │   ├── modeling_transfo_xl.py
        │   │   ├── modeling_transfo_xl_utilities.py
        │   │   ├── modeling_utils.py
        │   │   ├── modeling_xlm.py
        │   │   ├── modeling_xlm_roberta.py
        │   │   ├── modeling_xlnet.py
        │   │   ├── optimization.py
        │   │   ├── optimization_tf.py
        │   │   ├── pipelines.py
        │   │   ├── tokenization_albert.py
        │   │   ├── tokenization_auto.py
        │   │   ├── tokenization_bart.py
        │   │   ├── tokenization_bert.py
        │   │   ├── tokenization_bert_japanese.py
        │   │   ├── tokenization_camembert.py
        │   │   ├── tokenization_ctrl.py
        │   │   ├── tokenization_distilbert.py
        │   │   ├── tokenization_electra.py
        │   │   ├── tokenization_flaubert.py
        │   │   ├── tokenization_gpt2.py
        │   │   ├── tokenization_longformer.py
        │   │   ├── tokenization_marian.py
        │   │   ├── tokenization_openai.py
        │   │   ├── tokenization_reformer.py
        │   │   ├── tokenization_roberta.py
        │   │   ├── tokenization_t5.py
        │   │   ├── tokenization_transfo_xl.py
        │   │   ├── tokenization_utils.py
        │   │   ├── tokenization_xlm.py
        │   │   ├── tokenization_xlm_roberta.py
        │   │   ├── tokenization_xlnet.py
        │   │   ├── trainer.py
        │   │   ├── trainer_tf.py
        │   │   ├── trainer_utils.py
        │   │   ├── training_args.py
        │   │   └── training_args_tf.py
        ├── preprocess
        │   ├── prepro_dataset.sh
        │   ├── sample_contrast_pairs.sh
        │   └── utils
        │   │   ├── prepro_dataset.py
        │   │   └── sample_contrast_pairs.py
        ├── run_shell
        │   ├── cqg_inference.sh
        │   ├── qg_inference.sh
        │   └── train_nlg.sh
        └── scripts
        │   ├── config.py
        │   ├── inference.py
        │   ├── model.py
        │   ├── train.py
        │   └── utils.py
    ├── Dockerfile
    ├── LICENSE
    ├── LeToR
        ├── RankLib-2.1-patched.jar
        └── gen_trec.py
    ├── OpenMatch
        ├── __init__.py
        ├── data
        │   ├── __init__.py
        │   ├── dataloader.py
        │   ├── datasets
        │   │   ├── __init__.py
        │   │   ├── bert_dataset.py
        │   │   ├── bertmaxp_dataset.py
        │   │   ├── bertmlm_dataset.py
        │   │   ├── dataset.py
        │   │   ├── edrm_dataset.py
        │   │   ├── meta_bert_dataset.py
        │   │   └── roberta_dataset.py
        │   └── tokenizers
        │   │   ├── __init__.py
        │   │   ├── tokenizer.py
        │   │   └── word_tokenizer.py
        ├── extractors
        │   ├── __init__.py
        │   └── classic_extractor.py
        ├── metrics
        │   ├── __init__.py
        │   └── metric.py
        ├── models
        │   ├── __init__.py
        │   ├── bert.py
        │   ├── bert_maxp.py
        │   ├── conv_knrm.py
        │   ├── edrm.py
        │   ├── knrm.py
        │   └── tk.py
        ├── modules
        │   ├── __init__.py
        │   ├── attentions
        │   │   ├── __init__.py
        │   │   ├── multi_head_attention.py
        │   │   └── scaled_dot_product_attention.py
        │   ├── embedders
        │   │   ├── __init__.py
        │   │   └── embedder.py
        │   ├── encoders
        │   │   ├── __init__.py
        │   │   ├── cnn_encoder.py
        │   │   ├── feed_forward_encoder.py
        │   │   ├── positional_encoder.py
        │   │   └── transformer_encoder.py
        │   └── matchers
        │   │   ├── __init__.py
        │   │   └── kernel_matcher.py
        └── utils.py
    ├── README.md
    ├── checkpoints
        └── README.md
    ├── coor_ascent.sh
    ├── data
        ├── dev_toy.jsonl
        ├── docs_toy.jsonl
        ├── filter.py
        ├── preprocess.py
        ├── qrels_toy
        ├── queries_toy.jsonl
        ├── test_toy.jsonl
        ├── toy.trec
        ├── train_clas_toy.jsonl
        └── train_rank_toy.jsonl
    ├── docs
        ├── contrastive-supervision-synthesis.md
        ├── distributed training.md
        ├── experiments-adhoc.md
        ├── experiments-classic.md
        ├── experiments-msmarco-doc.md
        ├── experiments-msmarco.md
        ├── experiments-treccovid.md
        ├── meta-learning-to-rank.md
        └── openmatch.md
    ├── features
        └── README.md
    ├── gen_feature.py
    ├── gen_feature.sh
    ├── gen_feature_bert.sh
    ├── inference.py
    ├── inference.sh
    ├── inference_bert.sh
    ├── magic_module.py
    ├── meta_dist_train.py
    ├── meta_dist_train.sh
    ├── requirements.txt
    ├── results
        └── README.md
    ├── retrievers
        ├── ANCE
        │   ├── CODE_OF_CONDUCT.md
        │   ├── LICENSE
        │   ├── README.md
        │   ├── SECURITY.md
        │   ├── commands
        │   │   ├── data_download.sh
        │   │   ├── run_ann_data_gen.sh
        │   │   ├── run_ann_data_gen_dpr.sh
        │   │   ├── run_ann_data_gen_lyz.sh
        │   │   ├── run_inference.sh
        │   │   ├── run_train.sh
        │   │   ├── run_train_dpr.sh
        │   │   ├── run_train_lyz.sh
        │   │   └── run_train_warmup.sh
        │   ├── data
        │   │   ├── DPR_data.py
        │   │   ├── msmarco_data.py
        │   │   └── process_fn.py
        │   ├── drivers
        │   │   ├── run_ann.py
        │   │   ├── run_ann_data_gen.py
        │   │   ├── run_ann_data_gen_dpr.py
        │   │   ├── run_ann_dpr.py
        │   │   └── run_warmup.py
        │   ├── evaluation
        │   │   ├── Calculate Metrics.ipynb
        │   │   ├── Calculate_Metrics.py
        │   │   └── convert_trec.py
        │   ├── model
        │   │   └── models.py
        │   ├── setup.py
        │   └── utils
        │   │   ├── dpr_utils.py
        │   │   ├── eval_mrr.py
        │   │   ├── lamb.py
        │   │   ├── msmarco_eval.py
        │   │   └── util.py
        ├── DANCE
        │   ├── ANCE_setup.py
        │   ├── README.md
        │   ├── commands
        │   │   ├── data_download.sh
        │   │   ├── run_ann_data_gen.sh
        │   │   ├── run_ann_data_gen_dpr.sh
        │   │   ├── run_inference.sh
        │   │   ├── run_train.sh
        │   │   ├── run_train_dpr.sh
        │   │   └── run_train_warmup.sh
        │   ├── data
        │   │   ├── DPR_data.py
        │   │   ├── custom_data.py
        │   │   ├── msmarco_data.py
        │   │   ├── process_fn.py
        │   │   └── validation_split.py
        │   ├── drivers
        │   │   ├── run_ann.py
        │   │   ├── run_ann_data_gen.py
        │   │   ├── run_ann_data_gen_dpr.py
        │   │   ├── run_ann_data_inference_eval.py
        │   │   ├── run_ann_dpr.py
        │   │   ├── run_ann_emb_inference.py
        │   │   └── run_warmup.py
        │   ├── evaluation
        │   │   ├── Calculate Metrics.ipynb
        │   │   ├── Calculate_Metrics.py
        │   │   └── retrieval.py
        │   ├── model
        │   │   └── models.py
        │   ├── requirements.txt
        │   └── utils
        │   │   ├── dpr_utils.py
        │   │   ├── eval_mrr.py
        │   │   ├── indexing_utils.py
        │   │   ├── lamb.py
        │   │   ├── metric.py
        │   │   ├── msmarco_eval.py
        │   │   ├── trec_convert.py
        │   │   └── util.py
        ├── README.md
        ├── bm25_retriever
        │   ├── bin
        │   │   ├── ApproximateNearestNeighborEval
        │   │   ├── ApproximateNearestNeighborEval.bat
        │   │   ├── ApproximateNearestNeighborSearch
        │   │   ├── ApproximateNearestNeighborSearch.bat
        │   │   ├── DumpAnalyzedQueries
        │   │   ├── DumpAnalyzedQueries.bat
        │   │   ├── ExtractAverageDocumentLength
        │   │   ├── ExtractAverageDocumentLength.bat
        │   │   ├── ExtractDocumentLengths
        │   │   ├── ExtractDocumentLengths.bat
        │   │   ├── ExtractNorms
        │   │   ├── ExtractNorms.bat
        │   │   ├── FeatureExtractorCli
        │   │   ├── FeatureExtractorCli.bat
        │   │   ├── IndexCollection
        │   │   ├── IndexCollection.bat
        │   │   ├── IndexUtils
        │   │   ├── IndexUtils.bat
        │   │   ├── IndexVectors
        │   │   ├── IndexVectors.bat
        │   │   ├── SearchCollection
        │   │   ├── SearchCollection.bat
        │   │   ├── SearchElastic
        │   │   ├── SearchElastic.bat
        │   │   ├── SearchMsmarco
        │   │   ├── SearchMsmarco.bat
        │   │   ├── SearchSolr
        │   │   └── SearchSolr.bat
        │   └── repo
        │   │   ├── HdrHistogram-2.1.9.jar
        │   │   ├── aggs-matrix-stats-client-7.0.0.jar
        │   │   ├── annotations-java5-19.0.0.jar
        │   │   ├── anserini-0.7.3-SNAPSHOT.jar
        │   │   ├── anserini-fastutil-6.5.6.jar
        │   │   ├── ant-1.9.1.jar
        │   │   ├── ant-launcher-1.9.1.jar
        │   │   ├── args4j-2.32.jar
        │   │   ├── cbor-0.7.jar
        │   │   ├── commons-codec-1.11.jar
        │   │   ├── commons-compress-1.18.jar
        │   │   ├── commons-io-2.5.jar
        │   │   ├── commons-lang3-3.5.jar
        │   │   ├── commons-logging-1.1.3.jar
        │   │   ├── commons-math3-3.6.1.jar
        │   │   ├── commons-pool2-2.6.0.jar
        │   │   ├── compiler-0.9.3.jar
        │   │   ├── elasticsearch-7.0.0.jar
        │   │   ├── elasticsearch-cli-7.0.0.jar
        │   │   ├── elasticsearch-core-7.0.0.jar
        │   │   ├── elasticsearch-geo-7.0.0.jar
        │   │   ├── elasticsearch-rest-client-7.0.0.jar
        │   │   ├── elasticsearch-rest-high-level-client-7.0.0.jar
        │   │   ├── elasticsearch-secure-sm-7.0.0.jar
        │   │   ├── elasticsearch-x-content-7.0.0.jar
        │   │   ├── guava-18.0.jar
        │   │   ├── hppc-0.7.1.jar
        │   │   ├── http2-client-9.4.19.v20190610.jar
        │   │   ├── http2-common-9.4.19.v20190610.jar
        │   │   ├── http2-hpack-9.4.19.v20190610.jar
        │   │   ├── http2-http-client-transport-9.4.19.v20190610.jar
        │   │   ├── httpasyncclient-4.1.4.jar
        │   │   ├── httpclient-4.5.6.jar
        │   │   ├── httpcore-4.4.10.jar
        │   │   ├── httpcore-nio-4.4.11.jar
        │   │   ├── httpmime-4.5.6.jar
        │   │   ├── jackson-annotations-2.10.0.pr1.jar
        │   │   ├── jackson-core-2.10.0.pr1.jar
        │   │   ├── jackson-databind-2.10.0.pr1.jar
        │   │   ├── jackson-dataformat-cbor-2.8.11.jar
        │   │   ├── jackson-dataformat-smile-2.8.11.jar
        │   │   ├── jackson-dataformat-yaml-2.10.0.pr1.jar
        │   │   ├── jackson-datatype-jdk8-2.10.0.pr1.jar
        │   │   ├── jcl-over-slf4j-1.7.24.jar
        │   │   ├── jetty-alpn-client-9.4.19.v20190610.jar
        │   │   ├── jetty-alpn-java-client-9.4.19.v20190610.jar
        │   │   ├── jetty-client-9.4.19.v20190610.jar
        │   │   ├── jetty-http-9.4.19.v20190610.jar
        │   │   ├── jetty-io-9.4.19.v20190610.jar
        │   │   ├── jetty-util-9.4.19.v20190610.jar
        │   │   ├── jna-4.5.1.jar
        │   │   ├── joda-time-2.10.1.jar
        │   │   ├── jopt-simple-5.0.2.jar
        │   │   ├── jsoup-1.8.3.jar
        │   │   ├── jsr305-2.0.1.jar
        │   │   ├── lang-mustache-client-7.0.0.jar
        │   │   ├── log4j-api-2.12.1.jar
        │   │   ├── log4j-core-2.12.1.jar
        │   │   ├── lucene-analyzers-common-8.0.0.jar
        │   │   ├── lucene-backward-codecs-8.0.0.jar
        │   │   ├── lucene-core-8.3.0.jar
        │   │   ├── lucene-grouping-8.0.0.jar
        │   │   ├── lucene-highlighter-8.0.0.jar
        │   │   ├── lucene-join-8.0.0.jar
        │   │   ├── lucene-memory-8.0.0.jar
        │   │   ├── lucene-misc-8.0.0.jar
        │   │   ├── lucene-queries-8.0.0.jar
        │   │   ├── lucene-queryparser-8.0.0.jar
        │   │   ├── lucene-sandbox-8.0.0.jar
        │   │   ├── lucene-spatial-8.0.0.jar
        │   │   ├── lucene-spatial-extras-8.0.0.jar
        │   │   ├── lucene-spatial3d-8.0.0.jar
        │   │   ├── lucene-suggest-8.0.0.jar
        │   │   ├── mockito-all-1.10.19.jar
        │   │   ├── netty-buffer-4.1.29.Final.jar
        │   │   ├── netty-codec-4.1.29.Final.jar
        │   │   ├── netty-common-4.1.29.Final.jar
        │   │   ├── netty-handler-4.1.29.Final.jar
        │   │   ├── netty-resolver-4.1.29.Final.jar
        │   │   ├── netty-transport-4.1.29.Final.jar
        │   │   ├── netty-transport-native-epoll-4.1.29.Final.jar
        │   │   ├── netty-transport-native-unix-common-4.1.29.Final.jar
        │   │   ├── parent-join-client-7.0.0.jar
        │   │   ├── rank-eval-client-7.0.0.jar
        │   │   ├── sesame-model-4.1.2.jar
        │   │   ├── sesame-rio-api-4.1.2.jar
        │   │   ├── sesame-rio-datatypes-4.1.2.jar
        │   │   ├── sesame-rio-languages-4.1.2.jar
        │   │   ├── sesame-rio-ntriples-4.1.2.jar
        │   │   ├── sesame-util-4.1.2.jar
        │   │   ├── slf4j-api-1.7.24.jar
        │   │   ├── slf4j-simple-1.7.29.jar
        │   │   ├── snakeyaml-1.24.jar
        │   │   ├── solr-solrj-8.3.0.jar
        │   │   ├── stax2-api-3.1.4.jar
        │   │   ├── t-digest-3.2.jar
        │   │   ├── trec-car-tools-java-13.jar
        │   │   ├── twitter-text-2.0.10.jar
        │   │   ├── wdtk-datamodel-0.10.0.jar
        │   │   ├── wdtk-dumpfiles-0.10.0.jar
        │   │   ├── wdtk-storage-0.10.0.jar
        │   │   ├── wdtk-util-0.10.0.jar
        │   │   ├── wikiclean-1.1.jar
        │   │   ├── woodstox-core-asl-4.4.1.jar
        │   │   ├── xz-1.5.jar
        │   │   ├── zookeeper-3.5.5.jar
        │   │   └── zookeeper-jute-3.5.5.jar
        ├── openmatch_ance_retriver_readme.md
        └── venv_ANCE.requirements
    ├── setup.py
    ├── train.py
    ├── train.sh
    ├── train_bert.sh
    ├── train_bert_dist.sh
    ├── train_bertmlm.py
    └── utils.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .ropeproject
90 | 
91 | .DS_Store
92 | */.DS_Store
93 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 THUNLP
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | **This repo is deprecated. Please check out our new repo at https://github.com/OpenMatch/OpenMatch.**
 2 | 
 3 | # OpenMatch v2
 4 | 
 5 | An all-in-one toolkit for information retrieval. Under active development.
 6 | 
 7 | ## Install
 8 | 
 9 | ```bash
10 | git clone https://github.com/thunlp/OpenMatch.git
11 | cd OpenMatch
12 | pip install -e .
13 | ```
14 | 
15 | `-e` means **editable**, i.e. you can change the code directly in your directory.
16 | 
 17 | We do not include all the requirements in the package. You may need to manually install `torch` and `tensorboard`.
18 | 
 19 | You may also need faiss for dense retrieval. You can install either `faiss-cpu` or `faiss-gpu`, depending on your environment. Note that if you want to perform search on GPUs, you need to install the version of `faiss-gpu` compatible with your CUDA version. In some cases (usually CUDA >= 11.0), `pip` installs the wrong version. If you encounter errors during search on GPUs, try installing it with `conda` instead.
20 | 
21 | ## Features
22 | 
23 | - Human-friendly interface for dense retriever and re-ranker training and testing
24 | - Various PLMs supported (BERT, RoBERTa, T5...)
25 | - Native support for common IR & QA Datasets (MS MARCO, NQ, KILT, BEIR, ...)
26 | - Deep integration with Huggingface Transformers and Datasets
27 | - Efficient training and inference via stream-style data loading
28 | 
29 | ## Docs
30 | 
 31 | See the `docs` folder.
32 | 
33 | ## Project Organizers
34 | 
35 | - Zhiyuan Liu
36 |   * Tsinghua University
37 |   * [Homepage](http://nlp.csai.tsinghua.edu.cn/~lzy/)
38 | - Zhenghao Liu
39 |   * Northeastern University
40 |   * [Homepage](https://edwardzh.github.io/)
41 | - Chenyan Xiong
42 |   * Microsoft Research AI
43 |   * [Homepage](https://www.microsoft.com/en-us/research/people/cxiong/)
44 | - Maosong Sun
45 |   * Tsinghua University
46 |   * [Homepage](http://nlp.csai.tsinghua.edu.cn/staff/sms/)
47 | 
48 | ## Acknowledgments
49 | 
50 | Our implementation uses [Tevatron](https://github.com/texttron/tevatron) as the starting point. We thank its authors for their contributions.
51 | 
52 | ## Contact
53 | 
54 | Please send an email to yushi17@foxmail.com.
55 | 


--------------------------------------------------------------------------------
/docs/scale-t5-weights.md:
--------------------------------------------------------------------------------
 1 | # T5 Weights Scaling
 2 | 
 3 | For stable mixed-precision training on NVIDIA GPUs, it's recommended to scale the weights of the pre-trained T5 model. 
 4 | 
 5 | First, you need to manually download the T5 model. Search for your model on Hugging Face, and switch to the "Files and versions" tab. Right-click the download arrows, copy the download links of `config.json`, `pytorch_model.bin`, `spiece.model` and `tokenizer.json`, and download these files into a local directory.
 6 | 
 7 | Run the following command to scale the weights:
 8 | 
 9 | ```bash
10 | python scripts/scale_t5_weights.py --input_model_path /path/to/t5-base  --output_model_path /path/to/t5-base-scaled  --num_layers 12
11 | ```
12 | 
13 | For larger T5 models, change `--num_layers` to the corresponding number of model layers.


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=42"]
3 | build-backend = "setuptools.build_meta"
4 | 


--------------------------------------------------------------------------------
/scripts/kilt-dpr/convert_to_evaluation.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import json
 3 | from tqdm import tqdm
 4 | 
 5 | 
 6 | if __name__ == "__main__":
 7 |     parser = argparse.ArgumentParser()
 8 |     parser.add_argument("--kilt_queries_file", type=str)
 9 |     parser.add_argument("--provenance_file", type=str)
10 |     parser.add_argument("--output_evaluation_file", type=str)
11 |     args = parser.parse_args()
12 | 
13 |     raw_data = []
14 |     with open(args.kilt_queries_file, "r") as f:
15 |         for line in f:
16 |             raw_data.append(json.loads(line))
17 | 
18 |     with open(args.provenance_file, "r") as f:
19 |         provenance = json.load(f)
20 | 
21 |     # consider only valid data - filter out invalid
22 |     validated_data = {}
23 |     query_data = []
24 |     for element in raw_data:
25 |         #if utils.validate_datapoint(element, logger=None):
26 |         if element["id"] in validated_data:
27 |             raise ValueError("ids are not unique in input data!")
28 |         validated_data[element["id"]] = element
29 |         query_data.append(
30 |             {"query": element["input"], "id": element["id"]}
31 |         )
32 | 
33 |     if len(provenance) != len(query_data):
34 |         print("WARNING: provenance and query data are not of the same length!")
35 | 
36 |     # write prediction files
37 |     if provenance:
38 |         print("writing prediction file to {}".format(args.output_evaluation_file))
39 | 
40 |         predictions = []
41 |         for query_id in provenance.keys():
42 |             element = validated_data[query_id]
43 |             new_output = [{"provenance": provenance[query_id]}]
44 |             # append the answers
45 |             if "output" in element:
46 |                 for o in element["output"]:
47 |                     if "answer" in o:
48 |                         new_output.append({"answer": o["answer"]})
49 |             element["output"] = new_output
50 |             predictions.append(element)
51 | 
52 |     with open(args.output_evaluation_file, "w") as outfile:
53 |         for p in predictions:
54 |             json.dump(p, outfile)
55 |             outfile.write("\n")


--------------------------------------------------------------------------------
/scripts/kilt-dpr/convert_trec_to_provenance.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import json
 3 | import csv
 4 | from tqdm import tqdm
 5 | 
 6 | 
 7 | if __name__ == "__main__":
 8 |     parser = argparse.ArgumentParser()
 9 |     parser.add_argument("--trec_file", type=str)
10 |     parser.add_argument("--kilt_queries_file", type=str, default=None)
11 |     parser.add_argument("--passage_collection", type=str)
12 |     parser.add_argument("--output_provenance_file", type=str)
13 |     args = parser.parse_args()
14 | 
15 |     queries = []
16 |     if args.kilt_queries_file is not None:
17 |         with open(args.kilt_queries_file, "r") as f:
18 |             for line in f:
19 |                 obj = json.loads(line)
20 |                 queries.append(obj)
21 | 
22 |     pid2content = []
23 |     with open(args.passage_collection, "r") as f:
24 |         reader = csv.reader(f, delimiter="\t")
25 |         next(reader)
26 |         i = 0
27 |         for row in tqdm(reader):
28 |             pid, text, wikipedia_title, wikipedia_id, _, _ = row
29 |             pid = int(pid)
30 |             assert pid == i
31 |             pid2content.append({"text": text, "wikipedia_title": wikipedia_title, "wikipedia_id": wikipedia_id})
32 |             i += 1
33 | 
34 |     provenance = {}
35 |     with open(args.trec_file, "r") as f:
36 |         last_qid = 0
37 |         for line in f:
38 |             qid, _, pid, rank, score, _ = line.strip().split()
39 |             qid = int(qid)
40 |             pid = int(pid)
41 |             rank = int(rank)
42 |             real_qid = queries[qid - 1]["id"] if len(queries) > 0 else str(qid)
43 |             if qid != last_qid:  # new query
44 |                 provenance[real_qid] = []
45 |                 last_qid = qid
46 |             provenance[real_qid].append({"score": score, "text": pid2content[pid]["text"], "wikipedia_title": pid2content[pid]["wikipedia_title"], "wikipedia_id": pid2content[pid]["wikipedia_id"]})
47 |     
48 |     with open(args.output_provenance_file, "w") as f:
49 |         json.dump(provenance, f, indent=4)


--------------------------------------------------------------------------------
/scripts/nq-dpr/build_train.py:
--------------------------------------------------------------------------------
 1 | # Adapted from Tevatron (https://github.com/texttron/tevatron)
 2 | 
 3 | import json
 4 | import os
 5 | from argparse import ArgumentParser
 6 | 
 7 | from transformers import AutoTokenizer, PreTrainedTokenizer
 8 | from tqdm import tqdm
 9 | from openmatch.utils import fill_template
10 | 
# Build a tokenized training file: for every input item with enough hard
# negatives and at least one positive, write one JSON line holding the token ids
# of the query, the positives and the hard negatives.
parser = ArgumentParser()
parser.add_argument('--input', type=str, required=True)
parser.add_argument('--output', type=str, required=True)
parser.add_argument('--query_template', type=str)
parser.add_argument('--doc_template', type=str)
parser.add_argument('--tokenizer', type=str, required=False, default='bert-base-uncased')
parser.add_argument('--minimum-negatives', type=int, required=False, default=1)
args = parser.parse_args()

tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(args.tokenizer, use_fast=True)

# Close the input file deterministically instead of leaking the handle.
with open(args.input) as fin:
    data = json.load(fin)

# An output path without a directory component (e.g. "train.jsonl") yields an
# empty dirname; os.makedirs('') would raise, so only create a real directory.
save_dir = os.path.dirname(args.output)
if save_dir:
    os.makedirs(save_dir, exist_ok=True)

with open(args.output, 'w') as f:
    for item in tqdm(data):
        # Skip items with too few hard negatives or without any positive.
        if len(item['hard_negative_ctxs']) < args.minimum_negatives or len(item['positive_ctxs']) < 1:
            continue

        positives = [fill_template(args.doc_template, pos) for pos in item['positive_ctxs']]
        negatives = [fill_template(args.doc_template, neg) for neg in item['hard_negative_ctxs']]

        # Queries are truncated to 32 tokens and passages to 128; no special
        # tokens are added at this stage.
        query = tokenizer.encode(
            fill_template(args.query_template, item),
            add_special_tokens=False, max_length=32, truncation=True)
        positives = tokenizer(
            positives, add_special_tokens=False, max_length=128, truncation=True, padding=False)['input_ids']
        negatives = tokenizer(
            negatives, add_special_tokens=False, max_length=128, truncation=True, padding=False)['input_ids']

        group = {
            'query': query,
            'positives': positives,
            'negatives': negatives,
        }
        f.write(json.dumps(group) + '\n')
52 | 


--------------------------------------------------------------------------------
/scripts/scale_t5_weights.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | from transformers import AutoModel
 3 | import copy
 4 | import argparse
 5 | import os
 6 | import shutil
 7 | import glob
 8 | 
 9 | 
if __name__ == "__main__":
    # Divide selected T5 weights by fixed factors and save the result as a new
    # checkpoint directory, copying the remaining model files alongside it.
    #
    # Arguments:
    #   --input_model_path   directory of the original model.
    #   --output_model_path  directory to create for the scaled model.
    #   --num_layers         number of encoder/decoder blocks to rescale (default 12).
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_model_path", type=str)
    parser.add_argument("--output_model_path", type=str)
    parser.add_argument("--num_layers", type=int, default=12)
    args = parser.parse_args()

    # scale model weights
    original_model = AutoModel.from_pretrained(args.input_model_path)
    # Work on a deep copy so the in-place divisions below do not touch the
    # tensors owned by the loaded model.
    new_state_dict = copy.deepcopy(original_model.state_dict())

    for i in range(args.num_layers):
        new_state_dict[f'encoder.block.{i}.layer.0.SelfAttention.o.weight'] /= 100
        new_state_dict[f'encoder.block.{i}.layer.1.DenseReluDense.wi.weight'] /= 10
        new_state_dict[f'encoder.block.{i}.layer.1.DenseReluDense.wo.weight'] /= 10

        new_state_dict[f'decoder.block.{i}.layer.1.EncDecAttention.o.weight'] /= 100
        new_state_dict[f'decoder.block.{i}.layer.0.SelfAttention.o.weight'] /= 100
        new_state_dict[f'decoder.block.{i}.layer.2.DenseReluDense.wi.weight'] /= 10
        new_state_dict[f'decoder.block.{i}.layer.2.DenseReluDense.wo.weight'] /= 10
    new_state_dict['shared.weight'] /= 100

    os.makedirs(args.output_model_path, exist_ok=True)
    torch.save(new_state_dict, os.path.join(args.output_model_path, "pytorch_model.bin"))

    # copy other files
    for file in glob.glob(os.path.join(args.input_model_path, "*")):
        # shutil.copy cannot copy directories; skip them rather than crash after
        # the scaled weights have already been written.
        if not os.path.isfile(file):
            continue
        # The original weights are replaced by the scaled ones saved above.
        if os.path.basename(file) == "pytorch_model.bin":
            continue
        shutil.copy(file, args.output_model_path)


--------------------------------------------------------------------------------
/scripts/split_embeddings.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import argparse
 3 | import pickle
 4 | from tqdm import trange
 5 | 
 6 | 
 7 | if __name__ == "__main__":
 8 |     parser = argparse.ArgumentParser()
 9 |     parser.add_argument("--input_embedding", type=str)
10 |     parser.add_argument("--output_embeddings", type=str)
11 |     parser.add_argument("--num_splits", type=int, default=2)
12 |     args = parser.parse_args()
13 | 
14 |     with open(args.input_embedding, "rb") as f:
15 |         embedding, lookup = pickle.load(f)
16 |         lookup = np.array(lookup)
17 |     
18 |     for split in trange(args.num_splits):
19 |         embedding_split = embedding[split::args.num_splits]
20 |         lookup_split = lookup[split::args.num_splits]
21 |         with open(args.output_embeddings + f".{split}", "wb") as f:
22 |             pickle.dump((embedding_split, lookup_split.tolist()), f, protocol=4)


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import setuptools
 2 | 
 3 | with open("README.md", "r", encoding="utf-8") as fh:
 4 |     long_description = fh.read()
 5 | 
 6 | setuptools.setup(
 7 |     name="openmatch-thunlp",
 8 |     version="0.0.1",
 9 |     author="Shi Yu",
10 |     author_email="yushi17@foxmail.com",
11 |     description="An python package for research on Information Retrieval",
12 |     long_description=long_description,
13 |     long_description_content_type="text/markdown",
14 |     classifiers=[
15 |         "Programming Language :: Python :: 3",
16 |         "License :: OSI Approved :: MIT License",
17 |         "Topic :: Text Processing :: Indexing",
18 |         "Intended Audience :: Information Technology"
19 |     ],
20 |     package_dir={"": "src"},
21 |     packages=setuptools.find_packages(where="src"),
22 |     python_requires=">=3.7",
23 |     install_requires=[
24 |         "transformers>=4.10.0",
25 |         "sentencepiece",
26 |         "datasets>=1.1.3"
27 |     ]
28 | )
29 | 


--------------------------------------------------------------------------------
/src/openmatch/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/src/openmatch/__init__.py


--------------------------------------------------------------------------------
/src/openmatch/dataset/__init__.py:
--------------------------------------------------------------------------------
1 | from .beir_dataset import BEIRQueryDataset, BEIRCorpusDataset, BEIRDataset
2 | from .data_collator import DRInferenceCollator, QPCollator, PairCollator, RRInferenceCollator
3 | from .inference_dataset import JsonlDataset, TsvDataset, InferenceDataset
4 | from .train_dataset import DRTrainDataset, DREvalDataset, RRTrainDataset, RREvalDataset


--------------------------------------------------------------------------------
/src/openmatch/driver/build_index.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import sys
 3 | 
 4 | from openmatch.arguments import DataArguments
 5 | from openmatch.arguments import InferenceArguments as EncodingArguments
 6 | from openmatch.arguments import ModelArguments
 7 | from openmatch.dataset import InferenceDataset
 8 | from openmatch.modeling import DRModelForInference
 9 | from openmatch.retriever import Retriever
10 | from transformers import AutoConfig, AutoTokenizer, HfArgumentParser
11 | 
12 | 
def main():
    """Parse the arguments, load the model and corpus, and build the embeddings.

    Arguments come either from a single ``.json`` file given as the only
    command-line argument, or from regular command-line flags. The parsed
    ``ModelArguments``, ``DataArguments`` and ``InferenceArguments`` are used
    to load the config, tokenizer, model and corpus dataset, which are then
    passed to ``Retriever.build_embeddings``.
    """
    arg_parser = HfArgumentParser((ModelArguments, DataArguments, EncodingArguments))

    argv = sys.argv
    use_json_file = len(argv) == 2 and argv[1].endswith(".json")
    if use_json_file:
        parsed = arg_parser.parse_json_file(json_file=os.path.abspath(argv[1]))
    else:
        parsed = arg_parser.parse_args_into_dataclasses()
    model_args, data_args, encoding_args = parsed

    # Fall back to the model path when no dedicated config/tokenizer name is given.
    config_source = model_args.config_name or model_args.model_name_or_path
    tokenizer_source = model_args.tokenizer_name or model_args.model_name_or_path

    config = AutoConfig.from_pretrained(
        config_source,
        num_labels=1,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        tokenizer_source,
        cache_dir=model_args.cache_dir,
    )

    model = DRModelForInference.build(
        model_args=model_args,
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Loaded with stream=True and is_query=False, using the batch size, process
    # count and process index from the inference arguments.
    corpus_dataset = InferenceDataset.load(
        tokenizer=tokenizer,
        data_args=data_args,
        is_query=False,
        stream=True,
        batch_size=encoding_args.per_device_eval_batch_size,
        num_processes=encoding_args.world_size,
        process_index=encoding_args.process_index,
        cache_dir=model_args.cache_dir
    )

    Retriever.build_embeddings(model, corpus_dataset, encoding_args)


if __name__ == '__main__':
    main()
56 | 


--------------------------------------------------------------------------------
/src/openmatch/modeling/__init__.py:
--------------------------------------------------------------------------------
1 | from .dense_retrieval_model import DRModel, DRModelForInference, DROutput
2 | from .reranking_model import RRModel, RROutput


--------------------------------------------------------------------------------
/src/openmatch/modeling/linear.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import os
 3 | import json
 4 | 
 5 | import torch
 6 | import torch.nn as nn
 7 | from torch import Tensor
 8 | 
 9 | logger = logging.getLogger(__name__)
10 | 
11 | 
class LinearHead(nn.Module):
    """A single bias-free linear projection applied to a representation.

    The constructor arguments are kept in ``self.config`` so that ``save`` can
    write them next to the weights and ``load`` can rebuild the same head.
    """

    def __init__(
            self,
            input_dim: int = 768,
            output_dim: int = 768,
    ):
        super(LinearHead, self).__init__()
        self.linear = nn.Linear(input_dim, output_dim, bias=False)
        self.config = {'input_dim': input_dim, 'output_dim': output_dim}

    def forward(self, rep: Tensor = None):
        """Project ``rep`` through the linear layer and return the result."""
        return self.linear(rep)

    @classmethod
    def load(cls, ckpt_dir: str):
        """Rebuild a head from ``ckpt_dir``.

        Reads the constructor arguments from ``head_config.json`` and the
        weights from ``linear.pt``. Returns the restored head, whose parameters
        are on the CPU.
        """
        logger.info(f'Loading linear head from {ckpt_dir}')
        model_path = os.path.join(ckpt_dir, 'linear.pt')
        config_path = os.path.join(ckpt_dir, 'head_config.json')
        with open(config_path, 'r') as f:
            config = json.load(f)
        model = cls(**config)
        # Map the stored tensors to CPU first: a checkpoint written from a GPU
        # would otherwise fail to load on a machine without that device. The
        # freshly built module lives on the CPU anyway.
        model.load_state_dict(torch.load(model_path, map_location='cpu'))
        return model

    def save(self, save_path):
        """Write ``linear.pt`` and ``head_config.json`` into ``save_path``.

        The directory is created if it does not exist yet.
        """
        os.makedirs(save_path, exist_ok=True)
        torch.save(self.state_dict(), os.path.join(save_path, 'linear.pt'))
        with open(os.path.join(save_path, 'head_config.json'), 'w') as f:
            json.dump(self.config, f, indent=4)


--------------------------------------------------------------------------------
/src/openmatch/retriever/__init__.py:
--------------------------------------------------------------------------------
1 | from .dense_retriever import Retriever, SuccessiveRetriever
2 | from .reranker import RRPredictDataset, Reranker


--------------------------------------------------------------------------------
/src/openmatch/trainer/__init__.py:
--------------------------------------------------------------------------------
1 | from .dense_trainer import DRTrainer, GCDenseTrainer
2 | from .reranker_trainer import RRTrainer


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/build_index.sh:
--------------------------------------------------------------------------------
# Build an index named $dataset_name from the JSON collection found under
# $data_path/corpus, using ./bin/IndexCollection.
#
# Fill in both variables before running. The comment marker must be separated
# from the value by whitespace: `data_path=## ...` would assign the literal
# string "##" and export the following words as variable names.
export dataset_name=  ## you need to set this
export data_path=  ## you need to set this

# Stop with a clear message if either variable was left empty.
: "${dataset_name:?dataset_name must be set}"
: "${data_path:?data_path must be set}"

# Quote the expansions so that paths containing spaces are passed intact.
./bin/IndexCollection -collection JsonCollection -input "$data_path/corpus" -index "$dataset_name" -generator LuceneDocumentGenerator -threads 8 -storePositions -storeDocvectors -storeRawDocs
5 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/HdrHistogram-2.1.9.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/HdrHistogram-2.1.9.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/aggs-matrix-stats-client-7.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/aggs-matrix-stats-client-7.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/annotations-java5-19.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/annotations-java5-19.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/anserini-0.7.3-SNAPSHOT.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/anserini-0.7.3-SNAPSHOT.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/anserini-fastutil-6.5.6.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/anserini-fastutil-6.5.6.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/ant-1.9.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/ant-1.9.1.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/ant-launcher-1.9.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/ant-launcher-1.9.1.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/args4j-2.32.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/args4j-2.32.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/cbor-0.7.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/cbor-0.7.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-codec-1.11.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-codec-1.11.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-compress-1.18.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-compress-1.18.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-io-2.5.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-io-2.5.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-lang3-3.5.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-lang3-3.5.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-logging-1.1.3.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-logging-1.1.3.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-math3-3.6.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-math3-3.6.1.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-pool2-2.6.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-pool2-2.6.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/compiler-0.9.3.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/compiler-0.9.3.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-7.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-7.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-cli-7.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-cli-7.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-core-7.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-core-7.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-geo-7.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-geo-7.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-rest-client-7.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-rest-client-7.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-rest-high-level-client-7.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-rest-high-level-client-7.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-secure-sm-7.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-secure-sm-7.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-x-content-7.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-x-content-7.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/guava-18.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/guava-18.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/hppc-0.7.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/hppc-0.7.1.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/http2-client-9.4.19.v20190610.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/http2-client-9.4.19.v20190610.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/http2-common-9.4.19.v20190610.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/http2-common-9.4.19.v20190610.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/http2-hpack-9.4.19.v20190610.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/http2-hpack-9.4.19.v20190610.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/http2-http-client-transport-9.4.19.v20190610.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/http2-http-client-transport-9.4.19.v20190610.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/httpasyncclient-4.1.4.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/httpasyncclient-4.1.4.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/httpclient-4.5.6.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/httpclient-4.5.6.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/httpcore-4.4.10.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/httpcore-4.4.10.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/httpcore-nio-4.4.11.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/httpcore-nio-4.4.11.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/httpmime-4.5.6.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/httpmime-4.5.6.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-annotations-2.10.0.pr1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-annotations-2.10.0.pr1.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-core-2.10.0.pr1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-core-2.10.0.pr1.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-databind-2.10.0.pr1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-databind-2.10.0.pr1.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-dataformat-cbor-2.8.11.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-dataformat-cbor-2.8.11.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-dataformat-smile-2.8.11.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-dataformat-smile-2.8.11.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-dataformat-yaml-2.10.0.pr1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-dataformat-yaml-2.10.0.pr1.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-datatype-jdk8-2.10.0.pr1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-datatype-jdk8-2.10.0.pr1.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jcl-over-slf4j-1.7.24.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jcl-over-slf4j-1.7.24.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jetty-alpn-client-9.4.19.v20190610.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jetty-alpn-client-9.4.19.v20190610.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jetty-alpn-java-client-9.4.19.v20190610.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jetty-alpn-java-client-9.4.19.v20190610.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jetty-client-9.4.19.v20190610.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jetty-client-9.4.19.v20190610.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jetty-http-9.4.19.v20190610.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jetty-http-9.4.19.v20190610.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jetty-io-9.4.19.v20190610.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jetty-io-9.4.19.v20190610.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jetty-util-9.4.19.v20190610.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jetty-util-9.4.19.v20190610.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jna-4.5.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jna-4.5.1.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/joda-time-2.10.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/joda-time-2.10.1.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jopt-simple-5.0.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jopt-simple-5.0.2.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jsoup-1.8.3.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jsoup-1.8.3.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jsr305-2.0.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jsr305-2.0.1.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lang-mustache-client-7.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lang-mustache-client-7.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/log4j-api-2.12.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/log4j-api-2.12.1.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/log4j-core-2.12.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/log4j-core-2.12.1.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-analyzers-common-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-analyzers-common-8.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-backward-codecs-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-backward-codecs-8.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-core-8.3.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-core-8.3.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-grouping-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-grouping-8.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-highlighter-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-highlighter-8.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-join-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-join-8.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-memory-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-memory-8.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-misc-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-misc-8.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-queries-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-queries-8.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-queryparser-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-queryparser-8.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-sandbox-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-sandbox-8.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-spatial-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-spatial-8.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-spatial-extras-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-spatial-extras-8.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-spatial3d-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-spatial3d-8.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-suggest-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-suggest-8.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/mockito-all-1.10.19.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/mockito-all-1.10.19.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-buffer-4.1.29.Final.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-buffer-4.1.29.Final.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-codec-4.1.29.Final.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-codec-4.1.29.Final.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-common-4.1.29.Final.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-common-4.1.29.Final.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-handler-4.1.29.Final.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-handler-4.1.29.Final.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-resolver-4.1.29.Final.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-resolver-4.1.29.Final.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-transport-4.1.29.Final.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-transport-4.1.29.Final.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-transport-native-epoll-4.1.29.Final.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-transport-native-epoll-4.1.29.Final.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-transport-native-unix-common-4.1.29.Final.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-transport-native-unix-common-4.1.29.Final.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/parent-join-client-7.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/parent-join-client-7.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/rank-eval-client-7.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/rank-eval-client-7.0.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/sesame-model-4.1.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/sesame-model-4.1.2.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/sesame-rio-api-4.1.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/sesame-rio-api-4.1.2.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/sesame-rio-datatypes-4.1.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/sesame-rio-datatypes-4.1.2.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/sesame-rio-languages-4.1.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/sesame-rio-languages-4.1.2.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/sesame-rio-ntriples-4.1.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/sesame-rio-ntriples-4.1.2.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/sesame-util-4.1.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/sesame-util-4.1.2.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/slf4j-api-1.7.24.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/slf4j-api-1.7.24.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/slf4j-simple-1.7.29.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/slf4j-simple-1.7.29.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/snakeyaml-1.24.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/snakeyaml-1.24.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/solr-solrj-8.3.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/solr-solrj-8.3.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/stax2-api-3.1.4.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/stax2-api-3.1.4.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/t-digest-3.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/t-digest-3.2.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/trec-car-tools-java-13.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/trec-car-tools-java-13.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/twitter-text-2.0.10.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/twitter-text-2.0.10.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/wdtk-datamodel-0.10.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/wdtk-datamodel-0.10.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/wdtk-dumpfiles-0.10.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/wdtk-dumpfiles-0.10.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/wdtk-storage-0.10.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/wdtk-storage-0.10.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/wdtk-util-0.10.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/wdtk-util-0.10.0.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/wikiclean-1.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/wikiclean-1.1.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/woodstox-core-asl-4.4.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/woodstox-core-asl-4.4.1.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/xz-1.5.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/xz-1.5.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/zookeeper-3.5.5.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/zookeeper-3.5.5.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/zookeeper-jute-3.5.5.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/zookeeper-jute-3.5.5.jar


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/retrieve.sh:
--------------------------------------------------------------------------------
# Runs ./bin/SearchCollection with BM25 scoring over the index named by
# dataset_name, reading queries from $data_path/qid2query.tsv and writing
# the ranking to $data_path/bm25_retrieval.trec.
export dataset_name= ## you need to set this
export generator_folder=qg_t5-base ## qg_t5-small ; qg_t5-base
export data_path= ## you need to set this

# Stop with a clear message if the placeholders above were left empty,
# instead of handing empty arguments to SearchCollection.
: "${dataset_name:?dataset_name must be set at the top of this script}"
: "${data_path:?data_path must be set at the top of this script}"

# Quote every expansion so that paths containing spaces stay single arguments.
./bin/SearchCollection -index "$dataset_name" -topicreader TsvString -topics "$data_path/qid2query.tsv" -bm25 -output "$data_path/bm25_retrieval.trec"
6 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/contrastqg/__init__.py:
--------------------------------------------------------------------------------
 1 | from .transformers import (
 2 |     MODEL_FOR_QUESTION_ANSWERING_MAPPING,
 3 |     WEIGHTS_NAME,
 4 |     AdamW,
 5 |     AutoConfig,
 6 |     AutoTokenizer,
 7 |     get_linear_schedule_with_warmup,
 8 |     ModuleUtilsMixin,
 9 |     BertSelfAttention,
10 |     BertPreTrainedModel,
11 |     T5Tokenizer, 
12 |     T5ForConditionalGeneration,
13 | )
14 | from . import dataloaders


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/contrastqg/dataloaders/__init__.py:
--------------------------------------------------------------------------------
 1 | def select_tokenizer(args): 
 2 |     if "t5" in args.pretrain_generator_type:
 3 |         return {"gen_tokenizer":T5_Tokenizer(args)}
 4 |     raise ValueError('Invalid generator class: %s' % args.pretrain_generator_type)
 5 |     
 6 |     
 7 | def select_data_loader(args, do_finetune=False):
 8 |     dataloder_dict = {"build_generate_dataset":generate_dataset}
 9 |     
10 |     if "t5" in args.pretrain_generator_type:
11 |         dataloder_dict["gen_batchify"] = t5_batchify_for_test
12 |         return dataloder_dict
13 |     
14 |     raise ValueError('Invalid generator class: %s' % args.pretrain_generator_type)
15 |     
16 | 
17 | from .generate_loader import generate_dataset
18 | from .t5_utils import (
19 |     T5_Tokenizer,
20 |     t5_batchify_for_test,
21 | )
22 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/contrastqg/dataloaders/generate_loader.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import logging
 3 | import torch
 4 | from torch.utils.data import Dataset
 5 | 
 6 | from . import loader_utils
 7 | from .t5_utils import t5_pair_converter, t5_single_converter
 8 | logger = logging.getLogger()
 9 | 
10 | 
11 | 
# Dispatch table keyed by ``args.generator_mode``: generate_dataset.__getitem__
# looks up the converter here and calls it for the requested example.
generate_feature_converter = {
    "contrastqg":t5_pair_converter,
    "qg":t5_single_converter
}
16 | 
17 | 
class generate_dataset(Dataset):
    """Dataset of generation examples paired with a docid -> document lookup."""

    def __init__(
        self,
        args,
        data_dir,
        tokenizer,
    ):
        """
        :param args: must expose ``generator_mode``; ``pretrain_generator_type``
            is also read when the mode is ``"contrastqg"``
        :param data_dir: directory holding the examples file and docid2doc.jsonl
        :param tokenizer: stored and handed to the feature converter on access
        """
        # The examples file depends on the generation mode.
        if args.generator_mode == "contrastqg":
            examples_file = "qg_%s/contrast_pairs.jsonl" % args.pretrain_generator_type
        else:
            examples_file = "pos_docids.jsonl"
        loaded_examples = loader_utils.load_json2list(os.path.join(data_dir, examples_file))
        logger.info('[%s] needs generate %d examples'%(args.generator_mode, len(loaded_examples)))

        # docid -> document text lookup
        corpus = loader_utils.load_json2dict(
            os.path.join(data_dir, "docid2doc.jsonl"),
            id_name="docid",
            text_key="doc",
        )

        self.args = args
        # "qid2query" stays None until reset_qid2query() supplies it.
        self.dataset = {"docid2doc": corpus, "qid2query": None}
        self.tokenizer = tokenizer
        self.examples = loaded_examples

    def __len__(self):
        """Number of examples currently held."""
        return len(self.examples)

    def reset_examples(self, examples):
        """Replace the example list."""
        self.examples = examples

    def reset_qid2query(self, qid2query):
        """Install the ``qid2query`` entry of the shared dataset dict."""
        self.dataset["qid2query"] = qid2query

    def __getitem__(self, index):
        """Convert example ``index`` with the converter for the active mode."""
        converter = generate_feature_converter[self.args.generator_mode]
        return converter(
            index,
            ex=self.examples[index],
            dataset=self.dataset,
            tokenizer=self.tokenizer,
        )


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/contrastqg/dataloaders/loader_utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import re
 3 | import json
 4 | import logging
 5 | from tqdm import tqdm
 6 |         
 7 | logger = logging.getLogger()
 8 | 
 9 | 
10 | 
11 | 
def load_corpus(data_dir):
    """
    Load the docid -> document mapping stored in ``docid2doc.jsonl``.

    :param data_dir: directory containing ``docid2doc.jsonl``
    :return: dict mapping each record's ``"docid"`` value to its ``"doc"`` value
    """
    # load docid2doc
    logger.info('start load corpus ...')
    corpus = load_json2dict(
        os.path.join(data_dir, "docid2doc.jsonl"),
        id_name="docid",
        text_key="doc",
    )
    # The original returned an undefined name here and raised NameError.
    return corpus
25 | 
26 |     
def load_json2list(file_path):
    """
    Read a JSON-lines file into a list, one decoded object per non-blank line.

    :param file_path: path to a UTF-8 encoded ``.jsonl`` file
    :return: list of decoded JSON values in file order
    :raises json.JSONDecodeError: if a non-blank line is not valid JSON
    """
    with open(file_path, mode='r', encoding='utf-8') as fi:
        # Skip blank lines (e.g. a trailing newline at end of file); they
        # carry no record and would make json.loads fail.
        return [json.loads(line) for line in tqdm(fi) if line.strip()]
35 | 
36 | 
def load_json2dict(file_path, id_name, text_key):
    """
    Read a JSON-lines file into a dict keyed by one field of each record.

    :param file_path: path to a UTF-8 encoded ``.jsonl`` file
    :param id_name: record field whose value becomes the dict key
    :param text_key: record field whose value becomes the dict value
    :return: dict of ``record[id_name] -> record[text_key]``; when an id
        repeats, the last record wins
    :raises KeyError: if a record lacks ``id_name`` or ``text_key``
    """
    data_dict = {}
    with open(file_path, mode='r', encoding='utf-8') as fi:
        for line in tqdm(fi):
            # Skip blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            record = json.loads(line)
            data_dict[record[id_name]] = record[text_key]
    return data_dict
45 | 
46 | 
47 | # ---------------------------------------------------------------------------
48 | # ---------------------------------------------------------------------------
def save_dict2jsonl(data_dict, output_path, id_name, text_key):
    """
    Write ``data_dict`` as JSON lines, one ``{id_name: key, text_key: value}``
    object per entry, so that ``load_json2dict`` can read the file back.

    :param data_dict: mapping to serialise
    :param output_path: destination file (overwritten if it exists)
    :param id_name: field name used for each key
    :param text_key: field name used for each value
    """
    with open(output_path, mode='w', encoding='utf-8') as fo:
        for key, text in data_dict.items():
            # ensure_ascii=False keeps non-ASCII text readable; the file is UTF-8.
            fo.write(json.dumps({id_name: key, text_key: text}, ensure_ascii=False) + "\n")


def save_tokenized_corpus(dataset, cache_dir):
    """
    Save ``dataset["docid2doc"]`` to ``<cache_dir>/docid2doc.jsonl``.

    :param dataset: dict that has the key ``"docid2doc"``
    :param cache_dir: output directory; created (with parents) when missing
    """
    # makedirs handles nested paths and an already-existing directory.
    os.makedirs(cache_dir, exist_ok=True)

    save_dict2jsonl(
        data_dict=dataset["docid2doc"],
        output_path=os.path.join(cache_dir, "docid2doc.jsonl"),
        id_name="docid",
        text_key="doc"
    )
63 |     
64 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/activations.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | import math
 3 | 
 4 | import torch
 5 | import torch.nn.functional as F
 6 | 
 7 | 
 8 | logger = logging.getLogger(__name__)
 9 | 
10 | 
def swish(x):
    """Swish activation: the input multiplied element-wise by its own sigmoid."""
    gate = torch.sigmoid(x)
    return x * gate
13 | 
14 | 
def _gelu_python(x):
    """ Original Implementation of the gelu activation function in Google Bert repo when initially created.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
        This is now written in C in torch.nn.functional
        Also see https://arxiv.org/abs/1606.08415
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def gelu_new(x):
    """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
        Also see https://arxiv.org/abs/1606.08415
    """
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))


# Select the gelu implementation by comparing the torch version numerically.
# A plain string comparison is lexicographic and would order "1.10.0" before
# "1.4.0", wrongly picking the Python fallback on torch 1.10 - 1.13.
_torch_major_minor = tuple(int(part) for part in torch.__version__.split(".")[:2])
if _torch_major_minor < (1, 4):
    gelu = _gelu_python
else:
    gelu = F.gelu
36 | 
37 | 
def gelu_fast(x):
    """Tanh-based gelu approximation using the literal constant 0.7978845608."""
    inner = x * 0.7978845608 * (1.0 + 0.044715 * x * x)
    return 0.5 * x * (1.0 + torch.tanh(inner))
40 | 
41 | 
# Registry of activation functions by name; get_activation() looks names up here.
# "gelu" is whichever implementation was selected above for the running torch version.
ACT2FN = {
    "relu": F.relu,
    "swish": swish,
    "gelu": gelu,
    "tanh": torch.tanh,
    "gelu_new": gelu_new,
    "gelu_fast": gelu_fast,
}
50 | 
51 | 
def get_activation(activation_string):
    """Return the activation function registered under ``activation_string``.

    :raises KeyError: if the name is not a key of ``ACT2FN``; the message
        lists the available names.
    """
    if activation_string not in ACT2FN:
        raise KeyError("function {} not found in ACT2FN mapping {}".format(activation_string, list(ACT2FN.keys())))
    return ACT2FN[activation_string]
57 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/benchmark/__init__.py:
--------------------------------------------------------------------------------
 1 | # flake8: noqa
 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this
 3 | # module, but to preserve other warnings. So, don't check this module at all.
 4 | 
 5 | from ..file_utils import is_torch_available
 6 | 
 7 | 
 8 | if is_torch_available():
 9 |     from .benchmark_args import PyTorchBenchmarkArguments
10 |     from .benchmark import PyTorchBenchmark
11 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/commands/__init__.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod
 2 | from argparse import ArgumentParser
 3 | 
 4 | 
 5 | class BaseTransformersCLICommand(ABC):
 6 |     @staticmethod
 7 |     @abstractmethod
 8 |     def register_subcommand(parser: ArgumentParser):
 9 |         raise NotImplementedError()
10 | 
11 |     @abstractmethod
12 |     def run(self):
13 |         raise NotImplementedError()
14 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/commands/download.py:
--------------------------------------------------------------------------------
 1 | from argparse import ArgumentParser
 2 | 
 3 | from transformers.commands import BaseTransformersCLICommand
 4 | 
 5 | 
 6 | def download_command_factory(args):
 7 |     return DownloadCommand(args.model, args.cache_dir, args.force)
 8 | 
 9 | 
class DownloadCommand(BaseTransformersCLICommand):
    """CLI sub-command ``download``: fetches a model and its tokenizer."""

    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        """Register the ``download`` sub-parser and its arguments on ``parser``."""
        subparser = parser.add_parser("download")
        subparser.add_argument(
            "--cache-dir",
            type=str,
            default=None,
            help="Path to location to store the models",
        )
        subparser.add_argument(
            "--force",
            action="store_true",
            help="Force the model to be download even if already in cache-dir",
        )
        subparser.add_argument("model", type=str, help="Name of the model to download")
        subparser.set_defaults(func=download_command_factory)

    def __init__(self, model: str, cache: str, force: bool):
        """Remember the model name, cache directory and force flag."""
        self._model, self._cache, self._force = model, cache, force

    def run(self):
        """Call ``from_pretrained`` on ``AutoModel`` and then ``AutoTokenizer``."""
        # Imported lazily, only when the command actually runs.
        from transformers import AutoModel, AutoTokenizer

        for loader in (AutoModel, AutoTokenizer):
            loader.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force)
33 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/commands/env.py:
--------------------------------------------------------------------------------
 1 | import platform
 2 | from argparse import ArgumentParser
 3 | 
 4 | from transformers import __version__ as version
 5 | from transformers import is_tf_available, is_torch_available
 6 | from transformers.commands import BaseTransformersCLICommand
 7 | 
 8 | 
 9 | def info_command_factory(_):
10 |     return EnvironmentCommand()
11 | 
12 | 
class EnvironmentCommand(BaseTransformersCLICommand):
    """CLI sub-command ``env``: prints an environment report for bug reports."""

    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        """Register the ``env`` sub-parser on ``parser``."""
        env_parser = parser.add_parser("env")
        env_parser.set_defaults(func=info_command_factory)

    def run(self):
        """Collect version/GPU information, print it, and return it as a dict."""
        # PyTorch: placeholders are kept when the library is unavailable.
        pt_version, pt_cuda_available = "not installed", "NA"
        if is_torch_available():
            import torch

            pt_version = torch.__version__
            pt_cuda_available = torch.cuda.is_available()

        # TensorFlow: same placeholder convention.
        tf_version, tf_cuda_available = "not installed", "NA"
        if is_tf_available():
            import tensorflow as tf

            tf_version = tf.__version__
            try:
                # deprecated in v2.1
                tf_cuda_available = tf.test.is_gpu_available()
            except AttributeError:
                # returns list of devices, convert to bool
                tf_cuda_available = bool(tf.config.list_physical_devices("GPU"))

        info = {}
        info["`transformers` version"] = version
        info["Platform"] = platform.platform()
        info["Python version"] = platform.python_version()
        info["PyTorch version (GPU?)"] = "{} ({})".format(pt_version, pt_cuda_available)
        info["Tensorflow version (GPU?)"] = "{} ({})".format(tf_version, tf_cuda_available)
        info["Using GPU in script?"] = "<fill in>"
        info["Using distributed or parallel set-up in script?"] = "<fill in>"

        print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n")
        print(self.format_dict(info))

        return info

    @staticmethod
    def format_dict(d):
        """Render ``d`` as ``- key: value`` lines, each terminated by a newline."""
        bullet_lines = []
        for prop, val in d.items():
            bullet_lines.append("- {}: {}".format(prop, val))
        return "\n".join(bullet_lines) + "\n"
59 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/commands/transformers_cli.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | from argparse import ArgumentParser
 3 | 
 4 | from transformers.commands.convert import ConvertCommand
 5 | from transformers.commands.download import DownloadCommand
 6 | from transformers.commands.env import EnvironmentCommand
 7 | from transformers.commands.run import RunCommand
 8 | from transformers.commands.serving import ServeCommand
 9 | from transformers.commands.user import UserCommands
10 | 
11 | 
def main():
    """Entry point of the ``transformers-cli`` tool.

    Registers every sub-command, parses ``sys.argv``, and runs the selected
    command. When no sub-command is given, prints the help text and exits
    with status 1.
    """
    # Local import: sys.exit is always available, whereas the bare exit()
    # builtin is injected by the site module and is missing under
    # ``python -S`` or in frozen applications.
    import sys

    parser = ArgumentParser("Transformers CLI tool", usage="transformers-cli <command> [<args>]")
    commands_parser = parser.add_subparsers(help="transformers-cli command helpers")

    # Register commands
    ConvertCommand.register_subcommand(commands_parser)
    DownloadCommand.register_subcommand(commands_parser)
    EnvironmentCommand.register_subcommand(commands_parser)
    RunCommand.register_subcommand(commands_parser)
    ServeCommand.register_subcommand(commands_parser)
    UserCommands.register_subcommand(commands_parser)

    # Let's go
    args = parser.parse_args()

    # Each sub-command stores its factory as ``func``; without it no
    # sub-command was selected.
    if not hasattr(args, "func"):
        parser.print_help()
        sys.exit(1)

    # Run
    service = args.func(args)
    service.run()
34 | 
35 | 
36 | if __name__ == "__main__":
37 |     main()
38 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/configuration_camembert.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ CamemBERT configuration """
17 | 
18 | 
19 | import logging
20 | 
21 | from .configuration_roberta import RobertaConfig
22 | 
23 | 
24 | logger = logging.getLogger(__name__)
25 | 
# Maps each pretrained model identifier to the URL of its config JSON file.
CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json",
    "umberto-commoncrawl-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-commoncrawl-cased-v1/config.json",
    "umberto-wikipedia-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-wikipedia-uncased-v1/config.json",
}
31 | 
32 | 
class CamembertConfig(RobertaConfig):
    """
    This class overrides :class:`~transformers.RobertaConfig`. Please check the
    superclass for the appropriate documentation alongside usage examples.

    The only attribute redefined here is ``model_type``.
    """

    # Identifier string for this configuration class.
    model_type = "camembert"
40 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/configuration_marian.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2020 The OPUS-NMT Team, Marian team, and The HuggingFace Inc. team.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """ Marian model configuration """
16 | 
17 | from .configuration_bart import BartConfig
18 | 
19 | 
# Maps each pretrained model identifier to the URL of its config JSON file.
PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "Helsinki-NLP/opus-mt-en-de": "https://s3.amazonaws.com/models.huggingface.co/bert/Helsinki-NLP/opus-mt-en-de/config.json",
}
23 | 
24 | 
class MarianConfig(BartConfig):
    """Marian configuration: a ``BartConfig`` subclass that only redefines ``model_type``."""

    # Identifier string for this configuration class.
    model_type = "marian"
27 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/configuration_mmbt.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright (c) Facebook, Inc. and its affiliates.
 3 | # Copyright (c) HuggingFace Inc. team.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ MMBT configuration """
17 | 
18 | 
19 | import logging
20 | 
21 | 
22 | logger = logging.getLogger(__name__)
23 | 
24 | 
class MMBTConfig:
    """Configuration class to store the configuration of a `MMBT Model`.

    Args:
        config (:obj:`~transformers.PreTrainedConfig`):
            Config of the underlying Transformer models. Its attribute
            dictionary is adopted directly, so a single set of values is used.
        num_labels (:obj:`int` or :obj:`None`, optional, defaults to `None`):
            Size of final Linear layer for classification. Only stored when
            the value is truthy.
        modal_hidden_size (:obj:`int`, optional, defaults to 2048):
            Embedding dimension of the non-text modality encoder.
    """

    def __init__(self, config, num_labels=None, modal_hidden_size=2048):
        # Adopt the wrapped config's attribute dictionary itself (not a copy):
        # attributes set below are therefore visible on ``config`` as well.
        self.__dict__ = config.__dict__
        self.modal_hidden_size = modal_hidden_size
        if not num_labels:
            return
        self.num_labels = num_labels


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/configuration_xlm_roberta.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 3 | # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
 4 | #
 5 | # Licensed under the Apache License, Version 2.0 (the "License");
 6 | # you may not use this file except in compliance with the License.
 7 | # You may obtain a copy of the License at
 8 | #
 9 | #     http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """ XLM-RoBERTa configuration """
17 | 
18 | 
19 | import logging
20 | 
21 | from .configuration_roberta import RobertaConfig
22 | 
23 | 
24 | logger = logging.getLogger(__name__)
25 | 
# Maps each pretrained model identifier to the URL of its config JSON file.
XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json",
    "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json",
    "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-config.json",
    "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-config.json",
    "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-config.json",
    "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-config.json",
}
34 | 
35 | 
class XLMRobertaConfig(RobertaConfig):
    """
    This class overrides :class:`~transformers.RobertaConfig`. Please check the
    superclass for the appropriate documentation alongside usage examples.

    The only attribute redefined here is ``model_type``.
    """

    # Identifier string for this configuration class.
    model_type = "xlm-roberta"
43 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The HuggingFace Inc. team.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert ALBERT checkpoint."""
16 | 
17 | 
18 | import argparse
19 | import logging
20 | 
21 | import torch
22 | 
23 | from transformers import AlbertConfig, AlbertForPreTraining, load_tf_weights_in_albert
24 | 
25 | 
26 | logging.basicConfig(level=logging.INFO)
27 | 
28 | 
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
    """Convert a TensorFlow ALBERT checkpoint into a saved PyTorch state dict.

    :param tf_checkpoint_path: path handed to ``load_tf_weights_in_albert``
    :param albert_config_file: JSON file read by ``AlbertConfig.from_json_file``
    :param pytorch_dump_path: destination passed to ``torch.save``
    """
    # Build an AlbertForPreTraining model from the JSON configuration.
    model_config = AlbertConfig.from_json_file(albert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(model_config)))
    pt_model = AlbertForPreTraining(model_config)

    # Fill the model with the TensorFlow checkpoint weights.
    load_tf_weights_in_albert(pt_model, model_config, tf_checkpoint_path)

    # Persist only the state dict.
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(pt_model.state_dict(), pytorch_dump_path)
41 | 
42 | 
# Command-line entry point: the three paths are all required and are passed
# straight to convert_tf_checkpoint_to_pytorch.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
    )
    parser.add_argument(
        "--albert_config_file",
        default=None,
        type=str,
        required=True,
        help="The config json file corresponding to the pre-trained ALBERT model. \n"
        "This specifies the model architecture.",
    )
    parser.add_argument(
        "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
    )
    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path)
62 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The HuggingFace Inc. team.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert BERT checkpoint."""
16 | 
17 | 
18 | import argparse
19 | import logging
20 | 
21 | import torch
22 | 
23 | from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert
24 | 
25 | 
26 | logging.basicConfig(level=logging.INFO)
27 | 
28 | 
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    """Convert a TensorFlow BERT checkpoint into a saved PyTorch state dict.

    :param tf_checkpoint_path: path handed to ``load_tf_weights_in_bert``
    :param bert_config_file: JSON file read by ``BertConfig.from_json_file``
    :param pytorch_dump_path: destination passed to ``torch.save``
    """
    # Build a BertForPreTraining model from the JSON configuration.
    model_config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(model_config)))
    pt_model = BertForPreTraining(model_config)

    # Fill the model with the TensorFlow checkpoint weights.
    load_tf_weights_in_bert(pt_model, model_config, tf_checkpoint_path)

    # Persist only the state dict.
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    torch.save(pt_model.state_dict(), pytorch_dump_path)
41 | 
42 | 
# Command-line entry point: the three paths are all required and are passed
# straight to convert_tf_checkpoint_to_pytorch.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Required parameters
    parser.add_argument(
        "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path."
    )
    parser.add_argument(
        "--bert_config_file",
        default=None,
        type=str,
        required=True,
        help="The config json file corresponding to the pre-trained BERT model. \n"
        "This specifies the model architecture.",
    )
    parser.add_argument(
        "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
    )
    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path)
62 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import os
 3 | 
 4 | import torch
 5 | 
 6 | from transformers.file_utils import WEIGHTS_NAME
 7 | 
 8 | 
 9 | DIALOGPT_MODELS = ["small", "medium", "large"]
10 | 
11 | OLD_KEY = "lm_head.decoder.weight"
12 | NEW_KEY = "lm_head.weight"
13 | 
14 | 
def convert_dialogpt_checkpoint(checkpoint_path: str, pytorch_dump_folder_path: str):
    """Rename the LM-head weight key of a DialoGPT checkpoint and re-save it.

    The entry stored under ``OLD_KEY`` is moved to ``NEW_KEY`` and the result
    is written to ``<pytorch_dump_folder_path>/<WEIGHTS_NAME>``; the folder is
    created when missing.

    :param checkpoint_path: path of the original checkpoint file
    :param pytorch_dump_folder_path: output directory
    :raises KeyError: if the checkpoint has no ``OLD_KEY`` entry
    """
    # map_location="cpu" lets the conversion run on machines without a GPU
    # even when the checkpoint was saved from CUDA tensors.
    # NOTE(review): torch.load unpickles the file; only use trusted checkpoints.
    d = torch.load(checkpoint_path, map_location="cpu")
    d[NEW_KEY] = d.pop(OLD_KEY)
    os.makedirs(pytorch_dump_folder_path, exist_ok=True)
    torch.save(d, os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME))
20 | 
21 | 
# Command-line entry point: for every size in DIALOGPT_MODELS, converts
# <dialogpt_path>/<size>_ft.pkl and writes the result under ./DialoGPT-<size>.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--dialogpt_path", default=".", type=str)
    args = parser.parse_args()
    for MODEL in DIALOGPT_MODELS:
        checkpoint_path = os.path.join(args.dialogpt_path, f"{MODEL}_ft.pkl")
        pytorch_dump_folder_path = f"./DialoGPT-{MODEL}"
        convert_dialogpt_checkpoint(
            checkpoint_path, pytorch_dump_folder_path,
        )
32 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2018 The T5 authors and HuggingFace Inc. team.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Convert T5 checkpoint."""
16 | 
17 | 
18 | import argparse
19 | import logging
20 | 
21 | import torch
22 | 
23 | from transformers import T5Config, T5Model, load_tf_weights_in_t5
24 | 
25 | 
26 | logging.basicConfig(level=logging.INFO)
27 | 
28 | 
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
    """Create a ``T5Model`` from a JSON config, load TF weights into it and save its state dict.

    Args:
        tf_checkpoint_path: TensorFlow checkpoint handed to ``load_tf_weights_in_t5``.
        config_file: JSON file read by ``T5Config.from_json_file``.
        pytorch_dump_path: Destination passed to ``torch.save`` for the state dict.
    """
    # Instantiate the PyTorch model described by the config.
    t5_config = T5Config.from_json_file(config_file)
    print(f"Building PyTorch model from configuration: {str(t5_config)}")
    pt_model = T5Model(t5_config)

    # Copy the TensorFlow weights into the freshly built model.
    load_tf_weights_in_t5(pt_model, t5_config, tf_checkpoint_path)

    # Persist only the state dict.
    print(f"Save PyTorch model to {pytorch_dump_path}")
    torch.save(pt_model.state_dict(), pytorch_dump_path)
41 | 
42 | 
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # All three options are required.
    parser.add_argument(
        "--tf_checkpoint_path",
        default=None,
        type=str,
        required=True,
        help="Path to the TensorFlow checkpoint path.",
    )
    parser.add_argument(
        "--config_file",
        default=None,
        type=str,
        required=True,
        help="The config json file corresponding to the pre-trained T5 model. \n"
        "This specifies the model architecture.",
    )
    parser.add_argument(
        "--pytorch_dump_path",
        default=None,
        type=str,
        required=True,
        help="Path to the output PyTorch model.",
    )
    args = parser.parse_args()
    convert_tf_checkpoint_to_pytorch(
        tf_checkpoint_path=args.tf_checkpoint_path,
        config_file=args.config_file,
        pytorch_dump_path=args.pytorch_dump_path,
    )
62 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/data/__init__.py:
--------------------------------------------------------------------------------
 1 | # flake8: noqa
 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this
 3 | # module, but to preserve other warnings. So, don't check this module at all.
 4 | 
 5 | from .metrics import is_sklearn_available
 6 | from .processors import (
 7 |     DataProcessor,
 8 |     InputExample,
 9 |     InputFeatures,
10 |     SingleSentenceClassificationProcessor,
11 |     SquadExample,
12 |     SquadFeatures,
13 |     SquadV1Processor,
14 |     SquadV2Processor,
15 |     glue_convert_examples_to_features,
16 |     glue_output_modes,
17 |     glue_processors,
18 |     glue_tasks_num_labels,
19 |     squad_convert_examples_to_features,
20 |     xnli_output_modes,
21 |     xnli_processors,
22 |     xnli_tasks_num_labels,
23 | )
24 | 
25 | 
26 | if is_sklearn_available():
27 |     from .metrics import glue_compute_metrics, xnli_compute_metrics
28 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/data/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this
3 | # module, but to preserve other warnings. So, don't check this module at all.
4 | 
5 | from .glue import GlueDataset, GlueDataTrainingArguments
6 | from .language_modeling import LineByLineTextDataset, TextDataset
7 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/data/processors/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this
3 | # module, but to preserve other warnings. So, don't check this module at all.
4 | 
5 | from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels
6 | from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features
7 | from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor
8 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels
9 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/tokenization_bart.py:
--------------------------------------------------------------------------------
 1 | # coding=utf-8
 2 | # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team.
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #     http://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | import logging
17 | 
18 | from .tokenization_roberta import RobertaTokenizer
19 | from .tokenization_xlm_roberta import XLMRobertaTokenizer
20 | 
21 | 
22 | logger = logging.getLogger(__name__)
23 | 
24 | 
25 | # vocab and merges same as roberta
26 | vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json"
27 | merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt"
28 | _all_bart_models = [
29 |     "facebook/bart-large",
30 |     "facebook/bart-large-mnli",
31 |     "facebook/bart-large-cnn",
32 |     "facebook/bart-large-xsum",
33 | ]
34 | 
35 | 
class BartTokenizer(RobertaTokenizer):
    """RobertaTokenizer subclass whose class-level tables cover the names in ``_all_bart_models``."""

    # Every listed checkpoint gets the same input-size entry and the same
    # vocab/merges URLs (the RoBERTa-large files referenced above).
    max_model_input_sizes = dict.fromkeys(_all_bart_models, 1024)
    pretrained_vocab_files_map = {
        "vocab_file": dict.fromkeys(_all_bart_models, vocab_url),
        "merges_file": dict.fromkeys(_all_bart_models, merges_url),
    }
43 | 
44 | 
45 | _all_mbart_models = ["facebook/mbart-large-en-ro"]
46 | SPM_URL = "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/mbart-large-en-ro/sentence.bpe.model"
47 | 
48 | 
class MBartTokenizer(XLMRobertaTokenizer):
    """XLMRobertaTokenizer subclass whose class-level tables cover the names in ``_all_mbart_models``."""

    vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"}
    max_model_input_sizes = dict.fromkeys(_all_mbart_models, 1024)
    pretrained_vocab_files_map = {"vocab_file": dict.fromkeys(_all_mbart_models, SPM_URL)}
53 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/trainer_utils.py:
--------------------------------------------------------------------------------
 1 | from typing import Dict, NamedTuple, Optional
 2 | 
 3 | import numpy as np
 4 | 
 5 | 
class EvalPrediction(NamedTuple):
    """
    Evaluation output (always contains labels), to be used
    to compute metrics.

    Both fields are numpy arrays and neither is optional.
    """

    # Predictions array.
    predictions: np.ndarray
    # Label array; always present for this tuple type.
    label_ids: np.ndarray
14 | 
15 | 
class PredictionOutput(NamedTuple):
    """Prediction result tuple; unlike ``EvalPrediction``, labels and metrics may be ``None``."""

    # Predictions array.
    predictions: np.ndarray
    # Label array, or None.
    label_ids: Optional[np.ndarray]
    # Mapping from metric name to float value, or None.
    metrics: Optional[Dict[str, float]]
20 | 
21 | 
class TrainOutput(NamedTuple):
    """Training result tuple: an integer step counter and a float loss value."""

    # Step counter (int).
    global_step: int
    # Loss value (float). NOTE(review): whether this is an average or a final
    # value is decided by the producer, which is not visible here.
    training_loss: float
25 | 
26 | 
27 | PREFIX_CHECKPOINT_DIR = "checkpoint"
28 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/preprocess/prepro_dataset.sh:
--------------------------------------------------------------------------------
# Run ./utils/prepro_dataset.py with the three settings below.
# Keep a space before "##": written as `name=## ...` the shell stores the
# literal value "##" and treats the following words as extra names to export.
export dataset_name= ## you need to set this
export input_path= ## you need to set this
export output_path= ## you need to set this

# Stop with a clear message while a placeholder is still empty, instead of
# letting an empty expansion shift the next flag into the value position.
: "${dataset_name:?set dataset_name at the top of this script}"
: "${input_path:?set input_path at the top of this script}"
: "${output_path:?set output_path at the top of this script}"

# Quoted so values containing spaces reach the script as single arguments.
python ./utils/prepro_dataset.py \
--dataset_name "$dataset_name" \
--input_path "$input_path" \
--output_path "$output_path"
9 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/preprocess/sample_contrast_pairs.sh:
--------------------------------------------------------------------------------
 1 | export dataset_name= ## you need to set this
 2 | export input_path= ## you need to set this
 3 | export generator_folder=qg_t5-base ## qg_t5-small ; qg_t5-base
 4 | 
 5 | python ./utils/sample_contrast_pairs.py \
 6 | --dataset_name $dataset_name \
 7 | --generator_folder $generator_folder \
 8 | --input_path $input_path \
 9 | --topk 100 \
10 | --sample_n 5
11 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/run_shell/cqg_inference.sh:
--------------------------------------------------------------------------------
 1 | # !/bin/bash
 2 | ## --------------------------------------------
 3 | export CUDA=2
 4 | export pretrain_generator_type=t5-base ## t5-small ; t5-base
 5 | export per_gpu_gen_batch_size=200 ## 200; 400
 6 | export target_dataset= ## you need to set this
 7 | export generator_mode=contrastqg
 8 | ## --------------------------------------------
 9 | 
10 | ## --------------------------------------------
11 | export generator_load_dir= ## you need to set this
12 | export target_dataset_dir= ## you need to set this
13 | ## --------------------------------------------
14 | 
15 | 
16 | CUDA_VISIBLE_DEVICES=$CUDA python ../scripts/inference.py \
17 | --generator_mode $generator_mode \
18 | --pretrain_generator_type $pretrain_generator_type \
19 | --per_gpu_gen_batch_size $per_gpu_gen_batch_size \
20 | --generator_load_dir $generator_load_dir \
21 | --target_dataset_dir $target_dataset_dir \
22 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/run_shell/qg_inference.sh:
--------------------------------------------------------------------------------
 1 | # !/bin/bash
 2 | ## --------------------------------------------
 3 | export CUDA=2
 4 | export pretrain_generator_type=t5-base ## t5-small ; t5-base
 5 | export per_gpu_gen_batch_size=200 ## 200; 400
 6 | export target_dataset_name= ## you need to set this
 7 | export generator_mode=qg
 8 | ## --------------------------------------------
 9 | 
10 | ## --------------------------------------------
11 | export generator_load_dir= ## you need to set this
12 | export target_dataset_dir= ## you need to set this
13 | ## --------------------------------------------
14 | 
15 | CUDA_VISIBLE_DEVICES=$CUDA python ../scripts/inference.py \
16 | --generator_mode $generator_mode \
17 | --pretrain_generator_type $pretrain_generator_type \
18 | --per_gpu_gen_batch_size $per_gpu_gen_batch_size \
19 | --generator_load_dir $generator_load_dir \
20 | --target_dataset_dir $target_dataset_dir \
21 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/run_shell/train_nlg.sh:
--------------------------------------------------------------------------------
 1 | # #!/bin/bash
 2 | export CUDA=0
 3 | export generator_mode=qg # qg / contrastqg
 4 | export pretrain_generator_type=t5-small ## t5-small / t5-base
 5 | 
 6 | export pretrain_model_dir=../data/pretrain_model
 7 | export train_file=../data/source_data/toy_triples.train.small.tsv
 8 | export save_dir=../results
 9 | 
10 | ## ------------------------------------------------------------------
11 | ## ------------------------------------------------------------------
12 | CUDA_VISIBLE_DEVICES=$CUDA python ../scripts/train.py --run_mode train \
13 | --generator_mode $generator_mode \
14 | --pretrain_generator_type $pretrain_generator_type \
15 | --per_gpu_train_batch_size 4 \
16 | --gradient_accumulation_steps 1 \
17 | --pretrain_model_dir $pretrain_model_dir \
18 | --train_file $train_file \
19 | --save_dir $save_dir \
20 | 


--------------------------------------------------------------------------------
/v1/Contrastive_Supervision_Synthesis/scripts/model.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import math
 3 | import torch
 4 | from torch import nn, optim
 5 | import logging
 6 | import numpy as np
 7 | import torch.nn.functional as F
 8 | from torch.autograd import Variable
 9 | import utils
10 | from contrastqg import (T5ForConditionalGeneration)
11 | 
12 | logger = logging.getLogger()
13 | 
class QGenerator(object):
    """Inference wrapper around a T5 conditional-generation network.

    The constructor loads pretrained weights, resizes the token embeddings to
    the tokenizer size, then overwrites the weights with the checkpoint stored
    as ``models.pkl`` inside ``args.generator_load_dir``.
    """

    def __init__(self, args, tokenizer):
        """
        Args:
            args: Namespace providing ``pretrain_generator_type`` and
                ``generator_load_dir``; also forwarded to
                ``utils.select_gen_input_refactor``.
            tokenizer: Object supporting ``len()`` and
                ``convert_outputs_to_tokens``.
        """
        self.network = T5ForConditionalGeneration.from_pretrained(args.pretrain_generator_type)
        self.network.resize_token_embeddings(len(tokenizer))
        # map_location='cpu': loading must not depend on the device the
        # checkpoint was saved from; set_device() moves the network afterwards.
        state_dict = torch.load(
            os.path.join(args.generator_load_dir, 'models.pkl'), map_location='cpu'
        )
        self.network.load_state_dict(state_dict)
        logger.info("sccuess load checkpoint from {} !".format(args.generator_load_dir))
        self.tokenizer = tokenizer
        self.batchify_inputs = utils.select_gen_input_refactor(args)
        # Defined up front so the attributes exist before set_device()/parallelize().
        self.device = None
        self.parallel = False

    def predict(self, inputs):
        """Generate from ``inputs`` (keyword arguments for ``generate``) and return decoded tokens."""
        self.network.eval()
        # torch.nn.DataParallel only forwards ``forward``; ``generate`` lives on
        # the wrapped module, so unwrap after parallelize() has been called.
        if isinstance(self.network, torch.nn.DataParallel):
            generator = self.network.module
        else:
            generator = self.network
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            outputs = generator.generate(**inputs)
        pred_tokens = self.tokenizer.convert_outputs_to_tokens(outputs)
        return pred_tokens

    def set_device(self, device):
        """Remember ``device`` and move the network onto it."""
        self.device = device
        self.network.to(self.device)

    def parallelize(self):
        """Use data parallel to copy the model across several gpus.
        This will take all gpus visible with CUDA_VISIBLE_DEVICES.
        """
        self.parallel = True
        self.network = torch.nn.DataParallel(self.network)


--------------------------------------------------------------------------------
/v1/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM pytorch/pytorch:1.4-cuda10.1-cudnn7-runtime
 2 | LABEL maintainer="Yizhi Li <yizhi.li@hotmail.com>"
 3 | USER root
 4 | 
 5 | # installing full CUDA toolkit
 6 | RUN apt update
 7 | RUN pip install --upgrade pip
 8 | #RUN apt install -y build-essential g++ llvm-9-dev git cmake wget
 9 | RUN apt install -y build-essential g++ git cmake wget
10 | RUN conda install -y -c conda-forge cudatoolkit-dev
11 | # setting environment variables
12 | ENV CUDA_HOME "/opt/conda/pkgs/cuda-toolkit"
13 | ENV CUDA_TOOLKIT_ROOT_DIR $CUDA_HOME
14 | ENV LIBRARY_PATH "$CUDA_HOME/lib64:$LIBRARY_PATH"
15 | ENV LD_LIBRARY_PATH "$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:$LD_LIBRARY_PATH"
16 | ENV CFLAGS "-I$CUDA_HOME/include $CFLAGS"
17 | 
18 | # warning: no torch and torchvision in the requirements, need to install in advance
19 | RUN wget https://raw.githubusercontent.com/thunlp/OpenMatch/master/retrievers/venv_ANCE.requirements
20 | RUN pip install -r venv_ANCE.requirements
21 | RUN pip install tensorflow
22 | 
23 | WORKDIR /workspace
24 | RUN git clone https://github.com/NVIDIA/apex.git
25 | WORKDIR /workspace/apex
26 | RUN python setup.py install --cpp_ext --cuda_ext
27 | WORKDIR /workspace
28 | 
29 | RUN git clone https://github.com/microsoft/ANCE.git
30 | WORKDIR /workspace/ANCE
31 | RUN python setup.py install
32 | WORKDIR /workspace
33 | 
34 | RUN git clone https://github.com/thunlp/OpenMatch.git
35 | WORKDIR /workspace/OpenMatch
36 | RUN python setup.py install
37 | WORKDIR /workspace
38 | 
39 | ENTRYPOINT ["/bin/bash"]
40 | 


--------------------------------------------------------------------------------
/v1/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 THUNLP
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/v1/LeToR/RankLib-2.1-patched.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/LeToR/RankLib-2.1-patched.jar


--------------------------------------------------------------------------------
/v1/LeToR/gen_trec.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import json
 3 | 
 4 | def main():
 5 |     parser = argparse.ArgumentParser()
 6 |     parser.add_argument('-dev', type=str, default='../data/dev_toy.jsonl')
 7 |     parser.add_argument('-res', type=str, default='../results/cknrm_ca.trec')
 8 |     parser.add_argument('-k', type=int, default=2)
 9 |     args = parser.parse_args()
10 | 
11 |     score_dic = {}
12 |     for i in range(args.k):
13 |         with open('f' + str(i+1) + '.score', 'r') as r:
14 |             for line in r:
15 |                 line = line.strip('\n').split('\t')
16 |                 score_dic[line[0] + '$' + line[1]] = line[2]
17 | 
18 |     if args.k == -1:
19 |         with open('f' + str(args.k+1) + '.score', 'r') as r:
20 |             for line in r:
21 |                 line = line.strip('\n').split('\t')
22 |                 score_dic[line[0] + '$' + line[1]] = line[2]
23 | 
24 |     outs = {}
25 |     with open(args.dev, 'r') as r:
26 |         qid = ''
27 |         cnt = 0
28 |         for line in r:
29 |             line = json.loads(line)
30 |             if line['query_id'] != qid:
31 |                 qid = line['query_id']
32 |                 cnt = 0
33 |                 outs[line['query_id']] = {}
34 |             outs[line['query_id']][line['doc_id']] = float(score_dic[line['query_id']+'$'+str(cnt)])
35 |             cnt += 1
36 | 
37 |     f = open(args.res, 'w')
38 |     for qid in outs:
39 |         ps = {}
40 |         out_idx = sorted(outs[qid].items(), key=lambda x:x[1], reverse=True)
41 |         for i, out in enumerate(out_idx):
42 |             if out[0] not in ps:
43 |                 ps[out[0]] = 1
44 |                 f.write(' '.join([qid, 'Q0', out[0], str(len(ps)), str(out[1]), 'default']) + '\n')
45 |     f.close()
46 | 
47 | if __name__ == "__main__":
48 |     main()
49 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/__init__.py:
--------------------------------------------------------------------------------
1 | from OpenMatch.data import *
2 | from OpenMatch.extractors import *
3 | from OpenMatch.metrics import *
4 | from OpenMatch.models import *
5 | from OpenMatch.modules import *
6 | from OpenMatch.utils import *
7 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/data/__init__.py:
--------------------------------------------------------------------------------
1 | from OpenMatch.data.dataloader import DataLoader
2 | from OpenMatch.data.datasets import *
3 | from OpenMatch.data.tokenizers import *
4 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/data/dataloader.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Dict, Any
 2 | 
 3 | import torch
 4 | from torch.utils.data import DataLoader, Sampler
 5 | from torch.utils.data.distributed import DistributedSampler
 6 | 
 7 | from OpenMatch.data.datasets import Dataset
 8 | 
class DataLoader(DataLoader):
    """torch ``DataLoader`` subclass that always batches with the dataset's own ``collate`` method.

    The class deliberately reuses the name of the torch base class it extends.
    """

    def __init__(
        self,
        dataset: Dataset,
        batch_size: int,
        shuffle: bool = False,
        num_workers: int = 0,
        sampler = None,
    ) -> None:
        # All arguments are forwarded unchanged to the torch DataLoader; the
        # only addition is collate_fn, taken from the dataset itself.
        super().__init__(
            dataset = dataset,
            batch_size = batch_size,
            shuffle = shuffle,
            num_workers = num_workers,
            collate_fn = dataset.collate,
            sampler = sampler,
        )
26 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/data/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from OpenMatch.data.datasets.dataset import Dataset
2 | from OpenMatch.data.datasets.edrm_dataset import EDRMDataset
3 | from OpenMatch.data.datasets.bert_dataset import BertDataset
4 | from OpenMatch.data.datasets.meta_bert_dataset import MetaBertDataset
5 | from OpenMatch.data.datasets.roberta_dataset import RobertaDataset
6 | from OpenMatch.data.datasets.bertmlm_dataset import BertMLMDataset
7 | from OpenMatch.data.datasets.bertmaxp_dataset import BertMaxPDataset
8 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/data/tokenizers/__init__.py:
--------------------------------------------------------------------------------
1 | from OpenMatch.data.tokenizers.tokenizer import Tokenizer
2 | from OpenMatch.data.tokenizers.word_tokenizer import WordTokenizer
3 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/data/tokenizers/word_tokenizer.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | 
 3 | from nltk import word_tokenize
 4 | 
 5 | from OpenMatch.data.tokenizers import Tokenizer
 6 | 
 7 | class WordTokenizer(Tokenizer):
 8 |     def tokenize(self, text: str) -> List[str]:
 9 |         tokens = word_tokenize(text)
10 |         return tokens
11 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/extractors/__init__.py:
--------------------------------------------------------------------------------
1 | from OpenMatch.extractors.classic_extractor import ClassicExtractor
2 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | from OpenMatch.metrics.metric import Metric
2 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/metrics/metric.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Dict
 2 | 
 3 | import pytrec_eval
 4 | 
 5 | class Metric():
 6 |     def get_metric(self, qrels: str, trec: str, metric: str = 'ndcg_cut_10') -> Dict[str, float]:
 7 |         with open(qrels, 'r') as f_qrel:
 8 |             qrel = pytrec_eval.parse_qrel(f_qrel)
 9 |         with open(trec, 'r') as f_run:
10 |             run = pytrec_eval.parse_run(f_run)
11 | 
12 |         evaluator = pytrec_eval.RelevanceEvaluator(qrel, pytrec_eval.supported_measures)
13 |         results = evaluator.evaluate(run)
14 |         for query_id, query_measures in sorted(results.items()):
15 |             pass
16 |         mes = {}
17 |         for measure in sorted(query_measures.keys()):
18 |             mes[measure] = pytrec_eval.compute_aggregated_measure(measure, [query_measures[measure] for query_measures in results.values()])
19 |         return mes[metric]
20 | 
21 |     def get_mrr(self, qrels: str, trec: str, metric: str = 'mrr_cut_10') -> float:
22 |         k = int(metric.split('_')[-1])
23 | 
24 |         qrel = {}
25 |         with open(qrels, 'r') as f_qrel:
26 |             for line in f_qrel:
27 |                 qid, _, did, label = line.strip().split()
28 |                 if qid not in qrel:
29 |                     qrel[qid] = {}
30 |                 qrel[qid][did] = int(label)
31 | 
32 |         run = {}
33 |         with open(trec, 'r') as f_run:
34 |             for line in f_run:
35 |                 qid, _, did, _, _, _ = line.strip().split()
36 |                 if qid not in run:
37 |                     run[qid] = []
38 |                 run[qid].append(did)
39 |         
40 |         mrr = 0.0
41 |         for qid in run:
42 |             rr = 0.0
43 |             for i, did in enumerate(run[qid][:k]):
44 |                 if qid in qrel and did in qrel[qid] and qrel[qid][did] > 0:
45 |                     rr = 1 / (i+1)
46 |                     break
47 |             mrr += rr
48 |         mrr /= len(run)
49 |         return mrr
50 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/models/__init__.py:
--------------------------------------------------------------------------------
1 | from OpenMatch.models.bert import Bert
2 | from OpenMatch.models.bert_maxp import BertMaxP
3 | from OpenMatch.models.conv_knrm import ConvKNRM
4 | from OpenMatch.models.knrm import KNRM
5 | from OpenMatch.models.tk import TK
6 | from OpenMatch.models.edrm import EDRM
7 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/models/bert.py:
--------------------------------------------------------------------------------
 1 | from typing import Tuple
 2 | 
 3 | import torch
 4 | import torch.nn as nn
 5 | 
 6 | from transformers import AutoConfig, AutoModel
 7 | 
 8 | class Bert(nn.Module):
 9 |     def __init__(
10 |         self,
11 |         pretrained: str,
12 |         mode: str = 'cls',
13 |         task: str = 'ranking'
14 |     ) -> None:
15 |         super(Bert, self).__init__()
16 |         self._pretrained = pretrained
17 |         self._mode = mode
18 |         self._task = task
19 | 
20 |         self._config = AutoConfig.from_pretrained(self._pretrained)
21 |         self._model = AutoModel.from_pretrained(self._pretrained, config=self._config)
22 | 
23 |         if self._task == 'ranking':
24 |             self._dense = nn.Linear(self._config.hidden_size, 1)
25 |         elif self._task == 'classification':
26 |             self._dense = nn.Linear(self._config.hidden_size, 2)
27 |         else:
28 |             raise ValueError('Task must be `ranking` or `classification`.')
29 | 
30 |     def forward(self, input_ids: torch.Tensor, input_mask: torch.Tensor = None, segment_ids: torch.Tensor = None) -> Tuple[torch.Tensor, torch.Tensor]:
31 |         output = self._model(input_ids, attention_mask = input_mask, token_type_ids = segment_ids)
32 |         if self._mode == 'cls':
33 |             logits = output[0][:, 0, :]
34 |         elif self._mode == 'pooling':
35 |             logits = output[1]
36 |         else:
37 |             raise ValueError('Mode must be `cls` or `pooling`.')
38 |         score = self._dense(logits).squeeze(-1)
39 |         return score, logits
40 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/models/bert_maxp.py:
--------------------------------------------------------------------------------
 1 | from typing import Tuple
 2 | 
 3 | import torch
 4 | import torch.nn as nn
 5 | 
 6 | from transformers import AutoConfig, AutoModel
 7 | 
 8 | class BertMaxP(nn.Module):
 9 |     def __init__(
10 |         self,
11 |         pretrained: str,
12 |         max_query_len: int,
13 |         max_doc_len: int,
14 |         mode: str = 'cls',
15 |         task: str = 'ranking'
16 |     ) -> None:
17 |         super(BertMaxP, self).__init__()
18 |         self._pretrained = pretrained
19 |         self._max_query_len = max_query_len
20 |         self._max_doc_len = max_doc_len
21 |         self._mode = mode
22 |         self._task = task
23 | 
24 |         self._config = AutoConfig.from_pretrained(self._pretrained)
25 |         self._model = AutoModel.from_pretrained(self._pretrained, config=self._config)
26 | 
27 |         self._dense1 = nn.Linear(self._config.hidden_size, 128)
28 |         self._activation = nn.ReLU()
29 | 
30 |         if self._task == 'ranking':
31 |             self._dense2 = nn.Linear(128, 1)
32 |         elif self._task == 'classification':
33 |             self._dense2 = nn.Linear(128, 2)
34 |         else:
35 |             raise ValueError('Task must be `ranking` or `classification`.')
36 | 
37 |     def forward(self, input_ids: torch.Tensor, input_mask: torch.Tensor = None, segment_ids: torch.Tensor = None) -> Tuple[torch.Tensor, torch.Tensor]:
38 |         num = input_ids.size()[0]
39 |         output = self._model(input_ids.view(num*4, self._max_query_len+self._max_doc_len+3), attention_mask = input_mask.view(num*4, self._max_query_len+self._max_doc_len+3), token_type_ids = segment_ids.view(num*4, self._max_query_len+self._max_doc_len+3))
40 | 
41 |         if self._mode == 'cls':
42 |             logits = output[0][:, 0, :].view(num,4,-1).max(dim=1)[0]
43 |         elif self._mode == 'pooling':
44 |             logits = output[1].view(num,4,-1).max(dim=1)[0]
45 |         else:
46 |             raise ValueError('Mode must be `cls` or `pooling`.')
47 |         logits = self._activation(self._dense1(logits))
48 |         score = self._dense2(logits).squeeze(-1)
49 |         return score, logits
50 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/models/conv_knrm.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Tuple
 2 | 
 3 | import torch
 4 | import torch.nn as nn
 5 | 
 6 | from OpenMatch.modules.embedders import Embedder
 7 | from OpenMatch.modules.encoders import Conv1DEncoder
 8 | from OpenMatch.modules.matchers import KernelMatcher
 9 | 
class ConvKNRM(nn.Module):
    """Embedder, convolutional encoder and kernel matcher followed by a linear scoring layer.

    The matcher runs on every (query encoding, document encoding) pair returned
    by the encoder; the results are concatenated along dim 1 and fed to the
    linear layer.
    """

    def __init__(
        self,
        vocab_size: int,
        embed_dim: int,
        kernel_num: int = 21,
        kernel_dim: int = 128,
        kernel_sizes: List[int] = [1, 2, 3],
        embed_matrix: List[float] = None,
        task: str = 'ranking'
    ) -> None:
        super().__init__()
        self._vocab_size = vocab_size
        self._embed_dim = embed_dim
        self._kernel_num = kernel_num
        self._kernel_dim = kernel_dim
        self._kernel_sizes = kernel_sizes
        self._embed_matrix = embed_matrix
        self._task = task

        self._embedder = Embedder(self._vocab_size, self._embed_dim, self._embed_matrix)
        self._encoder = Conv1DEncoder(self._embed_dim, self._kernel_dim, self._kernel_sizes)
        self._matcher = KernelMatcher(self._encoder.get_output_dim(), self._kernel_num)

        if self._task not in ('ranking', 'classification'):
            raise ValueError('Task must be `ranking` or `classification`.')
        # kernel_num features per (query kernel size, doc kernel size) pair.
        feature_dim = self._kernel_num * (len(self._kernel_sizes) ** 2)
        self._dense = nn.Linear(feature_dim, 1 if self._task == 'ranking' else 2)

    def forward(self, query_ids: torch.Tensor, query_masks: torch.Tensor, doc_ids: torch.Tensor, doc_masks: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return ``(score, features)`` where ``features`` is the concatenated matcher output."""
        q_embedded = self._embedder(query_ids)
        d_embedded = self._embedder(doc_ids)
        _, query_encs = self._encoder(q_embedded, query_masks)
        _, doc_encs = self._encoder(d_embedded, doc_masks)

        pair_features = []
        for q_enc in query_encs:
            for d_enc in doc_encs:
                # Masks are cut to each encoding's length along dim 1.
                q_mask = query_masks[:, :q_enc.size()[1]]
                d_mask = doc_masks[:, :d_enc.size()[1]]
                pair_features.append(self._matcher(q_enc, q_mask, d_enc, d_mask))
        logits = torch.cat(pair_features, dim=1)
        return self._dense(logits).squeeze(-1), logits
50 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/models/knrm.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Tuple
 2 | 
 3 | import torch
 4 | import torch.nn as nn
 5 | 
 6 | from OpenMatch.modules.embedders import Embedder
 7 | from OpenMatch.modules.matchers import KernelMatcher
 8 | 
 9 | class KNRM(nn.Module):
10 |     def __init__(
11 |         self,
12 |         vocab_size: int,
13 |         embed_dim: int,
14 |         kernel_num: int = 21,
15 |         embed_matrix: List[float] = None,
16 |         task: str = 'ranking'
17 |     ) -> None:
18 |         super(KNRM, self).__init__()
19 |         self._vocab_size = vocab_size
20 |         self._embed_dim = embed_dim
21 |         self._kernel_num = kernel_num
22 |         self._embed_matrix = embed_matrix
23 |         self._task = task
24 | 
25 |         self._embedder = Embedder(self._vocab_size, self._embed_dim, self._embed_matrix)
26 |         self._matcher = KernelMatcher(self._embed_dim, self._kernel_num)
27 |         if self._task == 'ranking':
28 |             self._dense = nn.Linear(self._kernel_num, 1)
29 |         elif self._task == 'classification':
30 |             self._dense = nn.Linear(self._kernel_num, 2)
31 |         else:
32 |             raise ValueError('Task must be `ranking` or `classification`.')
33 | 
34 |     def forward(self, query_ids: torch.Tensor, query_masks: torch.Tensor, doc_ids: torch.Tensor, doc_masks: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
35 |         query_embed = self._embedder(query_ids)
36 |         doc_embed = self._embedder(doc_ids)
37 | 
38 |         logits = self._matcher(query_embed, query_masks, doc_embed, doc_masks)
39 |         score = self._dense(logits).squeeze(-1)
40 |         return score, logits
41 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/modules/__init__.py:
--------------------------------------------------------------------------------
1 | from OpenMatch.modules.attentions import *
2 | from OpenMatch.modules.embedders import *
3 | from OpenMatch.modules.encoders import *
4 | from OpenMatch.modules.matchers import *
5 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/modules/attentions/__init__.py:
--------------------------------------------------------------------------------
1 | from OpenMatch.modules.attentions.multi_head_attention import MultiHeadAttention
2 | from OpenMatch.modules.attentions.scaled_dot_product_attention import ScaledDotProductAttention
3 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/modules/attentions/multi_head_attention.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | 
 4 | from .scaled_dot_product_attention import ScaledDotProductAttention
 5 | 
 6 | class MultiHeadAttention(nn.Module):
 7 |     def __init__(
 8 |         self,
 9 |         embed_dim: int = 512,
10 |         head_num: int = 8,
11 |         dropout: float = 0.0
12 |     ) -> None:
13 |         super(MultiHeadAttention, self).__init__()
14 |         self._embed_dim = embed_dim
15 |         self._head_num = head_num
16 |         self._head_dim = self._embed_dim // self._head_num
17 |         assert self._head_dim * self._head_num == self._embed_dim, 'embed_dim must be divisible by num_heads'
18 | 
19 |         self._fcq = nn.Linear(self._embed_dim, self._head_dim * self._head_num)
20 |         self._fck = nn.Linear(self._embed_dim, self._head_dim * self._head_num)
21 |         self._fcv = nn.Linear(self._embed_dim, self._head_dim * self._head_num)
22 |         self._attention = ScaledDotProductAttention(dropout)
23 |         self._fc = nn.Linear(self._embed_dim, self._embed_dim)
24 |         self._dropout = nn.Dropout(dropout)
25 |         self._norm = nn.LayerNorm(self._embed_dim)
26 | 
27 |     def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, attn_mask: torch.Tensor=None) -> torch.Tensor:
28 |         residual = query
29 |         batch_size = query.size(0)
30 |         query = self._fcq(query).view(batch_size * self._head_num, -1, self._head_dim)
31 |         key = self._fck(key).view(batch_size * self._head_num, -1, self._head_dim)
32 |         value = self._fcv(value).view(batch_size * self._head_num, -1, self._head_dim)
33 | 
34 |         scale = (query.size(-1) // self._head_num) ** -0.5
35 |         if attn_mask is not None:
36 |             attn_mask = attn_mask.repeat(self._head_num, 1, 1)
37 |         context, attn = self._attention(query, key, value, scale, attn_mask)
38 |         context = context.view(batch_size, -1, self._head_num * self._head_dim)
39 |         output = self._fc(context)
40 |         output = self._dropout(output)
41 |         output = self._norm(residual + output)
42 |         return output, attn
43 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/modules/attentions/scaled_dot_product_attention.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | 
 4 | class ScaledDotProductAttention(nn.Module):
 5 |     def __init__(
 6 |         self,
 7 |         dropout: float = 0.0
 8 |     ) -> None:
 9 |         super(ScaledDotProductAttention, self).__init__()
10 |         self._dropout = nn.Dropout(dropout)
11 |         self._softmax = nn.Softmax(dim=2)
12 | 
13 |     def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, scale: float=None, attn_mask: torch.Tensor=None) -> torch.Tensor:
14 |         attn = torch.bmm(query, key.transpose(1, 2))
15 |         if scale is not None:
16 |             attn *= scale
17 |         if attn_mask is not None:
18 |             attn = attn.masked_fill(attn_mask, -1.0e32)
19 |         attn = self._softmax(attn)
20 |         attn = self._dropout(attn)
21 |         context = torch.bmm(attn, value)
22 |         return context, attn
23 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/modules/embedders/__init__.py:
--------------------------------------------------------------------------------
1 | from OpenMatch.modules.embedders.embedder import Embedder
2 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/modules/embedders/embedder.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | 
 3 | import torch
 4 | import torch.nn as nn
 5 | 
 6 | class Embedder(nn.Module):
 7 |     def __init__(
 8 |         self,
 9 |         vocab_size: int,
10 |         embed_dim: int,
11 |         embed_matrix: List[float] = None
12 |     ) -> None:
13 |         super(Embedder, self).__init__()
14 |         self._vocab_size = vocab_size
15 |         self._embed_dim = embed_dim
16 | 
17 |         self._embedder = nn.Embedding(self._vocab_size, self._embed_dim, padding_idx=0)
18 |         if embed_matrix is not None:
19 |             self._embed_matrix = torch.tensor(embed_matrix)
20 |             self._embedder.weight = nn.Parameter(self._embed_matrix, requires_grad=True)
21 | 
22 |     def forward(self, idx: torch.Tensor) -> torch.Tensor:
23 |         embed = self._embedder(idx)
24 |         return embed
25 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/modules/encoders/__init__.py:
--------------------------------------------------------------------------------
1 | from OpenMatch.modules.encoders.cnn_encoder import Conv1DEncoder
2 | from OpenMatch.modules.encoders.feed_forward_encoder import FeedForwardEncoder
3 | from OpenMatch.modules.encoders.positional_encoder import PositionalEncoder
4 | from OpenMatch.modules.encoders.transformer_encoder import TransformerEncoder
5 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/modules/encoders/cnn_encoder.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Tuple
 2 | 
 3 | import torch
 4 | import torch.nn as nn
 5 | 
 6 | class Conv1DEncoder(nn.Module):
 7 |     def __init__(
 8 |         self,
 9 |         embed_dim: int,
10 |         kernel_dim: int,
11 |         kernel_sizes: List[int] = [2, 3, 4, 5],
12 |         stride: int = 1
13 |     ) -> None:
14 |         super(Conv1DEncoder, self).__init__()
15 |         self._embed_dim = embed_dim
16 |         self._kernel_dim = kernel_dim
17 |         self._kernel_sizes = kernel_sizes
18 |         self._stride = stride
19 |         self._output_dim = self._kernel_dim * len(self._kernel_sizes)
20 | 
21 |         self._encoder = nn.ModuleList([
22 |             nn.Conv1d(
23 |                 in_channels=self._embed_dim,
24 |                 out_channels=self._kernel_dim,
25 |                 kernel_size=kernel_size,
26 |                 stride = self._stride
27 |             )
28 |             for kernel_size in self._kernel_sizes
29 |         ])
30 |         self._activation = nn.ReLU()
31 | 
32 |     def get_output_dim(self) -> int:
33 |         return self._output_dim
34 | 
35 |     def forward(self, embed: torch.Tensor, masks: torch.Tensor = None) -> Tuple[torch.Tensor, List[torch.Tensor]]:
36 |         if masks is not None:
37 |             embed = embed * masks.unsqueeze(-1)
38 |         embed = torch.transpose(embed, 1, 2)
39 | 
40 |         kernel_outputs = [self._activation(enc(embed)) for enc in self._encoder]
41 |         pooling_sums = [kernel_output.max(dim=2).values for kernel_output in kernel_outputs]
42 |         enc = (torch.cat(pooling_sums, dim=1) if len(pooling_sums) > 1 else pooling_sums[0])
43 |         return enc, [torch.transpose(kernel_output, 1, 2) for kernel_output in kernel_outputs]
44 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/modules/encoders/feed_forward_encoder.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | 
 4 | class FeedForwardEncoder(nn.Module):
 5 |     def __init__(
 6 |         self,
 7 |         embed_dim: int,
 8 |         hidden_dim: int,
 9 |         dropout: float = 0.1
10 |     ) -> None:
11 |         super(FeedForwardEncoder, self).__init__()
12 |         self._embed_dim = embed_dim
13 |         self._hidden_dim = hidden_dim
14 | 
15 |         self._fc1 = torch.nn.Linear(self._embed_dim, self._hidden_dim)
16 |         self._fc2 = torch.nn.Linear(self._hidden_dim, self._embed_dim)
17 |         self._dropout = nn.Dropout(dropout)
18 |         self._activation = nn.ReLU()
19 |         self._norm = nn.LayerNorm(self._embed_dim)
20 | 
21 |     def forward(self, embed: torch.Tensor) -> torch.Tensor:
22 |         enc = self._dropout(self._fc2(self._activation(self._fc1(embed))))
23 |         enc = self._norm(embed + enc)
24 |         return enc
25 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/modules/encoders/positional_encoder.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | 
 4 | class PositionalEncoder(nn.Module):
 5 |     def __init__(
 6 |         self,
 7 |         embed_dim: int,
 8 |         max_len: int = 512
 9 |     ) -> None:
10 |         super(PositionalEncoder, self).__init__()
11 |         self._embed_dim = embed_dim
12 |         self._max_len = max_len
13 | 
14 |         self._embed_matrix = torch.tensor(
15 |             [[pos / pow(1.0e4, 2.0 * (i // 2) / self._embed_dim) for i in range(self._embed_dim)] for pos in range(self._max_len)]
16 |         )
17 |         self._embed_matrix[:, 0::2] = torch.sin(self._embed_matrix[:, 0::2])
18 |         self._embed_matrix[:, 1::2] = torch.cos(self._embed_matrix[:, 1::2])
19 |         self._embedder = nn.Embedding(self._max_len, self._embed_dim)
20 |         self._embedder.weight = nn.Parameter(self._embed_matrix, requires_grad=False)
21 | 
22 |     def forward(self, embed: torch.Tensor) -> torch.Tensor:
23 |         token_len = embed.size()[1]
24 |         if embed.is_cuda:
25 |             ids = torch.cuda.LongTensor([l for l in range(token_len)])
26 |         else:
27 |             ids = torch.LongTensor([l for l in range(token_len)])
28 |         embed += self._embedder(ids)
29 |         return embed
30 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/modules/encoders/transformer_encoder.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | 
 4 | from OpenMatch.modules.attentions import MultiHeadAttention
 5 | from .feed_forward_encoder import FeedForwardEncoder
 6 | from .positional_encoder import PositionalEncoder
 7 | 
 8 | class TransformerEncoderLayer(nn.Module):
 9 |     def __init__(
10 |         self,   
11 |         embed_dim: int,
12 |         head_num: int = 8,
13 |         hidden_dim: int = 2048,
14 |         dropout: float = 0.0
15 |     ) -> None:
16 |         super(TransformerEncoderLayer, self).__init__()
17 |         self._embed_dim = embed_dim
18 |         self._head_num = head_num
19 |         self._hidden_dim = hidden_dim
20 |         self._dropout = dropout
21 | 
22 |         self._attention = MultiHeadAttention(self._embed_dim, self._head_num, dropout=self._dropout)
23 |         self._feed_forward = FeedForwardEncoder(self._embed_dim, self._hidden_dim, self._dropout)
24 | 
25 |     def forward(self, embed: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
26 |         embed, weights = self._attention(embed, embed, embed, attn_mask=mask)
27 |         enc = self._feed_forward(embed)
28 |         return enc
29 | 
class TransformerEncoder(nn.Module):
    """Stack of ``TransformerEncoderLayer`` modules preceded by positional encoding.

    Args:
        embed_dim: Model dimension.
        head_num: Number of attention heads per layer.
        hidden_dim: Inner dimension of each feed-forward block.
        layer_num: Number of stacked layers.
        dropout: Dropout probability forwarded to every layer.
    """

    def __init__(
        self,
        embed_dim: int,
        head_num: int = 8,
        hidden_dim: int = 2048,
        layer_num: int = 6,
        dropout: float = 0.0
    ) -> None:
        super().__init__()
        self._embed_dim = embed_dim
        self._head_num = head_num
        self._hidden_dim = hidden_dim
        self._layer_num = layer_num
        self._dropout = dropout

        self._pos_encoder = PositionalEncoder(self._embed_dim)
        stacked = []
        for _ in range(self._layer_num):
            stacked.append(
                TransformerEncoderLayer(self._embed_dim, self._head_num, self._hidden_dim, self._dropout)
            )
        self._layers = nn.ModuleList(stacked)

    def forward(self, embed: torch.Tensor, mask: torch.Tensor) -> torch.Tensor:
        """Add positional encodings to ``embed`` and run it through every layer in order."""
        hidden = self._pos_encoder(embed)
        for encoder_layer in self._layers:
            hidden = encoder_layer(hidden, mask)
        return hidden
56 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/modules/matchers/__init__.py:
--------------------------------------------------------------------------------
1 | from OpenMatch.modules.matchers.kernel_matcher import KernelMatcher
2 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/modules/matchers/kernel_matcher.py:
--------------------------------------------------------------------------------
 1 | from typing import Dict
 2 | 
 3 | import torch
 4 | import torch.nn as nn
 5 | import torch.nn.functional as F
 6 | 
 7 | class KernelMatcher(nn.Module):
 8 |     def __init__(
 9 |         self,
10 |         embed_dim: int,
11 |         kernel_num: int = 21
12 |     ) -> None:
13 |         super(KernelMatcher, self).__init__()
14 |         self._embed_dim = embed_dim
15 |         self._kernel_num = kernel_num
16 |         mus, sigmas = self.kernel_init(self._kernel_num)
17 |         self._mus = nn.Parameter(mus, requires_grad=False)
18 |         self._sigmas = nn.Parameter(sigmas, requires_grad=False)
19 | 
20 |     def kernel_init(self, kernel_num: int) -> Dict[str, torch.Tensor]:
21 |         mus = [1]
22 |         bin_size = 2.0/(kernel_num-1)
23 |         mus.append(1-bin_size/2)
24 |         for i in range(1, kernel_num-1):
25 |             mus.append(mus[i]-bin_size)
26 |         mus = torch.tensor(mus).view(1, 1, 1, kernel_num)
27 | 
28 |         sigmas = [0.001]
29 |         sigmas += [0.1]*(kernel_num-1)
30 |         sigmas = torch.tensor(sigmas).view(1, 1, 1, kernel_num)
31 |         return mus, sigmas
32 | 
33 |     def forward(self, k_embed: torch.Tensor, k_mask: torch.Tensor, v_embed: torch.Tensor, v_mask: torch.Tensor) -> torch.Tensor:
34 |         k_embed = k_embed * k_mask.unsqueeze(-1)
35 |         v_embed = v_embed * v_mask.unsqueeze(-1)
36 |         k_by_v_mask = torch.bmm(k_mask.float().unsqueeze(-1), v_mask.float().unsqueeze(-1).transpose(1, 2))
37 |         k_norm = F.normalize(k_embed, p=2, dim=2, eps=1e-10)
38 |         v_norm = F.normalize(v_embed, p=2, dim=2, eps=1e-10)
39 |         inter = (torch.bmm(k_norm, v_norm.transpose(1, 2)) * k_by_v_mask).unsqueeze(-1)
40 | 
41 |         kernel_outputs = torch.exp((-((inter-self._mus)**2)/(self._sigmas**2)/2))
42 |         kernel_outputs = kernel_outputs.sum(dim=2).clamp(min=1e-10).log() * 1e-2
43 |         logits = kernel_outputs.sum(dim=1)
44 |         return logits
45 | 


--------------------------------------------------------------------------------
/v1/OpenMatch/utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import json
 3 | from argparse import Action
 4 | 
 5 | class DictOrStr(Action):
 6 |     def __call__(self, parser, namespace, values, option_string=None):
 7 |          if '=' in values:
 8 |              my_dict = {}
 9 |              for kv in values.split(","):
10 |                  k,v = kv.split("=")
11 |                  my_dict[k] = v
12 |              setattr(namespace, self.dest, my_dict)
13 |          else:
14 |              setattr(namespace, self.dest, values)
15 | 
def check_dir(path):
    """Create ``path`` (including parents) if it does not exist and return it."""
    if not os.path.exists(path):
        # exist_ok guards against another process creating the directory
        # between the check above and this call (e.g. distributed workers).
        os.makedirs(path, exist_ok=True)
    return path
20 | 
def save_trec(rst_file, rst_dict):
    """Write ranking results to ``rst_file``, one ranked document per line.

    Args:
        rst_file: Path of the file to (over)write.
        rst_dict: Mapping ``query_id -> {doc_id: scores}`` where ``scores[0]``
            is the value used for sorting (descending) and written out.

    Each line has the form ``<qid> Q0 <docid> <rank> <score> openmatch`` with
    ranks starting at 1.
    """
    with open(rst_file, 'w') as writer:
        for q_id, scores in rst_dict.items():
            ranked = sorted(scores.items(), key=lambda item: item[1][0], reverse=True)
            for rank, (doc_id, doc_scores) in enumerate(ranked, start=1):
                # Every field is stringified, so non-string query ids work too.
                fields = [str(q_id), 'Q0', str(doc_id), str(rank), str(doc_scores[0]), 'openmatch']
                writer.write(' '.join(fields) + '\n')
    return
28 | 
def save_features(rst_file, features):
    """Write every string in ``features`` to ``rst_file``, one per line."""
    with open(rst_file, 'w') as writer:
        writer.writelines(feature+'\n' for feature in features)
    return
34 | 


--------------------------------------------------------------------------------
/v1/checkpoints/README.md:
--------------------------------------------------------------------------------
1 | # Save Checkpoints
2 | Save checkpoints of neural rankers and coor-ascent weights.
3 | 


--------------------------------------------------------------------------------
/v1/coor_ascent.sh:
--------------------------------------------------------------------------------
# Learning-to-rank pipeline over the BERT features using RankLib.
# Stop at the first failing step so later steps never run on stale or
# missing checkpoints/score files.
set -e

# Partition the feature file into 2 folds under features/.
java -cp LeToR/RankLib-2.1-patched.jar ciir.umass.edu.features.FeatureManager -input features/bert_features -output features/ -k 2
# Train ranker type 4 (Coordinate Ascent in RankLib) with 2-fold cross-validation;
# per-fold models are saved in checkpoints/ with the name "ca".
java -jar LeToR/RankLib-2.1-patched.jar -train features/bert_features -ranker 4 -kcv 2 -kcvmd checkpoints/ -kcvmn ca -metric2t NDCG@20 -metric2T NDCG@20
# Score each fold's test split with the model trained for that fold.
java -jar LeToR/RankLib-2.1-patched.jar -load checkpoints/f1.ca -rank features/f1.test.bert_features -score f1.score
java -jar LeToR/RankLib-2.1-patched.jar -load checkpoints/f2.ca -rank features/f2.test.bert_features -score f2.score
# Convert the fold scores into a TREC run file.
python LeToR/gen_trec.py -dev data/dev_toy.jsonl -res results/bert_ca.trec -k 2
# Remove the intermediate score files.
rm -f f1.score f2.score
8 | 


--------------------------------------------------------------------------------
/v1/data/filter.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import json
 3 | 
 4 | def main():
 5 |     parser = argparse.ArgumentParser()
 6 |     parser.add_argument('-input_qrels', type=str, default=None)
 7 |     parser.add_argument('-input_trec', type=str)
 8 |     parser.add_argument('-topk', type=int, default=1000)
 9 |     parser.add_argument('-output', type=str)
10 |     args = parser.parse_args()
11 | 
12 |     last_qds = {}
13 |     if args.input_qrels is not None:
14 |         with open(args.input_qrels, 'r') as r:
15 |             for line in r:
16 |                 line = line.strip().split()
17 |                 if line[0] not in last_qds:
18 |                     last_qds[line[0]] = {}
19 |                 last_qds[line[0]][line[2]] = 1
20 | 
21 |     f = open(args.output, 'w')
22 |     qds = {}
23 |     with open(args.input_trec, 'r') as r:
24 |         for line in r:
25 |             line = line.strip().split()
26 |             if line[0] not in qds:
27 |                 qds[line[0]] = []
28 |             if len(qds[line[0]]) >= args.topk:
29 |                 continue
30 |             if line[0] in last_qds and line[2] in last_qds[line[0]]:
31 |                 continue
32 |             else:
33 |                 qds[line[0]].append(line[2])
34 |                 f.write(' '.join(line) + '\n')
35 |     f.close()
36 | 
37 | if __name__ == "__main__":
38 |     main()
39 | 


--------------------------------------------------------------------------------
/v1/data/qrels_toy:
--------------------------------------------------------------------------------
 1 | THUMSR-22 0 nz02frdm_5 0
 2 | THUMSR-22 0 hc0za6fh_1 2
 3 | THUMSR-22 0 oqy6ns00_2 2
 4 | THUMSR-22 0 gzc72bdy_5 1
 5 | THUMSR-22 0 5ciaonf0_1 1
 6 | THUMSR-22 0 hhsfq8bz_6 2
 7 | THUMSR-22 0 std4jddn_12 0
 8 | THUMSR-22 0 8eh90ber_0 0
 9 | THUMSR-22 0 2ioap802_12 0
10 | THUMSR-22 0 m9i9gu5g_6 1
11 | THUMSR-22 0 2u4d235j_1 2
12 | THUMSR-22 0 oqy6ns00_3 0
13 | THUMSR-22 0 k1lg8c7q_1 0
14 | THUMSR-22 0 sjyrr2bn_18 0
15 | THUMSR-22 0 dro22gwf_1 2
16 | THUMSR-22 0 fcgcittn_7 0
17 | THUMSR-22 0 8eh90ber_-1 0
18 | THUMSR-22 0 84ib5ol5_0 0
19 | THUMSR-22 0 mtn7ykep_6 0
20 | THUMSR-22 0 2x7l1s75_2 0
21 | THUMSR-22 0 1vm5r7pq_3 0
22 | THUMSR-22 0 i3b647wv_7 0
23 | THUMSR-22 0 oto8tdui_2 0
24 | THUMSR-22 0 qopcs6jy_9 0
25 | THUMSR-22 0 8riyl4h3_3 0
26 | THUMSR-22 0 b518n9dx_1 0
27 | THUMSR-22 0 wyz5jyjh_9 0
28 | THUMSR-22 0 jvxo5v63_2 0
29 | THUMSR-22 0 cetdqgff_13 0
30 | THUMSR-22 0 45dpoepu_12 0
31 | THUMSR-36 0 qdamvwxl_2 2
32 | THUMSR-36 0 er93bsdj_1 1
33 | THUMSR-36 0 91g3yial_0 1
34 | THUMSR-36 0 ys6s9rps_12 2
35 | THUMSR-36 0 38bqkxn5_0 0
36 | THUMSR-36 0 qla6edp4_3 0
37 | THUMSR-36 0 vsinwqnr_16 2
38 | THUMSR-36 0 vpodtbjk_20 0
39 | THUMSR-36 0 ropgq7tr_11 2
40 | THUMSR-36 0 xuv77kp6_0 2
41 | THUMSR-36 0 tfspedf1_11 2
42 | THUMSR-36 0 2tmu1wzk_0 1
43 | THUMSR-36 0 m5h19hy6_19 1
44 | THUMSR-36 0 qdamvwxl_3 2
45 | THUMSR-36 0 d3owtd98_32 1
46 | THUMSR-36 0 vnnwnxs2_2 1
47 | THUMSR-36 0 clrcu89e_1 1
48 | THUMSR-36 0 epbhdx55_1 0
49 | THUMSR-36 0 ufu9ggrv_1 0
50 | THUMSR-36 0 3rqrq2mg_5 0
51 | THUMSR-36 0 tfspedf1_2 2
52 | THUMSR-36 0 hvmw7g5q_0 2
53 | THUMSR-36 0 5wsj003j_1 2
54 | THUMSR-36 0 tfspedf1_1 2
55 | THUMSR-36 0 jin0fdcm_0 0
56 | THUMSR-36 0 ys6s9rps_14 2
57 | THUMSR-36 0 lyob5wfv_8 1
58 | THUMSR-36 0 d3owtd98_31 1
59 | THUMSR-36 0 kl9huu33_0 2
60 | THUMSR-36 0 iudq5jdu_6 0
61 | THUMSR-36 0 tfspedf1_10 2
62 | THUMSR-36 0 xlrf3dxx_2 0
63 | THUMSR-36 0 vnnwnxs2_0 1
64 | THUMSR-36 0 ll4rxd9p_16 0
65 | THUMSR-36 0 zyecue78_10 1
66 | THUMSR-36 0 5mnj3qr7_9 1
67 | THUMSR-36 0 lsrqko6p_5 0
68 | THUMSR-36 0 tfspedf1_5 2
69 | THUMSR-36 0 ys6s9rps_18 2
70 | THUMSR-36 0 42pjc0lo_5 2
71 | THUMSR-36 0 p8cmm6ty_3 0
72 | 


--------------------------------------------------------------------------------
/v1/data/queries_toy.jsonl:
--------------------------------------------------------------------------------
1 | {"query_id": "THUMSR-22", "query": "Classification treatment COVID-19"}
2 | {"query_id": "THUMSR-36", "query": "masks Covid-19"}
3 | 


--------------------------------------------------------------------------------
/v1/docs/distributed training.md:
--------------------------------------------------------------------------------
 1 | # Distributed training for BERT pretrained model
 2 | 
 3 | Our BERT model now supports distributed training, which significantly speeds up training.
 4 | 
 5 | ## Training
 6 | 
 7 | The code to use distributed training functionality is in the shell file
 8 | 
 9 | ```
10 | sh train_bert_dist.sh
11 | ```
12 | 
13 | In the shell file, the code is written as
14 | 
15 | ```
16 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ # set visible CUDA GPUs
17 | python -u -m torch.distributed.launch \ # launch distributed training
18 | --nproc_per_node=4 \ # number equals to how many GPUs used
19 | --master_port=12345 train.py \
20 |         -task ranking \
21 |         -model bert \
22 |         # do not use the single json file
23 |         -train queries=/path/to/queries.tsv,docs=/path/to/docs.tsv,qrels=/path/to/qrels.tsv,trec=/path/to/trec.tsv \
24 |         -max_input 1280000 \
25 |         -save ./checkpoints/bert.bin \
26 |         -dev ./data/dev_toy.jsonl \
27 |         -qrels ./data/qrels_toy \
28 |         -vocab bert-base-uncased \
29 |         -pretrain bert-base-uncased \
30 |         -res ./results/bert.trec \
31 |         -metric ndcg_cut_10 \
32 |         -max_query_len 32 \
33 |         -max_doc_len 256 \
34 |         -epoch 1 \
35 |         -batch_size 4 \
36 |         -lr 2e-5 \
37 |         -eval_every 100 \
38 |         -optimizer adamw \
39 |         -dev_eval_batch_size 128 \
40 |         -gradient_accumulation_steps 4 \
41 |         -n_warmup_steps 10000 \
42 |         -logging_step 100
43 | ```
44 | 
45 | ## Results
46 | 
47 | |Dataset|Retriever|Reranker|Coor-Ascent|MRR@100 (dev)|
48 | |:-------:|:-------:|:------:|:---------:|:-:|
49 | |MSMARCO-document|ANCE FirstP+BM25 (distributed)|BERT Base FirstP|-|0.432|
50 | |MSMARCO-document|ANCE FirstP+BM25 (single)|BERT Base FirstP|-|0.403|
51 | |MSMARCO-document|ANCE FirstP+BM25 (OpenMatch)|BERT Base FirstP|-|0.407|
52 | |MSMARCO-document|ANCE FirstP|-|-|0.373|
53 | 


--------------------------------------------------------------------------------
/v1/docs/experiments-adhoc.md:
--------------------------------------------------------------------------------
 1 | # Ad-hoc Search
 2 | All results are measured by NDCG@20 with 5-fold cross-validation. More details are available at [ClueWeb09](http://lemurproject.org/clueweb09/), [ClueWeb12](http://www.lemurproject.org/clueweb12.php/).
 3 | 
 4 | ## Datasets
 5 | Data can be downloaded from [Datasets](https://cloud.tsinghua.edu.cn/d/77741ef1c1704866814a/).
 6 | 
 7 | |Datasets|Queries/Anchors|Query/Anchor-Doc Pairs|Released Files|
 8 | |:-------|:-------------:|:--------------------:|:-------------|
 9 | |**ClueWeb09-B**|200|47.1K|Queries, Q-D Relations, SDM scores|
10 | |**Robust04**|249|311K|Queries, Q-D Relations, SDM scores|
11 | |**ClueWeb12-B13**|100|28.9K|Queries, Q-D Relations, SDM scores|
12 | 
13 | As we cannot release the document contents, the document IDs are used instead.
14 | 


--------------------------------------------------------------------------------
/v1/docs/experiments-classic.md:
--------------------------------------------------------------------------------
1 | # Classic Features
2 | We extract several classic IR features, and train learning-to-rank models, such as RankSVM, Coor-Ascent, on ClueWeb09-B, Robust04 and TREC-COVID datasets with 5 fold cross-validation. All the results can be found in our [paper](https://arxiv.org/abs/2012.14862) of ACL 2021.
3 | 
4 | The features consist of Boolean AND; Boolean OR; Coordinate match; Cosine similarity of bag-of-words vectors; TF-IDF; BM25; language models with no smoothing, Dirichlet smoothing, JM smoothing, and two-way smoothing. More details are available at [classic_extractor](../OpenMatch/extractors/classic_extractor.py).


--------------------------------------------------------------------------------
/v1/docs/meta-learning-to-rank.md:
--------------------------------------------------------------------------------
 1 | # Meta Learning to Rank
 2 | 
 3 | This document provides guidance for running the meta-learning-to-reweight technique, which uses target data to reweight training data during the learning-to-rank process.
 4 | 
 5 | A detailed introduction to the technology can be found in the paper [**Few-Shot Text Ranking with Meta Adapted Synthetic Weak Supervision**](https://arxiv.org/pdf/2012.14862.pdf).
 6 | 
 7 | 
 8 | 
 9 | ## Running
10 | 
11 | 
12 | The code to run meta-learning is in the shell file
13 | ```
14 | bash meta_dist_train.sh
15 | ```
16 | In the shell file, the code is written as
17 | 
18 | ```
19 | export gpu_num=4 ## GPU Number
20 | export master_port=23900
21 | export job_name=MetaBERT
22 | 
23 | ## ************************************
24 | export DATA_DIR= ## please set your dataset path here.
25 | export SAVE_DIR= ## please set your saving path here.
26 | 
27 | ## ************************************
28 | CUDA_VISIBLE_DEVICES=0,1,2,3 OMP_NUM_THREADS=1 python -u -m torch.distributed.launch --nproc_per_node=$gpu_num --master_port $master_port meta_dist_train.py \
29 | -job_name $job_name \
30 | -save_folder $SAVE_DIR/results \
31 | -model bert \
32 | -task ranking \
33 | -max_input 12800000 \
34 | -train queries=$DATA_DIR/queries.train.tsv,docs=$DATA_DIR/collection.tsv,qrels=$DATA_DIR/qrels.train.tsv,trec=$DATA_DIR/trids_bm25_marco-10.tsv \
35 | -dev queries=$DATA_DIR/queries.dev.small.tsv,docs=$DATA_DIR/collection.tsv,qrels=$DATA_DIR/qrels.dev.small.tsv,trec=$DATA_DIR/run.msmarco-passage.dev.small.100.trec \
36 | -target trec=$DATA_DIR/devids_bm25_marco.tsv \
37 | -qrels $DATA_DIR/qrels.dev.small.tsv \
38 | -vocab bert-base-uncased \
39 | -pretrain bert-base-uncased \
40 | -metric mrr_cut_10 \
41 | -max_query_len 32 \
42 | -max_doc_len 221 \
43 | -epoch 3 \
44 | -train_batch_size 8 \
45 | -target_batch_size 16 \
46 | -gradient_accumulation_steps 2 \
47 | -dev_eval_batch_size 1024 \
48 | -lr 3e-6 \
49 | -n_warmup_steps 160000 \
50 | -logging_step 2000 \
51 | -eval_every 10000 \
52 | -eval_during_train \
53 | ```
54 | 
55 | The tsv format of the `-target` data is exactly the same as that of the `-train` data.
56 | 
57 | ```
58 | query_id \t pos_docid \t neg_docid
59 | ```
60 | 


--------------------------------------------------------------------------------
/v1/features/README.md:
--------------------------------------------------------------------------------
1 | # Save Features
2 | Save features of neural ranker and the score of retrieval model.
3 | 
4 | # Data Format
5 | ```shell
6 | label id:qid 1:feature1 2:feature2 ...
7 | ```
8 | 


--------------------------------------------------------------------------------
/v1/gen_feature.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Runs gen_feature.py with only GPU 0 visible, for the ranking task with the cknrm model.
 3 | # ./data/dev_toy.jsonl is passed as the -dev input and ./features/cknrm.trec as the -res output path.
 4 | CUDA_VISIBLE_DEVICES=0 \
 5 | python gen_feature.py \
 6 |         -task ranking \
 7 |         -model cknrm \
 8 |         -max_input 1280000 \
 9 |         -vocab ./data/glove.6B.300d.txt \
10 |         -checkpoint ./checkpoints/cknrm.bin \
11 |         -dev ./data/dev_toy.jsonl \
12 |         -res ./features/cknrm.trec \
13 |         -max_query_len 10 \
14 |         -max_doc_len 256 \
15 |         -batch_size 32
16 | 


--------------------------------------------------------------------------------
/v1/gen_feature_bert.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Runs gen_feature.py with only GPU 0 visible, for the classification task with the bert model.
 3 | # ./data/dev_toy.jsonl is passed as the -dev input and ./features/bert_features as the -res output path.
 4 | CUDA_VISIBLE_DEVICES=0 \
 5 | python gen_feature.py \
 6 |         -task classification \
 7 |         -model bert \
 8 |         -max_input 1280000 \
 9 |         -dev ./data/dev_toy.jsonl \
10 |         -vocab bert-base-uncased \
11 |         -pretrain bert-base-uncased \
12 |         -checkpoint ./checkpoints/bert.bin \
13 |         -res ./features/bert_features \
14 |         -max_query_len 32 \
15 |         -max_doc_len 256 \
16 |         -batch_size 32
17 | 


--------------------------------------------------------------------------------
/v1/inference.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Runs inference.py with only GPU 0 visible, for the ranking task with the cknrm model.
 3 | # ./data/test_toy.jsonl is passed as the -test input and ./results/cknrm.trec as the -res output path.
 4 | CUDA_VISIBLE_DEVICES=0 \
 5 | python inference.py \
 6 |         -task ranking \
 7 |         -model cknrm \
 8 |         -max_input 1280000 \
 9 |         -vocab ./data/glove.6B.300d.txt \
10 |         -checkpoint ./checkpoints/cknrm.bin \
11 |         -test ./data/test_toy.jsonl \
12 |         -res ./results/cknrm.trec \
13 |         -max_query_len 10 \
14 |         -max_doc_len 256 \
15 |         -batch_size 32
16 | 


--------------------------------------------------------------------------------
/v1/inference_bert.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Runs inference.py with only GPU 0 visible, for the classification task with the bert model.
 3 | # ./data/test_toy.jsonl is passed as the -test input and ./results/bert.trec as the -res output path.
 4 | CUDA_VISIBLE_DEVICES=0 \
 5 | python inference.py \
 6 |         -task classification \
 7 |         -model bert \
 8 |         -max_input 1280000 \
 9 |         -test ./data/test_toy.jsonl \
10 |         -vocab bert-base-uncased \
11 |         -pretrain bert-base-uncased \
12 |         -checkpoint ./checkpoints/bert.bin \
13 |         -res ./results/bert.trec \
14 |         -max_query_len 32 \
15 |         -max_doc_len 256 \
16 |         -batch_size 32
17 | 


--------------------------------------------------------------------------------
/v1/meta_dist_train.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | ## ************************************
 3 | ## GPU
 4 | export gpu_num=4 ## GPU Number
 5 | export master_port=23900
 6 | export job_name=MetaBERT
 7 | 
 8 | ## ************************************
 9 | export DATA_DIR= ## please set your dataset path here.
10 | export SAVE_DIR= ## please set your saving path here.
11 | 
12 | ## ************************************
13 | CUDA_VISIBLE_DEVICES=0,1,2,3 OMP_NUM_THREADS=1 python -u -m torch.distributed.launch --nproc_per_node=$gpu_num --master_port $master_port meta_dist_train.py \
14 | -job_name $job_name \
15 | -save_folder $SAVE_DIR/results \
16 | -model bert \
17 | -task ranking \
18 | -max_input 12800000 \
19 | -train queries=$DATA_DIR/queries.train.tsv,docs=$DATA_DIR/collection.tsv,qrels=$DATA_DIR/qrels.train.tsv,trec=$DATA_DIR/trids_bm25_marco-10.tsv \
20 | -dev queries=$DATA_DIR/queries.dev.small.tsv,docs=$DATA_DIR/collection.tsv,qrels=$DATA_DIR/qrels.dev.small.tsv,trec=$DATA_DIR/run.msmarco-passage.dev.small.100.trec \
21 | -target trec=$DATA_DIR/devids_bm25_marco.tsv \
22 | -qrels $DATA_DIR/qrels.dev.small.tsv \
23 | -vocab bert-base-uncased \
24 | -pretrain bert-base-uncased \
25 | -metric mrr_cut_10 \
26 | -max_query_len 32 \
27 | -max_doc_len 221 \
28 | -epoch 3 \
29 | -train_batch_size 8 \
30 | -target_batch_size 16 \
31 | -gradient_accumulation_steps 2 \
32 | -dev_eval_batch_size 1024 \
33 | -lr 3e-6 \
34 | -n_warmup_steps 160000 \
35 | -logging_step 2000 \
36 | -eval_every 10000 \
37 | -eval_during_train \
38 | 


--------------------------------------------------------------------------------
/v1/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==1.4.0
2 | transformers==2.8.0
3 | faiss-cpu==1.6.3
4 | nltk==3.5
5 | pytrec_eval==0.4
6 | 


--------------------------------------------------------------------------------
/v1/results/README.md:
--------------------------------------------------------------------------------
1 | # Results
2 | All Results.
3 | 


--------------------------------------------------------------------------------
/v1/retrievers/ANCE/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Microsoft Open Source Code of Conduct
 2 | 
 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
 4 | 
 5 | Resources:
 6 | 
 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 | 


--------------------------------------------------------------------------------
/v1/retrievers/ANCE/LICENSE:
--------------------------------------------------------------------------------
 1 |     MIT License
 2 | 
 3 |     Copyright (c) Microsoft Corporation.
 4 | 
 5 |     Permission is hereby granted, free of charge, to any person obtaining a copy
 6 |     of this software and associated documentation files (the "Software"), to deal
 7 |     in the Software without restriction, including without limitation the rights
 8 |     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 |     copies of the Software, and to permit persons to whom the Software is
10 |     furnished to do so, subject to the following conditions:
11 | 
12 |     The above copyright notice and this permission notice shall be included in all
13 |     copies or substantial portions of the Software.
14 | 
15 |     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 |     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 |     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 |     SOFTWARE
22 | 


--------------------------------------------------------------------------------
/v1/retrievers/ANCE/commands/data_download.sh:
--------------------------------------------------------------------------------
 1 | mkdir -p ../data/raw_data/  # -p: do not fail when the directory already exists
 2 | cd ../data/raw_data/ || exit 1  # stop rather than download everything into the wrong directory
 3 | 
 4 | # download MSMARCO passage data
 5 | wget https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz
 6 | tar -zxvf collectionandqueries.tar.gz
 7 | rm collectionandqueries.tar.gz
 8 | 
 9 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-passagetest2019-top1000.tsv.gz
10 | gunzip msmarco-passagetest2019-top1000.tsv.gz
11 | 
12 | wget https://msmarco.blob.core.windows.net/msmarcoranking/top1000.dev.tar.gz
13 | tar -zxvf top1000.dev.tar.gz
14 | rm top1000.dev.tar.gz
15 | 
16 | wget https://msmarco.blob.core.windows.net/msmarcoranking/triples.train.small.tar.gz
17 | tar -zxvf triples.train.small.tar.gz
18 | rm triples.train.small.tar.gz
19 | 
20 | # download MSMARCO doc data
21 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docs.tsv.gz
22 | gunzip msmarco-docs.tsv.gz
23 | 
24 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-doctrain-queries.tsv.gz
25 | gunzip msmarco-doctrain-queries.tsv.gz
26 | 
27 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-doctrain-qrels.tsv.gz
28 | gunzip msmarco-doctrain-qrels.tsv.gz
29 | 
30 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz
31 | gunzip msmarco-test2019-queries.tsv.gz
32 | 
33 | wget https://trec.nist.gov/data/deep/2019qrels-docs.txt
34 | 
35 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-doctest2019-top100.gz
36 | gunzip msmarco-doctest2019-top100.gz
37 | 
38 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docdev-top100.gz
39 | gunzip msmarco-docdev-top100.gz
40 | 
41 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docdev-queries.tsv.gz
42 | gunzip msmarco-docdev-queries.tsv.gz
43 | 
44 | 
45 | # clone DPR repo and download NQ and TriviaQA datasets
46 | cd ../../../ || exit 1  # leave the raw_data directory; abort if that fails so the clone does not land elsewhere
47 | git clone https://github.com/facebookresearch/DPR
48 | cd DPR || exit 1  # without the checkout the download_data.py calls below cannot run
49 | python data/download_data.py --resource data.wikipedia_split.psgs_w100
50 | python data/download_data.py --resource data.retriever.nq
51 | python data/download_data.py --resource data.retriever.trivia
52 | python data/download_data.py --resource data.retriever.qas.nq
53 | python data/download_data.py --resource data.retriever.qas.trivia
54 | python data/download_data.py --resource checkpoint.retriever.multiset.bert-base-encoder


--------------------------------------------------------------------------------
/v1/retrievers/ANCE/commands/run_ann_data_gen.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # This script generates ANN data for a model in training.
 4 | #
 5 | # For the overall design of the ANN driver, check run_train.sh.
 6 | #
 7 | # This script continuously generates ANN data using the latest model from model_dir.
 8 | # For training, run this script after the initial ANN data is created from run_train.sh.
 9 | # Make sure the parameters used here are consistent with the training script.
10 | 
11 | # # Passage ANCE(FirstP) 
12 | # gpu_no=4
13 | # seq_length=512
14 | # model_type=rdot_nll
15 | # tokenizer_type="roberta-base"
16 | # base_data_dir="../data/raw_data/"
17 | # preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/"
18 | # job_name="OSPass512"
19 | 
20 | 
21 | # # Document ANCE(FirstP) 
22 | # gpu_no=4
23 | # seq_length=512
24 | # model_type=rdot_nll
25 | # tokenizer_type="roberta-base"
26 | # base_data_dir="../data/raw_data/"
27 | # preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/"
28 | # job_name="OSDoc512"
29 | 
30 | # # Document ANCE(MaxP) -- active configuration
31 | gpu_no=4  # passed to the launcher as --nproc_per_node
32 | seq_length=2048  # passed as --max_seq_length and used in the preprocessed data path
33 | model_type=rdot_nll_multi_chunk
34 | tokenizer_type="roberta-base"
35 | base_data_dir="../data/raw_data/"
36 | preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/"
37 | job_name="OSDoc2048"
38 | 
39 | ##################################### Initial ANN Data generation ################################
40 | model_dir="${base_data_dir}${job_name}/"
41 | model_ann_data_dir="${model_dir}ann_data/"
42 | pretrained_checkpoint_dir="warmup checkpoint path"  # placeholder: replace with the warmup checkpoint path; it is expanded unquoted below, so it must not contain spaces
43 | # The command is assembled as a string so it can be echoed before being executed with eval.
44 | initial_data_gen_cmd="\
45 | python -m torch.distributed.launch --nproc_per_node=$gpu_no ../drivers/run_ann_data_gen.py --training_dir $model_dir \
46 | --init_model_dir $pretrained_checkpoint_dir --model_type $model_type --output_dir $model_ann_data_dir \
47 | --cache_dir "${model_ann_data_dir}cache/" --data_dir $preprocessed_data_dir --max_seq_length $seq_length \
48 | --per_gpu_eval_batch_size 16 --topk_training 200 --negative_sample 20 \
49 | "
50 | # Print the assembled command, then run it.
51 | echo $initial_data_gen_cmd
52 | eval $initial_data_gen_cmd
53 | 


--------------------------------------------------------------------------------
/v1/retrievers/ANCE/commands/run_ann_data_gen_dpr.sh:
--------------------------------------------------------------------------------
 1 | # Step 1: tokenization -- runs ../data/DPR_data.py over the DPR wiki, question and answer files
 2 | wiki_dir="../../../DPR/data/wikipedia_split/" # path for psgs_w100.tsv downloaded with DPR code
 3 | ans_dir="../../../DPR/data/retriever/qas/" # path for DPR question&answer csv files
 4 | question_dir="../../../DPR/data/retriever/" # path for DPR training data
 5 | data_type=0 #0 is nq, 1 is trivia, 2 is both
 6 | out_data_dir="../data/QA_NQ_data/" # change this for different data_type
 7 | # The command is assembled as a string so it can be echoed before being executed with eval.
 8 | tokenization_cmd="\
 9 | python ../data/DPR_data.py --wiki_dir $wiki_dir --question_dir $question_dir --data_type $data_type --answer_dir $ans_dir \
10 | --out_data_dir $out_data_dir \
11 | "
12 | # Print the tokenization command, then run it.
13 | echo $tokenization_cmd
14 | eval $tokenization_cmd
15 | 
16 | # Step 2: ANN data generation with ../drivers/run_ann_data_gen_dpr.py
17 | gpu_no=8  # passed to the launcher as --nproc_per_node
18 | 
19 | # model type
20 | model_type="dpr"
21 | seq_length=256
22 | 
23 | # ann parameters
24 | batch_size=16
25 | ann_topk=200
26 | ann_negative_sample=100
27 | 
28 | # input/output directories
29 | base_data_dir="${out_data_dir}"
30 | job_name="ann_NQ_test"
31 | model_dir="${base_data_dir}${job_name}/"
32 | model_ann_data_dir="${model_dir}ann_data/"
33 | pretrained_checkpoint_dir="../../../DPR/checkpoint/retriever/multiset/bert-base-encoder.cp"
34 | passage_path="../../../DPR/data/wikipedia_split/"
35 | test_qa_path="../../../DPR/data/retriever/qas/"
36 | trivia_test_qa_path="../../../DPR/data/retriever/qas/"
37 | 
38 | # NOTE(review): the launcher below is invoked through sudo -- confirm this is required in your environment.
39 | data_gen_cmd="\
40 | sudo python -m torch.distributed.launch --nproc_per_node=$gpu_no ../drivers/run_ann_data_gen_dpr.py --training_dir $model_dir \
41 | --init_model_dir $pretrained_checkpoint_dir --model_type $model_type --output_dir $model_ann_data_dir \
42 | --cache_dir "${model_ann_data_dir}cache/" --data_dir $base_data_dir --max_seq_length $seq_length \
43 | --per_gpu_eval_batch_size $batch_size --topk_training $ann_topk --negative_sample $ann_negative_sample \
44 | --passage_path $passage_path --test_qa_path $test_qa_path --trivia_test_qa_path $trivia_test_qa_path \
45 | "
46 | # Print the data generation command, then run it.
47 | echo $data_gen_cmd
48 | eval $data_gen_cmd


--------------------------------------------------------------------------------
/v1/retrievers/ANCE/commands/run_ann_data_gen_lyz.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # This script generates ANN data for a model in training.
 4 | #
 5 | # For the overall design of the ANN driver, check run_train.sh.
 6 | #
 7 | # This script continuously generates ANN data using the latest model from model_dir.
 8 | # For training, run this script after the initial ANN data is created from run_train.sh.
 9 | # Make sure the parameters used here are consistent with the training script.
10 | 
11 | # # Passage ANCE(FirstP) -- active configuration
12 | gpu_no=4  # passed to the launcher as --nproc_per_node
13 | seq_length=512  # passed as --max_seq_length and used in the preprocessed data path
14 | model_type=rdot_nll
15 | tokenizer_type="roberta-base"
16 | base_data_dir="../data/raw_data/"
17 | preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/"
18 | job_name="OSPass512"  # combined with base_data_dir to form model_dir
19 | 
20 | 
21 | # # Document ANCE(FirstP) 
22 | # gpu_no=4
23 | # seq_length=512
24 | # model_type=rdot_nll
25 | # tokenizer_type="roberta-base"
26 | # base_data_dir="../data/raw_data/"
27 | # preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/"
28 | # job_name="OSDoc512"
29 | 
30 | # # Document ANCE(MaxP)
31 | # gpu_no=4
32 | # seq_length=2048
33 | # model_type=rdot_nll_multi_chunk
34 | # tokenizer_type="roberta-base"
35 | # base_data_dir="../data/raw_data/"
36 | # preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/"
37 | # job_name="OSDoc2048"
38 | 
39 | ##################################### Initial ANN Data generation ################################
40 | model_dir="${base_data_dir}${job_name}/"
41 | model_ann_data_dir="${model_dir}ann_data/"
42 | # pretrained_checkpoint_dir="warmup checkpoint path"
43 | pretrained_checkpoint_dir="../data/msmarco_passage_warmup_checkpoints/checkpoint-420000/"  # passed as --init_model_dir
44 | # The command is assembled as a string so it can be echoed before being executed with eval.
45 | initial_data_gen_cmd="\
46 | python -m torch.distributed.launch --nproc_per_node=$gpu_no ../drivers/run_ann_data_gen.py --training_dir $model_dir \
47 | --init_model_dir $pretrained_checkpoint_dir --model_type $model_type --output_dir $model_ann_data_dir \
48 | --cache_dir "${model_ann_data_dir}cache/" --data_dir $preprocessed_data_dir --max_seq_length $seq_length \
49 | --per_gpu_eval_batch_size 16 --topk_training 200 --negative_sample 20 \
50 | "
51 | # Print the assembled command, then run it.
52 | echo $initial_data_gen_cmd
53 | eval $initial_data_gen_cmd
54 | 


--------------------------------------------------------------------------------
/v1/retrievers/ANCE/commands/run_inference.sh:
--------------------------------------------------------------------------------
 1 | # # Passage ANCE(FirstP) -- active configuration
 2 | gpu_no=4  # passed to the launcher as --nproc_per_node
 3 | seq_length=512  # passed as --max_seq_length and used in the preprocessed data path
 4 | model_type=rdot_nll
 5 | tokenizer_type="roberta-base"
 6 | base_data_dir="../data/raw_data/"
 7 | preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}_dev/"
 8 | job_name="OSPass512"
 9 | pretrained_checkpoint_dir=""  # must be filled in: passed as both --training_dir and --init_model_dir; left empty, those flags receive no value
10 | 
11 | # # Document ANCE(FirstP) 
12 | # gpu_no=4
13 | # seq_length=512
14 | # model_type=rdot_nll
15 | # tokenizer_type="roberta-base"
16 | # base_data_dir="../data/raw_data/"
17 | # preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/"
18 | # job_name="OSDoc512"
19 | # pretrained_checkpoint_dir=""
20 | 
21 | # # Document ANCE(MaxP)
22 | # gpu_no=4
23 | # seq_length=2048
24 | # model_type=rdot_nll_multi_chunk
25 | # tokenizer_type="roberta-base"
26 | # base_data_dir="../data/raw_data/"
27 | # preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/"
28 | # job_name="OSDoc2048"
29 | # pretrained_checkpoint_dir=""
30 | 
31 | ##################################### Inference ################################
32 | model_dir="${base_data_dir}${job_name}/"
33 | model_ann_data_dir="${model_dir}ann_data_inf/"
34 | # The command is assembled as a string so it can be echoed before being executed with eval.
35 | initial_data_gen_cmd="\
36 | python -m torch.distributed.launch --nproc_per_node=$gpu_no ../drivers/run_ann_data_gen.py --training_dir $pretrained_checkpoint_dir \
37 | --init_model_dir $pretrained_checkpoint_dir --model_type $model_type --output_dir $model_ann_data_dir \
38 | --cache_dir "${model_ann_data_dir}cache/" --data_dir $preprocessed_data_dir --max_seq_length $seq_length \
39 | --per_gpu_eval_batch_size 16 --topk_training 200 --negative_sample 20 --end_output_num 0 --inference \
40 | "
41 | # Print the assembled command, then run it.
42 | echo $initial_data_gen_cmd
43 | eval $initial_data_gen_cmd
44 | 


--------------------------------------------------------------------------------
/v1/retrievers/ANCE/commands/run_train_dpr.sh:
--------------------------------------------------------------------------------
 1 | gpu_no=8  # passed to the launcher as --nproc_per_node
 2 | 
 3 | # model type
 4 | model_type="dpr"
 5 | seq_length=256
 6 | triplet="--triplet --optimizer lamb" # set this to empty for non triplet model
 7 | 
 8 | # hyper parameters
 9 | batch_size=16
10 | gradient_accumulation_steps=1
11 | learning_rate=1e-5
12 | warmup_steps=1000
13 | 
14 | # input/output directories
15 | base_data_dir="../data/QA_NQ_data/" 
16 | job_name="ann_NQ_test"
17 | model_dir="${base_data_dir}${job_name}/"
18 | model_ann_data_dir="${model_dir}ann_data/"
19 | pretrained_checkpoint_dir="../../../DPR/checkpoint/retriever/multiset/bert-base-encoder.cp"
20 | # The command is assembled as a string, then echoed and run with eval. DLWS_JOB_ID is not set in this script, so it must come from the environment. NOTE(review): the launcher runs under sudo -- confirm this is required.
21 | train_cmd="\
22 | sudo python -m torch.distributed.launch --nproc_per_node=$gpu_no ../drivers/run_ann_dpr.py --model_type $model_type \
23 | --model_name_or_path $pretrained_checkpoint_dir --task_name MSMarco $triplet --data_dir $base_data_dir \
24 | --ann_dir $model_ann_data_dir --max_seq_length $seq_length --per_gpu_train_batch_size=$batch_size \
25 | --gradient_accumulation_steps $gradient_accumulation_steps --learning_rate $learning_rate --output_dir $model_dir \
26 | --warmup_steps $warmup_steps --logging_steps 100 --save_steps 1000 --log_dir "~/tensorboard/${DLWS_JOB_ID}/logs/${job_name}" \
27 | "
28 | # Print the training command, then run it.
29 | echo $train_cmd
30 | eval $train_cmd
31 | # Keep a copy of this script next to the model output.
32 | echo "copy current script to model directory"
33 | sudo cp $0 $model_dir


--------------------------------------------------------------------------------
/v1/retrievers/ANCE/commands/run_train_warmup.sh:
--------------------------------------------------------------------------------
 1 | # This script trains the warmup checkpoint for ANCE by launching ../drivers/run_warmup.py
 2 | data_dir="../data/raw_data/"
 3 | output_dir=""  # must be filled in: passed as --output_dir; left empty, that flag receives no value
 4 | cmd="python3 -m torch.distributed.launch --nproc_per_node=1 ../drivers/run_warmup.py --train_model_type rdot_nll \
 5 |   --model_name_or_path roberta-base \
 6 |   --task_name MSMarco --do_train --evaluate_during_training --data_dir ${data_dir}  --max_seq_length 128     --per_gpu_eval_batch_size=256 \
 7 |   --per_gpu_train_batch_size=32       --learning_rate 2e-4  --logging_steps 1000   --num_train_epochs 2.0   --output_dir ${output_dir} \
 8 |   --warmup_steps 1000  --overwrite_output_dir --save_steps 30000 --gradient_accumulation_steps 1  --expected_train_size 35000000 --logging_steps_per_eval 20 \
 9 |   --fp16 --optimizer lamb --log_dir ~/tensorboard/${DLWS_JOB_ID}/logs/OSpass "
10 | # DLWS_JOB_ID is not set in this script, so it must come from the environment. Print the command, then run it.
11 | echo $cmd
12 | eval $cmd
13 | 


--------------------------------------------------------------------------------
/v1/retrievers/ANCE/evaluation/convert_trec.py:
--------------------------------------------------------------------------------
 1 | import glob
 2 | import pickle
 3 | import os
 4 | import tqdm
 5 | 
 6 | data_type=1
 7 | test_set=0
 8 | 
 9 | processed_data_dir = "../data/raw_data/ann_data_roberta-base_512/"
10 | trec_save_path = glob.glob(f"data-type-{data_type}_test-set-{test_set}_ckpt-*.trec")
11 | 
12 | with open(os.path.join(processed_data_dir,'qid2offset.pickle'),'rb') as f:
13 |     qid2offset = pickle.load(f)
14 | offset2qid = {}
15 | for k in qid2offset:
16 |     offset2qid[qid2offset[k]]=k
17 | 
18 | with open(os.path.join(processed_data_dir,'pid2offset.pickle'),'rb') as f:
19 |     pid2offset = pickle.load(f)
20 | offset2pid = {}
21 | for k in pid2offset:
22 |     offset2pid[pid2offset[k]]=k
23 | 
24 | 
25 | #for k in offset2qid:
26 | #    print(k,offset2qid[k])
27 | 
28 | for path in tqdm.tqdm(trec_save_path):
29 |     with open(path) as f:
30 |         lines=f.readlines()
31 |     with open(path.replace(".trec",".formatted.trec"),"w") as f:
32 |         for line in lines:
33 |             qid , Q0, pid, rank, score, tag = line.strip().split(' ')
34 |             # print(offset2qid[int(qid)] , Q0, pid, rank, score.replace('-',''), tag)
35 |             if data_type==0:
36 |                 f.write(f"{offset2qid[int(qid)]} {Q0} D{offset2pid[int(pid)]} {rank} {score.replace('-','')} {tag}\n")
37 |             else:
38 |                 f.write(f"{offset2qid[int(qid)]} {Q0} {offset2pid[int(pid)]} {rank} {score.replace('-','')} {tag}\n")
39 | #    break
40 | 


--------------------------------------------------------------------------------
/v1/retrievers/ANCE/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | with open('README.md') as f:
 4 |     readme = f.read()
 5 | 
 6 | setup(
 7 |    name='ANCE',
 8 |    version='0.1.0',
 9 |    description='Approximate Nearest Neighbor Negative Contrastive Learning for Dense Text Retrieval',
10 |    url='https://github.com/microsoft/ANCE',
11 |    classifiers=[
12 |         'Intended Audience :: Science/Research',
13 |         'License :: OSI Approved :: MIT License',
14 |         'Programming Language :: Python :: 3.6',
15 |         'Topic :: Scientific/Engineering :: Artificial Intelligence',
16 |     ],
17 |    license="MIT",
18 |    long_description=readme,
19 |    install_requires=[
20 |         'transformers==2.3.0', 
21 |         'pytrec-eval',
22 |         'faiss-cpu',
23 |         'wget'
24 |     ],
25 | )


--------------------------------------------------------------------------------
/v1/retrievers/DANCE/ANCE_setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup
 2 | 
 3 | with open('README.md') as f:
 4 |     readme = f.read()
 5 | 
 6 | setup(
 7 |    name='ANCE',
 8 |    version='0.1.0',
 9 |    description='Approximate Nearest Neighbor Negative Contrastive Learning for Dense Text Retrieval',
10 |    url='https://github.com/microsoft/ANCE',
11 |    classifiers=[
12 |         'Intended Audience :: Science/Research',
13 |         'License :: OSI Approved :: MIT License',
14 |         'Programming Language :: Python :: 3.6',
15 |         'Topic :: Scientific/Engineering :: Artificial Intelligence',
16 |     ],
17 |    license="MIT",
18 |    long_description=readme,
19 |    install_requires=[
20 |         'transformers==2.3.0', 
21 |         'pytrec-eval',
22 |         'wget'
23 |     ],
24 | )


--------------------------------------------------------------------------------
/v1/retrievers/DANCE/commands/data_download.sh:
--------------------------------------------------------------------------------
 1 | mkdir -p ../data/raw_data/  # -p: do not fail when the directory already exists
 2 | cd ../data/raw_data/ || exit 1  # stop rather than download everything into the wrong directory
 3 | 
 4 | # # download MSMARCO passage data
 5 | # wget https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz
 6 | # tar -zxvf collectionandqueries.tar.gz
 7 | # rm collectionandqueries.tar.gz
 8 | 
 9 | # wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-passagetest2019-top1000.tsv.gz
10 | # gunzip msmarco-passagetest2019-top1000.tsv.gz
11 | 
12 | # wget https://msmarco.blob.core.windows.net/msmarcoranking/top1000.dev.tar.gz
13 | # tar -zxvf top1000.dev.tar.gz
14 | # rm top1000.dev.tar.gz
15 | 
16 | # wget https://msmarco.blob.core.windows.net/msmarcoranking/triples.train.small.tar.gz
17 | # tar -zxvf triples.train.small.tar.gz
18 | # rm triples.train.small.tar.gz
19 | 
20 | # download MSMARCO doc data
21 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docs.tsv.gz
22 | gunzip msmarco-docs.tsv.gz
23 | 
24 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-doctrain-queries.tsv.gz
25 | gunzip msmarco-doctrain-queries.tsv.gz
26 | 
27 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-doctrain-qrels.tsv.gz
28 | gunzip msmarco-doctrain-qrels.tsv.gz
29 | 
30 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz
31 | gunzip msmarco-test2019-queries.tsv.gz
32 | 
33 | wget https://trec.nist.gov/data/deep/2019qrels-docs.txt
34 | 
35 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-doctest2019-top100.gz
36 | gunzip msmarco-doctest2019-top100.gz
37 | 
38 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docdev-top100.gz
39 | gunzip msmarco-docdev-top100.gz
40 | 
41 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docdev-queries.tsv.gz
42 | gunzip msmarco-docdev-queries.tsv.gz
43 | 
44 | 
45 | # # clone DPR repo and download NQ and TriviaQA datasets
46 | # cd ../../../
47 | # git clone https://github.com/facebookresearch/DPR
48 | # cd DPR
49 | # python data/download_data.py --resource data.wikipedia_split.psgs_w100
50 | # python data/download_data.py --resource data.retriever.nq
51 | # python data/download_data.py --resource data.retriever.trivia
52 | # python data/download_data.py --resource data.retriever.qas.nq
53 | # python data/download_data.py --resource data.retriever.qas.trivia
54 | # python data/download_data.py --resource checkpoint.retriever.multiset.bert-base-encoder


--------------------------------------------------------------------------------
/v1/retrievers/DANCE/commands/run_ann_data_gen.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # This script generates ANN data for a model in training.
 4 | #
 5 | # For the overall design of the ANN driver, check run_train.sh.
 6 | #
 7 | # This script continuously generates ANN data using the latest model from model_dir.
 8 | # For training, run this script after the initial ANN data is created from run_train.sh.
 9 | # Make sure the parameters used here are consistent with the training script.
10 | 
11 | # # Passage ANCE(FirstP) 
12 | # gpu_no=4
13 | # seq_length=512
14 | # model_type=rdot_nll
15 | # tokenizer_type="roberta-base"
16 | # base_data_dir="../data/raw_data/"
17 | # preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/"
18 | # job_name="OSPass512"
19 | 
20 | 
21 | # # Document ANCE(FirstP) 
22 | # gpu_no=4
23 | # seq_length=512
24 | # model_type=rdot_nll
25 | # tokenizer_type="roberta-base"
26 | # base_data_dir="../data/raw_data/"
27 | # preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/"
28 | # job_name="OSDoc512"
29 | 
30 | # # Document ANCE(MaxP) -- active configuration
31 | gpu_no=4  # passed to the launcher as --nproc_per_node
32 | seq_length=2048  # passed as --max_seq_length and used in the preprocessed data path
33 | model_type=rdot_nll_multi_chunk
34 | tokenizer_type="roberta-base"
35 | base_data_dir="../data/raw_data/"
36 | preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/"
37 | job_name="OSDoc2048"
38 | 
39 | ##################################### Initial ANN Data generation ################################
40 | model_dir="${base_data_dir}${job_name}/"
41 | model_ann_data_dir="${model_dir}ann_data/"
42 | pretrained_checkpoint_dir="warmup checkpoint path"  # placeholder: replace with the warmup checkpoint path; it is expanded unquoted below, so it must not contain spaces
43 | # The command is assembled as a string so it can be echoed before being executed with eval.
44 | initial_data_gen_cmd="\
45 | python -m torch.distributed.launch --nproc_per_node=$gpu_no ../drivers/run_ann_data_gen.py --training_dir $model_dir \
46 | --init_model_dir $pretrained_checkpoint_dir --model_type $model_type --output_dir $model_ann_data_dir \
47 | --cache_dir "${model_ann_data_dir}cache/" --data_dir $preprocessed_data_dir --max_seq_length $seq_length \
48 | --per_gpu_eval_batch_size 16 --topk_training 200 --negative_sample 20 \
49 | "
50 | # Print the assembled command, then run it.
51 | echo $initial_data_gen_cmd
52 | eval $initial_data_gen_cmd
53 | 


--------------------------------------------------------------------------------
/v1/retrievers/DANCE/commands/run_ann_data_gen_dpr.sh:
--------------------------------------------------------------------------------
 1 | # Step 1: tokenization -- runs ../data/DPR_data.py over the DPR wiki, question and answer files
 2 | wiki_dir="../../../DPR/data/wikipedia_split/" # path for psgs_w100.tsv downloaded with DPR code
 3 | ans_dir="../../../DPR/data/retriever/qas/" # path for DPR question&answer csv files
 4 | question_dir="../../../DPR/data/retriever/" # path for DPR training data
 5 | data_type=0 #0 is nq, 1 is trivia, 2 is both
 6 | out_data_dir="../data/QA_NQ_data/" # change this for different data_type
 7 | # The command is assembled as a string so it can be echoed before being executed with eval.
 8 | tokenization_cmd="\
 9 | python ../data/DPR_data.py --wiki_dir $wiki_dir --question_dir $question_dir --data_type $data_type --answer_dir $ans_dir \
10 | --out_data_dir $out_data_dir \
11 | "
12 | # Print the tokenization command, then run it.
13 | echo $tokenization_cmd
14 | eval $tokenization_cmd
15 | 
16 | # Step 2: ANN data generation with ../drivers/run_ann_data_gen_dpr.py
17 | gpu_no=8  # passed to the launcher as --nproc_per_node
18 | 
19 | # model type
20 | model_type="dpr"
21 | seq_length=256
22 | 
23 | # ann parameters
24 | batch_size=16
25 | ann_topk=200
26 | ann_negative_sample=100
27 | 
28 | # input/output directories
29 | base_data_dir="${out_data_dir}"
30 | job_name="ann_NQ_test"
31 | model_dir="${base_data_dir}${job_name}/"
32 | model_ann_data_dir="${model_dir}ann_data/"
33 | pretrained_checkpoint_dir="../../../DPR/checkpoint/retriever/multiset/bert-base-encoder.cp"
34 | passage_path="../../../DPR/data/wikipedia_split/"
35 | test_qa_path="../../../DPR/data/retriever/qas/"
36 | trivia_test_qa_path="../../../DPR/data/retriever/qas/"
37 | 
38 | # NOTE(review): the launcher below is invoked through sudo -- confirm this is required in your environment.
39 | data_gen_cmd="\
40 | sudo python -m torch.distributed.launch --nproc_per_node=$gpu_no ../drivers/run_ann_data_gen_dpr.py --training_dir $model_dir \
41 | --init_model_dir $pretrained_checkpoint_dir --model_type $model_type --output_dir $model_ann_data_dir \
42 | --cache_dir "${model_ann_data_dir}cache/" --data_dir $base_data_dir --max_seq_length $seq_length \
43 | --per_gpu_eval_batch_size $batch_size --topk_training $ann_topk --negative_sample $ann_negative_sample \
44 | --passage_path $passage_path --test_qa_path $test_qa_path --trivia_test_qa_path $trivia_test_qa_path \
45 | "
46 | # Print the data generation command, then run it.
47 | echo $data_gen_cmd
48 | eval $data_gen_cmd


--------------------------------------------------------------------------------
/v1/retrievers/DANCE/commands/run_inference.sh:
--------------------------------------------------------------------------------
 1 | # # Passage ANCE(FirstP) -- the active configuration; the two Document configurations below are commented out
 2 | gpu_no=4 # passed as --nproc_per_node to torch.distributed.launch
 3 | seq_length=512 # passed as --max_seq_length; also part of the preprocessed_data_dir name
 4 | model_type=rdot_nll
 5 | tokenizer_type="roberta-base"
 6 | base_data_dir="../data/raw_data/"
 7 | preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}_dev/" # passed as --data_dir
 8 | job_name="OSPass512"
 9 | pretrained_checkpoint_dir="" # NOTE(review): empty here; passed as both --training_dir and --init_model_dir, presumably must be set before running
10 | 
11 | # # Document ANCE(FirstP) 
12 | # gpu_no=4
13 | # seq_length=512
14 | # model_type=rdot_nll
15 | # tokenizer_type="roberta-base"
16 | # base_data_dir="../data/raw_data/"
17 | # preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/"
18 | # job_name="OSDoc512"
19 | # pretrained_checkpoint_dir=""
20 | 
21 | # # Document ANCE(MaxP)
22 | # gpu_no=4
23 | # seq_length=2048
24 | # model_type=rdot_nll_multi_chunk
25 | # tokenizer_type="roberta-base"
26 | # base_data_dir="../data/raw_data/"
27 | # preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/"
28 | # job_name="OSDoc2048"
29 | # pretrained_checkpoint_dir=""
30 | 
31 | ##################################### Inference ################################
32 | model_dir="${base_data_dir}${job_name}/" # only used to derive model_ann_data_dir
33 | model_ann_data_dir="${model_dir}ann_data_inf/" # passed as --output_dir; its cache/ subdirectory is passed as --cache_dir
34 | # Assemble the ../drivers/run_ann_data_gen.py launch command with the --inference flag set
35 | initial_data_gen_cmd="\
36 | python -m torch.distributed.launch --nproc_per_node=$gpu_no ../drivers/run_ann_data_gen.py --training_dir $pretrained_checkpoint_dir \
37 | --init_model_dir $pretrained_checkpoint_dir --model_type $model_type --output_dir $model_ann_data_dir \
38 | --cache_dir "${model_ann_data_dir}cache/" --data_dir $preprocessed_data_dir --max_seq_length $seq_length \
39 | --per_gpu_eval_batch_size 16 --topk_training 200 --negative_sample 20 --end_output_num 0 --inference \
40 | "
41 | 
42 | echo $initial_data_gen_cmd # print the assembled command before running it
43 | eval $initial_data_gen_cmd # run the command string through the shell
44 | 


--------------------------------------------------------------------------------
/v1/retrievers/DANCE/commands/run_train_dpr.sh:
--------------------------------------------------------------------------------
 1 | gpu_no=8 # passed as --nproc_per_node to torch.distributed.launch
 2 | 
 3 | # model type and the value passed as --max_seq_length
 4 | model_type="dpr"
 5 | seq_length=256
 6 | triplet="--triplet --optimizer lamb" # set this to empty for non triplet model
 7 | 
 8 | # hyper parameters (forwarded to run_ann_dpr.py in train_cmd below)
 9 | batch_size=16
10 | gradient_accumulation_steps=1
11 | learning_rate=1e-5
12 | warmup_steps=1000
13 | 
14 | # input/output directories
15 | base_data_dir="../data/QA_NQ_data/" # passed as --data_dir
16 | job_name="ann_NQ_test"
17 | model_dir="${base_data_dir}${job_name}/" # passed as --output_dir; this script is copied there at the end
18 | model_ann_data_dir="${model_dir}ann_data/" # passed as --ann_dir
19 | pretrained_checkpoint_dir="../../../DPR/checkpoint/retriever/multiset/bert-base-encoder.cp" # passed as --model_name_or_path
20 | # Assemble the ../drivers/run_ann_dpr.py training command (launched under sudo via torch.distributed.launch)
21 | train_cmd="\
22 | sudo python -m torch.distributed.launch --nproc_per_node=$gpu_no ../drivers/run_ann_dpr.py --model_type $model_type \
23 | --model_name_or_path $pretrained_checkpoint_dir --task_name MSMarco $triplet --data_dir $base_data_dir \
24 | --ann_dir $model_ann_data_dir --max_seq_length $seq_length --per_gpu_train_batch_size=$batch_size \
25 | --gradient_accumulation_steps $gradient_accumulation_steps --learning_rate $learning_rate --output_dir $model_dir \
26 | --warmup_steps $warmup_steps --logging_steps 100 --save_steps 1000 --log_dir "~/tensorboard/${DLWS_JOB_ID}/logs/${job_name}" \
27 | "
28 | # NOTE(review): DLWS_JOB_ID is not set in this script; presumably it comes from the job environment -- confirm
29 | echo $train_cmd # print the assembled command before running it
30 | eval $train_cmd # run the command string through the shell
31 | 
32 | echo "copy current script to model directory"
33 | sudo cp $0 $model_dir # $0 is this script's path as invoked


--------------------------------------------------------------------------------
/v1/retrievers/DANCE/commands/run_train_warmup.sh:
--------------------------------------------------------------------------------
 1 | # This script trains the warmup checkpoint for ANCE by launching ../drivers/run_warmup.py
 2 | data_dir="../data/raw_data/" # passed as --data_dir
 3 | output_dir="" # NOTE(review): empty here; passed as --output_dir, presumably must be set before running
 4 | cmd="python3 -m torch.distributed.launch --nproc_per_node=1 ../drivers/run_warmup.py --train_model_type rdot_nll \
 5 |   --model_name_or_path roberta-base \
 6 |   --task_name MSMarco --do_train --evaluate_during_training --data_dir ${data_dir}  --max_seq_length 128     --per_gpu_eval_batch_size=256 \
 7 |   --per_gpu_train_batch_size=32       --learning_rate 2e-4  --logging_steps 1000   --num_train_epochs 2.0   --output_dir ${output_dir} \
 8 |   --warmup_steps 1000  --overwrite_output_dir --save_steps 30000 --gradient_accumulation_steps 1  --expected_train_size 35000000 --logging_steps_per_eval 20 \
 9 |   --fp16 --optimizer lamb --log_dir ~/tensorboard/${DLWS_JOB_ID}/logs/OSpass "
10 | # NOTE(review): DLWS_JOB_ID is not set in this script; presumably it comes from the job environment -- confirm
11 | echo $cmd # print the assembled command before running it
12 | eval $cmd # run the command string through the shell
13 | 


--------------------------------------------------------------------------------
/v1/retrievers/README.md:
--------------------------------------------------------------------------------
 1 | # Document Retrieval
 2 | The BM25 retriever follows [anserini](https://github.com/castorini/anserini), and the ANN retriever follows [ANCE](https://github.com/microsoft/ANCE).
 3 | 
 4 | ## BM25 Guide
 5 | ### MS MARCO Doc Ranking Examples
 6 | First, download [msmarco-docs.tsv](https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docs.tsv.gz) and [msmarco-docdev-queries.tsv](https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docdev-queries.tsv.gz), then convert *msmarco-docs.tsv* to jsonl format, writing one *{'id': doc_id, 'contents': doc}* object per line, and save the result to *collections/msmarco/msmarco-docs.jsonl*.
 7 | 
 8 | Then build the BM25 index and run the search:
 9 | ```
10 | ./bm25_retriever/bin/IndexCollection -collection JsonCollection -input ./collections/msmarco -index index-msmarco -generator LuceneDocumentGenerator -threads 8 -storePositions -storeDocvectors -storeRawDocs
11 | ./bm25_retriever/bin/SearchCollection -index index-msmarco -topicreader TsvString -topics msmarco-docdev-queries.tsv -bm25 -output msmarco-doc.txt
12 | ```
13 | 
14 | ## ANCE Guide
15 | The guide to ANCE training and inference is available at [ance](./openmatch_ance_retriver_readme.md).
16 | 


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/HdrHistogram-2.1.9.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/HdrHistogram-2.1.9.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/aggs-matrix-stats-client-7.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/aggs-matrix-stats-client-7.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/annotations-java5-19.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/annotations-java5-19.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/anserini-0.7.3-SNAPSHOT.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/anserini-0.7.3-SNAPSHOT.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/anserini-fastutil-6.5.6.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/anserini-fastutil-6.5.6.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/ant-1.9.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/ant-1.9.1.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/ant-launcher-1.9.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/ant-launcher-1.9.1.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/args4j-2.32.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/args4j-2.32.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/cbor-0.7.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/cbor-0.7.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/commons-codec-1.11.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/commons-codec-1.11.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/commons-compress-1.18.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/commons-compress-1.18.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/commons-io-2.5.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/commons-io-2.5.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/commons-lang3-3.5.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/commons-lang3-3.5.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/commons-logging-1.1.3.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/commons-logging-1.1.3.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/commons-math3-3.6.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/commons-math3-3.6.1.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/commons-pool2-2.6.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/commons-pool2-2.6.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/compiler-0.9.3.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/compiler-0.9.3.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/elasticsearch-7.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/elasticsearch-7.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/elasticsearch-cli-7.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/elasticsearch-cli-7.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/elasticsearch-core-7.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/elasticsearch-core-7.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/elasticsearch-geo-7.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/elasticsearch-geo-7.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/elasticsearch-rest-client-7.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/elasticsearch-rest-client-7.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/elasticsearch-rest-high-level-client-7.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/elasticsearch-rest-high-level-client-7.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/elasticsearch-secure-sm-7.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/elasticsearch-secure-sm-7.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/elasticsearch-x-content-7.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/elasticsearch-x-content-7.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/guava-18.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/guava-18.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/hppc-0.7.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/hppc-0.7.1.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/http2-client-9.4.19.v20190610.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/http2-client-9.4.19.v20190610.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/http2-common-9.4.19.v20190610.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/http2-common-9.4.19.v20190610.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/http2-hpack-9.4.19.v20190610.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/http2-hpack-9.4.19.v20190610.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/http2-http-client-transport-9.4.19.v20190610.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/http2-http-client-transport-9.4.19.v20190610.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/httpasyncclient-4.1.4.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/httpasyncclient-4.1.4.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/httpclient-4.5.6.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/httpclient-4.5.6.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/httpcore-4.4.10.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/httpcore-4.4.10.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/httpcore-nio-4.4.11.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/httpcore-nio-4.4.11.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/httpmime-4.5.6.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/httpmime-4.5.6.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/jackson-annotations-2.10.0.pr1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jackson-annotations-2.10.0.pr1.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/jackson-core-2.10.0.pr1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jackson-core-2.10.0.pr1.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/jackson-databind-2.10.0.pr1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jackson-databind-2.10.0.pr1.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/jackson-dataformat-cbor-2.8.11.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jackson-dataformat-cbor-2.8.11.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/jackson-dataformat-smile-2.8.11.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jackson-dataformat-smile-2.8.11.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/jackson-dataformat-yaml-2.10.0.pr1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jackson-dataformat-yaml-2.10.0.pr1.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/jackson-datatype-jdk8-2.10.0.pr1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jackson-datatype-jdk8-2.10.0.pr1.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/jcl-over-slf4j-1.7.24.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jcl-over-slf4j-1.7.24.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/jetty-alpn-client-9.4.19.v20190610.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jetty-alpn-client-9.4.19.v20190610.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/jetty-alpn-java-client-9.4.19.v20190610.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jetty-alpn-java-client-9.4.19.v20190610.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/jetty-client-9.4.19.v20190610.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jetty-client-9.4.19.v20190610.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/jetty-http-9.4.19.v20190610.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jetty-http-9.4.19.v20190610.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/jetty-io-9.4.19.v20190610.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jetty-io-9.4.19.v20190610.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/jetty-util-9.4.19.v20190610.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jetty-util-9.4.19.v20190610.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/jna-4.5.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jna-4.5.1.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/joda-time-2.10.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/joda-time-2.10.1.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/jopt-simple-5.0.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jopt-simple-5.0.2.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/jsoup-1.8.3.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jsoup-1.8.3.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/jsr305-2.0.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jsr305-2.0.1.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/lang-mustache-client-7.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lang-mustache-client-7.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/log4j-api-2.12.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/log4j-api-2.12.1.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/log4j-core-2.12.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/log4j-core-2.12.1.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/lucene-analyzers-common-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-analyzers-common-8.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/lucene-backward-codecs-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-backward-codecs-8.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/lucene-core-8.3.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-core-8.3.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/lucene-grouping-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-grouping-8.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/lucene-highlighter-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-highlighter-8.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/lucene-join-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-join-8.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/lucene-memory-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-memory-8.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/lucene-misc-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-misc-8.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/lucene-queries-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-queries-8.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/lucene-queryparser-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-queryparser-8.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/lucene-sandbox-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-sandbox-8.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/lucene-spatial-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-spatial-8.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/lucene-spatial-extras-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-spatial-extras-8.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/lucene-spatial3d-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-spatial3d-8.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/lucene-suggest-8.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-suggest-8.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/mockito-all-1.10.19.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/mockito-all-1.10.19.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/netty-buffer-4.1.29.Final.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/netty-buffer-4.1.29.Final.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/netty-codec-4.1.29.Final.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/netty-codec-4.1.29.Final.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/netty-common-4.1.29.Final.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/netty-common-4.1.29.Final.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/netty-handler-4.1.29.Final.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/netty-handler-4.1.29.Final.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/netty-resolver-4.1.29.Final.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/netty-resolver-4.1.29.Final.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/netty-transport-4.1.29.Final.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/netty-transport-4.1.29.Final.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/netty-transport-native-epoll-4.1.29.Final.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/netty-transport-native-epoll-4.1.29.Final.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/netty-transport-native-unix-common-4.1.29.Final.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/netty-transport-native-unix-common-4.1.29.Final.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/parent-join-client-7.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/parent-join-client-7.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/rank-eval-client-7.0.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/rank-eval-client-7.0.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/sesame-model-4.1.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/sesame-model-4.1.2.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/sesame-rio-api-4.1.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/sesame-rio-api-4.1.2.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/sesame-rio-datatypes-4.1.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/sesame-rio-datatypes-4.1.2.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/sesame-rio-languages-4.1.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/sesame-rio-languages-4.1.2.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/sesame-rio-ntriples-4.1.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/sesame-rio-ntriples-4.1.2.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/sesame-util-4.1.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/sesame-util-4.1.2.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/slf4j-api-1.7.24.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/slf4j-api-1.7.24.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/slf4j-simple-1.7.29.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/slf4j-simple-1.7.29.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/snakeyaml-1.24.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/snakeyaml-1.24.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/solr-solrj-8.3.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/solr-solrj-8.3.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/stax2-api-3.1.4.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/stax2-api-3.1.4.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/t-digest-3.2.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/t-digest-3.2.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/trec-car-tools-java-13.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/trec-car-tools-java-13.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/twitter-text-2.0.10.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/twitter-text-2.0.10.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/wdtk-datamodel-0.10.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/wdtk-datamodel-0.10.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/wdtk-dumpfiles-0.10.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/wdtk-dumpfiles-0.10.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/wdtk-storage-0.10.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/wdtk-storage-0.10.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/wdtk-util-0.10.0.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/wdtk-util-0.10.0.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/wikiclean-1.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/wikiclean-1.1.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/woodstox-core-asl-4.4.1.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/woodstox-core-asl-4.4.1.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/xz-1.5.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/xz-1.5.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/zookeeper-3.5.5.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/zookeeper-3.5.5.jar


--------------------------------------------------------------------------------
/v1/retrievers/bm25_retriever/repo/zookeeper-jute-3.5.5.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/zookeeper-jute-3.5.5.jar


--------------------------------------------------------------------------------
/v1/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | install_requires = [
 4 |     'torch==1.4.0',
 5 |     'transformers==2.8.0',
 6 |     'faiss-cpu==1.6.3',
 7 |     'nltk==3.5',
 8 |     'pytrec_eval==0.4'
 9 | ]
10 | 
11 | setup(
12 |     name="OpenMatch",
13 |     version="0.0.1",
14 |     author="OpenMatch Authors",
15 |     author_email='zkt18{at}mails.tsinghua.edu.cn',
16 |     description="An Open Source Package for Information Retrieval",
17 |     packages=find_packages(),
18 |     install_requires=install_requires,
19 |     python_requires='>=3.6'
20 | )
21 | 


--------------------------------------------------------------------------------
/v1/train.sh:
--------------------------------------------------------------------------------
 1 | # Launch train.py for the ranking task with the cknrm model on the toy data
 2 | # files under ./data. CUDA_VISIBLE_DEVICES=0 applies to this command only.
 3 | CUDA_VISIBLE_DEVICES=0 python train.py \
 4 |     -task ranking \
 5 |     -model cknrm \
 6 |     -train ./data/train_toy.jsonl \
 7 |     -max_input 1280000 \
 8 |     -save ./checkpoints/cknrm.bin \
 9 |     -dev ./data/dev_toy.jsonl \
10 |     -qrels ./data/qrels_toy \
11 |     -vocab ./data/glove.6B.300d.txt \
12 |     -res ./results/cknrm.trec \
13 |     -metric ndcg_cut_10 \
14 |     -n_kernels 21 \
15 |     -max_query_len 10 \
16 |     -max_doc_len 150 \
17 |     -epoch 2 \
18 |     -batch_size 32 \
19 |     -lr 1e-3 \
20 |     -eval_every 10
21 | 


--------------------------------------------------------------------------------
/v1/train_bert.sh:
--------------------------------------------------------------------------------
 1 | # Launch train.py for the classification task with the bert model
 2 | # (bert-base-uncased). CUDA_VISIBLE_DEVICES=0 applies to this command only.
 3 | CUDA_VISIBLE_DEVICES=0 python train.py \
 4 |     -task classification \
 5 |     -model bert \
 6 |     -train ./data/train_clas_toy.jsonl \
 7 |     -max_input 1280000 \
 8 |     -save ./checkpoints/bert.bin \
 9 |     -dev ./data/dev_toy.jsonl \
10 |     -qrels ./data/qrels_toy \
11 |     -vocab bert-base-uncased \
12 |     -pretrain bert-base-uncased \
13 |     -res ./results/bert.trec \
14 |     -metric ndcg_cut_10 \
15 |     -max_query_len 32 \
16 |     -max_doc_len 256 \
17 |     -epoch 1 \
18 |     -batch_size 4 \
19 |     -lr 2e-5 \
20 |     -eval_every 10
21 | 


--------------------------------------------------------------------------------
/v1/train_bert_dist.sh:
--------------------------------------------------------------------------------
 1 | # Launch distributed BERT ranking training through torch.distributed.launch.
 2 | #
 3 | #   CUDA_VISIBLE_DEVICES  sets the visible CUDA GPUs
 4 | #   --nproc_per_node      must equal the number of GPUs used
 5 | #   -train                takes a comma-separated key=path list (queries, docs,
 6 | #                         qrels, trec); do not use the single json file here
 7 | #
 8 | # All notes live in this header on purpose: text after a trailing backslash,
 9 | # or a comment line inside the continued command, cuts the command short.
10 | CUDA_VISIBLE_DEVICES=0,1,2,3 \
11 | python -u -m torch.distributed.launch \
12 | --nproc_per_node=4 \
13 | --master_port=12345 train.py \
14 |         -task ranking \
15 |         -model bert \
16 |         -train queries=/path/to/queries.tsv,docs=/path/to/docs.tsv,qrels=/path/to/qrels.tsv,trec=/path/to/trec.tsv \
17 |         -max_input 1280000 \
18 |         -save ./checkpoints/bert.bin \
19 |         -dev ./data/dev_toy.jsonl \
20 |         -qrels ./data/qrels_toy \
21 |         -vocab bert-base-uncased \
22 |         -pretrain bert-base-uncased \
23 |         -res ./results/bert.trec \
24 |         -metric ndcg_cut_10 \
25 |         -max_query_len 32 \
26 |         -max_doc_len 256 \
27 |         -epoch 1 \
28 |         -batch_size 4 \
29 |         -lr 2e-5 \
30 |         -eval_every 100 \
31 |         -optimizer adamw \
32 |         -dev_eval_batch_size 128 \
33 |         -gradient_accumulation_steps 4 \
34 |         -n_warmup_steps 10000 \
35 |         -logging_step 100
36 | 


--------------------------------------------------------------------------------