├── .gitignore ├── LICENSE ├── README.md ├── docs ├── dr-msmarco-passage.md ├── rr-msmarco-passage.md └── scale-t5-weights.md ├── pyproject.toml ├── scripts ├── evaluate.py ├── gtr │ └── convert_sbert_ckpt.ipynb ├── kilt-dpr │ ├── convert_to_evaluation.py │ └── convert_trec_to_provenance.py ├── msmarco │ ├── build_hn.py │ └── build_train.py ├── nq-dpr │ └── build_train.py ├── scale_t5_weights.py └── split_embeddings.py ├── setup.py ├── src └── openmatch │ ├── __init__.py │ ├── arguments.py │ ├── dataset │ ├── __init__.py │ ├── beir_dataset.py │ ├── data_collator.py │ ├── inference_dataset.py │ └── train_dataset.py │ ├── driver │ ├── build_index.py │ ├── rerank.py │ ├── retrieve.py │ ├── retrieve_beir.py │ ├── successive_retrieve.py │ ├── train_dr.py │ └── train_rr.py │ ├── loss.py │ ├── modeling │ ├── __init__.py │ ├── dense_retrieval_model.py │ ├── linear.py │ └── reranking_model.py │ ├── retriever │ ├── __init__.py │ ├── dense_retriever.py │ └── reranker.py │ ├── trainer │ ├── __init__.py │ ├── dense_trainer.py │ └── reranker_trainer.py │ └── utils.py └── v1 ├── Contrastive_Supervision_Synthesis ├── bm25_retriever │ ├── bin │ │ ├── ApproximateNearestNeighborEval │ │ ├── ApproximateNearestNeighborEval.bat │ │ ├── ApproximateNearestNeighborSearch │ │ ├── ApproximateNearestNeighborSearch.bat │ │ ├── DumpAnalyzedQueries │ │ ├── DumpAnalyzedQueries.bat │ │ ├── ExtractAverageDocumentLength │ │ ├── ExtractAverageDocumentLength.bat │ │ ├── ExtractDocumentLengths │ │ ├── ExtractDocumentLengths.bat │ │ ├── ExtractNorms │ │ ├── ExtractNorms.bat │ │ ├── FeatureExtractorCli │ │ ├── FeatureExtractorCli.bat │ │ ├── IndexCollection │ │ ├── IndexCollection.bat │ │ ├── IndexUtils │ │ ├── IndexUtils.bat │ │ ├── IndexVectors │ │ ├── IndexVectors.bat │ │ ├── SearchCollection │ │ ├── SearchCollection.bat │ │ ├── SearchElastic │ │ ├── SearchElastic.bat │ │ ├── SearchMsmarco │ │ ├── SearchMsmarco.bat │ │ ├── SearchSolr │ │ └── SearchSolr.bat │ ├── build_index.sh │ ├── repo │ │ ├── 
HdrHistogram-2.1.9.jar │ │ ├── aggs-matrix-stats-client-7.0.0.jar │ │ ├── annotations-java5-19.0.0.jar │ │ ├── anserini-0.7.3-SNAPSHOT.jar │ │ ├── anserini-fastutil-6.5.6.jar │ │ ├── ant-1.9.1.jar │ │ ├── ant-launcher-1.9.1.jar │ │ ├── args4j-2.32.jar │ │ ├── cbor-0.7.jar │ │ ├── commons-codec-1.11.jar │ │ ├── commons-compress-1.18.jar │ │ ├── commons-io-2.5.jar │ │ ├── commons-lang3-3.5.jar │ │ ├── commons-logging-1.1.3.jar │ │ ├── commons-math3-3.6.1.jar │ │ ├── commons-pool2-2.6.0.jar │ │ ├── compiler-0.9.3.jar │ │ ├── elasticsearch-7.0.0.jar │ │ ├── elasticsearch-cli-7.0.0.jar │ │ ├── elasticsearch-core-7.0.0.jar │ │ ├── elasticsearch-geo-7.0.0.jar │ │ ├── elasticsearch-rest-client-7.0.0.jar │ │ ├── elasticsearch-rest-high-level-client-7.0.0.jar │ │ ├── elasticsearch-secure-sm-7.0.0.jar │ │ ├── elasticsearch-x-content-7.0.0.jar │ │ ├── guava-18.0.jar │ │ ├── hppc-0.7.1.jar │ │ ├── http2-client-9.4.19.v20190610.jar │ │ ├── http2-common-9.4.19.v20190610.jar │ │ ├── http2-hpack-9.4.19.v20190610.jar │ │ ├── http2-http-client-transport-9.4.19.v20190610.jar │ │ ├── httpasyncclient-4.1.4.jar │ │ ├── httpclient-4.5.6.jar │ │ ├── httpcore-4.4.10.jar │ │ ├── httpcore-nio-4.4.11.jar │ │ ├── httpmime-4.5.6.jar │ │ ├── jackson-annotations-2.10.0.pr1.jar │ │ ├── jackson-core-2.10.0.pr1.jar │ │ ├── jackson-databind-2.10.0.pr1.jar │ │ ├── jackson-dataformat-cbor-2.8.11.jar │ │ ├── jackson-dataformat-smile-2.8.11.jar │ │ ├── jackson-dataformat-yaml-2.10.0.pr1.jar │ │ ├── jackson-datatype-jdk8-2.10.0.pr1.jar │ │ ├── jcl-over-slf4j-1.7.24.jar │ │ ├── jetty-alpn-client-9.4.19.v20190610.jar │ │ ├── jetty-alpn-java-client-9.4.19.v20190610.jar │ │ ├── jetty-client-9.4.19.v20190610.jar │ │ ├── jetty-http-9.4.19.v20190610.jar │ │ ├── jetty-io-9.4.19.v20190610.jar │ │ ├── jetty-util-9.4.19.v20190610.jar │ │ ├── jna-4.5.1.jar │ │ ├── joda-time-2.10.1.jar │ │ ├── jopt-simple-5.0.2.jar │ │ ├── jsoup-1.8.3.jar │ │ ├── jsr305-2.0.1.jar │ │ ├── lang-mustache-client-7.0.0.jar │ │ ├── 
log4j-api-2.12.1.jar │ │ ├── log4j-core-2.12.1.jar │ │ ├── lucene-analyzers-common-8.0.0.jar │ │ ├── lucene-backward-codecs-8.0.0.jar │ │ ├── lucene-core-8.3.0.jar │ │ ├── lucene-grouping-8.0.0.jar │ │ ├── lucene-highlighter-8.0.0.jar │ │ ├── lucene-join-8.0.0.jar │ │ ├── lucene-memory-8.0.0.jar │ │ ├── lucene-misc-8.0.0.jar │ │ ├── lucene-queries-8.0.0.jar │ │ ├── lucene-queryparser-8.0.0.jar │ │ ├── lucene-sandbox-8.0.0.jar │ │ ├── lucene-spatial-8.0.0.jar │ │ ├── lucene-spatial-extras-8.0.0.jar │ │ ├── lucene-spatial3d-8.0.0.jar │ │ ├── lucene-suggest-8.0.0.jar │ │ ├── mockito-all-1.10.19.jar │ │ ├── netty-buffer-4.1.29.Final.jar │ │ ├── netty-codec-4.1.29.Final.jar │ │ ├── netty-common-4.1.29.Final.jar │ │ ├── netty-handler-4.1.29.Final.jar │ │ ├── netty-resolver-4.1.29.Final.jar │ │ ├── netty-transport-4.1.29.Final.jar │ │ ├── netty-transport-native-epoll-4.1.29.Final.jar │ │ ├── netty-transport-native-unix-common-4.1.29.Final.jar │ │ ├── parent-join-client-7.0.0.jar │ │ ├── rank-eval-client-7.0.0.jar │ │ ├── sesame-model-4.1.2.jar │ │ ├── sesame-rio-api-4.1.2.jar │ │ ├── sesame-rio-datatypes-4.1.2.jar │ │ ├── sesame-rio-languages-4.1.2.jar │ │ ├── sesame-rio-ntriples-4.1.2.jar │ │ ├── sesame-util-4.1.2.jar │ │ ├── slf4j-api-1.7.24.jar │ │ ├── slf4j-simple-1.7.29.jar │ │ ├── snakeyaml-1.24.jar │ │ ├── solr-solrj-8.3.0.jar │ │ ├── stax2-api-3.1.4.jar │ │ ├── t-digest-3.2.jar │ │ ├── trec-car-tools-java-13.jar │ │ ├── twitter-text-2.0.10.jar │ │ ├── wdtk-datamodel-0.10.0.jar │ │ ├── wdtk-dumpfiles-0.10.0.jar │ │ ├── wdtk-storage-0.10.0.jar │ │ ├── wdtk-util-0.10.0.jar │ │ ├── wikiclean-1.1.jar │ │ ├── woodstox-core-asl-4.4.1.jar │ │ ├── xz-1.5.jar │ │ ├── zookeeper-3.5.5.jar │ │ └── zookeeper-jute-3.5.5.jar │ └── retrieve.sh ├── contrastqg │ ├── __init__.py │ ├── dataloaders │ │ ├── __init__.py │ │ ├── generate_loader.py │ │ ├── loader_utils.py │ │ └── t5_utils.py │ └── transformers │ │ ├── __init__.py │ │ ├── activations.py │ │ ├── benchmark │ │ ├── __init__.py 
│ │ ├── benchmark.py │ │ ├── benchmark_args.py │ │ ├── benchmark_args_utils.py │ │ └── benchmark_utils.py │ │ ├── commands │ │ ├── __init__.py │ │ ├── convert.py │ │ ├── download.py │ │ ├── env.py │ │ ├── run.py │ │ ├── serving.py │ │ ├── train.py │ │ ├── transformers_cli.py │ │ └── user.py │ │ ├── configuration_albert.py │ │ ├── configuration_auto.py │ │ ├── configuration_bart.py │ │ ├── configuration_bert.py │ │ ├── configuration_camembert.py │ │ ├── configuration_ctrl.py │ │ ├── configuration_distilbert.py │ │ ├── configuration_electra.py │ │ ├── configuration_encoder_decoder.py │ │ ├── configuration_flaubert.py │ │ ├── configuration_gpt2.py │ │ ├── configuration_longformer.py │ │ ├── configuration_marian.py │ │ ├── configuration_mmbt.py │ │ ├── configuration_openai.py │ │ ├── configuration_reformer.py │ │ ├── configuration_roberta.py │ │ ├── configuration_t5.py │ │ ├── configuration_transfo_xl.py │ │ ├── configuration_utils.py │ │ ├── configuration_xlm.py │ │ ├── configuration_xlm_roberta.py │ │ ├── configuration_xlnet.py │ │ ├── convert_albert_original_tf_checkpoint_to_pytorch.py │ │ ├── convert_bart_original_pytorch_checkpoint_to_pytorch.py │ │ ├── convert_bert_original_tf_checkpoint_to_pytorch.py │ │ ├── convert_bert_pytorch_checkpoint_to_original_tf.py │ │ ├── convert_dialogpt_original_pytorch_checkpoint_to_pytorch.py │ │ ├── convert_electra_original_tf_checkpoint_to_pytorch.py │ │ ├── convert_gpt2_original_tf_checkpoint_to_pytorch.py │ │ ├── convert_graph_to_onnx.py │ │ ├── convert_longformer_original_pytorch_lightning_to_pytorch.py │ │ ├── convert_marian_to_pytorch.py │ │ ├── convert_openai_original_tf_checkpoint_to_pytorch.py │ │ ├── convert_pytorch_checkpoint_to_tf2.py │ │ ├── convert_reformer_trax_checkpoint_to_pytorch.py │ │ ├── convert_roberta_original_pytorch_checkpoint_to_pytorch.py │ │ ├── convert_t5_original_tf_checkpoint_to_pytorch.py │ │ ├── convert_transfo_xl_original_tf_checkpoint_to_pytorch.py │ │ ├── 
convert_xlm_original_pytorch_checkpoint_to_pytorch.py │ │ ├── convert_xlnet_original_tf_checkpoint_to_pytorch.py │ │ ├── data │ │ ├── __init__.py │ │ ├── data_collator.py │ │ ├── datasets │ │ │ ├── __init__.py │ │ │ ├── glue.py │ │ │ └── language_modeling.py │ │ ├── metrics │ │ │ ├── __init__.py │ │ │ └── squad_metrics.py │ │ └── processors │ │ │ ├── __init__.py │ │ │ ├── glue.py │ │ │ ├── squad.py │ │ │ ├── utils.py │ │ │ └── xnli.py │ │ ├── file_utils.py │ │ ├── hf_api.py │ │ ├── hf_argparser.py │ │ ├── modelcard.py │ │ ├── modeling_albert.py │ │ ├── modeling_auto.py │ │ ├── modeling_bart.py │ │ ├── modeling_bert.py │ │ ├── modeling_camembert.py │ │ ├── modeling_ctrl.py │ │ ├── modeling_distilbert.py │ │ ├── modeling_electra.py │ │ ├── modeling_encoder_decoder.py │ │ ├── modeling_flaubert.py │ │ ├── modeling_gpt2.py │ │ ├── modeling_longformer.py │ │ ├── modeling_marian.py │ │ ├── modeling_mmbt.py │ │ ├── modeling_openai.py │ │ ├── modeling_reformer.py │ │ ├── modeling_roberta.py │ │ ├── modeling_t5.py │ │ ├── modeling_tf_albert.py │ │ ├── modeling_tf_auto.py │ │ ├── modeling_tf_bert.py │ │ ├── modeling_tf_camembert.py │ │ ├── modeling_tf_ctrl.py │ │ ├── modeling_tf_distilbert.py │ │ ├── modeling_tf_electra.py │ │ ├── modeling_tf_flaubert.py │ │ ├── modeling_tf_gpt2.py │ │ ├── modeling_tf_openai.py │ │ ├── modeling_tf_pytorch_utils.py │ │ ├── modeling_tf_roberta.py │ │ ├── modeling_tf_t5.py │ │ ├── modeling_tf_transfo_xl.py │ │ ├── modeling_tf_transfo_xl_utilities.py │ │ ├── modeling_tf_utils.py │ │ ├── modeling_tf_xlm.py │ │ ├── modeling_tf_xlm_roberta.py │ │ ├── modeling_tf_xlnet.py │ │ ├── modeling_transfo_xl.py │ │ ├── modeling_transfo_xl_utilities.py │ │ ├── modeling_utils.py │ │ ├── modeling_xlm.py │ │ ├── modeling_xlm_roberta.py │ │ ├── modeling_xlnet.py │ │ ├── optimization.py │ │ ├── optimization_tf.py │ │ ├── pipelines.py │ │ ├── tokenization_albert.py │ │ ├── tokenization_auto.py │ │ ├── tokenization_bart.py │ │ ├── tokenization_bert.py │ │ ├── 
tokenization_bert_japanese.py │ │ ├── tokenization_camembert.py │ │ ├── tokenization_ctrl.py │ │ ├── tokenization_distilbert.py │ │ ├── tokenization_electra.py │ │ ├── tokenization_flaubert.py │ │ ├── tokenization_gpt2.py │ │ ├── tokenization_longformer.py │ │ ├── tokenization_marian.py │ │ ├── tokenization_openai.py │ │ ├── tokenization_reformer.py │ │ ├── tokenization_roberta.py │ │ ├── tokenization_t5.py │ │ ├── tokenization_transfo_xl.py │ │ ├── tokenization_utils.py │ │ ├── tokenization_xlm.py │ │ ├── tokenization_xlm_roberta.py │ │ ├── tokenization_xlnet.py │ │ ├── trainer.py │ │ ├── trainer_tf.py │ │ ├── trainer_utils.py │ │ ├── training_args.py │ │ └── training_args_tf.py ├── preprocess │ ├── prepro_dataset.sh │ ├── sample_contrast_pairs.sh │ └── utils │ │ ├── prepro_dataset.py │ │ └── sample_contrast_pairs.py ├── run_shell │ ├── cqg_inference.sh │ ├── qg_inference.sh │ └── train_nlg.sh └── scripts │ ├── config.py │ ├── inference.py │ ├── model.py │ ├── train.py │ └── utils.py ├── Dockerfile ├── LICENSE ├── LeToR ├── RankLib-2.1-patched.jar └── gen_trec.py ├── OpenMatch ├── __init__.py ├── data │ ├── __init__.py │ ├── dataloader.py │ ├── datasets │ │ ├── __init__.py │ │ ├── bert_dataset.py │ │ ├── bertmaxp_dataset.py │ │ ├── bertmlm_dataset.py │ │ ├── dataset.py │ │ ├── edrm_dataset.py │ │ ├── meta_bert_dataset.py │ │ └── roberta_dataset.py │ └── tokenizers │ │ ├── __init__.py │ │ ├── tokenizer.py │ │ └── word_tokenizer.py ├── extractors │ ├── __init__.py │ └── classic_extractor.py ├── metrics │ ├── __init__.py │ └── metric.py ├── models │ ├── __init__.py │ ├── bert.py │ ├── bert_maxp.py │ ├── conv_knrm.py │ ├── edrm.py │ ├── knrm.py │ └── tk.py ├── modules │ ├── __init__.py │ ├── attentions │ │ ├── __init__.py │ │ ├── multi_head_attention.py │ │ └── scaled_dot_product_attention.py │ ├── embedders │ │ ├── __init__.py │ │ └── embedder.py │ ├── encoders │ │ ├── __init__.py │ │ ├── cnn_encoder.py │ │ ├── feed_forward_encoder.py │ │ ├── positional_encoder.py │ 
│ └── transformer_encoder.py │ └── matchers │ │ ├── __init__.py │ │ └── kernel_matcher.py └── utils.py ├── README.md ├── checkpoints └── README.md ├── coor_ascent.sh ├── data ├── dev_toy.jsonl ├── docs_toy.jsonl ├── filter.py ├── preprocess.py ├── qrels_toy ├── queries_toy.jsonl ├── test_toy.jsonl ├── toy.trec ├── train_clas_toy.jsonl └── train_rank_toy.jsonl ├── docs ├── contrastive-supervision-synthesis.md ├── distributed training.md ├── experiments-adhoc.md ├── experiments-classic.md ├── experiments-msmarco-doc.md ├── experiments-msmarco.md ├── experiments-treccovid.md ├── meta-learning-to-rank.md └── openmatch.md ├── features └── README.md ├── gen_feature.py ├── gen_feature.sh ├── gen_feature_bert.sh ├── inference.py ├── inference.sh ├── inference_bert.sh ├── magic_module.py ├── meta_dist_train.py ├── meta_dist_train.sh ├── requirements.txt ├── results └── README.md ├── retrievers ├── ANCE │ ├── CODE_OF_CONDUCT.md │ ├── LICENSE │ ├── README.md │ ├── SECURITY.md │ ├── commands │ │ ├── data_download.sh │ │ ├── run_ann_data_gen.sh │ │ ├── run_ann_data_gen_dpr.sh │ │ ├── run_ann_data_gen_lyz.sh │ │ ├── run_inference.sh │ │ ├── run_train.sh │ │ ├── run_train_dpr.sh │ │ ├── run_train_lyz.sh │ │ └── run_train_warmup.sh │ ├── data │ │ ├── DPR_data.py │ │ ├── msmarco_data.py │ │ └── process_fn.py │ ├── drivers │ │ ├── run_ann.py │ │ ├── run_ann_data_gen.py │ │ ├── run_ann_data_gen_dpr.py │ │ ├── run_ann_dpr.py │ │ └── run_warmup.py │ ├── evaluation │ │ ├── Calculate Metrics.ipynb │ │ ├── Calculate_Metrics.py │ │ └── convert_trec.py │ ├── model │ │ └── models.py │ ├── setup.py │ └── utils │ │ ├── dpr_utils.py │ │ ├── eval_mrr.py │ │ ├── lamb.py │ │ ├── msmarco_eval.py │ │ └── util.py ├── DANCE │ ├── ANCE_setup.py │ ├── README.md │ ├── commands │ │ ├── data_download.sh │ │ ├── run_ann_data_gen.sh │ │ ├── run_ann_data_gen_dpr.sh │ │ ├── run_inference.sh │ │ ├── run_train.sh │ │ ├── run_train_dpr.sh │ │ └── run_train_warmup.sh │ ├── data │ │ ├── DPR_data.py │ │ ├── 
custom_data.py │ │ ├── msmarco_data.py │ │ ├── process_fn.py │ │ └── validation_split.py │ ├── drivers │ │ ├── run_ann.py │ │ ├── run_ann_data_gen.py │ │ ├── run_ann_data_gen_dpr.py │ │ ├── run_ann_data_inference_eval.py │ │ ├── run_ann_dpr.py │ │ ├── run_ann_emb_inference.py │ │ └── run_warmup.py │ ├── evaluation │ │ ├── Calculate Metrics.ipynb │ │ ├── Calculate_Metrics.py │ │ └── retrieval.py │ ├── model │ │ └── models.py │ ├── requirements.txt │ └── utils │ │ ├── dpr_utils.py │ │ ├── eval_mrr.py │ │ ├── indexing_utils.py │ │ ├── lamb.py │ │ ├── metric.py │ │ ├── msmarco_eval.py │ │ ├── trec_convert.py │ │ └── util.py ├── README.md ├── bm25_retriever │ ├── bin │ │ ├── ApproximateNearestNeighborEval │ │ ├── ApproximateNearestNeighborEval.bat │ │ ├── ApproximateNearestNeighborSearch │ │ ├── ApproximateNearestNeighborSearch.bat │ │ ├── DumpAnalyzedQueries │ │ ├── DumpAnalyzedQueries.bat │ │ ├── ExtractAverageDocumentLength │ │ ├── ExtractAverageDocumentLength.bat │ │ ├── ExtractDocumentLengths │ │ ├── ExtractDocumentLengths.bat │ │ ├── ExtractNorms │ │ ├── ExtractNorms.bat │ │ ├── FeatureExtractorCli │ │ ├── FeatureExtractorCli.bat │ │ ├── IndexCollection │ │ ├── IndexCollection.bat │ │ ├── IndexUtils │ │ ├── IndexUtils.bat │ │ ├── IndexVectors │ │ ├── IndexVectors.bat │ │ ├── SearchCollection │ │ ├── SearchCollection.bat │ │ ├── SearchElastic │ │ ├── SearchElastic.bat │ │ ├── SearchMsmarco │ │ ├── SearchMsmarco.bat │ │ ├── SearchSolr │ │ └── SearchSolr.bat │ └── repo │ │ ├── HdrHistogram-2.1.9.jar │ │ ├── aggs-matrix-stats-client-7.0.0.jar │ │ ├── annotations-java5-19.0.0.jar │ │ ├── anserini-0.7.3-SNAPSHOT.jar │ │ ├── anserini-fastutil-6.5.6.jar │ │ ├── ant-1.9.1.jar │ │ ├── ant-launcher-1.9.1.jar │ │ ├── args4j-2.32.jar │ │ ├── cbor-0.7.jar │ │ ├── commons-codec-1.11.jar │ │ ├── commons-compress-1.18.jar │ │ ├── commons-io-2.5.jar │ │ ├── commons-lang3-3.5.jar │ │ ├── commons-logging-1.1.3.jar │ │ ├── commons-math3-3.6.1.jar │ │ ├── commons-pool2-2.6.0.jar │ │ 
├── compiler-0.9.3.jar │ │ ├── elasticsearch-7.0.0.jar │ │ ├── elasticsearch-cli-7.0.0.jar │ │ ├── elasticsearch-core-7.0.0.jar │ │ ├── elasticsearch-geo-7.0.0.jar │ │ ├── elasticsearch-rest-client-7.0.0.jar │ │ ├── elasticsearch-rest-high-level-client-7.0.0.jar │ │ ├── elasticsearch-secure-sm-7.0.0.jar │ │ ├── elasticsearch-x-content-7.0.0.jar │ │ ├── guava-18.0.jar │ │ ├── hppc-0.7.1.jar │ │ ├── http2-client-9.4.19.v20190610.jar │ │ ├── http2-common-9.4.19.v20190610.jar │ │ ├── http2-hpack-9.4.19.v20190610.jar │ │ ├── http2-http-client-transport-9.4.19.v20190610.jar │ │ ├── httpasyncclient-4.1.4.jar │ │ ├── httpclient-4.5.6.jar │ │ ├── httpcore-4.4.10.jar │ │ ├── httpcore-nio-4.4.11.jar │ │ ├── httpmime-4.5.6.jar │ │ ├── jackson-annotations-2.10.0.pr1.jar │ │ ├── jackson-core-2.10.0.pr1.jar │ │ ├── jackson-databind-2.10.0.pr1.jar │ │ ├── jackson-dataformat-cbor-2.8.11.jar │ │ ├── jackson-dataformat-smile-2.8.11.jar │ │ ├── jackson-dataformat-yaml-2.10.0.pr1.jar │ │ ├── jackson-datatype-jdk8-2.10.0.pr1.jar │ │ ├── jcl-over-slf4j-1.7.24.jar │ │ ├── jetty-alpn-client-9.4.19.v20190610.jar │ │ ├── jetty-alpn-java-client-9.4.19.v20190610.jar │ │ ├── jetty-client-9.4.19.v20190610.jar │ │ ├── jetty-http-9.4.19.v20190610.jar │ │ ├── jetty-io-9.4.19.v20190610.jar │ │ ├── jetty-util-9.4.19.v20190610.jar │ │ ├── jna-4.5.1.jar │ │ ├── joda-time-2.10.1.jar │ │ ├── jopt-simple-5.0.2.jar │ │ ├── jsoup-1.8.3.jar │ │ ├── jsr305-2.0.1.jar │ │ ├── lang-mustache-client-7.0.0.jar │ │ ├── log4j-api-2.12.1.jar │ │ ├── log4j-core-2.12.1.jar │ │ ├── lucene-analyzers-common-8.0.0.jar │ │ ├── lucene-backward-codecs-8.0.0.jar │ │ ├── lucene-core-8.3.0.jar │ │ ├── lucene-grouping-8.0.0.jar │ │ ├── lucene-highlighter-8.0.0.jar │ │ ├── lucene-join-8.0.0.jar │ │ ├── lucene-memory-8.0.0.jar │ │ ├── lucene-misc-8.0.0.jar │ │ ├── lucene-queries-8.0.0.jar │ │ ├── lucene-queryparser-8.0.0.jar │ │ ├── lucene-sandbox-8.0.0.jar │ │ ├── lucene-spatial-8.0.0.jar │ │ ├── lucene-spatial-extras-8.0.0.jar │ │ 
├── lucene-spatial3d-8.0.0.jar │ │ ├── lucene-suggest-8.0.0.jar │ │ ├── mockito-all-1.10.19.jar │ │ ├── netty-buffer-4.1.29.Final.jar │ │ ├── netty-codec-4.1.29.Final.jar │ │ ├── netty-common-4.1.29.Final.jar │ │ ├── netty-handler-4.1.29.Final.jar │ │ ├── netty-resolver-4.1.29.Final.jar │ │ ├── netty-transport-4.1.29.Final.jar │ │ ├── netty-transport-native-epoll-4.1.29.Final.jar │ │ ├── netty-transport-native-unix-common-4.1.29.Final.jar │ │ ├── parent-join-client-7.0.0.jar │ │ ├── rank-eval-client-7.0.0.jar │ │ ├── sesame-model-4.1.2.jar │ │ ├── sesame-rio-api-4.1.2.jar │ │ ├── sesame-rio-datatypes-4.1.2.jar │ │ ├── sesame-rio-languages-4.1.2.jar │ │ ├── sesame-rio-ntriples-4.1.2.jar │ │ ├── sesame-util-4.1.2.jar │ │ ├── slf4j-api-1.7.24.jar │ │ ├── slf4j-simple-1.7.29.jar │ │ ├── snakeyaml-1.24.jar │ │ ├── solr-solrj-8.3.0.jar │ │ ├── stax2-api-3.1.4.jar │ │ ├── t-digest-3.2.jar │ │ ├── trec-car-tools-java-13.jar │ │ ├── twitter-text-2.0.10.jar │ │ ├── wdtk-datamodel-0.10.0.jar │ │ ├── wdtk-dumpfiles-0.10.0.jar │ │ ├── wdtk-storage-0.10.0.jar │ │ ├── wdtk-util-0.10.0.jar │ │ ├── wikiclean-1.1.jar │ │ ├── woodstox-core-asl-4.4.1.jar │ │ ├── xz-1.5.jar │ │ ├── zookeeper-3.5.5.jar │ │ └── zookeeper-jute-3.5.5.jar ├── openmatch_ance_retriver_readme.md └── venv_ANCE.requirements ├── setup.py ├── train.py ├── train.sh ├── train_bert.sh ├── train_bert_dist.sh ├── train_bertmlm.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller 
builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | 91 | .DS_Store 92 | */.DS_Store 93 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 THUNLP 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | **This repo is deprecated. Please check out our new repo at https://github.com/OpenMatch/OpenMatch.** 2 | 3 | # OpenMatch v2 4 | 5 | An all-in-one toolkit for information retrieval. Under active development. 6 | 7 | ## Install 8 | 9 | ```bash 10 | git clone https://github.com/thunlp/OpenMatch.git 11 | cd OpenMatch 12 | pip install -e . 13 | ``` 14 | 15 | `-e` means **editable**, i.e. you can change the code directly in your directory. 16 | 17 | We do not include all the requirements in the package. You may need to manually install `torch`, `tensorboard`. 18 | 19 | You may also need faiss for dense retrieval. You can install either `faiss-cpu` or `faiss-gpu`, according to your environment. Note that if you want to perform search on GPUs, you need to install the version of `faiss-gpu` compatible with your CUDA. In some cases (usually CUDA >= 11.0) `pip` installs the wrong version. If you encounter errors during search on GPUs, you may try installing it from `conda`. 20 | 21 | ## Features 22 | 23 | - Human-friendly interface for dense retriever and re-ranker training and testing 24 | - Various PLMs supported (BERT, RoBERTa, T5...) 25 | - Native support for common IR & QA Datasets (MS MARCO, NQ, KILT, BEIR, ...)
26 | - Deep integration with Hugging Face Transformers and Datasets 27 | - Efficient training and inference via stream-style data loading 28 | 29 | ## Docs 30 | 31 | See the `docs` folder. 32 | 33 | ## Project Organizers 34 | 35 | - Zhiyuan Liu 36 | * Tsinghua University 37 | * [Homepage](http://nlp.csai.tsinghua.edu.cn/~lzy/) 38 | - Zhenghao Liu 39 | * Northeastern University 40 | * [Homepage](https://edwardzh.github.io/) 41 | - Chenyan Xiong 42 | * Microsoft Research AI 43 | * [Homepage](https://www.microsoft.com/en-us/research/people/cxiong/) 44 | - Maosong Sun 45 | * Tsinghua University 46 | * [Homepage](http://nlp.csai.tsinghua.edu.cn/staff/sms/) 47 | 48 | ## Acknowledgments 49 | 50 | Our implementation uses [Tevatron](https://github.com/texttron/tevatron) as the starting point. We thank its authors for their contributions. 51 | 52 | ## Contact 53 | 54 | Please email yushi17@foxmail.com. 55 | -------------------------------------------------------------------------------- /docs/scale-t5-weights.md: -------------------------------------------------------------------------------- 1 | # T5 Weights Scaling 2 | 3 | For stable mixed-precision training on NVIDIA GPUs, it's recommended to scale the weights of the pre-trained T5 model. 4 | 5 | First you need to manually download the T5 model. Search for your model on Hugging Face, and switch to the "Files and versions" tab. Right-click the download arrows, copy the download links of `config.json`, `pytorch_model.bin`, `spiece.model`, and `tokenizer.json`, and download them into your directory. 6 | 7 | Run the following command to scale the weights: 8 | 9 | ```bash 10 | python scripts/scale_t5_weights.py --input_model_path /path/to/t5-base --output_model_path /path/to/t5-base-scaled --num_layers 12 11 | ``` 12 | 13 | For larger T5 models, change `--num_layers` to the corresponding number of model layers.
-------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=42"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /scripts/kilt-dpr/convert_to_evaluation.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from tqdm import tqdm 4 | 5 | 6 | if __name__ == "__main__": 7 | parser = argparse.ArgumentParser() 8 | parser.add_argument("--kilt_queries_file", type=str) 9 | parser.add_argument("--provenance_file", type=str) 10 | parser.add_argument("--output_evaluation_file", type=str) 11 | args = parser.parse_args() 12 | 13 | raw_data = [] 14 | with open(args.kilt_queries_file, "r") as f: 15 | for line in f: 16 | raw_data.append(json.loads(line)) 17 | 18 | with open(args.provenance_file, "r") as f: 19 | provenance = json.load(f) 20 | 21 | # consider only valid data - filter out invalid 22 | validated_data = {} 23 | query_data = [] 24 | for element in raw_data: 25 | #if utils.validate_datapoint(element, logger=None): 26 | if element["id"] in validated_data: 27 | raise ValueError("ids are not unique in input data!") 28 | validated_data[element["id"]] = element 29 | query_data.append( 30 | {"query": element["input"], "id": element["id"]} 31 | ) 32 | 33 | if len(provenance) != len(query_data): 34 | print("WARNING: provenance and query data are not of the same length!") 35 | 36 | # write prediction files 37 | if provenance: 38 | print("writing prediction file to {}".format(args.output_evaluation_file)) 39 | 40 | predictions = [] 41 | for query_id in provenance.keys(): 42 | element = validated_data[query_id] 43 | new_output = [{"provenance": provenance[query_id]}] 44 | # append the answers 45 | if "output" in element: 46 | for o in element["output"]: 47 | if 
"answer" in o: 48 | new_output.append({"answer": o["answer"]}) 49 | element["output"] = new_output 50 | predictions.append(element) 51 | 52 | with open(args.output_evaluation_file, "w") as outfile: 53 | for p in predictions: 54 | json.dump(p, outfile) 55 | outfile.write("\n") -------------------------------------------------------------------------------- /scripts/kilt-dpr/convert_trec_to_provenance.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import csv 4 | from tqdm import tqdm 5 | 6 | 7 | if __name__ == "__main__": 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("--trec_file", type=str) 10 | parser.add_argument("--kilt_queries_file", type=str, default=None) 11 | parser.add_argument("--passage_collection", type=str) 12 | parser.add_argument("--output_provenance_file", type=str) 13 | args = parser.parse_args() 14 | 15 | queries = [] 16 | if args.kilt_queries_file is not None: 17 | with open(args.kilt_queries_file, "r") as f: 18 | for line in f: 19 | obj = json.loads(line) 20 | queries.append(obj) 21 | 22 | pid2content = [] 23 | with open(args.passage_collection, "r") as f: 24 | reader = csv.reader(f, delimiter="\t") 25 | next(reader) 26 | i = 0 27 | for row in tqdm(reader): 28 | pid, text, wikipedia_title, wikipedia_id, _, _ = row 29 | pid = int(pid) 30 | assert pid == i 31 | pid2content.append({"text": text, "wikipedia_title": wikipedia_title, "wikipedia_id": wikipedia_id}) 32 | i += 1 33 | 34 | provenance = {} 35 | with open(args.trec_file, "r") as f: 36 | last_qid = 0 37 | for line in f: 38 | qid, _, pid, rank, score, _ = line.strip().split() 39 | qid = int(qid) 40 | pid = int(pid) 41 | rank = int(rank) 42 | real_qid = queries[qid - 1]["id"] if len(queries) > 0 else str(qid) 43 | if qid != last_qid: # new query 44 | provenance[real_qid] = [] 45 | last_qid = qid 46 | provenance[real_qid].append({"score": score, "text": pid2content[pid]["text"], "wikipedia_title": 
pid2content[pid]["wikipedia_title"], "wikipedia_id": pid2content[pid]["wikipedia_id"]}) 47 | 48 | with open(args.output_provenance_file, "w") as f: 49 | json.dump(provenance, f, indent=4) -------------------------------------------------------------------------------- /scripts/nq-dpr/build_train.py: -------------------------------------------------------------------------------- 1 | # Adapted from Tevatron (https://github.com/texttron/tevatron) 2 | 3 | import json 4 | import os 5 | from argparse import ArgumentParser 6 | 7 | from transformers import AutoTokenizer, PreTrainedTokenizer 8 | from tqdm import tqdm 9 | from openmatch.utils import fill_template 10 | 11 | parser = ArgumentParser() 12 | parser.add_argument('--input', type=str, required=True) 13 | parser.add_argument('--output', type=str, required=True) 14 | parser.add_argument('--query_template', type=str) 15 | parser.add_argument('--doc_template', type=str) 16 | parser.add_argument('--tokenizer', type=str, required=False, default='bert-base-uncased') 17 | parser.add_argument('--minimum-negatives', type=int, required=False, default=1) 18 | args = parser.parse_args() 19 | 20 | tokenizer: PreTrainedTokenizer = AutoTokenizer.from_pretrained(args.tokenizer, use_fast=True) 21 | 22 | data = json.load(open(args.input)) 23 | 24 | save_dir = os.path.split(args.output)[0] 25 | if not os.path.exists(save_dir): 26 | os.makedirs(save_dir) 27 | 28 | with open(args.output, 'w') as f: 29 | for idx, item in enumerate(tqdm(data)): 30 | if len(item['hard_negative_ctxs']) < args.minimum_negatives or len(item['positive_ctxs']) < 1: 31 | continue 32 | 33 | group = {} 34 | positives = [] 35 | for pos in item['positive_ctxs']: 36 | positives.append(fill_template(args.doc_template, pos)) 37 | negatives = [] 38 | for neg in item['hard_negative_ctxs']: 39 | negatives.append(fill_template(args.doc_template, neg)) 40 | 41 | query = tokenizer.encode(fill_template(args.query_template, item), add_special_tokens=False, max_length=32, 
truncation=True) 42 | positives = tokenizer( 43 | positives, add_special_tokens=False, max_length=128, truncation=True, padding=False)['input_ids'] 44 | negatives = tokenizer( 45 | negatives, add_special_tokens=False, max_length=128, truncation=True, padding=False)['input_ids'] 46 | 47 | group['query'] = query 48 | group['positives'] = positives 49 | group['negatives'] = negatives 50 | 51 | f.write(json.dumps(group) + '\n') 52 | -------------------------------------------------------------------------------- /scripts/scale_t5_weights.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from transformers import AutoModel 3 | import copy 4 | import argparse 5 | import os 6 | import shutil 7 | import glob 8 | 9 | 10 | if __name__ == "__main__": 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("--input_model_path", type=str) 13 | parser.add_argument("--output_model_path", type=str) 14 | parser.add_argument("--num_layers", type=int, default=12) 15 | args = parser.parse_args() 16 | 17 | # scale model weights 18 | original_model = AutoModel.from_pretrained(args.input_model_path) 19 | state_dict = original_model.state_dict() 20 | 21 | keys = state_dict.keys() 22 | new_state_dict = copy.deepcopy(state_dict) 23 | 24 | for i in range(args.num_layers): 25 | new_state_dict[f'encoder.block.{i}.layer.0.SelfAttention.o.weight'] /= 100 26 | new_state_dict[f'encoder.block.{i}.layer.1.DenseReluDense.wi.weight'] /= 10 27 | new_state_dict[f'encoder.block.{i}.layer.1.DenseReluDense.wo.weight'] /= 10 28 | 29 | new_state_dict[f'decoder.block.{i}.layer.1.EncDecAttention.o.weight'] /= 100 30 | new_state_dict[f'decoder.block.{i}.layer.0.SelfAttention.o.weight'] /= 100 31 | new_state_dict[f'decoder.block.{i}.layer.2.DenseReluDense.wi.weight'] /= 10 32 | new_state_dict[f'decoder.block.{i}.layer.2.DenseReluDense.wo.weight'] /= 10 33 | new_state_dict['shared.weight'] /= 100 34 | 35 | os.makedirs(args.output_model_path, 
exist_ok=True) 36 | torch.save(new_state_dict, os.path.join(args.output_model_path, "pytorch_model.bin")) 37 | 38 | # copy other files 39 | files = glob.glob(os.path.join(args.input_model_path, "*")) 40 | for file in files: 41 | if file != os.path.join(args.input_model_path, "pytorch_model.bin"): 42 | shutil.copy(file, args.output_model_path) -------------------------------------------------------------------------------- /scripts/split_embeddings.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import argparse 3 | import pickle 4 | from tqdm import trange 5 | 6 | 7 | if __name__ == "__main__": 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument("--input_embedding", type=str) 10 | parser.add_argument("--output_embeddings", type=str) 11 | parser.add_argument("--num_splits", type=int, default=2) 12 | args = parser.parse_args() 13 | 14 | with open(args.input_embedding, "rb") as f: 15 | embedding, lookup = pickle.load(f) 16 | lookup = np.array(lookup) 17 | 18 | for split in trange(args.num_splits): 19 | embedding_split = embedding[split::args.num_splits] 20 | lookup_split = lookup[split::args.num_splits] 21 | with open(args.output_embeddings + f".{split}", "wb") as f: 22 | pickle.dump((embedding_split, lookup_split.tolist()), f, protocol=4) -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="openmatch-thunlp", 8 | version="0.0.1", 9 | author="Shi Yu", 10 | author_email="yushi17@foxmail.com", 11 | description="An python package for research on Information Retrieval", 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | classifiers=[ 15 | "Programming Language :: Python :: 3", 16 | 
"License :: OSI Approved :: MIT License", 17 | "Topic :: Text Processing :: Indexing", 18 | "Intended Audience :: Information Technology" 19 | ], 20 | package_dir={"": "src"}, 21 | packages=setuptools.find_packages(where="src"), 22 | python_requires=">=3.7", 23 | install_requires=[ 24 | "transformers>=4.10.0", 25 | "sentencepiece", 26 | "datasets>=1.1.3" 27 | ] 28 | ) 29 | -------------------------------------------------------------------------------- /src/openmatch/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/src/openmatch/__init__.py -------------------------------------------------------------------------------- /src/openmatch/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | from .beir_dataset import BEIRQueryDataset, BEIRCorpusDataset, BEIRDataset 2 | from .data_collator import DRInferenceCollator, QPCollator, PairCollator, RRInferenceCollator 3 | from .inference_dataset import JsonlDataset, TsvDataset, InferenceDataset 4 | from .train_dataset import DRTrainDataset, DREvalDataset, RRTrainDataset, RREvalDataset -------------------------------------------------------------------------------- /src/openmatch/driver/build_index.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | from openmatch.arguments import DataArguments 5 | from openmatch.arguments import InferenceArguments as EncodingArguments 6 | from openmatch.arguments import ModelArguments 7 | from openmatch.dataset import InferenceDataset 8 | from openmatch.modeling import DRModelForInference 9 | from openmatch.retriever import Retriever 10 | from transformers import AutoConfig, AutoTokenizer, HfArgumentParser 11 | 12 | 13 | def main(): 14 | parser = HfArgumentParser((ModelArguments, DataArguments, EncodingArguments)) 15 | if len(sys.argv) == 
2 and sys.argv[1].endswith(".json"): 16 | model_args, data_args, encoding_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 17 | else: 18 | model_args, data_args, encoding_args = parser.parse_args_into_dataclasses() 19 | model_args: ModelArguments 20 | data_args: DataArguments 21 | encoding_args: EncodingArguments 22 | 23 | num_labels = 1 24 | config = AutoConfig.from_pretrained( 25 | model_args.config_name if model_args.config_name else model_args.model_name_or_path, 26 | num_labels=num_labels, 27 | cache_dir=model_args.cache_dir, 28 | ) 29 | tokenizer = AutoTokenizer.from_pretrained( 30 | model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, 31 | cache_dir=model_args.cache_dir, 32 | ) 33 | 34 | model = DRModelForInference.build( 35 | model_args=model_args, 36 | config=config, 37 | cache_dir=model_args.cache_dir, 38 | ) 39 | 40 | corpus_dataset = InferenceDataset.load( 41 | tokenizer=tokenizer, 42 | data_args=data_args, 43 | is_query=False, 44 | stream=True, 45 | batch_size=encoding_args.per_device_eval_batch_size, 46 | num_processes=encoding_args.world_size, 47 | process_index=encoding_args.process_index, 48 | cache_dir=model_args.cache_dir 49 | ) 50 | 51 | Retriever.build_embeddings(model, corpus_dataset, encoding_args) 52 | 53 | 54 | if __name__ == '__main__': 55 | main() 56 | -------------------------------------------------------------------------------- /src/openmatch/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | from .dense_retrieval_model import DRModel, DRModelForInference, DROutput 2 | from .reranking_model import RRModel, RROutput -------------------------------------------------------------------------------- /src/openmatch/modeling/linear.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import json 4 | 5 | import torch 6 | import torch.nn as nn 7 | from torch 
import Tensor 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class LinearHead(nn.Module): 13 | def __init__( 14 | self, 15 | input_dim: int = 768, 16 | output_dim: int = 768, 17 | ): 18 | super(LinearHead, self).__init__() 19 | self.linear = nn.Linear(input_dim, output_dim, bias=False) 20 | self.config = {'input_dim': input_dim, 'output_dim': output_dim} 21 | 22 | def forward(self, rep: Tensor = None): 23 | return self.linear(rep) 24 | 25 | @classmethod 26 | def load(cls, ckpt_dir: str): 27 | logger.info(f'Loading linear head from {ckpt_dir}') 28 | model_path = os.path.join(ckpt_dir, 'linear.pt') 29 | config_path = os.path.join(ckpt_dir, 'head_config.json') 30 | with open(config_path, 'r') as f: 31 | config = json.load(f) 32 | model = cls(**config) 33 | model.load_state_dict(torch.load(model_path)) 34 | return model 35 | 36 | def save(self, save_path): 37 | torch.save(self.state_dict(), os.path.join(save_path, 'linear.pt')) 38 | with open(os.path.join(save_path, 'head_config.json'), 'w') as f: 39 | json.dump(self.config, f, indent=4) -------------------------------------------------------------------------------- /src/openmatch/retriever/__init__.py: -------------------------------------------------------------------------------- 1 | from .dense_retriever import Retriever, SuccessiveRetriever 2 | from .reranker import RRPredictDataset, Reranker -------------------------------------------------------------------------------- /src/openmatch/trainer/__init__.py: -------------------------------------------------------------------------------- 1 | from .dense_trainer import DRTrainer, GCDenseTrainer 2 | from .reranker_trainer import RRTrainer -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/build_index.sh: -------------------------------------------------------------------------------- 1 | export dataset_name= ## you need to set this 2 | export data_path=## you need to set 
this 3 | 4 | ./bin/IndexCollection -collection JsonCollection -input $data_path/corpus -index $dataset_name -generator LuceneDocumentGenerator -threads 8 -storePositions -storeDocvectors -storeRawDocs 5 | -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/HdrHistogram-2.1.9.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/HdrHistogram-2.1.9.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/aggs-matrix-stats-client-7.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/aggs-matrix-stats-client-7.0.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/annotations-java5-19.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/annotations-java5-19.0.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/anserini-0.7.3-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/anserini-0.7.3-SNAPSHOT.jar -------------------------------------------------------------------------------- 
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/anserini-fastutil-6.5.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/anserini-fastutil-6.5.6.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/ant-1.9.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/ant-1.9.1.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/ant-launcher-1.9.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/ant-launcher-1.9.1.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/args4j-2.32.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/args4j-2.32.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/cbor-0.7.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/cbor-0.7.jar 
-------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-codec-1.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-codec-1.11.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-compress-1.18.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-compress-1.18.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-io-2.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-io-2.5.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-lang3-3.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-lang3-3.5.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-logging-1.1.3.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-logging-1.1.3.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-math3-3.6.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-math3-3.6.1.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-pool2-2.6.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/commons-pool2-2.6.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/compiler-0.9.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/compiler-0.9.3.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-7.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-7.0.0.jar -------------------------------------------------------------------------------- 
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-cli-7.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-cli-7.0.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-core-7.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-core-7.0.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-geo-7.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-geo-7.0.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-rest-client-7.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-rest-client-7.0.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-rest-high-level-client-7.0.0.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-rest-high-level-client-7.0.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-secure-sm-7.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-secure-sm-7.0.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-x-content-7.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/elasticsearch-x-content-7.0.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/guava-18.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/guava-18.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/hppc-0.7.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/hppc-0.7.1.jar -------------------------------------------------------------------------------- 
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/http2-client-9.4.19.v20190610.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/http2-client-9.4.19.v20190610.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/http2-common-9.4.19.v20190610.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/http2-common-9.4.19.v20190610.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/http2-hpack-9.4.19.v20190610.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/http2-hpack-9.4.19.v20190610.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/http2-http-client-transport-9.4.19.v20190610.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/http2-http-client-transport-9.4.19.v20190610.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/httpasyncclient-4.1.4.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/httpasyncclient-4.1.4.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/httpclient-4.5.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/httpclient-4.5.6.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/httpcore-4.4.10.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/httpcore-4.4.10.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/httpcore-nio-4.4.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/httpcore-nio-4.4.11.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/httpmime-4.5.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/httpmime-4.5.6.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-annotations-2.10.0.pr1.jar: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-annotations-2.10.0.pr1.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-core-2.10.0.pr1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-core-2.10.0.pr1.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-databind-2.10.0.pr1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-databind-2.10.0.pr1.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-dataformat-cbor-2.8.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-dataformat-cbor-2.8.11.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-dataformat-smile-2.8.11.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-dataformat-smile-2.8.11.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-dataformat-yaml-2.10.0.pr1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-dataformat-yaml-2.10.0.pr1.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-datatype-jdk8-2.10.0.pr1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jackson-datatype-jdk8-2.10.0.pr1.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jcl-over-slf4j-1.7.24.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jcl-over-slf4j-1.7.24.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jetty-alpn-client-9.4.19.v20190610.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jetty-alpn-client-9.4.19.v20190610.jar 
-------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jetty-alpn-java-client-9.4.19.v20190610.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jetty-alpn-java-client-9.4.19.v20190610.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jetty-client-9.4.19.v20190610.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jetty-client-9.4.19.v20190610.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jetty-http-9.4.19.v20190610.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jetty-http-9.4.19.v20190610.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jetty-io-9.4.19.v20190610.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jetty-io-9.4.19.v20190610.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jetty-util-9.4.19.v20190610.jar: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jetty-util-9.4.19.v20190610.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jna-4.5.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jna-4.5.1.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/joda-time-2.10.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/joda-time-2.10.1.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jopt-simple-5.0.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jopt-simple-5.0.2.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jsoup-1.8.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jsoup-1.8.3.jar -------------------------------------------------------------------------------- 
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jsr305-2.0.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/jsr305-2.0.1.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lang-mustache-client-7.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lang-mustache-client-7.0.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/log4j-api-2.12.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/log4j-api-2.12.1.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/log4j-core-2.12.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/log4j-core-2.12.1.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-analyzers-common-8.0.0.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-analyzers-common-8.0.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-backward-codecs-8.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-backward-codecs-8.0.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-core-8.3.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-core-8.3.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-grouping-8.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-grouping-8.0.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-highlighter-8.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-highlighter-8.0.0.jar -------------------------------------------------------------------------------- 
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-join-8.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-join-8.0.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-memory-8.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-memory-8.0.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-misc-8.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-misc-8.0.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-queries-8.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-queries-8.0.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-queryparser-8.0.0.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-queryparser-8.0.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-sandbox-8.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-sandbox-8.0.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-spatial-8.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-spatial-8.0.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-spatial-extras-8.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-spatial-extras-8.0.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-spatial3d-8.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-spatial3d-8.0.0.jar -------------------------------------------------------------------------------- 
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-suggest-8.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/lucene-suggest-8.0.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/mockito-all-1.10.19.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/mockito-all-1.10.19.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-buffer-4.1.29.Final.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-buffer-4.1.29.Final.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-codec-4.1.29.Final.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-codec-4.1.29.Final.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-common-4.1.29.Final.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-common-4.1.29.Final.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-handler-4.1.29.Final.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-handler-4.1.29.Final.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-resolver-4.1.29.Final.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-resolver-4.1.29.Final.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-transport-4.1.29.Final.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-transport-4.1.29.Final.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-transport-native-epoll-4.1.29.Final.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-transport-native-epoll-4.1.29.Final.jar 
-------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-transport-native-unix-common-4.1.29.Final.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/netty-transport-native-unix-common-4.1.29.Final.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/parent-join-client-7.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/parent-join-client-7.0.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/rank-eval-client-7.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/rank-eval-client-7.0.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/sesame-model-4.1.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/sesame-model-4.1.2.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/sesame-rio-api-4.1.2.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/sesame-rio-api-4.1.2.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/sesame-rio-datatypes-4.1.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/sesame-rio-datatypes-4.1.2.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/sesame-rio-languages-4.1.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/sesame-rio-languages-4.1.2.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/sesame-rio-ntriples-4.1.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/sesame-rio-ntriples-4.1.2.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/sesame-util-4.1.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/sesame-util-4.1.2.jar -------------------------------------------------------------------------------- 
/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/slf4j-api-1.7.24.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/slf4j-api-1.7.24.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/slf4j-simple-1.7.29.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/slf4j-simple-1.7.29.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/snakeyaml-1.24.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/snakeyaml-1.24.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/solr-solrj-8.3.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/solr-solrj-8.3.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/stax2-api-3.1.4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/stax2-api-3.1.4.jar 
-------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/t-digest-3.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/t-digest-3.2.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/trec-car-tools-java-13.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/trec-car-tools-java-13.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/twitter-text-2.0.10.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/twitter-text-2.0.10.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/wdtk-datamodel-0.10.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/wdtk-datamodel-0.10.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/wdtk-dumpfiles-0.10.0.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/wdtk-dumpfiles-0.10.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/wdtk-storage-0.10.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/wdtk-storage-0.10.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/wdtk-util-0.10.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/wdtk-util-0.10.0.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/wikiclean-1.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/wikiclean-1.1.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/woodstox-core-asl-4.4.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/woodstox-core-asl-4.4.1.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/xz-1.5.jar: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/xz-1.5.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/zookeeper-3.5.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/zookeeper-3.5.5.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/zookeeper-jute-3.5.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/Contrastive_Supervision_Synthesis/bm25_retriever/repo/zookeeper-jute-3.5.5.jar -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/bm25_retriever/retrieve.sh: -------------------------------------------------------------------------------- 1 | export dataset_name= ## you need to set this 2 | export generator_folder=qg_t5-base ## qg_t5-small ; qg_t5-base 3 | export data_path= ## you need to set this 4 | 5 | ./bin/SearchCollection -index $dataset_name -topicreader TsvString -topics $data_path/qid2query.tsv -bm25 -output $data_path/bm25_retrieval.trec 6 | -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/contrastqg/__init__.py: -------------------------------------------------------------------------------- 1 | from .transformers import ( 2 | MODEL_FOR_QUESTION_ANSWERING_MAPPING, 3 | WEIGHTS_NAME, 4 | AdamW, 5 | AutoConfig, 6 | AutoTokenizer, 7 | 
def select_tokenizer(args):
    """Build the tokenizer bundle for the configured generator.

    :param args: run configuration; only ``args.pretrain_generator_type`` is read here.
    :return: dict with a single ``"gen_tokenizer"`` entry holding ``T5_Tokenizer(args)``.
    :raises ValueError: when the generator type does not contain ``"t5"``.
    """
    # Guard clause first: anything that is not a T5 variant is rejected.
    if "t5" not in args.pretrain_generator_type:
        raise ValueError('Invalid generator class: %s' % args.pretrain_generator_type)
    return {"gen_tokenizer": T5_Tokenizer(args)}


def select_data_loader(args, do_finetune=False):
    """Return the dataset class and batchify function for the configured generator.

    :param args: run configuration; only ``args.pretrain_generator_type`` is read here.
    :param do_finetune: accepted for interface compatibility; not used by this function.
    :return: dict with ``"build_generate_dataset"`` and ``"gen_batchify"`` entries.
    :raises ValueError: when the generator type does not contain ``"t5"``.
    """
    if "t5" not in args.pretrain_generator_type:
        raise ValueError('Invalid generator class: %s' % args.pretrain_generator_type)
    return {
        "build_generate_dataset": generate_dataset,
        "gen_batchify": t5_batchify_for_test,
    }
class generate_dataset(Dataset):
    """Dataset of generation examples paired with the document collection.

    Each item is produced on demand by the converter registered in
    ``generate_feature_converter`` under ``args.generator_mode``.
    """

    def __init__(
        self,
        args,
        data_dir,
        tokenizer,
    ):
        """
        :param args: run configuration; ``generator_mode`` and
            ``pretrain_generator_type`` are read here.
        :param data_dir: directory holding ``docid2doc.jsonl`` plus either
            ``qg_<generator type>/contrast_pairs.jsonl`` (mode ``"contrastqg"``)
            or ``pos_docids.jsonl`` (any other mode).
        :param tokenizer: stored unchanged and handed to the feature converter.
        """
        # Pick the examples file according to the generation mode.
        if args.generator_mode == "contrastqg":
            examples_file = "qg_%s/contrast_pairs.jsonl"%args.pretrain_generator_type
        else:
            examples_file = "pos_docids.jsonl"
        loaded_examples = loader_utils.load_json2list(os.path.join(data_dir, examples_file))
        logger.info('[%s] needs generate %d examples'%(args.generator_mode, len(loaded_examples)))

        # Mapping from each record's "docid" to its "doc" field.
        doc_lookup = loader_utils.load_json2dict(
            os.path.join(data_dir, "docid2doc.jsonl"),
            id_name="docid",
            text_key="doc",
        )

        self.args = args
        # "qid2query" stays None until reset_qid2query() is called.
        self.dataset = {"docid2doc": doc_lookup, "qid2query": None}
        self.tokenizer = tokenizer
        self.examples = loaded_examples

    def __len__(self):
        """Number of examples currently held."""
        return len(self.examples)

    def reset_examples(self, examples):
        """Replace the example list with *examples*."""
        self.examples = examples

    def reset_qid2query(self, qid2query):
        """Install *qid2query* as the ``"qid2query"`` entry of ``self.dataset``."""
        self.dataset["qid2query"] = qid2query

    def __getitem__(self, index):
        """Convert the example at *index* with the converter for the current mode."""
        convert = generate_feature_converter[self.args.generator_mode]
        return convert(
            index,
            ex=self.examples[index],
            dataset=self.dataset,
            tokenizer=self.tokenizer,
        )
def load_corpus(data_dir):
    """Load the document collection stored under *data_dir*.

    :param data_dir: directory containing ``docid2doc.jsonl``.
    :return: dict mapping each record's ``"docid"`` to its ``"doc"`` field.
    """
    logger.info('start load corpus ...')
    corpus = load_json2dict(
        os.path.join(data_dir, "docid2doc.jsonl"),
        id_name="docid",
        text_key="doc",
    )
    # The original returned an undefined name here, which raised NameError.
    return corpus


def load_json2list(file_path):
    """Read a JSON-lines file into a list.

    :param file_path: path of a UTF-8 file with one JSON document per line.
    :return: list of the decoded objects, in file order.
    """
    data_list = []
    with open(file_path, mode='r', encoding='utf-8') as fi:
        # tqdm only adds a progress display while iterating the lines.
        for line in tqdm(fi):
            data_list.append(json.loads(line))
    return data_list


def load_json2dict(file_path, id_name, text_key):
    """Read a JSON-lines file into a ``{record[id_name]: record[text_key]}`` dict.

    :param file_path: path of a UTF-8 file with one JSON object per line.
    :param id_name: field whose value becomes the dictionary key.
    :param text_key: field whose value becomes the dictionary value.
    :return: the assembled dict; a later record with the same id replaces an
        earlier one.
    """
    data_dict = {}
    with open(file_path, mode='r', encoding='utf-8') as fi:
        for line in tqdm(fi):
            data = json.loads(line)
            data_dict[data[id_name]] = data[text_key]
    return data_dict


# ---------------------------------------------------------------------------
# ---------------------------------------------------------------------------
def save_tokenized_corpus(dataset, cache_dir):
    """Write ``dataset["docid2doc"]`` to ``<cache_dir>/docid2doc.jsonl``.

    :param dataset: dict with a ``"docid2doc"`` entry.
    :param cache_dir: output directory; created (including parents) if missing.
    """
    # makedirs(exist_ok=True) handles nested paths and avoids the
    # check-then-create race of exists() + mkdir().
    os.makedirs(cache_dir, exist_ok=True)

    # NOTE(review): save_dict2jsonl is not defined in the visible part of this
    # module -- confirm where it is supposed to come from.
    save_dict2jsonl(
        data_dict=dataset["docid2doc"],
        output_path=os.path.join(cache_dir, "docid2doc.jsonl"),
        id_name="docid",
        text_key="doc"
    )
def swish(x):
    """Swish activation: ``x * sigmoid(x)``."""
    return x * torch.sigmoid(x)


def _gelu_python(x):
    """ Original Implementation of the gelu activation function in Google Bert repo when initially created.
        For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
        0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
        This is now written in C in torch.nn.functional
        Also see https://arxiv.org/abs/1606.08415
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def gelu_new(x):
    """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
        Also see https://arxiv.org/abs/1606.08415
    """
    return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))


def _leading_int(text):
    """Return the integer formed by the leading digits of *text* (0 if there are none)."""
    digits = ""
    for ch in text:
        if not ch.isdigit():
            break
        digits += ch
    return int(digits) if digits else 0


# (major, minor) of the installed torch, parsed numerically.  Comparing the
# raw version strings is lexicographic for a plain ``str`` ("1.10" sorts
# before "1.4"), so the gate below works on integers instead.
_TORCH_MAJOR_MINOR = tuple(_leading_int(part) for part in str(torch.__version__).split(".")[:2])

# Use the pure-Python gelu on torch older than 1.4, the built-in one otherwise.
if _TORCH_MAJOR_MINOR < (1, 4):
    gelu = _gelu_python
else:
    gelu = F.gelu


def gelu_fast(x):
    """Tanh-based gelu approximation with the ``sqrt(2 / pi)`` factor written out as a constant."""
    return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x)))


# Registry of activation functions addressable by name.
ACT2FN = {
    "relu": F.relu,
    "swish": swish,
    "gelu": gelu,
    "tanh": torch.tanh,
    "gelu_new": gelu_new,
    "gelu_fast": gelu_fast,
}


def get_activation(activation_string):
    """Look up an activation function by name.

    :param activation_string: one of the keys of ``ACT2FN``.
    :return: the callable registered under that name.
    :raises KeyError: if the name is not registered.
    """
    if activation_string not in ACT2FN:
        raise KeyError("function {} not found in ACT2FN mapping {}".format(activation_string, list(ACT2FN.keys())))
    return ACT2FN[activation_string]
class BaseTransformersCLICommand(ABC):
    """Abstract interface for a CLI sub-command.

    A concrete command must provide both a static ``register_subcommand``
    hook and a ``run`` method; the class cannot be instantiated until both
    are overridden.
    """

    @staticmethod
    @abstractmethod
    def register_subcommand(parser: ArgumentParser):
        """Attach this command to *parser*; concrete commands must override."""
        raise NotImplementedError

    @abstractmethod
    def run(self):
        """Execute the command; concrete commands must override."""
        raise NotImplementedError
class EnvironmentCommand(BaseTransformersCLICommand):
    """``env`` sub-command: print and return a summary of the runtime environment."""

    @staticmethod
    def register_subcommand(parser: ArgumentParser):
        """Register the ``env`` sub-parser and bind it to ``info_command_factory``."""
        env_parser = parser.add_parser("env")
        env_parser.set_defaults(func=info_command_factory)

    def run(self):
        """Collect version/GPU information, print it as a bullet list and return it.

        Returns:
            dict mapping a human-readable label to the detected value. The last
            two entries are left empty for the user to fill in.
        """
        if is_torch_available():
            import torch

            pt_version = torch.__version__
            pt_cuda_available = torch.cuda.is_available()
        else:
            pt_version, pt_cuda_available = "not installed", "NA"

        if is_tf_available():
            import tensorflow as tf

            tf_version = tf.__version__
            try:
                # deprecated in v2.1
                tf_cuda_available = tf.test.is_gpu_available()
            except AttributeError:
                # returns list of devices, convert to bool
                tf_cuda_available = bool(tf.config.list_physical_devices("GPU"))
        else:
            tf_version, tf_cuda_available = "not installed", "NA"

        info = {
            "`transformers` version": version,
            "Platform": platform.platform(),
            "Python version": platform.python_version(),
            "PyTorch version (GPU?)": "{} ({})".format(pt_version, pt_cuda_available),
            "Tensorflow version (GPU?)": "{} ({})".format(tf_version, tf_cuda_available),
            "Using GPU in script?": "",
            "Using distributed or parallel set-up in script?": "",
        }

        print("\nCopy-and-paste the text below in your GitHub issue and FILL OUT the two last points.\n")
        print(self.format_dict(info))

        return info

    @staticmethod
    def format_dict(d):
        """Render ``d`` as one ``- key: value`` line per item, newline-terminated."""
        bullet_lines = ["- {}: {}".format(prop, val) for prop, val in d.items()]
        return "\n".join(bullet_lines) + "\n"
-------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/configuration_camembert.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 3 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ CamemBERT configuration """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_roberta import RobertaConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "camembert-base": "https://s3.amazonaws.com/models.huggingface.co/bert/camembert-base-config.json", 28 | "umberto-commoncrawl-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-commoncrawl-cased-v1/config.json", 29 | "umberto-wikipedia-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/Musixmatch/umberto-wikipedia-uncased-v1/config.json", 30 | } 31 | 32 | 33 | class CamembertConfig(RobertaConfig): 34 | """ 35 | This class overrides :class:`~transformers.RobertaConfig`. Please check the 36 | superclass for the appropriate documentation alongside usage examples. 
37 | """ 38 | 39 | model_type = "camembert" 40 | -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/configuration_marian.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The OPUS-NMT Team, Marian team, and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """ Marian model configuration """ 16 | 17 | from .configuration_bart import BartConfig 18 | 19 | 20 | PRETRAINED_CONFIG_ARCHIVE_MAP = { 21 | "Helsinki-NLP/opus-mt-en-de": "https://s3.amazonaws.com/models.huggingface.co/bert/Helsinki-NLP/opus-mt-en-de/config.json", 22 | } 23 | 24 | 25 | class MarianConfig(BartConfig): 26 | model_type = "marian" 27 | -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/configuration_mmbt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright (c) Facebook, Inc. and its affiliates. 3 | # Copyright (c) HuggingFace Inc. team. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 
class MMBTConfig(object):
    """Configuration class to store the configuration of a `MMBT Model`.

    Args:
        config (:obj:`~transformers.PreTrainedConfig`):
            Config of the underlying Transformer models. This instance adopts
            the config's attribute dictionary itself (shared, not duplicated),
            so attributes set here are also visible on ``config``.
        num_labels (:obj:`int` or :obj:`None`, optional, defaults to `None`):
            Size of final Linear layer for classification. Only stored when
            the value is truthy.
        modal_hidden_size (:obj:`int`, optional, defaults to 2048):
            Embedding dimension of the non-text modality encoder.
    """

    def __init__(self, config, num_labels=None, modal_hidden_size=2048):
        # Share one attribute namespace with the wrapped config; everything
        # assigned below therefore lands on `config` as well.
        self.__dict__ = config.__dict__
        self.modal_hidden_size = modal_hidden_size
        if not num_labels:
            # None / 0: leave whatever `num_labels` the wrapped config has (if any).
            return
        self.num_labels = num_labels
7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ XLM-RoBERTa configuration """ 17 | 18 | 19 | import logging 20 | 21 | from .configuration_roberta import RobertaConfig 22 | 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 27 | "xlm-roberta-base": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json", 28 | "xlm-roberta-large": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json", 29 | "xlm-roberta-large-finetuned-conll02-dutch": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-config.json", 30 | "xlm-roberta-large-finetuned-conll02-spanish": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-config.json", 31 | "xlm-roberta-large-finetuned-conll03-english": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-config.json", 32 | "xlm-roberta-large-finetuned-conll03-german": "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-config.json", 33 | } 34 | 35 | 36 | class XLMRobertaConfig(RobertaConfig): 37 | """ 38 | This class overrides :class:`~transformers.RobertaConfig`. Please check the 39 | superclass for the appropriate documentation alongside usage examples. 
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path):
    """Convert a TensorFlow ALBERT checkpoint into a PyTorch state dict file.

    Args:
        tf_checkpoint_path: path of the TensorFlow checkpoint to read.
        albert_config_file: JSON config file describing the model architecture.
        pytorch_dump_path: destination file for the saved ``state_dict``.
    """
    # Build an untrained PyTorch model with the architecture from the JSON config.
    albert_config = AlbertConfig.from_json_file(albert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(albert_config)))
    pt_model = AlbertForPreTraining(albert_config)

    # Copy the TensorFlow weights into the freshly built model.
    load_tf_weights_in_albert(pt_model, albert_config, tf_checkpoint_path)

    # Persist only the weights (state dict), not the whole module.
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    weights = pt_model.state_dict()
    torch.save(weights, pytorch_dump_path)
"--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." 48 | ) 49 | parser.add_argument( 50 | "--albert_config_file", 51 | default=None, 52 | type=str, 53 | required=True, 54 | help="The config json file corresponding to the pre-trained ALBERT model. \n" 55 | "This specifies the model architecture.", 56 | ) 57 | parser.add_argument( 58 | "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." 59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    """Convert a TensorFlow BERT checkpoint into a PyTorch state dict file.

    Args:
        tf_checkpoint_path: path of the TensorFlow checkpoint to read.
        bert_config_file: JSON config file describing the model architecture.
        pytorch_dump_path: destination file for the saved ``state_dict``.
    """
    # Build an untrained PyTorch model with the architecture from the JSON config.
    bert_config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(bert_config)))
    pt_model = BertForPreTraining(bert_config)

    # Copy the TensorFlow weights into the freshly built model.
    load_tf_weights_in_bert(pt_model, bert_config, tf_checkpoint_path)

    # Persist only the weights (state dict), not the whole module.
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    weights = pt_model.state_dict()
    torch.save(weights, pytorch_dump_path)
def convert_dialogpt_checkpoint(checkpoint_path: str, pytorch_dump_folder_path: str):
    """Rename the LM-head weight of a DialoGPT checkpoint and re-save it.

    The entry stored under ``OLD_KEY`` is moved to ``NEW_KEY``; every other
    entry is written back unchanged.

    Args:
        checkpoint_path: path of the original DialoGPT checkpoint file.
        pytorch_dump_folder_path: output directory (created if missing); the
            converted weights are written there under ``WEIGHTS_NAME``.

    Raises:
        KeyError: if the checkpoint has no ``OLD_KEY`` entry.
    """
    # NOTE(security): torch.load unpickles the file -- only use trusted checkpoints.
    # map_location="cpu" lets checkpoints saved from a GPU be converted on a
    # machine without CUDA instead of failing during deserialisation.
    state_dict = torch.load(checkpoint_path, map_location="cpu")
    state_dict[NEW_KEY] = state_dict.pop(OLD_KEY)
    os.makedirs(pytorch_dump_folder_path, exist_ok=True)
    torch.save(state_dict, os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME))
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
    """Convert a TensorFlow T5 checkpoint into a PyTorch state dict file.

    Args:
        tf_checkpoint_path: path of the TensorFlow checkpoint to read.
        config_file: JSON config file describing the model architecture.
        pytorch_dump_path: destination file for the saved ``state_dict``.
    """
    # Build an untrained PyTorch model with the architecture from the JSON config.
    t5_config = T5Config.from_json_file(config_file)
    print("Building PyTorch model from configuration: {}".format(str(t5_config)))
    pt_model = T5Model(t5_config)

    # Copy the TensorFlow weights into the freshly built model.
    load_tf_weights_in_t5(pt_model, t5_config, tf_checkpoint_path)

    # Persist only the weights (state dict), not the whole module.
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    weights = pt_model.state_dict()
    torch.save(weights, pytorch_dump_path)
59 | ) 60 | args = parser.parse_args() 61 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) 62 | -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/data/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | from .metrics import is_sklearn_available 6 | from .processors import ( 7 | DataProcessor, 8 | InputExample, 9 | InputFeatures, 10 | SingleSentenceClassificationProcessor, 11 | SquadExample, 12 | SquadFeatures, 13 | SquadV1Processor, 14 | SquadV2Processor, 15 | glue_convert_examples_to_features, 16 | glue_output_modes, 17 | glue_processors, 18 | glue_tasks_num_labels, 19 | squad_convert_examples_to_features, 20 | xnli_output_modes, 21 | xnli_processors, 22 | xnli_tasks_num_labels, 23 | ) 24 | 25 | 26 | if is_sklearn_available(): 27 | from .metrics import glue_compute_metrics, xnli_compute_metrics 28 | -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 
4 | 5 | from .glue import GlueDataset, GlueDataTrainingArguments 6 | from .language_modeling import LineByLineTextDataset, TextDataset 7 | -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/data/processors/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # There's no way to ignore "F401 '...' imported but unused" warnings in this 3 | # module, but to preserve other warnings. So, don't check this module at all. 4 | 5 | from .glue import glue_convert_examples_to_features, glue_output_modes, glue_processors, glue_tasks_num_labels 6 | from .squad import SquadExample, SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features 7 | from .utils import DataProcessor, InputExample, InputFeatures, SingleSentenceClassificationProcessor 8 | from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels 9 | -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/tokenization_bart.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import logging 17 | 18 | from .tokenization_roberta import RobertaTokenizer 19 | from .tokenization_xlm_roberta import XLMRobertaTokenizer 20 | 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | # vocab and merges same as roberta 26 | vocab_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json" 27 | merges_url = "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt" 28 | _all_bart_models = [ 29 | "facebook/bart-large", 30 | "facebook/bart-large-mnli", 31 | "facebook/bart-large-cnn", 32 | "facebook/bart-large-xsum", 33 | ] 34 | 35 | 36 | class BartTokenizer(RobertaTokenizer): 37 | # merges and vocab same as Roberta 38 | max_model_input_sizes = {m: 1024 for m in _all_bart_models} 39 | pretrained_vocab_files_map = { 40 | "vocab_file": {m: vocab_url for m in _all_bart_models}, 41 | "merges_file": {m: merges_url for m in _all_bart_models}, 42 | } 43 | 44 | 45 | _all_mbart_models = ["facebook/mbart-large-en-ro"] 46 | SPM_URL = "https://s3.amazonaws.com/models.huggingface.co/bert/facebook/mbart-large-en-ro/sentence.bpe.model" 47 | 48 | 49 | class MBartTokenizer(XLMRobertaTokenizer): 50 | vocab_files_names = {"vocab_file": "sentencepiece.bpe.model"} 51 | max_model_input_sizes = {m: 1024 for m in _all_mbart_models} 52 | pretrained_vocab_files_map = {"vocab_file": {m: SPM_URL for m in _all_mbart_models}} 53 | -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/contrastqg/transformers/trainer_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, NamedTuple, Optional 2 | 3 | import numpy as np 4 | 5 | 6 | class EvalPrediction(NamedTuple): 7 | """ 8 | Evaluation output (always contains labels), to be used 9 | to compute metrics. 
10 | """ 11 | 12 | predictions: np.ndarray 13 | label_ids: np.ndarray 14 | 15 | 16 | class PredictionOutput(NamedTuple): 17 | predictions: np.ndarray 18 | label_ids: Optional[np.ndarray] 19 | metrics: Optional[Dict[str, float]] 20 | 21 | 22 | class TrainOutput(NamedTuple): 23 | global_step: int 24 | training_loss: float 25 | 26 | 27 | PREFIX_CHECKPOINT_DIR = "checkpoint" 28 | -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/preprocess/prepro_dataset.sh: -------------------------------------------------------------------------------- 1 | export dataset_name= ## you need to set this 2 | export input_path=## you need to set this 3 | export output_path=## you need to set this 4 | 5 | python ./utils/prepro_dataset.py \ 6 | --dataset_name $dataset_name \ 7 | --input_path $input_path \ 8 | --output_path $output_path \ 9 | -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/preprocess/sample_contrast_pairs.sh: -------------------------------------------------------------------------------- 1 | export dataset_name= ## you need to set this 2 | export input_path= ## you need to set this 3 | export generator_folder=qg_t5-base ## qg_t5-small ; qg_t5-base 4 | 5 | python ./utils/sample_contrast_pairs.py \ 6 | --dataset_name $dataset_name \ 7 | --generator_folder $generator_folder \ 8 | --input_path $input_path \ 9 | --topk 100 \ 10 | --sample_n 5 11 | -------------------------------------------------------------------------------- /v1/Contrastive_Supervision_Synthesis/run_shell/cqg_inference.sh: -------------------------------------------------------------------------------- 1 | # !/bin/bash 2 | ## -------------------------------------------- 3 | export CUDA=2 4 | export pretrain_generator_type=t5-base ## t5-small ; t5-base 5 | export per_gpu_gen_batch_size=200 ## 200; 400 6 | export target_dataset= ## you need to set this 7 | export 
#!/bin/bash
# Run query generation (generator_mode=qg) with a trained generator.
## --------------------------------------------
export CUDA=2
export pretrain_generator_type=t5-base ## t5-small ; t5-base
export per_gpu_gen_batch_size=200 ## 200; 400
export target_dataset_name= ## you need to set this
export generator_mode=qg
## --------------------------------------------

## --------------------------------------------
export generator_load_dir= ## you need to set this
export target_dataset_dir= ## you need to set this
## --------------------------------------------

# Stop with a clear message when the placeholders above were left empty,
# instead of handing empty option values to the Python script.
: "${generator_load_dir:?set generator_load_dir at the top of this script}"
: "${target_dataset_dir:?set target_dataset_dir at the top of this script}"

# Expansions are quoted so paths containing spaces are passed as one argument.
CUDA_VISIBLE_DEVICES="$CUDA" python ../scripts/inference.py \
    --generator_mode "$generator_mode" \
    --pretrain_generator_type "$pretrain_generator_type" \
    --per_gpu_gen_batch_size "$per_gpu_gen_batch_size" \
    --generator_load_dir "$generator_load_dir" \
    --target_dataset_dir "$target_dataset_dir"
class QGenerator(object):
    """Wrapper around a fine-tuned T5 conditional-generation model.

    Args:
        args: options object; this class reads ``pretrain_generator_type``
            (passed to ``from_pretrained``) and ``generator_load_dir``
            (directory containing ``models.pkl``). ``args`` is also handed to
            ``utils.select_gen_input_refactor``.
        tokenizer: tokenizer whose length sizes the embedding matrix and whose
            ``convert_outputs_to_tokens`` decodes generated ids.
    """

    def __init__(self, args, tokenizer):
        self.network = T5ForConditionalGeneration.from_pretrained(args.pretrain_generator_type)
        self.network.resize_token_embeddings(len(tokenizer))
        checkpoint_path = os.path.join(args.generator_load_dir, 'models.pkl')
        # NOTE(security): torch.load unpickles the file -- only load trusted checkpoints.
        # map_location="cpu" keeps loading independent of the device the
        # checkpoint was saved from; set_device() moves the model afterwards.
        self.network.load_state_dict(torch.load(checkpoint_path, map_location="cpu"))
        logger.info("sccuess load checkpoint from {} !".format(args.generator_load_dir))
        self.tokenizer = tokenizer
        self.batchify_inputs = utils.select_gen_input_refactor(args)
        # Defined up front so the attributes exist before set_device()/parallelize().
        self.device = None
        self.parallel = False

    def predict(self, inputs):
        """Generate from ``inputs`` and return the decoded tokens.

        Args:
            inputs: mapping of keyword arguments forwarded to ``generate``.

        Returns:
            Whatever ``tokenizer.convert_outputs_to_tokens`` yields for the
            generated output.
        """
        self.network.eval()
        # DataParallel only dispatches forward(); generate() is defined on the
        # wrapped module, so unwrap it after parallelize() has been called.
        model = self.network
        if isinstance(model, torch.nn.DataParallel):
            model = model.module
        outputs = model.generate(**inputs)
        pred_tokens = self.tokenizer.convert_outputs_to_tokens(outputs)
        return pred_tokens

    def set_device(self, device):
        """Remember ``device`` and move the network onto it."""
        self.device = device
        self.network.to(self.device)

    def parallelize(self):
        """Use data parallel to copy the model across several gpus.
        This will take all gpus visible with CUDA_VISIBLE_DEVICES.
        """
        self.parallel = True
        self.network = torch.nn.DataParallel(self.network)
WORKDIR /workspace/OpenMatch 36 | RUN python setup.py install 37 | WORKDIR /workspace 38 | 39 | ENTRYPOINT ["/bin/bash"] 40 | -------------------------------------------------------------------------------- /v1/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 THUNLP 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /v1/LeToR/RankLib-2.1-patched.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/LeToR/RankLib-2.1-patched.jar -------------------------------------------------------------------------------- /v1/LeToR/gen_trec.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | def main(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('-dev', type=str, default='../data/dev_toy.jsonl') 7 | parser.add_argument('-res', type=str, default='../results/cknrm_ca.trec') 8 | parser.add_argument('-k', type=int, default=2) 9 | args = parser.parse_args() 10 | 11 | score_dic = {} 12 | for i in range(args.k): 13 | with open('f' + str(i+1) + '.score', 'r') as r: 14 | for line in r: 15 | line = line.strip('\n').split('\t') 16 | score_dic[line[0] + '$' + line[1]] = line[2] 17 | 18 | if args.k == -1: 19 | with open('f' + str(args.k+1) + '.score', 'r') as r: 20 | for line in r: 21 | line = line.strip('\n').split('\t') 22 | score_dic[line[0] + '$' + line[1]] = line[2] 23 | 24 | outs = {} 25 | with open(args.dev, 'r') as r: 26 | qid = '' 27 | cnt = 0 28 | for line in r: 29 | line = json.loads(line) 30 | if line['query_id'] != qid: 31 | qid = line['query_id'] 32 | cnt = 0 33 | outs[line['query_id']] = {} 34 | outs[line['query_id']][line['doc_id']] = float(score_dic[line['query_id']+'$'+str(cnt)]) 35 | cnt += 1 36 | 37 | f = open(args.res, 'w') 38 | for qid in outs: 39 | ps = {} 40 | out_idx = sorted(outs[qid].items(), key=lambda x:x[1], reverse=True) 41 | for i, out in enumerate(out_idx): 42 | if out[0] not in ps: 43 | ps[out[0]] = 1 44 | f.write(' '.join([qid, 'Q0', out[0], str(len(ps)), str(out[1]), 'default']) + '\n') 45 | f.close() 46 | 47 | if __name__ == "__main__": 48 | main() 49 | 
-------------------------------------------------------------------------------- /v1/OpenMatch/__init__.py: -------------------------------------------------------------------------------- 1 | from OpenMatch.data import * 2 | from OpenMatch.extractors import * 3 | from OpenMatch.metrics import * 4 | from OpenMatch.models import * 5 | from OpenMatch.modules import * 6 | from OpenMatch.utils import * 7 | -------------------------------------------------------------------------------- /v1/OpenMatch/data/__init__.py: -------------------------------------------------------------------------------- 1 | from OpenMatch.data.dataloader import DataLoader 2 | from OpenMatch.data.datasets import * 3 | from OpenMatch.data.tokenizers import * 4 | -------------------------------------------------------------------------------- /v1/OpenMatch/data/dataloader.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Any 2 | 3 | import torch 4 | from torch.utils.data import DataLoader, Sampler 5 | from torch.utils.data.distributed import DistributedSampler 6 | 7 | from OpenMatch.data.datasets import Dataset 8 | 9 | class DataLoader(DataLoader): 10 | def __init__( 11 | self, 12 | dataset: Dataset, 13 | batch_size: int, 14 | shuffle: str = False, 15 | num_workers: int = 0, 16 | sampler = None, 17 | ) -> None: 18 | super().__init__( 19 | dataset = dataset, 20 | batch_size = batch_size, 21 | shuffle = shuffle, 22 | num_workers = num_workers, 23 | collate_fn = dataset.collate, 24 | sampler = sampler, 25 | ) 26 | -------------------------------------------------------------------------------- /v1/OpenMatch/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from OpenMatch.data.datasets.dataset import Dataset 2 | from OpenMatch.data.datasets.edrm_dataset import EDRMDataset 3 | from OpenMatch.data.datasets.bert_dataset import BertDataset 4 | from 
OpenMatch.data.datasets.meta_bert_dataset import MetaBertDataset 5 | from OpenMatch.data.datasets.roberta_dataset import RobertaDataset 6 | from OpenMatch.data.datasets.bertmlm_dataset import BertMLMDataset 7 | from OpenMatch.data.datasets.bertmaxp_dataset import BertMaxPDataset 8 | -------------------------------------------------------------------------------- /v1/OpenMatch/data/tokenizers/__init__.py: -------------------------------------------------------------------------------- 1 | from OpenMatch.data.tokenizers.tokenizer import Tokenizer 2 | from OpenMatch.data.tokenizers.word_tokenizer import WordTokenizer 3 | -------------------------------------------------------------------------------- /v1/OpenMatch/data/tokenizers/word_tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from nltk import word_tokenize 4 | 5 | from OpenMatch.data.tokenizers import Tokenizer 6 | 7 | class WordTokenizer(Tokenizer): 8 | def tokenize(self, text: str) -> List[str]: 9 | tokens = word_tokenize(text) 10 | return tokens 11 | -------------------------------------------------------------------------------- /v1/OpenMatch/extractors/__init__.py: -------------------------------------------------------------------------------- 1 | from OpenMatch.extractors.classic_extractor import ClassicExtractor 2 | -------------------------------------------------------------------------------- /v1/OpenMatch/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | from OpenMatch.metrics.metric import Metric 2 | -------------------------------------------------------------------------------- /v1/OpenMatch/metrics/metric.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict 2 | 3 | import pytrec_eval 4 | 5 | class Metric(): 6 | def get_metric(self, qrels: str, trec: str, metric: str = 'ndcg_cut_10') -> Dict[str, 
float]: 7 | with open(qrels, 'r') as f_qrel: 8 | qrel = pytrec_eval.parse_qrel(f_qrel) 9 | with open(trec, 'r') as f_run: 10 | run = pytrec_eval.parse_run(f_run) 11 | 12 | evaluator = pytrec_eval.RelevanceEvaluator(qrel, pytrec_eval.supported_measures) 13 | results = evaluator.evaluate(run) 14 | for query_id, query_measures in sorted(results.items()): 15 | pass 16 | mes = {} 17 | for measure in sorted(query_measures.keys()): 18 | mes[measure] = pytrec_eval.compute_aggregated_measure(measure, [query_measures[measure] for query_measures in results.values()]) 19 | return mes[metric] 20 | 21 | def get_mrr(self, qrels: str, trec: str, metric: str = 'mrr_cut_10') -> float: 22 | k = int(metric.split('_')[-1]) 23 | 24 | qrel = {} 25 | with open(qrels, 'r') as f_qrel: 26 | for line in f_qrel: 27 | qid, _, did, label = line.strip().split() 28 | if qid not in qrel: 29 | qrel[qid] = {} 30 | qrel[qid][did] = int(label) 31 | 32 | run = {} 33 | with open(trec, 'r') as f_run: 34 | for line in f_run: 35 | qid, _, did, _, _, _ = line.strip().split() 36 | if qid not in run: 37 | run[qid] = [] 38 | run[qid].append(did) 39 | 40 | mrr = 0.0 41 | for qid in run: 42 | rr = 0.0 43 | for i, did in enumerate(run[qid][:k]): 44 | if qid in qrel and did in qrel[qid] and qrel[qid][did] > 0: 45 | rr = 1 / (i+1) 46 | break 47 | mrr += rr 48 | mrr /= len(run) 49 | return mrr 50 | -------------------------------------------------------------------------------- /v1/OpenMatch/models/__init__.py: -------------------------------------------------------------------------------- 1 | from OpenMatch.models.bert import Bert 2 | from OpenMatch.models.bert_maxp import BertMaxP 3 | from OpenMatch.models.conv_knrm import ConvKNRM 4 | from OpenMatch.models.knrm import KNRM 5 | from OpenMatch.models.tk import TK 6 | from OpenMatch.models.edrm import EDRM 7 | -------------------------------------------------------------------------------- /v1/OpenMatch/models/bert.py: 
-------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from transformers import AutoConfig, AutoModel 7 | 8 | class Bert(nn.Module): 9 | def __init__( 10 | self, 11 | pretrained: str, 12 | mode: str = 'cls', 13 | task: str = 'ranking' 14 | ) -> None: 15 | super(Bert, self).__init__() 16 | self._pretrained = pretrained 17 | self._mode = mode 18 | self._task = task 19 | 20 | self._config = AutoConfig.from_pretrained(self._pretrained) 21 | self._model = AutoModel.from_pretrained(self._pretrained, config=self._config) 22 | 23 | if self._task == 'ranking': 24 | self._dense = nn.Linear(self._config.hidden_size, 1) 25 | elif self._task == 'classification': 26 | self._dense = nn.Linear(self._config.hidden_size, 2) 27 | else: 28 | raise ValueError('Task must be `ranking` or `classification`.') 29 | 30 | def forward(self, input_ids: torch.Tensor, input_mask: torch.Tensor = None, segment_ids: torch.Tensor = None) -> Tuple[torch.Tensor, torch.Tensor]: 31 | output = self._model(input_ids, attention_mask = input_mask, token_type_ids = segment_ids) 32 | if self._mode == 'cls': 33 | logits = output[0][:, 0, :] 34 | elif self._mode == 'pooling': 35 | logits = output[1] 36 | else: 37 | raise ValueError('Mode must be `cls` or `pooling`.') 38 | score = self._dense(logits).squeeze(-1) 39 | return score, logits 40 | -------------------------------------------------------------------------------- /v1/OpenMatch/models/bert_maxp.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from transformers import AutoConfig, AutoModel 7 | 8 | class BertMaxP(nn.Module): 9 | def __init__( 10 | self, 11 | pretrained: str, 12 | max_query_len: int, 13 | max_doc_len: int, 14 | mode: str = 'cls', 15 | task: str = 'ranking' 16 | ) -> None: 17 | super(BertMaxP, self).__init__() 18 | 
self._pretrained = pretrained 19 | self._max_query_len = max_query_len 20 | self._max_doc_len = max_doc_len 21 | self._mode = mode 22 | self._task = task 23 | 24 | self._config = AutoConfig.from_pretrained(self._pretrained) 25 | self._model = AutoModel.from_pretrained(self._pretrained, config=self._config) 26 | 27 | self._dense1 = nn.Linear(self._config.hidden_size, 128) 28 | self._activation = nn.ReLU() 29 | 30 | if self._task == 'ranking': 31 | self._dense2 = nn.Linear(128, 1) 32 | elif self._task == 'classification': 33 | self._dense2 = nn.Linear(128, 2) 34 | else: 35 | raise ValueError('Task must be `ranking` or `classification`.') 36 | 37 | def forward(self, input_ids: torch.Tensor, input_mask: torch.Tensor = None, segment_ids: torch.Tensor = None) -> Tuple[torch.Tensor, torch.Tensor]: 38 | num = input_ids.size()[0] 39 | output = self._model(input_ids.view(num*4, self._max_query_len+self._max_doc_len+3), attention_mask = input_mask.view(num*4, self._max_query_len+self._max_doc_len+3), token_type_ids = segment_ids.view(num*4, self._max_query_len+self._max_doc_len+3)) 40 | 41 | if self._mode == 'cls': 42 | logits = output[0][:, 0, :].view(num,4,-1).max(dim=1)[0] 43 | elif self._mode == 'pooling': 44 | logits = output[1].view(num,4,-1).max(dim=1)[0] 45 | else: 46 | raise ValueError('Mode must be `cls` or `pooling`.') 47 | logits = self._activation(self._dense1(logits)) 48 | score = self._dense2(logits).squeeze(-1) 49 | return score, logits 50 | -------------------------------------------------------------------------------- /v1/OpenMatch/models/conv_knrm.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from OpenMatch.modules.embedders import Embedder 7 | from OpenMatch.modules.encoders import Conv1DEncoder 8 | from OpenMatch.modules.matchers import KernelMatcher 9 | 10 | class ConvKNRM(nn.Module): 11 | def __init__( 12 | self, 13 | 
vocab_size: int, 14 | embed_dim: int, 15 | kernel_num: int = 21, 16 | kernel_dim: int = 128, 17 | kernel_sizes: List[int] = [1, 2, 3], 18 | embed_matrix: List[float] = None, 19 | task: str = 'ranking' 20 | ) -> None: 21 | super(ConvKNRM, self).__init__() 22 | self._vocab_size = vocab_size 23 | self._embed_dim = embed_dim 24 | self._kernel_num = kernel_num 25 | self._kernel_dim = kernel_dim 26 | self._kernel_sizes = kernel_sizes 27 | self._embed_matrix = embed_matrix 28 | self._task = task 29 | 30 | self._embedder = Embedder(self._vocab_size, self._embed_dim, self._embed_matrix) 31 | self._encoder = Conv1DEncoder(self._embed_dim, self._kernel_dim, self._kernel_sizes) 32 | self._matcher = KernelMatcher(self._encoder.get_output_dim(), self._kernel_num) 33 | if self._task == 'ranking': 34 | self._dense = nn.Linear(self._kernel_num * (len(self._kernel_sizes) ** 2), 1) 35 | elif self._task == 'classification': 36 | self._dense = nn.Linear(self._kernel_num * (len(self._kernel_sizes) ** 2), 2) 37 | else: 38 | raise ValueError('Task must be `ranking` or `classification`.') 39 | 40 | def forward(self, query_ids: torch.Tensor, query_masks: torch.Tensor, doc_ids: torch.Tensor, doc_masks: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 41 | query_embed = self._embedder(query_ids) 42 | doc_embed = self._embedder(doc_ids) 43 | _, query_encs = self._encoder(query_embed, query_masks) 44 | _, doc_encs = self._encoder(doc_embed, doc_masks) 45 | 46 | logits = torch.cat([self._matcher(query_enc, query_masks[:, :query_enc.size()[1]], doc_enc, doc_masks[:, :doc_enc.size()[1]]) 47 | for query_enc in query_encs for doc_enc in doc_encs], dim=1) 48 | score = self._dense(logits).squeeze(-1) 49 | return score, logits 50 | -------------------------------------------------------------------------------- /v1/OpenMatch/models/knrm.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import torch 4 | import torch.nn as 
nn 5 | 6 | from OpenMatch.modules.embedders import Embedder 7 | from OpenMatch.modules.matchers import KernelMatcher 8 | 9 | class KNRM(nn.Module): 10 | def __init__( 11 | self, 12 | vocab_size: int, 13 | embed_dim: int, 14 | kernel_num: int = 21, 15 | embed_matrix: List[float] = None, 16 | task: str = 'ranking' 17 | ) -> None: 18 | super(KNRM, self).__init__() 19 | self._vocab_size = vocab_size 20 | self._embed_dim = embed_dim 21 | self._kernel_num = kernel_num 22 | self._embed_matrix = embed_matrix 23 | self._task = task 24 | 25 | self._embedder = Embedder(self._vocab_size, self._embed_dim, self._embed_matrix) 26 | self._matcher = KernelMatcher(self._embed_dim, self._kernel_num) 27 | if self._task == 'ranking': 28 | self._dense = nn.Linear(self._kernel_num, 1) 29 | elif self._task == 'classification': 30 | self._dense = nn.Linear(self._kernel_num, 2) 31 | else: 32 | raise ValueError('Task must be `ranking` or `classification`.') 33 | 34 | def forward(self, query_ids: torch.Tensor, query_masks: torch.Tensor, doc_ids: torch.Tensor, doc_masks: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: 35 | query_embed = self._embedder(query_ids) 36 | doc_embed = self._embedder(doc_ids) 37 | 38 | logits = self._matcher(query_embed, query_masks, doc_embed, doc_masks) 39 | score = self._dense(logits).squeeze(-1) 40 | return score, logits 41 | -------------------------------------------------------------------------------- /v1/OpenMatch/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from OpenMatch.modules.attentions import * 2 | from OpenMatch.modules.embedders import * 3 | from OpenMatch.modules.encoders import * 4 | from OpenMatch.modules.matchers import * 5 | -------------------------------------------------------------------------------- /v1/OpenMatch/modules/attentions/__init__.py: -------------------------------------------------------------------------------- 1 | from 
OpenMatch.modules.attentions.multi_head_attention import MultiHeadAttention 2 | from OpenMatch.modules.attentions.scaled_dot_product_attention import ScaledDotProductAttention 3 | -------------------------------------------------------------------------------- /v1/OpenMatch/modules/attentions/multi_head_attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from .scaled_dot_product_attention import ScaledDotProductAttention 5 | 6 | class MultiHeadAttention(nn.Module): 7 | def __init__( 8 | self, 9 | embed_dim: int = 512, 10 | head_num: int = 8, 11 | dropout: float = 0.0 12 | ) -> None: 13 | super(MultiHeadAttention, self).__init__() 14 | self._embed_dim = embed_dim 15 | self._head_num = head_num 16 | self._head_dim = self._embed_dim // self._head_num 17 | assert self._head_dim * self._head_num == self._embed_dim, 'embed_dim must be divisible by num_heads' 18 | 19 | self._fcq = nn.Linear(self._embed_dim, self._head_dim * self._head_num) 20 | self._fck = nn.Linear(self._embed_dim, self._head_dim * self._head_num) 21 | self._fcv = nn.Linear(self._embed_dim, self._head_dim * self._head_num) 22 | self._attention = ScaledDotProductAttention(dropout) 23 | self._fc = nn.Linear(self._embed_dim, self._embed_dim) 24 | self._dropout = nn.Dropout(dropout) 25 | self._norm = nn.LayerNorm(self._embed_dim) 26 | 27 | def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, attn_mask: torch.Tensor=None) -> torch.Tensor: 28 | residual = query 29 | batch_size = query.size(0) 30 | query = self._fcq(query).view(batch_size * self._head_num, -1, self._head_dim) 31 | key = self._fck(key).view(batch_size * self._head_num, -1, self._head_dim) 32 | value = self._fcv(value).view(batch_size * self._head_num, -1, self._head_dim) 33 | 34 | scale = (query.size(-1) // self._head_num) ** -0.5 35 | if attn_mask is not None: 36 | attn_mask = attn_mask.repeat(self._head_num, 1, 1) 37 | 
context, attn = self._attention(query, key, value, scale, attn_mask) 38 | context = context.view(batch_size, -1, self._head_num * self._head_dim) 39 | output = self._fc(context) 40 | output = self._dropout(output) 41 | output = self._norm(residual + output) 42 | return output, attn 43 | -------------------------------------------------------------------------------- /v1/OpenMatch/modules/attentions/scaled_dot_product_attention.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class ScaledDotProductAttention(nn.Module): 5 | def __init__( 6 | self, 7 | dropout: float = 0.0 8 | ) -> None: 9 | super(ScaledDotProductAttention, self).__init__() 10 | self._dropout = nn.Dropout(dropout) 11 | self._softmax = nn.Softmax(dim=2) 12 | 13 | def forward(self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, scale: float=None, attn_mask: torch.Tensor=None) -> torch.Tensor: 14 | attn = torch.bmm(query, key.transpose(1, 2)) 15 | if scale is not None: 16 | attn *= scale 17 | if attn_mask is not None: 18 | attn = attn.masked_fill(attn_mask, -1.0e32) 19 | attn = self._softmax(attn) 20 | attn = self._dropout(attn) 21 | context = torch.bmm(attn, value) 22 | return context, attn 23 | -------------------------------------------------------------------------------- /v1/OpenMatch/modules/embedders/__init__.py: -------------------------------------------------------------------------------- 1 | from OpenMatch.modules.embedders.embedder import Embedder 2 | -------------------------------------------------------------------------------- /v1/OpenMatch/modules/embedders/embedder.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | class Embedder(nn.Module): 7 | def __init__( 8 | self, 9 | vocab_size: int, 10 | embed_dim: int, 11 | embed_matrix: List[float] = None 12 | ) -> None: 13 | 
super(Embedder, self).__init__() 14 | self._vocab_size = vocab_size 15 | self._embed_dim = embed_dim 16 | 17 | self._embedder = nn.Embedding(self._vocab_size, self._embed_dim, padding_idx=0) 18 | if embed_matrix is not None: 19 | self._embed_matrix = torch.tensor(embed_matrix) 20 | self._embedder.weight = nn.Parameter(self._embed_matrix, requires_grad=True) 21 | 22 | def forward(self, idx: torch.Tensor) -> torch.Tensor: 23 | embed = self._embedder(idx) 24 | return embed 25 | -------------------------------------------------------------------------------- /v1/OpenMatch/modules/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | from OpenMatch.modules.encoders.cnn_encoder import Conv1DEncoder 2 | from OpenMatch.modules.encoders.feed_forward_encoder import FeedForwardEncoder 3 | from OpenMatch.modules.encoders.positional_encoder import PositionalEncoder 4 | from OpenMatch.modules.encoders.transformer_encoder import TransformerEncoder 5 | -------------------------------------------------------------------------------- /v1/OpenMatch/modules/encoders/cnn_encoder.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | class Conv1DEncoder(nn.Module): 7 | def __init__( 8 | self, 9 | embed_dim: int, 10 | kernel_dim: int, 11 | kernel_sizes: List[int] = [2, 3, 4, 5], 12 | stride: int = 1 13 | ) -> None: 14 | super(Conv1DEncoder, self).__init__() 15 | self._embed_dim = embed_dim 16 | self._kernel_dim = kernel_dim 17 | self._kernel_sizes = kernel_sizes 18 | self._stride = stride 19 | self._output_dim = self._kernel_dim * len(self._kernel_sizes) 20 | 21 | self._encoder = nn.ModuleList([ 22 | nn.Conv1d( 23 | in_channels=self._embed_dim, 24 | out_channels=self._kernel_dim, 25 | kernel_size=kernel_size, 26 | stride = self._stride 27 | ) 28 | for kernel_size in self._kernel_sizes 29 | ]) 30 | self._activation = 
nn.ReLU() 31 | 32 | def get_output_dim(self) -> int: 33 | return self._output_dim 34 | 35 | def forward(self, embed: torch.Tensor, masks: torch.Tensor = None) -> Tuple[torch.Tensor, List[torch.Tensor]]: 36 | if masks is not None: 37 | embed = embed * masks.unsqueeze(-1) 38 | embed = torch.transpose(embed, 1, 2) 39 | 40 | kernel_outputs = [self._activation(enc(embed)) for enc in self._encoder] 41 | pooling_sums = [kernel_output.max(dim=2).values for kernel_output in kernel_outputs] 42 | enc = (torch.cat(pooling_sums, dim=1) if len(pooling_sums) > 1 else pooling_sums[0]) 43 | return enc, [torch.transpose(kernel_output, 1, 2) for kernel_output in kernel_outputs] 44 | -------------------------------------------------------------------------------- /v1/OpenMatch/modules/encoders/feed_forward_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class FeedForwardEncoder(nn.Module): 5 | def __init__( 6 | self, 7 | embed_dim: int, 8 | hidden_dim: int, 9 | dropout: float = 0.1 10 | ) -> None: 11 | super(FeedForwardEncoder, self).__init__() 12 | self._embed_dim = embed_dim 13 | self._hidden_dim = hidden_dim 14 | 15 | self._fc1 = torch.nn.Linear(self._embed_dim, self._hidden_dim) 16 | self._fc2 = torch.nn.Linear(self._hidden_dim, self._embed_dim) 17 | self._dropout = nn.Dropout(dropout) 18 | self._activation = nn.ReLU() 19 | self._norm = nn.LayerNorm(self._embed_dim) 20 | 21 | def forward(self, embed: torch.Tensor) -> torch.Tensor: 22 | enc = self._dropout(self._fc2(self._activation(self._fc1(embed)))) 23 | enc = self._norm(embed + enc) 24 | return enc 25 | -------------------------------------------------------------------------------- /v1/OpenMatch/modules/encoders/positional_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class PositionalEncoder(nn.Module): 5 | def __init__( 6 | self, 7 | 
embed_dim: int, 8 | max_len: int = 512 9 | ) -> None: 10 | super(PositionalEncoder, self).__init__() 11 | self._embed_dim = embed_dim 12 | self._max_len = max_len 13 | 14 | self._embed_matrix = torch.tensor( 15 | [[pos / pow(1.0e4, 2.0 * (i // 2) / self._embed_dim) for i in range(self._embed_dim)] for pos in range(self._max_len)] 16 | ) 17 | self._embed_matrix[:, 0::2] = torch.sin(self._embed_matrix[:, 0::2]) 18 | self._embed_matrix[:, 1::2] = torch.cos(self._embed_matrix[:, 1::2]) 19 | self._embedder = nn.Embedding(self._max_len, self._embed_dim) 20 | self._embedder.weight = nn.Parameter(self._embed_matrix, requires_grad=False) 21 | 22 | def forward(self, embed: torch.Tensor) -> torch.Tensor: 23 | token_len = embed.size()[1] 24 | if embed.is_cuda: 25 | ids = torch.cuda.LongTensor([l for l in range(token_len)]) 26 | else: 27 | ids = torch.LongTensor([l for l in range(token_len)]) 28 | embed += self._embedder(ids) 29 | return embed 30 | -------------------------------------------------------------------------------- /v1/OpenMatch/modules/encoders/transformer_encoder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from OpenMatch.modules.attentions import MultiHeadAttention 5 | from .feed_forward_encoder import FeedForwardEncoder 6 | from .positional_encoder import PositionalEncoder 7 | 8 | class TransformerEncoderLayer(nn.Module): 9 | def __init__( 10 | self, 11 | embed_dim: int, 12 | head_num: int = 8, 13 | hidden_dim: int = 2048, 14 | dropout: float = 0.0 15 | ) -> None: 16 | super(TransformerEncoderLayer, self).__init__() 17 | self._embed_dim = embed_dim 18 | self._head_num = head_num 19 | self._hidden_dim = hidden_dim 20 | self._dropout = dropout 21 | 22 | self._attention = MultiHeadAttention(self._embed_dim, self._head_num, dropout=self._dropout) 23 | self._feed_forward = FeedForwardEncoder(self._embed_dim, self._hidden_dim, self._dropout) 24 | 25 | def forward(self, embed: 
torch.Tensor, mask: torch.Tensor) -> torch.Tensor: 26 | embed, weights = self._attention(embed, embed, embed, attn_mask=mask) 27 | enc = self._feed_forward(embed) 28 | return enc 29 | 30 | class TransformerEncoder(nn.Module): 31 | def __init__( 32 | self, 33 | embed_dim: int, 34 | head_num: int = 8, 35 | hidden_dim: int = 2048, 36 | layer_num: int = 6, 37 | dropout: float = 0.0 38 | ) -> None: 39 | super(TransformerEncoder, self).__init__() 40 | self._embed_dim = embed_dim 41 | self._head_num = head_num 42 | self._hidden_dim = hidden_dim 43 | self._layer_num = layer_num 44 | self._dropout = dropout 45 | 46 | self._pos_encoder = PositionalEncoder(self._embed_dim) 47 | self._layers = nn.ModuleList([ 48 | TransformerEncoderLayer(self._embed_dim, self._head_num, self._hidden_dim, self._dropout) for _ in range(self._layer_num) 49 | ]) 50 | 51 | def forward(self, embed: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: 52 | enc = self._pos_encoder(embed) 53 | for layer in self._layers: 54 | enc = layer(enc, mask) 55 | return enc 56 | -------------------------------------------------------------------------------- /v1/OpenMatch/modules/matchers/__init__.py: -------------------------------------------------------------------------------- 1 | from OpenMatch.modules.matchers.kernel_matcher import KernelMatcher 2 | -------------------------------------------------------------------------------- /v1/OpenMatch/modules/matchers/kernel_matcher.py: -------------------------------------------------------------------------------- 1 | from typing import Dict 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | class KernelMatcher(nn.Module): 8 | def __init__( 9 | self, 10 | embed_dim: int, 11 | kernel_num: int = 21 12 | ) -> None: 13 | super(KernelMatcher, self).__init__() 14 | self._embed_dim = embed_dim 15 | self._kernel_num = kernel_num 16 | mus, sigmas = self.kernel_init(self._kernel_num) 17 | self._mus = nn.Parameter(mus, 
requires_grad=False) 18 | self._sigmas = nn.Parameter(sigmas, requires_grad=False) 19 | 20 | def kernel_init(self, kernel_num: int) -> Dict[str, torch.Tensor]: 21 | mus = [1] 22 | bin_size = 2.0/(kernel_num-1) 23 | mus.append(1-bin_size/2) 24 | for i in range(1, kernel_num-1): 25 | mus.append(mus[i]-bin_size) 26 | mus = torch.tensor(mus).view(1, 1, 1, kernel_num) 27 | 28 | sigmas = [0.001] 29 | sigmas += [0.1]*(kernel_num-1) 30 | sigmas = torch.tensor(sigmas).view(1, 1, 1, kernel_num) 31 | return mus, sigmas 32 | 33 | def forward(self, k_embed: torch.Tensor, k_mask: torch.Tensor, v_embed: torch.Tensor, v_mask: torch.Tensor) -> torch.Tensor: 34 | k_embed = k_embed * k_mask.unsqueeze(-1) 35 | v_embed = v_embed * v_mask.unsqueeze(-1) 36 | k_by_v_mask = torch.bmm(k_mask.float().unsqueeze(-1), v_mask.float().unsqueeze(-1).transpose(1, 2)) 37 | k_norm = F.normalize(k_embed, p=2, dim=2, eps=1e-10) 38 | v_norm = F.normalize(v_embed, p=2, dim=2, eps=1e-10) 39 | inter = (torch.bmm(k_norm, v_norm.transpose(1, 2)) * k_by_v_mask).unsqueeze(-1) 40 | 41 | kernel_outputs = torch.exp((-((inter-self._mus)**2)/(self._sigmas**2)/2)) 42 | kernel_outputs = kernel_outputs.sum(dim=2).clamp(min=1e-10).log() * 1e-2 43 | logits = kernel_outputs.sum(dim=1) 44 | return logits 45 | -------------------------------------------------------------------------------- /v1/OpenMatch/utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from argparse import Action 4 | 5 | class DictOrStr(Action): 6 | def __call__(self, parser, namespace, values, option_string=None): 7 | if '=' in values: 8 | my_dict = {} 9 | for kv in values.split(","): 10 | k,v = kv.split("=") 11 | my_dict[k] = v 12 | setattr(namespace, self.dest, my_dict) 13 | else: 14 | setattr(namespace, self.dest, values) 15 | 16 | def check_dir(path): 17 | if not os.path.exists(path): 18 | os.makedirs(path) 19 | return path 20 | 21 | def save_trec(rst_file, rst_dict): 22 | with 
open(rst_file, 'w') as writer: 23 | for q_id, scores in rst_dict.items(): 24 | res = sorted(scores.items(), key=lambda x: x[1][0], reverse=True) 25 | for rank, value in enumerate(res): 26 | writer.write(q_id+' Q0 '+str(value[0])+' '+str(rank+1)+' '+str(value[1][0])+' openmatch\n') 27 | return 28 | 29 | def save_features(rst_file, features): 30 | with open(rst_file, 'w') as writer: 31 | for feature in features: 32 | writer.write(feature+'\n') 33 | return 34 | -------------------------------------------------------------------------------- /v1/checkpoints/README.md: -------------------------------------------------------------------------------- 1 | # Save Checkpoints 2 | Save checkpoints of neural rankers and coor-ascent weights. 3 | -------------------------------------------------------------------------------- /v1/coor_ascent.sh: -------------------------------------------------------------------------------- 1 | java -cp LeToR/RankLib-2.1-patched.jar ciir.umass.edu.features.FeatureManager -input features/bert_features -output features/ -k 2 2 | java -jar LeToR/RankLib-2.1-patched.jar -train features/bert_features -ranker 4 -kcv 2 -kcvmd checkpoints/ -kcvmn ca -metric2t NDCG@20 -metric2T NDCG@20 3 | java -jar LeToR/RankLib-2.1-patched.jar -load checkpoints/f1.ca -rank features/f1.test.bert_features -score f1.score 4 | java -jar LeToR/RankLib-2.1-patched.jar -load checkpoints/f2.ca -rank features/f2.test.bert_features -score f2.score 5 | python LeToR/gen_trec.py -dev data/dev_toy.jsonl -res results/bert_ca.trec -k 2 6 | rm f1.score 7 | rm f2.score 8 | -------------------------------------------------------------------------------- /v1/data/filter.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | def main(): 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument('-input_qrels', type=str, default=None) 7 | parser.add_argument('-input_trec', type=str) 8 | 
parser.add_argument('-topk', type=int, default=1000) 9 | parser.add_argument('-output', type=str) 10 | args = parser.parse_args() 11 | 12 | last_qds = {} 13 | if args.input_qrels is not None: 14 | with open(args.input_qrels, 'r') as r: 15 | for line in r: 16 | line = line.strip().split() 17 | if line[0] not in last_qds: 18 | last_qds[line[0]] = {} 19 | last_qds[line[0]][line[2]] = 1 20 | 21 | f = open(args.output, 'w') 22 | qds = {} 23 | with open(args.input_trec, 'r') as r: 24 | for line in r: 25 | line = line.strip().split() 26 | if line[0] not in qds: 27 | qds[line[0]] = [] 28 | if len(qds[line[0]]) >= args.topk: 29 | continue 30 | if line[0] in last_qds and line[2] in last_qds[line[0]]: 31 | continue 32 | else: 33 | qds[line[0]].append(line[2]) 34 | f.write(' '.join(line) + '\n') 35 | f.close() 36 | 37 | if __name__ == "__main__": 38 | main() 39 | -------------------------------------------------------------------------------- /v1/data/qrels_toy: -------------------------------------------------------------------------------- 1 | THUMSR-22 0 nz02frdm_5 0 2 | THUMSR-22 0 hc0za6fh_1 2 3 | THUMSR-22 0 oqy6ns00_2 2 4 | THUMSR-22 0 gzc72bdy_5 1 5 | THUMSR-22 0 5ciaonf0_1 1 6 | THUMSR-22 0 hhsfq8bz_6 2 7 | THUMSR-22 0 std4jddn_12 0 8 | THUMSR-22 0 8eh90ber_0 0 9 | THUMSR-22 0 2ioap802_12 0 10 | THUMSR-22 0 m9i9gu5g_6 1 11 | THUMSR-22 0 2u4d235j_1 2 12 | THUMSR-22 0 oqy6ns00_3 0 13 | THUMSR-22 0 k1lg8c7q_1 0 14 | THUMSR-22 0 sjyrr2bn_18 0 15 | THUMSR-22 0 dro22gwf_1 2 16 | THUMSR-22 0 fcgcittn_7 0 17 | THUMSR-22 0 8eh90ber_-1 0 18 | THUMSR-22 0 84ib5ol5_0 0 19 | THUMSR-22 0 mtn7ykep_6 0 20 | THUMSR-22 0 2x7l1s75_2 0 21 | THUMSR-22 0 1vm5r7pq_3 0 22 | THUMSR-22 0 i3b647wv_7 0 23 | THUMSR-22 0 oto8tdui_2 0 24 | THUMSR-22 0 qopcs6jy_9 0 25 | THUMSR-22 0 8riyl4h3_3 0 26 | THUMSR-22 0 b518n9dx_1 0 27 | THUMSR-22 0 wyz5jyjh_9 0 28 | THUMSR-22 0 jvxo5v63_2 0 29 | THUMSR-22 0 cetdqgff_13 0 30 | THUMSR-22 0 45dpoepu_12 0 31 | THUMSR-36 0 qdamvwxl_2 2 32 | THUMSR-36 0 
er93bsdj_1 1 33 | THUMSR-36 0 91g3yial_0 1 34 | THUMSR-36 0 ys6s9rps_12 2 35 | THUMSR-36 0 38bqkxn5_0 0 36 | THUMSR-36 0 qla6edp4_3 0 37 | THUMSR-36 0 vsinwqnr_16 2 38 | THUMSR-36 0 vpodtbjk_20 0 39 | THUMSR-36 0 ropgq7tr_11 2 40 | THUMSR-36 0 xuv77kp6_0 2 41 | THUMSR-36 0 tfspedf1_11 2 42 | THUMSR-36 0 2tmu1wzk_0 1 43 | THUMSR-36 0 m5h19hy6_19 1 44 | THUMSR-36 0 qdamvwxl_3 2 45 | THUMSR-36 0 d3owtd98_32 1 46 | THUMSR-36 0 vnnwnxs2_2 1 47 | THUMSR-36 0 clrcu89e_1 1 48 | THUMSR-36 0 epbhdx55_1 0 49 | THUMSR-36 0 ufu9ggrv_1 0 50 | THUMSR-36 0 3rqrq2mg_5 0 51 | THUMSR-36 0 tfspedf1_2 2 52 | THUMSR-36 0 hvmw7g5q_0 2 53 | THUMSR-36 0 5wsj003j_1 2 54 | THUMSR-36 0 tfspedf1_1 2 55 | THUMSR-36 0 jin0fdcm_0 0 56 | THUMSR-36 0 ys6s9rps_14 2 57 | THUMSR-36 0 lyob5wfv_8 1 58 | THUMSR-36 0 d3owtd98_31 1 59 | THUMSR-36 0 kl9huu33_0 2 60 | THUMSR-36 0 iudq5jdu_6 0 61 | THUMSR-36 0 tfspedf1_10 2 62 | THUMSR-36 0 xlrf3dxx_2 0 63 | THUMSR-36 0 vnnwnxs2_0 1 64 | THUMSR-36 0 ll4rxd9p_16 0 65 | THUMSR-36 0 zyecue78_10 1 66 | THUMSR-36 0 5mnj3qr7_9 1 67 | THUMSR-36 0 lsrqko6p_5 0 68 | THUMSR-36 0 tfspedf1_5 2 69 | THUMSR-36 0 ys6s9rps_18 2 70 | THUMSR-36 0 42pjc0lo_5 2 71 | THUMSR-36 0 p8cmm6ty_3 0 72 | -------------------------------------------------------------------------------- /v1/data/queries_toy.jsonl: -------------------------------------------------------------------------------- 1 | {"query_id": "THUMSR-22", "query": "Classification treatment COVID-19"} 2 | {"query_id": "THUMSR-36", "query": "masks Covid-19"} 3 | -------------------------------------------------------------------------------- /v1/docs/distributed training.md: -------------------------------------------------------------------------------- 1 | # Distributed training for BERT pretrained model 2 | 3 | Our BERT model now support distributed training which will significantly increase the speed of training. 
4 | 5 | ## Training 6 | 7 | The code to use distributed training functionality is in the shell file 8 | 9 | ``` 10 | sh train_bert_dist.sh 11 | ``` 12 | 13 | In the shell file, the code is written as 14 | 15 | ``` 16 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ # set visible CUDA GPUs 17 | python -u -m torch.distributed.launch \ #lauch distributed training 18 | --nproc_per_node=4 \ # number equals to how many GPUs used 19 | --master_port=12345 train.py \ 20 | -task ranking \ 21 | -model bert \ 22 | # do not use the single json file 23 | -train queries=/path/to/queries.tsv,docs=/path/to/docs.tsv,qrels=/path/to/qrels.tsv,trec=/path/to/trec.tsv \ 24 | -max_input 1280000 \ 25 | -save ./checkpoints/bert.bin \ 26 | -dev ./data/dev_toy.jsonl \ 27 | -qrels ./data/qrels_toy \ 28 | -vocab bert-base-uncased \ 29 | -pretrain bert-base-uncased \ 30 | -res ./results/bert.trec \ 31 | -metric ndcg_cut_10 \ 32 | -max_query_len 32 \ 33 | -max_doc_len 256 \ 34 | -epoch 1 \ 35 | -batch_size 4 \ 36 | -lr 2e-5 \ 37 | -eval_every 100 \ 38 | -optimizer adamw \ 39 | -dev_eval_batch_size 128 \ 40 | -gradient_accumulation_steps 4 \ 41 | -n_warmup_steps 10000 \ 42 | -logging_step 100 43 | ``` 44 | 45 | ## Results 46 | 47 | |Dataset|Retriever|Reranker|Coor-Ascent|MRR@100 (dev)| 48 | |:-------:|:-------:|:------:|:---------:|:-:| 49 | |MSMARCO-document|ANCE FirstP+BM25 (distributed)|BERT Base FirstP|-|0.432| 50 | |MSMARCO-document|ANCE FirstP+BM25 (single)|BERT Base FirstP|-|0.403| 51 | |MSMARCO-document|ANCE FirstP+BM25 (OpenMatch)|BERT Base FirstP|-|0.407| 52 | |MSMARCO-document|ANCE FirstP|-|-|0.373| 53 | -------------------------------------------------------------------------------- /v1/docs/experiments-adhoc.md: -------------------------------------------------------------------------------- 1 | # Ad-hoc Search 2 | All results is measured on ndcg@20 with 5 fold cross-validation. 
More details are available at [ClueWeb09](http://lemurproject.org/clueweb09/), [ClueWeb12](http://www.lemurproject.org/clueweb12.php/). 3 | 4 | ## Datasets 5 | Data can be downloaded from [Datasets](https://cloud.tsinghua.edu.cn/d/77741ef1c1704866814a/). 6 | 7 | |Datasets|Queries/Anchors|Query/Anchor-Doc Pairs|Released Files| 8 | |:-------|:-------------:|:--------------------:|:-------------| 9 | |**ClueWeb09-B**|200|47.1K|Queries, Q-D Relations, SDM scores| 10 | |**Robust04**|249|311K|Queries, Q-D Relations, SDM scores| 11 | |**ClueWeb12-B13**|100|28.9K|Queries, Q-D Relations, SDM scores| 12 | 13 | As we cannot release the document contents, the document IDs are used instead. 14 | -------------------------------------------------------------------------------- /v1/docs/experiments-classic.md: -------------------------------------------------------------------------------- 1 | # Classic Features 2 | We extract several classic IR features, and train learning-to-rank models, such as RankSVM, Coor-Ascent, on ClueWeb09-B, Robust04 and TREC-COVID datasets with 5 fold cross-validation. All the results can be found in our [paper](https://arxiv.org/abs/2012.14862) of ACL 2021. 3 | 4 | The features consists of Boolean AND; Boolean OR; Coordinate match; Cosine similarity of bag-of-words vectors; TF-IDF; BM25; language models with no smoothing, Dirichlet smoothing, JM smoothing, and two-way smoothing. More details are available at [classic_extractor](../OpenMatch/extractors/classic_extractor.py). -------------------------------------------------------------------------------- /v1/docs/meta-learning-to-rank.md: -------------------------------------------------------------------------------- 1 | # Meta Learning to Rank 2 | 3 | Here provides the guiding code for running meta-learning to reweight technique, which uses target data to reweight training data during the learning to rank process. 
4 | 5 | A detailed introduction to the technology can be found in the paper [**Few-Shot Text Ranking with Meta Adapted Synthetic Weak Supervision**](https://arxiv.org/pdf/2012.14862.pdf). 6 | 7 | 8 | 9 | ## Running 10 | 11 | 12 | The code to run meta-learning is in the shell file 13 | ``` 14 | bash meta_dist_train.sh 15 | ``` 16 | In the shell file, the code is written as 17 | 18 | ``` 19 | export gpu_num=4 ## GPU Number 20 | export master_port=23900 21 | export job_name=MetaBERT 22 | 23 | ## ************************************ 24 | export DATA_DIR= ## please set your dataset path here. 25 | export SAVE_DIR= ## please set your saving path here. 26 | 27 | ## ************************************ 28 | CUDA_VISIBLE_DEVICES=0,1,2,3 OMP_NUM_THREADS=1 python -u -m torch.distributed.launch --nproc_per_node=$gpu_num --master_port $master_port meta_dist_train.py \ 29 | -job_name $job_name \ 30 | -save_folder $SAVE_DIR/results \ 31 | -model bert \ 32 | -task ranking \ 33 | -max_input 12800000 \ 34 | -train queries=$DATA_DIR/queries.train.tsv,docs=$DATA_DIR/collection.tsv,qrels=$DATA_DIR/qrels.train.tsv,trec=$DATA_DIR/trids_bm25_marco-10.tsv \ 35 | -dev queries=$DATA_DIR/queries.dev.small.tsv,docs=$DATA_DIR/collection.tsv,qrels=$DATA_DIR/qrels.dev.small.tsv,trec=$DATA_DIR/run.msmarco-passage.dev.small.100.trec \ 36 | -target trec=$DATA_DIR/devids_bm25_marco.tsv \ 37 | -qrels $DATA_DIR/qrels.dev.small.tsv \ 38 | -vocab bert-base-uncased \ 39 | -pretrain bert-base-uncased \ 40 | -metric mrr_cut_10 \ 41 | -max_query_len 32 \ 42 | -max_doc_len 221 \ 43 | -epoch 3 \ 44 | -train_batch_size 8 \ 45 | -target_batch_size 16 \ 46 | -gradient_accumulation_steps 2 \ 47 | -dev_eval_batch_size 1024 \ 48 | -lr 3e-6 \ 49 | -n_warmup_steps 160000 \ 50 | -logging_step 2000 \ 51 | -eval_every 10000 \ 52 | -eval_during_train \ 53 | ``` 54 | 55 | The tsv format of `-target` data is totally the same with the `-train` data. 
56 | 57 | ``` 58 | query_id \t pos_docid \t neg_docid 59 | ``` 60 | -------------------------------------------------------------------------------- /v1/features/README.md: -------------------------------------------------------------------------------- 1 | # Save Features 2 | Save features of neural ranker and the score of retrieval model. 3 | 4 | # Data Format 5 | ```shell 6 | label id:qid 1:feature1 2:feature2 ... 7 | ``` 8 | -------------------------------------------------------------------------------- /v1/gen_feature.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | python gen_feature.py \ 3 | -task ranking \ 4 | -model cknrm \ 5 | -max_input 1280000 \ 6 | -vocab ./data/glove.6B.300d.txt \ 7 | -checkpoint ./checkpoints/cknrm.bin \ 8 | -dev ./data/dev_toy.jsonl \ 9 | -res ./features/cknrm.trec \ 10 | -max_query_len 10 \ 11 | -max_doc_len 256 \ 12 | -batch_size 32 13 | -------------------------------------------------------------------------------- /v1/gen_feature_bert.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | python gen_feature.py \ 3 | -task classification \ 4 | -model bert \ 5 | -max_input 1280000 \ 6 | -dev ./data/dev_toy.jsonl \ 7 | -vocab bert-base-uncased \ 8 | -pretrain bert-base-uncased \ 9 | -checkpoint ./checkpoints/bert.bin \ 10 | -res ./features/bert_features \ 11 | -max_query_len 32 \ 12 | -max_doc_len 256 \ 13 | -batch_size 32 14 | -------------------------------------------------------------------------------- /v1/inference.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | python inference.py \ 3 | -task ranking \ 4 | -model cknrm \ 5 | -max_input 1280000 \ 6 | -vocab ./data/glove.6B.300d.txt \ 7 | -checkpoint ./checkpoints/cknrm.bin \ 8 | -test ./data/test_toy.jsonl \ 9 | -res ./results/cknrm.trec \ 10 | -max_query_len 10 \ 11 | 
-max_doc_len 256 \ 12 | -batch_size 32 13 | -------------------------------------------------------------------------------- /v1/inference_bert.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | python inference.py \ 3 | -task classification \ 4 | -model bert \ 5 | -max_input 1280000 \ 6 | -test ./data/test_toy.jsonl \ 7 | -vocab bert-base-uncased \ 8 | -pretrain bert-base-uncased \ 9 | -checkpoint ./checkpoints/bert.bin \ 10 | -res ./results/bert.trec \ 11 | -max_query_len 32 \ 12 | -max_doc_len 256 \ 13 | -batch_size 32 14 | -------------------------------------------------------------------------------- /v1/meta_dist_train.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ## ************************************ 3 | ## GPU 4 | export gpu_num=4 ## GPU Number 5 | export master_port=23900 6 | export job_name=MetaBERT 7 | 8 | ## ************************************ 9 | export DATA_DIR= ## please set your dataset path here. 10 | export SAVE_DIR= ## please set your saving path here. 
11 | 12 | ## ************************************ 13 | CUDA_VISIBLE_DEVICES=0,1,2,3 OMP_NUM_THREADS=1 python -u -m torch.distributed.launch --nproc_per_node=$gpu_num --master_port $master_port meta_dist_train.py \ 14 | -job_name $job_name \ 15 | -save_folder $SAVE_DIR/results \ 16 | -model bert \ 17 | -task ranking \ 18 | -max_input 12800000 \ 19 | -train queries=$DATA_DIR/queries.train.tsv,docs=$DATA_DIR/collection.tsv,qrels=$DATA_DIR/qrels.train.tsv,trec=$DATA_DIR/trids_bm25_marco-10.tsv \ 20 | -dev queries=$DATA_DIR/queries.dev.small.tsv,docs=$DATA_DIR/collection.tsv,qrels=$DATA_DIR/qrels.dev.small.tsv,trec=$DATA_DIR/run.msmarco-passage.dev.small.100.trec \ 21 | -target trec=$DATA_DIR/devids_bm25_marco.tsv \ 22 | -qrels $DATA_DIR/qrels.dev.small.tsv \ 23 | -vocab bert-base-uncased \ 24 | -pretrain bert-base-uncased \ 25 | -metric mrr_cut_10 \ 26 | -max_query_len 32 \ 27 | -max_doc_len 221 \ 28 | -epoch 3 \ 29 | -train_batch_size 8 \ 30 | -target_batch_size 16 \ 31 | -gradient_accumulation_steps 2 \ 32 | -dev_eval_batch_size 1024 \ 33 | -lr 3e-6 \ 34 | -n_warmup_steps 160000 \ 35 | -logging_step 2000 \ 36 | -eval_every 10000 \ 37 | -eval_during_train \ 38 | -------------------------------------------------------------------------------- /v1/requirements.txt: -------------------------------------------------------------------------------- 1 | torch==1.4.0 2 | transformers==2.8.0 3 | faiss-cpu==1.6.3 4 | nltk==3.5 5 | pytrec_eval==0.4 6 | -------------------------------------------------------------------------------- /v1/results/README.md: -------------------------------------------------------------------------------- 1 | # Results 2 | All Results. 
3 | -------------------------------------------------------------------------------- /v1/retrievers/ANCE/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /v1/retrievers/ANCE/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /v1/retrievers/ANCE/commands/data_download.sh: -------------------------------------------------------------------------------- 1 | mkdir -p ../data/raw_data/ 2 | cd ../data/raw_data/ 3 | 4 | # download MSMARCO passage data 5 | wget https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz 6 | tar -zxvf collectionandqueries.tar.gz 7 | rm collectionandqueries.tar.gz 8 | 9 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-passagetest2019-top1000.tsv.gz 10 | gunzip msmarco-passagetest2019-top1000.tsv.gz 11 | 12 | wget https://msmarco.blob.core.windows.net/msmarcoranking/top1000.dev.tar.gz 13 | tar -zxvf top1000.dev.tar.gz 14 | rm top1000.dev.tar.gz 15 | 16 | wget https://msmarco.blob.core.windows.net/msmarcoranking/triples.train.small.tar.gz 17 | tar -zxvf triples.train.small.tar.gz 18 | rm triples.train.small.tar.gz 19 | 20 | # download MSMARCO doc data 21 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docs.tsv.gz 22 | gunzip msmarco-docs.tsv.gz 23 | 24 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-doctrain-queries.tsv.gz 25 | gunzip msmarco-doctrain-queries.tsv.gz 26 | 27 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-doctrain-qrels.tsv.gz 28 | gunzip msmarco-doctrain-qrels.tsv.gz 29 | 30 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz 31 | gunzip msmarco-test2019-queries.tsv.gz 32 | 33 | wget https://trec.nist.gov/data/deep/2019qrels-docs.txt 34 | 35 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-doctest2019-top100.gz 36 | gunzip 
msmarco-doctest2019-top100.gz 37 | 38 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docdev-top100.gz 39 | gunzip msmarco-docdev-top100.gz 40 | 41 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docdev-queries.tsv.gz 42 | gunzip msmarco-docdev-queries.tsv.gz 43 | 44 | 45 | # clone DPR repo and download NQ and TriviaQA datasets 46 | cd ../../../ 47 | git clone https://github.com/facebookresearch/DPR 48 | cd DPR 49 | python data/download_data.py --resource data.wikipedia_split.psgs_w100 50 | python data/download_data.py --resource data.retriever.nq 51 | python data/download_data.py --resource data.retriever.trivia 52 | python data/download_data.py --resource data.retriever.qas.nq 53 | python data/download_data.py --resource data.retriever.qas.trivia 54 | python data/download_data.py --resource checkpoint.retriever.multiset.bert-base-encoder -------------------------------------------------------------------------------- /v1/retrievers/ANCE/commands/run_ann_data_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # This script is for generate ann data for a model in training 4 | # 5 | # For the overall design of the ann driver, check run_train.sh 6 | # 7 | # This script continuously generate ann data using latest model from model_dir 8 | # For training, run this script after initial ann data is created from run_train.sh 9 | # Make sure parameter used here is consistent with the training script 10 | 11 | # # Passage ANCE(FirstP) 12 | # gpu_no=4 13 | # seq_length=512 14 | # model_type=rdot_nll 15 | # tokenizer_type="roberta-base" 16 | # base_data_dir="../data/raw_data/" 17 | # preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/" 18 | # job_name="OSPass512" 19 | 20 | 21 | # # Document ANCE(FirstP) 22 | # gpu_no=4 23 | # seq_length=512 24 | # model_type=rdot_nll 25 | # tokenizer_type="roberta-base" 26 | # base_data_dir="../data/raw_data/" 27 
| # preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/" 28 | # job_name="OSDoc512" 29 | 30 | # # Document ANCE(MaxP) 31 | gpu_no=4 32 | seq_length=2048 33 | model_type=rdot_nll_multi_chunk 34 | tokenizer_type="roberta-base" 35 | base_data_dir="../data/raw_data/" 36 | preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/" 37 | job_name="OSDoc2048" 38 | 39 | ##################################### Inital ANN Data generation ################################ 40 | model_dir="${base_data_dir}${job_name}/" 41 | model_ann_data_dir="${model_dir}ann_data/" 42 | pretrained_checkpoint_dir="warmup checkpoint path" 43 | 44 | initial_data_gen_cmd="\ 45 | python -m torch.distributed.launch --nproc_per_node=$gpu_no ../drivers/run_ann_data_gen.py --training_dir $model_dir \ 46 | --init_model_dir $pretrained_checkpoint_dir --model_type $model_type --output_dir $model_ann_data_dir \ 47 | --cache_dir "${model_ann_data_dir}cache/" --data_dir $preprocessed_data_dir --max_seq_length $seq_length \ 48 | --per_gpu_eval_batch_size 16 --topk_training 200 --negative_sample 20 \ 49 | " 50 | 51 | echo $initial_data_gen_cmd 52 | eval $initial_data_gen_cmd 53 | -------------------------------------------------------------------------------- /v1/retrievers/ANCE/commands/run_ann_data_gen_dpr.sh: -------------------------------------------------------------------------------- 1 | # tokenization 2 | wiki_dir="../../../DPR/data/wikipedia_split/" # path for psgs_w100.tsv downloaded with DPR code 3 | ans_dir="../../../DPR/data/retriever/qas/" # path for DPR question&answer csv files 4 | question_dir="../../../DPR/data/retriever/" # path for DPR training data 5 | data_type=0 #0 is nq, 1 is trivia, 2 is both 6 | out_data_dir="../data/QA_NQ_data/" # change this for different data_type 7 | 8 | tokenization_cmd="\ 9 | python ../data/DPR_data.py --wiki_dir $wiki_dir --question_dir $question_dir --data_type $data_type --answer_dir $ans_dir \ 10 | 
--out_data_dir $out_data_dir \ 11 | " 12 | 13 | echo $tokenization_cmd 14 | eval $tokenization_cmd 15 | 16 | 17 | gpu_no=8 18 | 19 | # model type 20 | model_type="dpr" 21 | seq_length=256 22 | 23 | # ann parameters 24 | batch_size=16 25 | ann_topk=200 26 | ann_negative_sample=100 27 | 28 | # input/output directories 29 | base_data_dir="${out_data_dir}" 30 | job_name="ann_NQ_test" 31 | model_dir="${base_data_dir}${job_name}/" 32 | model_ann_data_dir="${model_dir}ann_data/" 33 | pretrained_checkpoint_dir="../../../DPR/checkpoint/retriever/multiset/bert-base-encoder.cp" 34 | passage_path="../../../DPR/data/wikipedia_split/" 35 | test_qa_path="../../../DPR/data/retriever/qas/" 36 | trivia_test_qa_path="../../../DPR/data/retriever/qas/" 37 | 38 | 39 | data_gen_cmd="\ 40 | sudo python -m torch.distributed.launch --nproc_per_node=$gpu_no ../drivers/run_ann_data_gen_dpr.py --training_dir $model_dir \ 41 | --init_model_dir $pretrained_checkpoint_dir --model_type $model_type --output_dir $model_ann_data_dir \ 42 | --cache_dir "${model_ann_data_dir}cache/" --data_dir $base_data_dir --max_seq_length $seq_length \ 43 | --per_gpu_eval_batch_size $batch_size --topk_training $ann_topk --negative_sample $ann_negative_sample \ 44 | --passage_path $passage_path --test_qa_path $test_qa_path --trivia_test_qa_path $trivia_test_qa_path \ 45 | " 46 | 47 | echo $data_gen_cmd 48 | eval $data_gen_cmd -------------------------------------------------------------------------------- /v1/retrievers/ANCE/commands/run_ann_data_gen_lyz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # This script is for generate ann data for a model in training 4 | # 5 | # For the overall design of the ann driver, check run_train.sh 6 | # 7 | # This script continuously generate ann data using latest model from model_dir 8 | # For training, run this script after initial ann data is created from run_train.sh 9 | # Make sure parameter used here is 
consistent with the training script 10 | 11 | # # Passage ANCE(FirstP) 12 | gpu_no=4 13 | seq_length=512 14 | model_type=rdot_nll 15 | tokenizer_type="roberta-base" 16 | base_data_dir="../data/raw_data/" 17 | preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/" 18 | job_name="OSPass512" 19 | 20 | 21 | # # Document ANCE(FirstP) 22 | # gpu_no=4 23 | # seq_length=512 24 | # model_type=rdot_nll 25 | # tokenizer_type="roberta-base" 26 | # base_data_dir="../data/raw_data/" 27 | # preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/" 28 | # job_name="OSDoc512" 29 | 30 | # # Document ANCE(MaxP) 31 | # gpu_no=4 32 | # seq_length=2048 33 | # model_type=rdot_nll_multi_chunk 34 | # tokenizer_type="roberta-base" 35 | # base_data_dir="../data/raw_data/" 36 | # preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/" 37 | # job_name="OSDoc2048" 38 | 39 | ##################################### Inital ANN Data generation ################################ 40 | model_dir="${base_data_dir}${job_name}/" 41 | model_ann_data_dir="${model_dir}ann_data/" 42 | # pretrained_checkpoint_dir="warmup checkpoint path" 43 | pretrained_checkpoint_dir="../data/msmarco_passage_warmup_checkpoints/checkpoint-420000/" 44 | 45 | initial_data_gen_cmd="\ 46 | python -m torch.distributed.launch --nproc_per_node=$gpu_no ../drivers/run_ann_data_gen.py --training_dir $model_dir \ 47 | --init_model_dir $pretrained_checkpoint_dir --model_type $model_type --output_dir $model_ann_data_dir \ 48 | --cache_dir "${model_ann_data_dir}cache/" --data_dir $preprocessed_data_dir --max_seq_length $seq_length \ 49 | --per_gpu_eval_batch_size 16 --topk_training 200 --negative_sample 20 \ 50 | " 51 | 52 | echo $initial_data_gen_cmd 53 | eval $initial_data_gen_cmd 54 | -------------------------------------------------------------------------------- /v1/retrievers/ANCE/commands/run_inference.sh: 
-------------------------------------------------------------------------------- 1 | # # Passage ANCE(FirstP) 2 | gpu_no=4 3 | seq_length=512 4 | model_type=rdot_nll 5 | tokenizer_type="roberta-base" 6 | base_data_dir="../data/raw_data/" 7 | preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}_dev/" 8 | job_name="OSPass512" 9 | pretrained_checkpoint_dir="" 10 | 11 | # # Document ANCE(FirstP) 12 | # gpu_no=4 13 | # seq_length=512 14 | # model_type=rdot_nll 15 | # tokenizer_type="roberta-base" 16 | # base_data_dir="../data/raw_data/" 17 | # preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/" 18 | # job_name="OSDoc512" 19 | # pretrained_checkpoint_dir="" 20 | 21 | # # Document ANCE(MaxP) 22 | # gpu_no=4 23 | # seq_length=2048 24 | # model_type=rdot_nll_multi_chunk 25 | # tokenizer_type="roberta-base" 26 | # base_data_dir="../data/raw_data/" 27 | # preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/" 28 | # job_name="OSDoc2048" 29 | # pretrained_checkpoint_dir="" 30 | 31 | ##################################### Inference ################################ 32 | model_dir="${base_data_dir}${job_name}/" 33 | model_ann_data_dir="${model_dir}ann_data_inf/" 34 | 35 | initial_data_gen_cmd="\ 36 | python -m torch.distributed.launch --nproc_per_node=$gpu_no ../drivers/run_ann_data_gen.py --training_dir $pretrained_checkpoint_dir \ 37 | --init_model_dir $pretrained_checkpoint_dir --model_type $model_type --output_dir $model_ann_data_dir \ 38 | --cache_dir "${model_ann_data_dir}cache/" --data_dir $preprocessed_data_dir --max_seq_length $seq_length \ 39 | --per_gpu_eval_batch_size 16 --topk_training 200 --negative_sample 20 --end_output_num 0 --inference \ 40 | " 41 | 42 | echo $initial_data_gen_cmd 43 | eval $initial_data_gen_cmd 44 | -------------------------------------------------------------------------------- /v1/retrievers/ANCE/commands/run_train_dpr.sh: 
-------------------------------------------------------------------------------- 1 | gpu_no=8 2 | 3 | # model type 4 | model_type="dpr" 5 | seq_length=256 6 | triplet="--triplet --optimizer lamb" # set this to empty for non triplet model 7 | 8 | # hyper parameters 9 | batch_size=16 10 | gradient_accumulation_steps=1 11 | learning_rate=1e-5 12 | warmup_steps=1000 13 | 14 | # input/output directories 15 | base_data_dir="../data/QA_NQ_data/" 16 | job_name="ann_NQ_test" 17 | model_dir="${base_data_dir}${job_name}/" 18 | model_ann_data_dir="${model_dir}ann_data/" 19 | pretrained_checkpoint_dir="../../../DPR/checkpoint/retriever/multiset/bert-base-encoder.cp" 20 | 21 | train_cmd="\ 22 | sudo python -m torch.distributed.launch --nproc_per_node=$gpu_no ../drivers/run_ann_dpr.py --model_type $model_type \ 23 | --model_name_or_path $pretrained_checkpoint_dir --task_name MSMarco $triplet --data_dir $base_data_dir \ 24 | --ann_dir $model_ann_data_dir --max_seq_length $seq_length --per_gpu_train_batch_size=$batch_size \ 25 | --gradient_accumulation_steps $gradient_accumulation_steps --learning_rate $learning_rate --output_dir $model_dir \ 26 | --warmup_steps $warmup_steps --logging_steps 100 --save_steps 1000 --log_dir "~/tensorboard/${DLWS_JOB_ID}/logs/${job_name}" \ 27 | " 28 | 29 | echo $train_cmd 30 | eval $train_cmd 31 | 32 | echo "copy current script to model directory" 33 | sudo cp $0 $model_dir -------------------------------------------------------------------------------- /v1/retrievers/ANCE/commands/run_train_warmup.sh: -------------------------------------------------------------------------------- 1 | # This script is for training the warmup checkpoint for ANCE 2 | data_dir="../data/raw_data/" 3 | output_dir="" 4 | cmd="python3 -m torch.distributed.launch --nproc_per_node=1 ../drivers/run_warmup.py --train_model_type rdot_nll \ 5 | --model_name_or_path roberta-base \ 6 | --task_name MSMarco --do_train --evaluate_during_training --data_dir ${data_dir} 
--max_seq_length 128 --per_gpu_eval_batch_size=256 \ 7 | --per_gpu_train_batch_size=32 --learning_rate 2e-4 --logging_steps 1000 --num_train_epochs 2.0 --output_dir ${output_dir} \ 8 | --warmup_steps 1000 --overwrite_output_dir --save_steps 30000 --gradient_accumulation_steps 1 --expected_train_size 35000000 --logging_steps_per_eval 20 \ 9 | --fp16 --optimizer lamb --log_dir ~/tensorboard/${DLWS_JOB_ID}/logs/OSpass " 10 | 11 | echo $cmd 12 | eval $cmd 13 | -------------------------------------------------------------------------------- /v1/retrievers/ANCE/evaluation/convert_trec.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import pickle 3 | import os 4 | import tqdm 5 | 6 | data_type=1 7 | test_set=0 8 | 9 | processed_data_dir = "../data/raw_data/ann_data_roberta-base_512/" 10 | trec_save_path = glob.glob(f"data-type-{data_type}_test-set-{test_set}_ckpt-*.trec") 11 | 12 | with open(os.path.join(processed_data_dir,'qid2offset.pickle'),'rb') as f: 13 | qid2offset = pickle.load(f) 14 | offset2qid = {} 15 | for k in qid2offset: 16 | offset2qid[qid2offset[k]]=k 17 | 18 | with open(os.path.join(processed_data_dir,'pid2offset.pickle'),'rb') as f: 19 | pid2offset = pickle.load(f) 20 | offset2pid = {} 21 | for k in pid2offset: 22 | offset2pid[pid2offset[k]]=k 23 | 24 | 25 | #for k in offset2qid: 26 | # print(k,offset2qid[k]) 27 | 28 | for path in tqdm.tqdm(trec_save_path): 29 | with open(path) as f: 30 | lines=f.readlines() 31 | with open(path.replace(".trec",".formatted.trec"),"w") as f: 32 | for line in lines: 33 | qid , Q0, pid, rank, score, tag = line.strip().split(' ') 34 | # print(offset2qid[int(qid)] , Q0, pid, rank, score.replace('-',''), tag) 35 | if data_type==0: 36 | f.write(f"{offset2qid[int(qid)]} {Q0} D{offset2pid[int(pid)]} {rank} {score.replace('-','')} {tag}\n") 37 | else: 38 | f.write(f"{offset2qid[int(qid)]} {Q0} {offset2pid[int(pid)]} {rank} {score.replace('-','')} {tag}\n") 39 | # break 40 | 
-------------------------------------------------------------------------------- /v1/retrievers/ANCE/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | with open('README.md') as f: 4 | readme = f.read() 5 | 6 | setup( 7 | name='ANCE', 8 | version='0.1.0', 9 | description='Approximate Nearest Neighbor Negative Contrastive Learning for Dense Text Retrieval', 10 | url='https://github.com/microsoft/ANCE', 11 | classifiers=[ 12 | 'Intended Audience :: Science/Research', 13 | 'License :: OSI Approved :: MIT License', 14 | 'Programming Language :: Python :: 3.6', 15 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 16 | ], 17 | license="MIT", 18 | long_description=readme, 19 | install_requires=[ 20 | 'transformers==2.3.0', 21 | 'pytrec-eval', 22 | 'faiss-cpu', 23 | 'wget' 24 | ], 25 | ) -------------------------------------------------------------------------------- /v1/retrievers/DANCE/ANCE_setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | with open('README.md') as f: 4 | readme = f.read() 5 | 6 | setup( 7 | name='ANCE', 8 | version='0.1.0', 9 | description='Approximate Nearest Neighbor Negative Contrastive Learning for Dense Text Retrieval', 10 | url='https://github.com/microsoft/ANCE', 11 | classifiers=[ 12 | 'Intended Audience :: Science/Research', 13 | 'License :: OSI Approved :: MIT License', 14 | 'Programming Language :: Python :: 3.6', 15 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 16 | ], 17 | license="MIT", 18 | long_description=readme, 19 | install_requires=[ 20 | 'transformers==2.3.0', 21 | 'pytrec-eval', 22 | 'wget' 23 | ], 24 | ) -------------------------------------------------------------------------------- /v1/retrievers/DANCE/commands/data_download.sh: -------------------------------------------------------------------------------- 1 | mkdir ../data/raw_data/ 2 
| cd ../data/raw_data/ || exit 1 3 | # Abort if the data directory is unavailable so that nothing is downloaded into the wrong place. 4 | # # download MSMARCO passage data 5 | # wget https://msmarco.blob.core.windows.net/msmarcoranking/collectionandqueries.tar.gz 6 | # tar -zxvf collectionandqueries.tar.gz 7 | # rm collectionandqueries.tar.gz 8 | 9 | # wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-passagetest2019-top1000.tsv.gz 10 | # gunzip msmarco-passagetest2019-top1000.tsv.gz 11 | 12 | # wget https://msmarco.blob.core.windows.net/msmarcoranking/top1000.dev.tar.gz 13 | # tar -zxvf top1000.dev.tar.gz 14 | # rm top1000.dev.tar.gz 15 | 16 | # wget https://msmarco.blob.core.windows.net/msmarcoranking/triples.train.small.tar.gz 17 | # tar -zxvf triples.train.small.tar.gz 18 | # rm triples.train.small.tar.gz 19 | 20 | # download MSMARCO doc data 21 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docs.tsv.gz 22 | gunzip msmarco-docs.tsv.gz 23 | 24 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-doctrain-queries.tsv.gz 25 | gunzip msmarco-doctrain-queries.tsv.gz 26 | 27 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-doctrain-qrels.tsv.gz 28 | gunzip msmarco-doctrain-qrels.tsv.gz 29 | 30 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-test2019-queries.tsv.gz 31 | gunzip msmarco-test2019-queries.tsv.gz 32 | 33 | wget https://trec.nist.gov/data/deep/2019qrels-docs.txt 34 | 35 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-doctest2019-top100.gz 36 | gunzip msmarco-doctest2019-top100.gz 37 | 38 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docdev-top100.gz 39 | gunzip msmarco-docdev-top100.gz 40 | 41 | wget https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docdev-queries.tsv.gz 42 | gunzip msmarco-docdev-queries.tsv.gz 43 | 44 | 45 | # # clone DPR repo and download NQ and TriviaQA datasets 46 | # cd ../../../ 47 | # git clone https://github.com/facebookresearch/DPR 48 | # cd DPR 49 | # python data/download_data.py --resource 
data.wikipedia_split.psgs_w100 50 | # python data/download_data.py --resource data.retriever.nq 51 | # python data/download_data.py --resource data.retriever.trivia 52 | # python data/download_data.py --resource data.retriever.qas.nq 53 | # python data/download_data.py --resource data.retriever.qas.trivia 54 | # python data/download_data.py --resource checkpoint.retriever.multiset.bert-base-encoder -------------------------------------------------------------------------------- /v1/retrievers/DANCE/commands/run_ann_data_gen.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # This script generates ANN data for a model in training 4 | # 5 | # For the overall design of the ann driver, check run_train.sh 6 | # 7 | # This script continuously generates ANN data using the latest model from model_dir 8 | # For training, run this script after the initial ANN data is created by run_train.sh 9 | # Make sure the parameters used here are consistent with the training script 10 | 11 | # # Passage ANCE(FirstP) 12 | # gpu_no=4 13 | # seq_length=512 14 | # model_type=rdot_nll 15 | # tokenizer_type="roberta-base" 16 | # base_data_dir="../data/raw_data/" 17 | # preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/" 18 | # job_name="OSPass512" 19 | 20 | 21 | # # Document ANCE(FirstP) 22 | # gpu_no=4 23 | # seq_length=512 24 | # model_type=rdot_nll 25 | # tokenizer_type="roberta-base" 26 | # base_data_dir="../data/raw_data/" 27 | # preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/" 28 | # job_name="OSDoc512" 29 | 30 | # # Document ANCE(MaxP) 31 | gpu_no=4 32 | seq_length=2048 33 | model_type=rdot_nll_multi_chunk 34 | tokenizer_type="roberta-base" 35 | base_data_dir="../data/raw_data/" 36 | preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/" 37 | job_name="OSDoc2048" 38 | 39 | ##################################### Initial ANN Data generation 
################################ 40 | model_dir="${base_data_dir}${job_name}/" 41 | model_ann_data_dir="${model_dir}ann_data/" 42 | pretrained_checkpoint_dir="warmup checkpoint path" 43 | # NOTE(review): the value above looks like a placeholder - replace it with the actual warmup checkpoint path before running. 44 | initial_data_gen_cmd="\ 45 | python -m torch.distributed.launch --nproc_per_node=$gpu_no ../drivers/run_ann_data_gen.py --training_dir $model_dir \ 46 | --init_model_dir $pretrained_checkpoint_dir --model_type $model_type --output_dir $model_ann_data_dir \ 47 | --cache_dir "${model_ann_data_dir}cache/" --data_dir $preprocessed_data_dir --max_seq_length $seq_length \ 48 | --per_gpu_eval_batch_size 16 --topk_training 200 --negative_sample 20 \ 49 | " 50 | # Print the assembled command, then execute it. 51 | echo $initial_data_gen_cmd 52 | eval $initial_data_gen_cmd 53 | -------------------------------------------------------------------------------- /v1/retrievers/DANCE/commands/run_ann_data_gen_dpr.sh: -------------------------------------------------------------------------------- 1 | # Tokenization step (runs ../data/DPR_data.py) 2 | wiki_dir="../../../DPR/data/wikipedia_split/" # path for psgs_w100.tsv downloaded with DPR code 3 | ans_dir="../../../DPR/data/retriever/qas/" # path for DPR question&answer csv files 4 | question_dir="../../../DPR/data/retriever/" # path for DPR training data 5 | data_type=0 # 0 is nq, 1 is trivia, 2 is both 6 | out_data_dir="../data/QA_NQ_data/" # change this for different data_type 7 | 8 | tokenization_cmd="\ 9 | python ../data/DPR_data.py --wiki_dir $wiki_dir --question_dir $question_dir --data_type $data_type --answer_dir $ans_dir \ 10 | --out_data_dir $out_data_dir \ 11 | " 12 | 13 | echo $tokenization_cmd 14 | eval $tokenization_cmd 15 | 16 | # Settings for the ANN data generation command assembled further below. 17 | gpu_no=8 18 | 19 | # model type 20 | model_type="dpr" 21 | seq_length=256 22 | 23 | # ann parameters 24 | batch_size=16 25 | ann_topk=200 26 | ann_negative_sample=100 27 | 28 | # input/output directories 29 | base_data_dir="${out_data_dir}" 30 | job_name="ann_NQ_test" 31 | model_dir="${base_data_dir}${job_name}/" 32 | model_ann_data_dir="${model_dir}ann_data/" 33 | 
pretrained_checkpoint_dir="../../../DPR/checkpoint/retriever/multiset/bert-base-encoder.cp" 34 | passage_path="../../../DPR/data/wikipedia_split/" 35 | test_qa_path="../../../DPR/data/retriever/qas/" 36 | trivia_test_qa_path="../../../DPR/data/retriever/qas/" 37 | 38 | 39 | data_gen_cmd="\ 40 | sudo python -m torch.distributed.launch --nproc_per_node=$gpu_no ../drivers/run_ann_data_gen_dpr.py --training_dir $model_dir \ 41 | --init_model_dir $pretrained_checkpoint_dir --model_type $model_type --output_dir $model_ann_data_dir \ 42 | --cache_dir "${model_ann_data_dir}cache/" --data_dir $base_data_dir --max_seq_length $seq_length \ 43 | --per_gpu_eval_batch_size $batch_size --topk_training $ann_topk --negative_sample $ann_negative_sample \ 44 | --passage_path $passage_path --test_qa_path $test_qa_path --trivia_test_qa_path $trivia_test_qa_path \ 45 | " 46 | 47 | echo $data_gen_cmd 48 | eval $data_gen_cmd -------------------------------------------------------------------------------- /v1/retrievers/DANCE/commands/run_inference.sh: -------------------------------------------------------------------------------- 1 | # # Passage ANCE(FirstP) 2 | gpu_no=4 3 | seq_length=512 4 | model_type=rdot_nll 5 | tokenizer_type="roberta-base" 6 | base_data_dir="../data/raw_data/" 7 | preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}_dev/" 8 | job_name="OSPass512" 9 | pretrained_checkpoint_dir="" 10 | # NOTE(review): pretrained_checkpoint_dir is empty above - set it to the checkpoint to load before running. 11 | # # Document ANCE(FirstP) 12 | # gpu_no=4 13 | # seq_length=512 14 | # model_type=rdot_nll 15 | # tokenizer_type="roberta-base" 16 | # base_data_dir="../data/raw_data/" 17 | # preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/" 18 | # job_name="OSDoc512" 19 | # pretrained_checkpoint_dir="" 20 | 21 | # # Document ANCE(MaxP) 22 | # gpu_no=4 23 | # seq_length=2048 24 | # model_type=rdot_nll_multi_chunk 25 | # tokenizer_type="roberta-base" 26 | # base_data_dir="../data/raw_data/" 27 | # 
preprocessed_data_dir="${base_data_dir}ann_data_${tokenizer_type}_${seq_length}/" 28 | # job_name="OSDoc2048" 29 | # pretrained_checkpoint_dir="" 30 | 31 | ##################################### Inference ################################ 32 | model_dir="${base_data_dir}${job_name}/" 33 | model_ann_data_dir="${model_dir}ann_data_inf/" 34 | # Inference reuses the ANN data generation driver with --inference; the checkpoint is passed as both --training_dir and --init_model_dir. 35 | initial_data_gen_cmd="\ 36 | python -m torch.distributed.launch --nproc_per_node=$gpu_no ../drivers/run_ann_data_gen.py --training_dir $pretrained_checkpoint_dir \ 37 | --init_model_dir $pretrained_checkpoint_dir --model_type $model_type --output_dir $model_ann_data_dir \ 38 | --cache_dir "${model_ann_data_dir}cache/" --data_dir $preprocessed_data_dir --max_seq_length $seq_length \ 39 | --per_gpu_eval_batch_size 16 --topk_training 200 --negative_sample 20 --end_output_num 0 --inference \ 40 | " 41 | # Print the assembled command, then execute it. 42 | echo $initial_data_gen_cmd 43 | eval $initial_data_gen_cmd 44 | -------------------------------------------------------------------------------- /v1/retrievers/DANCE/commands/run_train_dpr.sh: -------------------------------------------------------------------------------- 1 | gpu_no=8 2 | 3 | # model type 4 | model_type="dpr" 5 | seq_length=256 6 | triplet="--triplet --optimizer lamb" # set this to an empty string for a non-triplet model 7 | 8 | # hyper parameters 9 | batch_size=16 10 | gradient_accumulation_steps=1 11 | learning_rate=1e-5 12 | warmup_steps=1000 13 | 14 | # input/output directories 15 | base_data_dir="../data/QA_NQ_data/" 16 | job_name="ann_NQ_test" 17 | model_dir="${base_data_dir}${job_name}/" 18 | model_ann_data_dir="${model_dir}ann_data/" 19 | pretrained_checkpoint_dir="../../../DPR/checkpoint/retriever/multiset/bert-base-encoder.cp" 20 | # Assemble the distributed training command as a single string; it is echoed and then run with eval. 21 | train_cmd="\ 22 | sudo python -m torch.distributed.launch --nproc_per_node=$gpu_no ../drivers/run_ann_dpr.py --model_type $model_type \ 23 | --model_name_or_path $pretrained_checkpoint_dir --task_name MSMarco $triplet --data_dir $base_data_dir \ 24 | --ann_dir 
$model_ann_data_dir --max_seq_length $seq_length --per_gpu_train_batch_size=$batch_size \ 25 | --gradient_accumulation_steps $gradient_accumulation_steps --learning_rate $learning_rate --output_dir $model_dir \ 26 | --warmup_steps $warmup_steps --logging_steps 100 --save_steps 1000 --log_dir "~/tensorboard/${DLWS_JOB_ID}/logs/${job_name}" \ 27 | " 28 | 29 | echo $train_cmd 30 | eval $train_cmd 31 | 32 | echo "copy current script to model directory" 33 | sudo cp $0 $model_dir -------------------------------------------------------------------------------- /v1/retrievers/DANCE/commands/run_train_warmup.sh: -------------------------------------------------------------------------------- 1 | # This script is for training the warmup checkpoint for ANCE 2 | data_dir="../data/raw_data/" 3 | output_dir="" 4 | cmd="python3 -m torch.distributed.launch --nproc_per_node=1 ../drivers/run_warmup.py --train_model_type rdot_nll \ 5 | --model_name_or_path roberta-base \ 6 | --task_name MSMarco --do_train --evaluate_during_training --data_dir ${data_dir} --max_seq_length 128 --per_gpu_eval_batch_size=256 \ 7 | --per_gpu_train_batch_size=32 --learning_rate 2e-4 --logging_steps 1000 --num_train_epochs 2.0 --output_dir ${output_dir} \ 8 | --warmup_steps 1000 --overwrite_output_dir --save_steps 30000 --gradient_accumulation_steps 1 --expected_train_size 35000000 --logging_steps_per_eval 20 \ 9 | --fp16 --optimizer lamb --log_dir ~/tensorboard/${DLWS_JOB_ID}/logs/OSpass " 10 | 11 | echo $cmd 12 | eval $cmd 13 | -------------------------------------------------------------------------------- /v1/retrievers/README.md: -------------------------------------------------------------------------------- 1 | # Document Retrieval 2 | The BM25 retriever follows [anserini](https://github.com/castorini/anserini), and the ANN retriever follows [ANCE](https://github.com/microsoft/ANCE). 
3 | 4 | ## BM25 Guide 5 | ### MS MARCO Doc Ranking Examples 6 | First, get the [msmarco-docs.tsv](https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docs.tsv.gz) and [msmarco-docdev-queries.tsv](https://msmarco.blob.core.windows.net/msmarcoranking/msmarco-docdev-queries.tsv.gz) files. Then convert *msmarco-docs.tsv* to JSONL format, writing one JSON object of the form *{"id": doc_id, "contents": doc}* per line, and save the result to *collections/msmarco/msmarco-docs.jsonl*. 7 | 8 | Then build the BM25 index and run the search: 9 | ``` 10 | ./bm25_retriever/bin/IndexCollection -collection JsonCollection -input ./collections/msmarco -index index-msmarco -generator LuceneDocumentGenerator -threads 8 -storePositions -storeDocvectors -storeRawDocs 11 | ./bm25_retriever/bin/SearchCollection -index index-msmarco -topicreader TsvString -topics msmarco-docdev-queries.tsv -bm25 -output msmarco-doc.txt 12 | ``` 13 | 14 | ## ANCE Guide 15 | The guide to ANCE training and inference is available at [ance](./openmatch_ance_retriver_readme.md). 16 | -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/HdrHistogram-2.1.9.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/HdrHistogram-2.1.9.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/aggs-matrix-stats-client-7.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/aggs-matrix-stats-client-7.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/annotations-java5-19.0.0.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/annotations-java5-19.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/anserini-0.7.3-SNAPSHOT.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/anserini-0.7.3-SNAPSHOT.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/anserini-fastutil-6.5.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/anserini-fastutil-6.5.6.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/ant-1.9.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/ant-1.9.1.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/ant-launcher-1.9.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/ant-launcher-1.9.1.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/args4j-2.32.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/args4j-2.32.jar 
-------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/cbor-0.7.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/cbor-0.7.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/commons-codec-1.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/commons-codec-1.11.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/commons-compress-1.18.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/commons-compress-1.18.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/commons-io-2.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/commons-io-2.5.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/commons-lang3-3.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/commons-lang3-3.5.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/commons-logging-1.1.3.jar: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/commons-logging-1.1.3.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/commons-math3-3.6.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/commons-math3-3.6.1.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/commons-pool2-2.6.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/commons-pool2-2.6.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/compiler-0.9.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/compiler-0.9.3.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/elasticsearch-7.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/elasticsearch-7.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/elasticsearch-cli-7.0.0.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/elasticsearch-cli-7.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/elasticsearch-core-7.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/elasticsearch-core-7.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/elasticsearch-geo-7.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/elasticsearch-geo-7.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/elasticsearch-rest-client-7.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/elasticsearch-rest-client-7.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/elasticsearch-rest-high-level-client-7.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/elasticsearch-rest-high-level-client-7.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/elasticsearch-secure-sm-7.0.0.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/elasticsearch-secure-sm-7.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/elasticsearch-x-content-7.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/elasticsearch-x-content-7.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/guava-18.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/guava-18.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/hppc-0.7.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/hppc-0.7.1.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/http2-client-9.4.19.v20190610.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/http2-client-9.4.19.v20190610.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/http2-common-9.4.19.v20190610.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/http2-common-9.4.19.v20190610.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/http2-hpack-9.4.19.v20190610.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/http2-hpack-9.4.19.v20190610.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/http2-http-client-transport-9.4.19.v20190610.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/http2-http-client-transport-9.4.19.v20190610.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/httpasyncclient-4.1.4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/httpasyncclient-4.1.4.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/httpclient-4.5.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/httpclient-4.5.6.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/httpcore-4.4.10.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/httpcore-4.4.10.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/httpcore-nio-4.4.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/httpcore-nio-4.4.11.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/httpmime-4.5.6.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/httpmime-4.5.6.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/jackson-annotations-2.10.0.pr1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jackson-annotations-2.10.0.pr1.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/jackson-core-2.10.0.pr1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jackson-core-2.10.0.pr1.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/jackson-databind-2.10.0.pr1.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jackson-databind-2.10.0.pr1.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/jackson-dataformat-cbor-2.8.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jackson-dataformat-cbor-2.8.11.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/jackson-dataformat-smile-2.8.11.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jackson-dataformat-smile-2.8.11.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/jackson-dataformat-yaml-2.10.0.pr1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jackson-dataformat-yaml-2.10.0.pr1.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/jackson-datatype-jdk8-2.10.0.pr1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jackson-datatype-jdk8-2.10.0.pr1.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/jcl-over-slf4j-1.7.24.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jcl-over-slf4j-1.7.24.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/jetty-alpn-client-9.4.19.v20190610.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jetty-alpn-client-9.4.19.v20190610.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/jetty-alpn-java-client-9.4.19.v20190610.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jetty-alpn-java-client-9.4.19.v20190610.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/jetty-client-9.4.19.v20190610.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jetty-client-9.4.19.v20190610.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/jetty-http-9.4.19.v20190610.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jetty-http-9.4.19.v20190610.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/jetty-io-9.4.19.v20190610.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jetty-io-9.4.19.v20190610.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/jetty-util-9.4.19.v20190610.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jetty-util-9.4.19.v20190610.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/jna-4.5.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jna-4.5.1.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/joda-time-2.10.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/joda-time-2.10.1.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/jopt-simple-5.0.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jopt-simple-5.0.2.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/jsoup-1.8.3.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jsoup-1.8.3.jar 
-------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/jsr305-2.0.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/jsr305-2.0.1.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/lang-mustache-client-7.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lang-mustache-client-7.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/log4j-api-2.12.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/log4j-api-2.12.1.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/log4j-core-2.12.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/log4j-core-2.12.1.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/lucene-analyzers-common-8.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-analyzers-common-8.0.0.jar -------------------------------------------------------------------------------- 
/v1/retrievers/bm25_retriever/repo/lucene-backward-codecs-8.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-backward-codecs-8.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/lucene-core-8.3.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-core-8.3.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/lucene-grouping-8.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-grouping-8.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/lucene-highlighter-8.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-highlighter-8.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/lucene-join-8.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-join-8.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/lucene-memory-8.0.0.jar: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-memory-8.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/lucene-misc-8.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-misc-8.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/lucene-queries-8.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-queries-8.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/lucene-queryparser-8.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-queryparser-8.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/lucene-sandbox-8.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-sandbox-8.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/lucene-spatial-8.0.0.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-spatial-8.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/lucene-spatial-extras-8.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-spatial-extras-8.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/lucene-spatial3d-8.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-spatial3d-8.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/lucene-suggest-8.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/lucene-suggest-8.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/mockito-all-1.10.19.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/mockito-all-1.10.19.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/netty-buffer-4.1.29.Final.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/netty-buffer-4.1.29.Final.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/netty-codec-4.1.29.Final.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/netty-codec-4.1.29.Final.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/netty-common-4.1.29.Final.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/netty-common-4.1.29.Final.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/netty-handler-4.1.29.Final.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/netty-handler-4.1.29.Final.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/netty-resolver-4.1.29.Final.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/netty-resolver-4.1.29.Final.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/netty-transport-4.1.29.Final.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/netty-transport-4.1.29.Final.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/netty-transport-native-epoll-4.1.29.Final.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/netty-transport-native-epoll-4.1.29.Final.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/netty-transport-native-unix-common-4.1.29.Final.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/netty-transport-native-unix-common-4.1.29.Final.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/parent-join-client-7.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/parent-join-client-7.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/rank-eval-client-7.0.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/rank-eval-client-7.0.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/sesame-model-4.1.2.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/sesame-model-4.1.2.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/sesame-rio-api-4.1.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/sesame-rio-api-4.1.2.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/sesame-rio-datatypes-4.1.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/sesame-rio-datatypes-4.1.2.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/sesame-rio-languages-4.1.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/sesame-rio-languages-4.1.2.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/sesame-rio-ntriples-4.1.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/sesame-rio-ntriples-4.1.2.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/sesame-util-4.1.2.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/sesame-util-4.1.2.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/slf4j-api-1.7.24.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/slf4j-api-1.7.24.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/slf4j-simple-1.7.29.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/slf4j-simple-1.7.29.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/snakeyaml-1.24.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/snakeyaml-1.24.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/solr-solrj-8.3.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/solr-solrj-8.3.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/stax2-api-3.1.4.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/stax2-api-3.1.4.jar 
-------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/t-digest-3.2.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/t-digest-3.2.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/trec-car-tools-java-13.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/trec-car-tools-java-13.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/twitter-text-2.0.10.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/twitter-text-2.0.10.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/wdtk-datamodel-0.10.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/wdtk-datamodel-0.10.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/wdtk-dumpfiles-0.10.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/wdtk-dumpfiles-0.10.0.jar -------------------------------------------------------------------------------- 
/v1/retrievers/bm25_retriever/repo/wdtk-storage-0.10.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/wdtk-storage-0.10.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/wdtk-util-0.10.0.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/wdtk-util-0.10.0.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/wikiclean-1.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/wikiclean-1.1.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/woodstox-core-asl-4.4.1.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/woodstox-core-asl-4.4.1.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/xz-1.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/xz-1.5.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/zookeeper-3.5.5.jar: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/zookeeper-3.5.5.jar -------------------------------------------------------------------------------- /v1/retrievers/bm25_retriever/repo/zookeeper-jute-3.5.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/OpenMatch/fbf198ad5353e35b8b67ea7539783db61b32d1f2/v1/retrievers/bm25_retriever/repo/zookeeper-jute-3.5.5.jar -------------------------------------------------------------------------------- /v1/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | install_requires = [ 4 | 'torch==1.4.0', 5 | 'transformers==2.8.0', 6 | 'faiss-cpu==1.6.3', 7 | 'nltk==3.5', 8 | 'pytrec_eval==0.4' 9 | ] 10 | 11 | setup( 12 | name="OpenMatch", 13 | version="0.0.1", 14 | author="OpenMatch Authors", 15 | author_email='zkt18{at}mails.tsinghua.edu.cn', 16 | description="An Open Source Package for Information Retrieval", 17 | packages=find_packages(), 18 | install_requires=install_requires, 19 | python_requires='>=3.6' 20 | ) 21 | -------------------------------------------------------------------------------- /v1/train.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | python train.py \ 3 | -task ranking \ 4 | -model cknrm \ 5 | -train ./data/train_toy.jsonl \ 6 | -max_input 1280000 \ 7 | -save ./checkpoints/cknrm.bin \ 8 | -dev ./data/dev_toy.jsonl \ 9 | -qrels ./data/qrels_toy \ 10 | -vocab ./data/glove.6B.300d.txt \ 11 | -res ./results/cknrm.trec \ 12 | -metric ndcg_cut_10 \ 13 | -n_kernels 21 \ 14 | -max_query_len 10 \ 15 | -max_doc_len 150 \ 16 | -epoch 2 \ 17 | -batch_size 32 \ 18 | -lr 1e-3 \ 19 | -eval_every 10 20 | -------------------------------------------------------------------------------- /v1/train_bert.sh: 
-------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=0 \ 2 | python train.py \ 3 | -task classification \ 4 | -model bert \ 5 | -train ./data/train_clas_toy.jsonl \ 6 | -max_input 1280000 \ 7 | -save ./checkpoints/bert.bin \ 8 | -dev ./data/dev_toy.jsonl \ 9 | -qrels ./data/qrels_toy \ 10 | -vocab bert-base-uncased \ 11 | -pretrain bert-base-uncased \ 12 | -res ./results/bert.trec \ 13 | -metric ndcg_cut_10 \ 14 | -max_query_len 32 \ 15 | -max_doc_len 256 \ 16 | -epoch 1 \ 17 | -batch_size 4 \ 18 | -lr 2e-5 \ 19 | -eval_every 10 20 | -------------------------------------------------------------------------------- /v1/train_bert_dist.sh: -------------------------------------------------------------------------------- 1 | # Launch distributed training of the BERT ranking model via torch.distributed.launch. 2 | # - CUDA_VISIBLE_DEVICES sets the visible CUDA GPUs. 3 | # - --nproc_per_node must equal the number of GPUs used. 4 | # - -train takes comma-separated key=path pairs here; do not use the single json file. 5 | # All comments are kept above the command: a comment placed after a trailing backslash, 6 | # or on its own line in the middle of the command, breaks the line continuation. 7 | CUDA_VISIBLE_DEVICES=0,1,2,3 \ 8 | python -u -m torch.distributed.launch \ 9 | --nproc_per_node=4 \ 10 | --master_port=12345 train.py \ 11 | -task ranking \ 12 | -model bert \ 13 | -train queries=/path/to/queries.tsv,docs=/path/to/docs.tsv,qrels=/path/to/qrels.tsv,trec=/path/to/trec.tsv \ 14 | -max_input 1280000 \ 15 | -save ./checkpoints/bert.bin \ 16 | -dev ./data/dev_toy.jsonl \ 17 | -qrels ./data/qrels_toy \ 18 | -vocab bert-base-uncased \ 19 | -pretrain bert-base-uncased \ 20 | -res ./results/bert.trec \ 21 | -metric ndcg_cut_10 \ 22 | -max_query_len 32 \ 23 | -max_doc_len 256 \ 24 | -epoch 1 \ 25 | -batch_size 4 \ 26 | -lr 2e-5 \ 27 | -eval_every 100 \ 28 | -optimizer adamw \ 29 | -dev_eval_batch_size 128 \ 30 | -gradient_accumulation_steps 4 \ 31 | -n_warmup_steps 10000 \ 32 | -logging_step 100 33 | --------------------------------------------------------------------------------