├── .gitignore ├── LICENSE ├── README.md ├── ddro_env.yml ├── pyserini.yml ├── requirements.txt └── src ├── arc_images ├── DDRO.drawio.png ├── dpo_loss.png ├── dpo_objective.png ├── loss_ntp.png └── objective_ntp.png ├── data ├── data_prep │ ├── README.md │ ├── bm25_negative_sampling_msmarco.py │ ├── build_t5_data │ │ ├── gen_eval_data_pipline.py │ │ ├── gen_train_data_pipline.py │ │ ├── generate_eval_instances.py │ │ └── generate_train_instances.py │ ├── convert_tsv_to_json_array.py │ ├── generate_doc_embeddings.py │ ├── generate_encoded_docids.py │ ├── generate_msmarco_triples.py │ ├── generate_pseudo_queries.py │ ├── negative_sampling.py │ ├── nq │ │ ├── bm25_negative_Sampling_NQ.py │ │ ├── convert_json_array_to_jsonl.ipynb │ │ ├── convert_nq_to_msmarco_format.py │ │ └── process_nq_dataset.py │ ├── pq_docid_demo.ipynb │ ├── rq_docid_demo.ipynb │ ├── sample_top300k_msmarco_documents.py │ └── url_title_docid_demo.ipynb ├── data_scripts │ ├── csv_builder.py │ └── json_builder.py └── download │ ├── README.md │ ├── download_msmarco_datasets.sh │ ├── download_nq_datasets.sh │ └── download_t5_model.py ├── pretrain ├── README.md ├── T5ForPretrain.py ├── __init__.py ├── eval_ddro_docid_ranking.py ├── finetune_docTTTTTquery_NQ.py ├── hf_eval │ ├── eval_hf_docid_ranking.py │ ├── launch_hf_eval_from_config.py │ └── slurm_submit_hf_eval.sh ├── launch_ddro_eval_from_config.py ├── train_ddro_encoder_decoder.py └── train_ddro_encoder_decoder_nq.py ├── scripts ├── bm25 │ ├── run_bm25_retrieval_msmarco.sh │ └── run_bm25_retrieval_nq.sh ├── configs │ ├── config.json │ └── config_nq.json ├── ddro │ ├── slurm_submit_dddro_training_NQ.sh │ ├── slurm_submit_ddro_eval.sh │ └── slurm_submit_ddro_training.sh ├── preprocess │ ├── convert_nq_to_msmarco_format.sh │ ├── create_nq_triples.sh │ ├── finetune_docTTTTTTquery.sh │ ├── generate_3stage_train_data.sh │ ├── generate_doc_embeddings.sh │ ├── generate_encoded_ids.sh │ ├── generate_eval_data.sh │ ├── generate_msmarco_triples.sh │ ├── preprocess_nq_dataset.sh │ ├── pseudo_queries_generator.sh │ └── sample_top_docs.sh └── sft │ └── launch_SFT_training.sh └── utils ├── __init__.py ├── compare_models.py ├── custom_datasets.py ├── custom_trainers.py ├── evaluate.py ├── generate_demo.py ├── pretrain_dataset.py ├── run_eval.py ├── run_t5_trainer.py ├── run_training_pipeline.py ├── trie.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/README.md -------------------------------------------------------------------------------- /ddro_env.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/ddro_env.yml -------------------------------------------------------------------------------- /pyserini.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/pyserini.yml -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/requirements.txt -------------------------------------------------------------------------------- /src/arc_images/DDRO.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/arc_images/DDRO.drawio.png -------------------------------------------------------------------------------- /src/arc_images/dpo_loss.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/arc_images/dpo_loss.png -------------------------------------------------------------------------------- /src/arc_images/dpo_objective.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/arc_images/dpo_objective.png -------------------------------------------------------------------------------- /src/arc_images/loss_ntp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/arc_images/loss_ntp.png -------------------------------------------------------------------------------- /src/arc_images/objective_ntp.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/arc_images/objective_ntp.png -------------------------------------------------------------------------------- /src/data/data_prep/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/data_prep/README.md -------------------------------------------------------------------------------- /src/data/data_prep/bm25_negative_sampling_msmarco.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/data_prep/bm25_negative_sampling_msmarco.py -------------------------------------------------------------------------------- /src/data/data_prep/build_t5_data/gen_eval_data_pipline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/data_prep/build_t5_data/gen_eval_data_pipline.py -------------------------------------------------------------------------------- /src/data/data_prep/build_t5_data/gen_train_data_pipline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/data_prep/build_t5_data/gen_train_data_pipline.py -------------------------------------------------------------------------------- /src/data/data_prep/build_t5_data/generate_eval_instances.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/data_prep/build_t5_data/generate_eval_instances.py -------------------------------------------------------------------------------- /src/data/data_prep/build_t5_data/generate_train_instances.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/data_prep/build_t5_data/generate_train_instances.py -------------------------------------------------------------------------------- /src/data/data_prep/convert_tsv_to_json_array.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/data_prep/convert_tsv_to_json_array.py -------------------------------------------------------------------------------- /src/data/data_prep/generate_doc_embeddings.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/data_prep/generate_doc_embeddings.py -------------------------------------------------------------------------------- /src/data/data_prep/generate_encoded_docids.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/data_prep/generate_encoded_docids.py -------------------------------------------------------------------------------- /src/data/data_prep/generate_msmarco_triples.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/data_prep/generate_msmarco_triples.py -------------------------------------------------------------------------------- /src/data/data_prep/generate_pseudo_queries.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/data_prep/generate_pseudo_queries.py -------------------------------------------------------------------------------- /src/data/data_prep/negative_sampling.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/data_prep/negative_sampling.py -------------------------------------------------------------------------------- /src/data/data_prep/nq/bm25_negative_Sampling_NQ.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/data_prep/nq/bm25_negative_Sampling_NQ.py -------------------------------------------------------------------------------- /src/data/data_prep/nq/convert_json_array_to_jsonl.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/data_prep/nq/convert_json_array_to_jsonl.ipynb -------------------------------------------------------------------------------- /src/data/data_prep/nq/convert_nq_to_msmarco_format.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/data_prep/nq/convert_nq_to_msmarco_format.py -------------------------------------------------------------------------------- /src/data/data_prep/nq/process_nq_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/data_prep/nq/process_nq_dataset.py -------------------------------------------------------------------------------- /src/data/data_prep/pq_docid_demo.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/data_prep/pq_docid_demo.ipynb -------------------------------------------------------------------------------- /src/data/data_prep/rq_docid_demo.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/data_prep/rq_docid_demo.ipynb -------------------------------------------------------------------------------- /src/data/data_prep/sample_top300k_msmarco_documents.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/data_prep/sample_top300k_msmarco_documents.py -------------------------------------------------------------------------------- /src/data/data_prep/url_title_docid_demo.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/data_prep/url_title_docid_demo.ipynb -------------------------------------------------------------------------------- /src/data/data_scripts/csv_builder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/data_scripts/csv_builder.py -------------------------------------------------------------------------------- /src/data/data_scripts/json_builder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/data_scripts/json_builder.py -------------------------------------------------------------------------------- /src/data/download/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/download/README.md -------------------------------------------------------------------------------- /src/data/download/download_msmarco_datasets.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/download/download_msmarco_datasets.sh -------------------------------------------------------------------------------- /src/data/download/download_nq_datasets.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/download/download_nq_datasets.sh -------------------------------------------------------------------------------- /src/data/download/download_t5_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/data/download/download_t5_model.py -------------------------------------------------------------------------------- /src/pretrain/README.md: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/pretrain/T5ForPretrain.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/pretrain/T5ForPretrain.py -------------------------------------------------------------------------------- /src/pretrain/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/pretrain/eval_ddro_docid_ranking.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/pretrain/eval_ddro_docid_ranking.py -------------------------------------------------------------------------------- /src/pretrain/finetune_docTTTTTquery_NQ.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/pretrain/finetune_docTTTTTquery_NQ.py -------------------------------------------------------------------------------- /src/pretrain/hf_eval/eval_hf_docid_ranking.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/pretrain/hf_eval/eval_hf_docid_ranking.py -------------------------------------------------------------------------------- /src/pretrain/hf_eval/launch_hf_eval_from_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/pretrain/hf_eval/launch_hf_eval_from_config.py -------------------------------------------------------------------------------- /src/pretrain/hf_eval/slurm_submit_hf_eval.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/pretrain/hf_eval/slurm_submit_hf_eval.sh -------------------------------------------------------------------------------- /src/pretrain/launch_ddro_eval_from_config.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/pretrain/launch_ddro_eval_from_config.py -------------------------------------------------------------------------------- /src/pretrain/train_ddro_encoder_decoder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/pretrain/train_ddro_encoder_decoder.py -------------------------------------------------------------------------------- /src/pretrain/train_ddro_encoder_decoder_nq.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/pretrain/train_ddro_encoder_decoder_nq.py -------------------------------------------------------------------------------- /src/scripts/bm25/run_bm25_retrieval_msmarco.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/scripts/bm25/run_bm25_retrieval_msmarco.sh -------------------------------------------------------------------------------- /src/scripts/bm25/run_bm25_retrieval_nq.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/scripts/bm25/run_bm25_retrieval_nq.sh -------------------------------------------------------------------------------- /src/scripts/configs/config.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/scripts/configs/config.json -------------------------------------------------------------------------------- /src/scripts/configs/config_nq.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/scripts/configs/config_nq.json -------------------------------------------------------------------------------- /src/scripts/ddro/slurm_submit_dddro_training_NQ.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/scripts/ddro/slurm_submit_dddro_training_NQ.sh -------------------------------------------------------------------------------- /src/scripts/ddro/slurm_submit_ddro_eval.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/scripts/ddro/slurm_submit_ddro_eval.sh -------------------------------------------------------------------------------- /src/scripts/ddro/slurm_submit_ddro_training.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/scripts/ddro/slurm_submit_ddro_training.sh -------------------------------------------------------------------------------- /src/scripts/preprocess/convert_nq_to_msmarco_format.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/scripts/preprocess/convert_nq_to_msmarco_format.sh -------------------------------------------------------------------------------- /src/scripts/preprocess/create_nq_triples.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/scripts/preprocess/create_nq_triples.sh -------------------------------------------------------------------------------- /src/scripts/preprocess/finetune_docTTTTTTquery.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/scripts/preprocess/finetune_docTTTTTTquery.sh -------------------------------------------------------------------------------- /src/scripts/preprocess/generate_3stage_train_data.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/scripts/preprocess/generate_3stage_train_data.sh -------------------------------------------------------------------------------- /src/scripts/preprocess/generate_doc_embeddings.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/scripts/preprocess/generate_doc_embeddings.sh -------------------------------------------------------------------------------- /src/scripts/preprocess/generate_encoded_ids.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/scripts/preprocess/generate_encoded_ids.sh -------------------------------------------------------------------------------- /src/scripts/preprocess/generate_eval_data.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/scripts/preprocess/generate_eval_data.sh -------------------------------------------------------------------------------- /src/scripts/preprocess/generate_msmarco_triples.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/scripts/preprocess/generate_msmarco_triples.sh -------------------------------------------------------------------------------- /src/scripts/preprocess/preprocess_nq_dataset.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/scripts/preprocess/preprocess_nq_dataset.sh -------------------------------------------------------------------------------- /src/scripts/preprocess/pseudo_queries_generator.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/scripts/preprocess/pseudo_queries_generator.sh -------------------------------------------------------------------------------- /src/scripts/preprocess/sample_top_docs.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/scripts/preprocess/sample_top_docs.sh -------------------------------------------------------------------------------- /src/scripts/sft/launch_SFT_training.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/scripts/sft/launch_SFT_training.sh -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/utils/compare_models.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/utils/compare_models.py -------------------------------------------------------------------------------- /src/utils/custom_datasets.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/utils/custom_datasets.py -------------------------------------------------------------------------------- /src/utils/custom_trainers.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/utils/custom_trainers.py -------------------------------------------------------------------------------- /src/utils/evaluate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/utils/evaluate.py -------------------------------------------------------------------------------- /src/utils/generate_demo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/utils/generate_demo.py -------------------------------------------------------------------------------- /src/utils/pretrain_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/utils/pretrain_dataset.py -------------------------------------------------------------------------------- /src/utils/run_eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/utils/run_eval.py -------------------------------------------------------------------------------- /src/utils/run_t5_trainer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/utils/run_t5_trainer.py -------------------------------------------------------------------------------- /src/utils/run_training_pipeline.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/utils/run_training_pipeline.py -------------------------------------------------------------------------------- /src/utils/trie.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/utils/trie.py -------------------------------------------------------------------------------- /src/utils/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kidist-amde/ddro/HEAD/src/utils/utils.py --------------------------------------------------------------------------------