├── .github └── workflows │ └── documentation.yml ├── .gitignore ├── FlagEmbedding ├── __init__.py ├── abc │ ├── __init__.py │ ├── evaluation │ │ ├── __init__.py │ │ ├── arguments.py │ │ ├── data_loader.py │ │ ├── evaluator.py │ │ ├── runner.py │ │ ├── searcher.py │ │ └── utils.py │ ├── finetune │ │ ├── __init__.py │ │ ├── embedder │ │ │ ├── AbsArguments.py │ │ │ ├── AbsDataset.py │ │ │ ├── AbsModeling.py │ │ │ ├── AbsRunner.py │ │ │ ├── AbsTrainer.py │ │ │ └── __init__.py │ │ └── reranker │ │ │ ├── AbsArguments.py │ │ │ ├── AbsDataset.py │ │ │ ├── AbsModeling.py │ │ │ ├── AbsRunner.py │ │ │ ├── AbsTrainer.py │ │ │ └── __init__.py │ └── inference │ │ ├── AbsEmbedder.py │ │ ├── AbsReranker.py │ │ └── __init__.py ├── evaluation │ ├── __init__.py │ ├── air_bench │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── arguments.py │ │ ├── examples │ │ │ ├── long-doc │ │ │ │ ├── arxiv-gemini.jsonl │ │ │ │ ├── arxiv-gpt3.jsonl │ │ │ │ ├── arxiv-llama2.jsonl │ │ │ │ ├── arxiv-llm-survey.jsonl │ │ │ │ ├── book-a-brief-history-of-time_stephen-hawking.jsonl │ │ │ │ ├── book-origin-of-species_darwin.jsonl │ │ │ │ ├── healthcare-pubmed_100k-200k_1.jsonl │ │ │ │ ├── healthcare-pubmed_100k-200k_2.jsonl │ │ │ │ ├── healthcare-pubmed_100k-200k_3.jsonl │ │ │ │ ├── healthcare-pubmed_30k-40k_10-merged.jsonl │ │ │ │ ├── healthcare-pubmed_40k-50k_5-merged.jsonl │ │ │ │ ├── law-lex_files_300k-400k.jsonl │ │ │ │ ├── law-lex_files_400k-500k.jsonl │ │ │ │ ├── law-lex_files_500k-600k.jsonl │ │ │ │ └── law-lex_files_600k-700k.jsonl │ │ │ └── qa │ │ │ │ ├── arxiv.jsonl │ │ │ │ ├── finance.jsonl │ │ │ │ ├── healthcare.jsonl │ │ │ │ ├── law.jsonl │ │ │ │ ├── msmarco.jsonl │ │ │ │ ├── news.jsonl │ │ │ │ ├── web.jsonl │ │ │ │ └── wiki.jsonl │ │ └── runner.py │ ├── beir │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── arguments.py │ │ ├── data_loader.py │ │ ├── evaluator.py │ │ ├── prompts.py │ │ └── runner.py │ ├── custom │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── data_loader.py │ │ └── runner.py │ ├── miracl │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── data_loader.py │ │ └── runner.py │ ├── mkqa │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── data_loader.py │ │ ├── evaluator.py │ │ ├── runner.py │ │ └── utils │ │ │ ├── compute_metrics.py │ │ │ └── normalize_text.py │ ├── mldr │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── data_loader.py │ │ └── runner.py │ ├── msmarco │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── data_loader.py │ │ └── runner.py │ └── mteb │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── arguments.py │ │ ├── examples │ │ ├── AmazonCounterfactualClassification.csv │ │ ├── AmazonPolarityClassification.csv │ │ ├── AmazonReviewsClassification.csv │ │ ├── ArguAna.csv │ │ ├── ArxivClusteringP2P.csv │ │ ├── ArxivClusteringS2S.csv │ │ ├── AskUbuntuDupQuestions.csv │ │ ├── BIOSSES.csv │ │ ├── Banking77Classification.csv │ │ ├── BiorxivClusteringP2P.csv │ │ ├── BiorxivClusteringS2S.csv │ │ ├── CQADupstack.csv │ │ ├── CQADupstackRetrieval.csv │ │ ├── ClimateFEVER.csv │ │ ├── DBPedia.csv │ │ ├── EmotionClassification.csv │ │ ├── FEVER.csv │ │ ├── FiQA2018.csv │ │ ├── HotpotQA.csv │ │ ├── ImdbClassification.csv │ │ ├── MSMARCO.csv │ │ ├── MTOPDomainClassification.csv │ │ ├── MTOPIntentClassification.csv │ │ ├── MassiveIntentClassification.csv │ │ ├── MassiveScenarioClassification.csv │ │ ├── MedrxivClusteringP2P.csv │ │ ├── MedrxivClusteringS2S.csv │ │ ├── MindSmallReranking.csv │ │ ├── NFCorpus.csv │ │ ├── NQ.csv │ │ ├── QuoraRetrieval.csv │ │ ├── RedditClustering.csv │ │ ├── RedditClusteringP2P.csv │ │ ├── 
SCIDOCS.csv │ │ ├── SICK-R.csv │ │ ├── STS12.csv │ │ ├── STS13.csv │ │ ├── STS14.csv │ │ ├── STS15.csv │ │ ├── STS16.csv │ │ ├── STS17.csv │ │ ├── STS22.csv │ │ ├── STSBenchmark.csv │ │ ├── SciDocsRR.csv │ │ ├── SciFact.csv │ │ ├── SprintDuplicateQuestions.csv │ │ ├── StackExchangeClustering.csv │ │ ├── StackExchangeClusteringP2P.csv │ │ ├── StackOverflowDupQuestions.csv │ │ ├── SummEval.csv │ │ ├── TRECCOVID.csv │ │ ├── Touche2020.csv │ │ ├── ToxicConversationsClassification.csv │ │ ├── TweetSentimentExtractionClassification.csv │ │ ├── TwentyNewsgroupsClustering.csv │ │ ├── TwitterSemEval2015.csv │ │ └── TwitterURLCorpus.csv │ │ ├── prompts.py │ │ ├── runner.py │ │ └── searcher.py ├── finetune │ ├── __init__.py │ ├── embedder │ │ ├── __init__.py │ │ ├── decoder_only │ │ │ ├── __init__.py │ │ │ ├── base │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── arguments.py │ │ │ │ ├── load_model.py │ │ │ │ ├── modeling.py │ │ │ │ ├── runner.py │ │ │ │ └── trainer.py │ │ │ └── icl │ │ │ │ ├── __init__.py │ │ │ │ ├── __main__.py │ │ │ │ ├── arguments.py │ │ │ │ ├── dataset.py │ │ │ │ ├── load_model.py │ │ │ │ ├── modeling.py │ │ │ │ ├── runner.py │ │ │ │ └── trainer.py │ │ └── encoder_only │ │ │ ├── __init__.py │ │ │ ├── base │ │ │ ├── __init__.py │ │ │ ├── __main__.py │ │ │ ├── modeling.py │ │ │ ├── runner.py │ │ │ └── trainer.py │ │ │ └── m3 │ │ │ ├── __init__.py │ │ │ ├── __main__.py │ │ │ ├── arguments.py │ │ │ ├── modeling.py │ │ │ ├── runner.py │ │ │ └── trainer.py │ └── reranker │ │ ├── __init__.py │ │ ├── decoder_only │ │ ├── __init__.py │ │ ├── base │ │ │ ├── __init__.py │ │ │ ├── __main__.py │ │ │ ├── arguments.py │ │ │ ├── load_model.py │ │ │ ├── modeling.py │ │ │ ├── runner.py │ │ │ └── trainer.py │ │ └── layerwise │ │ │ ├── __init__.py │ │ │ ├── __main__.py │ │ │ ├── arguments.py │ │ │ ├── configuration_minicpm_reranker.py │ │ │ ├── load_model.py │ │ │ ├── modeling.py │ │ │ ├── modeling_minicpm_reranker.py │ │ │ ├── runner.py │ │ │ └── trainer.py │ │ └── encoder_only │ │ ├── __init__.py │ │ └── base │ │ ├── __init__.py │ │ ├── __main__.py │ │ ├── modeling.py │ │ ├── runner.py │ │ └── trainer.py └── inference │ ├── __init__.py │ ├── auto_embedder.py │ ├── auto_reranker.py │ ├── embedder │ ├── __init__.py │ ├── decoder_only │ │ ├── __init__.py │ │ ├── base.py │ │ └── icl.py │ ├── encoder_only │ │ ├── __init__.py │ │ ├── base.py │ │ └── m3.py │ └── model_mapping.py │ └── reranker │ ├── __init__.py │ ├── decoder_only │ ├── __init__.py │ ├── base.py │ ├── layerwise.py │ ├── lightweight.py │ └── models │ │ ├── __init__.py │ │ ├── configuration_minicpm_reranker.py │ │ ├── gemma_config.py │ │ ├── gemma_model.py │ │ └── modeling_minicpm_reranker.py │ ├── encoder_only │ ├── __init__.py │ └── base.py │ └── model_mapping.py ├── LICENSE ├── Manifest.in ├── README.md ├── README_zh.md ├── Tutorials ├── 1_Embedding │ ├── 1.1_Intro&Inference.ipynb │ ├── 1.2.1_BGE_Series.ipynb │ ├── 1.2.2_Auto_Embedder.ipynb │ ├── 1.2.3_BGE_v1&1.5.ipynb │ ├── 1.2.4_BGE-M3.ipynb │ ├── 1.2.5_BGE_EN_ICL.ipynb │ └── 1.2.6_BGE_VL.ipynb ├── 2_Metrics │ ├── 2.1_Similarity_Metrics.ipynb │ └── 2.2_Eval_Metrics.ipynb ├── 3_Indexing │ ├── 3.1.1_Intro_to_Faiss.ipynb │ ├── 3.1.2_Faiss_GPU.ipynb │ ├── 3.1.3_Faiss_Indexes.ipynb │ ├── 3.1.4_Faiss_Quantizers.ipynb │ └── 3.1.5_Faiss_Index_Choosing.ipynb ├── 4_Evaluation │ ├── 4.1.1_Evaluation_MSMARCO.ipynb │ ├── 4.2.1_MTEB_Intro.ipynb │ ├── 4.2.2_MTEB_Leaderboard.ipynb │ ├── 4.2.3_C-MTEB.ipynb │ ├── 4.3.1_Sentence_Transformers_Eval.ipynb │ ├── 4.4.1_BEIR.ipynb │ ├── 
4.5.1_MIRACL.ipynb │ ├── 4.5.2_MLDR.ipynb │ └── utils │ │ ├── compute_metrics.py │ │ └── normalize_text.py ├── 5_Reranking │ ├── 5.1_Intro.ipynb │ ├── 5.2_BGE_Reranker.ipynb │ └── 5.3_Reranker_Eval.ipynb ├── 6_RAG │ ├── 6.1_RAG_From_Scratch.ipynb │ ├── 6.2_RAG_LangChain.ipynb │ └── 6.3_RAG_LlamaIndex.ipynb ├── 7_Fine-tuning │ ├── 7.1.1_Data_preparation.ipynb │ ├── 7.1.2_Fine-tune.ipynb │ ├── 7.1.3_Eval_FT_Model.ipynb │ ├── 7.2.1_Hard_Negative_Mining.ipynb │ └── config │ │ ├── ds_stage0.json │ │ └── ds_stage1.json ├── README.md ├── quick_start.ipynb └── tutorial_map.png ├── dataset └── README.md ├── docs ├── Makefile ├── README.md ├── make.bat ├── requirements.txt └── source │ ├── API │ ├── abc.rst │ ├── abc │ │ ├── evaluation.rst │ │ ├── evaluation │ │ │ ├── arguments.rst │ │ │ ├── data_loader.rst │ │ │ ├── evaluator.rst │ │ │ ├── runner.rst │ │ │ └── searcher.rst │ │ ├── finetune.rst │ │ ├── finetune │ │ │ ├── embedder.rst │ │ │ ├── embedder │ │ │ │ ├── AbsArguments.rst │ │ │ │ ├── AbsDataset.rst │ │ │ │ ├── AbsModeling.rst │ │ │ │ ├── AbsRunner.rst │ │ │ │ └── AbsTrainer.rst │ │ │ ├── reranker.rst │ │ │ └── reranker │ │ │ │ ├── AbsArguments.rst │ │ │ │ ├── AbsDataset.rst │ │ │ │ ├── AbsModeling.rst │ │ │ │ ├── AbsRunner.rst │ │ │ │ └── AbsTrainer.rst │ │ ├── inference.rst │ │ └── inference │ │ │ ├── AbsEmbedder.rst │ │ │ └── AbsReranker.rst │ ├── evaluation.rst │ ├── evaluation │ │ ├── airbench.rst │ │ ├── airbench │ │ │ ├── arguments.rst │ │ │ └── runner.rst │ │ ├── beir.rst │ │ ├── beir │ │ │ ├── arguments.rst │ │ │ ├── data_loader.rst │ │ │ ├── evaluator.rst │ │ │ └── runner.rst │ │ ├── miracl.rst │ │ ├── miracl │ │ │ ├── data_loader.rst │ │ │ └── runner.rst │ │ ├── mkqa.rst │ │ ├── mkqa │ │ │ ├── data_loader.rst │ │ │ ├── evaluator.rst │ │ │ └── runner.rst │ │ ├── mldr.rst │ │ ├── mldr │ │ │ ├── data_loader.rst │ │ │ └── runner.rst │ │ ├── msmarco.rst │ │ ├── msmarco │ │ │ ├── data_loader.rst │ │ │ └── runner.rst │ │ ├── mteb.rst │ │ └── mteb │ │ │ ├── arguments.rst │ │ │ ├── runner.rst │ │ │ └── searcher.rst │ ├── finetune.rst │ ├── finetune │ │ ├── embedder.rst │ │ ├── embedder │ │ │ ├── decoder_only.rst │ │ │ ├── decoder_only │ │ │ │ ├── base.rst │ │ │ │ ├── base │ │ │ │ │ ├── arguments.rst │ │ │ │ │ ├── modeling.rst │ │ │ │ │ ├── runner.rst │ │ │ │ │ └── trainer.rst │ │ │ │ ├── icl.rst │ │ │ │ └── icl │ │ │ │ │ ├── arguments.rst │ │ │ │ │ ├── dataset.rst │ │ │ │ │ ├── modeling.rst │ │ │ │ │ ├── runner.rst │ │ │ │ │ └── trainer.rst │ │ │ ├── encoder_only.rst │ │ │ └── encoder_only │ │ │ │ ├── base.rst │ │ │ │ ├── base │ │ │ │ ├── modeling.rst │ │ │ │ ├── runner.rst │ │ │ │ └── trainer.rst │ │ │ │ ├── m3.rst │ │ │ │ └── m3 │ │ │ │ ├── arguments.rst │ │ │ │ ├── modeling.rst │ │ │ │ ├── runner.rst │ │ │ │ └── trainer.rst │ │ ├── reranker.rst │ │ └── reranker │ │ │ ├── decoder_only.rst │ │ │ ├── decoder_only │ │ │ ├── base.rst │ │ │ ├── base │ │ │ │ ├── arguments.rst │ │ │ │ ├── modeling.rst │ │ │ │ ├── runner.rst │ │ │ │ └── trainer.rst │ │ │ ├── layerwise.rst │ │ │ └── layerwise │ │ │ │ ├── arguments.rst │ │ │ │ ├── modeling.rst │ │ │ │ ├── runner.rst │ │ │ │ └── trainer.rst │ │ │ ├── encoder_only.rst │ │ │ └── encoder_only │ │ │ ├── base.rst │ │ │ └── base │ │ │ ├── modeling.rst │ │ │ ├── runner.rst │ │ │ └── trainer.rst │ ├── index.rst │ ├── inference.rst │ └── inference │ │ ├── FlagAutoModel.rst │ │ ├── FlagAutoReranker.rst │ │ ├── embedder │ │ ├── decoder_only │ │ │ ├── BaseLLMEmbedder.rst │ │ │ └── ICLLLMEmbedder.rst │ │ ├── embedder.rst │ │ └── encoder_only │ │ │ ├── 
BaseEmbedder.rst │ │ │ └── M3Embedder.rst │ │ └── reranker │ │ ├── decoder_only │ │ ├── BaseLLMReranker.rst │ │ ├── LayerWiseLLMReranker.rst │ │ └── LightweightLLMReranker.rst │ │ ├── encoder_only │ │ └── BaseReranker.rst │ │ └── reranker.rst │ ├── C-MTEB.rst │ ├── FAQ │ └── index.rst │ ├── Introduction │ ├── IR.rst │ ├── embedder.rst │ ├── index.rst │ ├── installation.rst │ ├── overview.rst │ ├── quick_start.rst │ ├── reranker.rst │ ├── retrieval_demo.ipynb │ └── similarity.rst │ ├── _static │ ├── css │ │ └── custom.css │ └── img │ │ ├── BAAI_logo.png │ │ ├── BGE_WeChat_Group.png │ │ ├── C_MTEB.png │ │ ├── RAG_pipeline.png │ │ ├── bge_logo.jpeg │ │ ├── bge_panda.jpg │ │ ├── projects.png │ │ └── word2vec.png │ ├── bge │ ├── bge_icl.rst │ ├── bge_m3.rst │ ├── bge_reranker.rst │ ├── bge_reranker_v2.rst │ ├── bge_v1_v1.5.rst │ ├── bge_vl.rst │ └── index.rst │ ├── community │ └── index.rst │ ├── conf.py │ ├── index.rst │ └── tutorial │ ├── 1_Embedding.rst │ ├── 1_Embedding │ ├── 1.1.1.ipynb │ ├── 1.2.1.ipynb │ ├── 1.2.2.ipynb │ ├── 1.2.3.ipynb │ ├── 1.2.4.ipynb │ └── 1.2.5.ipynb │ ├── 2_Metrics.rst │ ├── 2_Metrics │ ├── 2.1.ipynb │ └── 2.2.ipynb │ ├── 3_Indexing.rst │ ├── 3_Indexing │ ├── 3.1.1.ipynb │ ├── 3.1.2.ipynb │ ├── 3.1.3.ipynb │ ├── 3.1.4.ipynb │ └── 3.1.5.ipynb │ ├── 4_Evaluation.rst │ ├── 4_Evaluation │ ├── 4.1.1.ipynb │ ├── 4.2.1.ipynb │ ├── 4.2.2.ipynb │ ├── 4.2.3.ipynb │ ├── 4.3.1.ipynb │ ├── 4.4.1.ipynb │ ├── 4.5.1.ipynb │ └── 4.5.2.ipynb │ ├── 5_Reranking.rst │ ├── 5_Reranking │ ├── 5.1.ipynb │ ├── 5.2.ipynb │ └── 5.3.ipynb │ ├── 6_RAG.rst │ ├── 6_RAG │ ├── 6.1.ipynb │ ├── 6.2.ipynb │ └── 6.3.ipynb │ ├── 7_Finetuning.rst │ ├── 7_Finetuning │ ├── 7.1.1.ipynb │ ├── 7.1.2.ipynb │ ├── 7.1.3.ipynb │ └── 7.2.1.ipynb │ └── index.rst ├── examples ├── README.md ├── evaluation │ ├── README.md │ ├── air_bench │ │ └── eval_air_bench.sh │ ├── beir │ │ └── eval_beir.sh │ ├── miracl │ │ └── eval_miracl.sh │ ├── mkqa │ │ └── eval_mkqa.sh │ ├── mldr │ │ └── eval_mldr.sh │ ├── msmarco │ │ └── eval_msmarco.sh │ └── mteb │ │ └── eval_mteb.sh ├── finetune │ ├── ds_stage0.json │ ├── ds_stage1.json │ ├── embedder │ │ ├── README.md │ │ ├── decoder_only │ │ │ ├── base.sh │ │ │ ├── base_same_dataset.sh │ │ │ └── icl_same_dataset.sh │ │ ├── encoder_only │ │ │ ├── base.sh │ │ │ ├── base_same_dataset.sh │ │ │ ├── m3.sh │ │ │ └── m3_same_dataset.sh │ │ └── example_data │ │ │ ├── classification-no_in_batch_neg │ │ │ ├── AmazonClassification.jsonl │ │ │ └── Banking77Classification.jsonl │ │ │ ├── clustering-no_in_batch_neg │ │ │ ├── arXiv_title.jsonl │ │ │ └── bioRXiv_title.jsonl │ │ │ ├── retrieval │ │ │ ├── msmarco.jsonl │ │ │ ├── nli.jsonl │ │ │ └── nq.jsonl │ │ │ └── sts │ │ │ └── sts.jsonl │ └── reranker │ │ ├── README.md │ │ ├── decoder_only │ │ ├── base.sh │ │ └── layerwise.sh │ │ ├── encoder_only │ │ └── base.sh │ │ └── example_data │ │ ├── normal │ │ └── examples.jsonl │ │ └── prompt_based │ │ └── examples.jsonl └── inference │ ├── embedder │ ├── README.md │ ├── decoder_only │ │ ├── auto_base_multi_devices.py │ │ ├── auto_base_single_device.py │ │ ├── auto_icl_multi_devices.py │ │ ├── auto_icl_single_device.py │ │ ├── base_multi_devices.py │ │ ├── base_single_device.py │ │ ├── icl_multi_devices.py │ │ └── icl_single_device.py │ └── encoder_only │ │ ├── auto_base_multi_devices.py │ │ ├── auto_base_single_device.py │ │ ├── auto_m3_multi_devices.py │ │ ├── auto_m3_single_device.py │ │ ├── base_multi_devices.py │ │ ├── base_single_device.py │ │ ├── m3_multi_devices.py │ │ ├── 
m3_multi_devices_compute_score.py │ │ ├── m3_single_device.py │ │ └── m3_single_device_compute_score.py │ └── reranker │ ├── README.md │ ├── decoder_only │ ├── auto_base_multi_devices.py │ ├── auto_base_single_device.py │ ├── auto_layerwise_multi_devices.py │ ├── auto_layerwise_single_device.py │ ├── auto_lightweight_multi_devices.py │ ├── auto_lightweight_single_device.py │ ├── base_multi_devices.py │ ├── base_single_device.py │ ├── layerwise_multi_devices.py │ ├── layerwise_single_device.py │ ├── lightweight_multi_devices.py │ └── lightweight_single_device.py │ └── encoder_only │ ├── auto_base_multi_devices.py │ ├── auto_base_single_device.py │ ├── base_multi_devices.py │ └── base_single_device.py ├── imgs ├── BGE_WeChat_Group.png ├── FlagOpen.png ├── bge_logo.jpg ├── cir_candi_1.png ├── cir_candi_2.png ├── cir_query.png └── projects.png ├── research ├── BGE_Coder │ ├── README.md │ ├── data_generation │ │ ├── constant.py │ │ ├── corpus_generator.py │ │ ├── format_generated_examples.py │ │ ├── llm.py │ │ ├── run_generation.py │ │ ├── search.py │ │ ├── triplet_generator.py │ │ └── utils.py │ ├── evaluation │ │ ├── coderag_eval │ │ │ ├── eval.sh │ │ │ ├── prepare_data.sh │ │ │ └── test │ │ │ │ ├── arguments.py │ │ │ │ ├── create │ │ │ │ ├── code_search_net.py │ │ │ │ ├── ds1000.py │ │ │ │ ├── general_programming.py │ │ │ │ ├── humaneval.py │ │ │ │ ├── live_code_bench.py │ │ │ │ ├── mbpp.py │ │ │ │ ├── odex.py │ │ │ │ ├── repoeval.py │ │ │ │ ├── repoeval_repo.py │ │ │ │ ├── swebench.py │ │ │ │ ├── swebench_repo.py │ │ │ │ └── utils.py │ │ │ │ ├── main.py │ │ │ │ └── prompts.py │ │ └── coir_eval │ │ │ ├── arguments.py │ │ │ ├── eval.sh │ │ │ ├── main.py │ │ │ └── prompts.py │ └── paper │ │ └── CodeR.pdf ├── BGE_M3 │ ├── BGE_M3.pdf │ ├── README.md │ ├── __init__.py │ ├── arguments.py │ ├── data.py │ ├── imgs │ │ ├── bm25.jpg │ │ ├── long.jpg │ │ ├── miracl.jpg │ │ ├── mkqa.jpg │ │ ├── nqa.jpg │ │ └── others.webp │ ├── modeling.py │ ├── run.py │ ├── split_data_by_length.py │ └── trainer.py ├── BGE_VL │ ├── LICENSE │ ├── README.md │ ├── assets │ │ ├── cir_candi_1.png │ │ ├── cir_candi_2.png │ │ ├── cir_query.png │ │ ├── corpus │ │ │ ├── 000000032077.jpg │ │ │ ├── 000000050549.jpg │ │ │ ├── 000000098911.jpg │ │ │ ├── 000000156031.jpg │ │ │ ├── 000000244097.jpg │ │ │ ├── 000000272130.jpg │ │ │ ├── 000000275230.jpg │ │ │ ├── 000000311907.jpg │ │ │ ├── 000000357304.jpg │ │ │ ├── 000000478916.jpg │ │ │ └── 000000545037.jpg │ │ ├── query │ │ │ └── 000000530944.jpg │ │ ├── res-ft-mmeb.png │ │ ├── res-scaling.png │ │ ├── res-zs-cir.png │ │ └── res-zs-mmeb.png │ ├── eval │ │ ├── data │ │ │ ├── circo_corpus.jsonl │ │ │ ├── circo_query.jsonl │ │ │ ├── fashioniq_dress_corpus.jsonl │ │ │ ├── fashioniq_dress_query_val.jsonl │ │ │ ├── fashioniq_shirt_corpus.jsonl │ │ │ ├── fashioniq_shirt_query_val.jsonl │ │ │ ├── fashioniq_toptee_corpus.jsonl │ │ │ └── fashioniq_toptee_query_val.jsonl │ │ ├── eval_Circo.py │ │ ├── eval_fashioniq.py │ │ ├── flag_dataset.py │ │ ├── flag_mmret.py │ │ └── results │ │ │ ├── mmret_base_circo.json │ │ │ └── mmret_large_circo.json │ ├── modeling_MMRet_CLIP.py │ └── retrieval_demo.ipynb ├── BGE_VL_Screenshot │ ├── README.md │ └── assets │ │ ├── neg_1.jpeg │ │ ├── neg_2.jpeg │ │ ├── positive_1.jpeg │ │ ├── positive_2.jpeg │ │ ├── query_1.png │ │ └── query_2.png ├── C_MTEB │ ├── C_MTEB │ │ ├── __init__.py │ │ └── tasks │ │ │ ├── Classification.py │ │ │ ├── Clustering.py │ │ │ ├── MultiLongDocRetrieval.py │ │ │ ├── PairClassification.py │ │ │ ├── Reranking.py │ │ │ ├── Retrieval.py │ │ │ 
├── STS.py │ │ │ └── __init__.py │ ├── MKQA │ │ ├── README.md │ │ ├── dense_retrieval │ │ │ ├── step0-generate_embedding.py │ │ │ ├── step1-search_results.py │ │ │ └── step2-eval_dense_mkqa.py │ │ ├── hybrid_retrieval │ │ │ ├── step0-hybrid_search_results.py │ │ │ └── step1-eval_hybrid_mkqa.py │ │ ├── multi_vector_rerank │ │ │ ├── hybrid_all_results.py │ │ │ ├── step0-rerank_results.py │ │ │ └── step1-eval_rerank_mkqa.py │ │ ├── sparse_retrieval │ │ │ ├── bm25_baseline.py │ │ │ ├── bm25_baseline_same_tokenizer.py │ │ │ ├── step0-encode_query-and-corpus.py │ │ │ ├── step1-search_results.py │ │ │ └── step2-eval_sparse_mkqa.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── evaluation.py │ │ │ └── normalize_text.py │ ├── MLDR │ │ ├── README.md │ │ ├── dense_retrieval │ │ │ ├── step0-generate_embedding.py │ │ │ ├── step1-search_results.py │ │ │ └── step2-eval_dense_mldr.py │ │ ├── hybrid_retrieval │ │ │ ├── step0-hybrid_search_results.py │ │ │ └── step1-eval_hybrid_mldr.py │ │ ├── mteb_dense_eval │ │ │ ├── eval_MLDR.py │ │ │ └── flag_dres_model.py │ │ ├── multi_vector_rerank │ │ │ ├── hybrid_all_results.py │ │ │ ├── step0-rerank_results.py │ │ │ └── step1-eval_rerank_mldr.py │ │ └── sparse_retrieval │ │ │ ├── bm25_baseline.py │ │ │ ├── bm25_baseline_same_tokenizer.py │ │ │ ├── step0-encode_query-and-corpus.py │ │ │ ├── step1-search_results.py │ │ │ └── step2-eval_sparse_mldr.py │ ├── README.md │ ├── eval_C-MTEB.py │ ├── eval_MTEB.py │ ├── eval_cross_encoder.py │ ├── flag_dres_model.py │ ├── setup.py │ └── summarize_results.py ├── LLARA │ ├── README.md │ ├── data │ │ ├── finetune │ │ │ └── toy_finetune_data.jsonl │ │ └── pretrain │ │ │ └── toy_pretrain_data.jsonl │ ├── finetune │ │ ├── __init__.py │ │ ├── arguments.py │ │ ├── data.py │ │ ├── load_model.py │ │ ├── modeling.py │ │ ├── run.py │ │ └── trainer.py │ ├── pretrain │ │ ├── __init__.py │ │ ├── arguments.py │ │ ├── data.py │ │ ├── load_model.py │ │ ├── modeling.py │ │ ├── run.py │ │ └── trainer.py │ └── stage1.json ├── LM_Cocktail │ ├── LM_Cocktail │ │ ├── __init__.py │ │ ├── cocktail.py │ │ └── utils.py │ ├── README.md │ ├── embedder_examples.json │ ├── images │ │ ├── 1.png │ │ └── pic.png │ ├── llm_examples.json │ └── setup.py ├── Long_LLM │ ├── activation_beacon │ │ ├── README.md │ │ ├── data │ │ │ ├── config │ │ │ │ ├── code.json │ │ │ │ ├── even.json │ │ │ │ ├── fsdp-offload.yaml │ │ │ │ ├── fsdp.yaml │ │ │ │ ├── slimpajama.json │ │ │ │ ├── zero3-infer-offload.yaml │ │ │ │ └── zero3-infer.yaml │ │ │ ├── deepspeed │ │ │ │ ├── stage2-offload.json │ │ │ │ ├── stage2.json │ │ │ │ ├── stage3-offload-optim.json │ │ │ │ ├── stage3-offload.json │ │ │ │ └── stage3.json │ │ │ └── toy │ │ │ │ └── infbench.json │ │ ├── examples │ │ │ ├── evaluation.md │ │ │ └── training.md │ │ ├── main │ │ │ ├── eval_generation.py │ │ │ ├── eval_infbench.py │ │ │ ├── eval_lm.py │ │ │ ├── eval_longbench.py │ │ │ ├── eval_mmlu.py │ │ │ ├── eval_msc.py │ │ │ ├── eval_multiturn.py │ │ │ ├── eval_needle.py │ │ │ ├── eval_passkey.py │ │ │ ├── eval_topic.py │ │ │ ├── infbench_utils.py │ │ │ ├── longbench_utils.py │ │ │ ├── pretrain_data.py │ │ │ ├── train.py │ │ │ └── vllm_symlink.py │ │ └── src │ │ │ ├── __init__.py │ │ │ ├── args.py │ │ │ ├── chat.py │ │ │ ├── data.py │ │ │ ├── llama │ │ │ ├── __init__.py │ │ │ ├── configuration_llama.py │ │ │ └── modeling_llama.py │ │ │ ├── metrics.py │ │ │ ├── mistral │ │ │ ├── __init__.py │ │ │ ├── configuration_mistral.py │ │ │ └── modeling_mistral.py │ │ │ ├── modeling_beacon.py │ │ │ ├── modeling_utils.py │ │ │ ├── qwen2 │ │ │ 
├── __init__.py │ │ │ ├── configuration_qwen2.py │ │ │ └── modeling_qwen2.py │ │ │ ├── trainer.py │ │ │ ├── utils.py │ │ │ └── vllm_utils.py │ └── longllm_qlora │ │ ├── README.md │ │ ├── data │ │ └── narrativeqa.json │ │ ├── data_pipeline │ │ ├── README.md │ │ ├── _openai.py │ │ ├── data │ │ │ └── README.md │ │ ├── prepare_bio_book.ipynb │ │ ├── prepare_multi_details_book.ipynb │ │ ├── prepare_multi_details_paper_long.ipynb │ │ ├── prepare_one_detail_book.ipynb │ │ ├── prepare_one_detail_paper_long.ipynb │ │ └── raw_data │ │ │ └── README.md │ │ ├── imgs │ │ └── needle.png │ │ ├── main │ │ ├── eval_generation.py │ │ ├── eval_infbench.py │ │ ├── eval_lm.py │ │ ├── eval_longbench.py │ │ ├── eval_mmlu.py │ │ ├── eval_needle.py │ │ ├── eval_passkey.py │ │ ├── eval_topic.py │ │ ├── infbench_utils.py │ │ ├── longbench_utils.py │ │ └── train.py │ │ └── src │ │ ├── __init__.py │ │ ├── args.py │ │ ├── chat.py │ │ ├── data.py │ │ ├── metrics.py │ │ ├── modeling_utils.py │ │ ├── trainer.py │ │ └── utils.py ├── MLVU │ ├── README.md │ ├── data │ │ ├── 1_plotQA.json │ │ ├── 2_needle.json │ │ ├── 3_ego.json │ │ ├── 4_count.json │ │ ├── 5_order.json │ │ ├── 6_anomaly_reco.json │ │ ├── 7_topic_reasoning.json │ │ ├── 8_sub_scene.json │ │ └── 9_summary.json │ ├── evaluation │ │ ├── README.md │ │ ├── generation_evaluation │ │ │ ├── calculate.py │ │ │ ├── calculate_sum.py │ │ │ ├── evaluate_ssc.py │ │ │ ├── evaluate_summary.py │ │ │ └── open_bench.py │ │ ├── models │ │ │ ├── videochat2 │ │ │ │ ├── choice_bench.py │ │ │ │ └── open_bench.py │ │ │ └── videollava │ │ │ │ ├── choice_bench.py │ │ │ │ └── open_bench.py │ │ └── multiple_choice_evaluation │ │ │ └── choice_bench.py │ └── figs │ │ ├── statistic.png │ │ └── task_example.png ├── Matroyshka_reranker │ ├── README.md │ ├── finetune │ │ ├── compensation │ │ │ ├── __init__.py │ │ │ ├── arguments.py │ │ │ ├── data.py │ │ │ ├── load_model.py │ │ │ ├── mistral_config.py │ │ │ ├── mistral_model.py │ │ │ ├── modeling.py │ │ │ ├── run.py │ │ │ ├── stage1.json │ │ │ └── trainer.py │ │ └── self_distillation │ │ │ ├── __init__.py │ │ │ ├── arguments.py │ │ │ ├── data.py │ │ │ ├── load_model.py │ │ │ ├── mistral_config.py │ │ │ ├── mistral_model.py │ │ │ ├── modeling.py │ │ │ ├── run.py │ │ │ ├── stage1.json │ │ │ └── trainer.py │ ├── inference │ │ ├── __init__.py │ │ ├── mistral_config.py │ │ ├── mistral_model.py │ │ └── rank_model.py │ └── requirements.txt ├── README.md ├── Reinforced_IR │ ├── README.md │ ├── data_generation │ │ ├── agent │ │ │ ├── __init__.py │ │ │ ├── gpt.py │ │ │ ├── vllm.py │ │ │ └── vllm_instruct.py │ │ ├── generate_generator_data.py │ │ ├── generate_retriever_data.py │ │ ├── generate_retriever_distill_data.py │ │ ├── generate_universal_query.py │ │ ├── prompts │ │ │ ├── __init__.py │ │ │ ├── generate_prompts.py │ │ │ ├── get_prompts.py │ │ │ ├── hyde_prompts.py │ │ │ ├── teacher_prompts.py │ │ │ └── train_prompts.py │ │ └── utils.py │ ├── finetune │ │ ├── generator │ │ │ ├── save_tokenizer.py │ │ │ └── update_file.py │ │ ├── retriever │ │ │ ├── arguments.py │ │ │ ├── dataset.py │ │ │ ├── modeling.py │ │ │ ├── run.py │ │ │ ├── runner.py │ │ │ └── trainer.py │ │ └── stage1.json │ ├── inference │ │ ├── agent │ │ │ ├── __init__.py │ │ │ ├── gpt.py │ │ │ ├── vllm.py │ │ │ └── vllm_instruct.py │ │ ├── ir_model.py │ │ ├── multi.py │ │ └── test.py │ └── requirements.txt ├── baai_general_embedding │ ├── README.md │ ├── __init__.py │ ├── finetune │ │ ├── __init__.py │ │ ├── arguments.py │ │ ├── data.py │ │ ├── eval_msmarco.py │ │ ├── hn_mine.py │ │ ├── 
modeling.py │ │ ├── run.py │ │ └── trainer.py │ └── retromae_pretrain │ │ ├── __init__.py │ │ ├── arguments.py │ │ ├── data.py │ │ ├── enhancedDecoder.py │ │ ├── modeling.py │ │ ├── run.py │ │ ├── trainer.py │ │ └── utils.py ├── llm_dense_retriever │ ├── README.md │ ├── examples │ │ └── bge-en-icl │ │ │ ├── AIR-Bench │ │ │ ├── long-doc │ │ │ │ ├── arxiv-gemini.jsonl │ │ │ │ ├── arxiv-gpt3.jsonl │ │ │ │ ├── arxiv-llama2.jsonl │ │ │ │ ├── arxiv-llm-survey.jsonl │ │ │ │ ├── book-a-brief-history-of-time_stephen-hawking.jsonl │ │ │ │ ├── book-origin-of-species_darwin.jsonl │ │ │ │ ├── healthcare-pubmed_100k-200k_1.jsonl │ │ │ │ ├── healthcare-pubmed_100k-200k_2.jsonl │ │ │ │ ├── healthcare-pubmed_100k-200k_3.jsonl │ │ │ │ ├── healthcare-pubmed_30k-40k_10-merged.jsonl │ │ │ │ ├── healthcare-pubmed_40k-50k_5-merged.jsonl │ │ │ │ ├── law-lex_files_300k-400k.jsonl │ │ │ │ ├── law-lex_files_400k-500k.jsonl │ │ │ │ ├── law-lex_files_500k-600k.jsonl │ │ │ │ └── law-lex_files_600k-700k.jsonl │ │ │ └── qa │ │ │ │ ├── arxiv.jsonl │ │ │ │ ├── finance.jsonl │ │ │ │ ├── healthcare.jsonl │ │ │ │ ├── law.jsonl │ │ │ │ ├── msmarco.jsonl │ │ │ │ ├── news.jsonl │ │ │ │ ├── web.jsonl │ │ │ │ └── wiki.jsonl │ │ │ └── MTEB │ │ │ ├── AmazonCounterfactualClassification.json │ │ │ ├── AmazonPolarityClassification.json │ │ │ ├── AmazonReviewsClassification.json │ │ │ ├── ArguAna.json │ │ │ ├── ArxivClusteringP2P.json │ │ │ ├── ArxivClusteringS2S.json │ │ │ ├── AskUbuntuDupQuestions.json │ │ │ ├── BIOSSES.json │ │ │ ├── Banking77Classification.json │ │ │ ├── BiorxivClusteringP2P.json │ │ │ ├── BiorxivClusteringS2S.json │ │ │ ├── CQADupstackRetrieval.json │ │ │ ├── ClimateFEVER.json │ │ │ ├── DBPedia.json │ │ │ ├── EmotionClassification.json │ │ │ ├── FEVER.json │ │ │ ├── FiQA2018.json │ │ │ ├── HotpotQA.json │ │ │ ├── ImdbClassification.json │ │ │ ├── MSMARCO.json │ │ │ ├── MTOPDomainClassification.json │ │ │ ├── MTOPIntentClassification.json │ │ │ ├── MassiveIntentClassification.json │ │ │ ├── MassiveScenarioClassification.json │ │ │ ├── MedrxivClusteringP2P.json │ │ │ ├── MedrxivClusteringS2S.json │ │ │ ├── MindSmallReranking.json │ │ │ ├── NFCorpus.json │ │ │ ├── NQ.json │ │ │ ├── QuoraRetrieval.json │ │ │ ├── RedditClustering.json │ │ │ ├── RedditClusteringP2P.json │ │ │ ├── SCIDOCS.json │ │ │ ├── SICK-R.json │ │ │ ├── STS12.json │ │ │ ├── STS13.json │ │ │ ├── STS14.json │ │ │ ├── STS15.json │ │ │ ├── STS16.json │ │ │ ├── STS17.json │ │ │ ├── STS22.json │ │ │ ├── STSBenchmark.json │ │ │ ├── SciDocsRR.json │ │ │ ├── SciFact.json │ │ │ ├── SprintDuplicateQuestions.json │ │ │ ├── StackExchangeClustering.json │ │ │ ├── StackExchangeClusteringP2P.json │ │ │ ├── StackOverflowDupQuestions.json │ │ │ ├── SummEval.json │ │ │ ├── TRECCOVID.json │ │ │ ├── Touche2020.json │ │ │ ├── ToxicConversationsClassification.json │ │ │ ├── TweetSentimentExtractionClassification.json │ │ │ ├── TwentyNewsgroupsClustering.json │ │ │ ├── TwitterSemEval2015.json │ │ │ └── TwitterURLCorpus.json │ └── finetune │ │ ├── arguments.py │ │ ├── data.py │ │ ├── load_model.py │ │ ├── modeling.py │ │ ├── run.py │ │ └── trainer.py ├── llm_embedder │ ├── README.md │ ├── data │ │ ├── deepspeed │ │ │ ├── stage0.json │ │ │ ├── stage2-offload.json │ │ │ ├── stage2.json │ │ │ ├── stage3-offload-all.json │ │ │ ├── stage3-offload-optim.json │ │ │ └── stage3.json │ │ └── toy │ │ │ ├── chat.json │ │ │ ├── convsearch.json │ │ │ ├── icl.json │ │ │ ├── lrlm.json │ │ │ ├── qa.json │ │ │ └── tool.json │ ├── docs │ │ ├── evaluation.md │ │ └── fine-tune.md │ ├── 
environment.yaml │ ├── evaluation │ │ ├── __init__.py │ │ ├── eval_icl.py │ │ ├── eval_lrlm.py │ │ ├── eval_mmlu.py │ │ ├── eval_msc.py │ │ ├── eval_popqa.py │ │ ├── eval_qa.py │ │ ├── eval_qrecc.py │ │ ├── eval_retrieval.py │ │ ├── eval_tool.py │ │ └── icl_utils.py │ ├── imgs │ │ └── llm-embedder.png │ ├── run_dense.py │ ├── run_lm_score.py │ ├── run_ranker.py │ ├── scripts │ │ ├── llm-embedder.sh │ │ └── ours2st.py │ └── src │ │ ├── __init__.py │ │ ├── lm │ │ ├── __init__.py │ │ ├── args.py │ │ ├── modeling_lm.py │ │ └── modeling_srlm.py │ │ ├── retrieval │ │ ├── __init__.py │ │ ├── args.py │ │ ├── data.py │ │ ├── evalnq.py │ │ ├── metrics.py │ │ ├── modeling_bm25.py │ │ ├── modeling_dense.py │ │ ├── modeling_ranker.py │ │ ├── modeling_unified.py │ │ └── trainer.py │ │ └── utils │ │ ├── __init__.py │ │ ├── llama_patch.py │ │ └── util.py ├── llm_reranker │ ├── README.md │ ├── __init__.py │ ├── evaluate.py │ ├── evaluation │ │ ├── BEIR-bge-en-v1.5.png │ │ ├── BEIR-e5-mistral.png │ │ ├── CMTEB-retrieval-bge-zh-v1.5.png │ │ ├── llama-index.png │ │ └── miracl-bge-m3.png │ ├── finetune_for_instruction │ │ ├── __init__.py │ │ ├── arguments.py │ │ ├── data.py │ │ ├── load_model.py │ │ ├── modeling.py │ │ ├── run.py │ │ └── trainer.py │ ├── finetune_for_layerwise │ │ ├── __init__.py │ │ ├── arguments.py │ │ ├── configuration_minicpm_reranker.py │ │ ├── data.py │ │ ├── load_model.py │ │ ├── modeling.py │ │ ├── modeling_minicpm_reranker.py │ │ ├── run.py │ │ └── trainer.py │ ├── merge │ │ ├── __init__.py │ │ ├── configuration_minicpm_reranker.py │ │ ├── merge_base_model.py │ │ ├── merge_layerwise_model_from_finetuned_model.py │ │ ├── merge_layerwise_model_from_raw_model.py │ │ └── modeling_minicpm_reranker.py │ ├── stage1.json │ └── toy_finetune_data.jsonl ├── old-examples │ ├── finetune │ │ ├── README.md │ │ ├── ds_config.json │ │ ├── toy_evaluation_data │ │ │ ├── toy_corpus.json │ │ │ └── toy_query.json │ │ └── toy_finetune_data.jsonl │ ├── pretrain │ │ ├── README.md │ │ ├── retromae_pretrain │ │ │ ├── __init__.py │ │ │ ├── arguments.py │ │ │ ├── data.py │ │ │ ├── enhancedDecoder.py │ │ │ ├── modeling.py │ │ │ ├── run.py │ │ │ ├── trainer.py │ │ │ └── utils.py │ │ └── toy_pretrain_data.jsonl │ ├── reranker │ │ ├── README.md │ │ ├── ds_config.json │ │ └── toy_finetune_data.jsonl │ ├── search_demo │ │ ├── __init__.py │ │ ├── arguments.py │ │ ├── pre_process.py │ │ ├── readme.md │ │ ├── requirements.txt │ │ ├── run.py │ │ └── tool.py │ └── unified_finetune │ │ ├── README.md │ │ ├── toy_train_data │ │ ├── toy_train_data1.jsonl │ │ └── toy_train_data2.jsonl │ │ └── unified_finetune_bge-m3_exmaple.sh ├── reranker │ ├── README.md │ ├── __init__.py │ ├── arguments.py │ ├── data.py │ ├── modeling.py │ ├── run.py │ └── trainer.py └── visual_bge │ ├── README.md │ ├── __init__.py │ ├── imgs │ ├── SFT-CIRR.png │ ├── SFT-ReMuQ.png │ ├── SFT-WebQA.png │ ├── cir_candi_1.png │ ├── cir_candi_2.png │ ├── cir_query.png │ ├── wiki_candi_1.jpg │ ├── wiki_candi_2.jpg │ ├── zs-benchmark.png │ └── zs-performance.png │ ├── setup.py │ └── visual_bge │ ├── eva_clip │ ├── __init__.py │ ├── bpe_simple_vocab_16e6.txt.gz │ ├── constants.py │ ├── eva_vit_model.py │ ├── factory.py │ ├── hf_configs.py │ ├── hf_model.py │ ├── loss.py │ ├── model.py │ ├── model_configs │ │ ├── EVA01-CLIP-B-16.json │ │ ├── EVA01-CLIP-g-14-plus.json │ │ ├── EVA01-CLIP-g-14.json │ │ ├── EVA02-CLIP-B-16.json │ │ ├── EVA02-CLIP-L-14-336.json │ │ ├── EVA02-CLIP-L-14.json │ │ ├── EVA02-CLIP-bigE-14-plus.json │ │ └── EVA02-CLIP-bigE-14.json │ ├── 
modified_resnet.py │ ├── openai.py │ ├── pretrained.py │ ├── rope.py │ ├── timm_model.py │ ├── tokenizer.py │ ├── transform.py │ ├── transformer.py │ └── utils.py │ └── modeling.py ├── scripts ├── README.md ├── add_reranker_score.py ├── hn_mine.py └── split_data_by_length.py └── setup.py /.github/workflows/documentation.yml: -------------------------------------------------------------------------------- 1 | name: documentation 2 | 3 | on: [push, pull_request, workflow_dispatch] 4 | 5 | permissions: 6 | contents: write 7 | 8 | jobs: 9 | docs: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | - uses: actions/setup-python@v5 14 | - name: Install doc dependencies 15 | run: | 16 | pip install . sphinx myst_parser myst-nb sphinx-design pydata-sphinx-theme sphinxcontrib-googleanalytics 17 | - name: Install content dependencies 18 | run: | 19 | pip install faiss-cpu mteb air-benchmark beir 20 | - name: Sphinx build 21 | run: | 22 | sphinx-build docs/source docs/build 23 | - name: Add CNAME 24 | run: | 25 | echo bge-model.com > docs/build/CNAME 26 | - name: Deploy to GitHub Pages 27 | uses: peaceiris/actions-gh-pages@v3 28 | if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} 29 | with: 30 | publish_branch: gh-pages 31 | github_token: ${{ secrets.GITHUB_TOKEN }} 32 | publish_dir: docs/build/ 33 | force_orphan: true 34 | -------------------------------------------------------------------------------- /FlagEmbedding/__init__.py: -------------------------------------------------------------------------------- 1 | from .abc.inference import * 2 | from .inference import * 3 | -------------------------------------------------------------------------------- /FlagEmbedding/abc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/FlagEmbedding/abc/__init__.py -------------------------------------------------------------------------------- /FlagEmbedding/abc/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .arguments import AbsEvalArgs, AbsEvalModelArgs 2 | from .evaluator import AbsEvaluator 3 | from .data_loader import AbsEvalDataLoader 4 | from .searcher import EvalRetriever, EvalDenseRetriever, EvalReranker 5 | from .runner import AbsEvalRunner 6 | 7 | 8 | __all__ = [ 9 | "AbsEvalArgs", 10 | "AbsEvalModelArgs", 11 | "AbsEvaluator", 12 | "AbsEvalDataLoader", 13 | "EvalRetriever", 14 | "EvalDenseRetriever", 15 | "EvalReranker", 16 | "AbsEvalRunner", 17 | ] 18 | -------------------------------------------------------------------------------- /FlagEmbedding/abc/finetune/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/FlagEmbedding/abc/finetune/__init__.py -------------------------------------------------------------------------------- /FlagEmbedding/abc/finetune/embedder/__init__.py: -------------------------------------------------------------------------------- 1 | from .AbsArguments import ( 2 | AbsEmbedderDataArguments, 3 | AbsEmbedderModelArguments, 4 | AbsEmbedderTrainingArguments, 5 | ) 6 | from .AbsDataset import ( 7 | AbsEmbedderCollator, AbsEmbedderSameDatasetCollator, 8 | AbsEmbedderSameDatasetTrainDataset, 9 | AbsEmbedderTrainDataset, 10 | EmbedderTrainerCallbackForDataRefresh, 11 | ) 12 | from 
.AbsModeling import AbsEmbedderModel, EmbedderOutput 13 | from .AbsTrainer import AbsEmbedderTrainer 14 | from .AbsRunner import AbsEmbedderRunner 15 | 16 | 17 | __all__ = [ 18 | "AbsEmbedderModelArguments", 19 | "AbsEmbedderDataArguments", 20 | "AbsEmbedderTrainingArguments", 21 | "AbsEmbedderModel", 22 | "AbsEmbedderTrainer", 23 | "AbsEmbedderRunner", 24 | "AbsEmbedderTrainDataset", 25 | "AbsEmbedderCollator", 26 | "AbsEmbedderSameDatasetTrainDataset", 27 | "AbsEmbedderSameDatasetCollator", 28 | "EmbedderOutput", 29 | "EmbedderTrainerCallbackForDataRefresh", 30 | ] 31 | -------------------------------------------------------------------------------- /FlagEmbedding/abc/finetune/reranker/__init__.py: -------------------------------------------------------------------------------- 1 | from .AbsArguments import AbsRerankerDataArguments, AbsRerankerModelArguments, AbsRerankerTrainingArguments 2 | from .AbsDataset import ( 3 | AbsRerankerTrainDataset, AbsRerankerCollator, 4 | AbsLLMRerankerTrainDataset, AbsLLMRerankerCollator 5 | ) 6 | from .AbsModeling import AbsRerankerModel, RerankerOutput 7 | from .AbsTrainer import AbsRerankerTrainer 8 | from .AbsRunner import AbsRerankerRunner 9 | 10 | __all__ = [ 11 | "AbsRerankerDataArguments", 12 | "AbsRerankerModelArguments", 13 | "AbsRerankerTrainingArguments", 14 | "AbsRerankerTrainDataset", 15 | "AbsRerankerCollator", 16 | "AbsLLMRerankerTrainDataset", 17 | "AbsLLMRerankerCollator", 18 | "AbsRerankerModel", 19 | "RerankerOutput", 20 | "AbsRerankerTrainer", 21 | "AbsRerankerRunner", 22 | ] 23 | -------------------------------------------------------------------------------- /FlagEmbedding/abc/inference/__init__.py: -------------------------------------------------------------------------------- 1 | from .AbsEmbedder import AbsEmbedder 2 | from .AbsReranker import AbsReranker 3 | 4 | __all__ = [ 5 | 'AbsEmbedder', 6 | 'AbsReranker' 7 | ] 8 | -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/FlagEmbedding/evaluation/__init__.py -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/air_bench/__init__.py: -------------------------------------------------------------------------------- 1 | from .arguments import AIRBenchEvalModelArgs, AIRBenchEvalArgs 2 | from .runner import AIRBenchEvalRunner 3 | 4 | __all__ = [ 5 | "AIRBenchEvalModelArgs", 6 | "AIRBenchEvalArgs", 7 | "AIRBenchEvalRunner" 8 | ] 9 | -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/air_bench/__main__.py: -------------------------------------------------------------------------------- 1 | from transformers import HfArgumentParser 2 | 3 | from FlagEmbedding.evaluation.air_bench import ( 4 | AIRBenchEvalArgs, AIRBenchEvalModelArgs, 5 | AIRBenchEvalRunner 6 | ) 7 | 8 | 9 | def main(): 10 | parser = HfArgumentParser(( 11 | AIRBenchEvalArgs, 12 | AIRBenchEvalModelArgs 13 | )) 14 | 15 | eval_args, model_args = parser.parse_args_into_dataclasses() 16 | eval_args: AIRBenchEvalArgs 17 | model_args: AIRBenchEvalModelArgs 18 | 19 | runner = AIRBenchEvalRunner( 20 | eval_args=eval_args, 21 | model_args=model_args 22 | ) 23 | 24 | runner.run() 25 | 26 | 27 | if __name__ == "__main__": 28 | main() 29 | 
print("==============================================") 30 | print("Search results have been generated.") 31 | print("For computing metrics, please refer to the official AIR-Bench docs:") 32 | print("- https://github.com/AIR-Bench/AIR-Bench/blob/main/docs/submit_to_leaderboard.md") 33 | -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/beir/__init__.py: -------------------------------------------------------------------------------- 1 | from FlagEmbedding.abc.evaluation import ( 2 | AbsEvalModelArgs as BEIREvalModelArgs, 3 | ) 4 | 5 | from .data_loader import BEIREvalDataLoader 6 | from .arguments import BEIREvalArgs 7 | from .runner import BEIREvalRunner 8 | 9 | __all__ = [ 10 | "BEIREvalArgs", 11 | "BEIREvalModelArgs", 12 | "BEIREvalRunner", 13 | "BEIREvalDataLoader", 14 | ] 15 | -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/beir/__main__.py: -------------------------------------------------------------------------------- 1 | from transformers import HfArgumentParser 2 | 3 | from FlagEmbedding.evaluation.beir import ( 4 | BEIREvalArgs, BEIREvalModelArgs, 5 | BEIREvalRunner 6 | ) 7 | 8 | 9 | def main(): 10 | parser = HfArgumentParser(( 11 | BEIREvalArgs, 12 | BEIREvalModelArgs 13 | )) 14 | 15 | eval_args, model_args = parser.parse_args_into_dataclasses() 16 | eval_args: BEIREvalArgs 17 | model_args: BEIREvalModelArgs 18 | 19 | runner = BEIREvalRunner( 20 | eval_args=eval_args, 21 | model_args=model_args 22 | ) 23 | 24 | runner.run() 25 | 26 | 27 | if __name__ == "__main__": 28 | main() 29 | -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/beir/arguments.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from FlagEmbedding.abc.evaluation.arguments import AbsEvalArgs 4 | 5 | 6 | @dataclass 7 | class BEIREvalArgs(AbsEvalArgs): 8 | """ 9 | Argument class for BEIR evaluation. 10 | """ 11 | use_special_instructions: bool = field( 12 | default=False, metadata={"help": "Whether to use specific instructions in `prompts.py` for evaluation. 
Default: False"} 13 | ) 14 | -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/custom/__init__.py: -------------------------------------------------------------------------------- 1 | from FlagEmbedding.abc.evaluation import ( 2 | AbsEvalArgs as CustomEvalArgs, 3 | AbsEvalModelArgs as CustomEvalModelArgs, 4 | ) 5 | 6 | from .data_loader import CustomEvalDataLoader 7 | from .runner import CustomEvalRunner 8 | 9 | __all__ = [ 10 | "CustomEvalArgs", 11 | "CustomEvalModelArgs", 12 | "CustomEvalRunner", 13 | "CustomEvalDataLoader", 14 | ] 15 | -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/custom/__main__.py: -------------------------------------------------------------------------------- 1 | from transformers import HfArgumentParser 2 | 3 | from FlagEmbedding.evaluation.custom import ( 4 | CustomEvalArgs, CustomEvalModelArgs, 5 | CustomEvalRunner 6 | ) 7 | 8 | 9 | def main(): 10 | parser = HfArgumentParser(( 11 | CustomEvalArgs, 12 | CustomEvalModelArgs 13 | )) 14 | 15 | eval_args, model_args = parser.parse_args_into_dataclasses() 16 | eval_args: CustomEvalArgs 17 | model_args: CustomEvalModelArgs 18 | 19 | runner = CustomEvalRunner( 20 | eval_args=eval_args, 21 | model_args=model_args 22 | ) 23 | 24 | runner.run() 25 | 26 | 27 | if __name__ == "__main__": 28 | main() 29 | -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/custom/data_loader.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from tqdm import tqdm 3 | from typing import List, Optional 4 | 5 | from FlagEmbedding.abc.evaluation import AbsEvalDataLoader 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | 10 | class CustomEvalDataLoader(AbsEvalDataLoader): 11 | def available_dataset_names(self) -> List[str]: 12 | return [] 13 | 14 | def available_splits(self, dataset_name: Optional[str] = None) -> List[str]: 15 | return ["test"] 16 | -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/custom/runner.py: -------------------------------------------------------------------------------- 1 | from FlagEmbedding.abc.evaluation import AbsEvalRunner 2 | 3 | from .data_loader import CustomEvalDataLoader 4 | 5 | 6 | class CustomEvalRunner(AbsEvalRunner): 7 | def load_data_loader(self) -> CustomEvalDataLoader: 8 | data_loader = CustomEvalDataLoader( 9 | eval_name=self.eval_args.eval_name, 10 | dataset_dir=self.eval_args.dataset_dir, 11 | cache_dir=self.eval_args.cache_path, 12 | token=self.eval_args.token, 13 | force_redownload=self.eval_args.force_redownload, 14 | ) 15 | return data_loader 16 | -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/miracl/__init__.py: -------------------------------------------------------------------------------- 1 | from FlagEmbedding.abc.evaluation import ( 2 | AbsEvalArgs as MIRACLEvalArgs, 3 | AbsEvalModelArgs as MIRACLEvalModelArgs, 4 | ) 5 | 6 | from .data_loader import MIRACLEvalDataLoader 7 | from .runner import MIRACLEvalRunner 8 | 9 | __all__ = [ 10 | "MIRACLEvalArgs", 11 | "MIRACLEvalModelArgs", 12 | "MIRACLEvalRunner", 13 | "MIRACLEvalDataLoader", 14 | ] 15 | -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/miracl/__main__.py: -------------------------------------------------------------------------------- 1 
| from transformers import HfArgumentParser 2 | 3 | from FlagEmbedding.evaluation.miracl import ( 4 | MIRACLEvalArgs, MIRACLEvalModelArgs, 5 | MIRACLEvalRunner 6 | ) 7 | 8 | 9 | def main(): 10 | parser = HfArgumentParser(( 11 | MIRACLEvalArgs, 12 | MIRACLEvalModelArgs 13 | )) 14 | 15 | eval_args, model_args = parser.parse_args_into_dataclasses() 16 | eval_args: MIRACLEvalArgs 17 | model_args: MIRACLEvalModelArgs 18 | 19 | runner = MIRACLEvalRunner( 20 | eval_args=eval_args, 21 | model_args=model_args 22 | ) 23 | 24 | runner.run() 25 | 26 | 27 | if __name__ == "__main__": 28 | main() 29 | -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/miracl/runner.py: -------------------------------------------------------------------------------- 1 | from FlagEmbedding.abc.evaluation import AbsEvalRunner 2 | 3 | from .data_loader import MIRACLEvalDataLoader 4 | 5 | 6 | class MIRACLEvalRunner(AbsEvalRunner): 7 | """ 8 | Evaluation runner of MIRACL. 9 | """ 10 | def load_data_loader(self) -> MIRACLEvalDataLoader: 11 | """Load the data loader instance by args. 12 | 13 | Returns: 14 | MIRACLEvalDataLoader: The MIRACL data loader instance. 15 | """ 16 | data_loader = MIRACLEvalDataLoader( 17 | eval_name=self.eval_args.eval_name, 18 | dataset_dir=self.eval_args.dataset_dir, 19 | cache_dir=self.eval_args.cache_path, 20 | token=self.eval_args.token, 21 | force_redownload=self.eval_args.force_redownload, 22 | ) 23 | return data_loader 24 | -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mkqa/__init__.py: -------------------------------------------------------------------------------- 1 | from FlagEmbedding.abc.evaluation import ( 2 | AbsEvalArgs as MKQAEvalArgs, 3 | AbsEvalModelArgs as MKQAEvalModelArgs, 4 | ) 5 | 6 | from .data_loader import MKQAEvalDataLoader 7 | from .evaluator import MKQAEvaluator 8 | from .runner import MKQAEvalRunner 9 | 10 | __all__ = [ 11 | "MKQAEvalArgs", 12 | "MKQAEvalModelArgs", 13 | "MKQAEvalRunner", 14 | "MKQAEvalDataLoader", 15 | "MKQAEvaluator" 16 | ] 17 | -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mkqa/__main__.py: -------------------------------------------------------------------------------- 1 | from transformers import HfArgumentParser 2 | 3 | from FlagEmbedding.evaluation.mkqa import ( 4 | MKQAEvalArgs, MKQAEvalModelArgs, 5 | MKQAEvalRunner 6 | ) 7 | 8 | 9 | def main(): 10 | parser = HfArgumentParser(( 11 | MKQAEvalArgs, 12 | MKQAEvalModelArgs 13 | )) 14 | 15 | eval_args, model_args = parser.parse_args_into_dataclasses() 16 | eval_args: MKQAEvalArgs 17 | model_args: MKQAEvalModelArgs 18 | 19 | runner = MKQAEvalRunner( 20 | eval_args=eval_args, 21 | model_args=model_args 22 | ) 23 | 24 | runner.run() 25 | 26 | 27 | if __name__ == "__main__": 28 | main() 29 | -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mldr/__init__.py: -------------------------------------------------------------------------------- 1 | from FlagEmbedding.abc.evaluation import ( 2 | AbsEvalArgs as MLDREvalArgs, 3 | AbsEvalModelArgs as MLDREvalModelArgs, 4 | ) 5 | 6 | from .data_loader import MLDREvalDataLoader 7 | from .runner import MLDREvalRunner 8 | 9 | __all__ = [ 10 | "MLDREvalArgs", 11 | "MLDREvalModelArgs", 12 | "MLDREvalRunner", 13 | "MLDREvalDataLoader", 14 | ] 15 | -------------------------------------------------------------------------------- 
/FlagEmbedding/evaluation/mldr/__main__.py: -------------------------------------------------------------------------------- 1 | from transformers import HfArgumentParser 2 | 3 | from FlagEmbedding.evaluation.mldr import ( 4 | MLDREvalArgs, MLDREvalModelArgs, 5 | MLDREvalRunner 6 | ) 7 | 8 | 9 | def main(): 10 | parser = HfArgumentParser(( 11 | MLDREvalArgs, 12 | MLDREvalModelArgs 13 | )) 14 | 15 | eval_args, model_args = parser.parse_args_into_dataclasses() 16 | eval_args: MLDREvalArgs 17 | model_args: MLDREvalModelArgs 18 | 19 | runner = MLDREvalRunner( 20 | eval_args=eval_args, 21 | model_args=model_args 22 | ) 23 | 24 | runner.run() 25 | 26 | 27 | if __name__ == "__main__": 28 | main() 29 | -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mldr/runner.py: -------------------------------------------------------------------------------- 1 | from FlagEmbedding.abc.evaluation import AbsEvalRunner 2 | 3 | from .data_loader import MLDREvalDataLoader 4 | 5 | 6 | class MLDREvalRunner(AbsEvalRunner): 7 | """ 8 | Evaluation runner of MLDR. 9 | """ 10 | def load_data_loader(self) -> MLDREvalDataLoader: 11 | """Load the data loader instance by args. 12 | 13 | Returns: 14 | MLDREvalDataLoader: The MLDR data loader instance. 15 | """ 16 | data_loader = MLDREvalDataLoader( 17 | eval_name=self.eval_args.eval_name, 18 | dataset_dir=self.eval_args.dataset_dir, 19 | cache_dir=self.eval_args.cache_path, 20 | token=self.eval_args.token, 21 | force_redownload=self.eval_args.force_redownload, 22 | ) 23 | return data_loader 24 | -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/msmarco/__init__.py: -------------------------------------------------------------------------------- 1 | from FlagEmbedding.abc.evaluation import ( 2 | AbsEvalArgs as MSMARCOEvalArgs, 3 | AbsEvalModelArgs as MSMARCOEvalModelArgs, 4 | ) 5 | 6 | from .data_loader import MSMARCOEvalDataLoader 7 | from .runner import MSMARCOEvalRunner 8 | 9 | __all__ = [ 10 | "MSMARCOEvalArgs", 11 | "MSMARCOEvalModelArgs", 12 | "MSMARCOEvalRunner", 13 | "MSMARCOEvalDataLoader", 14 | ] 15 | -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/msmarco/__main__.py: -------------------------------------------------------------------------------- 1 | from transformers import HfArgumentParser 2 | 3 | from FlagEmbedding.evaluation.msmarco import ( 4 | MSMARCOEvalArgs, MSMARCOEvalModelArgs, 5 | MSMARCOEvalRunner 6 | ) 7 | 8 | 9 | def main(): 10 | parser = HfArgumentParser(( 11 | MSMARCOEvalArgs, 12 | MSMARCOEvalModelArgs 13 | )) 14 | 15 | eval_args, model_args = parser.parse_args_into_dataclasses() 16 | eval_args: MSMARCOEvalArgs 17 | model_args: MSMARCOEvalModelArgs 18 | 19 | runner = MSMARCOEvalRunner( 20 | eval_args=eval_args, 21 | model_args=model_args 22 | ) 23 | 24 | runner.run() 25 | 26 | 27 | if __name__ == "__main__": 28 | main() 29 | -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/msmarco/runner.py: -------------------------------------------------------------------------------- 1 | from FlagEmbedding.abc.evaluation import AbsEvalRunner 2 | 3 | from .data_loader import MSMARCOEvalDataLoader 4 | 5 | 6 | class MSMARCOEvalRunner(AbsEvalRunner): 7 | """ 8 | Evaluation runner of MSMARCO. 9 | """ 10 | def load_data_loader(self) -> MSMARCOEvalDataLoader: 11 | """Load the data loader instance by args. 
12 | 13 | Returns: 14 | MSMARCOEvalDataLoader: The MSMARCO data loader instance. 15 | """ 16 | data_loader = MSMARCOEvalDataLoader( 17 | eval_name=self.eval_args.eval_name, 18 | dataset_dir=self.eval_args.dataset_dir, 19 | cache_dir=self.eval_args.cache_path, 20 | token=self.eval_args.token, 21 | force_redownload=self.eval_args.force_redownload, 22 | ) 23 | return data_loader 24 | -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/__init__.py: -------------------------------------------------------------------------------- 1 | from FlagEmbedding.abc.evaluation import ( 2 | AbsEvalModelArgs as MTEBEvalModelArgs, 3 | ) 4 | 5 | from .arguments import MTEBEvalArgs 6 | from .runner import MTEBEvalRunner 7 | 8 | __all__ = [ 9 | "MTEBEvalArgs", 10 | "MTEBEvalModelArgs", 11 | "MTEBEvalRunner", 12 | ] 13 | -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/__main__.py: -------------------------------------------------------------------------------- 1 | from transformers import HfArgumentParser 2 | 3 | from FlagEmbedding.evaluation.mteb import ( 4 | MTEBEvalArgs, MTEBEvalModelArgs, 5 | MTEBEvalRunner 6 | ) 7 | 8 | 9 | def main(): 10 | parser = HfArgumentParser(( 11 | MTEBEvalArgs, 12 | MTEBEvalModelArgs 13 | )) 14 | 15 | eval_args, model_args = parser.parse_args_into_dataclasses() 16 | eval_args: MTEBEvalArgs 17 | model_args: MTEBEvalModelArgs 18 | 19 | runner = MTEBEvalRunner( 20 | eval_args=eval_args, 21 | model_args=model_args 22 | ) 23 | 24 | runner.run() 25 | 26 | 27 | if __name__ == "__main__": 28 | main() 29 | -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/arguments.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import List 3 | 4 | from FlagEmbedding.abc.evaluation.arguments import AbsEvalArgs 5 | 6 | 7 | @dataclass 8 | class MTEBEvalArgs(AbsEvalArgs): 9 | """ 10 | Argument class for MTEB evaluation. 11 | """ 12 | languages: List[str] = field( 13 | default=None, metadata={"help": "Languages to evaluate. Default: eng"} 14 | ) 15 | tasks: List[str] = field( 16 | default=None, metadata={"help": "Tasks to evaluate. Default: None"} 17 | ) 18 | task_types: List[str] = field( 19 | default=None, metadata={"help": "The task types to evaluate. Default: None"} 20 | ) 21 | use_special_instructions: bool = field( 22 | default=False, metadata={"help": "Whether to use specific instructions in `prompts.py` for evaluation. Default: False"} 23 | ) 24 | examples_path: str = field( 25 | default=None, metadata={"help": "Use specific examples in the path. 
Default: None"} 26 | ) -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/AmazonCounterfactualClassification.csv: -------------------------------------------------------------------------------- 1 | text,label 2 | "I wish I could have used this head set but the day I received it it wouldn't even turn on and I really wanted this product to work I'm very disappointed.","counterfactual" 3 | "I would advise that instead of trying to follow these poor instructions, Google it.","not-counterfactual" 4 | "I wrote to Monster customer service before ordering and they told me it would be fine to use without a converter and it was absolutely true.","not-counterfactual" -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/AmazonPolarityClassification.csv: -------------------------------------------------------------------------------- 1 | text,label 2 | "Hunting the Hard Way Thia was a gift for my Husband, who loved the book. It arrived on the date we were told it would.",positive 3 | "Poor DVD Has too many interviews with people at the Live THomas day in Penn. My kids were annoyed and hated this DVD.",negative 4 | "Ludicrous and silly I remember getting this book so faintly that that says alot about my opinion of it. Basically, while I will entertain lots of odd ideas and theories, this book was basically silly.",negative 5 | "Artistry I think that the Deodato concerts are very rich, as he used real strings and band musicians, as well as you can appreciate the John Tropea excelent renditions on guitar.",positive -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/AmazonReviewsClassification.csv: -------------------------------------------------------------------------------- 1 | text,label 2 | "DO NOT ORDER THIS\n\nThis isn't what's described at all. Taking it out of the package lace was cut upon arrival, wig was cut to like 14 inch, not curly, and smelled like cigarettes. I obviously was sent what someone returned, disgusting.Not what I ordered at all, not pleased at all. I want my money back DO NOT ORDER","1 star" 3 | "And I can’t return it\n\nThis product seemed like good quality but it does not stay stuck to the soles at all. You walk a few steps and then you find the black shoe grip somewhere on the floor.","2 star" 4 | "Three Stars\n\nnew yearly subscription plan is horrible, but the product still works as it did in the past","3 star" 5 | "I like how it has lots of pockets to put stuff ...\n\nI like how it has lots of pockets to put stuff in. I would have liked to have a shorter securing strap so it would not slide around so much. Good product.","4 star" 6 | "Great\n\nIt is really good. That's my favorite. 
THANK YOU","5 star" -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/ArxivClusteringS2S.csv: -------------------------------------------------------------------------------- 1 | text,label 2 | "A Survey on Graph Neural Networks: Algorithms and Applications",cs 3 | "Hamiltonian Dynamics and KAM Theory for Infinite-Dimensional Systems",math 4 | "Dark Matter Distribution in Dwarf Spheroidal Galaxies: Constraints from Stellar Kinematics",astro-ph 5 | "Decoherence and Quantum Error Correction in Topological Quantum Computers",quant-ph 6 | "Spin-Orbit Coupling Effects in Low-Dimensional Quantum Materials",cond-mat -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/AskUbuntuDupQuestions.csv: -------------------------------------------------------------------------------- 1 | query,positive 2 | angularjs infinite scroll in a container,AngularJS ng-infinite-scroll not working on a specific container/div 3 | Java: Efficiently converting an array of longs to an array of bytes,Most Compact way to Serialize an Array of Longs in Java 4 | PyVISA missing methods,NI VISA + pyVisa on Mac OS X (Snow Leopard) -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/BIOSSES.csv: -------------------------------------------------------------------------------- 1 | sent1,sent2 2 | "Recent studies have highlighted the crucial role of p53 in regulating cell cycle progression.","Recent research underscores p53's pivotal function in controlling cellular division." 3 | "Neuroscience has revealed intricate pathways linking dopamine to reward and motivation.","Recent neuroscientific findings have illuminated complex dopamine pathways associated with motivation and reward." 4 | "Stem cell research holds promise for treating a variety of degenerative diseases.","The potential of stem cell research in combating degenerative illnesses is widely recognized." -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/Banking77Classification.csv: -------------------------------------------------------------------------------- 1 | text,label 2 | "What is my money worth in other countries?",exchange_rate 3 | "What can I do if my card still hasn't arrived after 2 weeks?",card_arrival 4 | "Would I be able to open an account for my daughter?",age_limit 5 | "My address details have changed and I want to update them",edit_personal_details 6 | "If my cash withdrawal is still not showing, is something wrong?",pending_cash_withdrawal 7 | "How long do transfers typically take? Is there a way of speeding the process up? 
My friend needs the money I sent her desperately.",transfer_not_received_by_recipient -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/BiorxivClusteringS2S.csv: -------------------------------------------------------------------------------- 1 | text,label 2 | "Neural Circuit Dynamics in Decision-Making: A Computational Model of Prefrontal-Striatal Interactions",neuroscience 3 | "Metagenomic Insights into Extreme Environments: Microbial Diversity and Functional Adaptations in Antarctic Lakes",microbiology 4 | "Machine Learning Approaches for Predicting Protein Structure and Function from Sequence Data",bioinformatics 5 | "Regulation of Stem Cell Fate Decisions by the Hippo Signaling Pathway: Implications for Tissue Regeneration and Cancer Therapy",cell biology 6 | "Optical Tweezers and Single-Molecule Force Spectroscopy: Probing Protein Folding Dynamics and Mechanical Properties of Biomolecules",biophysics -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/CQADupstack.csv: -------------------------------------------------------------------------------- 1 | query,positive 2 | angularjs infinite scroll in a container,AngularJS ng-infinite-scroll not working on a specific container/div 3 | Java: Efficiently converting an array of longs to an array of bytes,Most Compact way to Serialize an Array of Longs in Java 4 | PyVISA missing methods,NI VISA + pyVisa on Mac OS X (Snow Leopard) 5 | -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/CQADupstackRetrieval.csv: -------------------------------------------------------------------------------- 1 | query,pos -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/EmotionClassification.csv: -------------------------------------------------------------------------------- 1 | text,label_text 2 | "i am bothered is that he might changed his feelings once he get back in us and leave me heartbroken",sadness 3 | "i have always loved my jobs and loved to work and i truly feel like being back there with my patients and co workers will do me a lot of good even if it is only for a few weeks",joy 4 | "i certainly feel loved and appreciated and grateful for all that i have",love 5 | "im grabbing a minute to post i feel greedy wrong",anger 6 | "i was stymied a little bit as i wrote feeling unsure that i might go somewhere with the story unintended",fear 7 | "i keep feeling pleasantly surprised at his supportiveness and also his ease in new situations",surprise -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/HotpotQA.csv: -------------------------------------------------------------------------------- 1 | query,pos 2 | "Which tennis player Anna-Lena Grönefeld or Mats Wilander turned professional first ?","Anna-Lena Grönefeld Anna-Lena Grönefeld (born 4 June 1985) is a German tennis player. She turned professional in April 2003." 3 | "What South Korean K-pop group has 13 members and their own online TV program?","Seventeen (band) Seventeen (Hangul: 세븐틴 ), also stylized as SEVENTEEN or SVT, is a South Korean boy group formed by Pledis Entertainment in 2015. 
The group consists of thirteen members who are separated into three sub-units, each with different areas of specialization: a 'Hip-Hop Unit', 'Vocal Unit', and 'Performance Unit'. They have released one studio album and four extended plays." 4 | "The game show Keep It in the Family was hosted by an actor that played what role in "Coronation Street"?","Keep It in the Family (UK game show) Keep It in the Family is a British game show that aired on ITV from 26 October 2014 to 19 December 2015 and is hosted by Bradley Walsh." -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/ImdbClassification.csv: -------------------------------------------------------------------------------- 1 | text,label_text 2 | "Renny Harlin's first American film was one of the best of a slew of prison-set horror films(like 'Death House' or 'The Chair')in the late 80's.Twenty years before,guard Lane Smith had wrongfully executed a condemned man.Now,he is the warden of the newly re-opened prison,and the man's ghost is back for bloody revenge.This atmospheric and very moody film features lots of gruesome gore and violence.Viggo Mortensen,Tiny Lister,Tom Everett and Kane Hodder are onhand for the entertaining carnage.","positive" -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/MSMARCO.csv: -------------------------------------------------------------------------------- 1 | query,pos 2 | "what is a pms color","PMS is a solid-color matching system, used primarily for specifying second or third colors in printing, meaning colors in addition to black, (although, obviously, one can certainly print a one-color piece using a PMS color and no black all)." 3 | "when was snowboarding invented","Snowboarding Modern snowboarding began in 1965 when Sherman Poppen, an engineer in Muskegon, Michigan, invented a toy for his daughters by fastening two skis together and attaching a rope to one end so he would have some control as they stood on the board and glided downhill." 4 | "difference between pollination fertilization","What is the difference between pollination & fertilization in flowering plants? • Pollination is a process flowering plants only undergo. It is the transfer of pollen to the plant’s stigma. The process can be done by the plant itself or through outside agents. • Fertilization is basically the joining of sperm and egg." 
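The example files above follow a few flat CSV schemas: retrieval- and reranking-style tasks use `query,pos` (or `query,positive`) columns, classification tasks use `text,label` (or `text,label_text`), and sentence-pair tasks use `sent1,sent2`. How the evaluation itself consumes these files is implemented in the runner imported by `FlagEmbedding/evaluation/mteb/__init__.py`; the snippet below is only a minimal sketch of reading one of the files with the standard library, assuming it is run from the repository root.

```python
# Minimal sketch (not part of the repository): inspect one of the example CSVs.
import csv

path = "FlagEmbedding/evaluation/mteb/examples/MSMARCO.csv"
with open(path, newline="", encoding="utf-8") as f:
    for row in csv.DictReader(f):
        # Each row pairs a query with one positive passage.
        print(row["query"], "->", row["pos"][:60], "...")
```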
-------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/MTOPDomainClassification.csv: -------------------------------------------------------------------------------- 1 | text,label 2 | "I am no longer available",calling 3 | "Cancel my reminder about my dentist appointment",reminder 4 | "Will it rain tomorrow?",weather 5 | "Create an appointment alarm for 11:30am.",allarm 6 | "Play a different playlist",music 7 | "What's the best way to fry chicken",recipes 8 | "what city does Ahmed live in ?",people -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/MTOPIntentClassification.csv: -------------------------------------------------------------------------------- 1 | text,label 2 | "When will my next alarm start",GET_ALARM 3 | "I need you to message Zachary Fletcher",SEND_MESSAGE 4 | "show me video messages from Atlas",GET_MESSAGE 5 | "I want to listen to AC/DC please",PLAY_MUSIC 6 | "Make an alarm for the next 7 weeks for Thursday at 6pm",CREATE_ALARM 7 | "fairs happening in ann arbor next week",GET_EVENT 8 | "Will we get a frost this week?",GET_WEATHER -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/MassiveIntentClassification.csv: -------------------------------------------------------------------------------- 1 | text,label 2 | "remind me to pay rent every month",calendar_set 3 | "please play yesterday from beatles",play_music 4 | "what will the temperatures be for the next week",weather_query 5 | "give me the detailed schedule for next week",calendar_query 6 | "what's happening in my day",general_quirky 7 | "dolores how was your day",general_quirky 8 | "who was appointed as deputy centimeter of uttar pradesh",qa_factoid 9 | "find me news about trumps speech",news_query -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/MassiveScenarioClassification.csv: -------------------------------------------------------------------------------- 1 | text,label 2 | "can you confirm that my meeting for tomorrow has been canceled",calendar 3 | "please open my music application and play games by disturbed",play 4 | "what's the word orange mean",qa 5 | "find me all mails from magda with holidays word in the title",email 6 | "get a cup of coffee ready now",iot 7 | "good morning olly",general -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/MedrxivClusteringS2S.csv: -------------------------------------------------------------------------------- 1 | text,label 2 | "Evaluating the Efficacy of New Therapeutic Agents in the Management of Hypertension-Induced Kidney Damage",nephrology 3 | "Exploring the Relationship Between ICU Staffing Levels and Patient Outcomes in Severe Trauma Cases",intensive care and critical care medicine 4 | "The Impact of Environmental Allergens on Pediatric Asthma and Ear Infections",otolaryngology 5 | "Patient-Reported Outcomes in Rehabilitation: The Importance of Psychosocial Factors in Recovery",rehabilitation medicine and physical therapy 6 | "The Role of Micronutrients in Supporting Immune Function During Viral Infections",nutrition -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/MindSmallReranking.csv: 
-------------------------------------------------------------------------------- 1 | query,pos 2 | "'Wheel Of Fortune' Guest Delivers Hilarious, Off The Rails Introduction","Charles Rogers, former Michigan State football, Detroit Lions star, dead at 38" 3 | "Eliud Kipchoge runs 1:59 marathon, first to break 2 hours","AP-NORC poll: Many youths say high school diploma is enough" -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/QuoraRetrieval.csv: -------------------------------------------------------------------------------- 1 | query,pos 2 | "Why do people say Dhanush (South Indian actor) is ugly? I don't think so.?","Why do people say Dhanush (South Indian actor) is ugly? I don't think so?" 3 | "What are some hit and nice ideas about architecture dissertation topics?","What are some interesting undergraduate architecture thesis topics?" 4 | "Could someone please motivate me?","Can you motivate me?" -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/RedditClustering.csv: -------------------------------------------------------------------------------- 1 | text,label 2 | "Financial Meltdown: Strategies for Surviving Economic Collapse",collapse.txt 3 | "Exclusive Comic Book Sale: Don't Miss Out on January 13th!",comicbooks.txt 4 | "Tchaikovsky's Untold Story: The Mystery Behind Symphony No. 7",classicalmusic.txt 5 | "Coffee Addiction: When It's More Than Just a Drink",Coffee.txt -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/SICK-R.csv: -------------------------------------------------------------------------------- 1 | sent1,sent2 2 | "The cat is lounging on the sunny windowsill.","The feline is resting on the sunny windowsill." 3 | "A woman is reading a book while sitting on a bench.","A lady is reading a book while seated on a bench." 4 | "The child is drawing with crayons on a piece of paper.","The kid is using crayons to draw on a sheet of paper." -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/STS12.csv: -------------------------------------------------------------------------------- 1 | sent1,sent2 2 | "A man is dancing on the ceiling.","A man is dancing on the ceiling of a room." 3 | "That is a shameful state of affairs when we consider that the EU itself is a champion of modernised business practice.","It is a shame when it is thought that the European Union is posed as a champion modernization of the economic life!" 4 | "Spain has done a magnificent job in turning round the difficult neighbourly relations which Europe and North Africa and Spain and Morocco have suffered during the course of history.","Spain has developed a remarkably positive the difficult neighbourhood which has always existed between Europe and North Africa and between Spain and Morocco." -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/STS13.csv: -------------------------------------------------------------------------------- 1 | sent1,sent2 2 | "the state of being exposed to danger or harm","the condition of being at risk of injury or loss." 3 | "a set of instructions for a computer","directions given to a computer to perform a specific task." 4 | "a building used for public worship","a place where people gather to worship collectively." 
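The sentence-pair files above (BIOSSES, SICK-R, STS12, STS13) simply hold `sent1,sent2` columns. As a rough illustration of how an embedder scores such a pair, the sketch below encodes one pair from STS13.csv with the `FlagModel` class exported by `FlagEmbedding.inference` later in this dump; the checkpoint name and the exact `encode()` signature are assumptions rather than facts taken from these files.

```python
# Hedged sketch: cosine similarity of one sent1/sent2 pair from STS13.csv.
import numpy as np
from FlagEmbedding.inference import FlagModel

model = FlagModel("BAAI/bge-base-en-v1.5")  # assumed checkpoint name
sent1 = "a set of instructions for a computer"
sent2 = "directions given to a computer to perform a specific task."
emb1, emb2 = model.encode([sent1, sent2])  # assumed: one vector per input text
score = float(np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2)))
print(f"cosine similarity: {score:.4f}")
```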
-------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/STS14.csv: -------------------------------------------------------------------------------- 1 | sent1,sent2 2 | "president obama vows to work with congress on immigration reform .","obama pledges to collaborate with congress on immigration overhaul ." 3 | "britain votes to leave european union .","uk votes to leave eu ." 4 | "russian president putin signs law banning adoption of russian children by u.s. citizens .","putin bans u.s. adoptions of russian children ." -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/STS15.csv: -------------------------------------------------------------------------------- 1 | sent1,sent2 2 | "The battery and bulb A are not in the same path","Bulb A and the battery are not in the same circuit." 3 | "Switch Y and bulb B are in the same loop","Switch Y and bulb B belong to the same circuit." 4 | "new york city marathon canceled due to hurricane sandy","nyc marathon canceled because of hurricane sandy" 5 | "pope francis calls for peace in syria during sunday address","pope francis appeals for peace in syria in his sunday speech" -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/STS16.csv: -------------------------------------------------------------------------------- 1 | sent1,sent2 2 | "what are the symptoms of a heart attack ?","what are the signs of a heart attack ?" 3 | "how do i change a flat tire on my car ?","what steps should i take to replace a flat tire ?" 4 | "how do i cook a medium rare steak ?","what's the best way to prepare a steak to medium rare ?" -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/STS17.csv: -------------------------------------------------------------------------------- 1 | sent1,sent2 2 | "The sun is setting over the mountains.", "The sun sets behind the mountains." 3 | "A child is playing with a red ball.", "A kid plays with a red ball." 4 | "Two people are sitting on a bench in the park.", "Two individuals are seated on a bench in the park." -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/STSBenchmark.csv: -------------------------------------------------------------------------------- 1 | sent1,sent2 2 | "Agribusiness: Mad cow disease found in California","USDA Confirms Case of Mad Cow Disease in California" 3 | "santos stated colombian police found the evidence in 2 computers discovered with slain rebel leader raul reyes. ","francisco santos stated that colombian police found the evidence on two computers discovered with raul reyes." 
4 | "US Attorney General Holder resigns","US Attorney general Eric Holder to resign" -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/SciDocsRR.csv: -------------------------------------------------------------------------------- 1 | query,pos 2 | "Intelligent Word-Based Spam Filter Detection Using Multi-Neural Networks","Efficient Harmful Email identification Using Neural Network" 3 | "Importance of sediments in understanding nutrient cyclings in lakes","Raphidiopsis mediterranea Skuja represents non-heterocytous life-cycle stages of Cylindrospermopsis raciborskii (Woloszynska) Seenayya et Subba Raju in Lake Kastoria (Greece), its type locality: Evidence by morphological and phylogenetic analysis" 4 | "Adult playfulness and its relationship to humour , subjective happiness and depression : A comparative study of Hong Kong and Mainland China","Rapid assessment of well-being: The Short Depression-Happiness Scale (SDHS)." 5 | "In depth performance evaluation of LTE-M for M2M communications","Simulating LTE Cellular Systems: An Open-Source Framework" 6 | "Marketing segmentation using support vector clustering","Support vector clustering" -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/SprintDuplicateQuestions.csv: -------------------------------------------------------------------------------- 1 | sent1,sent2 2 | "Kyocera duraforce pro international roaming settings","Make a call while roaming internationally - Kyocera DuraForce PRO" 3 | "Guide for connecting to the Sprint U301 USB mobile broadband","Turn automatic connections on or off - Sprint U301 USB Device Sprint 3G/4G Mobile Broadband" 4 | "What do you think is a reason that is preventing troubleshooting on my HTC One A9 related to issues to the mobile hotspots ?","Troubleshoot issues related to mobile hotspots and your HTC One A9" 5 | "Why has my Samsung Transform been freezing everytime I attempt to open up an app ?","Why is my Samsung Transform freezing or being unresponsive ?" 
6 | "What can I do to turn on Wi-Fi on the HTC One A9 ?","Turn on and connect to Wi-Fi - HTC One A9" -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/StackExchangeClustering.csv: -------------------------------------------------------------------------------- 1 | text,label 2 | "Recommendations for a lightweight Markdown editor with real-time collaboration features?",softwarerecs.stackexchange.com.txt 3 | "How to integrate external APIs with EOSIO blockchain applications?",eosio.stackexchange.com.txt 4 | "How to balance macros for effective fat loss and muscle retention?",fitness.stackexchange.com.txt 5 | "Can "amans" be used as a substantival participle in Latin?",latin.stackexchange.com.txt -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/StackOverflowDupQuestions.csv: -------------------------------------------------------------------------------- 1 | query,pos 2 | "How to handle onChange event in React when state changes programmatically?","React onChange event not firing when state is updated programmatically" 3 | "How to simulate a click event on a button using JavaScript?","JavaScript button click event simulation" 4 | "Python: How to run a function asynchronously using asyncio?","Asyncio: Running Python function asynchronously" -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/ToxicConversationsClassification.csv: -------------------------------------------------------------------------------- 1 | text,label 2 | "Pull your little head out of your big ass areola!","toxic" 3 | "Trudeau will survive this alright as, unfortunately, Canadians are amongst the most dumbed down on the earth thanks to no real media other than his lapdogs.","toxic" 4 | "No evidence of one ever starting a fire. Better ban them.","not toxic" 5 | "What's this about Andrew Jackson and the civil war? And now he is saying he would be "honored" to meet with NK dictator Kim Jung Un. Is the pressure of being president getting to him? I am worried.","not toxic" -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/TweetSentimentExtractionClassification.csv: -------------------------------------------------------------------------------- 1 | text,label 2 | "I`d have responded, if I were going",neutral 3 | "what interview! leave me alone",negative 4 | "2am feedings for the baby are fun when he is all smiles and coos",positive 5 | "is cleaning the house for her family who is comming later today..",neutral 6 | "Sick. 
With a flu like thing.",negative 7 | "We saw that in none 3D - the baddie`s the best",positive -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/TwentyNewsgroupsClustering.csv: -------------------------------------------------------------------------------- 1 | text,label 2 | "Major flaw discovered in widely-used encryption protocol",sci.crypt 3 | "Bruins' Unstoppable Winning Streak",rec.sport.hockey 4 | "Troubleshooting a Digital Multimeter Calibration Issue",sci.electronics 5 | "Understanding DPI Scaling in X Window Systems",comp.windows.x -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/TwitterSemEval2015.csv: -------------------------------------------------------------------------------- 1 | sent1,sent2 2 | "Excited for the new Game of Thrones episode tonight!","Can't wait for tonight's Game of Thrones episode!" 3 | "Just finished a 5k run and feel amazing!","Completed a 5k run and I'm feeling great!" 4 | "Had an incredible dinner at Joe's Italian Restaurant.","Joe's Italian Restaurant served an amazing dinner tonight." 5 | "I need a vacation. Can't wait to hit the beach.","Desperately need a holiday. Looking forward to beach time." 6 | "The new iPhone has some fantastic features!","Loving the features on the new iPhone!" -------------------------------------------------------------------------------- /FlagEmbedding/evaluation/mteb/examples/TwitterURLCorpus.csv: -------------------------------------------------------------------------------- 1 | sent1,sent2 2 | "Elon Musk says Tesla will be profitable next quarter.","Elon Musk claims Tesla will turn a profit next quarter." 3 | "The new iPhone just got announced and it's amazing.","Apple just unveiled the new iPhone and it's incredible." 4 | "Beyoncé's new album has topped the charts in its first week.","Beyoncé's latest album debuted at number one on the charts." 5 | "Breaking: Major earthquake hits California.","Just in: Large earthquake strikes California." 6 | "NASA plans to send humans to Mars by 2030.","NASA aims to have astronauts on Mars by the year 2030." 
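The CSVs above form the `examples` directory that the `examples_path` field of `MTEBEvalArgs` points at, and the evaluation is normally launched through the `__main__.py` entry point shown earlier in this package. The sketch below drives the same runner programmatically; the `--embedder_name_or_path` flag is an assumption (the fields of `MTEBEvalModelArgs`/`AbsEvalModelArgs` are not reproduced in this dump), and additional required arguments may apply.

```python
# Hedged sketch: run the MTEB evaluation without the command-line entry point.
from transformers import HfArgumentParser
from FlagEmbedding.evaluation.mteb import MTEBEvalArgs, MTEBEvalModelArgs, MTEBEvalRunner

parser = HfArgumentParser((MTEBEvalArgs, MTEBEvalModelArgs))
eval_args, model_args = parser.parse_args_into_dataclasses(args=[
    "--tasks", "STS12", "Banking77Classification",
    "--languages", "eng",
    "--examples_path", "FlagEmbedding/evaluation/mteb/examples",
    "--embedder_name_or_path", "BAAI/bge-base-en-v1.5",  # assumed flag name
])

runner = MTEBEvalRunner(eval_args=eval_args, model_args=model_args)
runner.run()
```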
-------------------------------------------------------------------------------- /FlagEmbedding/finetune/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/FlagEmbedding/finetune/__init__.py -------------------------------------------------------------------------------- /FlagEmbedding/finetune/embedder/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/FlagEmbedding/finetune/embedder/__init__.py -------------------------------------------------------------------------------- /FlagEmbedding/finetune/embedder/decoder_only/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/FlagEmbedding/finetune/embedder/decoder_only/__init__.py -------------------------------------------------------------------------------- /FlagEmbedding/finetune/embedder/decoder_only/base/__init__.py: -------------------------------------------------------------------------------- 1 | from FlagEmbedding.abc.finetune.embedder import ( 2 | AbsEmbedderDataArguments as DecoderOnlyEmbedderDataArguments, 3 | AbsEmbedderTrainingArguments as DecoderOnlyEmbedderTrainingArguments, 4 | ) 5 | 6 | from .arguments import DecoderOnlyEmbedderModelArguments 7 | from .modeling import BiDecoderOnlyEmbedderModel 8 | from .trainer import DecoderOnlyEmbedderTrainer 9 | from .runner import DecoderOnlyEmbedderRunner 10 | 11 | __all__ = [ 12 | 'DecoderOnlyEmbedderDataArguments', 13 | 'DecoderOnlyEmbedderTrainingArguments', 14 | 'DecoderOnlyEmbedderModelArguments', 15 | 'BiDecoderOnlyEmbedderModel', 16 | 'DecoderOnlyEmbedderTrainer', 17 | 'DecoderOnlyEmbedderRunner', 18 | ] 19 | -------------------------------------------------------------------------------- /FlagEmbedding/finetune/embedder/decoder_only/base/__main__.py: -------------------------------------------------------------------------------- 1 | from transformers import HfArgumentParser 2 | 3 | from FlagEmbedding.finetune.embedder.decoder_only.base import ( 4 | DecoderOnlyEmbedderDataArguments, 5 | DecoderOnlyEmbedderTrainingArguments, 6 | DecoderOnlyEmbedderModelArguments, 7 | DecoderOnlyEmbedderRunner, 8 | ) 9 | 10 | 11 | def main(): 12 | parser = HfArgumentParser(( 13 | DecoderOnlyEmbedderModelArguments, 14 | DecoderOnlyEmbedderDataArguments, 15 | DecoderOnlyEmbedderTrainingArguments 16 | )) 17 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 18 | model_args: DecoderOnlyEmbedderModelArguments 19 | data_args: DecoderOnlyEmbedderDataArguments 20 | training_args: DecoderOnlyEmbedderTrainingArguments 21 | 22 | runner = DecoderOnlyEmbedderRunner( 23 | model_args=model_args, 24 | data_args=data_args, 25 | training_args=training_args 26 | ) 27 | runner.run() 28 | 29 | 30 | if __name__ == "__main__": 31 | main() 32 | -------------------------------------------------------------------------------- /FlagEmbedding/finetune/embedder/decoder_only/icl/__init__.py: -------------------------------------------------------------------------------- 1 | from FlagEmbedding.abc.finetune.embedder import ( 2 | AbsEmbedderTrainingArguments as DecoderOnlyEmbedderICLTrainingArguments, 3 | ) 4 | 5 | from .arguments import ( 6 | DecoderOnlyEmbedderICLModelArguments, 7 | 
DecoderOnlyEmbedderICLDataArguments 8 | ) 9 | from .dataset import ( 10 | DecoderOnlyEmbedderICLSameDatasetTrainDataset, 11 | AbsEmbedderSameDatasetCollator 12 | ) 13 | from .modeling import BiDecoderOnlyEmbedderICLModel 14 | from .trainer import DecoderOnlyEmbedderICLTrainer 15 | from .runner import DecoderOnlyEmbedderICLRunner 16 | 17 | __all__ = [ 18 | 'DecoderOnlyEmbedderICLModelArguments', 19 | 'DecoderOnlyEmbedderICLDataArguments', 20 | 'DecoderOnlyEmbedderICLTrainingArguments', 21 | 'BiDecoderOnlyEmbedderICLModel', 22 | 'DecoderOnlyEmbedderICLTrainer', 23 | 'DecoderOnlyEmbedderICLRunner', 24 | ] 25 | -------------------------------------------------------------------------------- /FlagEmbedding/finetune/embedder/decoder_only/icl/__main__.py: -------------------------------------------------------------------------------- 1 | from transformers import HfArgumentParser 2 | 3 | from FlagEmbedding.finetune.embedder.decoder_only.icl import ( 4 | DecoderOnlyEmbedderICLDataArguments, 5 | DecoderOnlyEmbedderICLTrainingArguments, 6 | DecoderOnlyEmbedderICLModelArguments, 7 | DecoderOnlyEmbedderICLRunner, 8 | ) 9 | 10 | 11 | def main(): 12 | parser = HfArgumentParser(( 13 | DecoderOnlyEmbedderICLModelArguments, 14 | DecoderOnlyEmbedderICLDataArguments, 15 | DecoderOnlyEmbedderICLTrainingArguments 16 | )) 17 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 18 | model_args: DecoderOnlyEmbedderICLModelArguments 19 | data_args: DecoderOnlyEmbedderICLDataArguments 20 | training_args: DecoderOnlyEmbedderICLTrainingArguments 21 | 22 | runner = DecoderOnlyEmbedderICLRunner( 23 | model_args=model_args, 24 | data_args=data_args, 25 | training_args=training_args 26 | ) 27 | runner.run() 28 | 29 | 30 | if __name__ == "__main__": 31 | main() 32 | -------------------------------------------------------------------------------- /FlagEmbedding/finetune/embedder/encoder_only/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/FlagEmbedding/finetune/embedder/encoder_only/__init__.py -------------------------------------------------------------------------------- /FlagEmbedding/finetune/embedder/encoder_only/base/__init__.py: -------------------------------------------------------------------------------- 1 | from FlagEmbedding.abc.finetune.embedder import ( 2 | AbsEmbedderModelArguments as EncoderOnlyEmbedderModelArguments, 3 | AbsEmbedderDataArguments as EncoderOnlyEmbedderDataArguments, 4 | AbsEmbedderTrainingArguments as EncoderOnlyEmbedderTrainingArguments, 5 | ) 6 | 7 | from .modeling import BiEncoderOnlyEmbedderModel 8 | from .trainer import EncoderOnlyEmbedderTrainer 9 | from .runner import EncoderOnlyEmbedderRunner 10 | 11 | __all__ = [ 12 | 'EncoderOnlyEmbedderModelArguments', 13 | 'EncoderOnlyEmbedderDataArguments', 14 | 'EncoderOnlyEmbedderTrainingArguments', 15 | 'BiEncoderOnlyEmbedderModel', 16 | 'EncoderOnlyEmbedderTrainer', 17 | 'EncoderOnlyEmbedderRunner', 18 | ] 19 | -------------------------------------------------------------------------------- /FlagEmbedding/finetune/embedder/encoder_only/base/__main__.py: -------------------------------------------------------------------------------- 1 | from transformers import HfArgumentParser 2 | 3 | from FlagEmbedding.finetune.embedder.encoder_only.base import ( 4 | EncoderOnlyEmbedderDataArguments, 5 | EncoderOnlyEmbedderTrainingArguments, 6 | EncoderOnlyEmbedderModelArguments, 7 | 
EncoderOnlyEmbedderRunner, 8 | ) 9 | 10 | 11 | def main(): 12 | parser = HfArgumentParser(( 13 | EncoderOnlyEmbedderModelArguments, 14 | EncoderOnlyEmbedderDataArguments, 15 | EncoderOnlyEmbedderTrainingArguments 16 | )) 17 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 18 | model_args: EncoderOnlyEmbedderModelArguments 19 | data_args: EncoderOnlyEmbedderDataArguments 20 | training_args: EncoderOnlyEmbedderTrainingArguments 21 | 22 | runner = EncoderOnlyEmbedderRunner( 23 | model_args=model_args, 24 | data_args=data_args, 25 | training_args=training_args 26 | ) 27 | runner.run() 28 | 29 | 30 | if __name__ == "__main__": 31 | main() 32 | -------------------------------------------------------------------------------- /FlagEmbedding/finetune/embedder/encoder_only/m3/__init__.py: -------------------------------------------------------------------------------- 1 | from FlagEmbedding.abc.finetune.embedder import AbsEmbedderDataArguments as EncoderOnlyEmbedderM3DataArguments 2 | 3 | from .arguments import EncoderOnlyEmbedderM3ModelArguments, EncoderOnlyEmbedderM3TrainingArguments 4 | from .modeling import EncoderOnlyEmbedderM3Model, EncoderOnlyEmbedderM3ModelForInference 5 | from .trainer import EncoderOnlyEmbedderM3Trainer 6 | from .runner import EncoderOnlyEmbedderM3Runner 7 | 8 | 9 | __all__ = [ 10 | 'EncoderOnlyEmbedderM3ModelArguments', 11 | 'EncoderOnlyEmbedderM3DataArguments', 12 | 'EncoderOnlyEmbedderM3TrainingArguments', 13 | 'EncoderOnlyEmbedderM3Model', 14 | 'EncoderOnlyEmbedderM3ModelForInference', 15 | 'EncoderOnlyEmbedderM3Trainer', 16 | 'EncoderOnlyEmbedderM3Runner', 17 | ] 18 | -------------------------------------------------------------------------------- /FlagEmbedding/finetune/embedder/encoder_only/m3/__main__.py: -------------------------------------------------------------------------------- 1 | from transformers import HfArgumentParser 2 | 3 | from FlagEmbedding.finetune.embedder.encoder_only.m3 import ( 4 | EncoderOnlyEmbedderM3DataArguments, 5 | EncoderOnlyEmbedderM3TrainingArguments, 6 | EncoderOnlyEmbedderM3ModelArguments, 7 | EncoderOnlyEmbedderM3Runner, 8 | ) 9 | 10 | 11 | def main(): 12 | parser = HfArgumentParser((EncoderOnlyEmbedderM3ModelArguments, EncoderOnlyEmbedderM3DataArguments, EncoderOnlyEmbedderM3TrainingArguments)) 13 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 14 | model_args: EncoderOnlyEmbedderM3ModelArguments 15 | data_args: EncoderOnlyEmbedderM3DataArguments 16 | training_args: EncoderOnlyEmbedderM3TrainingArguments 17 | 18 | runner = EncoderOnlyEmbedderM3Runner( 19 | model_args=model_args, 20 | data_args=data_args, 21 | training_args=training_args 22 | ) 23 | runner.run() 24 | 25 | 26 | if __name__ == "__main__": 27 | main() 28 | -------------------------------------------------------------------------------- /FlagEmbedding/finetune/embedder/encoder_only/m3/arguments.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | from FlagEmbedding.abc.finetune.embedder import ( 4 | AbsEmbedderTrainingArguments, 5 | AbsEmbedderModelArguments 6 | ) 7 | 8 | 9 | @dataclass 10 | class EncoderOnlyEmbedderM3ModelArguments(AbsEmbedderModelArguments): 11 | """ 12 | Model argument class for M3. 
13 | """ 14 | colbert_dim: int = field(default=-1, metadata={"help": "Dim of colbert linear"}) 15 | 16 | 17 | @dataclass 18 | class EncoderOnlyEmbedderM3TrainingArguments(AbsEmbedderTrainingArguments): 19 | """ 20 | Training argument class for M3. 21 | """ 22 | unified_finetuning: bool = field(default=False, metadata={"help": "use unify fine-tuning"}) 23 | use_self_distill: bool = field(default=False, metadata={"help": "use self-distill when using unify fine-tuning"}) 24 | fix_encoder: bool = field(default=False, metadata={"help": "Freeze the parameters of encoder"}) 25 | self_distill_start_step: int = field(default=-1, metadata={"help": "Num of step when using self-distill"}) 26 | -------------------------------------------------------------------------------- /FlagEmbedding/finetune/reranker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/FlagEmbedding/finetune/reranker/__init__.py -------------------------------------------------------------------------------- /FlagEmbedding/finetune/reranker/decoder_only/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/FlagEmbedding/finetune/reranker/decoder_only/__init__.py -------------------------------------------------------------------------------- /FlagEmbedding/finetune/reranker/decoder_only/base/__init__.py: -------------------------------------------------------------------------------- 1 | from .modeling import CrossDecoderModel 2 | from .runner import DecoderOnlyRerankerRunner 3 | from .arguments import RerankerModelArguments 4 | from .trainer import DecoderOnlyRerankerTrainer 5 | 6 | __all__ = [ 7 | "CrossDecoderModel", 8 | "DecoderOnlyRerankerRunner", 9 | "DecoderOnlyRerankerTrainer", 10 | "RerankerModelArguments", 11 | ] 12 | -------------------------------------------------------------------------------- /FlagEmbedding/finetune/reranker/decoder_only/base/__main__.py: -------------------------------------------------------------------------------- 1 | from transformers import HfArgumentParser 2 | 3 | from FlagEmbedding.abc.finetune.reranker import ( 4 | AbsRerankerDataArguments, 5 | AbsRerankerTrainingArguments 6 | ) 7 | 8 | from FlagEmbedding.finetune.reranker.decoder_only.base import ( 9 | DecoderOnlyRerankerRunner, 10 | RerankerModelArguments 11 | ) 12 | 13 | 14 | def main(): 15 | parser = HfArgumentParser((RerankerModelArguments, AbsRerankerDataArguments, AbsRerankerTrainingArguments)) 16 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 17 | model_args: RerankerModelArguments 18 | data_args: AbsRerankerDataArguments 19 | training_args: AbsRerankerTrainingArguments 20 | 21 | runner = DecoderOnlyRerankerRunner( 22 | model_args=model_args, 23 | data_args=data_args, 24 | training_args=training_args 25 | ) 26 | runner.run() 27 | 28 | 29 | if __name__ == "__main__": 30 | main() 31 | -------------------------------------------------------------------------------- /FlagEmbedding/finetune/reranker/decoder_only/layerwise/__init__.py: -------------------------------------------------------------------------------- 1 | from .modeling import CrossDecoderModel 2 | from .runner import DecoderOnlyRerankerRunner 3 | from .arguments import RerankerModelArguments 4 | from .trainer import DecoderOnlyRerankerTrainer 5 | 6 | __all__ = [ 7 | 
"CrossDecoderModel", 8 | "DecoderOnlyRerankerRunner", 9 | "DecoderOnlyRerankerTrainer", 10 | "RerankerModelArguments", 11 | ] 12 | -------------------------------------------------------------------------------- /FlagEmbedding/finetune/reranker/decoder_only/layerwise/__main__.py: -------------------------------------------------------------------------------- 1 | from transformers import HfArgumentParser 2 | 3 | from FlagEmbedding.abc.finetune.reranker import ( 4 | AbsRerankerDataArguments, 5 | AbsRerankerTrainingArguments 6 | ) 7 | 8 | from FlagEmbedding.finetune.reranker.decoder_only.layerwise import ( 9 | DecoderOnlyRerankerRunner, 10 | RerankerModelArguments 11 | ) 12 | 13 | 14 | def main(): 15 | parser = HfArgumentParser((RerankerModelArguments, AbsRerankerDataArguments, AbsRerankerTrainingArguments)) 16 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 17 | model_args: RerankerModelArguments 18 | data_args: AbsRerankerDataArguments 19 | training_args: AbsRerankerTrainingArguments 20 | 21 | runner = DecoderOnlyRerankerRunner( 22 | model_args=model_args, 23 | data_args=data_args, 24 | training_args=training_args 25 | ) 26 | runner.run() 27 | 28 | 29 | if __name__ == "__main__": 30 | main() 31 | -------------------------------------------------------------------------------- /FlagEmbedding/finetune/reranker/encoder_only/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/FlagEmbedding/finetune/reranker/encoder_only/__init__.py -------------------------------------------------------------------------------- /FlagEmbedding/finetune/reranker/encoder_only/base/__init__.py: -------------------------------------------------------------------------------- 1 | from .modeling import CrossEncoderModel 2 | from .runner import EncoderOnlyRerankerRunner 3 | from .trainer import EncoderOnlyRerankerTrainer 4 | 5 | __all__ = [ 6 | "CrossEncoderModel", 7 | "EncoderOnlyRerankerRunner", 8 | "EncoderOnlyRerankerTrainer" 9 | ] 10 | -------------------------------------------------------------------------------- /FlagEmbedding/finetune/reranker/encoder_only/base/__main__.py: -------------------------------------------------------------------------------- 1 | from transformers import HfArgumentParser 2 | 3 | from FlagEmbedding.abc.finetune.reranker import ( 4 | AbsRerankerModelArguments, 5 | AbsRerankerDataArguments, 6 | AbsRerankerTrainingArguments 7 | ) 8 | from FlagEmbedding.finetune.reranker.encoder_only.base import EncoderOnlyRerankerRunner 9 | 10 | 11 | def main(): 12 | parser = HfArgumentParser((AbsRerankerModelArguments, AbsRerankerDataArguments, AbsRerankerTrainingArguments)) 13 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 14 | model_args: AbsRerankerModelArguments 15 | data_args: AbsRerankerDataArguments 16 | training_args: AbsRerankerTrainingArguments 17 | 18 | runner = EncoderOnlyRerankerRunner( 19 | model_args=model_args, 20 | data_args=data_args, 21 | training_args=training_args 22 | ) 23 | runner.run() 24 | 25 | 26 | if __name__ == "__main__": 27 | main() 28 | -------------------------------------------------------------------------------- /FlagEmbedding/inference/__init__.py: -------------------------------------------------------------------------------- 1 | from .auto_embedder import FlagAutoModel 2 | from .auto_reranker import FlagAutoReranker 3 | from .embedder import ( 4 | FlagModel, 
BGEM3FlagModel, 5 | FlagICLModel, FlagLLMModel, 6 | EmbedderModelClass 7 | ) 8 | from .reranker import ( 9 | FlagReranker, 10 | FlagLLMReranker, LayerWiseFlagLLMReranker, LightWeightFlagLLMReranker, 11 | RerankerModelClass 12 | ) 13 | 14 | 15 | __all__ = [ 16 | "FlagAutoModel", 17 | "FlagAutoReranker", 18 | "EmbedderModelClass", 19 | "RerankerModelClass", 20 | "FlagModel", 21 | "BGEM3FlagModel", 22 | "FlagICLModel", 23 | "FlagLLMModel", 24 | "FlagReranker", 25 | "FlagLLMReranker", 26 | "LayerWiseFlagLLMReranker", 27 | "LightWeightFlagLLMReranker", 28 | ] 29 | -------------------------------------------------------------------------------- /FlagEmbedding/inference/embedder/__init__.py: -------------------------------------------------------------------------------- 1 | from .encoder_only import FlagModel, BGEM3FlagModel 2 | from .decoder_only import FlagICLModel, FlagLLMModel 3 | from .model_mapping import EmbedderModelClass 4 | 5 | __all__ = [ 6 | "FlagModel", 7 | "BGEM3FlagModel", 8 | "FlagICLModel", 9 | "FlagLLMModel", 10 | "EmbedderModelClass", 11 | ] 12 | -------------------------------------------------------------------------------- /FlagEmbedding/inference/embedder/decoder_only/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLLMEmbedder as FlagLLMModel 2 | from .icl import ICLLLMEmbedder as FlagICLModel 3 | 4 | __all__ = [ 5 | "FlagLLMModel", 6 | "FlagICLModel", 7 | ] 8 | -------------------------------------------------------------------------------- /FlagEmbedding/inference/embedder/encoder_only/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseEmbedder as FlagModel 2 | from .m3 import M3Embedder as BGEM3FlagModel 3 | 4 | __all__ = [ 5 | "FlagModel", 6 | "BGEM3FlagModel", 7 | ] 8 | -------------------------------------------------------------------------------- /FlagEmbedding/inference/reranker/__init__.py: -------------------------------------------------------------------------------- 1 | from .decoder_only import FlagLLMReranker, LayerWiseFlagLLMReranker, LightWeightFlagLLMReranker 2 | from .encoder_only import FlagReranker 3 | from .model_mapping import RerankerModelClass 4 | 5 | __all__ = [ 6 | "FlagReranker", 7 | "FlagLLMReranker", 8 | "LayerWiseFlagLLMReranker", 9 | "LightWeightFlagLLMReranker", 10 | "RerankerModelClass", 11 | ] 12 | -------------------------------------------------------------------------------- /FlagEmbedding/inference/reranker/decoder_only/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseLLMReranker as FlagLLMReranker 2 | from .layerwise import LayerWiseLLMReranker as LayerWiseFlagLLMReranker 3 | from .lightweight import LightweightLLMReranker as LightWeightFlagLLMReranker 4 | 5 | __all__ = [ 6 | "FlagLLMReranker", 7 | "LayerWiseFlagLLMReranker", 8 | "LightWeightFlagLLMReranker" 9 | ] 10 | -------------------------------------------------------------------------------- /FlagEmbedding/inference/reranker/decoder_only/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/FlagEmbedding/inference/reranker/decoder_only/models/__init__.py -------------------------------------------------------------------------------- /FlagEmbedding/inference/reranker/encoder_only/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .base import BaseReranker as FlagReranker 2 | 3 | __all__ = [ 4 | "FlagReranker", 5 | ] 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 staoxiao 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Manifest.in: -------------------------------------------------------------------------------- 1 | # Include the entire directory and its contents 2 | recursive-include FlagEmbedding/FlagEmbedding/visual/eva_clip * 3 | 4 | # Include the specific file at the root level 5 | include bpe_simple_vocab_16e6.txt.gz 6 | 7 | # Include all JSON files inside the specified directory 8 | recursive-include FlagEmbedding/visual/eva_clip/model_configs *.json 9 | -------------------------------------------------------------------------------- /Tutorials/7_Fine-tuning/config/ds_stage0.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 0 4 | }, 5 | 6 | "fp16": { 7 | "enabled": "auto", 8 | "loss_scale": 0, 9 | "loss_scale_window": 1000, 10 | "initial_scale_power": 12, 11 | "hysteresis": 2, 12 | "min_loss_scale": 1 13 | }, 14 | 15 | "bf16": { 16 | "enabled": "auto" 17 | }, 18 | 19 | "optimizer": { 20 | "type": "AdamW", 21 | "params": { 22 | "lr": "auto", 23 | "betas": "auto", 24 | "eps": "auto", 25 | "weight_decay": "auto" 26 | } 27 | }, 28 | 29 | "scheduler": { 30 | "type": "WarmupDecayLR", 31 | "params": { 32 | "warmup_min_lr": "auto", 33 | "warmup_max_lr": "auto", 34 | "warmup_num_steps": "auto", 35 | "total_num_steps": "auto" 36 | } 37 | }, 38 | 39 | "gradient_accumulation_steps": "auto", 40 | "gradient_clipping": "auto", 41 | "steps_per_print": 100, 42 | "train_batch_size": "auto", 43 | "train_micro_batch_size_per_gpu": "auto", 44 | "wall_clock_breakdown": false 45 | } -------------------------------------------------------------------------------- /Tutorials/tutorial_map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/Tutorials/tutorial_map.png -------------------------------------------------------------------------------- 
/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | install the required pkgs: 2 | ``` 3 | pip install -r requirements.txt 4 | ``` 5 | 6 | 7 | to host the webpages locally: 8 | ``` 9 | python -m http.server 10 | ``` -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | myst-nb 3 | myst_parser 4 | sphinx-design 5 | pydata-sphinx-theme 6 | # furo -------------------------------------------------------------------------------- /docs/source/API/abc.rst: -------------------------------------------------------------------------------- 1 | Abstract Class 2 | ============== 3 | 4 | .. toctree:: 5 | abc/inference 6 | abc/evaluation 7 | abc/finetune -------------------------------------------------------------------------------- /docs/source/API/abc/evaluation.rst: -------------------------------------------------------------------------------- 1 | Evaluation 2 | ========== 3 | 4 | .. toctree:: 5 | evaluation/arguments 6 | evaluation/data_loader 7 | evaluation/searcher 8 | evaluation/evaluator 9 | evaluation/runner -------------------------------------------------------------------------------- /docs/source/API/abc/evaluation/arguments.rst: -------------------------------------------------------------------------------- 1 | Arguments 2 | ========= 3 | 4 | .. autoclass:: FlagEmbedding.abc.evaluation.AbsEvalArgs 5 | 6 | 7 | .. 
autoclass:: FlagEmbedding.abc.evaluation.AbsEvalModelArgs -------------------------------------------------------------------------------- /docs/source/API/abc/evaluation/evaluator.rst: -------------------------------------------------------------------------------- 1 | Evaluator 2 | ========= 3 | 4 | .. autoclass:: FlagEmbedding.abc.evaluation.AbsEvaluator -------------------------------------------------------------------------------- /docs/source/API/abc/evaluation/runner.rst: -------------------------------------------------------------------------------- 1 | runner 2 | ====== 3 | 4 | .. autoclass:: FlagEmbedding.abc.evaluation.AbsEvalRunner -------------------------------------------------------------------------------- /docs/source/API/abc/evaluation/searcher.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | searcher 3 | ======== 4 | 5 | EvalRetriever 6 | ============= 7 | 8 | .. autoclass:: FlagEmbedding.abc.evaluation.EvalRetriever 9 | 10 | EvalDenseRetriever 11 | ================== 12 | 13 | .. autoclass:: FlagEmbedding.abc.evaluation.EvalDenseRetriever 14 | 15 | EvalReranker 16 | ============ 17 | 18 | .. autoclass:: FlagEmbedding.abc.evaluation.EvalReranker -------------------------------------------------------------------------------- /docs/source/API/abc/finetune.rst: -------------------------------------------------------------------------------- 1 | Finetune 2 | ======== 3 | 4 | .. toctree:: 5 | finetune/embedder 6 | finetune/reranker -------------------------------------------------------------------------------- /docs/source/API/abc/finetune/embedder.rst: -------------------------------------------------------------------------------- 1 | Embedder 2 | ======== 3 | 4 | .. toctree:: 5 | embedder/AbsArguments 6 | embedder/AbsDataset 7 | embedder/AbsModeling 8 | embedder/AbsTrainer 9 | embedder/AbsRunner -------------------------------------------------------------------------------- /docs/source/API/abc/finetune/embedder/AbsArguments.rst: -------------------------------------------------------------------------------- 1 | AbsArguments 2 | ============ 3 | 4 | .. autoclass:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderModelArguments 5 | 6 | .. autoclass:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderDataArguments 7 | -------------------------------------------------------------------------------- /docs/source/API/abc/finetune/embedder/AbsRunner.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | AbsRunner 3 | ========= 4 | 5 | AbsEmbedderRunner 6 | ================= 7 | 8 | .. autoclass:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderRunner 9 | 10 | Methods 11 | ------- 12 | 13 | .. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderRunner.load_tokenizer_and_model 14 | 15 | .. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderRunner.load_trainer 16 | 17 | .. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderRunner.load_train_dataset 18 | 19 | .. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderRunner.load_data_collator 20 | 21 | .. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderRunner.run -------------------------------------------------------------------------------- /docs/source/API/abc/finetune/embedder/AbsTrainer.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | AbsTrainer 3 | ========== 4 | 5 | AbsEmbedderTrainer 6 | ================== 7 | 8 | ..
autoclass:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderTrainer 9 | 10 | Methods 11 | ------- 12 | 13 | .. automethod:: FlagEmbedding.abc.finetune.embedder.AbsEmbedderTrainer.compute_loss -------------------------------------------------------------------------------- /docs/source/API/abc/finetune/reranker.rst: -------------------------------------------------------------------------------- 1 | Reranker 2 | ======== 3 | 4 | .. toctree:: 5 | reranker/AbsArguments 6 | reranker/AbsDataset 7 | reranker/AbsModeling 8 | reranker/AbsTrainer 9 | reranker/AbsRunner -------------------------------------------------------------------------------- /docs/source/API/abc/finetune/reranker/AbsArguments.rst: -------------------------------------------------------------------------------- 1 | AbsArguments 2 | ============ 3 | 4 | .. autoclass:: FlagEmbedding.abc.finetune.reranker.AbsRerankerModelArguments 5 | 6 | .. autoclass:: FlagEmbedding.abc.finetune.reranker.AbsRerankerDataArguments 7 | -------------------------------------------------------------------------------- /docs/source/API/abc/finetune/reranker/AbsDataset.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | AbsDataset 3 | ========== 4 | 5 | AbsRerankerTrainDataset 6 | ======================= 7 | 8 | .. autoclass:: FlagEmbedding.abc.finetune.reranker.AbsRerankerTrainDataset 9 | 10 | Methods 11 | ------- 12 | 13 | .. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerTrainDataset.create_one_example 14 | 15 | .. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerTrainDataset._load_dataset 16 | 17 | .. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerTrainDataset._shuffle_text 18 | 19 | AbsRerankerCollator 20 | =================== 21 | 22 | .. autoclass:: FlagEmbedding.abc.finetune.reranker.AbsRerankerCollator 23 | 24 | AbsLLMRerankerTrainDataset 25 | ========================== 26 | 27 | .. autoclass:: FlagEmbedding.abc.finetune.reranker.AbsLLMRerankerTrainDataset 28 | 29 | AbsLLMRerankerCollator 30 | ====================== 31 | 32 | .. autoclass:: FlagEmbedding.abc.finetune.reranker.AbsLLMRerankerCollator 33 | -------------------------------------------------------------------------------- /docs/source/API/abc/finetune/reranker/AbsModeling.rst: -------------------------------------------------------------------------------- 1 | =========== 2 | AbsModeling 3 | =========== 4 | 5 | AbsRerankerModel 6 | ================ 7 | 8 | .. autoclass:: FlagEmbedding.abc.finetune.reranker.AbsRerankerModel 9 | 10 | Methods 11 | ------- 12 | 13 | .. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerModel.encode 14 | 15 | .. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerModel.gradient_checkpointing_enable 16 | 17 | .. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerModel.enable_input_require_grads 18 | 19 | .. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerModel.forward 20 | 21 | .. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerModel.compute_loss 22 | 23 | .. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerModel.save 24 | 25 | .. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerModel.save_pretrained 26 | 27 | 28 | RerankerOutput 29 | ============== 30 | 31 | ..
autoclass:: FlagEmbedding.abc.finetune.reranker.RerankerOutput -------------------------------------------------------------------------------- /docs/source/API/abc/finetune/reranker/AbsRunner.rst: -------------------------------------------------------------------------------- 1 | ========= 2 | AbsRunner 3 | ========= 4 | 5 | AbsRerankerTrainer 6 | ================== 7 | 8 | .. autoclass:: FlagEmbedding.abc.finetune.reranker.AbsRerankerRunner 9 | 10 | Methods 11 | ------- 12 | 13 | .. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerRunner.load_tokenizer_and_model 14 | 15 | .. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerRunner.load_trainer 16 | 17 | .. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerRunner.load_train_dataset 18 | 19 | .. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerRunner.load_data_collator 20 | 21 | .. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerRunner.run -------------------------------------------------------------------------------- /docs/source/API/abc/finetune/reranker/AbsTrainer.rst: -------------------------------------------------------------------------------- 1 | ========== 2 | AbsTrainer 3 | ========== 4 | 5 | AbsRerankerTrainer 6 | ================== 7 | 8 | .. autoclass:: FlagEmbedding.abc.finetune.reranker.AbsRerankerTrainer 9 | 10 | Methods 11 | ------- 12 | 13 | .. automethod:: FlagEmbedding.abc.finetune.reranker.AbsRerankerTrainer.compute_loss -------------------------------------------------------------------------------- /docs/source/API/abc/inference.rst: -------------------------------------------------------------------------------- 1 | Inference 2 | ========= 3 | 4 | .. toctree:: 5 | inference/AbsEmbedder 6 | inference/AbsReranker -------------------------------------------------------------------------------- /docs/source/API/abc/inference/AbsEmbedder.rst: -------------------------------------------------------------------------------- 1 | AbsEmbedder 2 | =========== 3 | 4 | .. autoclass:: FlagEmbedding.abc.inference.AbsEmbedder 5 | 6 | Methods 7 | ------- 8 | 9 | .. automethod:: FlagEmbedding.abc.inference.AbsEmbedder.get_target_devices 10 | 11 | .. automethod:: FlagEmbedding.abc.inference.AbsEmbedder.get_detailed_instruct 12 | 13 | .. automethod:: FlagEmbedding.abc.inference.AbsEmbedder.encode_queries 14 | 15 | .. automethod:: FlagEmbedding.abc.inference.AbsEmbedder.encode_corpus 16 | 17 | .. automethod:: FlagEmbedding.abc.inference.AbsEmbedder.encode 18 | 19 | .. automethod:: FlagEmbedding.abc.inference.AbsEmbedder.encode_single_device 20 | 21 | .. automethod:: FlagEmbedding.abc.inference.AbsEmbedder.start_multi_process_pool 22 | 23 | .. automethod:: FlagEmbedding.abc.inference.AbsEmbedder._encode_multi_process_worker 24 | 25 | .. automethod:: FlagEmbedding.abc.inference.AbsEmbedder.stop_multi_process_pool 26 | 27 | .. automethod:: FlagEmbedding.abc.inference.AbsEmbedder.encode_multi_process 28 | 29 | .. automethod:: FlagEmbedding.abc.inference.AbsEmbedder._concatenate_results_from_multi_process -------------------------------------------------------------------------------- /docs/source/API/abc/inference/AbsReranker.rst: -------------------------------------------------------------------------------- 1 | AbsReranker 2 | =========== 3 | 4 | .. autoclass:: FlagEmbedding.abc.inference.AbsReranker 5 | 6 | Methods 7 | ------- 8 | 9 | .. automethod:: FlagEmbedding.abc.inference.AbsReranker.get_target_devices 10 | 11 | .. 
automethod:: FlagEmbedding.abc.inference.AbsReranker.get_detailed_instruct 12 | 13 | .. automethod:: FlagEmbedding.abc.inference.AbsReranker.get_detailed_inputs 14 | 15 | .. automethod:: FlagEmbedding.abc.inference.AbsReranker.compute_score 16 | 17 | .. automethod:: FlagEmbedding.abc.inference.AbsReranker.compute_score_single_gpu 18 | 19 | .. automethod:: FlagEmbedding.abc.inference.AbsReranker.start_multi_process_pool 20 | 21 | .. automethod:: FlagEmbedding.abc.inference.AbsReranker.encode_multi_process 22 | 23 | .. automethod:: FlagEmbedding.abc.inference.AbsReranker._encode_multi_process_worker 24 | 25 | .. automethod:: FlagEmbedding.abc.inference.AbsReranker.stop_multi_process_pool -------------------------------------------------------------------------------- /docs/source/API/evaluation.rst: -------------------------------------------------------------------------------- 1 | Evaluation 2 | ========== 3 | 4 | .. toctree:: 5 | evaluation/mteb 6 | evaluation/airbench 7 | evaluation/msmarco 8 | evaluation/beir 9 | evaluation/miracl 10 | evaluation/mkqa 11 | evaluation/mldr -------------------------------------------------------------------------------- /docs/source/API/evaluation/airbench/arguments.rst: -------------------------------------------------------------------------------- 1 | arguments 2 | ========= 3 | 4 | .. autoclass:: FlagEmbedding.evaluation.air_bench.AIRBenchEvalModelArgs -------------------------------------------------------------------------------- /docs/source/API/evaluation/airbench/runner.rst: -------------------------------------------------------------------------------- 1 | runner 2 | ====== 3 | 4 | .. autoclass:: FlagEmbedding.evaluation.air_bench.AIRBenchEvalRunner -------------------------------------------------------------------------------- /docs/source/API/evaluation/beir/arguments.rst: -------------------------------------------------------------------------------- 1 | arguments 2 | ========= 3 | 4 | .. autoclass:: FlagEmbedding.evaluation.beir.arguments.BEIREvalArgs -------------------------------------------------------------------------------- /docs/source/API/evaluation/beir/data_loader.rst: -------------------------------------------------------------------------------- 1 | data loader 2 | =========== 3 | 4 | .. autoclass:: FlagEmbedding.evaluation.beir.data_loader.BEIREvalDataLoader -------------------------------------------------------------------------------- /docs/source/API/evaluation/beir/evaluator.rst: -------------------------------------------------------------------------------- 1 | evaluator 2 | ========= 3 | 4 | .. autoclass:: FlagEmbedding.evaluation.beir.evaluator.BEIREvaluator -------------------------------------------------------------------------------- /docs/source/API/evaluation/beir/runner.rst: -------------------------------------------------------------------------------- 1 | runner 2 | ====== 3 | 4 | .. autoclass:: FlagEmbedding.evaluation.beir.BEIREvalRunner -------------------------------------------------------------------------------- /docs/source/API/evaluation/miracl/data_loader.rst: -------------------------------------------------------------------------------- 1 | data_loader 2 | =========== 3 | 4 | .. autoclass:: FlagEmbedding.evaluation.miracl.MIRACLEvalDataLoader 5 | 6 | Methods 7 | ------- 8 | 9 | .. automethod:: FlagEmbedding.evaluation.miracl.MIRACLEvalDataLoader.available_dataset_names 10 | .. automethod:: FlagEmbedding.evaluation.miracl.MIRACLEvalDataLoader.available_splits 11 | .. 
automethod:: FlagEmbedding.evaluation.miracl.MIRACLEvalDataLoader._load_remote_corpus 12 | .. automethod:: FlagEmbedding.evaluation.miracl.MIRACLEvalDataLoader._load_remote_qrels 13 | .. automethod:: FlagEmbedding.evaluation.miracl.MIRACLEvalDataLoader._load_remote_queries -------------------------------------------------------------------------------- /docs/source/API/evaluation/miracl/runner.rst: -------------------------------------------------------------------------------- 1 | runner 2 | ====== 3 | 4 | .. autoclass:: FlagEmbedding.evaluation.miracl.MIRACLEvalRunner 5 | :members: -------------------------------------------------------------------------------- /docs/source/API/evaluation/mkqa/data_loader.rst: -------------------------------------------------------------------------------- 1 | data_loader 2 | =========== 3 | 4 | .. autoclass:: FlagEmbedding.evaluation.mkqa.MKQAEvalDataLoader 5 | 6 | Methods 7 | ------- 8 | 9 | .. automethod:: FlagEmbedding.evaluation.mkqa.MKQAEvalDataLoader.available_dataset_names 10 | .. automethod:: FlagEmbedding.evaluation.mkqa.MKQAEvalDataLoader.available_splits 11 | .. automethod:: FlagEmbedding.evaluation.mkqa.MKQAEvalDataLoader.load_corpus 12 | .. automethod:: FlagEmbedding.evaluation.mkqa.MKQAEvalDataLoader._load_local_qrels 13 | .. automethod:: FlagEmbedding.evaluation.mkqa.MKQAEvalDataLoader._load_remote_corpus 14 | .. automethod:: FlagEmbedding.evaluation.mkqa.MKQAEvalDataLoader._load_remote_qrels 15 | .. automethod:: FlagEmbedding.evaluation.mkqa.MKQAEvalDataLoader._load_remote_queries -------------------------------------------------------------------------------- /docs/source/API/evaluation/mkqa/evaluator.rst: -------------------------------------------------------------------------------- 1 | evaluator 2 | ========= 3 | 4 | .. autoclass:: FlagEmbedding.evaluation.mkqa.MKQAEvaluator 5 | :members: -------------------------------------------------------------------------------- /docs/source/API/evaluation/mkqa/runner.rst: -------------------------------------------------------------------------------- 1 | runner 2 | ====== 3 | .. autoclass:: FlagEmbedding.evaluation.mkqa.MKQAEvalRunner 4 | :members: -------------------------------------------------------------------------------- /docs/source/API/evaluation/mldr/data_loader.rst: -------------------------------------------------------------------------------- 1 | data_loader 2 | =========== 3 | 4 | .. autoclass:: FlagEmbedding.evaluation.mldr.MLDREvalDataLoader 5 | 6 | Methods 7 | ------- 8 | 9 | .. automethod:: FlagEmbedding.evaluation.mldr.MLDREvalDataLoader.available_dataset_names 10 | .. automethod:: FlagEmbedding.evaluation.mldr.MLDREvalDataLoader.available_splits 11 | .. automethod:: FlagEmbedding.evaluation.mldr.MLDREvalDataLoader._load_remote_corpus 12 | .. automethod:: FlagEmbedding.evaluation.mldr.MLDREvalDataLoader._load_remote_qrels 13 | .. automethod:: FlagEmbedding.evaluation.mldr.MLDREvalDataLoader._load_remote_queries -------------------------------------------------------------------------------- /docs/source/API/evaluation/mldr/runner.rst: -------------------------------------------------------------------------------- 1 | runner 2 | ====== 3 | 4 | .. autoclass:: FlagEmbedding.evaluation.mldr.MLDREvalRunner 5 | :members: -------------------------------------------------------------------------------- /docs/source/API/evaluation/msmarco/data_loader.rst: -------------------------------------------------------------------------------- 1 | data_loader 2 | =========== 3 | 4 | .. 
autoclass:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalDataLoader 5 | 6 | Methods 7 | ------- 8 | 9 | .. automethod:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalDataLoader.available_dataset_names 10 | .. automethod:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalDataLoader.available_splits 11 | .. automethod:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalDataLoader._load_remote_corpus 12 | .. automethod:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalDataLoader._load_remote_qrels 13 | .. automethod:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalDataLoader._load_remote_queries -------------------------------------------------------------------------------- /docs/source/API/evaluation/msmarco/runner.rst: -------------------------------------------------------------------------------- 1 | runner 2 | ====== 3 | 4 | .. autoclass:: FlagEmbedding.evaluation.msmarco.MSMARCOEvalRunner 5 | :members: -------------------------------------------------------------------------------- /docs/source/API/evaluation/mteb/arguments.rst: -------------------------------------------------------------------------------- 1 | arguments 2 | ========= 3 | 4 | .. autoclass:: FlagEmbedding.evaluation.mteb.arguments.MTEBEvalArgs -------------------------------------------------------------------------------- /docs/source/API/evaluation/mteb/runner.rst: -------------------------------------------------------------------------------- 1 | runner 2 | ====== 3 | 4 | .. autoclass:: FlagEmbedding.evaluation.mteb.runner.MTEBEvalRunner -------------------------------------------------------------------------------- /docs/source/API/evaluation/mteb/searcher.rst: -------------------------------------------------------------------------------- 1 | searcher 2 | ======== 3 | 4 | .. autoclass:: FlagEmbedding.evaluation.mteb.searcher.MTEBEvalDenseRetriever 5 | 6 | .. autoclass:: FlagEmbedding.evaluation.mteb.searcher.MTEBEvalReranker -------------------------------------------------------------------------------- /docs/source/API/finetune.rst: -------------------------------------------------------------------------------- 1 | Finetune 2 | ======== 3 | 4 | .. toctree:: 5 | finetune/embedder 6 | finetune/reranker -------------------------------------------------------------------------------- /docs/source/API/finetune/embedder.rst: -------------------------------------------------------------------------------- 1 | Embedder 2 | ======== 3 | 4 | .. toctree:: 5 | embedder/encoder_only 6 | embedder/decoder_only -------------------------------------------------------------------------------- /docs/source/API/finetune/embedder/decoder_only.rst: -------------------------------------------------------------------------------- 1 | Decoder Only 2 | ============ 3 | 4 | .. toctree:: 5 | decoder_only/base 6 | decoder_only/icl -------------------------------------------------------------------------------- /docs/source/API/finetune/embedder/decoder_only/base.rst: -------------------------------------------------------------------------------- 1 | Base 2 | ==== 3 | 4 | .. toctree:: 5 | base/arguments 6 | base/modeling 7 | base/runner 8 | base/trainer -------------------------------------------------------------------------------- /docs/source/API/finetune/embedder/decoder_only/base/arguments.rst: -------------------------------------------------------------------------------- 1 | Arguments 2 | ========= 3 | 4 | .. 
autoclass:: FlagEmbedding.finetune.embedder.decoder_only.base.DecoderOnlyEmbedderModelArguments 5 | -------------------------------------------------------------------------------- /docs/source/API/finetune/embedder/decoder_only/base/modeling.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Modeling 3 | ======== 4 | 5 | .. autoclass:: FlagEmbedding.finetune.embedder.decoder_only.base.BiDecoderOnlyEmbedderModel 6 | 7 | Methods 8 | ======= 9 | 10 | .. automethod:: FlagEmbedding.finetune.embedder.decoder_only.base.BiDecoderOnlyEmbedderModel.encode 11 | .. automethod:: FlagEmbedding.finetune.embedder.decoder_only.base.BiDecoderOnlyEmbedderModel.compute_score 12 | .. automethod:: FlagEmbedding.finetune.embedder.decoder_only.base.BiDecoderOnlyEmbedderModel.compute_loss 13 | .. automethod:: FlagEmbedding.finetune.embedder.decoder_only.base.BiDecoderOnlyEmbedderModel.gradient_checkpointing_enable 14 | .. automethod:: FlagEmbedding.finetune.embedder.decoder_only.base.BiDecoderOnlyEmbedderModel.enable_input_require_grads 15 | .. automethod:: FlagEmbedding.finetune.embedder.decoder_only.base.BiDecoderOnlyEmbedderModel.save 16 | 17 | .. automethod:: FlagEmbedding.finetune.embedder.decoder_only.base.BiDecoderOnlyEmbedderModel._sentence_embedding 18 | .. automethod:: FlagEmbedding.finetune.embedder.decoder_only.base.BiDecoderOnlyEmbedderModel._compute_similarity 19 | -------------------------------------------------------------------------------- /docs/source/API/finetune/embedder/decoder_only/base/runner.rst: -------------------------------------------------------------------------------- 1 | Runner 2 | ====== 3 | 4 | .. autoclass:: FlagEmbedding.finetune.embedder.decoder_only.base.DecoderOnlyEmbedderRunner 5 | :members: -------------------------------------------------------------------------------- /docs/source/API/finetune/embedder/decoder_only/base/trainer.rst: -------------------------------------------------------------------------------- 1 | Trainer 2 | ======= 3 | 4 | .. autoclass:: FlagEmbedding.finetune.embedder.decoder_only.base.DecoderOnlyEmbedderTrainer 5 | :members: -------------------------------------------------------------------------------- /docs/source/API/finetune/embedder/decoder_only/icl.rst: -------------------------------------------------------------------------------- 1 | ICL 2 | === 3 | 4 | .. toctree:: 5 | icl/arguments 6 | icl/dataset 7 | icl/modeling 8 | icl/runner 9 | icl/trainer -------------------------------------------------------------------------------- /docs/source/API/finetune/embedder/decoder_only/icl/arguments.rst: -------------------------------------------------------------------------------- 1 | Arguments 2 | ========= 3 | 4 | .. autoclass:: FlagEmbedding.finetune.embedder.decoder_only.icl.DecoderOnlyEmbedderICLModelArguments 5 | 6 | .. autoclass:: FlagEmbedding.finetune.embedder.decoder_only.icl.DecoderOnlyEmbedderICLDataArguments -------------------------------------------------------------------------------- /docs/source/API/finetune/embedder/decoder_only/icl/dataset.rst: -------------------------------------------------------------------------------- 1 | ======= 2 | Dataset 3 | ======= 4 | 5 | DecoderOnlyEmbedderICLSameDatasetTrainDataset 6 | ============================================= 7 | 8 | .. autoclass:: FlagEmbedding.finetune.embedder.decoder_only.icl.DecoderOnlyEmbedderICLSameDatasetTrainDataset 9 | 10 | Methods 11 | ------- 12 | 13 | .. automethod:: FlagEmbedding.finetune.embedder.decoder_only.icl.DecoderOnlyEmbedderICLSameDatasetTrainDataset._create_batch_data 14 | 15 | AbsEmbedderSameDatasetCollator 16 | ============================== 17 | 18 | .. autoclass:: FlagEmbedding.finetune.embedder.decoder_only.icl.AbsEmbedderSameDatasetCollator -------------------------------------------------------------------------------- /docs/source/API/finetune/embedder/decoder_only/icl/modeling.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Modeling 3 | ======== 4 | 5 | .. autoclass:: FlagEmbedding.finetune.embedder.decoder_only.icl.BiDecoderOnlyEmbedderICLModel 6 | 7 | Methods 8 | ======= 9 | 10 | .. automethod:: FlagEmbedding.finetune.embedder.decoder_only.icl.BiDecoderOnlyEmbedderICLModel.encode 11 | ..
automethod:: FlagEmbedding.finetune.embedder.decoder_only.icl.BiDecoderOnlyEmbedderICLModel.compute_score 12 | .. automethod:: FlagEmbedding.finetune.embedder.decoder_only.icl.BiDecoderOnlyEmbedderICLModel.compute_loss 13 | .. automethod:: FlagEmbedding.finetune.embedder.decoder_only.icl.BiDecoderOnlyEmbedderICLModel.gradient_checkpointing_enable 14 | .. automethod:: FlagEmbedding.finetune.embedder.decoder_only.icl.BiDecoderOnlyEmbedderICLModel.enable_input_require_grads 15 | .. automethod:: FlagEmbedding.finetune.embedder.decoder_only.icl.BiDecoderOnlyEmbedderICLModel.save 16 | 17 | .. automethod:: FlagEmbedding.finetune.embedder.decoder_only.icl.BiDecoderOnlyEmbedderICLModel._sentence_embedding 18 | .. automethod:: FlagEmbedding.finetune.embedder.decoder_only.icl.BiDecoderOnlyEmbedderICLModel._compute_similarity 19 | -------------------------------------------------------------------------------- /docs/source/API/finetune/embedder/decoder_only/icl/runner.rst: -------------------------------------------------------------------------------- 1 | Runner 2 | ====== 3 | 4 | .. autoclass:: FlagEmbedding.finetune.embedder.decoder_only.icl.DecoderOnlyEmbedderICLRunner 5 | :members: -------------------------------------------------------------------------------- /docs/source/API/finetune/embedder/decoder_only/icl/trainer.rst: -------------------------------------------------------------------------------- 1 | Trainer 2 | ======= 3 | 4 | .. autoclass:: FlagEmbedding.finetune.embedder.decoder_only.icl.DecoderOnlyEmbedderICLTrainer 5 | :members: -------------------------------------------------------------------------------- /docs/source/API/finetune/embedder/encoder_only.rst: -------------------------------------------------------------------------------- 1 | Encoder Only 2 | ============ 3 | 4 | .. toctree:: 5 | encoder_only/base 6 | encoder_only/m3 -------------------------------------------------------------------------------- /docs/source/API/finetune/embedder/encoder_only/base.rst: -------------------------------------------------------------------------------- 1 | Base 2 | ==== 3 | 4 | .. toctree:: 5 | base/modeling 6 | base/runner 7 | base/trainer -------------------------------------------------------------------------------- /docs/source/API/finetune/embedder/encoder_only/base/modeling.rst: -------------------------------------------------------------------------------- 1 | Modeling 2 | ======== 3 | 4 | .. autoclass:: FlagEmbedding.finetune.embedder.encoder_only.base.BiEncoderOnlyEmbedderModel 5 | 6 | Methods 7 | ------- 8 | 9 | .. automethod:: FlagEmbedding.finetune.embedder.encoder_only.base.BiEncoderOnlyEmbedderModel.encode 10 | 11 | .. automethod:: FlagEmbedding.finetune.embedder.encoder_only.base.BiEncoderOnlyEmbedderModel.compute_score 12 | .. automethod:: FlagEmbedding.finetune.embedder.encoder_only.base.BiEncoderOnlyEmbedderModel.compute_loss 13 | .. automethod:: FlagEmbedding.finetune.embedder.encoder_only.base.BiEncoderOnlyEmbedderModel.gradient_checkpointing_enable 14 | .. automethod:: FlagEmbedding.finetune.embedder.encoder_only.base.BiEncoderOnlyEmbedderModel.enable_input_require_grads 15 | .. automethod:: FlagEmbedding.finetune.embedder.encoder_only.base.BiEncoderOnlyEmbedderModel.save 16 | .. automethod:: FlagEmbedding.finetune.embedder.encoder_only.base.BiEncoderOnlyEmbedderModel._sentence_embedding 17 | .. 
automethod:: FlagEmbedding.finetune.embedder.encoder_only.base.BiEncoderOnlyEmbedderModel._compute_similarity -------------------------------------------------------------------------------- /docs/source/API/finetune/embedder/encoder_only/base/runner.rst: -------------------------------------------------------------------------------- 1 | Runner 2 | ====== 3 | 4 | .. autoclass:: FlagEmbedding.finetune.embedder.encoder_only.base.EncoderOnlyEmbedderRunner 5 | :members: -------------------------------------------------------------------------------- /docs/source/API/finetune/embedder/encoder_only/base/trainer.rst: -------------------------------------------------------------------------------- 1 | Trainer 2 | ======= 3 | 4 | .. autoclass:: FlagEmbedding.finetune.embedder.encoder_only.base.EncoderOnlyEmbedderTrainer 5 | :members: -------------------------------------------------------------------------------- /docs/source/API/finetune/embedder/encoder_only/m3.rst: -------------------------------------------------------------------------------- 1 | M3 2 | == 3 | 4 | .. toctree:: 5 | m3/arguments 6 | m3/modeling 7 | m3/runner 8 | m3/trainer -------------------------------------------------------------------------------- /docs/source/API/finetune/embedder/encoder_only/m3/arguments.rst: -------------------------------------------------------------------------------- 1 | Arguments 2 | ========= 3 | 4 | .. autoclass:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3ModelArguments 5 | 6 | .. autoclass:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3TrainingArguments -------------------------------------------------------------------------------- /docs/source/API/finetune/embedder/encoder_only/m3/runner.rst: -------------------------------------------------------------------------------- 1 | Runner 2 | ====== 3 | 4 | .. autoclass:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3Runner 5 | :members: -------------------------------------------------------------------------------- /docs/source/API/finetune/embedder/encoder_only/m3/trainer.rst: -------------------------------------------------------------------------------- 1 | Trainer 2 | ======= 3 | 4 | .. autoclass:: FlagEmbedding.finetune.embedder.encoder_only.m3.EncoderOnlyEmbedderM3Trainer 5 | :members: -------------------------------------------------------------------------------- /docs/source/API/finetune/reranker.rst: -------------------------------------------------------------------------------- 1 | Reranker 2 | ======== 3 | 4 | .. toctree:: 5 | reranker/encoder_only 6 | reranker/decoder_only -------------------------------------------------------------------------------- /docs/source/API/finetune/reranker/decoder_only.rst: -------------------------------------------------------------------------------- 1 | Decoder Only 2 | ============ 3 | 4 | .. toctree:: 5 | decoder_only/base 6 | decoder_only/layerwise -------------------------------------------------------------------------------- /docs/source/API/finetune/reranker/decoder_only/base.rst: -------------------------------------------------------------------------------- 1 | Base 2 | ==== 3 | 4 | .. 
toctree:: 5 | base/arguments 6 | base/modeling 7 | base/runner 8 | base/trainer -------------------------------------------------------------------------------- /docs/source/API/finetune/reranker/decoder_only/base/arguments.rst: -------------------------------------------------------------------------------- 1 | Arguments 2 | ========= 3 | 4 | .. autoclass:: FlagEmbedding.finetune.reranker.decoder_only.base.RerankerModelArguments 5 | -------------------------------------------------------------------------------- /docs/source/API/finetune/reranker/decoder_only/base/modeling.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Modeling 3 | ======== 4 | 5 | .. autoclass:: FlagEmbedding.finetune.reranker.decoder_only.base.CrossDecoderModel 6 | 7 | Methods 8 | ======= 9 | 10 | .. automethod:: FlagEmbedding.finetune.reranker.decoder_only.base.CrossDecoderModel.encode 11 | -------------------------------------------------------------------------------- /docs/source/API/finetune/reranker/decoder_only/base/runner.rst: -------------------------------------------------------------------------------- 1 | Runner 2 | ====== 3 | 4 | .. autoclass:: FlagEmbedding.finetune.reranker.decoder_only.base.DecoderOnlyRerankerRunner 5 | :members: -------------------------------------------------------------------------------- /docs/source/API/finetune/reranker/decoder_only/base/trainer.rst: -------------------------------------------------------------------------------- 1 | Trainer 2 | ======= 3 | 4 | .. autoclass:: FlagEmbedding.finetune.reranker.decoder_only.base.DecoderOnlyRerankerTrainer 5 | :members: -------------------------------------------------------------------------------- /docs/source/API/finetune/reranker/decoder_only/layerwise.rst: -------------------------------------------------------------------------------- 1 | Layerwise 2 | ========= 3 | 4 | .. toctree:: 5 | layerwise/arguments 6 | layerwise/modeling 7 | layerwise/runner 8 | layerwise/trainer -------------------------------------------------------------------------------- /docs/source/API/finetune/reranker/decoder_only/layerwise/arguments.rst: -------------------------------------------------------------------------------- 1 | Arguments 2 | ========= 3 | 4 | .. autoclass:: FlagEmbedding.finetune.reranker.decoder_only.layerwise.RerankerModelArguments 5 | -------------------------------------------------------------------------------- /docs/source/API/finetune/reranker/decoder_only/layerwise/modeling.rst: -------------------------------------------------------------------------------- 1 | ======== 2 | Modeling 3 | ======== 4 | 5 | ..
autoclass:: FlagEmbedding.finetune.reranker.decoder_only.layerwise.CrossDecoderModel 6 | 7 | Methods 8 | ======= 9 | 10 | .. automethod:: FlagEmbedding.finetune.reranker.decoder_only.layerwise.CrossDecoderModel.encode 11 | .. automethod:: FlagEmbedding.finetune.reranker.decoder_only.layerwise.CrossDecoderModel.forward 12 | -------------------------------------------------------------------------------- /docs/source/API/finetune/reranker/decoder_only/layerwise/runner.rst: -------------------------------------------------------------------------------- 1 | Runner 2 | ====== 3 | 4 | .. autoclass:: FlagEmbedding.finetune.reranker.decoder_only.layerwise.DecoderOnlyRerankerRunner 5 | :members: -------------------------------------------------------------------------------- /docs/source/API/finetune/reranker/decoder_only/layerwise/trainer.rst: -------------------------------------------------------------------------------- 1 | Trainer 2 | ======= 3 | 4 | .. autoclass:: FlagEmbedding.finetune.reranker.decoder_only.layerwise.DecoderOnlyRerankerTrainer 5 | :members: -------------------------------------------------------------------------------- /docs/source/API/finetune/reranker/encoder_only.rst: -------------------------------------------------------------------------------- 1 | Encoder Only 2 | ============ 3 | 4 | .. toctree:: 5 | encoder_only/base -------------------------------------------------------------------------------- /docs/source/API/finetune/reranker/encoder_only/base.rst: -------------------------------------------------------------------------------- 1 | Base 2 | ==== 3 | 4 | .. toctree:: 5 | base/modeling 6 | base/runner 7 | base/trainer -------------------------------------------------------------------------------- /docs/source/API/finetune/reranker/encoder_only/base/modeling.rst: -------------------------------------------------------------------------------- 1 | Modeling 2 | ======== 3 | 4 | .. autoclass:: FlagEmbedding.finetune.reranker.encoder_only.base.CrossEncoderModel 5 | 6 | Methods 7 | ------- 8 | 9 | .. automethod:: FlagEmbedding.finetune.reranker.encoder_only.base.CrossEncoderModel.encode 10 | -------------------------------------------------------------------------------- /docs/source/API/finetune/reranker/encoder_only/base/runner.rst: -------------------------------------------------------------------------------- 1 | Runner 2 | ====== 3 | 4 | .. autoclass:: FlagEmbedding.finetune.reranker.encoder_only.base.EncoderOnlyRerankerRunner 5 | :members: -------------------------------------------------------------------------------- /docs/source/API/finetune/reranker/encoder_only/base/trainer.rst: -------------------------------------------------------------------------------- 1 | Trainer 2 | ======= 3 | 4 | .. autoclass:: FlagEmbedding.finetune.reranker.encoder_only.base.EncoderOnlyRerankerTrainer 5 | :members: -------------------------------------------------------------------------------- /docs/source/API/index.rst: -------------------------------------------------------------------------------- 1 | API 2 | === 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | 7 | abc 8 | inference 9 | evaluation 10 | finetune -------------------------------------------------------------------------------- /docs/source/API/inference.rst: -------------------------------------------------------------------------------- 1 | Inference 2 | ========= 3 | 4 | .. 
toctree:: 5 | inference/FlagAutoModel 6 | inference/FlagAutoReranker 7 | inference/embedder/embedder 8 | inference/reranker/reranker -------------------------------------------------------------------------------- /docs/source/API/inference/FlagAutoModel.rst: -------------------------------------------------------------------------------- 1 | FlagAutoModel 2 | ============= 3 | 4 | .. autoclass:: FlagEmbedding.inference.FlagAutoModel 5 | 6 | Methods 7 | ------- 8 | 9 | .. automethod:: FlagEmbedding.inference.FlagAutoModel.from_finetuned -------------------------------------------------------------------------------- /docs/source/API/inference/FlagAutoReranker.rst: -------------------------------------------------------------------------------- 1 | FlagAutoReranker 2 | ================ 3 | 4 | .. autoclass:: FlagEmbedding.inference.FlagAutoReranker 5 | 6 | Methods 7 | ------- 8 | 9 | .. automethod:: FlagEmbedding.inference.FlagAutoReranker.from_finetuned -------------------------------------------------------------------------------- /docs/source/API/inference/embedder/decoder_only/BaseLLMEmbedder.rst: -------------------------------------------------------------------------------- 1 | BaseEmbedder 2 | ============ 3 | 4 | .. autoclass:: FlagEmbedding.inference.embedder.decoder_only.base.BaseLLMEmbedder 5 | 6 | Methods 7 | ------- 8 | 9 | .. automethod:: FlagEmbedding.inference.embedder.decoder_only.base.BaseLLMEmbedder.encode_queries 10 | 11 | .. automethod:: FlagEmbedding.inference.embedder.decoder_only.base.BaseLLMEmbedder.encode_corpus 12 | 13 | .. automethod:: FlagEmbedding.inference.embedder.decoder_only.base.BaseLLMEmbedder.encode 14 | 15 | .. automethod:: FlagEmbedding.inference.embedder.decoder_only.base.BaseLLMEmbedder.encode_single_device -------------------------------------------------------------------------------- /docs/source/API/inference/embedder/decoder_only/ICLLLMEmbedder.rst: -------------------------------------------------------------------------------- 1 | ICLLLMEmbedder 2 | ============== 3 | 4 | .. autoclass:: FlagEmbedding.inference.embedder.decoder_only.icl.ICLLLMEmbedder 5 | 6 | Methods 7 | ------- 8 | 9 | .. automethod:: FlagEmbedding.inference.embedder.decoder_only.icl.ICLLLMEmbedder.encode_queries 10 | 11 | .. automethod:: FlagEmbedding.inference.embedder.decoder_only.icl.ICLLLMEmbedder.encode_corpus 12 | 13 | .. automethod:: FlagEmbedding.inference.embedder.decoder_only.icl.ICLLLMEmbedder.encode 14 | 15 | .. automethod:: FlagEmbedding.inference.embedder.decoder_only.icl.ICLLLMEmbedder.set_examples 16 | 17 | .. automethod:: FlagEmbedding.inference.embedder.decoder_only.icl.ICLLLMEmbedder.get_detailed_example 18 | 19 | .. automethod:: FlagEmbedding.inference.embedder.decoder_only.icl.ICLLLMEmbedder.encode_queries_single_device 20 | 21 | .. automethod:: FlagEmbedding.inference.embedder.decoder_only.icl.ICLLLMEmbedder.encode_single_device -------------------------------------------------------------------------------- /docs/source/API/inference/embedder/embedder.rst: -------------------------------------------------------------------------------- 1 | Embedder 2 | ======== 3 | 4 | .. 
toctree:: 5 | encoder_only/BaseEmbedder 6 | encoder_only/M3Embedder 7 | decoder_only/BaseLLMEmbedder 8 | decoder_only/ICLLLMEmbedder -------------------------------------------------------------------------------- /docs/source/API/inference/embedder/encoder_only/BaseEmbedder.rst: -------------------------------------------------------------------------------- 1 | BaseEmbedder 2 | ============ 3 | 4 | .. autoclass:: FlagEmbedding.inference.embedder.encoder_only.base.BaseEmbedder 5 | 6 | Methods 7 | ------- 8 | 9 | .. automethod:: FlagEmbedding.inference.embedder.encoder_only.base.BaseEmbedder.encode_queries 10 | :no-index: 11 | 12 | .. automethod:: FlagEmbedding.inference.embedder.encoder_only.base.BaseEmbedder.encode_corpus 13 | 14 | .. automethod:: FlagEmbedding.inference.embedder.encoder_only.base.BaseEmbedder.encode 15 | 16 | .. automethod:: FlagEmbedding.inference.embedder.encoder_only.base.BaseEmbedder.encode_single_device 17 | 18 | .. automethod:: FlagEmbedding.inference.embedder.encoder_only.base.BaseEmbedder.pooling -------------------------------------------------------------------------------- /docs/source/API/inference/reranker/decoder_only/BaseLLMReranker.rst: -------------------------------------------------------------------------------- 1 | BaseLLMReranker 2 | =============== 3 | 4 | .. autoclass:: FlagEmbedding.inference.reranker.decoder_only.base.BaseLLMReranker 5 | 6 | Methods 7 | ------- 8 | 9 | .. automethod:: FlagEmbedding.inference.reranker.decoder_only.base.BaseLLMReranker.compute_score_single_gpu 10 | -------------------------------------------------------------------------------- /docs/source/API/inference/reranker/decoder_only/LayerWiseLLMReranker.rst: -------------------------------------------------------------------------------- 1 | LayerWiseLLMReranker 2 | ==================== 3 | 4 | .. autoclass:: FlagEmbedding.inference.reranker.decoder_only.layerwise.LayerWiseLLMReranker 5 | 6 | Methods 7 | ------- 8 | 9 | .. automethod:: FlagEmbedding.inference.reranker.decoder_only.layerwise.LayerWiseLLMReranker.compute_score_single_gpu 10 | -------------------------------------------------------------------------------- /docs/source/API/inference/reranker/decoder_only/LightweightLLMReranker.rst: -------------------------------------------------------------------------------- 1 | LightweightLLMReranker 2 | ====================== 3 | 4 | .. autoclass:: FlagEmbedding.inference.reranker.decoder_only.lightweight.LightweightLLMReranker 5 | 6 | Methods 7 | ------- 8 | 9 | .. automethod:: FlagEmbedding.inference.reranker.decoder_only.lightweight.LightweightLLMReranker.compute_score_single_gpu 10 | -------------------------------------------------------------------------------- /docs/source/API/inference/reranker/encoder_only/BaseReranker.rst: -------------------------------------------------------------------------------- 1 | BaseReranker 2 | ============ 3 | 4 | .. autoclass:: FlagEmbedding.inference.reranker.encoder_only.base.BaseReranker 5 | 6 | Methods 7 | ------- 8 | 9 | .. automethod:: FlagEmbedding.inference.reranker.encoder_only.base.BaseReranker.compute_score_single_gpu 10 | -------------------------------------------------------------------------------- /docs/source/API/inference/reranker/reranker.rst: -------------------------------------------------------------------------------- 1 | Reranker 2 | ======== 3 | 4 | ..
toctree:: 5 | encoder_only/BaseReranker 6 | decoder_only/BaseLLMReranker 7 | decoder_only/LayerWiseLLMReranker 8 | decoder_only/LightweightLLMReranker -------------------------------------------------------------------------------- /docs/source/Introduction/index.rst: -------------------------------------------------------------------------------- 1 | Introduction 2 | ============ 3 | 4 | BGE builds a one-stop retrieval toolkit for search and RAG. We provide inference, evaluation, and fine-tuning for embedding models and rerankers. 5 | 6 | .. figure:: ../_static/img/RAG_pipeline.png 7 | :width: 700 8 | :align: center 9 | 10 | BGE embedder and reranker in a RAG pipeline. `Source `_ 11 | 12 | Quickly get started with: 13 | 14 | .. toctree:: 15 | :maxdepth: 1 16 | :caption: Start 17 | 18 | overview 19 | installation 20 | quick_start 21 | 22 | 23 | .. toctree:: 24 | :maxdepth: 1 25 | :caption: Concept 26 | 27 | IR 28 | embedder 29 | reranker 30 | similarity 31 | retrieval_demo -------------------------------------------------------------------------------- /docs/source/Introduction/overview.rst: -------------------------------------------------------------------------------- 1 | Overview 2 | ======== 3 | 4 | Our repository provides well-structured `APIs `_ for the inference, evaluation, and fine-tuning of BGE series models. 5 | Besides that, there are abundant resources such as examples and tutorials for users to quickly get hands-on experience. 6 | 7 | .. figure:: https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/refs/heads/master/imgs/projects.png 8 | :width: 700 9 | :align: center 10 | 11 | Structure of contents in our `repo `_ 12 | 13 | Our repository provides well-structured resources for information retrieval and RAG: 14 | 15 | - The core `APIs <../API>`_ for embedding models' inference, evaluation, and fine-tuning. 16 | - Hands-on `examples `_ for the three use cases mentioned above. 17 | - Detailed `tutorials `_ covering topics in retrieval to help you learn from scratch. -------------------------------------------------------------------------------- /docs/source/Introduction/quick_start.rst: -------------------------------------------------------------------------------- 1 | Quick Start 2 | =========== 3 | 4 | First, load one of the BGE embedding models: 5 | 6 | .. code:: python 7 | 8 | from FlagEmbedding import FlagAutoModel 9 | 10 | model = FlagAutoModel.from_finetuned('BAAI/bge-base-en-v1.5') 11 | 12 | .. tip:: 13 | 14 | If you have difficulty connecting to Hugging Face, you can use the `HF mirror `_ instead. 15 | 16 | .. code:: bash 17 | 18 | export HF_ENDPOINT=https://hf-mirror.com 19 | 20 | Then, feed some sentences to the model and get their embeddings: 21 | 22 | .. code:: python 23 | 24 | sentences_1 = ["I love NLP", "I love machine learning"] 25 | sentences_2 = ["I love BGE", "I love text retrieval"] 26 | embeddings_1 = model.encode(sentences_1) 27 | embeddings_2 = model.encode(sentences_2) 28 | 29 | Once we have the embeddings, we can compute their similarity by inner product: 30 | 31 | ..
code:: python 32 | 33 | similarity = embeddings_1 @ embeddings_2.T 34 | print(similarity) 35 | -------------------------------------------------------------------------------- /docs/source/_static/css/custom.css: -------------------------------------------------------------------------------- 1 | .bd-sidebar-primary { 2 | width: 22%; 3 | line-height: 1.4; 4 | } 5 | 6 | .col-lg-3 { 7 | flex: 0 0 auto; 8 | width: 22%; 9 | } -------------------------------------------------------------------------------- /docs/source/_static/img/BAAI_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/docs/source/_static/img/BAAI_logo.png -------------------------------------------------------------------------------- /docs/source/_static/img/BGE_WeChat_Group.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/docs/source/_static/img/BGE_WeChat_Group.png -------------------------------------------------------------------------------- /docs/source/_static/img/C_MTEB.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/docs/source/_static/img/C_MTEB.png -------------------------------------------------------------------------------- /docs/source/_static/img/RAG_pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/docs/source/_static/img/RAG_pipeline.png -------------------------------------------------------------------------------- /docs/source/_static/img/bge_logo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/docs/source/_static/img/bge_logo.jpeg -------------------------------------------------------------------------------- /docs/source/_static/img/bge_panda.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/docs/source/_static/img/bge_panda.jpg -------------------------------------------------------------------------------- /docs/source/_static/img/projects.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/docs/source/_static/img/projects.png -------------------------------------------------------------------------------- /docs/source/_static/img/word2vec.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/docs/source/_static/img/word2vec.png -------------------------------------------------------------------------------- /docs/source/bge/index.rst: -------------------------------------------------------------------------------- 1 | BGE 2 | === 3 | 4 | .. figure:: ../_static/img/bge_logo.jpeg 5 | :width: 250 6 | :align: center 7 | 8 | **BGE** stands for **BAAI General Embeddings**, which is a series of embedding models released by BAAI. 9 | 10 | .. 
toctree:: 11 | :maxdepth: 1 12 | :caption: Embedder 13 | 14 | bge_v1_v1.5 15 | bge_m3 16 | bge_icl 17 | bge_vl 18 | 19 | .. toctree:: 20 | :maxdepth: 1 21 | :caption: Reranker 22 | 23 | bge_reranker 24 | bge_reranker_v2 -------------------------------------------------------------------------------- /docs/source/community/index.rst: -------------------------------------------------------------------------------- 1 | Community 2 | ========= 3 | 4 | Visit our `GitHub repo `_ and 5 | `Hugging Face collection `_ for more materials! 6 | 7 | We also host WeChat groups for BGE. Scan the QR code to join the group chat! 8 | Join us now to get first-hand news about our updates and new releases, or to share any questions or ideas! 9 | 10 | .. figure:: ../_static/img/BGE_WeChat_Group.png 11 | :width: 400 12 | :align: center -------------------------------------------------------------------------------- /docs/source/tutorial/1_Embedding.rst: -------------------------------------------------------------------------------- 1 | 1. Embedding 2 | ============ 3 | 4 | .. toctree:: 5 | :hidden: 6 | :maxdepth: 1 7 | :caption: Embedding 8 | 9 | 1_Embedding/1.1.1 10 | 1_Embedding/1.2.1 11 | 1_Embedding/1.2.2 12 | 1_Embedding/1.2.3 13 | 1_Embedding/1.2.4 14 | 1_Embedding/1.2.5 -------------------------------------------------------------------------------- /docs/source/tutorial/2_Metrics.rst: -------------------------------------------------------------------------------- 1 | 2. Metrics 2 | ========== 3 | 4 | .. toctree:: 5 | :hidden: 6 | :maxdepth: 1 7 | :caption: Metrics 8 | 9 | 2_Metrics/2.1 10 | 2_Metrics/2.2 -------------------------------------------------------------------------------- /docs/source/tutorial/3_Indexing.rst: -------------------------------------------------------------------------------- 1 | 3. Indexing 2 | =========== 3 | 4 | .. toctree:: 5 | :hidden: 6 | :maxdepth: 1 7 | :caption: Indexing 8 | 9 | 3_Indexing/3.1.1 10 | 3_Indexing/3.1.2 11 | 3_Indexing/3.1.3 12 | 3_Indexing/3.1.4 13 | 3_Indexing/3.1.5 -------------------------------------------------------------------------------- /docs/source/tutorial/4_Evaluation.rst: -------------------------------------------------------------------------------- 1 | 4. Evaluation 2 | ============= 3 | 4 | .. toctree:: 5 | :hidden: 6 | :maxdepth: 1 7 | :caption: Evaluation 8 | 9 | 4_Evaluation/4.1.1 10 | 4_Evaluation/4.2.1 11 | 4_Evaluation/4.2.2 12 | 4_Evaluation/4.2.3 13 | 4_Evaluation/4.3.1 14 | 4_Evaluation/4.4.1 15 | 4_Evaluation/4.5.1 16 | 4_Evaluation/4.5.2 17 | -------------------------------------------------------------------------------- /docs/source/tutorial/5_Reranking.rst: -------------------------------------------------------------------------------- 1 | 5. Reranking 2 | ============ 3 | 4 | .. toctree:: 5 | :hidden: 6 | :maxdepth: 1 7 | :caption: Reranking 8 | 9 | 5_Reranking/5.1 10 | 5_Reranking/5.2 11 | 5_Reranking/5.3 -------------------------------------------------------------------------------- /docs/source/tutorial/6_RAG.rst: -------------------------------------------------------------------------------- 1 | 6. RAG 2 | ====== 3 | 4 | .. toctree:: 5 | :hidden: 6 | :maxdepth: 1 7 | :caption: RAG 8 | 9 | 6_RAG/6.1 10 | 6_RAG/6.2 11 | 6_RAG/6.3 -------------------------------------------------------------------------------- /docs/source/tutorial/7_Finetuning.rst: -------------------------------------------------------------------------------- 1 | 7. Finetuning 2 | ============= 3 | 4 | ..
toctree:: 5 | :hidden: 6 | :maxdepth: 1 7 | :caption: Finetuning 8 | 9 | 7_Finetuning/7.1.1 10 | 7_Finetuning/7.1.2 11 | 7_Finetuning/7.1.3 12 | 7_Finetuning/7.2.1 -------------------------------------------------------------------------------- /docs/source/tutorial/index.rst: -------------------------------------------------------------------------------- 1 | Tutorials 2 | ========= 3 | 4 | In this section, we provide hands-on introductions to different topics that are highly related to embedding models and retrieval. 5 | 6 | To run the tutorials, clone the GitHub repo and check the `Tutorials `_ folder. 7 | 8 | .. toctree:: 9 | :maxdepth: 1 10 | :caption: Tutorials 11 | 12 | 1_Embedding 13 | 2_Metrics 14 | 3_Indexing 15 | 4_Evaluation 16 | 5_Reranking 17 | 6_RAG 18 | 7_Finetuning -------------------------------------------------------------------------------- /examples/evaluation/air_bench/eval_air_bench.sh: -------------------------------------------------------------------------------- 1 | if [ -z "$HF_HUB_CACHE" ]; then 2 | export HF_HUB_CACHE="$HOME/.cache/huggingface/hub" 3 | fi 4 | 5 | eval_args="\ 6 | --benchmark_version AIR-Bench_24.05 \ 7 | --task_types qa long-doc \ 8 | --domains arxiv \ 9 | --languages en \ 10 | --splits dev test \ 11 | --output_dir ./air_bench/search_results \ 12 | --search_top_k 1000 --rerank_top_k 100 \ 13 | --cache_dir $HF_HUB_CACHE \ 14 | --overwrite False \ 15 | " 16 | 17 | model_args="\ 18 | --embedder_name_or_path BAAI/bge-m3 \ 19 | --reranker_name_or_path BAAI/bge-reranker-v2-m3 \ 20 | --devices cuda:0 cuda:1 \ 21 | --model_cache_dir $HF_HUB_CACHE \ 22 | --reranker_max_length 1024 \ 23 | " 24 | 25 | cmd="python -m FlagEmbedding.evaluation.air_bench \ 26 | $eval_args \ 27 | $model_args \ 28 | " 29 | 30 | echo $cmd 31 | eval $cmd 32 | -------------------------------------------------------------------------------- /examples/evaluation/beir/eval_beir.sh: -------------------------------------------------------------------------------- 1 | if [ -z "$HF_HUB_CACHE" ]; then 2 | export HF_HUB_CACHE="$HOME/.cache/huggingface/hub" 3 | fi 4 | 5 | dataset_names="fiqa arguana cqadupstack" 6 | 7 | eval_args="\ 8 | --eval_name beir \ 9 | --dataset_dir ./beir/data \ 10 | --dataset_names $dataset_names \ 11 | --splits test dev \ 12 | --corpus_embd_save_dir ./beir/corpus_embd \ 13 | --output_dir ./beir/search_results \ 14 | --search_top_k 1000 --rerank_top_k 100 \ 15 | --cache_path $HF_HUB_CACHE \ 16 | --overwrite False \ 17 | --k_values 10 100 \ 18 | --eval_output_method markdown \ 19 | --eval_output_path ./beir/beir_eval_results.md \ 20 | --eval_metrics ndcg_at_10 recall_at_100 \ 21 | --ignore_identical_ids True \ 22 | " 23 | 24 | model_args="\ 25 | --embedder_name_or_path BAAI/bge-large-en-v1.5 \ 26 | --reranker_name_or_path BAAI/bge-reranker-v2-m3 \ 27 | --devices cuda:0 cuda:1 \ 28 | --cache_dir $HF_HUB_CACHE \ 29 | --reranker_max_length 1024 \ 30 | " 31 | 32 | cmd="python -m FlagEmbedding.evaluation.beir \ 33 | $eval_args \ 34 | $model_args \ 35 | " 36 | 37 | echo $cmd 38 | eval $cmd 39 | -------------------------------------------------------------------------------- /examples/evaluation/miracl/eval_miracl.sh: -------------------------------------------------------------------------------- 1 | if [ -z "$HF_HUB_CACHE" ]; then 2 | export HF_HUB_CACHE="$HOME/.cache/huggingface/hub" 3 | fi 4 | 5 | dataset_names="bn hi sw te th yo" 6 | 7 | eval_args="\ 8 | --eval_name miracl \ 9 | --dataset_dir ./miracl/data \ 10 | --dataset_names $dataset_names \ 11 | --splits dev \ 12
| --corpus_embd_save_dir ./miracl/corpus_embd \ 13 | --output_dir ./miracl/search_results \ 14 | --search_top_k 1000 --rerank_top_k 100 \ 15 | --cache_path $HF_HUB_CACHE \ 16 | --overwrite False \ 17 | --k_values 10 100 \ 18 | --eval_output_method markdown \ 19 | --eval_output_path ./miracl/miracl_eval_results.md \ 20 | --eval_metrics ndcg_at_10 recall_at_100 \ 21 | " 22 | 23 | model_args="\ 24 | --embedder_name_or_path BAAI/bge-m3 \ 25 | --reranker_name_or_path BAAI/bge-reranker-v2-m3 \ 26 | --devices cuda:0 cuda:1 \ 27 | --cache_dir $HF_HUB_CACHE \ 28 | --reranker_max_length 1024 \ 29 | " 30 | 31 | cmd="python -m FlagEmbedding.evaluation.miracl \ 32 | $eval_args \ 33 | $model_args \ 34 | " 35 | 36 | echo $cmd 37 | eval $cmd 38 | -------------------------------------------------------------------------------- /examples/evaluation/mkqa/eval_mkqa.sh: -------------------------------------------------------------------------------- 1 | if [ -z "$HF_HUB_CACHE" ]; then 2 | export HF_HUB_CACHE="$HOME/.cache/huggingface/hub" 3 | fi 4 | 5 | dataset_names="en zh_cn" 6 | 7 | eval_args="\ 8 | --eval_name mkqa \ 9 | --dataset_dir ./mkqa/data \ 10 | --dataset_names $dataset_names \ 11 | --splits test \ 12 | --corpus_embd_save_dir ./mkqa/corpus_embd \ 13 | --output_dir ./mkqa/search_results \ 14 | --search_top_k 1000 --rerank_top_k 100 \ 15 | --cache_path $HF_HUB_CACHE \ 16 | --overwrite False \ 17 | --k_values 20 \ 18 | --eval_output_method markdown \ 19 | --eval_output_path ./mkqa/mkqa_eval_results.md \ 20 | --eval_metrics qa_recall_at_20 \ 21 | " 22 | 23 | model_args="\ 24 | --embedder_name_or_path BAAI/bge-m3 \ 25 | --reranker_name_or_path BAAI/bge-reranker-v2-m3 \ 26 | --devices cuda:0 cuda:1 \ 27 | --cache_dir $HF_HUB_CACHE \ 28 | --reranker_max_length 1024 \ 29 | " 30 | 31 | cmd="python -m FlagEmbedding.evaluation.mkqa \ 32 | $eval_args \ 33 | $model_args \ 34 | " 35 | 36 | echo $cmd 37 | eval $cmd 38 | -------------------------------------------------------------------------------- /examples/evaluation/mldr/eval_mldr.sh: -------------------------------------------------------------------------------- 1 | if [ -z "$HF_HUB_CACHE" ]; then 2 | export HF_HUB_CACHE="$HOME/.cache/huggingface/hub" 3 | fi 4 | 5 | dataset_names="hi" 6 | 7 | eval_args="\ 8 | --eval_name mldr \ 9 | --dataset_dir ./mldr/data \ 10 | --dataset_names $dataset_names \ 11 | --splits test \ 12 | --corpus_embd_save_dir ./mldr/corpus_embd \ 13 | --output_dir ./mldr/search_results \ 14 | --search_top_k 1000 --rerank_top_k 100 \ 15 | --cache_path $HF_HUB_CACHE \ 16 | --overwrite False \ 17 | --k_values 10 100 \ 18 | --eval_output_method markdown \ 19 | --eval_output_path ./mldr/mldr_eval_results.md \ 20 | --eval_metrics ndcg_at_10 \ 21 | " 22 | 23 | model_args="\ 24 | --embedder_name_or_path BAAI/bge-m3 \ 25 | --reranker_name_or_path BAAI/bge-reranker-v2-m3 \ 26 | --devices cuda:0 cuda:1 \ 27 | --cache_dir $HF_HUB_CACHE \ 28 | --embedder_passage_max_length 8192 \ 29 | --reranker_max_length 8192 \ 30 | " 31 | 32 | cmd="python -m FlagEmbedding.evaluation.mldr \ 33 | $eval_args \ 34 | $model_args \ 35 | " 36 | 37 | echo $cmd 38 | eval $cmd 39 | -------------------------------------------------------------------------------- /examples/evaluation/msmarco/eval_msmarco.sh: -------------------------------------------------------------------------------- 1 | if [ -z "$HF_HUB_CACHE" ]; then 2 | export HF_HUB_CACHE="$HOME/.cache/huggingface/hub" 3 | fi 4 | 5 | dataset_names="passage" 6 | 7 | eval_args="\ 8 | --eval_name msmarco \ 9 | 
--dataset_dir ./msmarco/data \ 10 | --dataset_names $dataset_names \ 11 | --splits dev \ 12 | --corpus_embd_save_dir ./msmarco/corpus_embd \ 13 | --output_dir ./msmarco/search_results \ 14 | --search_top_k 1000 --rerank_top_k 100 \ 15 | --cache_path $HF_HUB_CACHE \ 16 | --overwrite True \ 17 | --k_values 10 100 \ 18 | --eval_output_method markdown \ 19 | --eval_output_path ./msmarco/msmarco_eval_results.md \ 20 | --eval_metrics ndcg_at_10 recall_at_100 \ 21 | " 22 | 23 | model_args="\ 24 | --embedder_name_or_path BAAI/bge-large-en-v1.5 \ 25 | --reranker_name_or_path BAAI/bge-reranker-v2-m3 \ 26 | --devices cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 \ 27 | --cache_dir $HF_HUB_CACHE \ 28 | --reranker_max_length 1024 \ 29 | " 30 | 31 | cmd="python -m FlagEmbedding.evaluation.msmarco \ 32 | $eval_args \ 33 | $model_args \ 34 | " 35 | 36 | echo $cmd 37 | eval $cmd 38 | -------------------------------------------------------------------------------- /examples/evaluation/mteb/eval_mteb.sh: -------------------------------------------------------------------------------- 1 | if [ -z "$HF_HUB_CACHE" ]; then 2 | export HF_HUB_CACHE="$HOME/.cache/huggingface/hub" 3 | fi 4 | 5 | languages="eng" 6 | tasks="NFCorpus BiorxivClusteringS2S SciDocsRR" 7 | 8 | eval_args="\ 9 | --eval_name mteb \ 10 | --output_dir ./mteb/search_results \ 11 | --languages $languages \ 12 | --tasks $tasks \ 13 | --eval_output_path ./mteb/mteb_eval_results.json 14 | " 15 | 16 | model_args="\ 17 | --embedder_name_or_path BAAI/bge-large-en-v1.5 \ 18 | --devices cuda:7 \ 19 | --cache_dir $HF_HUB_CACHE \ 20 | " 21 | 22 | cmd="python -m FlagEmbedding.evaluation.mteb \ 23 | $eval_args \ 24 | $model_args \ 25 | " 26 | 27 | echo $cmd 28 | eval $cmd 29 | -------------------------------------------------------------------------------- /examples/finetune/ds_stage0.json: -------------------------------------------------------------------------------- 1 | { 2 | "zero_optimization": { 3 | "stage": 0 4 | }, 5 | 6 | "fp16": { 7 | "enabled": "auto", 8 | "loss_scale": 0, 9 | "loss_scale_window": 1000, 10 | "initial_scale_power": 12, 11 | "hysteresis": 2, 12 | "min_loss_scale": 1 13 | }, 14 | 15 | "bf16": { 16 | "enabled": "auto" 17 | }, 18 | 19 | "optimizer": { 20 | "type": "AdamW", 21 | "params": { 22 | "lr": "auto", 23 | "betas": "auto", 24 | "eps": "auto", 25 | "weight_decay": "auto" 26 | } 27 | }, 28 | 29 | "scheduler": { 30 | "type": "WarmupDecayLR", 31 | "params": { 32 | "warmup_min_lr": "auto", 33 | "warmup_max_lr": "auto", 34 | "warmup_num_steps": "auto", 35 | "total_num_steps": "auto" 36 | } 37 | }, 38 | 39 | "gradient_accumulation_steps": "auto", 40 | "gradient_clipping": "auto", 41 | "steps_per_print": 100, 42 | "train_batch_size": "auto", 43 | "train_micro_batch_size_per_gpu": "auto", 44 | "wall_clock_breakdown": false 45 | } -------------------------------------------------------------------------------- /imgs/BGE_WeChat_Group.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/imgs/BGE_WeChat_Group.png -------------------------------------------------------------------------------- /imgs/FlagOpen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/imgs/FlagOpen.png -------------------------------------------------------------------------------- 
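Note on the evaluation examples above: every script wires the same two-stage pipeline, dense retrieval over `--search_top_k` candidates followed by cross-encoder reranking of the `--rerank_top_k` best hits. The snippet below is a minimal, self-contained sketch of that flow using the FlagEmbedding inference classes, with the same `BAAI/bge-large-en-v1.5` embedder and `BAAI/bge-reranker-v2-m3` reranker as `eval_msmarco.sh`; the toy corpus, query, and brute-force dot-product search are illustrative stand-ins for the benchmark data and FAISS index the scripts actually use.

```python
import numpy as np
from FlagEmbedding import FlagModel, FlagReranker

# Toy stand-ins; the eval scripts load queries/corpus from --dataset_dir instead.
corpus = [
    "MS MARCO is a large-scale passage ranking benchmark.",
    "BGE rerankers score query-passage pairs with a cross-encoder.",
    "FAISS builds approximate nearest-neighbor indexes over embeddings.",
]
query = "What is MS MARCO?"

embedder = FlagModel(
    "BAAI/bge-large-en-v1.5",
    query_instruction_for_retrieval="Represent this sentence for searching relevant passages: ",
    use_fp16=True,
)
reranker = FlagReranker("BAAI/bge-reranker-v2-m3", use_fp16=True)

# Stage 1: dense retrieval (the scripts do this with a FAISS index, keeping --search_top_k hits).
q_emb = embedder.encode_queries([query])
p_emb = embedder.encode_corpus(corpus)
dense_scores = (q_emb @ p_emb.T)[0]
candidates = np.argsort(dense_scores)[::-1][:2]

# Stage 2: cross-encoder reranking of the retrieved candidates (--rerank_top_k).
pairs = [[query, corpus[i]] for i in candidates]
rerank_scores = reranker.compute_score(pairs)
for score, idx in sorted(zip(rerank_scores, candidates), reverse=True):
    print(f"{score:.3f}  {corpus[idx]}")
```

The per-benchmark modules (`FlagEmbedding.evaluation.msmarco`, `.beir`, `.miracl`, ...) differ mainly in how they load data and which metrics they report; the `--embedder_name_or_path` / `--reranker_name_or_path` arguments in the scripts configure models equivalent to the two classes above.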
/imgs/bge_logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/imgs/bge_logo.jpg -------------------------------------------------------------------------------- /imgs/cir_candi_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/imgs/cir_candi_1.png -------------------------------------------------------------------------------- /imgs/cir_candi_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/imgs/cir_candi_2.png -------------------------------------------------------------------------------- /imgs/cir_query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/imgs/cir_query.png -------------------------------------------------------------------------------- /imgs/projects.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/imgs/projects.png -------------------------------------------------------------------------------- /research/BGE_Coder/evaluation/coderag_eval/eval.sh: -------------------------------------------------------------------------------- 1 | cd ./code-rag-bench/retrieval/ 2 | 3 | output_dir='result' 4 | 5 | for dataset_name in "humaneval" "mbpp" "repoeval" "ds1000_all_completion" "odex_en" "swe-bench-lite" 6 | do 7 | echo "dataset_name: ${dataset_name}" 8 | python main.py \ 9 | --embedder_name_or_path BAAI/bge-code-v1 \ 10 | --embedder_model_class decoder-only-base \ 11 | --query_instruction_format_for_retrieval '{}\n{}' \ 12 | --embedder_query_max_length 2048 \ 13 | --embedder_passage_max_length 2048 \ 14 | --trust_remote_code True \ 15 | --pooling_method last_token \ 16 | --embedder_batch_size 64 \ 17 | --devices cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 \ 18 | --cache_dir ./cache \ 19 | --dataset $dataset_name \ 20 | --output_file ../../${output_dir}/${dataset_name}_output.json \ 21 | --results_file ../../${output_dir}/${dataset_name}_results.json 22 | done -------------------------------------------------------------------------------- /research/BGE_Coder/evaluation/coderag_eval/prepare_data.sh: -------------------------------------------------------------------------------- 1 | cd ./code-rag-bench/retrieval/ 2 | 3 | for dataset_name in "humaneval" "mbpp" "live_code_bench" "ds1000" "odex" "repoeval_repo" "swebench_repo" 4 | do 5 | echo "dataset_name: ${dataset_name}" 6 | PYTHONPATH=./ python create/${dataset_name}.py 7 | done -------------------------------------------------------------------------------- /research/BGE_Coder/evaluation/coir_eval/eval.sh: -------------------------------------------------------------------------------- 1 | output_dir=result 2 | 3 | python main.py \ 4 | --output_dir ${output_dir} \ 5 | --use_special_instructions True \ 6 | --embedder_name_or_path BAAI/bge-code-v1 \ 7 | --embedder_model_class decoder-only-base \ 8 | --query_instruction_format_for_retrieval '{}\n{}' \ 9 | --embedder_query_max_length 2048 \ 10 | --embedder_passage_max_length 2048 \ 11 | 
--trust_remote_code True \ 12 | --pooling_method last_token \ 13 | --embedder_batch_size 64 \ 14 | --devices cuda:0 cuda:1 cuda:2 cuda:3 cuda:4 cuda:5 cuda:6 cuda:7 \ 15 | --tasks apps codetrans-contest codetrans-dl cosqa synthetic-text2sql stackoverflow-qa codefeedback-mt codefeedback-st CodeSearchNet-ccr-go CodeSearchNet-ccr-java CodeSearchNet-ccr-javascript CodeSearchNet-ccr-php CodeSearchNet-ccr-python CodeSearchNet-ccr-ruby CodeSearchNet-go CodeSearchNet-java CodeSearchNet-javascript CodeSearchNet-php CodeSearchNet-python CodeSearchNet-ruby \ 16 | --cache_dir ./cache -------------------------------------------------------------------------------- /research/BGE_Coder/paper/CodeR.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_Coder/paper/CodeR.pdf -------------------------------------------------------------------------------- /research/BGE_M3/BGE_M3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_M3/BGE_M3.pdf -------------------------------------------------------------------------------- /research/BGE_M3/__init__.py: -------------------------------------------------------------------------------- 1 | from .modeling import BGEM3Model, BGEM3ForInference, EncoderOutput 2 | from .trainer import BiTrainer -------------------------------------------------------------------------------- /research/BGE_M3/imgs/bm25.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_M3/imgs/bm25.jpg -------------------------------------------------------------------------------- /research/BGE_M3/imgs/long.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_M3/imgs/long.jpg -------------------------------------------------------------------------------- /research/BGE_M3/imgs/miracl.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_M3/imgs/miracl.jpg -------------------------------------------------------------------------------- /research/BGE_M3/imgs/mkqa.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_M3/imgs/mkqa.jpg -------------------------------------------------------------------------------- /research/BGE_M3/imgs/nqa.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_M3/imgs/nqa.jpg -------------------------------------------------------------------------------- /research/BGE_M3/imgs/others.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_M3/imgs/others.webp -------------------------------------------------------------------------------- 
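The `research/BGE_M3` package shown above only re-exports the training-side classes (`BGEM3Model`, `BGEM3ForInference`, `BiTrainer`). For the released checkpoint, inference typically goes through `BGEM3FlagModel`, which exposes all three retrieval modes the model is trained for. A minimal sketch, assuming the public `BAAI/bge-m3` checkpoint and made-up sentences:

```python
from FlagEmbedding import BGEM3FlagModel

model = BGEM3FlagModel("BAAI/bge-m3", use_fp16=True)

sentences = [
    "What retrieval modes does BGE-M3 support?",
    "BGE-M3 produces dense, sparse (lexical) and multi-vector representations.",
]

# One encode call can return all three representation types.
out = model.encode(
    sentences,
    return_dense=True,         # dense vectors for ANN / FAISS search
    return_sparse=True,        # per-token lexical weights (sparse retrieval)
    return_colbert_vecs=True,  # ColBERT-style multi-vector representations
)

dense_sim = out["dense_vecs"][0] @ out["dense_vecs"][1].T
lexical_sim = model.compute_lexical_matching_score(
    out["lexical_weights"][0], out["lexical_weights"][1]
)
colbert_sim = model.colbert_score(out["colbert_vecs"][0], out["colbert_vecs"][1])
print(dense_sim, lexical_sim, colbert_sim)
```

This is the same checkpoint the miracl/mkqa/mldr scripts pass as `--embedder_name_or_path BAAI/bge-m3`.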
/research/BGE_VL/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 JUNJIE99 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /research/BGE_VL/assets/cir_candi_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_VL/assets/cir_candi_1.png -------------------------------------------------------------------------------- /research/BGE_VL/assets/cir_candi_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_VL/assets/cir_candi_2.png -------------------------------------------------------------------------------- /research/BGE_VL/assets/cir_query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_VL/assets/cir_query.png -------------------------------------------------------------------------------- /research/BGE_VL/assets/corpus/000000032077.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_VL/assets/corpus/000000032077.jpg -------------------------------------------------------------------------------- /research/BGE_VL/assets/corpus/000000050549.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_VL/assets/corpus/000000050549.jpg -------------------------------------------------------------------------------- /research/BGE_VL/assets/corpus/000000098911.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_VL/assets/corpus/000000098911.jpg -------------------------------------------------------------------------------- /research/BGE_VL/assets/corpus/000000156031.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_VL/assets/corpus/000000156031.jpg -------------------------------------------------------------------------------- /research/BGE_VL/assets/corpus/000000244097.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_VL/assets/corpus/000000244097.jpg -------------------------------------------------------------------------------- /research/BGE_VL/assets/corpus/000000272130.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_VL/assets/corpus/000000272130.jpg -------------------------------------------------------------------------------- /research/BGE_VL/assets/corpus/000000275230.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_VL/assets/corpus/000000275230.jpg -------------------------------------------------------------------------------- /research/BGE_VL/assets/corpus/000000311907.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_VL/assets/corpus/000000311907.jpg -------------------------------------------------------------------------------- /research/BGE_VL/assets/corpus/000000357304.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_VL/assets/corpus/000000357304.jpg -------------------------------------------------------------------------------- /research/BGE_VL/assets/corpus/000000478916.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_VL/assets/corpus/000000478916.jpg -------------------------------------------------------------------------------- /research/BGE_VL/assets/corpus/000000545037.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_VL/assets/corpus/000000545037.jpg -------------------------------------------------------------------------------- /research/BGE_VL/assets/query/000000530944.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_VL/assets/query/000000530944.jpg -------------------------------------------------------------------------------- /research/BGE_VL/assets/res-ft-mmeb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_VL/assets/res-ft-mmeb.png -------------------------------------------------------------------------------- /research/BGE_VL/assets/res-scaling.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_VL/assets/res-scaling.png -------------------------------------------------------------------------------- /research/BGE_VL/assets/res-zs-cir.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_VL/assets/res-zs-cir.png -------------------------------------------------------------------------------- /research/BGE_VL/assets/res-zs-mmeb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_VL/assets/res-zs-mmeb.png -------------------------------------------------------------------------------- /research/BGE_VL_Screenshot/assets/neg_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_VL_Screenshot/assets/neg_1.jpeg -------------------------------------------------------------------------------- /research/BGE_VL_Screenshot/assets/neg_2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_VL_Screenshot/assets/neg_2.jpeg -------------------------------------------------------------------------------- /research/BGE_VL_Screenshot/assets/positive_1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_VL_Screenshot/assets/positive_1.jpeg -------------------------------------------------------------------------------- /research/BGE_VL_Screenshot/assets/positive_2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_VL_Screenshot/assets/positive_2.jpeg -------------------------------------------------------------------------------- /research/BGE_VL_Screenshot/assets/query_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_VL_Screenshot/assets/query_1.png -------------------------------------------------------------------------------- /research/BGE_VL_Screenshot/assets/query_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/BGE_VL_Screenshot/assets/query_2.png -------------------------------------------------------------------------------- /research/C_MTEB/C_MTEB/__init__.py: -------------------------------------------------------------------------------- 1 | # from .tasks import * 2 | from .tasks import * 3 | 4 | ChineseTaskList = [ 5 | 'TNews', 'IFlyTek', 'MultilingualSentiment', 'JDReview', 'OnlineShopping', 'Waimai', 6 | 'CLSClusteringS2S.v2', 'CLSClusteringP2P.v2', 'ThuNewsClusteringS2S.v2', 'ThuNewsClusteringP2P.v2', 7 | 'Ocnli', 'Cmnli', 8 | 'T2Reranking', 
'MMarcoReranking', 'CMedQAv1-reranking', 'CMedQAv2-reranking', 9 | 'T2Retrieval', 'MMarcoRetrieval', 'DuRetrieval', 'CovidRetrieval', 'CmedqaRetrieval', 'EcomRetrieval', 'MedicalRetrieval', 'VideoRetrieval', 10 | 'ATEC', 'BQ', 'LCQMC', 'PAWSX', 'STSB', 'AFQMC', 'QBQTC' 11 | ] 12 | -------------------------------------------------------------------------------- /research/C_MTEB/C_MTEB/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | from .Classification import * 2 | from .Clustering import * 3 | from .PairClassification import * 4 | from .Reranking import * 5 | from .Retrieval import * 6 | from .STS import * 7 | -------------------------------------------------------------------------------- /research/C_MTEB/MKQA/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/C_MTEB/MKQA/utils/__init__.py -------------------------------------------------------------------------------- /research/C_MTEB/eval_cross_encoder.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from C_MTEB.tasks import * 4 | from mteb import MTEB 5 | 6 | from FlagEmbedding import FlagReranker 7 | 8 | 9 | def get_args(): 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--model_name_or_path', default="BAAI/bge-reranker-base", type=str) 12 | return parser.parse_args() 13 | 14 | 15 | 16 | if __name__ == '__main__': 17 | args = get_args() 18 | 19 | model = FlagReranker(args.model_name_or_path, use_fp16=True) 20 | 21 | if 'checkpoint-' in args.model_name_or_path: 22 | save_name = "_".join(args.model_name_or_path.split('/')[-2:]) 23 | else: 24 | save_name = "_".join(args.model_name_or_path.split('/')[-1:]) 25 | 26 | evaluation = MTEB(task_types=["Reranking"], task_langs=['zh', 'zh2en', 'en2zh']) 27 | evaluation.run(model, output_folder=f"reranker_results/{save_name}") 28 | 29 | 30 | 31 | -------------------------------------------------------------------------------- /research/C_MTEB/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("README.md", mode="r", encoding="utf-8") as readme_file: 4 | readme = readme_file.read() 5 | 6 | setup( 7 | name='C_MTEB', 8 | version='1.1.1', 9 | description='Chinese Massive Text Embedding Benchmark', 10 | long_description=readme, 11 | long_description_content_type="text/markdown", 12 | author_email='2906698981@qq.com', 13 | url='https://github.com/FlagOpen/FlagEmbedding/tree/master/C_MTEB', 14 | packages=find_packages(), 15 | install_requires=[ 16 | 'mteb[beir]==1.1.1', 17 | ], 18 | ) 19 | -------------------------------------------------------------------------------- /research/LLARA/finetune/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/LLARA/finetune/__init__.py -------------------------------------------------------------------------------- /research/LLARA/pretrain/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/LLARA/pretrain/__init__.py 
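The next entry is the `LM_Cocktail` package, whose `__init__.py` exports `mix_models`, `mix_models_with_data`, and `mix_models_by_layers` for weight-space merging of fine-tuned models. As a rough sketch of the intended call pattern (the local checkpoint path and the 0.5/0.5 weights are placeholders, and keyword names should be checked against the LM_Cocktail README):

```python
from LM_Cocktail import mix_models

# Merge a fine-tuned embedder back into its base model to limit catastrophic forgetting.
# "./my-finetuned-bge" is a hypothetical local checkpoint path.
mixed = mix_models(
    model_names_or_paths=["BAAI/bge-large-en-v1.5", "./my-finetuned-bge"],
    model_type="encoder",           # 'decoder' for causal language models
    weights=[0.5, 0.5],             # interpolation weights over the listed models
    output_path="./mixed_embedder", # merged weights are saved here
)
```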
-------------------------------------------------------------------------------- /research/LM_Cocktail/LM_Cocktail/__init__.py: -------------------------------------------------------------------------------- 1 | from .cocktail import mix_models, mix_models_with_data, mix_models_by_layers 2 | -------------------------------------------------------------------------------- /research/LM_Cocktail/images/1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/LM_Cocktail/images/1.png -------------------------------------------------------------------------------- /research/LM_Cocktail/images/pic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/LM_Cocktail/images/pic.png -------------------------------------------------------------------------------- /research/LM_Cocktail/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("README.md", mode="r", encoding="utf-8") as readme_file: 4 | readme = readme_file.read() 5 | 6 | setup( 7 | name='LM_Cocktail', 8 | version='0.0.5', 9 | description='LM_Cocktail', 10 | long_description=readme, 11 | long_description_content_type="text/markdown", 12 | author_email='2906698981@qq.com', 13 | url='https://github.com/FlagOpen/FlagEmbedding/LM_Cocktail', 14 | packages=find_packages(), 15 | install_requires=[ 16 | 'torch>=1.6.0', 17 | 'transformers>=4.18.0', 18 | 'datasets', 19 | 'accelerate>=0.20.1' 20 | ], 21 | ) 22 | -------------------------------------------------------------------------------- /research/Long_LLM/activation_beacon/data/config/code.json: -------------------------------------------------------------------------------- 1 | { 2 | "mixture": { 3 | "commoncrawl": 10, 4 | "c4": 10, 5 | "github": 25, 6 | "book": 10, 7 | "arxiv": 10, 8 | "wiki": 10, 9 | "stackexchange": 25 10 | }, 11 | "num_tokens_avg": { 12 | "commoncrawl": 1207, 13 | "c4": 378, 14 | "wiki": 393, 15 | "stackexchange": 309, 16 | "github": 436, 17 | "book": 89373, 18 | "arxiv": 7375 19 | } 20 | } -------------------------------------------------------------------------------- /research/Long_LLM/activation_beacon/data/config/even.json: -------------------------------------------------------------------------------- 1 | { 2 | "mixture": { 3 | "commoncrawl": 14.2, 4 | "c4": 14.2, 5 | "github": 14.2, 6 | "book": 14.2, 7 | "arxiv": 14.2, 8 | "wiki": 14.2, 9 | "stackexchange": 14.2 10 | }, 11 | "num_tokens_avg": { 12 | "commoncrawl": 1207, 13 | "c4": 378, 14 | "wiki": 393, 15 | "stackexchange": 309, 16 | "github": 436, 17 | "book": 89373, 18 | "arxiv": 7375 19 | } 20 | } -------------------------------------------------------------------------------- /research/Long_LLM/activation_beacon/data/config/fsdp-offload.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: FSDP 4 | downcast_bf16: 'no' 5 | fsdp_config: 6 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 7 | fsdp_backward_prefetch: BACKWARD_PRE 8 | fsdp_cpu_ram_efficient_loading: true 9 | fsdp_forward_prefetch: false 10 | fsdp_offload_params: false 11 | fsdp_sharding_strategy: FULL_SHARD 12 | fsdp_state_dict_type: FULL_STATE_DICT 13 | 
fsdp_sync_module_states: true 14 | fsdp_use_orig_params: true 15 | machine_rank: 0 16 | main_training_function: main 17 | mixed_precision: bf16 18 | num_machines: 1 19 | num_processes: 8 20 | rdzv_backend: static 21 | same_network: true 22 | tpu_env: [] 23 | tpu_use_cluster: false 24 | tpu_use_sudo: false 25 | use_cpu: false 26 | -------------------------------------------------------------------------------- /research/Long_LLM/activation_beacon/data/config/fsdp.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | distributed_type: FSDP 4 | downcast_bf16: 'no' 5 | fsdp_config: 6 | fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP 7 | fsdp_backward_prefetch: BACKWARD_PRE 8 | fsdp_cpu_ram_efficient_loading: false 9 | fsdp_forward_prefetch: false 10 | fsdp_offload_params: false 11 | fsdp_sharding_strategy: FULL_SHARD 12 | fsdp_state_dict_type: FULL_STATE_DICT 13 | fsdp_sync_module_states: true 14 | fsdp_use_orig_params: true 15 | machine_rank: 0 16 | main_training_function: main 17 | mixed_precision: bf16 18 | num_machines: 1 19 | num_processes: 8 20 | rdzv_backend: static 21 | same_network: true 22 | tpu_env: [] 23 | tpu_use_cluster: false 24 | tpu_use_sudo: false 25 | use_cpu: false 26 | -------------------------------------------------------------------------------- /research/Long_LLM/activation_beacon/data/config/slimpajama.json: -------------------------------------------------------------------------------- 1 | { 2 | "mixture": { 3 | "commoncrawl": 52.2, 4 | "c4": 26.7, 5 | "github": 5.2, 6 | "book": 4.2, 7 | "arxiv": 4.6, 8 | "wiki": 3.8, 9 | "stackexchange": 3.3 10 | }, 11 | "num_tokens_avg": { 12 | "commoncrawl": 1207, 13 | "c4": 378, 14 | "wiki": 393, 15 | "stackexchange": 309, 16 | "github": 436, 17 | "book": 89373, 18 | "arxiv": 7375 19 | } 20 | } -------------------------------------------------------------------------------- /research/Long_LLM/activation_beacon/data/config/zero3-infer-offload.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | gradient_accumulation_steps: 1 5 | offload_optimizer_device: cpu 6 | offload_param_device: cpu 7 | zero3_init_flag: false 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | -------------------------------------------------------------------------------- /research/Long_LLM/activation_beacon/data/config/zero3-infer.yaml: -------------------------------------------------------------------------------- 1 | compute_environment: LOCAL_MACHINE 2 | debug: false 3 | deepspeed_config: 4 | gradient_accumulation_steps: 1 5 | offload_optimizer_device: none 6 | offload_param_device: none 7 | zero3_init_flag: false 8 | zero3_save_16bit_model: true 9 | zero_stage: 3 10 | distributed_type: DEEPSPEED 11 | downcast_bf16: 'no' 12 | machine_rank: 0 13 | main_training_function: main 14 | mixed_precision: bf16 15 | num_machines: 1 16 | num_processes: 8 17 | rdzv_backend: static 18 | same_network: true 19 | tpu_env: [] 20 | tpu_use_cluster: false 21 | tpu_use_sudo: false 22 | use_cpu: false 23 | 
-------------------------------------------------------------------------------- /research/Long_LLM/activation_beacon/src/llama/__init__.py: -------------------------------------------------------------------------------- 1 | from .modeling_llama import LlamaForCausalLM 2 | from .configuration_llama import LlamaConfig -------------------------------------------------------------------------------- /research/Long_LLM/activation_beacon/src/mistral/__init__.py: -------------------------------------------------------------------------------- 1 | from .modeling_mistral import MistralForCausalLM 2 | from .configuration_mistral import MistralConfig -------------------------------------------------------------------------------- /research/Long_LLM/activation_beacon/src/qwen2/__init__.py: -------------------------------------------------------------------------------- 1 | from .modeling_qwen2 import Qwen2ForCausalLM 2 | from .configuration_qwen2 import Qwen2Config -------------------------------------------------------------------------------- /research/Long_LLM/longllm_qlora/data_pipeline/data/README.md: -------------------------------------------------------------------------------- 1 | This dictionary is used for saving processed data and results. -------------------------------------------------------------------------------- /research/Long_LLM/longllm_qlora/data_pipeline/raw_data/README.md: -------------------------------------------------------------------------------- 1 | This dictionary is used for saving raw data. -------------------------------------------------------------------------------- /research/Long_LLM/longllm_qlora/imgs/needle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/Long_LLM/longllm_qlora/imgs/needle.png -------------------------------------------------------------------------------- /research/MLVU/figs/statistic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/MLVU/figs/statistic.png -------------------------------------------------------------------------------- /research/MLVU/figs/task_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/MLVU/figs/task_example.png -------------------------------------------------------------------------------- /research/Matroyshka_reranker/finetune/compensation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/Matroyshka_reranker/finetune/compensation/__init__.py -------------------------------------------------------------------------------- /research/Matroyshka_reranker/finetune/self_distillation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/Matroyshka_reranker/finetune/self_distillation/__init__.py -------------------------------------------------------------------------------- /research/Matroyshka_reranker/inference/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/Matroyshka_reranker/inference/__init__.py -------------------------------------------------------------------------------- /research/Matroyshka_reranker/requirements.txt: -------------------------------------------------------------------------------- 1 | tiktoken==0.6.0 2 | tornado==6.4 3 | langchain_openai==0.0.6 4 | rapidfuzz==3.6.1 5 | sql_metadata==2.10.0 6 | func_timeout==4.3.5 7 | pandas==2.2.1 8 | sqlglot==22.1.1 9 | rank_bm25==0.2.2 10 | peft==0.10.0 11 | transformers==4.41.1 12 | jinja2 13 | datasets 14 | sentencepiece 15 | flash-attn 16 | modelscope 17 | deepspeed 18 | bitsandbytes -------------------------------------------------------------------------------- /research/Reinforced_IR/data_generation/agent/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt import GPTAgent 2 | from .vllm import LLMAgent 3 | from .vllm_instruct import LLMInstructAgent -------------------------------------------------------------------------------- /research/Reinforced_IR/data_generation/prompts/train_prompts.py: -------------------------------------------------------------------------------- 1 | generate_train_answer = """Please generate a brief answer to the given query according to the reference passage. 2 | 3 | Query: {query} 4 | 5 | Reference passage: {passage} 6 | 7 | Answer: """ 8 | 9 | generate_train_query = """Please generate a concise query from the following corpus. 10 | 11 | Corpus: {passage} 12 | 13 | Query: """ 14 | 15 | generate_train_query_type2 = """Generate a concise query using the key terms based on the following corpus. 16 | 17 | Corpus: {passage} 18 | 19 | Concise query: """ 20 | 21 | # The query is a user query and should be short. -------------------------------------------------------------------------------- /research/Reinforced_IR/finetune/generator/save_tokenizer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import json 4 | import copy 5 | 6 | from transformers import AutoTokenizer 7 | 8 | 9 | def parse_option(): 10 | parser = argparse.ArgumentParser("") 11 | 12 | parser.add_argument('--model_path', type=str, default=None) 13 | parser.add_argument('--output_path', type=str, default=None) 14 | 15 | opt = parser.parse_args() 16 | 17 | return opt 18 | 19 | 20 | def main(opt): 21 | model_path = opt.model_path 22 | output_path = opt.output_path 23 | 24 | tokenizer = AutoTokenizer.from_pretrained(model_path) 25 | tokenizer.save_pretrained(output_path) 26 | 27 | 28 | if __name__ == "__main__": 29 | opt = parse_option() 30 | main(opt) -------------------------------------------------------------------------------- /research/Reinforced_IR/finetune/retriever/arguments.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from FlagEmbedding.abc.finetune.embedder import AbsEmbedderDataArguments 3 | 4 | from FlagEmbedding.abc.finetune.embedder import ( 5 | AbsEmbedderTrainingArguments, 6 | AbsEmbedderModelArguments 7 | ) 8 | 9 | 10 | @dataclass 11 | class IREmbedderTrainingArguments(AbsEmbedderTrainingArguments): 12 | """ 13 | Training argument class for M3. 
14 | """ 15 | training_type: str = field(default='retrieval_answer', metadata={"help": "whether to use answer"}) 16 | answer_temperature: float = field(default=None, metadata={"help": "temperature for answer"}) 17 | normalize_answer: bool = field(default=True, metadata={"help": "normalize answer"}) 18 | 19 | @dataclass 20 | class IREmbedderDataArguments(AbsEmbedderDataArguments): 21 | """ 22 | Data argument class for M3. 23 | """ 24 | answer_inbatch: bool = field(default=False) 25 | -------------------------------------------------------------------------------- /research/Reinforced_IR/finetune/retriever/run.py: -------------------------------------------------------------------------------- 1 | from transformers import HfArgumentParser 2 | 3 | from FlagEmbedding.abc.finetune.embedder import AbsEmbedderModelArguments 4 | from runner import IREmbedderRunner 5 | from arguments import IREmbedderTrainingArguments, IREmbedderDataArguments 6 | 7 | 8 | if __name__ == '__main__': 9 | parser = HfArgumentParser((AbsEmbedderModelArguments, IREmbedderDataArguments, IREmbedderTrainingArguments)) 10 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 11 | model_args: AbsEmbedderModelArguments 12 | data_args: IREmbedderDataArguments 13 | training_args: IREmbedderTrainingArguments 14 | 15 | runner = IREmbedderRunner( 16 | model_args=model_args, 17 | data_args=data_args, 18 | training_args=training_args 19 | ) 20 | runner.run() 21 | -------------------------------------------------------------------------------- /research/Reinforced_IR/inference/agent/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt import GPTAgent 2 | from .vllm import LLMAgent 3 | from .vllm_instruct import LLMInstructAgent -------------------------------------------------------------------------------- /research/Reinforced_IR/requirements.txt: -------------------------------------------------------------------------------- 1 | FlagEmbedding 2 | vllm==0.7.1 3 | jinja2 4 | datasets 5 | sentencepiece 6 | modelscope 7 | peft 8 | deepspeed 9 | bitsandbytes -------------------------------------------------------------------------------- /research/baai_general_embedding/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/baai_general_embedding/__init__.py -------------------------------------------------------------------------------- /research/baai_general_embedding/finetune/__init__.py: -------------------------------------------------------------------------------- 1 | from .modeling import BiEncoderModel, EncoderOutput 2 | from .trainer import BiTrainer 3 | -------------------------------------------------------------------------------- /research/baai_general_embedding/retromae_pretrain/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/AmazonCounterfactualClassification.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "I wish I could have used this head set but the day I received it it wouldn't even turn on and I really wanted this product to work I'm very disappointed.", 4 | "response": "counterfactual" 5 | }, 6 | { 7 | "query": "I would advise that instead 
of trying to follow these poor instructions, Google it.", 8 | "response": "not-counterfactual" 9 | }, 10 | { 11 | "query": "I wrote to Monster customer service before ordering and they told me it would be fine to use without a converter and it was absolutely true.", 12 | "response": "not-counterfactual" 13 | } 14 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/AmazonPolarityClassification.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "Hunting the Hard Way Thia was a gift for my Husband, who loved the book. It arrived on the date we were told it would.", 4 | "response": "positive" 5 | }, 6 | { 7 | "query": "Poor DVD Has too many interviews with people at the Live THomas day in Penn. My kids were annoyed and hated this DVD.", 8 | "response": "negative" 9 | }, 10 | { 11 | "query": "Ludicrous and silly I remember getting this book so faintly that that says alot about my opinion of it. Basically, while I will entertain lots of odd ideas and theories, this book was basically silly.", 12 | "response": "negative" 13 | }, 14 | { 15 | "query": "Artistry I think that the Deodato concerts are very rich, as he used real strings and band musicians, as well as you can appreciate the John Tropea excelent renditions on guitar.", 16 | "response": "positive" 17 | } 18 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/ArxivClusteringS2S.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "A Survey on Graph Neural Networks: Algorithms and Applications", 4 | "response": "cs" 5 | }, 6 | { 7 | "query": "Hamiltonian Dynamics and KAM Theory for Infinite-Dimensional Systems", 8 | "response": "math" 9 | }, 10 | { 11 | "query": "Dark Matter Distribution in Dwarf Spheroidal Galaxies: Constraints from Stellar Kinematics", 12 | "response": "astro-ph" 13 | }, 14 | { 15 | "query": "Decoherence and Quantum Error Correction in Topological Quantum Computers", 16 | "response": "quant-ph" 17 | }, 18 | { 19 | "query": "Spin-Orbit Coupling Effects in Low-Dimensional Quantum Materials", 20 | "response": "cond-mat" 21 | } 22 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/BIOSSES.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "Recent studies have highlighted the crucial role of p53 in regulating cell cycle progression.", 4 | "response": "Recent research underscores p53's pivotal function in controlling cellular division." 5 | }, 6 | { 7 | "query": "Neuroscience has revealed intricate pathways linking dopamine to reward and motivation.", 8 | "response": "Recent neuroscientific findings have illuminated complex dopamine pathways associated with motivation and reward." 9 | }, 10 | { 11 | "query": "Stem cell research holds promise for treating a variety of degenerative diseases.", 12 | "response": "The potential of stem cell research in combating degenerative illnesses is widely recognized." 
13 | } 14 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/Banking77Classification.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "What is my money worth in other countries?", 4 | "response": "exchange_rate" 5 | }, 6 | { 7 | "query": "What can I do if my card still hasn't arrived after 2 weeks?", 8 | "response": "card_arrival" 9 | }, 10 | { 11 | "query": "Would I be able to open an account for my daughter?", 12 | "response": "age_limit" 13 | }, 14 | { 15 | "query": "My address details have changed and I want to update them", 16 | "response": "edit_personal_details" 17 | }, 18 | { 19 | "query": "If my cash withdrawal is still not showing, is something wrong?", 20 | "response": "pending_cash_withdrawal" 21 | }, 22 | { 23 | "query": "How long do transfers typically take? Is there a way of speeding the process up? My friend needs the money I sent her desperately.", 24 | "response": "transfer_not_received_by_recipient" 25 | } 26 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/BiorxivClusteringS2S.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "Neural Circuit Dynamics in Decision-Making: A Computational Model of Prefrontal-Striatal Interactions", 4 | "response": "neuroscience" 5 | }, 6 | { 7 | "query": "Metagenomic Insights into Extreme Environments: Microbial Diversity and Functional Adaptations in Antarctic Lakes", 8 | "response": "microbiology" 9 | }, 10 | { 11 | "query": "Machine Learning Approaches for Predicting Protein Structure and Function from Sequence Data", 12 | "response": "bioinformatics" 13 | }, 14 | { 15 | "query": "Regulation of Stem Cell Fate Decisions by the Hippo Signaling Pathway: Implications for Tissue Regeneration and Cancer Therapy", 16 | "response": "cell biology" 17 | }, 18 | { 19 | "query": "Optical Tweezers and Single-Molecule Force Spectroscopy: Probing Protein Folding Dynamics and Mechanical Properties of Biomolecules", 20 | "response": "biophysics" 21 | } 22 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/CQADupstackRetrieval.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "angularjs infinite scroll in a container", 4 | "response": "AngularJS ng-infinite-scroll not working on a specific container/div" 5 | }, 6 | { 7 | "query": "Java: Efficiently converting an array of longs to an array of bytes", 8 | "response": "Most Compact way to Serialize an Array of Longs in Java" 9 | }, 10 | { 11 | "query": "PyVISA missing methods", 12 | "response": "NI VISA + pyVisa on Mac OS X (Snow Leopard)" 13 | } 14 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/EmotionClassification.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "i am bothered is that he might changed his feelings once he get back in us and leave me heartbroken", 4 | "response": "sadness" 5 | }, 6 | { 7 | "query": "i have always loved my jobs and loved to work and i truly feel like being back there with my patients and co workers will do me a lot of good even if it is only for a few weeks", 8 | 
"response": "joy" 9 | }, 10 | { 11 | "query": "i certainly feel loved and appreciated and grateful for all that i have", 12 | "response": "love" 13 | }, 14 | { 15 | "query": "im grabbing a minute to post i feel greedy wrong", 16 | "response": "anger" 17 | }, 18 | { 19 | "query": "i was stymied a little bit as i wrote feeling unsure that i might go somewhere with the story unintended", 20 | "response": "fear" 21 | }, 22 | { 23 | "query": "i keep feeling pleasantly surprised at his supportiveness and also his ease in new situations", 24 | "response": "surprise" 25 | } 26 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/MTOPDomainClassification.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "I am no longer available", 4 | "response": "calling" 5 | }, 6 | { 7 | "query": "Cancel my reminder about my dentist appointment", 8 | "response": "reminder" 9 | }, 10 | { 11 | "query": "Will it rain tomorrow?", 12 | "response": "weather" 13 | }, 14 | { 15 | "query": "Create an appointment alarm for 11:30am.", 16 | "response": "allarm" 17 | }, 18 | { 19 | "query": "Play a different playlist", 20 | "response": "music" 21 | }, 22 | { 23 | "query": "What's the best way to fry chicken", 24 | "response": "recipes" 25 | }, 26 | { 27 | "query": "what city does Ahmed live in ?", 28 | "response": "people" 29 | } 30 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/MTOPIntentClassification.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "When will my next alarm start", 4 | "response": "GET_ALARM" 5 | }, 6 | { 7 | "query": "I need you to message Zachary Fletcher", 8 | "response": "SEND_MESSAGE" 9 | }, 10 | { 11 | "query": "show me video messages from Atlas", 12 | "response": "GET_MESSAGE" 13 | }, 14 | { 15 | "query": "I want to listen to AC/DC please", 16 | "response": "PLAY_MUSIC" 17 | }, 18 | { 19 | "query": "Make an alarm for the next 7 weeks for Thursday at 6pm", 20 | "response": "CREATE_ALARM" 21 | }, 22 | { 23 | "query": "fairs happening in ann arbor next week", 24 | "response": "GET_EVENT" 25 | }, 26 | { 27 | "query": "Will we get a frost this week?", 28 | "response": "GET_WEATHER" 29 | } 30 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/MassiveIntentClassification.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "remind me to pay rent every month", 4 | "response": "calendar_set" 5 | }, 6 | { 7 | "query": "please play yesterday from beatles", 8 | "response": "play_music" 9 | }, 10 | { 11 | "query": "what will the temperatures be for the next week", 12 | "response": "weather_query" 13 | }, 14 | { 15 | "query": "give me the detailed schedule for next week", 16 | "response": "calendar_query" 17 | }, 18 | { 19 | "query": "what's happening in my day", 20 | "response": "general_quirky" 21 | }, 22 | { 23 | "query": "dolores how was your day", 24 | "response": "general_quirky" 25 | }, 26 | { 27 | "query": "who was appointed as deputy centimeter of uttar pradesh", 28 | "response": "qa_factoid" 29 | }, 30 | { 31 | "query": "find me news about trumps speech", 32 | "response": "news_query" 33 | } 34 | ] 
-------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/MassiveScenarioClassification.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "can you confirm that my meeting for tomorrow has been canceled", 4 | "response": "calendar" 5 | }, 6 | { 7 | "query": "please open my music application and play games by disturbed", 8 | "response": "play" 9 | }, 10 | { 11 | "query": "what's the word orange mean", 12 | "response": "qa" 13 | }, 14 | { 15 | "query": "find me all mails from magda with holidays word in the title", 16 | "response": "email" 17 | }, 18 | { 19 | "query": "get a cup of coffee ready now", 20 | "response": "iot" 21 | }, 22 | { 23 | "query": "good morning olly", 24 | "response": "general" 25 | } 26 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/MedrxivClusteringS2S.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "Longitudinal Analysis of SARS-CoV-2 Neutralizing Antibody Titers and Viral Load in Asymptomatic and Symptomatic Patients", 4 | "response": "infectious diseases" 5 | }, 6 | { 7 | "query": "Impact of Public Health Messaging and Community Engagement on Vaccination Uptake During the COVID-19 Pandemic", 8 | "response": "epidemiology" 9 | }, 10 | { 11 | "query": "Long-term Effects of Ambient Temperature on COPD Hospitalizations: A Population-based Analysis in Northern Europe", 12 | "response": "public and global health" 13 | }, 14 | { 15 | "query": "Genomic Landscape of Rare Genetic Disorders Revealed through Whole-Exome Sequencing in Pediatric Populations", 16 | "response": "genetic and genomic medicine" 17 | }, 18 | { 19 | "query": "Impact of Gut Microbiota on Neuroinflammation and Cognitive Function in Multiple Sclerosis Patients: A Prospective Study", 20 | "response": "neurology" 21 | } 22 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/MindSmallReranking.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "'Wheel Of Fortune' Guest Delivers Hilarious, Off The Rails Introduction", 4 | "response": "Charles Rogers, former Michigan State football, Detroit Lions star, dead at 38" 5 | }, 6 | { 7 | "query": "Eliud Kipchoge runs 1:59 marathon, first to break 2 hours", 8 | "response": "AP-NORC poll: Many youths say high school diploma is enough" 9 | } 10 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/QuoraRetrieval.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "Why do people say Dhanush (South Indian actor) is ugly? I don't think so.?", 4 | "response": "Why do people say Dhanush (South Indian actor) is ugly? I don't think so?" 5 | }, 6 | { 7 | "query": "What are some hit and nice ideas about architecture dissertation topics?", 8 | "response": "What are some interesting undergraduate architecture thesis topics?" 9 | }, 10 | { 11 | "query": "Could someone please motivate me?", 12 | "response": "Can you motivate me?" 
13 | } 14 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/RedditClustering.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "Financial Meltdown: Strategies for Surviving Economic Collapse", 4 | "response": "collapse.txt" 5 | }, 6 | { 7 | "query": "Exclusive Comic Book Sale: Don't Miss Out on January 13th!", 8 | "response": "comicbooks.txt" 9 | }, 10 | { 11 | "query": "Tchaikovsky's Untold Story: The Mystery Behind Symphony No. 7", 12 | "response": "classicalmusic.txt" 13 | }, 14 | { 15 | "query": "Coffee Addiction: When It's More Than Just a Drink", 16 | "response": "Coffee.txt" 17 | }, 18 | { 19 | "query": "Understanding Boeing's Micro-Missile Capabilities", 20 | "response": "aviation.txt" 21 | } 22 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/SICK-R.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "The cat is lounging on the sunny windowsill.", 4 | "response": "The feline is resting on the sunny windowsill." 5 | }, 6 | { 7 | "query": "A woman is reading a book while sitting on a bench.", 8 | "response": "A lady is reading a book while seated on a bench." 9 | }, 10 | { 11 | "query": "The child is drawing with crayons on a piece of paper.", 12 | "response": "The kid is using crayons to draw on a sheet of paper." 13 | } 14 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/STS12.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "A man is dancing on the ceiling.", 4 | "response": "A man is dancing on the ceiling of a room." 5 | }, 6 | { 7 | "query": "That is a shameful state of affairs when we consider that the EU itself is a champion of modernised business practice.", 8 | "response": "It is a shame when it is thought that the European Union is posed as a champion modernization of the economic life!" 9 | }, 10 | { 11 | "query": "Spain has done a magnificent job in turning round the difficult neighbourly relations which Europe and North Africa and Spain and Morocco have suffered during the course of history.", 12 | "response": "Spain has developed a remarkably positive the difficult neighbourhood which has always existed between Europe and North Africa and between Spain and Morocco." 13 | } 14 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/STS13.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "the state of being exposed to danger or harm", 4 | "response": "the condition of being at risk of injury or loss." 5 | }, 6 | { 7 | "query": "a set of instructions for a computer", 8 | "response": "directions given to a computer to perform a specific task." 9 | }, 10 | { 11 | "query": "a building used for public worship", 12 | "response": "a place where people gather to worship collectively." 
13 | } 14 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/STS14.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "president obama vows to work with congress on immigration reform .", 4 | "response": "obama pledges to collaborate with congress on immigration overhaul ." 5 | }, 6 | { 7 | "query": "britain votes to leave european union .", 8 | "response": "uk votes to leave eu ." 9 | }, 10 | { 11 | "query": "russian president putin signs law banning adoption of russian children by u.s. citizens .", 12 | "response": "putin bans u.s. adoptions of russian children ." 13 | } 14 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/STS15.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "The battery and bulb A are not in the same path", 4 | "response": "Bulb A and the battery are not in the same circuit." 5 | }, 6 | { 7 | "query": "Switch Y and bulb B are in the same loop", 8 | "response": "Switch Y and bulb B belong to the same circuit." 9 | }, 10 | { 11 | "query": "new york city marathon canceled due to hurricane sandy", 12 | "response": "nyc marathon canceled because of hurricane sandy" 13 | }, 14 | { 15 | "query": "pope francis calls for peace in syria during sunday address", 16 | "response": "pope francis appeals for peace in syria in his sunday speech" 17 | } 18 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/STS16.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "what are the symptoms of a heart attack ?", 4 | "response": "what are the signs of a heart attack ?" 5 | }, 6 | { 7 | "query": "how do i change a flat tire on my car ?", 8 | "response": "what steps should i take to replace a flat tire ?" 9 | }, 10 | { 11 | "query": "how do i cook a medium rare steak ?", 12 | "response": "what's the best way to prepare a steak to medium rare ?" 13 | } 14 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/STS17.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "The sun is setting over the mountains.", 4 | "response": " \"The sun sets behind the mountains.\"" 5 | }, 6 | { 7 | "query": "A child is playing with a red ball.", 8 | "response": " \"A kid plays with a red ball.\"" 9 | }, 10 | { 11 | "query": "Two people are sitting on a bench in the park.", 12 | "response": " \"Two individuals are seated on a bench in the park.\"" 13 | } 14 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/STSBenchmark.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "Agribusiness: Mad cow disease found in California", 4 | "response": "USDA Confirms Case of Mad Cow Disease in California" 5 | }, 6 | { 7 | "query": "santos stated colombian police found the evidence in 2 computers discovered with slain rebel leader raul reyes. ", 8 | "response": "francisco santos stated that colombian police found the evidence on two computers discovered with raul reyes." 
9 | }, 10 | { 11 | "query": "US Attorney General Holder resigns", 12 | "response": "US Attorney general Eric Holder to resign" 13 | } 14 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/SprintDuplicateQuestions.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "Kyocera duraforce pro international roaming settings", 4 | "response": "Make a call while roaming internationally - Kyocera DuraForce PRO" 5 | }, 6 | { 7 | "query": "Guide for connecting to the Sprint U301 USB mobile broadband", 8 | "response": "Turn automatic connections on or off - Sprint U301 USB Device Sprint 3G/4G Mobile Broadband" 9 | }, 10 | { 11 | "query": "What do you think is a reason that is preventing troubleshooting on my HTC One A9 related to issues to the mobile hotspots ?", 12 | "response": "Troubleshoot issues related to mobile hotspots and your HTC One A9" 13 | }, 14 | { 15 | "query": "Why has my Samsung Transform been freezing everytime I attempt to open up an app ?", 16 | "response": "Why is my Samsung Transform freezing or being unresponsive ?" 17 | }, 18 | { 19 | "query": "What can I do to turn on Wi-Fi on the HTC One A9 ?", 20 | "response": "Turn on and connect to Wi-Fi - HTC One A9" 21 | } 22 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/StackExchangeClustering.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "Recommendations for a lightweight Markdown editor with real-time collaboration features?", 4 | "response": "softwarerecs.stackexchange.com.txt" 5 | }, 6 | { 7 | "query": "How to integrate external APIs with EOSIO blockchain applications?", 8 | "response": "eosio.stackexchange.com.txt" 9 | }, 10 | { 11 | "query": "How to balance macros for effective fat loss and muscle retention?", 12 | "response": "fitness.stackexchange.com.txt" 13 | }, 14 | { 15 | "query": "Can amans\" be used as a substantival participle in Latin?\"", 16 | "response": "latin.stackexchange.com.txt" 17 | }, 18 | { 19 | "query": "Is it normal for a 2018 Audi A4 to consume coolant frequently?", 20 | "response": "mechanics.stackexchange.com.txt" 21 | } 22 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/StackOverflowDupQuestions.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "How to handle onChange event in React when state changes programmatically?", 4 | "response": "React onChange event not firing when state is updated programmatically" 5 | }, 6 | { 7 | "query": "How to simulate a click event on a button using JavaScript?", 8 | "response": "JavaScript button click event simulation" 9 | }, 10 | { 11 | "query": "Python: How to run a function asynchronously using asyncio?", 12 | "response": "Asyncio: Running Python function asynchronously" 13 | } 14 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/SummEval.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "passenger jin pai , 35 , was standing on the rim of a toilet when it collapsed , leaving him hospitalised with deep cuts on his leg and buttocks after he broke a toilet he was 
squatting on . passenger jin pai , 35 , was standing on the rim of a toilet when it smashed to the ground . according to airport officials he had not wanted to let his bottom touch the seat because he was ' worried it might not be clean ' .", 4 | "response": "Jin Pai was standing on rim of a toilet in Hefei Xinqiao International Airport. The porcelain toilet then tipped over and shattered on the floor. The 35-year-old is left with deep cuts to his leg and buttocks." 5 | } 6 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/TweetSentimentExtractionClassification.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "I`d have responded, if I were going", 4 | "response": "neutral" 5 | }, 6 | { 7 | "query": "what interview! leave me alone", 8 | "response": "negative" 9 | }, 10 | { 11 | "query": "2am feedings for the baby are fun when he is all smiles and coos", 12 | "response": "positive" 13 | }, 14 | { 15 | "query": "is cleaning the house for her family who is comming later today..", 16 | "response": "neutral" 17 | }, 18 | { 19 | "query": "Sick. With a flu like thing.", 20 | "response": "negative" 21 | }, 22 | { 23 | "query": "We saw that in none 3D - the baddie`s the best", 24 | "response": "positive" 25 | } 26 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/TwentyNewsgroupsClustering.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "Major flaw discovered in widely-used encryption protocol", 4 | "response": "sci.crypt" 5 | }, 6 | { 7 | "query": "Bruins' Unstoppable Winning Streak", 8 | "response": "rec.sport.hockey" 9 | }, 10 | { 11 | "query": "Comparing Windows File Systems: NTFS vs. FAT32 vs. exFAT", 12 | "response": "comp.os.ms-windows.misc" 13 | }, 14 | { 15 | "query": "Troubleshooting a Digital Multimeter Calibration Issue", 16 | "response": "sci.electronics" 17 | }, 18 | { 19 | "query": "Understanding DPI Scaling in X Window Systems", 20 | "response": "comp.windows.x" 21 | } 22 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/TwitterSemEval2015.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "Excited for the new Game of Thrones episode tonight!", 4 | "response": "Can't wait for tonight's Game of Thrones episode!" 5 | }, 6 | { 7 | "query": "Just finished a 5k run and feel amazing!", 8 | "response": "Completed a 5k run and I'm feeling great!" 9 | }, 10 | { 11 | "query": "Had an incredible dinner at Joe's Italian Restaurant.", 12 | "response": "Joe's Italian Restaurant served an amazing dinner tonight." 13 | }, 14 | { 15 | "query": "I need a vacation. Can't wait to hit the beach.", 16 | "response": "Desperately need a holiday. Looking forward to beach time." 17 | }, 18 | { 19 | "query": "The new iPhone has some fantastic features!", 20 | "response": "Loving the features on the new iPhone!" 
21 | } 22 | ] -------------------------------------------------------------------------------- /research/llm_dense_retriever/examples/bge-en-icl/MTEB/TwitterURLCorpus.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query": "Elon Musk says Tesla will be profitable next quarter.", 4 | "response": "Elon Musk claims Tesla will turn a profit next quarter." 5 | }, 6 | { 7 | "query": "The new iPhone just got announced and it's amazing.", 8 | "response": "Apple just unveiled the new iPhone and it's incredible." 9 | }, 10 | { 11 | "query": "Beyonc\u00e9's new album has topped the charts in its first week.", 12 | "response": "Beyonc\u00e9's latest album debuted at number one on the charts." 13 | }, 14 | { 15 | "query": "Breaking: Major earthquake hits California.", 16 | "response": "Just in: Large earthquake strikes California." 17 | }, 18 | { 19 | "query": "NASA plans to send humans to Mars by 2030.", 20 | "response": "NASA aims to have astronauts on Mars by the year 2030." 21 | } 22 | ] -------------------------------------------------------------------------------- /research/llm_embedder/evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/llm_embedder/evaluation/__init__.py -------------------------------------------------------------------------------- /research/llm_embedder/imgs/llm-embedder.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/llm_embedder/imgs/llm-embedder.png -------------------------------------------------------------------------------- /research/llm_embedder/src/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | logging.basicConfig( 3 | level=logging.INFO, 4 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 5 | datefmt="%m/%d/%Y %H:%M:%S", 6 | ) 7 | 8 | # import transformers 9 | # transformers.logging.set_verbosity_error() 10 | -------------------------------------------------------------------------------- /research/llm_embedder/src/lm/__init__.py: -------------------------------------------------------------------------------- 1 | from .args import LMArgs, SRLMArgs, GenerationArgs 2 | from .modeling_lm import LM 3 | from .modeling_srlm import SelfRetrievalLM 4 | -------------------------------------------------------------------------------- /research/llm_embedder/src/retrieval/__init__.py: -------------------------------------------------------------------------------- 1 | from .args import RetrievalArgs, RankerArgs 2 | from .modeling_dense import DenseRetriever 3 | from .modeling_bm25 import BM25Retriever, NaiveBM25Retriever 4 | from .modeling_unified import Retriever 5 | from .modeling_ranker import CrossEncoder 6 | from .metrics import RetrievalMetric 7 | from .data import RetrievalDataset, RetrievalDataCollator, TASK_CONFIG 8 | -------------------------------------------------------------------------------- /research/llm_embedder/src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .util import FileLogger, Sequential_Sampler, DatasetProcessFn, DefaultDataCollator, makedirs, split_file_dir_name_ext, clear_dir, get_max_length_in_nested_lists, pad_nested_lists, mask_nested_lists, 
are_elements_of_same_length, normalize_text, load_json, save_json, load_pickle, save_pickle, add_eos, remove_eos -------------------------------------------------------------------------------- /research/llm_reranker/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /research/llm_reranker/evaluation/BEIR-bge-en-v1.5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/llm_reranker/evaluation/BEIR-bge-en-v1.5.png -------------------------------------------------------------------------------- /research/llm_reranker/evaluation/BEIR-e5-mistral.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/llm_reranker/evaluation/BEIR-e5-mistral.png -------------------------------------------------------------------------------- /research/llm_reranker/evaluation/CMTEB-retrieval-bge-zh-v1.5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/llm_reranker/evaluation/CMTEB-retrieval-bge-zh-v1.5.png -------------------------------------------------------------------------------- /research/llm_reranker/evaluation/llama-index.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/llm_reranker/evaluation/llama-index.png -------------------------------------------------------------------------------- /research/llm_reranker/evaluation/miracl-bge-m3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/llm_reranker/evaluation/miracl-bge-m3.png -------------------------------------------------------------------------------- /research/llm_reranker/finetune_for_instruction/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /research/llm_reranker/finetune_for_layerwise/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /research/llm_reranker/merge/__init__.py: -------------------------------------------------------------------------------- 1 | from .merge_base_model import merge_llm 2 | from .merge_layerwise_model_from_raw_model import merge_layerwise_raw_llm 3 | from .merge_layerwise_model_from_finetuned_model import merge_layerwise_finetuned_llm -------------------------------------------------------------------------------- /research/old-examples/finetune/ds_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 12, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | 11 | "bf16": { 12 | "enabled": "auto" 13 | }, 14 | 15 | "optimizer": { 16 | "type": "AdamW", 17 | "params": { 18 
| "lr": "auto", 19 | "betas": "auto", 20 | "eps": "auto", 21 | "weight_decay": "auto" 22 | } 23 | }, 24 | 25 | "scheduler": { 26 | "type": "WarmupDecayLR", 27 | "params": { 28 | "warmup_min_lr": "auto", 29 | "warmup_max_lr": "auto", 30 | "warmup_num_steps": "auto", 31 | "total_num_steps": "auto" 32 | } 33 | }, 34 | 35 | "zero_optimization": { 36 | "stage": 0 37 | }, 38 | 39 | "gradient_accumulation_steps": "auto", 40 | "gradient_clipping": "auto", 41 | "steps_per_print": 100, 42 | "train_batch_size": "auto", 43 | "train_micro_batch_size_per_gpu": "auto", 44 | "wall_clock_breakdown": false 45 | } 46 | -------------------------------------------------------------------------------- /research/old-examples/finetune/toy_evaluation_data/toy_corpus.json: -------------------------------------------------------------------------------- 1 | {"content": "A is ..."} 2 | {"content": "B is ..."} 3 | {"content": "C is ..."} 4 | {"content": "Panda is ..."} 5 | {"content": "... is A"} -------------------------------------------------------------------------------- /research/old-examples/finetune/toy_evaluation_data/toy_query.json: -------------------------------------------------------------------------------- 1 | {"query": "What is A?", "positive": ["A is ...", "... is A"]} 2 | {"query": "What is B?", "positive": ["B is ..."]} 3 | {"query": "What is C?", "positive": ["C is ..."]} -------------------------------------------------------------------------------- /research/old-examples/pretrain/retromae_pretrain/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /research/old-examples/search_demo/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/old-examples/search_demo/__init__.py -------------------------------------------------------------------------------- /research/old-examples/search_demo/arguments.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | 3 | 4 | @dataclass 5 | class ModelArguments: 6 | model_name_or_path: str = field( 7 | default='BAAI/bge-large-zh-noinstruct', 8 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 9 | ) 10 | 11 | 12 | @dataclass 13 | class DataArguments: 14 | data_path: str = field( 15 | default='./data', metadata={"help": "Path to wikipedia-22-12"} 16 | ) 17 | -------------------------------------------------------------------------------- /research/old-examples/search_demo/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets==2.14.0 2 | faiss-gpu==1.7.2 3 | langchain==0.0.244 4 | numpy==1.23.3 5 | pyserini==0.21.0 6 | tiktoken==0.4.0 7 | torch==2.0.1 8 | torch_geometric==2.3.1 9 | tqdm==4.65.0 10 | transformers==4.30.2 11 | openai==0.27.4 12 | urllib3==1.25.11 -------------------------------------------------------------------------------- /research/old-examples/search_demo/run.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/old-examples/search_demo/run.py -------------------------------------------------------------------------------- 
/research/old-examples/search_demo/tool.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/old-examples/search_demo/tool.py -------------------------------------------------------------------------------- /research/reranker/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /research/visual_bge/__init__.py: -------------------------------------------------------------------------------- 1 | from .modeling import Visualized_BGE -------------------------------------------------------------------------------- /research/visual_bge/imgs/SFT-CIRR.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/visual_bge/imgs/SFT-CIRR.png -------------------------------------------------------------------------------- /research/visual_bge/imgs/SFT-ReMuQ.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/visual_bge/imgs/SFT-ReMuQ.png -------------------------------------------------------------------------------- /research/visual_bge/imgs/SFT-WebQA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/visual_bge/imgs/SFT-WebQA.png -------------------------------------------------------------------------------- /research/visual_bge/imgs/cir_candi_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/visual_bge/imgs/cir_candi_1.png -------------------------------------------------------------------------------- /research/visual_bge/imgs/cir_candi_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/visual_bge/imgs/cir_candi_2.png -------------------------------------------------------------------------------- /research/visual_bge/imgs/cir_query.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/visual_bge/imgs/cir_query.png -------------------------------------------------------------------------------- /research/visual_bge/imgs/wiki_candi_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/visual_bge/imgs/wiki_candi_1.jpg -------------------------------------------------------------------------------- /research/visual_bge/imgs/wiki_candi_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/visual_bge/imgs/wiki_candi_2.jpg -------------------------------------------------------------------------------- 
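The visual_bge package re-exports Visualized_BGE from visual_bge.modeling (see its __init__.py earlier in this listing). A rough usage sketch follows; the constructor arguments, the weight-file name, and the encode() signature are assumptions drawn from the project's documentation rather than from this listing, so verify them against the visual_bge README before relying on them.

import torch
from visual_bge.modeling import Visualized_BGE  # re-exported by visual_bge/__init__.py above

# All names below are illustrative: the backbone id and local weight path are placeholders.
model = Visualized_BGE(
    model_name_bge="BAAI/bge-base-en-v1.5",
    model_weight="Visualized_base_en_v1.5.pth",
)
model.eval()

with torch.no_grad():
    # Composed image+text query versus an image-only candidate (composed image retrieval).
    query_emb = model.encode(image="imgs/cir_query.png", text="make the background a dark sky")
    cand_emb = model.encode(image="imgs/cir_candi_1.png")

print(query_emb @ cand_emb.T)  # similarity score; embeddings are assumed to be normalized
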
/research/visual_bge/imgs/zs-benchmark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/visual_bge/imgs/zs-benchmark.png -------------------------------------------------------------------------------- /research/visual_bge/imgs/zs-performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/visual_bge/imgs/zs-performance.png -------------------------------------------------------------------------------- /research/visual_bge/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup( 4 | name="visual_bge", 5 | version="0.1.0", 6 | description='visual_bge', 7 | long_description="./README.md", 8 | long_description_content_type="text/markdown", 9 | url='https://github.com/FlagOpen/FlagEmbedding/tree/master/research/visual_bge', 10 | packages=find_packages(), 11 | install_requires=[ 12 | 'torchvision', 13 | 'timm', 14 | 'einops', 15 | 'ftfy' 16 | ], 17 | python_requires='>=3.6', 18 | ) 19 | -------------------------------------------------------------------------------- /research/visual_bge/visual_bge/eva_clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 2 | from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer, create_eva_vision_and_transforms 3 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint 4 | from .loss import ClipLoss 5 | from .model import CLIP, CustomCLIP, CLIPTextCfg, CLIPVisionCfg,\ 6 | convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype 7 | from .openai import load_openai_model, list_openai_models 8 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model,\ 9 | get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained 10 | from .tokenizer import SimpleTokenizer, tokenize 11 | from .transform import image_transform -------------------------------------------------------------------------------- /research/visual_bge/visual_bge/eva_clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlagOpen/FlagEmbedding/5e64baa61e75df23105a66d1e9d09ad799366e2a/research/visual_bge/visual_bge/eva_clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /research/visual_bge/visual_bge/eva_clip/constants.py: -------------------------------------------------------------------------------- 1 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) 2 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) 3 | -------------------------------------------------------------------------------- /research/visual_bge/visual_bge/eva_clip/model_configs/EVA01-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16, 8 | "eva_model_name": "eva-clip-b-16", 9 | "ls_init_value": 0.1, 10 | 
"drop_path_rate": 0.0 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /research/visual_bge/visual_bge/eva_clip/model_configs/EVA01-CLIP-g-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 1024, 19 | "heads": 16, 20 | "layers": 24, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /research/visual_bge/visual_bge/eva_clip/model_configs/EVA01-CLIP-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0.4, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 768, 19 | "heads": 12, 20 | "layers": 12, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /research/visual_bge/visual_bge/eva_clip/model_configs/EVA02-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "head_width": 64, 8 | "patch_size": 16, 9 | "mlp_ratio": 2.6667, 10 | "eva_model_name": "eva-clip-b-16-X", 11 | "drop_path_rate": 0.0, 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true, 19 | "patch_dropout": 0.5 20 | }, 21 | "text_cfg": { 22 | "context_length": 77, 23 | "vocab_size": 49408, 24 | "width": 512, 25 | "heads": 8, 26 | "layers": 12, 27 | "xattn": true, 28 | "fusedLN": true 29 | } 30 | } -------------------------------------------------------------------------------- /research/visual_bge/visual_bge/eva_clip/model_configs/EVA02-CLIP-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14-336", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /research/visual_bge/visual_bge/eva_clip/model_configs/EVA02-CLIP-L-14.json: -------------------------------------------------------------------------------- 1 | { 
2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /research/visual_bge/visual_bge/eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /research/visual_bge/visual_bge/eva_clip/model_configs/EVA02-CLIP-bigE-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1024, 20 | "heads": 16, 21 | "layers": 24, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("README.md", mode="r", encoding="utf-8") as readme_file: 4 | readme = readme_file.read() 5 | 6 | setup( 7 | name='FlagEmbedding', 8 | version='1.3.5', 9 | description='FlagEmbedding', 10 | long_description=readme, 11 | long_description_content_type="text/markdown", 12 | author_email='2906698981@qq.com', 13 | url='https://github.com/FlagOpen/FlagEmbedding', 14 | packages=find_packages(), 15 | include_package_data=True, 16 | install_requires=[ 17 | 'torch>=1.6.0', 18 | 'transformers>=4.44.2', 19 | 'datasets>=2.19.0', 20 | 'accelerate>=0.20.1', 21 | 'sentence_transformers', 22 | 'peft', 23 | 'ir-datasets', 24 | 'sentencepiece', 25 | 'protobuf' 26 | ], 27 | extras_require={ 28 | 'finetune': ['deepspeed', 'flash-attn'], 29 | }, 30 | ) 31 | --------------------------------------------------------------------------------