├── requirements.txt
├── images
    ├── rag.png
    └── rag1.png
├── .gitignore
├── healthcheck.py
├── LICENSE
├── .github
    └── workflows
    │   └── sync-papers.yml
├── evaluation.md
├── validator.md
├── README.md
├── rewriter.md
├── generator.md
├── mediation.md
└── ranker.md


/requirements.txt:
--------------------------------------------------------------------------------
1 | requests>=2.19.1
2 | markdown>=2.6.11


--------------------------------------------------------------------------------
/images/rag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gomate-community/awesome-papers-for-rag/HEAD/images/rag.png


--------------------------------------------------------------------------------
/images/rag1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gomate-community/awesome-papers-for-rag/HEAD/images/rag1.png


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.pyc
 2 | *.log
 3 | *.swp
 4 | *.bak
 5 | *.weights
 6 | *.DS_Store
 7 | .vscode
 8 | .coverage*
 9 | .idea/
10 | .pytest_cache/
11 | .cache
12 | 


--------------------------------------------------------------------------------
/healthcheck.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import time
 3 | import requests
 4 | import markdown
 5 | 
 6 | pattern = 'https?://(?:[-\w.]|(?:%[\da-fA-F]{2})).+?(?=">)'
 7 | 
 8 | with open('./README.md') as f:
 9 | 	document = markdown.markdown(f.read())
10 | 	uris = re.findall(pattern, document)
11 | 	print(len(uris))
12 | 	for uri in uris:
13 | 		try:
14 | 			r = requests.get(uri)
15 | 			print(f'uri {uri} with status: {r.status_code}')
16 | 		except:
17 | 			print("Connection refused by the server..")
18 | 			time.sleep(5)
19 | 			uris.append(uri)


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 GoMate
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/.github/workflows/sync-papers.yml:
--------------------------------------------------------------------------------
 1 | name: Sync Papers and Components to Website
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - main
 7 | 
 8 | jobs:
 9 |   sync:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |       - name: Checkout awesome-papers-for-rag (Source Repo)
13 |         uses: actions/checkout@v3 # 步骤1：克隆您的源仓库 (awesome-papers-for-rag)
14 | 
15 |       # --- 关键更改：使用 actions/checkout 克隆目标仓库，并传递令牌 ---
16 |       - name: Checkout target repo (gomate-community.github.io)
17 |         uses: actions/checkout@v3
18 |         with:
19 |           repository: gomate-community/gomate-community.github.io # 指定要克隆的目标仓库
20 |           path: gomate-community.github.io # 将目标仓库克隆到此目录
21 |           token: ${{ secrets.GH_TOKEN }} # <--- 这是最关键的一行，确保令牌被正确使用！
22 |           # 如果您之前添加了其他参数，可以根据需要保留，但确保 token 参数存在
23 |           # ssh-strict: true
24 |           # persist-credentials: true
25 |           # clean: true
26 |           # sparse-checkout-cone-mode: true
27 |           # fetch-depth: 1
28 |           # fetch-tags: false
29 |           # lfs: false
30 |           # submodules: false
31 |           # set-safe-directory: true
32 | 
33 |       - name: Sync files
34 |         run: |
35 |           # 复制文件：从当前源仓库目录复制到克隆的目标仓库目录
36 |           cp README.md gomate-community.github.io/_pages/awesome-papers-for-rag/README.md
37 |           cp rewriter.md gomate-community.github.io/_pages/awesome-papers-for-rag/rewriter.md
38 |           cp ranker.md gomate-community.github.io/_pages/awesome-papers-for-rag/ranker.md
39 |           cp mediation.md gomate-community.github.io/_pages/awesome-papers-for-rag/mediation.md
40 |           cp generator.md gomate-community.github.io/_pages/awesome-papers-for-rag/generator.md
41 |           cp validator.md gomate-community.github.io/_pages/awesome-papers-for-rag/validator.md
42 |           cp evaluation.md gomate-community.github.io/_pages/awesome-papers-for-rag/evaluation.md
43 |           cp images/rag1.png gomate-community.github.io/assets/rag1.png
44 | 
45 |       - name: Commit & push changes
46 |         run: |
47 |           cd gomate-community.github.io # 进入目标仓库的目录
48 |           git add .
49 | 
50 |           # 检查是否有需要提交的更改
51 |           if git diff --cached --quiet; then
52 |             echo "No changes to commit."
53 |           else
54 |             # 确保在当前仓库内设置了提交者信息
55 |             # 如果在 actions/checkout 步骤中设置了 user.email 和 user.name，这里可以省略
56 |             git config user.email ${{ secrets.USER_EMAIL }}
57 |             git config user.name ${{ secrets.USER_NAME }}
58 | 
59 |             git commit -m "Sync _pages/awesome-papers-for-rag from awesome-papers-for-rag [auto]"
60 |             # 由于前面的 actions/checkout 已经用 token 配置了远程，这里直接 push 即可
61 |             git push origin main
62 |             echo "Succeed in sync files from awesome-papers-from-rag"
63 |           fi
64 | 


--------------------------------------------------------------------------------
/evaluation.md:
--------------------------------------------------------------------------------
 1 | 
 2 | ## 1. Survey papers <a id="survey"></a>
 3 | 
 4 | | **Date** | **Title** | **Organization**  |  **Code**  |
 5 | | :-----------: | :-------------: | :----------------------: |  :----------------------: |
 6 | |2025/04/10|[LLM-based NLG Evaluation: Current Status and Challenges](https://direct.mit.edu/coli/article/doi/10.1162/coli_a_00561/128807)|Peking University|-|
 7 | |2024/05/13|[Evaluation of Retrieval-Augmented Generation: A Survey](https://arxiv.org/pdf/2405.07437)|Tencent|[Code](https://github.com/vasiliskatr/faaf)<br>![](https://img.shields.io/github/stars/YHPeter/Awesome-RAG-Evaluation.svg?style=social)|
 8 | |2024/01/30|[RAG vs Fine-tuning: Pipelines, Tradeoffs, and a Case Study on Agriculture](https://arxiv.org/abs/2401.08406)|Microsoft|No|
 9 | 
10 | 
11 | ## 2 Evaluation papers <a id="short"></a>
12 | <br>
13 | 
14 | ### 2.1 Short answer evaluation <a id="aspect"></a>
15 | 
16 | ### 2.2 Long answer evaluation <a id="short"></a>
17 | 
18 | | **Date** | **Title** | **Organization**  |  **Method**  | **Metric**  | **Dataset** |  **Code**  |
19 | | :-----------: | :-------------: | :----------------------: |  :----------------------: |:----------------------: |:----------------------: |:----------------------: |
20 | |2023/09/15|[Investigating Answerability of LLMs for Long-Form Question Answering](https://arxiv.org/pdf/2309.08210)|Salesforce|**Prompting GPT-4** to rate answers on a scale from 0 to 3.| <details><summary><small>Coherency, Relevance, Factual consistency, and Accuracy</small></summary><small> **Coherency**: Answer should be well-structured and well-organized and should not just be a heap of related information. **Relevance**: Answer should be relevant to the question and the context. The answer should be concise and avoid drifting from the question being asked. **Factual consistency**: The context should be the primary source for the answer. The answer should not contain fabricated facts and should entail information present in the context. **Accuracy**: Answer should be satisfactory and complete to the question being asked. Measure the correctness of the answer by checking if the response answers the presented question.</small></details> |-|-|
21 |  
22 | ### 2.3 Context evaluation <a id="short"></a>
23 | 
24 | ### 2.4 Documents evaluation <a id="short"></a>
25 | 
26 | ## 3 Tools and Benchmarks <a id="datasets"></a>
27 | 
28 | | **Date** | **Title** | **Organization**  |  **Code**  |
29 | | :-----------: | :-------------: | :----------------------: |  :----------------------: |
30 | |2024/10/10|[HELMET: How to Evaluate Long-Context Language Models Effectively and Thoroughly](https://arxiv.org/pdf/2410.02694?)|Princeton|[Code](https://github.com/princeton-nlp/HELMET)<br>![](https://img.shields.io/github/stars/princeton-nlp/HELMET.svg?style=social)|
31 | |2024/08/16|[RAGTruth: A Hallucination Corpus for Developing Trustworthy Retrieval-Augmented Language Models](https://aclanthology.org/2024.acl-long.585)                               |                    NewsBreak                    |           [Code](https://github.com/ParticleMedia/RAGTruth) <br>![](https://img.shields.io/github/stars/ParticleMedia/RAGTruth.svg?style=social)           |
32 | |2024/08/16|[RAGChecker: A Fine-grained Framework for Diagnosing Retrieval-Augmented Generation](http://arxiv.org/abs/2408.08067)                               |                    Amazon                    |           [Code](https://github.com/amazon-science/RAGChecker) <br>![](https://img.shields.io/github/stars/amazon-science/RAGChecker.svg?style=social)           |
33 | |2024/04/21|[Evaluating Retrieval Quality in Retrieval-Augmented Generation](https://arxiv.org/pdf/2404.13781)|UMASS|No|
34 | |2024/04/08|[FaaF: Facts as a Function for the evaluation of generated text](https://arxiv.org/pdf/2403.03888)|IMMO Capital|[Code](https://github.com/vasiliskatr/faaf)<br>![](https://img.shields.io/github/stars/vasiliskatr/faaf.svg?style=social)|
35 | |2024/02/19|[CRUD-RAG: A Comprehensive Chinese Benchmark for Retrieval-Augmented Generation of Large Language Models](https://arxiv.org/abs/2401.17043)|University of Science and Technology of China|[Code](https://github.com/IAAR-Shanghai/CRUD_RAG)<br>![](https://img.shields.io/github/stars/IAAR-Shanghai/CRUD_RAG.svg?style=social)|
36 | |2024/1/11|[Seven Failure Points When Engineering a Retrieval Augmented Generation System](https://arxiv.org/abs/2401.05856)|Applied Artificial Intelligence Institute|No
37 | |2023/12/20|[Benchmarking Large Language Models in Retrieval-Augmented Generation](https://arxiv.org/abs/2309.01431)|Chinese Information Processing Laboratory|[Code](https://github.com/chen700564/RGB)<br>![](https://img.shields.io/github/stars/chen700564/RGB.svg?style=social)|
38 | |2023/11/16|[ARES: An Automated Evaluation Framework for Retrieval-Augmented Generation Systems](https://arxiv.org/abs/2311.09476)|Stanford|[Code](https://github.com/stanford-futuredata/ARES)<br>![](https://img.shields.io/github/stars/stanford-futuredata/ARES.svg?style=social)|
39 | |2023/11/14|[RECALL: A Benchmark for LLMs Robustness against External Counterfactual Knowledge](https://arxiv.org/abs/2311.08147)|Peking University|No|
40 | |2023/10/31|[Enabling Large Language Models to Generate Text with Citations](https://arxiv.org/abs/2305.14627)|Princeton University|[Code](https://github.com/princeton-nlp/ALCE)<br>![](https://img.shields.io/github/stars/princeton-nlp/ALCE.svg?style=social)|
41 | |2023/09/26|[RAGAS: Automated Evaluation of Retrieval Augmented Generation](https://arxiv.org/abs/2309.15217)|Exploding Gradients|[Code](https://github.com/explodinggradients/ragas)<br>![](https://img.shields.io/github/stars/explodinggradients/ragas.svg?style=social)|
42 | |2021/08/05|[TruLens:Evaluation and Tracking for LLM Experiments](https://www.trulens.org/)|TruEra|[Code](https://github.com/truera/trulens)<br>![](https://img.shields.io/github/stars/truera/trulens.svg?style=social)|
43 | 
44 | 
45 | 


--------------------------------------------------------------------------------
/validator.md:
--------------------------------------------------------------------------------
 1 | 
 2 | ## Introduction
 3 | 
 4 | The answer generated by LLMs are tend to be haullucinations. To overcome this, it would be better to introduce an additional component to increase the quality of the answer, namely **answer enhancement** component. The
 5 | 
 6 | 
 7 | 
 8 | ## 1. Answer Verifiaction <a id="verify"></a>
 9 | 
10 | ### 1.1 Attribution Detection <a id="attribution"></a>
11 | 
12 | | Date       | Title                                                                                                           | Authors                                  | Orgnization                                                                                                   | Abs                                                                                             |
13 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
14 | |2023/10/07| [Automatic Evaluation of Attribution by Large Language Models](https://arxiv.org/pdf/2305.06311.pdf) <br> [[code](https://github.com/OSU-NLP-Group/AttrScore): ![](https://img.shields.io/github/stars/OSU-NLP-Group/AttrScore.svg?style=social) |Xiang Yue, Boshi Wang, Ziru Chen, et. al.|The Ohio State University | <details><summary><small>This paper presents evaluation...</small></summary><small>This work tries to evaluate the attribution ability (3 types: attributable, extrapolatory, contradictory) of existing LLMs by introducing two benchmarks (i.e., AttrEval-Simulation and AttrEval-GenSearch). It also introduces two types of automatic evaluation methods: 1) Prompting LLMs, 2) Fine-tuning LMs on Repurposed Data. </small></details>|
15 | 
16 | ### 1.2 Claim Verification <a id="verification"></a>
17 | 
18 | | Date       | Title                                                                                                           | Authors                                  | Orgnization                                                                                                   | Abs                                                                                             |
19 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
20 | |2024/12/16| [Attention with Dependency Parsing Augmentation for Fine-Grained Attribution](https://arxiv.org/pdf/2412.11404) <br> |Qiang Ding, Lvzhou Luo, Yixuan Cao, Ping Luo |ICT| <details><summary><small>This paper presents fine-grained attribution ...</small></summary><small>This work proposes two techniques to **model-internals-based** methods for fine-grained attribution. First, it aggregates token-wise evidence (i.e., attention weights) through set union operations, preserving the granularity of representations. Second, it enhances the attributor by integrating dependency parsing to enrich the semantic completeness of target spans. </small></details>|
21 | |2024/07/02| [Pelican: Correcting Hallucination in Vision-LLMs via Claim Decomposition and Program of Thought Verification](https://arxiv.org/pdf/2407.02352.pdf) <br> |Pritish Sahu, Karan Sikka, Ajay Divakaran|SRI International, Princeton| <details><summary><small>This paper presents **Pelican** ...</small></summary><small>Pelican 1) decomposes the visual claim into a chain of sub-claims based on first-order predicates, 2) it then use Program-of-Thought prompting to generate Python code for answering these questions through flexible composition of external tools. </small></details>|
22 | |2024/02/23 |[Merging Facts, Crafting Fallacies: Evaluating the Contradictory Nature of Aggregated Factual Claims in Long-Form Generations](https://arxiv.org/abs/2402.05629.pdf)| heng-Han Chiang, Hung-yi Lee.|National Taiwan University|<details><summary><small>This paper presents **D-FActScore** ...</small></summary><small>This work finds that combining factual claims together can result in a non-factual paragraph due to entity ambiguity. Current metrics for fact verification fail to properly evaluate these non-factual passages. The authors proposed D-FActScore based on FActScore, and showed the methods and results of human and automatic evaluation.</small></details>|
23 | |2023/10/20| [Explainable Claim Verification via Knowledge-Grounded Reasoning with Large Language Models](https://arxiv.org/abs/2310.05253.pdf) <br> [[code](https://github.com/wang2226/FOLK): ![](https://img.shields.io/github/stars/wang2226/FOLK.svg?style=social) |Haoran Wang, Kai Shu|Illinois Institute of Technology, Chicago| <details><summary><small>This paper presents FOLK ...</small></summary><small>This work introduces First-Order-Logic-Guided Knowledge-Grounded (**FOLK**). 1）FOLK translates input claim into a FOL clause and uses it to guide LLMs to generate a set of question-answer pairs, 2) FOLK then retrieves knowledge-grounded answers from external knowledge-source; 3) FOLK performs FOL-guided reasoning over knowledge-grounded answers to make veracity prediction and generate explanations.</small></details>|
24 | 
25 | ## 2. Reasoning-based (CoT) Filtering <a id="cot"></a>
26 | 
27 | | Date       | Title                                                                                                           | Authors                                  | Orgnization                                                                                                   | Abs                                                                                             |
28 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
29 | |2023/12/31| [Rethinking with Retrieval: Faithful Large Language Model Inference](https://arxiv.org/abs/2301.00303.pdf) <br> [[code](https://github.com/HornHehhf/RR): ![](https://img.shields.io/github/stars/HornHehhf/RR.svg?style=social) |Hangfeng He, Hongming Zhang, Dan Roth|University of Rochester, Tencent AI Lab Seattle, University of Pennsylvania | <details><summary><small>This paper presents RR ...</small></summary><small>This work propose a novel post-processing approach, rethinking with retrieval (RR), which uses decomposed reasoning steps obtained from CoT prompting to retrieve relevant docs for LLMs. Four steps: 1)CoT prompting to generate explanation E and prediction P for query Q. 2)Sampling diverse reasoning path R (i.e., E + P), 3)knowledge K retrieval for each path, 4)faithful inference (NLI model) for each R+K.</small></details>|
30 | 
31 | ## 3. Datasets <a id="datasets"></a>
32 | 
33 | 
34 | | Date       | Title | Authors   | Orgnization | Abs    | Dataset                                                                                           |
35 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
36 | |2025/04/11 | [Attribution in Scientific Literature: New Benchmark and Methods](https://arxiv.org/pdf/2405.02228)<br> [[code](https://github.com/YashSaxena21/REASONS): ![](https://img.shields.io/github/stars/YashSaxena21/REASONS.svg?style=social)| Yash Saxena, Deepa Tilwani, Seyedali Mohammadi, et. al.|UMBC|<details><summary><small>This paper presents **REASONS** ...</small></summary><small> REASONS is a novel dataset for source attribution, featuring sentence-level annotations across 12 scientific domains from arXiv.</small></details>| <sub> Factcheck-bench </sub>|
37 | |2024/11/21 | [OPENSCHOLAR: Synthesizing Scientific Literature with Retrieval-augmented LMs](https://arxiv.org/pdf/2411.14199?)<br> [[code](https://github.com/AkariAsai/ScholarQABench): ![](https://img.shields.io/github/stars/AkariAsai/ScholarQABench.svg?style=social)| Akari Asai, Jacqueline He, Rulin Shao, et al.| University of Washington, ...|<details><summary><small>This paper presents **OpenScholar** ...</small></summary><small> The **SCHOLARQABENCH** is the first large-scale multi-domain benchmark for literature search, comprising 2,967 expert-written queries and 208 long-form answers across computer science, physics, neuroscience, and biomedicine. </small></details>| <sub> ScholarQABench </sub>|
38 | |2024/07/11 | [CiteME: Can Language Models Accurately Cite Scientific Claims?](https://proceedings.neurips.cc/paper_files/paper/2024/file/0ef47f7b768e1a012e3d995ac8d8fac7-Paper-Datasets_and_Benchmarks_Track.pdf)<br> [[code](https://github.com/bethgelab/CiteME): ![](https://img.shields.io/github/stars/bethgelab/CiteME.svg?style=social)| Ori Press, Andreas Hochlehnert, Ameya Prabhu, et al.| University of Tübingen|<details><summary><small>This paper presents **CiteME** ...</small></summary><small> CiteME, a challenging and human-curated benchmark of recent machine learning publications that evaluates the abilities of LMs to correctly attribute scientific claims. CiteME is both natural and challenging, even for SoTA LMs.</small></details>| <sub> CiteME </sub>|
39 | |2024/04/16 | [Factcheck-Bench: Fine-Grained Evaluation Benchmark for Automatic Fact-checkers](https://arxiv.org/pdf/2311.09000)| Yuxia Wang, Revanth G. Reddy, Zain M. Mujahid, et. al.|MBZUAI, Abu Dhabi, UAE|<details><summary><small>This paper presents Factcheck-Bench ...</small></summary><small>Factcheck-bench is a open-domain document-level factuality benchmark in three-level granularity: claim, sentence and document. They frame the automated detection and correction of factual errors for outputs of LLMs into eight subtasks: 1)decomposition; 2) decontextualisation; 3) checkworthiness  identification; 4) evidence retrieval and collection; 5) stance detection; 6) correction determination; 7) claim correction; 8) final response revision.</small></details>| <sub> Factcheck-bench </sub>|
40 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | A curated list of resources dedicated to retrieval-augmented generation (RAG).
 2 | 
 3 | The retrieval-augmented generation (RAG) is to combine the merits of retrieval system and llm to generation high-quality answers for users.
 4 | 
 5 | 
 6 | <div align="center">
 7 |     <img width="800" src="/assets/rag1.png" alt="RAG Framework">
 8 |     <br><br>
 9 |     <p><b>The Framework for RAG System</b></p>
10 | </div>
11 | 
12 | 
13 | 
14 | Typically, the rag system consists of a set of modules, where each task are described as follows:
15 | 1. [Interpreter](/papers/interpreter): This component focuses on refining and enriching the user's initial query or question to improve the subsequent retrieval process. By generating more detailed or expanded search queries, it helps the retrieval component to more effectively recall relevant documents.
16 | 2. [Retriever](/papers/retriever): This component is responsible for finding and fetching relevant documents or passages from a large corpus based on the refined user query. It acts as the primary information access layer, providing the foundational knowledge for the generation phase. 
17 | 3. [Compressor](/papers/compressor): This component processes the retrieved documents and user questions to create an optimized context for LLM. It aims to refine, condense, and organize the retrieved information, ensuring that the most pertinent and concise context is passed on for accurate generation. 
18 | 4. [Generator](/papers/generator): This component leverages a LLM to synthesize a coherent, informative, and contextually relevant answer based on the user's question and the provided refined contexts. It transforms raw information into a human-readable response. 
19 | 5. [Validator](/papers/validator): This component aims to improve the trustworthiness and quality of the generated answer by validating its accuracy and adherence to factual information within the provided contexts. It ensures the output is reliable and grounded.
20 | 6. [Evaluator](/papers/evaluator): This component measures the overall performance and quality of the RAG system, assessing various aspects such as answer accuracy, retrieval effectiveness, and generation faithfulness. It provides metrics to understand and improve the system's capabilities.
21 | 
22 | ## Surveys
23 | 
24 | * The *Organization* column only record the organization of the first author.
25 | 
26 | | **Date** |                                                                        **Title**                                                                        |          **Organization**          |                                                                   **Code**                                                                   |
27 | | :------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------: | :---------------------------------------: | :------------------------------------------------------------------------------------------------------------------------------------------------: |
28 | |   2024/09/16  |                              [Trustworthiness in Retrieval-Augmented Generation Systems: A Survey](https://arxiv.org/abs/2409.10102)                              |             Tsinghua University             |           [Code](https://github.com/smallporridge/TrustworthyRAG)<br>![](https://img.shields.io/github/stars/smallporridge/TrustworthyRAG.svg?style=social)           |
29 | |   2024/09/10   |                              [Graph Retrieval-Augmented Generation: A Survey](https://arxiv.org/abs/2408.08921)                              |             Peking University             |           [Code](https://github.com/pengboci/GraphRAG-Survey)<br>![](https://img.shields.io/github/stars/pengboci/GraphRAG-Survey.svg?style=social)           |
30 | |   2024/07/13   |                              [Large Language Models and Future of Information Retrieval: Opportunities and Challenges](https://dl.acm.org/doi/pdf/10.1145/3626772.3657848)                              |      ChengXiang Zhai, UIUC             |  -        |
31 | |   2024/02/29   |                              [Retrieval-Augmented Generation for AI-Generated Content: A Survey](https://arxiv.org/abs/2402.19473)                              |             Peking University             |           [Code](https://github.com/hymie122/RAG-Survey)<br>![](https://img.shields.io/github/stars/hymie122/RAG-Survey.svg?style=social)           |
32 | |   2024/01/03   |                              [Retrieval-Augmented Generation for Large Language Models: A Survey](https://arxiv.org/abs/2312.10997)                              |             Tongji University             |       [Code](https://github.com/Tongji-KGLLM/RAG-Survey)<br>![](https://img.shields.io/github/stars/Tongji-KGLLM/RAG-Survey.svg?style=social)       |
33 | |   2024/01/03   |                    [A Comprehensive Survey of Hallucination Mitigation Techniques in Large Language Models](https://arxiv.org/abs/2401.01313)                    |     Islamic University of Technology     |                                                                         No                                                                         |
34 | |   2023/12/07   | [Trends in Integration of Knowledge and Large Language Models: A Survey and Taxonomy of Methods, Benchmarks, and Applications](https://arxiv.org/abs/2311.05876) |      Harbin Institute of Technology      |                                                                         No                                                                         |
35 | |   2023/09/19   |                            [The Rise and Potential of Large Language Model Based Agents: A Survey](https://arxiv.org/abs/2309.07864)                            |              Fudan NLP Group              | [Code](https://github.com/WooooDyy/LLM-Agent-Paper-List)<br>![](https://img.shields.io/github/stars/WooooDyy/LLM-Agent-Paper-List.svg?style=social) |
36 | |   2023/08/14   |                                  [Large Language Models for Information Retrieval: A Survey](https://arxiv.org/abs/2308.07107)                                  |             Renmin University             |       [Code](https://github.com/RUC-NLPIR/LLM4IR-Survey)<br>![](https://img.shields.io/github/stars/RUC-NLPIR/LLM4IR-Survey.svg?style=social)       |
37 | |   2022/02/02   |                                       [A Survey on Retrieval-Augmented Text Generation](https://arxiv.org/abs/2202.01110)                                       | Nara Institute of Science and Techonology |                                                                         No                                                                         |
38 | 
39 | ## Systems
40 | 
41 | * The *Organization* column only record the organization of the first author.
42 | 
43 | | **Date** | **Title** | **Organization**  |  **Code**  |
44 | | :-----------: | :-------------: | :----------------------: |  :----------------------: 
45 | |2024/11/07|[LightRAG: Simple and Fast Retrieval-Augmented Generation](https://arxiv.org/abs/2410.05779)|BUPT|[Code](https://github.com/HKUDS/LightRAG)<br>![](https://img.shields.io/github/stars/HKUDS/LightRAG.svg?style=social)|
46 | |2024/10/25|[StructRAG: Boosting Knowledge Intensive Reasoning of LLMs via Inference-time Hybrid Information Structurization](https://arxiv.org/abs/2410.08815)|ISCAS|[Code](https://github.com/Li-Z-Q/StructRAG)<br>![](https://img.shields.io/github/stars/Li-Z-Q/StructRAG.svg?style=social)|
47 | |2024/08/21|[RAGLAB: A Modular and Research-Oriented Unified Framework for Retrieval-Augmented Generation](https://arxiv.org/abs/2408.11381)|Nanjing University|[Code](https://github.com/fate-ubw/RAGLab)<br>![](https://img.shields.io/github/stars/fate-ubw/RAGLab.svg?style=social)|
48 | |2024/07/11|[Speculative RAG: Enhancing Retrieval Augmented Generation through Drafting](https://arxiv.org/abs/2407.08223)          |     University of California, San Diego     |        No        |
49 | |2024/06/19|[InstructRAG: Instructing Retrieval-Augmented Generation via Self-Synthesized Rationales](https://arxiv.org/abs/2406.13629)|University of Virginia|[Code](https://github.com/weizhepei/InstructRAG)<br>![](https://img.shields.io/github/stars/weizhepei/InstructRAG.svg?style=social)|
50 | |2024/05/22|[FlashRAG: A Modular Toolkit for Efficient Retrieval-Augmented Generation Research](https://arxiv.org/pdf/2405.13576)|Renmin University of China|[Code](https://github.com/RUC-NLPIR/FlashRAG)<br>![](https://img.shields.io/github/stars/RUC-NLPIR/FlashRAG.svg?style=social)|
51 | |2024/04/24|[From Local to Global: A Graph RAG Approach to Query-Focused Summarization](https://arxiv.org/abs/2404.16130)               | Microsoft |     [Code](https://www.microsoft.com/en-us/research/project/graphrag/)     |
52 | |2023/11/22|[FreshLLMs: Refreshing Large Language Models with Search Engine Augmentation](https://arxiv.org/abs/2310.03214)|Google|[Code](https://github.com/freshllms/freshqa)<br>![](https://img.shields.io/github/stars/freshllms/freshqa.svg?style=social)|
53 | |2023/11/08|[PDFTriage: Question Answering over Long, Structured Documents](https://arxiv.org/abs/2309.08872)|Stanford|[Code](https://github.com/HAMNET-AI/PDFTriage)<br>![](https://img.shields.io/github/stars/HAMNET-AI/PDFTriage.svg?style=social)|
54 | |2023/10/27|[WikiChat: Stopping the Hallucination of Large Language Model Chatbots by Few-Shot Grounding on Wikipedia](https://arxiv.org/pdf/2305.14292v2.pdf)|Stanford|[Code](https://github.com/stanford-oval/WikiChat)<br>![](https://img.shields.io/github/stars/stanford-oval/WikiChat.svg?style=social)|
55 | |2023/10/27|[LeanDojo: Theorem Proving with Retrieval-Augmented Language Models](https://arxiv.org/abs/2306.15626)|Caltech|[Code](https://github.com/lean-dojo/LeanDojo)<br>![](https://img.shields.io/github/stars/lean-dojo/LeanDojo.svg?style=social)|
56 | |2023/06/13|[WebGLM: Towards An Efficient Web-Enhanced Question Answering System with Human Preferences](https://arxiv.org/abs/2306.07906)|Tsinghua University|[Code](https://github.com/THUDM/WebGLM)<br>![](https://img.shields.io/github/stars/THUDM/WebGLM.svg?style=social)|
57 | |2023/05/23|[WebCPM: Interactive Web Search for Chinese Long-form Question Answering](https://arxiv.org/abs/2305.06849)|Tsinghua University|[Code](https://github.com/thunlp/WebCPM)<br>![](https://img.shields.io/github/stars/thunlp/WebCPM.svg?style=social)|
58 | |2022/06/01|[WebGPT: Browser-assisted question-answering with human feedback](https://arxiv.org/abs/2112.09332)|Open AI|No|
59 | 
60 | 
61 | ## Deep Research
62 | 
63 | * The *Organization* column only record the organization of the first author.
64 | 
65 | | **Date** | **Title** | **Organization**  |  **Core Idea**  |
66 | | :-----------: | :-------------: | :----------------------: |  :----------------------: 
67 | |2025/10/28|[Tongyi DeepResearch Technical Report](https://papers-pdfs.assets.alphaxiv.org/2510.24701v1.pdf)|Tongyi Lab @ Alibaba|Tongyi Deep Research, [Code](https://github.com/Alibaba-NLP/DeepResearch)<br>![](https://img.shields.io/github/stars/Alibaba-NLP/DeepResearch.svg?style=social)|
68 | |2025/10/20|[Enterprise Deep Research: Steerable MultiAgent Deep Research for Enterprise Analytics](https://arxiv.org/pdf/2510.17797)|Google|Enterprise Deep Research, [Code](https://github.com/lSalesforceAIResearch/enterprise-deep-research)<br>![](https://img.shields.io/github/stars/SalesforceAIResearch/enterprise-deep-research.svg?style=social)|
69 | |2025/09/03|[DEEP RESEARCH AGENTS: A SYSTEMATIC EXAMINATION AND ROADMAP](https://arxiv.org/pdf/2506.18096)|University of Liverpool|survey [Code](https://github.com/ai-agents-2030/awesome-deep-research-agent)<br>![](https://img.shields.io/github/stars/ai-agents-2030/awesome-deep-research-agent.svg?style=social)|
70 | |2025/08/14|[ReportBench: Evaluating Deep Research Agents via Academic Survey Tasks](https://arxiv.org/abs/2508.15804)|ByteDance|Evaluation [Code](https://github.com/ByteDance-BandAI/ReportBench?tab=readme-ov-file)<br>![](https://img.shields.io/github/stars/ByteDance-BandAI/ReportBench?tab=readme-ov-file.svg?style=social)|
71 | |2025/07/21|[Deep Researcher with Test-Time Diffusion](https://arxiv.org/pdf/2507.16075)|Google|research report generation as a diffusion process|
72 | |2025/06/27|[SCIENCEBOARD: Evaluating Multimodal Autonomous Agents in Realistic Scientific Workflows](https://arxiv.org/pdf/2505.19897)|University of Hong Kong| Evaluation [Code](https://github.com/OS-Copilot/ScienceBoard)<br>![](https://img.shields.io/github/stars/OS-Copilot/ScienceBoard.svg?style=social)|
73 | |2025/06/18|[Agent Laboratory: Using LLM Agents as Research Assistants](https://arxiv.org/pdf/2501.04227)|AMD|Agent Laboratory [Code](https://github.com/SamuelSchmidgall/AgentLaboratory)<br>![](https://img.shields.io/github/stars/SamuelSchmidgall/AgentLaboratory.svg?style=social)|
74 | |2025/06/14|[A Comprehensive Survey of Deep Research: Systems, Methodologies, and Applications](https://arxiv.org/pdf/2506.12594)|Zhejiang University|survey|
75 | |2025/05/19|[From Automation to Autonomy: A Survey on Large Language Models in Scientific Discovery](https://arxiv.org/pdf/2505.13259v1)|HKUST|survey [Code](https://github.com/HKUST-KnowComp/Awesome-LLM-Scientific-Discovery)<br>![](https://img.shields.io/github/stars/HKUST-KnowComp/Awesome-LLM-Scientific-Discovery.svg?style=social)|
76 | |2025/05/03|[ResearchCodeAgent: An LLM Multi-Agent System for Automated Codification of Research Methodologies](https://arxiv.org/pdf/2504.20117)|TCS Research|multi-agent system|
77 | |2025/04/17|[DeepResearcher: Scaling Deep Research via Reinforcement Learning in Real-world Environments](https://arxiv.org/pdf/2504.03160)|SJTU|multi-agent system [Code](https://github.com/GAIR-NLP/DeepResearcher)<br>![](https://img.shields.io/github/stars/GAIR-NLP/DeepResearcher.svg?style=social)|
78 | |2025/04/17|[Towards Scientific Intelligence: A Survey of LLM-based Scientific Agents](https://arxiv.org/pdf/2503.24047)|Institute of Automation, CAS|survey|
79 | |2025/03/25|[AgentRxiv: Towards Collaborative Autonomous Research](https://arxiv.org/pdf/2503.18102)|Johns Hopkins University|LLM Agent|
80 | |2025/02/18|[Towards an AI co-scientist](https://storage.googleapis.com/coscientist_paper/ai_coscientist.pdf)|Google|multi-agent system|
81 | |2024/10/28|[OpenResearcher: Unleashing AI for Accelerated Scientific Research](https://arxiv.org/pdf/2408.06941)|SJTU|multi-agent system [Code](https://github.com/GAIR-NLP/OpenResearcher)<br>![](https://img.shields.io/github/stars/GAIR-NLP/OpenResearcher.svg?style=social)|
82 | |2024/09/04|[The AI Scientist: Towards Fully Automated Open-Ended Scientific Discovery](https://arxiv.org/pdf/2408.06292)|Sakana AI|multi-agent system [Code](https://github.com/SakanaAI/AI-Scientist)<br>![](https://img.shields.io/github/stars/SakanaAI/AI-Scientist.svg?style=social)|


--------------------------------------------------------------------------------
/rewriter.md:
--------------------------------------------------------------------------------
 1 | 
 2 | ## Introduction
 3 | 
 4 | The intent clarify component is to understand the question, and guide the retrieval to obtain better documents. Usually, there could be different types of understanding, e.g., whether to retrieve and query formulations.
 5 | 
 6 | 
 7 | 
 8 | ## 1. Retrieval Detection <a id="retrieval_detect"></a>
 9 | 
10 | These kind of methods try to determine whether to retrieve content for response generation or directly generate the response.
11 | 
12 | | Date       | Title | Authors   | Orgnization | Abs    | Dataset  | 
13 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
14 | |2024/05/04| [When to Retrieve: Teaching LLMs to Utilize Information Retrieval Effectively](https://arxiv.org/pdf/2404.19705) <br>[[code](https://github.com/mwozgpt/Adapt-LLM-anonymous-old): ![](https://img.shields.io/github/stars/mwozgpt/Adapt-LLM-anonymous-old.svg?style=social)] | Tiziano Labruna, et al. | University of Bozen-Bolzano | <details><summary><small>This paper presents ADAPT-LLM ...</small></summary><small>This paper presents ADAPT-LLM by fine-tuning a base LLM on an open-domain QA dataset. It first take base LLM to zero-shot evaluation to determin its accuracy in QA. For questions with incorrect answers, it train the LLM to generate a spectial token <RET>, indicating the need for additional context.</small></details>|<sub>NQ, SQuAD, PopQA</sub>|
15 | |2023/10/08| [Self-Knowledge Guided Retrieval Augmentation for Large Language Models](https://arxiv.org/pdf/2310.05002.pdf) |Yile Wang, Peng Li, Maosong Sun, Yang Liu|Tsinghua University| <details><summary><small>This paper presents SKR ...</small></summary><small>This work introduces Self-Knowledge guided Retrieval augmentation*SKR* to flexibly call the retriever. Three steps: 1) collection self-knowledge of LLM by asking a number of questions, and divide the question into two categories D+ and D- according to the answer correctness, 2) eliciting self-knowledge of LLMs by either direct prompt or training a classifier based on D+ and D-, 3) using self-knowledge for adaptive retrieval augmentation based on prediction of 2).</small></details>| <sub>TemporalQA, CommonsenseQA, TabularQA, StrategyQA, TruthfulQA</sub> |
16 | 
17 | ## 2. Query Reformulation <a id="query_reformulate"></a>
18 | 
19 | | Date       | Title | Authors   | Orgnization | Abs    | Dataset|
20 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
21 | | 2024/03/31 | [RQ-RAG: Learning to Refine Queries for Retrieval Augmented Generation](https://arxiv.org/pdf/2404.00610.pdf)| Chi-Min Chan, Chunpu Xu, Ruibin Yuan, et. al. |Hong Kong University of Science and Technology, Hong Kong Polytechnic University, MIT|<details><summary><small>This paper presents RQ-RAG ...</small></summary><small>This work proposes RQ-RAG (Refine Query RAG) to enhance the generator (LLaMA2) to explicitly rewrite, decompose, and disambiguate, before final answer generation. In this way, the RAG process interleaves between retrieval (guided by refined query) and generation.  </small></details> | <sub>Arc-Challenge, PopQA, OpenbookQA, HotpotQA, 2WikiMHQA, Musique</sub> |
22 | 
23 | ## 3. Query Expansion <a id="retrieval_expansion"></a>
24 | 
25 | ### 3.1 Generative-Relevance Feedback <a id="GRF"></a>
26 | 
27 | | Date       | Title | Authors   | Orgnization | Abs    | Dataset |
28 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
29 | | 2025/09/09 | [Query Expansion in the Age of Pre-trained and Large Language Models: A Comprehensive Survey](https://arxiv.org/pdf/2509.07794) | MingHan Li, XinXuan Lv, JunJie Zou, et al. |Soochow University, China|<details><summary><small>This paper presents survey of QE ...</small></summary><small>This paper introduce a four-dimensional framework for QE: 1) the point of injection (explicit vs. implicit QE), 2) grounding and interaction (knowledge bases, model-internal capabilities, multi-turn retrieval), 3) learning alignment, 4)knowledge graph-based argumentation.</small></details> | <sub>...</sub> |
30 | | 2025/06/16 | [TongSearch-QR: Reinforced Query Reasoning for Retrieval](https://arxiv.org/pdf/2506.11603)<br>[[code](https://github.com/bigai-nlco/TongSearch-QR): ![](https://img.shields.io/github/stars/bigai-nlco/TongSearch-QR.svg?style=social)] | Xubo Qin, Jun Bai, Jiaqi Li, Zixia Jia, Zilong Zheng |BIGAI|<details><summary><small>This paper presents **TongSearch-QE** ...</small></summary><small>The **TongSearch-QE** developed a rule-based reward function for GRPO, enabling RL on the query reasoning of smaller language models.</small></details> | <sub>BRIGHT</sub> |
31 | | 2025/06/10 | [ThinkQE: Query Expansion via an Evolving Thinking Process](https://arxiv.org/pdf/2506.09260)<br>[[code](https://github.com/Yibin-Lei/Think_QE): ![](https://img.shields.io/github/stars/Yibin-Lei/Think_QE.svg?style=social)] | Yibin Lei, Tao Shen, Andrew Yates|University of Amsterdam|<details><summary><small>This paper presents **ThinkQE** ...</small></summary><small>The **ThinkQE** consists of two key components: a thinking-based expansion process that encourages deeper and comprehensive semantic exploration, and a corpus-interaction strategy that iteratively refines expansions using retrieval feedback from the corpus.</small></details> | <sub>TREC DL19 and DL20, BRIGHT</sub> |
32 | | 2024/11/12 | [Exploring the Best Practices of Query Expansion with Large Language Models](https://aclanthology.org/2024.findings-emnlp.103.pdf)<br>[[code](https://github.com/lezhang7/Retrieval_MuGI): ![](https://img.shields.io/github/stars/lezhang7/Retrieval_MuGI.svg?style=social)] | Le Zhang, Yihong Wu, Qian Yang, et. al. |University of Montreal|<details><summary><small>This paper presents **MUGI** ...</small></summary><small>This work proposes MUGI (Multi-Text Generation Integration), which leverages LLM to generate multiple pseudo-references. Findings: 1) increasing the number of references from LLM benefits IR systems; 2) A balance between the query and pseudo-documents, and an effective integration strategy is important; 3) contextual information from LLM is essential.  </small></details> | <sub>TREC DL19 and DL20, BEIR</sub> |
33 | | 2024/10/21 | [GaQR: An Efficient Generation-augmented Question Rewriter](https://dl.acm.org/doi/10.1145/3627673.3679930) | Oliver Young, Yixing Fan, Ruqing Zhang, et al.|  ICT, CAS | <details><summary><small>This paper presents GaQR ...</small></summary><small>The work proposes introduce an efficient GaQR to reformulate a question into several queries using Chain of Thought (CoT) and make it more efficient through knowledge distillation.</small></details> | <sub> MS MARCO, Miracl, BEIR</sub> |
34 | | 2024/09/17 | [MGenCRF: Generative Clustering and Reformulation Framework for Enhanced Intent-Driven Information Retrieval](https://arxiv.org/pdf/2409.10909) | Wonduk Seo, Haojie Zhang, Yueyang Zhang, et al.|  Baidu.inc and Peking University | <details><summary><small>This paper presents **GenCRF** ...</small></summary><small>The work proposes **GenCRF**, a Generative Clustering and Reformulation Framework to capture diverse intentions adaptively based on multiple differentiated, well-generated queries in the retrieval phase. 1) It leverages LLMs to generate multiple differentiated queries by utilizing various types of custoomized prompts. 2) It clusters these queries and introduces similarity-based and score-based dynamic weighting, to adjust the relative weights of reformulated queries.</small></details> | <sub>BEIR</sub> |
35 | | 2024/08/24 | [Meta Knowledge for Retrieval Augmented Large Language Models](https://arxiv.org/abs/2408.09017) | Laurent Mombaerts, Terry Ding, Florian Felice, Jonathan Taws, Adi Banerjee, Tarik Borogovac |  Amazon Web Services | <details><summary><small>This paper presents MK Summary ...</small></summary><small>The work proposes a novel data-centric RAG workflow for LLMs, relying on generating metadata and synthetic Questions and Answers (QA) for each document, as well as introducing the new concept of Meta Knowledge Summary (MK Summary) for metadata-based clusters of documents. It transforms the traditional retrieve-then-read system into a more advanced prepare-then-rewrite-then-retrieve-then-read framework, to achieve higher domain expert-level understanding of the knowledge base.</small></details> | <sub>arXiv</sub> |
36 | | 2023/10/19 | [Large Language Models Know Your Contextual Search Intent: A Prompting Framework for Conversational Search](https://arxiv.org/abs/2303.06573) <br>[[code](https://github.com/kyriemao/LLM4CS): ![](https://img.shields.io/github/stars/kyriemao/LLM4CS.svg?style=social)] | Kelong Mao, Zhicheng Dou, Fengran Mo, Jiewen Hou, Haonan Chen, Hongjin Qian    | Renmin University of China | <details><summary><small>This paper presents LLM4CS ...</small></summary><small>The work proposes a simple yet effective prompting framework, called **LLM4CS**, to leverage LLMs as a text-based search intent interpreter to help conversational search.It explores three prompting methods to generate multiple query rewrites and hypothetical responses, and then proposes to aggregate them into an integrated representation.</small></details> | <sub>CAsT-19&20&21</sub> |
37 | | 2023/10/11 | [Query2doc: Query Expansion with Large Language Models](https://arxiv.org/abs/2303.07678), <br>[[code](https://github.com/PKUnlp-icler/PCA-EVAL): ![](https://img.shields.io/github/stars/PKUnlp-icler/PCA-EVAL.svg?style=social)]  | Liang Wang, Nan Yang, Furu Wei                                                 | Microsoft Research | <details><summary><small>This paper presents Query2doc ...</small></summary><small>This work proposes a simple yet effective query expansion approach, denoted as **Query2doc**, to improve both sparse and dense retrieval systems. The proposed method first generates pseudo-documents by few-shot prompting large language models (LLMs), and then expands the query with generated pseudo-documents.</small></details>  | <sub>MS-MARCO passage, TREC-DL 19&20</sub> |
38 | | 2023/6/16  | [GRM: Generative Relevance Modeling Using Relevance-Aware Sample Estimation for Document Retrieval](https://arxiv.org/abs/2306.09938)                                                                                                                                      | Iain Mackie, Ivan Sekulic, Shubham Chatterjee, Jeffrey Dalton, Fabio Crestani. | University of Glasgow,Università della Svizzera italiana  | <details><summary><small>This paper presents GRM ...</small></summary><small>This work proposes Generative Relevance Modeling **(GRM)** that uses Relevance-Aware Sample Estimation (RASE) for more accurate weighting of expansion terms. Specifically, it identifies similar real documents for each generated document and uses a neural re-ranker to estimate their relevance.</small></details>    | <sub>CODEC, Robust04</sub> |
39 | |2022/12/20| [Precise Zero-Shot Dense Retrieval without Relevance Labels](https://arxiv.org/abs/2212.10496),<br> [[code](https://github.com/texttron/hyde): ![](https://img.shields.io/github/stars/texttron/hyde.svg?style=social)|Luyu Gao, Xueguang Ma, Jimmy Lin, Jamie Callan.|Language Technologies Institute, Carnegie Mellon University,David R. Cheriton School of Computer Science, University of Waterloo|<details><summary><small>This paper presents HyDE ...</small></summary><small>This work proposes to pivot through Hypothetical Document Embeddings(HyDE), which first zero-shot instructs an instruction-following language model to generate a hypothetical document and then grounds the generated document to the actual corpus with an unsupervised contrastively learned encoder.</small></details>| <sub>TREC-DL 19&20, BEIR</sub> |
40 | 
41 | ### 3.2 Pseudo-Relevant Feedback <a id="PRF"></a>
42 | 
43 | | Date       | Title | Authors   | Orgnization | Abs    | Dataset   |
44 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
45 | |2023/8/2| [Large Language Models are Strong Zero-Shot Retriever](https://arxiv.org/abs/2304.14233)|Tao Shen, Guodong Long, Xiubo Geng, Chongyang Tao, Tianyi Zhou, Daxin Jiang|AAII, Microsoft,University of Maryland| <details><summary><small>This paper presents LameR ...</small></summary><small>This work proposes the Language language model as Retriever **(LameR)** to augment a query with its potential answers by prompting LLMs with a composition of the query and the query’s in-domain candidates and proposes to leverage a non-parametric lexicon-based method (e.g., BM25) as the retrieval module to capture query-document overlap in a literal fashion.</small></details>| <sub>MS-MARCO passage, TREC-DL 19&20, BEIR</sub> |
46 | 
47 | ### 3.3 Methods Combination <a id="methods_combination"></a>
48 | 
49 | | Date       | Title | Authors   | Orgnization | Abs    | Dataset   | 
50 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
51 | |2023/12/12| [Synergistic Interplay between Search and Large Language Models for Information Retrieval](https://arxiv.org/abs/2305.07402)<br>[[code](https://github.com/Cyril-JZ/InteR): ![](https://img.shields.io/github/stars/Cyril-JZ/InteR.svg?style=social)]|Tao Shen, Guodong Long, Xiubo Geng, Chongyang Tao, Tianyi Zhou, Daxin Jiang|Peking University, Microsoft,AAII| <details><summary><small>This paper presents InteR ...</small></summary><small>This work proposes **InteR**, a novel framework that facilitates information refinement through synergy between RMs and LLMs,which allows RMs to expand knowledge in queries using LLM-generated knowledge collections and enables LLMs to enhance prompt formulation using retrieved documents.</small></details>| <sub>TREC-DL 19&20, BEIR</sub> |
52 | |2023/5/12| [Generative and Pseudo-Relevant Feedback for Sparse, Dense and Learned Sparse Retrieval](https://arxiv.org/abs/2305.07477)|Iain Mackie, Shubham Chatterjee, Jeffrey Dalton.|University of Glasgow| <details><summary><small>This paper presents ...</small></summary><small>This work proposes combining generative and pseudo-relevance feedback ranking signals to achieve the benefits of both feedback classes.</small></details>| <sub>Robust04, TREC-DL 19&20, CODEC</sub> |
53 | |2023/5/5| [Query Expansion by Prompting Large Language Models](https://arxiv.org/abs/2305.03653)|LRolf Jagerman, Honglei Zhuang, Zhen Qin, Xuanhui Wang, Michael Bendersky.|Google Research| <details><summary><small>This paper presents ...</small></summary><small>This work proposes an approach to query expansion that leverages the generative abilities of Large Language Models (LLMs) and studies a variety of different prompts, including zero-shot, few-shot and Chain-of-Thought (CoT) finding that CoT prompts are especially useful for query expansion.</small></details>| <sub>MS-MARCO passage, BEIR</sub> |
54 | 


--------------------------------------------------------------------------------
/generator.md:
--------------------------------------------------------------------------------
 1 | 
 2 | ## Introduction
 3 | 
 4 | Coming soon ...
 5 | 
 6 | ## 1. Instruction Fine-tuning <a id="ift"></a>
 7 | 
 8 | ### 1.1 Knowledge Enhance <a id="otm"></a>
 9 | 
10 | | Date       | Title | Authors   | Orgnization | Abs    | Dataset                                                                                           |
11 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
12 | |2025/04/29 <br> (🌟🌟🌟)| [Systematic Knowledge Injection into Large Language Models via Diverse Augmentation for Domain-Specific RAG](https://aclanthology.org/2025.findings-naacl.329.pdf) [[code](https://github.com/kushagrabhushan/Systematic-Knowledge-Injection): ![](https://img.shields.io/github/stars/kushagrabhushan/Systematic-Knowledge-Injection.svg?style=social)]| Kushagra Bhushan, Yatin Nandwani, Dinesh Khandelwal, et al.|IIT(ISM) Dhanbad, IBM|<details><summary><small>This paper presents *PA-RAG* ...</small></summary><small> PARAG: Paraphrase Augmentation for RetrievalAugmented Generation, a novel fine-tuning framework that improves knowledge injection into LLMs for domain-specific RAG tasks. PA-RAG introduces two different ways of training data augmentation: 1) it uses context augmentation to simulate both retrieval success and retrieval failure scenarios for all the training questions; 2) it synthetically generates multiple answers for each training question to mitigate canonical answer overfitting.</small></details>| <sub>MMLU, GSM8k, Hellaswag, TruthfulQA</sub> |
13 | |2024/03/15 <br> (🌟🌟🌟🌟)| [RAFT: Adapting Language Model to Domain Specific RAG](https://arxiv.org/abs/2403.10131) [[code](https://github.com/ShishirPatil/gorilla): ![](https://img.shields.io/github/stars/ShishirPatil/gorilla.svg?style=social)]| Tianjun Zhang, Shishir G. Patil, Naman Jain, Sheng Shen, et al.|UC Berkeley|<details><summary><small>This paper presents *RAFT* ...</small></summary><small> RAFT leverages fine-tuning with question-answer pairs while referencing the documents in a simulated imperfect retrieval setting — thereby effectively preparing for the open-book exam setting. The RAFT is trained to answer the question (Q) from Document(s) (D) to generate answer (A), where A includes chain-of-thought reasoning.</small></details>| <sub>PubMed, HotpotQA, Gorilla</sub> |
14 | |2023/5/18 <br> (🌟🌟)| [Augmented Large Language Models with Parametric Knowledge Guiding](https://arxiv.org/abs/2305.04757) | Ziyang Luo, Can Xu, Pu Zhao, et. al.,| Hong Kong Baptist University, Microsoft| <details><summary><small>This paper presents PKG ...</small></summary><small>This work propose Parametric Knowledge Guiding (**PKG**), which injects domain knowledge for LLaMa-7B via instruction fine-tuning to capture the necessary expertise. Then, the PKG is used to generage context for a given question as the background-augmented prompting for LLMs.</small></details>| <sub>FM2, NQ-Table, MedMC-QA, ScienceQA</sub>|
15 | 
16 | 
17 | ### 1.2 Attribution Enhance <a id="attribution"></a>
18 | 
19 | | Date       | Title | Authors   | Orgnization | Abs    | Dataset                                                                                           |
20 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
21 | |2025/04/03 <br> (🌟🌟🌟🌟) | [ScholarCopilot: Training Large Language Models for Academic Writing with Accurate Citations](https://arxiv.org/pdf/2504.00824) [[code](https://github.com/TIGER-AI-Lab/ScholarCopilot): ![](https://img.shields.io/github/stars/TIGER-AI-Lab/ScholarCopilot.svg?style=social)]| Yubo Wang, Xueguang Ma, Ping Nie, et al.| University of Waterloo, CMU | <details><summary><small>This paper presents ** ScholarCopilot** ...</small></summary><small> ScholarCopilot is a unified framework designed to enhance existing large language models for generating professional academic articles with accurate and contextually relevant citations. ScholarCopilot dynamically determines when to retrieve scholarly references by generating a retrieval token [RET]. </small></details>|<sub>500k arXiv, 10M citations</sub>|
22 | |2025/02/13 <br> (🌟🌟🌟🌟) | [SelfCite: Self-Supervised Alignment for Context Attribution in Large Language Models](https://arxiv.org/pdf/2502.09604) | Yung-Sung Chuang, Benjamin Cohen-Wang, Shannon Zejiang Shen, et al. | Meta FAIR, MIT | <details><summary><small>This paper presents **SelfCite** ...</small></summary><small>SelfCite leverages a reward signal provided by the LLM itself through context ablation: If a citation is necessary, removing the cited text from the context should prevent the same response; if sufficient, retaining the cited text alone should preserve the same response. This reward can guide the inference-time best-of-N sampling strategy to improve citation quality significantly, as well as be used in preference optimization to directly fine-tune the models for generating better citations. </small></details>|<sub>LongBench-Cite</sub>|
23 | |2024/12/19 <br> (🌟🌟🌟) | [VISA: Retrieval Augmented Generation with Visual Source Attribution](https://arxiv.org/pdf/2412.14457) | Xueguang Ma, Shengyao Zhuang, Bevan Koopman, Guido Zuccon, Wenhu Chen, Jimmy Lin | University of Waterloo, CSIR, University of Queensland | <details><summary><small>This paper presents **VISA** ...</small></summary><small>This work proposes Retrieval-Augmented Generation with Visual Source Attribution (VISA), which processes single or multiple retrieved document images, and generates an answer as well as the bounding box of the relevant region within the evidence document. They curated two datasets: Wiki-VISA and Paper-VISA, to fine-tune the QWen2-VL-72B</small></details>|<sub>Wiki-VISA, Paper-VISA</sub>|
24 | |2024/09/10 <br> (🌟🌟🌟) | [LongCite: Enabling LLMs to Generate Fine-grained Citations in Long-context QA](https://arxiv.org/abs/2409.02897) [[code](https://github.com/THUDM/LongCite): ![](https://img.shields.io/github/stars/THUDM/LongCite.svg?style=social)] | Jiajie Zhang, Yushi Bai, Xin Lv, et. al.| Tsinghua University| <details><summary><small>This paper presents **LongCite** ...</small></summary><small>This work proposes CoF (abbr. for “Coarse to Fine”), that utilizes off-the-shelf LLMs to automatically construct long-context QA instances with precise sentence-level citations. CoF comprises four stages: (1) Starting with a long text material, CoF first invokes the LLM to produce a query and its answer through Self-Instruct. (2) CoF uses the answer to retrieve several chunks from the context, which are then fed into the LLM to incorporate coarse-grained chunk-level citations within the answer. (3) The LLM identifies relevant sentences from each cited chunk to produce fine-grained citations. (4) instances with an insufficient number of citations are discarded. </small></details>|<sub>LongBench-Cite</sub>|
25 | |2024/08/20 <br> (🌟🌟🌟🌟) | [INSTRUCTRAG: Instructing Retrieval-Augmented Generation via Self-Synthesized Rationales](https://arxiv.org/pdf/2406.13629) [[code](https://github.com/weizhepei/InstructRAG): ![](https://img.shields.io/github/stars/weizhepei/InstructRAG.svg?style=social)] | Zhepei Wei, Wei-Lin Chen, Yu Meng | University of Virginia | <details><summary><small>This paper presents **InstructRAG** ...</small></summary><small>This work proposes InstructRAG to generate rationales along with the answer, enhancing both the generation accuracy and trustworthiness. It first prompt an instruction-tuned LLM (rational generator) to synthesize rationales, which is to explain how to derive correct answer from noisy retrieved documents. Then, it guid the LM to learn explict denoising by leveraging these rationals as either in-context learning demonstrations or as supervised fine-tuning data.</small></details>|<sub>PopQA, TriviaQA, NQ, ASQA, 2WikiMHQA </sub>|
26 | |2024/08/08 <br> (🌟🌟🌟🌟) | [Learning Fine-Grained Grounded Citations for Attributed Large Language Models](https://arxiv.org/abs/2408.04568) [[code](https://github.com/LuckyyySTA/Fine-grained-Attribution): ![](https://img.shields.io/github/stars/LuckyyySTA/Fine-grained-Attribution.svg?style=social)] | Lei Huang, Xiaocheng Feng, Weitao Ma, et. al. | Harbin Institute of Technology, Harbin | <details><summary><small>This paper presents **FRONT** ...</small></summary><small>This work introduces *FRONT*, a two-stage training framework designed to teach LLMs to generate Fine-gRained grOuNded ciTations, consisting of Grounding Guided Generation (G3) and Consistency-Aware Alignment (CAA). During the G3 stage, the LLM first selects supporting quotes from retrieved sources (grounding) and then conditions the generation process on them (generation). The CAA stage then utilizes preference optimization to further align the grounding and generation process by automatically constructing preference signals. </small></details>|<sub>ALCE(ASQA, ELI5, QAMPARI)</sub>|
27 | |2024/07/01 <br> (🌟🌟🌟)  | [Ground Every Sentence: Improving Retrieval-Augmented LLMs with Interleaved Reference-Claim Generation](https://arxiv.org/pdf/2407.01796) | Sirui Xia, Xintao Wang, Jiaqing Liang, et al. | Fudan University, AntGroup | <details><summary><small>This paper presents **ReClaim** ...</small></summary><small>Contributions: 1) ReClaim alternately generates citations and answer sentences, to enable large models to generate answer with citations. 2) For ReClaim, they constructed a training dataset and fine-tuned the model using different approaches to improve its attribution capability. 3) Through multiple experiments, they demonstrated the effectiveness of the method in enhancing the model’s verifiability and credibility. </small></details>|<sub>ASQA, ELI5 </sub>|
28 | |2024/03/27<br> (🌟)  | [Improving Attributed Text Generation of Large Language Models via Preference Learning](https://arxiv.org/pdf/2403.18381) [[code](https://github.com/HITsz-TMG/ATG-PO): ![](https://img.shields.io/github/stars/HITsz-TMG/ATG-PO.svg?style=social)] | Dongfang Li, Zetian Sun, Baotian Hu, et. al. | Harbin Institute of Technology (Shenzhen) | <details><summary><small>This paper presents APO ...</small></summary><small>This work conceptualize the attribution task for LLMs as preference learning and proposing an Automatic Preference Optimization (APO) framework. They assemble a curated dataset comprising 6,330 examples sourced and refined from existing datasets for posttraining. Beside, they further propose an automatic method to synthesize attribution preference data resulting in 95,263 pairs. </small></details>|<sub>ASQA, StrategyQA, ELI5</sub>|
29 | |2024/03/04 <br> (🌟🌟)  | [Citation-Enhanced Generation for LLM-based Chatbots](https://arxiv.org/pdf/2402.16063) | Weitao Li, Junkai Li, Weizhi Ma, Yang Liu | Tsinghua University | <details><summary><small>This paper presents CEG ...</small></summary><small>This work proposes a post-hoc Citation-Enhanced Generation (CEG) approach combined with RAG. It consists of three components: 1) *Retrieval Augmentation Module* uses NLTK as sentence tokenizer to obtain claims, then uses dense retrieval (SimCSE Bert) to retrieve documents; 2) *Citation Generation Module* first uses NLI model to determine the relationship between each claim-document pair to select valid reference for each claim; 3) *Response Regeneration Module* takes the question, original response, nonfactual claims, and relevant docs, as prompt input to regenerate the new response.</small></details>|<sub>WikiBio GPT-3, FELM, HaluEval, WikiRetr </sub>|
30 | 
31 | ### 1.3 Long-context Enhance <a id="attribution"></a>
32 | 
33 | | Date       | Title | Authors   | Orgnization | Abs    | Dataset                                                                                           |
34 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
35 | |2025/06/04 <br> (🌟🌟🌟🌟)  | [Stronger Baselines for Retrieval-Augmented Generation with Long-Context Language Models](https://arxiv.org/pdf/2506.03989) [[code](https://github.com/alex-laitenberger/stronger-baselines-rag/): ![](https://img.shields.io/github/stars/alex-laitenberger/stronger-baselines-rag.svg?style=social)] | Alex Laitenberger, Christopher D. Manning, Nelson F. Liu |  Stanford University | <details><summary><small>This paper compares **DOS RAG** ...</small></summary><small>This paper aims to study ``With long-context LLMs (GPT-4o), do multistage retrieval-augmented generation (RAG) pipelines still offer measurable benefits over simpler, single-stage approaches''. The results show that DOS RAG consistently matches or outperforms more intricate methods on ∞Bench, QuALITY, NarrativeQA. </small></details>|<sub>∞Bench, QuALITY, NarrativeQA</sub>|
36 | |2024/09/01 <br> (🌟🌟🌟)  | [LongRAG: Enhancing Retrieval-Augmented Generation with Long-context LLMs](https://arxiv.org/pdf/2406.15319) [[code](https://github.com/TIGER-AI-Lab/LongRAG/): ![](https://img.shields.io/github/stars/TIGER-AI-Lab/LongRAG.svg?style=social)] | Ziyan Jiang, Xueguang Ma, Wenhu Chen |  University of Waterloo | <details><summary><small>This paper presents **LongRAG** ...</small></summary><small>The LongRAG consists of a "long retriever" and a "long reader", which processes the entire Wikipedia into 4K-token units, which is 30x longer than before. It adopts off-the-shelf BGE as retriever and Gemini1.5-Pro or GPT-4o as readers without any further tuning. </small></details>|<sub>NQ, HotpotQA, Qasper, MultiFieldQA-en </sub>|
37 | |2024/07/11 <br> (🌟🌟🌟🌟🌟)  | [LLM Maybe LongLM: SelfExtend LLM Context Window Without Tuning](https://arxiv.org/pdf/2401.01325) [[code](https://github.com/datamllab/LongLM): ![](https://img.shields.io/github/stars/datamllab/LongLM.svg?style=social)] | ZHongye Jin, Xiaotian Han, Jingfeng Yang, et. al. | Texas A&M University | <details><summary><small>This paper presents SelfExtend ...</small></summary><small>SelfExtend extend the context window of LLMs by construncting bi-level attention information without fine-tuning: 1) The grouped attention captures the dependencies amongo tokens that are far apart; 2) The neighbor attention captures dependencies among adjacent tokens within a specified range</small></details>|<sub>LongBench, L-Eval </sub>|
38 | |2024/05/29 <br> (🌟🌟🌟) | [Beyond the Limits: A Survey of Techniques to Extend the Context Length in Large Language Models](https://arxiv.org/pdf/2402.02244) | Xindi Wang, Mahsa Salmani, Parsa Omidi, et. al. | Huawei Tech. Canada, University of Western Ontario | <details><summary><small>This paper presents a survey ...</small></summary><small>This paper survey works in enabling LLMs to handle long sequences, including *length extrapolation*, *attention approximation*, *attention-free transformers*, *model compression*, and *hardware-aware transformers*.</small></details>|<sub>None</sub>|
39 | 
40 | ### 1.4 Reasoning Enhance <a id="reason"></a>
41 | 
42 | | Date       | Title | Authors   | Orgnization | Abs    | Dataset                                                                                           |
43 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
44 | |2024/09/01 <br> (🌟🌟)  | [ReasonFlux: Hierarchical LLM Reasoning via Scaling Thought Templates](https://arxiv.org/abs/2502.06772) [[code](https://github.com/Gen-Verse/ReasonFlux/): ![](https://img.shields.io/github/stars/Gen-Verse/ReasonFlux.svg?style=social)] | Ling Yang, Zhaochen Yu, Bin Cui, Mengdi Wang | Princeton University, Peking University | <details><summary><small>**ReasonFlux**: Finetuning, Qwen2.5.</small></summary><small>It train the *ReasonFlux-32B* model with 8 GPUs and introduces three innovations: (i) a structured **thought template library**, containing around 500 high-level thought templates; (ii) performing hierarchical reinforcement learning on a sequence of thought templates, optimizing a base LLM to plan out an optimal template trajectory for gradually handling complex problems; (iii) a brand new inference scaling system that enables hierarchical LLM reasoning by adaptively scaling thought templates at inference time. </small></details>|<sub> MATH, AIME 2024, AMC 2023, OlympiadBench, Gaokao, En 2023 </sub>|
45 | 
46 | ## 2. Haullucinations <a id="hallucinations"></a><br>
47 | 
48 | | Date       | Title | Authors   | Orgnization | Abs    | Dataset                                                                                           |
49 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
50 | |2023/09/13 <br> (🌟🌟)  | [Cognitive Mirage: A Review of Hallucinations in Large Language Models](https://arxiv.org/pdf/2309.06794.pdf) | Hongbin Ye, Tong Liu, Aijia Zhang, Wei Hua, Weiqiang Jia | Zhejiang Lab | <details><summary><small>This paper presents taxonomy of hallucinations ...</small></summary><small>This work provides a literature review on hallucinations, which presents a taxonomy of hallucinations from several text generation tasks, and mechanism analysis (three types: data collection, knowledge gap, and optimization process), detection methods and improvement approaches.</small></details>| <sub>-</sub>|
51 | 
52 | ## 3. Understanding of LLM <a id="understand"></a><br>
53 | 
54 | | Date       | Title | Authors   | Orgnization | Abs    | Dataset                                                                                           |
55 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
56 | |2025/05/26 <br> (🌟🌟🌟🌟)  | [SFT Memorizes, RL Generalizes: A Comparative Study of Foundation Model Post-training](https://arxiv.org/pdf/2501.17161) | Tianzhe Chu, Yuexiang Zhai, Jihan Yang, et al. | HKU | <details><summary><small>This paper studies the comparative effect of **SFT** and **RL** on generalization and memorization ... </small></summary><details><small> This paper introduces **GeneralPoints**, an arithmetic reasoning card game, and also consider **V-IRL**, a real-world navigation environment, to assess how models trained with SFT and RL generalize to unseen variants. Findings: 1) RL, especially trained with an outcome-based reward, generalizes in both rule-based textual and visual environments. 2) SFT, tends to memorize the training data and struggle to generalize out-of-distribution in either scenario.</small></details>| <sub>GeneralPoints, V-IRL</sub>|
57 | |2025/05/02 <br> (🌟🌟🌟🌟)  | [Physics of Language Models: Part 4.1, Architecture Design and the Magic of Canon Layers](http://zeyuan.allen-zhu.com/paper/2025-canon.pdf) | Zeyuan Allen-Zhu | Meta/FAIR Labs | <details><summary><small>This paper studies architectural differences in LMs... </small></summary><details><small> This paper introduces controlled synthetic pretraining tasks that isolate and evaluate core model capacities. They discover **Canon layers**: lightweight architectural components, that promote horizontal infromation flow across neighboring tokens. </small></details>| <sub>controlled biography dataset</sub>|
58 | |2024/04/08 <br> (🌟🌟🌟🌟)  | [Physics of Language Models: Part 3.3, Knowledge Capacity Scaling Laws](https://arxiv.org/pdf/2404.05405) | Zeyuan Allen-Zhu, Yuanzhi Li | Meta/FAIR Labs | <details><summary><small>This paper studies knowledge capacity scaling laws... </small></summary><details><small> This paper investigate the number of knowldge *bits* a mall stores. They focus on factual knowledge represented as tuples. Findings: 1) LMs can only store *2 bits of knowledge per parameter, even when quantized to int8*, and 7B model can store 14B bits of knowledge. 2) The GPT-2 architecture, with rotary embedding, matches or even surpasses LLaMA/Mistral architectures in knowledge storage, particularly over shorter training durations. 3)Prepending training data with domain names (e.g., wikipedia.org) significantly increases a model’s knowledge capacity. </small></details>| <sub>controlled biography dataset</sub>|
59 | |2023/09/18 <br> (🌟🌟🌟🌟)  | [Physics of Language Models: Part 3.2, Knowledge Manipulation](https://arxiv.org/pdf/2309.14402) | Zeyuan Allen-Zhu, Yuanzhi Li | Meta/FAIR Labs | <details><summary><small>This paper studies knowledge manipulation of LLMs... </small></summary><details><small> This paper investigate four knowledge manipulation tasks: retrieval, classification, comparison, and inverse search. Findings: 1) LLM **excel in knowledge retrieval** but **struggle even in the simplest classification or comparison tasks unless Chain of Thoughts (CoT)**, and 2) the performance of **inverse knowledge search is virtually 0%**, regardless of the prompts.</small></details>| <sub>controlled biography dataset</sub>|
60 | |2023/09/18 <br> (🌟🌟🌟🌟)  | [Physics of Language Models: Part 3.1, Knowledge Storage and Extraction](https://arxiv.org/pdf/2309.14316) | Zeyuan Allen-Zhu, Yuanzhi Li | Meta/FAIR Labs | <details><summary><small>This paper studies knowledge storage and Extraction of LLMs... </small></summary><details><small> This paper investigate whether the question-answering capabilities of LLMs stem from **pattern recognition and memorization** or from **a genuine ability to reason and extract knowledge** from their training data. Findings: 1) rewrite the pre-training data-using small, auxiliary models-to provide knowledge augmentation, and 2) incorporate more instruction-finetuning data into the pretraining stage before it becomes too late.</small></details>| <sub>controlled biography dataset</sub>|
61 | 
62 | ## 4. Datasets <a id="datasets"></a>
63 | 
64 | ### 4.1 Factoid QA <a id="fqa"></a>
65 | 
66 | | Date       | Title | Authors   | Orgnization | Abs    | Dataset                                                                                           |
67 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
68 | |2024/01/26 | [Benchmarking Large Language Models in Complex Question Answering Attribution using Knowledge Graphs](https://arxiv.org/abs/2401.14640.pdf)| Nan Hu, Jiaoyan Chen, Yike Wu, Guilin Qi, Sheng Bi, Tongtong Wu, Jeff Z. Pan.|Southeast University, The University of Manchester,The University of Edinburgh |<details><summary><small>This paper presents CAQA ...</small></summary><small>CAQA is a new benchmark for complex question answering attribution, which is designed to evaluate the ability of LLMs to answer complex questions with the help of knowledge graphs.</small></details>| <sub> CAQA </sub>|
69 | |2022/04/12| [ASQA: Factoid Questions Meet Long-Form Answers](https://arxiv.org/abs/2204.06092.pdf) [[dataset](https://huggingface.co/datasets/din0s/asqa)]|Ivan Stelmakh, Yi Luan, Bhuwan Dhingra, Ming-Wei Chang|Carnegie Mellon University, Duke University, Google Research |<details><summary><small>This paper presents ASQA ...</small></summary><small>ASQA is the first long-form question answering dataset that focuses on ambiguous factoid questions.</small></details>| <sub>ASQA</sub>|
70 | 
71 | ### 3.2 Long-form QA <a id="lfqa"></a>
72 | 
73 | 


--------------------------------------------------------------------------------
/mediation.md:
--------------------------------------------------------------------------------
 1 | 
 2 | ## Introduction
 3 | 
 4 | The retrieved documents often contain a list of passages which are ranked by their relevance score to the question. It would be costly to directly input these passages into the LLM. On one hand, <u>the relevance score of these passages does not necessarily indicate their usefulness for answer generation, which could introduce noise to the LLM</u>. On the other hand, <u>the length of the list could exceed the length limit of the LLM</u>. Thus, the mediation (also called post-retrieval) component is introduced to select or compress the retrieved content.
 5 | 
 6 | 
 7 | 
 8 | ## 1. Survey papers <a id="survey"></a>
 9 | 
10 | | Date       | Title | Authors   | Orgnization | Abs    |                                                                           
11 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
12 | |2024/10/02 <br> (🌟)| [Contextual Compression in Retrieval-Augmented Generation for Large Language Models: A Survey](https://arxiv.org/pdf/2409.13385) |Sourav Verma|IBM Watsonx Client Engineering, India |<details><summary><small> Survey on contextual compression.</small></summary><small>**Contextual compression for large language models**: semantic compression, pre-trained language models, retrievers.</small></details>|
13 | 
14 | 
15 | ## 2. Compression-based Adapter <a id="compress"></a>
16 | ### 2.1 Selective Methods <a id="selection"></a>
17 | Selective Methods aims to select a subset of tokens from original contexts, to filter out noises in the context.
18 | 
19 | | Date       | Title | Authors   | Orgnization | Abs    | Dataset                                                                                           |
20 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
21 | |2025/01/27<br> (🌟🌟🌟)| [Provence: efficient and robust context pruning for retrieval-augmented generation](https://arxiv.org/pdf/2501.16214) <br>[[Huggingface Model](https://huggingface.co/naver/provence-reranker-debertav3-v1)]|Nadezhda Chirkova, Thibault Formal, Vassilina Nikoulina, Stéphane Clinchant|KNAVER LABS Europe|<details><summary><small>**Provence**.</small></summary><small>**Provence (Pruning and Reranking Of retrieVEd relevaNt ContExts)** poses the context pruning as a sequence labeling task. It finetunes DeBERTa model to encode query-context pair and output binary masks. The labels for training are generated by LLama-3-8B-Instruct.</small></details>| <sub>NQ, TyDi QA, PopQA, HotpotQA, BioASQ, SyllabusQA, RGB</sub> |
22 | |2024/12/18<br> (🌟🌟🌟🌟)| [EXIT: Context-Aware Extractive Compression for Enhancing Retrieval-Augmented Generation](https://arxiv.org/abs/2412.12559) <br>[[code](https://github.com/ThisIsHwang/EXIT): ![](https://img.shields.io/github/stars/ThisIsHwang/EXIT.svg?style=social)]|Taeho Hwang, Sukmin Cho, Soyeong Jeong, et al.|Korea Advanced Institute of Science and Technology|<details><summary><small>**EXIT**.</small></summary><small>**EXIT (EXtractIve ContexT compression)** operates in three stages: 1) splitting retrieved documents into sentences (rule-based), 2）performing parallelizable binary classification ("YES" or "NO") on each sentence (Gemma-2B-it), 3) recombining selected sentences while preserving their orginial order. (LLaMA3.1-8B)</small></details>| <sub>NQ, TriviaQA, HotpotQA, 2WikiMultiHopQA</sub> |
23 | |2024/03/21 <br> (🌟🌟)| [FIT-RAG: Black-Box RAG with Factual Information and Token Reduction](https://arxiv.org/pdf/2403.14374) |Yuren Mao, Xuemei Dong, Wenyi Xu, et al. |Zhejiang University |<details><summary><small>This paper presents FIT-RAG ...</small></summary><small>**FIT-RAG** utilizes the factual information in the retrieval and reduces the number of tokens for augmentation. It consists of five components: a similarity-based retriever, a bi-label document scorer, a bi-faceted self-knowledge recognizer, a sub-document-level token reducer and a prompt construction module.</small></details>| <sub>TriviaQA, NQ, PopQA</sub> |
24 | |2024/03/19 <br> (🌟🌟🌟)| [LLMLingua-2: Data Distillation for Efficient and Faithful Task-Agnostic Prompt Compression](https://arxiv.org/abs/2403.12968) <br>[[code](https://github.com/microsoft/LLMLingua): ![](https://img.shields.io/github/stars/microsoft/LLMLingua.svg?style=social)]|Zhuoshi Pan, Qianhui Wu, et al. |Microsoft Corporation |<details><summary><small>**LLMLingua-2**.</small></summary><small>**LLMLingua-2** formulate prompt compression as a token classification problem to guarantee the faithfulness of the compressed prompt to the original one, and use a Transformer encoder as the base architecture to capture all essential information for prompt compression from the full bidirectional context.  </small></details>| <sub>MeetingBank, LongBench, ZeroScrolls, GSM8K, BBH</sub> |
25 | |2023/11/14 <br> (🌟🌟🌟🌟)| [Learning to Filter Context for Retrieval-Augmented Generation](https://arxiv.org/pdf/2311.08377) <br>[[code](https://github.com/zorazrw/filco): ![](https://img.shields.io/github/stars/zorazrw/filco.svg?style=social)]|Zhiruo Wang, Jun Araki, et al. |Carnegie Mellon University |<details><summary><small>**FILCO**.</small></summary><small>**FILCO** improves the quality of the context provided to the generator by (1) identifying useful context based on lexical and information-theoretic approaches, and (2) training context filtering models that can filter retrieved contexts at test time.</small></details>| <sub>NQ, TriviaQA, ELI5, FEVER, WoW</sub> |
26 | |2023/10/10 <br> (🌟🌟🌟)| [LongLLMLingua: Accelerating and Enhancing LLMs in Long Context Scenarios via Prompt Compression](https://arxiv.org/abs/2310.06839) <br>[[code](https://github.com/microsoft/LLMLingua): ![](https://img.shields.io/github/stars/microsoft/LLMLingua.svg?style=social)]|Huiqiang Jiang, Qianhui Wu, et al. |Microsoft Corporation |<details><summary><small>**LongLLMLingua**.</small></summary><small>**LongLLMLingua** conducts prompt compression towards improving LLMs’ perception of the key information to address three challenges: higher computational/financial cost, longer latency, and inferior performance. </small></details>| <sub> LongBench, ZeroSCROLLS, MuSiQue, LooGLE</sub> |
27 | |2023/10/09 <br> (🌟🌟🌟🌟)| [LLMLingua: Compressing Prompts for Accelerated Inference of Large Language Models](https://arxiv.org/abs/2310.05736)<br>[[code](https://github.com/microsoft/LLMLingua): ![](https://img.shields.io/github/stars/microsoft/LLMLingua.svg?style=social)] |Huiqiang Jiang, Qianhui Wu, et al. |Microsoft Corporation |<details><summary><small>**LLMLingua**: LLaMA-7B to identify and remove non-essential tokens.</small></summary><small>**LLMLingua** is a coarse-to-fine prompt compression method that involves a budget controller to maintain semantic integrity under high compression ratios, a token-level iterative compression algorithm to better model the interdependence between compressed contents, and an instruction tuning based method for distribution alignment between language models.  </small></details>|  <sub>GSM8K, BBH, ShareGPT, and Arxiv-March23</sub> |
28 | |2023/09/02 <br> (🌟🌟)| [LeanContext: Cost-Efficient Domain-Specific Question Answering Using LLMs](https://arxiv.org/abs/2309.00841) |Md Adnan Arefeen, Biplob Debnath, Srimat Chakradhar |NEC Laboratories America |<details><summary><small>**LeanContext**: ranking sentence with cosine.</small></summary><small>**LeanContext** extracts *k key sentences* from the context that are closely aligned with the query. LeanContext introduces a reinforcement learning technique that dynamically determines k based on the query and context. The rest of the less important sentences are reduced using a free open source text reduction method. </small></details>| <sub>Arxiv, BBC News</sub> |
29 | |2023/04/24 <br> (🌟🌟🌟🌟)| [Compressing Context to Enhance Inference Efficiency of Large Language Models](https://aclanthology.org/2023.emnlp-main.391.pdf) <br>[[code](https://github.com/lliyucheng09/Selective_Context): ![](https://img.shields.io/github/stars/liyucheng09/Selective_Context.svg?style=social)]|Yucheng Li, Bo Dong, Frank Guerin, Chenghua Lin |University of Surrey, University of Manchester, University of Sheffield, UK|<details><summary><small>**Selective_Context**: LLaMA-7B token probabilities</small></summary><small>**Selective_Context** evaluates informativeness of *lexical units (i.e., tokens, phrases, or sentences)* with self-information computed by a base causal language model. It selectively retains content with higher self-information.</small></details>| <sub>axXiv papers, BBC News, ShareGPT.com</sub> |
30 | 
31 | 
32 | 
33 | ### 2.2 Abstractive Methods <a id="abstractive"></a>
34 | 
35 | Abstractive Methods usually compress contexts by generating summarys, to filter out noises in the context.
36 | 
37 | | Date       | Title | Authors   | Orgnization | Abs    | Dataset                                                                                           |
38 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
39 | |2025/05/21<br> (🌟🌟🌟)| [Beyond Hard and Soft: Hybrid Context Compression for Balancing Local and Global Information Retention](https://arxiv.org/abs/2505.15774) <br>[[code](https://github.com/Xnhyacinth/HyCo2): ![](https://img.shields.io/github/stars/Xnhyacinth/HyCo2.svg?style=social)]| Huanxuan Liao, Wen Hu, Yao Xu, et al.| Institute of Automation, CAS| <details><summary><small>**HyCo2**:finetune, LLaMA3.1-8B.</small></summary><small>**HyCo2** integrates global and local perspectives to guide context compression. It uses a hybrid adapter to refine global semantics, and incorporates a classification layer to assign a retension probability to each token on the local view.</small></details>| <sub>NQ, TriviaQA, WebQuestions, PopQA, ComplexWebQuestions, HotpotQA, 2WikiMultihopQA </sub> |
40 | |2024/10/14<br> (🌟🌟🌟)| [COMPACT: Compressing Retrieved Documents Actively for Question Answering](https://arxiv.org/abs/2407.09014) <br>[[code](https://github.com/dmis-lab/CompAct): ![](https://img.shields.io/github/stars/dmis-lab/CompAct.svg?style=social)]|Chanwoong Yoon,Taewhoo Lee, Hyeon Hwang, et al.|Korea University|<details><summary><small>**CompAct**: instruction-tuned Mistral-7B.</small></summary><small>**CompAct** groups documents into several segments, then sequentially compress segments into a compacted context. It uses a subset of HotpotQA training set for data collection, and utilizes GPT-4o API to collect dataset.</small></details>| <sub>NQ, TriviaQA, HotpotQA, 2WikiMultiHopQA, MuSiQue</sub> |
41 | |2024/07/04<br> (🌟🌟🌟)| [Attribute First, then Generate: Locally-attributable Grounded Text Generation](https://arxiv.org/abs/2403.17104) <br>[[code](https://github.com/lovodkin93/attribute-first-then-generate): ![](https://img.shields.io/github/stars/lovodkin93/attribute-first-then-generate.svg?style=social)]|Aviv Slobodkin, Eran Hirsch et al. |Bar-Ilan University|<details><summary><small>**AttrFirst**.</small></summary><small>**AttrFirst** propose a locally-attributable text generation approach, with prompt-based three steps: 1) content selection (choosing relevant spans from source texts), 2)sentence-level planning (organizing and grouping content), 3)sentence-by-sentence generation (based on selected and structured output).</small></details>| <sub>DUC, TAC, MultiNews </sub>|
42 | |2024/02/15<br> (🌟🌟🌟)| [Grounding Language Model with Chunking-Free In-Context Retrieval](https://arxiv.org/abs/2402.09760) | Hongjin Qian, et al. | Gaoling School of Artificial Intelligence, Renmin University of China, | <details><summary><small>**CFIC**.</small></summary><small>This paper presents a novel Chunking-Free In-Context (CFIC) retrieval approach, specifically tailored for Retrieval-Augmented Generation (RAG) systems. CFIC addresses these challenges by circumventing the conventional chunking process. It utilizes the encoded hidden states of documents for in-context retrieval, employing auto-aggressive decoding to accurately identify the specific evidence text required for user queries, eliminating the need for chunking. CFIC is further enhanced by incorporating two decoding strategies, namely Constrained Sentence Prefix Decoding and Skip Decoding. These strategies not only improve the efficiency of the retrieval process but also ensure that the fidelity of the generated grounding text evidence is maintained.</small></details>| <sub>NarrativeQA, Qasper, MultiFieldQA, HotpotQA, MuSiQue</sub> |
43 | |2023/10/25<br> (🌟🌟)| [TCRA-LLM: Token Compression Retrieval Augmented Large Language Model for Inference Cost Reduction](https://arxiv.org/abs/2310.15556) |Junyi Liu, Liangzhi Li, et al. |Meetyou AI Lab|<details><summary><small>This paper presents TCRA...</small></summary><small>**TCRA** propose a token compression scheme that includes two methods: summarization compression and semantic compression. The first method applies a T5-based model that is fine-tuned by datasets generated using self-instruct containing samples with varying lengths and reduce token size by doing summarization. The second method further compresses the token size by removing words with lower impact on the semantic.</small></details>| <sub>FRDB</sub> |
44 | |2023/04/12 <br> (🌟🌟🌟)| [RECOMP: Improving Retrieval-Augmented LMs With Compression and Selective Augmentation](https://arxiv.org/pdf/2310.04408.pdf) <br>[[code](https://github.com/carriex/recomp): ![](https://img.shields.io/github/stars/carriex/recomp.svg?style=social)]|Fangyuan Xu, Weijia Shi, Eunsol Choi1 |The University of Texas at Austin, University of Washington|<details><summary><small>**Recomp**.</small></summary><small>**Recomp** introduces two types of compressors: an <u>extractive</u> compressor that selects pertinent sentences from retrieved documents, and an <u>abstractive</u> compressor that produces concise summaries by amalgamating information from multiple documents.</small></details>| <sub>WikiText-103, NQ, TriviaQA, HotpotQA</sub> |
45 | 
46 | ## 3. Thoughts-based Methods <a id="thoughts"></a>
47 | 
48 | | Date       | Title | Authors   | Orgnization | Abs    | Dataset                                                                                           |
49 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
50 | |2024/03/28 <br> (🌟🌟🌟)| [ActiveRAG: Autonomously Knowledge Assimilation and Accommodation through Retrieval-Augmented Agents](https://arxiv.org/pdf/2402.13547) [[code](https://github.com/OpenMatch/ActiveRAG): ![](https://img.shields.io/github/stars/OpenMatch/ActiveRAG.svg?style=social)]| Zhipeng Xu, Zhenghao Liu, Yukun Yan, et al.|Northeastern University, China|<details><summary><small>**ActiveRAG** ...</small></summary><small>The ActiveRAG workflow follows three-step: 1) the *Self-Inquiry Agent* produce chain-of-thought (P) to answer the question using LLM based on Q. 2) The *Knowledge Assimilation agent* generates an assimilation rational (T) based on Q and D. 3) The *Thought Accommodation agent* generates responses based on (Q, T,P).</small></details>| <sub>PopQA, TriviaQA, NQ, 2WikiMHQA, ASQA</sub> |
51 | |2023/10/17 <br> (🌟🌟🌟🌟)| [SELF-RAG: Learning To Retrieve, Generate, and Critique  Through SELF-Reflection](https://arxiv.org/pdf/2310.11511) [[code](https://github.com/AkariAsai/self-rag): ![](https://img.shields.io/github/stars/AkariAsai/self-rag.svg?style=social)]|YAkari Asai, Zeqiu Wu, Yizhong Wang, Avirup Sil, Hannaneh Hajishirzi|University of Washington|<details><summary><small>**Self-RAG**.</small></summary><small>This work introduces a framework called Self-Reflective Retrieval-Augmented Generation that enhances an LM’s quality and factuality through retrieval and self-reflection. It trains a single LM that adaptively retrieves passages on-demand, and generates and reflects on retrieved passages and its own generations using special tokens, called reflection tokens.</small></details>| <sub>PubHealth, ARC-Challenge, PopQA, TRiviaQA, ALCE-ASQA</sub> |
52 | |2023/10/8<br> (🌟🌟)| [Self-Knowledge Guided Retrieval Augmentation for Large Language Models](https://arxiv.org/abs/2310.05002) [[code](https://github.com/THUNLP-MT/SKR): ![](https://img.shields.io/github/stars/THUNLP-MT/SKR.svg?style=social)]|Yile Wang, Peng Li, Maosong Sun, Yang Liu| Tsinghua University|<details><summary><small>**SKR**.</small></summary><small>This work investigate eliciting the model's ability to recognize what they know and do not know (which is also called self-knowledge) and propose Self-Knowledge guided Retrieval augmentation (SKR), a simple yet effective method which can let LLMs refer to the questions they have previously encountered and adaptively call for external resources when dealing with new questions.</small></details>| <sub>TemporalQA, CommonsenseQA, TabularQA, StrategyQA, TruthfulQA</sub> |
53 | 
54 | 
55 | ## 4. Preference (Dual) Alignment Methods <a id="align"></a>
56 | 
57 | ## 4.1. Finetuning-based Alignment <a id="finetune"></a>
58 | Finetuning both retriever and generator to align them for better retrieval and generation, respectively.
59 | 
60 | | Date       | Title | Authors   | Orgnization | Abs    | Dataset                                                                                           |
61 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
62 | |2024/07/18 | [Understand What LLM Needs: Dual Preference Alignment for Retrieval-Augmented Generation](https://arxiv.org/pdf/2406.18676) [[code](https://github.com/dongguanting/DPA-RAG): ![](https://img.shields.io/github/stars/dongguanting/DPA-RAG.svg?style=social)]| Guanting Dong, Yutao Zhu, Chenghao Zhang, Zechen Wang, Zhicheng Dou, Ji-Rong Wen|Renmin University of China|<details><summary><small>This paper presents *DPA-RAG* ...</small></summary><small>DPA-RAG consists of three key components: (1) Preference Knowledge Construction: it first extracts the specific knowledge that significantly affects LLMs’ reasoning preferences. Then we introduce five query augmentation strategies and a quality filtering process to synthesize high-quality preference knowledge. (2) Reranker-LLM Alignment: it designs multi-grained alignment tasks for fine-tuning a preference-aligned reranker. (3) LLM Self-Alignment: it introduces a pre-aligned phrase prior to the vanilla SFT stage.</small></details>| <sub>NQ, TriviaQA, HotpotQA, WebQSP</sub> |
63 | |2023/5/24| [REPLUG: Retrieval-Augmented Black-Box Language Models](https://arxiv.org/abs/2301.12652.pdf) [[code](https://github.com/swj0419/REPLUG): ![](https://img.shields.io/github/stars/swj0419/REPLUG.svg?style=social)]|Weijia Shi, Sewon Min, Michihiro Yasunaga, et. al.|University of Washington, Stanford University, KAIST, Meta AI|<details><summary><small>This paper presents REPLUT ...</small></summary><small>This work introduce REPLUG, which prepends each retrieved document and question separately to the LLM and ensembles output probabilities from different passes. Besides, it takes LM to score documents to supervise the dense retriever training.</small></details>| <sub>Pile, NQ, TriviaQA</sub> |
64 | |2022/11/16| [Atlas: Few-shot Learning with Retrieval Augmented Language Models](https://arxiv.org/abs/2208.03299.pdf)<br> [[code](https://github.com/facebookresearch/atlas): ![](https://img.shields.io/github/stars/facebookresearch/atlas.svg?style=social)]|Gautier Izacard, Patrick Lewis, Maria Lomeli, et. al.|Meta AI, ENS, PSL University, Inria, UCL|<details><summary><small>This paper presents Atlas ...</small></summary><small>This work present Atlas, a retrieval (Contriever) augmented language model (T5) by carefully designed training, i.e., 1) jointly pre-train the retriever and LLM using unsupervised output, 2) efficient retriever fine-tuning (including full index update, reranking, and query-side fine-tuning).</small></details>| <sub>KILT, MMLU, NQ, TriviaQA</sub> |
65 | 
66 | 
67 | ## 4.2. Interative-based Alignment <a id="iterative"></a>
68 | Iterative between retriever and generator to align them for better retrieval and generation, respectively.
69 | 
70 | 
71 | | Date       | Title | Authors   | Orgnization | Abs    | Dataset                                                                                           |
72 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
73 | |2025/01/24 | [Chain-of-Retrieval Augmented Generation](https://arxiv.org/pdf/2501.14342)  | Liang Wang, Haonan Chen, Nan Yang, et. al.| Microsoft Corporation, Renming University of China| <details><summary><small>This paper presents CoRAG ...</small></summary><small>This work propose **CoRAG**, simulates the iterative, step-by-step reasoning process to solve complex questions. It allows the model to refine its query, gather new insights, and synthesize information in a more structured way. They propose *rejection sampling* to generates intermediate retrieval chains (chain of sub-queries and sub-answers) to fine-tune an LLM using standard next-token prediction. </small></details>| <sub>HotpotQA,2WikiMHQA, Bamboogle, Musique, and the KILT benchmark.</sub> |
74 | |2024/11/29 | [Auto-RAG: Autonomous Retrieval-Augmented Generation for Large Language Models](https://arxiv.org/abs/2411.19443) <br>[[code](https://github.com/ictnlp/Auto-RAG): ![](https://img.shields.io/github/stars/ictnlp/Auto-RAG.svg?style=social)] | Yu Tian, Shaolei Zhang, Yang Feng| ICT, CAS| <details><summary><small>This paper presents Auto-RAG ...</small></summary><small>This work propose **Auto-RAG**, which autonomously synthesizing reasoning-based decision-making instructions in iterative retrieval and fine-tuned the open-source LLMs. </small></details>| <sub>NQ, HotpotQA,2WikiMHQA, TriviaQA, PopQA, WebQuestions.</sub> |
75 | |2023/10/23 | [Enhancing Retrieval-Augmented Large Language Models with Iterative Retrieval-Generation Synergy](https://arxiv.org/pdf/2305.15294.pdf) | Zhihong Shao, Yeyun Gong, yelong shen, Minlie Huang, et. al.| Tsinghua University, Microsoft Research Asia| <details><summary><small>This paper presents ITER-RETGEN ...</small></summary><small>This work propose **ITER-RETGEN**, which iterates retrieval-augmented generation and generation-augmented retrieval. Besides, they find that exact match can significantly underestimate the performance of LLMs, and using LLMs to evaluate is more reliable.</small></details>| <sub>HotpotQA, 2WikiMHQA, MuSiQue, Feverous, StrategyQA</sub> |
76 | |2023/10/08 | [Retrieval-Generation Synergy Augmented Large Language Models](https://arxiv.org/pdf/2310.05149.pdf) | Zhangyin Feng, Xiaocheng Feng, Dezhi Zhao, Maojin Yang, Bing Qin| Harbin Institute of Technology| <details><summary><small>This paper presents ITRG ...</small></summary><small>This work propose **ITRG**, which contains two steps: 1) generation augmented retrieval (GAR) to expand the query based on previous iteration to help retrieve, 2) retrieval augmented generation (RAG) to generate new document to answer questions based on retrieved documents.</small></details>| <sub>NQ, TriviaQA, 2WikiMHQA, HotpotQA</sub> |
77 | |2023/6/22 | [Interleaving Retrieval with Chain-of-Thought Reasoning for Knowledge-Intensive Multi-Step Questions](https://arxiv.org/abs/2212.10509) <br>[[code](https://github.com/stonybrooknlp/ircot): ![](https://img.shields.io/github/stars/stonybrooknlp/ircot.svg?style=social)] | Harsh Trivedi, Niranjan Balasubramanian, Tushar Khot, Ashish Sabharwal| Stony Brook University, Allen Institute for AI| <details><summary><small>This paper presents IRCoT ...</small></summary><small>This work propose **IRCoT**, which interleaves CoT generation and retrieval steps to guid the retrieval by CoT and vice-versa. Two steps: 1) reason step generates next CoT sentence based on question, retrieved passage, and CoT sentences; 2) retrieval step retrieves K more passages based on the last CoT sentence.</small></details>| <sub>HotpotQA,2WikiMHQA, MuSiQue, and IIRC</sub> |
78 | 


--------------------------------------------------------------------------------
/ranker.md:
--------------------------------------------------------------------------------
  1 | ## Introduction
  2 | 
  3 | Coming soon ...
  4 | 
  5 | 
  6 | 
  7 | ## 1. Retrieval Methods <a id="retrieval"></a>
  8 | 
  9 | ### 1.1 LLM as Retriever <a id="tunembedding"></a>
 10 | 
 11 | | Date       | Title | Authors   | Orgnization | Abs    | Base | Dataset     |
 12 | |------------|-------------------|---------------|------------------------|-----------------|------------------|--------------|
 13 | |2024/12/17 <br> (🌟🌟🌟🌟🌟)| [LLMs are Also Effective Embedding Models: An In-depth Overview](https://arxiv.org/pdf/2412.12591) | Chongyang Tao, Tao Shen, Shen Gao, et. al.| Beihang University, Tencent|<details><summary><small>**Survey**</small></summary><small>...</small></details>| -| <sub>-</sub>|
 14 | |2024/8/29  <br> (🌟🌟🌟)| [Conan-embedding: General Text Embedding with More and Better Negative Samples](https://arxiv.org/pdf/2408.15710) | Shiyu Li, Yang Tang, Shi-Zhen Chen, et. al.| Peking University, Tencent|<details><summary><small>This paper presents **conan-embedding** ...</small></summary><small>This work present **Conan-embedding**, which maximizes the utilization of more and higher-quality negative examples.</small></details>| <sub>MTEB</sub>|
 15 | |2024/4/9|[LLM2Vec: Large Language Models Are Secretly Powerful Text Encoders](https://arxiv.org/pdf/2404.05961) <br>[[code](https://github.com/McGill-NLP/llm2vec): ![](https://img.shields.io/github/stars/McGill-NLP/llm2vec.svg?style=social)] |Parishad BehnamGhader, Vaibhav Adlakha, Marius Mosbach, et. al.|McGill University, Meta|<details><summary><small>This paper presents **LLM2Vec** ...</small></summary><small>This work introduces **LLM2Vec**, an unsupervised method to transform decoder-only large language models (LLMs) into powerful text encoders. The approach comprises three steps: (1) enabling bidirectional attention, (2) masked next token prediction, and (3) unsupervised contrastive learning. Applied to models ranging from 1.3B to 8B parameters, LLM2Vec achieves state-of-the-art performance on the Massive Text Embeddings Benchmark (MTEB) among models trained solely on publicly available data. The method is parameter-efficient and does not require expensive adaptation or synthetic data.</small></details>|<sub>MTEB</sub>|
 16 | |2024/3/29 <br> (🌟🌟🌟)| [Gecko: Versatile Text Embeddings Distilled from Large Language Models](https://arxiv.org/pdf/2403.20327) | Jinhyuk Lee, Zhuyun Dai, Xiaoqi Ren, et. al.| Google Deepmind|<details><summary><small>This paper presents **Gecko** ...</small></summary><small>This work present **Gecko**, a compact and versatile text embedding modle, which employs a two-stage distillation process by generating data and refining data quality based on large language models</small></details>| <sub>MTEB</sub>|
 17 | |2024/2/23  <br> (🌟🌟🌟)| [Repetition Improves Language Model Embeddings](https://arxiv.org/pdf/2402.15449.pdf) <br>[[code](https://github.com/jakespringer/echo-embeddings): ![](https://img.shields.io/github/stars/jakespringer/echo-embeddings.svg?style=social)] |Jacob Mitchell Springer, Suhas Kotha, Daniel Fried, et. al.| CMU| <details><summary><small>This paper presents **echo embeddings** ...</small></summary><small>This work present "**echo embeddings**", in which they repeat the input twice in context and extract embeddings from the second occurrence (i.e., repetition captures bidirectional information)</small></details>| <sub>MTEB</sub>|
 18 | |2024/2/15  <br> (🌟🌟🌟)| [Generative Representational Instruction Tuning](https://arxiv.org/abs/2402.09906.pdf), <br>[[code](https://github.com/ContextualAI/gritlm): ![](https://img.shields.io/github/stars/ContextualAI/gritlm.svg?style=social)]|Niklas Muennighoff, Hongjin Su, Liang Wang, et. al. | Contextual AI, The University of Hong Kong, Microsoft|<details><summary><small>This paper presents **GRIT** ...</small></summary><small>This work introduces generative representational instruction tuning (**GRIT**) whereby a large language model is trained to handle both generative and embedding tasks by distinguishing between them through instructions</small></details>| <sub>MTEB</sub>|
 19 | |2024/2/10  <br> (🌟🌟🌟🌟)| [BGE M3-Embedding: Multi-Lingual, Multi-Functionality,Multi-Granularity Text Embeddings Through Self-Knowledge Distillation](https://arxiv.org/abs/2402.03216.pdf)|Liang Wang, Nan Yang, Xiaolong Huang, et. al.|BAAI, USTC|<details><summary><small>This paper presents **M3-embedding** ...</small></summary><small>This work present a new embedding model, called **M3-Embedding**, which is distinguished for its versatility in Multi-Linguality, Multi-Functionality, and Multi-Granularity</small></details>| <sub>MIRACL, MKQA, MLDR</sub>|
 20 | |2024/1/19  <br> (🌟🌟)| [Improving Text Embeddings with Large Language Models](https://arxiv.org/abs/2401.00368.pdf)| Jianlv Chen, Shitao Xiao, Peitian Zhang, et. al.|Microsoft|<details><summary><small>This paper presents ...</small></summary><small>This work introduce a novel and simple method for obtaining high-quality text embeddings using only synthetic data and less than 1k training steps.</small></details>| <sub> BEIR, MTEB</sub>|
 21 | |2023/12/24 <br> (🌟🌟🌟🌟)|[Making Large Language Models A Better Foundation For Dense Retrieval](https://arxiv.org/pdf/2312.15503.pdf) <br>[[code](https://github.com/FlagOpen/FlagEmbedding): ![](https://img.shields.io/github/stars/FlagOpen/FlagEmbedding.svg?style=social)]| Chaofan Li, Zheng Liu, Shitao Xiao, et. al.|BAAI, BUPT|<details><summary><small>This paper presents **LLaRA** ...</small></summary><small>This work includes **LLaRA** (LLM Adapted for dense RetriAl), which introduce two pretext training tasks EBAE (Embedding-Based Auto-Encoding) and EBAR (Embedding-Based Auto-Regression) to improve LLaMA for dense retrieval.</small></details>|<sub>MS MARCO passage&document, BEIR</sub>|
 22 | |2023/10/25 <br> (🌟🌟🌟)|[Retrieve Anything To Augment Large Language Models](https://arxiv.org/abs/2310.07554)<br>[[code](https://github.com/FlagOpen/FlagEmbedding): ![](https://img.shields.io/github/stars/FlagOpen/FlagEmbedding.svg?style=social)]|AutPeitian Zhang, Shitao Xiao, Zheng Liu, et. al.|BAAI, Renmin Univeristy of China, University of Montreal|<details><summary><small>This paper presents **LLM-Embedder** ...</small></summary><small>This work present a novel approach, the **LLM-Embedder**, which comprehensively supports the diverse retrieval augmentation needs of LLMs with one unified embedding model.</small></details>| <sub>MMLU, PopQA</sub>|
 23 | |2023/8/7|[Towards General Text Embeddings with Multi-stage Contrastive Learning](https://arxiv.org/pdf/2308.03281)|Zehan Li, Xin Zhang, Yanzhao Zhang, et. al.|Alibaba|<details><summary><small>This paper presents **GTE** ...</small></summary><small>This work introduces **GTE** (General Text Embeddings), a model trained using a multi-stage contrastive learning framework. The training involves large-scale unsupervised pre-training followed by supervised fine-tuning across diverse datasets. Despite its relatively modest parameter count of 110M, GTE<sub>base</sub> outperforms larger models and even surpasses the performance of OpenAI's black-box embedding API on the Massive Text Embedding Benchmark (MTEB). Notably, GTE also demonstrates strong capabilities in code retrieval tasks by treating code as text, achieving superior results without additional fine-tuning on specific programming languages.</small></details>|<sub>MTEB, BEIR, code retrieval</sub>|
 24 | |2023/5/30 <br> (🌟🌟🌟🌟)| [One Embedder, Any Task: Instruction-Finetuned Text Embeddings](https://arxiv.org/abs/2212.09741)<br>[[code](https://github.com/xlang-ai/instructor-embedding): ![](https://img.shields.io/github/stars/xlang-ai/instructor-embedding.svg?style=social)]|Hongjin Su, Weijia Shi, Jungo Kasai, et. al.|University of Hong Kong, University of Washingtong, Meta AI, Allen Institute for AI|<details><summary><small>This paper presents **INSTRUCTOR** ...</small></summary><small>This work introduce **INSTRUCTOR**, a new method for computing text embeddings given task instructions: every text input is embedded together with instructions explaining the use case (e.g., task and domain descriptions).</small></details>|GTR| <sub>MTEB</sub>|
 25 | |2022/9/23 <br> (🌟🌟🌟)| [Promptagator: Few-shot Dense Retrieval From 8 Examples](https://arxiv.org/abs/2209.11755)|Zhuyun Dai, Vincent Y. Zhao, Ji Ma, et. al.|Google Research|<details><summary><small>This paper presents **Promptagator** ...</small></summary><small>This work propose Prompt-base Query Generation for Retriever (**Promptagator**), which leverages large language models (LLM) as a few-shot query generator, and creates task-specific retrievers based on the generated data.</small></details>|FLAN| <sub>BEIR</sub>|
 26 | 
 27 | 
 28 | ### 1.2 Reason-enhanced LLM Retriever<a id="aligning"></a>
 29 | 
 30 | | Date       | Title | Authors   | Orgnization | Abs    | Dataset                                                                                           |
 31 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
 32 | |2025/10/17 <br> (🌟🌟)|[Large Reasoning Embedding Models: Towards Next-Generation Dense Retrieval Paradigm](https://arxiv.org/pdf/2510.14321) | Jianting Tang, Dongshuai Li, Tao Wen, et al.|Alibaba|<details><summary><small>**LREM**: finetune, Qwen2.5-3B-Instruct.</small></summary><small>This work includes ** LREM (Large Reasoning Embedding Model)**, which integrates reasoning processes into representation learning.</small></details>|<sub>online experiments</sub>|
 33 | |2025/10/09 <br> (🌟🌟🌟) |[ReasonEmbed: Enhanced Text Embeddings for Reasoning-Intensive Document Retrieval](https://arxiv.org/abs/2510.08252) [[code](https://github.com/FlagOpen/FlagEmbedding/tree/master/research/BGE_Reasoner): ![](https://img.shields.io/github/stars/FlagOpen/FlagEmbedding.svg?style=social)]|Jianlyu Chen, Junwei Lan, Chaofan Li, et al.|University of Science and Technology of China, Beijing Academy of Artificial Intelligence|<details><summary><small>**ReasonEmbed**: ReMixer, Redepter, Qwen3.</small></summary><small> This work proposes **ReasonEmbed**, a novel text embedding model developed for reasoning-intensive document retrieval. It consists of three contributions: **ReMixer**, **Redapter**, and **ReasonEmbed**.</small></details>|<sub>BRIGHT</sub>|
 34 | |2025/08/25 <br> (🌟🌟🌟) |[DIVER: A Multi-Stage Approach for Reasoning-intensive Information Retrieval](https://arxiv.org/pdf/2508.07995) [[code](https://github.com/AQ-MedAI/Diver): ![](https://img.shields.io/github/stars/AQ-MedAI/Diver.svg?style=social)]|Meixiu Long, Duolin Sun, Dan Yang, et al.|Ant Group|<details><summary><small>**DIVER**: RL, Qwen2.5.</small></summary><small> This work proposes **DIVER**, a retrieval pipeline designed for reasoning-intensive information retrieval. It consists of four components: **DIVER-DChunk**, **DIVER-QExpand**, **DIVER-Retriever**, and **DIVER-Rerank**.</small></details>|<sub>BRIGHT</sub>|
 35 | |2025/05/23 <br> (🌟🌟)|[RaDeR: Reasoning-aware Dense Retrieval Models](https://arxiv.org/abs/2505.18405) <br>[[code](https://github.com/Debrup-61/RaDeR): ![](https://img.shields.io/github/stars/Debrup-61/RaDeR.svg?style=social)]| Debrup Das, Sam O' Nuallain, Razieh Rahimi|University of Massachusetts Amherst|<details><summary><small>**RaDeR**: pointwise, finetune, Qwen2.5.</small></summary><small>This work includes **RaDeR**, a set of reasoning-based dense retrieval models trained with data derived from mathematical problem solving using large language models (LLMs)..</small></details>|<sub>BRIGHT</sub>|
 36 | |2025/04/29 <br> (🌟🌟)|[ReasonIR: Training Retrievers for Reasoning Tasks](https://arxiv.org/abs/2504.20595) <br>[[code](https://github.com/facebookresearch/ReasonIR): ![](https://img.shields.io/github/stars/facebookresearch/ReasonIR.svg?style=social)]| Rulin Shao, Rui Qiao, Varsha Kishore, et al.|FAIR at Meta|<details><summary><small>**ReasonIR**: pointwise, finetune, LLaMA3.1-8B.</small></summary><small>This work includes ** ReasonIR**, which trained on ReasonIR-Synthesizer data (1,383,877 public samples, 244,970 varied-length samples, 100,521 hard samples).</small></details>|<sub>BRIGHT, MMLU, GPQA</sub>|
 37 | 
 38 | 
 39 | 
 40 | <!--
 41 | |2023/12/16| [UPRISE: Universal Prompt Retrieval for Improving Zero-Shot Evaluation](https://arxiv.org/abs/2303.08518.pdf)<br>[[code](https://github.com/microsoft/LMOps): ![](https://img.shields.io/github/stars/microsoft/LMOps.svg?style=social) |Daixuan Cheng, Shaohan Huang, Junyu Bi, et. al.|Microsoft|<details><summary><small>This paper presents UPRISE ...</small></summary><small>This work propose  UPRISE (Universal Prompt Retrieval for Improving zero-Shot Evaluation), which tunes a lightweight and versatile retriever that automatically retrieves prompts for a given zero-shot task input.</small></details>|
 42 | -->
 43 | 
 44 | ### 1.3 LLM-guided Retriever<a id="llm-guided-retriever"></a>
 45 | 
 46 | | Date       | Title | Authors   | Orgnization | Abs    | Dataset                                                                                           |
 47 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
 48 | |2024/03/27  <br> (🌟🌟)| [LLatrieval: LLM-Verified Retrieval for Verifiable Generation](https://arxiv.org/pdf/2311.07838.pdf)<br>[[code](https://github.com/BeastyZ/LLM-Verified-Retrieval): ![](https://img.shields.io/github/stars/BeastyZ/LLM-Verified-Retrieval.svg?style=social))]| Xiaonan Li, Changtai Zhu, Linyang Li, et. al.| Fudan University | <details><summary><small>This paper presents LLatrieval ...</small></summary><small>This work proposes **LLatrieval** (LLM-verified Retrieval), where the LLM iteratively provides feedback to the retrieval through verify-update iterations. 1) **Retrieval Verification** is implemented by prompting LLM to give binary label, and 2) **Retrieval Update** takes LLM to progressively scan the document candidates returned by the retriever and selects the supporting documents.</small></details>|<sub>ALCE</sub>|
 49 | |2024/3/15  <br> (🌟🌟🌟)| [DRAGIN: Dynamic Retrieval Augmented Generation based on the Real-time Information Needs of Large Language Models](https://arxiv.org/abs/2403.10081) <br>[[code](https://github.com/oneal2000/DRAGIN): ![](https://img.shields.io/github/stars/oneal2000/DRAGIN.svg?style=social)] | Weihang Su, Yichen Tang, Qingyao Ai, Zhijing Wu, Yiqun Liu|  Tsinghua University, Beijing Institute of Technology| <details><summary><small>**DRAGIN** ...</small></summary><small>This work introduce a new framework, **DRAGIN**, i.e., Dynamic Retrieval Augmented Generation based on the real-time Information Needs of LLMs. This framework is specifically designed to make decisions on when and what to retrieve based on the LLM’s real-time information needs during the text generation process.</small></details>|<sub>2WikiMHQA, HotpotQA, IIRC, StrategyQA</sub>|
 50 | |2023/5/26  <br> (🌟🌟)| [Augmentation-Adapted Retriever Improves Generalization of Language Models as Generic Plug-In](https://arxiv.org/abs/2305.17331.pdf) <br> [[code](https://github.com/OpenMatch/Augmentation-Adapted-Retriever): ![](https://img.shields.io/github/stars/OpenMatch/Augmentation-Adapted-Retriever.svg?style=social)] |Zichun Yu, Chenyan Xiong, Shi Yu, Zhiyuan Liu|Tsinghua University, Microsoft|<details><summary><small>This paper presents AAR ...</small></summary><small>This work introduce augmentation-adapted retriever (AAR), which takes a black-box LLM to score positive documents (so called LLM-preferred signals) for fine-tuning a pre-trained retriever.</small></details>|<sub>MMLU, PopQA</sub>|
 51 | 
 52 | 
 53 | ### 1.4 Structured Retriever <a id="structured-retriever"></a>
 54 | 
 55 | | Date  | Title | Authors   | Orgnization | Abs    | Dataset   |
 56 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
 57 | |2025/05/20 <br> (🌟🌟🌟)|[MacRAG: Compress, Slice, and Scale-up for Multi-scale Adaptive Context RAG](https://arxiv.org/pdf/2505.06569) [[code](https://github.com/Leezekun/MacRAG): ![](https://img.shields.io/github/stars/Leezekun/MacRAG.svg?style=social)]|Woosang Lim, Zekun Li, Gyuwan Kim, et al.|POSCO HOLDINGS|<details><summary><small>**MacRAG**: Multi-scale Adaptive Context RAG.</small></summary><small> The *MacRAG** is a hierarchical RAG framework that compresses and partitions documents into coarse-to-fine granularities, then adaptively merges relevant contexts through real-time chunk- and document-level expansions.</small></details>|<sub> HotpotQA, 2WikiMultihopQA, Musiqu</sub>|
 58 | |2025/04/04 <br> (🌟)|[EnrichIndex: Using LLMs to Enrich Retrieval Indices Offline](https://arxiv.org/pdf/2504.03598) [[code](https://github.com/peterbaile/enrichindex): ![](https://img.shields.io/github/stars/peterbaile/enrichindex.svg?style=social)]|Peter Baile Chen, Tomer Wolfson, Michael Cafarella, Dan Roth|MIT, University of Pennsylvania|<details><summary><small>**EnrichIndex**: zeroshot, GPT-4o-mini.</small></summary><small> The *EnrichIndex** uses off-the-shelf GPT-4o-mini to enrich each object with three additional representations during indexing phase: 1) its purpose, 2) a summary, and 3) question-answer pairs. The  final score of each query-object is a weighted sum of the similarity scores between q and each representation.</small></details>|<sub>BRIGHT, Spider2, Beaver, Fiben</sub>|
 59 | |2024/01/31 <br> (🌟🌟🌟🌟)|[RAPTOR: Recursive Abstractive Processing for Tree-Organized Retrieval](https://arxiv.org/abs/2401.18059) [[code](https://github.com/parthsarthi03/raptor): ![](https://img.shields.io/github/stars/parthsarthi03/raptor.svg?style=social)]|Parth Sarthi, Salman Abdullah, Aditi Tuli, Shubh Khanna, Anna Goldie, Christopher D. Manning|Stanford|<details><summary><small>**RAPTOR** ...</small></summary><small>We introduce the novel approach of recursively embedding, clustering, and summarizing chunks of text, constructing a tree with differing levels of summarization from the bottom up. At inference time, our RAPTOR model retrieves from this tree, integrating information across lengthy documents at different levels of abstraction.  On question-answering tasks that involve complex, multi-step reasoning, we show state-of-the-art results; for example, by coupling RAPTOR retrieval with the use of GPT-4, we can improve the best performance on the QuALITY benchmark by 20% in absolute accuracy.</small></details>|<sub>NarrativeQA, QASPER, QuALITY</sub>|
 60 | 
 61 | 
 62 | ## 2. ReRanking Methods <a id="rerank"></a>
 63 | ### 2.1 LLM for Ranking <a id="llmrank"></a>
 64 | These methods try to leverage LLMs to directly rerank documents, usually in listwise setting. The core of these methods lies on how to **divide** the list into small local groups, and then how to **aggregate** local results into global ranking. 
 65 | 
 66 | | Date       | Title | Authors   | Orgnization | Abs    | Dataset                                                                                           |
 67 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
 68 | |2025/02/06 <br> (🌟🌟🌟)|[TourRank: Utilizing Large Language Models for Documents Ranking with a Tournament-Inspired Strategy](https://arxiv.org/pdf/2406.11678) [[code](https://github.com/chenyiqun/TourRank): ![](https://img.shields.io/github/stars/chenyiqun/TourRank.svg?style=social)]|Yiqun Chen, Qi Liu, Yi Zhang, et al. | Renmin University & Baidu|<details><summary><small>**TourRank**: listwise, zeroshot, GPT-3.5.</small></summary><small> This paper propose a zero-shot document ranking method called **TourRank**. It first groups candidate documents and prompt LLM to select the most relevant one in each group. It also designes a points systems to assign different points to each document based on its ranking in each round tournament. </small></details>|<sub>BEIR, TREC-DL</sub>|
 69 | |2024/06/21 <br> (🌟🌟🌟🌟)|[FIRST: Faster Improved Listwise Reranking with Single Token Decoding](https://arxiv.org/pdf/2406.15657) [[code](https://github.com/gangiswag/llm-reranker): ![](https://img.shields.io/github/stars/gangiswag/llm-reranker.svg?style=social)]|Revanth Gangi Reddy, JaeHyeok Doo, Yifei Xu, et al.| UIUC|<details><summary><small>**FIRST**: listwise, finetune, Zephyr-β.</small></summary><small> This work introduces **FIRST**, which simply extracts the output logits of candidate identifier tokens while generating the first identifier y1 and returns the passage ranking in the order of decreasing logit values. It uses 40k GPT-4 labeled instances (5k queries from MS MARCO) from *Rankzephyr* for finetunning LLM reranks.</small></details>|<sub>BEIR</sub>|
 70 | |2024/05/30 <br> (🌟🌟🌟)|[A Setwise Approach for Effective and Highly Efficient Zero-shot Ranking with Large Language Models](https://arxiv.org/pdf/2310.09497) [[code](https://github.com/ielab/llm-rankers): ![](https://img.shields.io/github/stars/ielab/llm-rankers.svg?style=social)]|Shengyao Zhuang, Honglei Zhuang, Bevan Koopman, Guido Zuccon | CSIRO, Australia|<details><summary><small>**Setwise Prompting**: setwise, zeroshot, Flan-t5.</small></summary><small> This work focus on LLM-based zero-shot document ranking, and introduce an **Setwise** prompting strategy. It instructs LLMs to select the most relevant document to the query from a set of candidate documents.</small></details>|<sub>BEIR, TREC-DL</sub>|
 71 | |2024/05/23 <br> (🌟🌟🌟)|[Top-Down Partitioning for Efficient List-Wise Ranking](https://arxiv.org/pdf/2405.14589v1) [[code](https://github.com/Parry-Parry/TDPart): ![](https://img.shields.io/github/stars/Parry-Parry/TDPart.svg?style=social)]|Andrew Parry, Sean MacAvaney, Debasis Ganguly|University of Glasgow|<details><summary><small>**TDPart**: listwise, zeroshot, GPT-3.5.</small></summary><small> This work partitions a ranking to depth k and processes documents to-down. At each round, it selects a pivot document, and compared it with documents from each group. Those winner documents of each group are collected as the input of next round.</small></details>|<sub>MSMARCO, TREC-DL, BEIR</sub>|
 72 | |2024/03/28 <br> (🌟🌟)| [Large Language Models are Effective Text Rankers with Pairwise Ranking Prompting](https://arxiv.org/pdf/2306.17563.pdf) | Zhen Qin, Rolf Jagerman, Kai Hui, et al.|Google Research|<details><summary><small>**PRP**: pairwise, zeroshot, FLAN-T5(3B,11B,20B).</small></summary><small>This work introduces **Pairwise Ranking Prompting (PRP)** for ranking with LLMs. Three variant of PRP: 1) all pair comparisons $O(N^2)$, 2) Sorting-based, i.e., Heapsort, $O(N\times \log(N))$, 3) Sliding window, i.e., Bubble Sort for Top-K, $O(N)$.</small></details>| <sub>TREC-DL2019&2020, BEIR</sub>|
 73 | |2024/02/20 <br> (🌟🌟)| [Bridging the Preference Gap between Retrievers and LLMs](https://arxiv.org/pdf/2401.06954) | Zixuan Ke, Weize Kong, Cheng Li, et al.|Google Research|<details><summary><small>**BGM**: listwise, finetune, T5-XXL(11B).</small></summary><small>This work trains a seq2seq bridge model (directly generates passage IDs) to jointly accomplish reranking and selection, adapting the retrieved information to be LLM-friendly. It employs a SL and RL training scheme to optimize the adapation process.</small></details>| <sub>NQ, HotpotQA, Avocado Email, Amazon Book</sub>|
 74 | |2023/12/05 <br> (🌟🌟)|[RankZephyr: Effective and Robust Zero-Shot Listwise Reranking is a Breeze!](https://arxiv.org/abs/2312.02724) [[code](https://github.com/castorini/rank_llm): ![](https://img.shields.io/github/stars/castorini/rank_llm.svg?style=social)]|Ronak Pradeep, Sahel Sharifymoghaddam, Jimmy Lin| University of Waterloo|<details><summary><small>**RankZephyr**: listwise, finetune, Zephyr-β.</small></summary><small> The **RankZephyr** is trained with two stages: the first stage is trained on 100k queries from msmarco v1, where 20 candidate documents of each query are ranked by RankGPT3.5. The second stage is trained with less than 5k queries labeled by RankGPT4. When ranking the top-100 candidates, it employed a sliding window akin to RankGPT and RankVicuna.</small></details>|<sub>BEIR, TREC-DL 19&20, 21, 22</sub>|
 75 | |2023/10/21  <br> (🌟🌟)| [Beyond Yes and No: Improving Zero-Shot LLM Rankers via Scoring Fine-Grained Relevance Labels](https://arxiv.org/pdf/2310.14122.pdf) | Honglei Zhuang, Zhen Qin, Kai Hui, Junru Wu, et al. |Google Research |<details><summary><small>**RG-S**: pointwise, zeroshot, FLAN PaLM2S.</small></summary><small>This work proposes to incorporate **fine-grained relevance labels (not-relevant/somewhat-relevant/highly-relevant)** into the prompt for point-wise LLM rankers. The method is called **RG-S**, which is rating scale 0-k Relevance Generation (RG-S(0,k)). It directly prompts LLM to rate the relevance for each q-d pair using a scale from 0 to k.</small></details>| <sub>BEIR</sub>|
 76 | |2023/10/20 <br> (🌟🌟)| [Open-source Large Language Models are Strong Zero-shot Query Likelihood Models for Document Ranking](https://arxiv.org/pdf/2310.13243.pdf) [[code](https://github.com/ielab/llm-qlm): ![](https://img.shields.io/github/stars/ielab/llm-qlm.svg?style=social)]| Shengyao Zhuang, Bing Liu, Bevan Koopman, Guido Zuccon |CSIRO |<details><summary><small>**LLM-QLM**: pointwise,zeroshot,LLaMA(7B).</small></summary><small>This work finds that open-source LLMs can be effective point-wise rankers by asking them to *generate the query given the content of a document*.</small></details>| <sub>BEIR</sub>|
 77 | |2023/10/12 <br> (🌟🌟🌟)|[Fine-Tuning LLaMA for Multi-Stage Text Retrieval](https://arxiv.org/abs/2310.08319) ]|Xueguang Ma, Liang Wang, Nan Yang, et al.| University of Waterloo|<details><summary><small>**RepLLaMA and RankLLaMA**: pointwise, finetune, LLaMA.</small></summary><small> This paper introduces **RepLLaMA** and **RankLLaMA**, which finetunes LLaMA model as dense retriever and pointwise reranker using MS MARCO datasets.</small></details>|<sub>MS MARCO passage/document, BEIR</sub>|
 78 | |2023/09/26 <br> (🌟🌟🌟)|[RankVicuna: Zero-Shot Listwise Document Reranking with Open-Source Large Language Models](https://arxiv.org/abs/2309.15088) [[code](https://github.com/castorini/rank_llm): ![](https://img.shields.io/github/stars/castorini/rank_llm.svg?style=social)]|Ronak Pradeep, Sahel Sharifymoghaddam, Jimmy Lin| University of Waterloo|<details><summary><small>**RankVicuna**: listwise, finetune, Vicuna.</small></summary><small> The **RankVicuna** is trained on the ranked list generated by RankGPT3.5 for 100k queries from msmarco v1. Each query has 20 candidates provided by BM25. They filtered noises: 1) malformed generations, where 12% outputs incorrect list formatting. 2) shuffle the order of candidate documents as data augmentation.</small></details>|<sub> TREC-DL 19&20</sub>|
 79 | |2023/04/19 <br> (🌟🌟🌟🌟)|[Is ChatGPT Good at Search? Investigating Large Language Models as Re-Ranking Agents](https://arxiv.org/abs/2304.09542) [[code](https://github.com/sunnweiwei/RankGPT): ![](https://img.shields.io/github/stars/sunnweiwei/RankGPT.svg?style=social)]|Weiwei Sun, Lingyong Yan, Xinyu Ma, et al., | Shandong University|<details><summary><small>**RankGPT**: listwise, zeroshot, GPT-3.5.</small></summary><small> This work first investigates prompting ChatGPT on passage re-ranking tasks, and find LLMs show limited performance. It proposes an instructional permutation generation method, and use sliding window to address context length limitation of LLMs.</small></details>|<sub>BEIR, TREC-DL</sub>|
 80 | 
 81 | <!--- 
 82 | |2023/5/1| [Say Goodbye to Irrelevant Search Results: Cohere Rerank Is Here](https://txt.cohere.com/rerank)| NILS REIMERS, SYLVIE SHI, LUCAS FAYOUX, ELLIOTT CHOI| Cohere| <details><summary><small>This work presents Cohere Rerank ...</small></summary><small>This work proposes Cohere Rerank,which can provide a powerful semantic boost to the search quality of any keyword or vector search system without requiring any overhaul or replacement.</small></details>| <sub>MIRACL, TREC-DL 19&20, NQ</sub>|
 83 | --->
 84 | 
 85 | ### 2.2 Reasoning for Ranking (fine-tuned LLM) <a id="reasonrank"></a>
 86 | 
 87 | These methods try to improve the reasoning ability of LLMs in ranking document. Usually, they first **construct** a large-scale dataset which contains the "reasoing chain" in each sample. Then they **finetune** LLM on the dataset to obtain reason-enhanced reranking model.
 88 | 
 89 | | Date       | Title | Authors   | Orgnization | Abs    | Dataset                                                                                           |
 90 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
 91 | |2025/08/25 <br> (🌟🌟🌟) |[DIVER: A Multi-Stage Approach for Reasoning-intensive Information Retrieval](https://arxiv.org/pdf/2508.07995?) [[code](https://github.com/AQ-MedAI/Diver): ![](https://img.shields.io/github/stars/AQ-MedAI/Diver.svg?style=social)]|Meixiu Long, Duolin Sun, Dan Yang, et al.|Ant Group|<details><summary><small>**DIVER**: RL, Qwen2.5.</small></summary><small> This work proposes **DIVER**, a retrieval pipeline designed for reasoning-intensive information retrieval. It consists of four components: **DIVER-DChunk**, **DIVER-QExpand**, **DIVER-Retriever**, and **DIVER-Rerank**.</small></details>|<sub>BRIGHT</sub>|
 92 | |2025/08/22 <br> (🌟🌟🌟) |[ReasonRank: Empowering Passage Ranking with Strong Reasoning Ability](https://arxiv.org/pdf/2508.07050) [[code](https://github.com/8421BCD/ReasonRank): ![](https://img.shields.io/github/stars/8421BCD/ReasonRank.svg?style=social)]|Wenhan Liu, Xinyu Ma, Weiwei Sun, et al.|Renming University|<details><summary><small>**ReasonRank**: RL, Qwen2.5.</small></summary><small> This work proposes **ReasonRank**, a reasoning-intensive passage reranker. It consists of a ranking data synthesis framework, and a two-state training framework (i.e., a cold-start SFT and a multi-view ranking based RL).</small></details>|<sub>BRIGHT, R2MED</sub>|
 93 | |2025/05/20 <br> (🌟🌟) |[Rank-K: Test-Time Reasoning for Listwise Reranking](https://arxiv.org/pdf/2505.14432) [[code](https://github.com/hltcoe/rank-k): ![](https://img.shields.io/github/stars/hltcoe/rank-k.svg?style=social)]|Eugene Yang, Andrew Yates, Kathryn Ricci, et al.|Johns Hopkins University|<details><summary><small>**Rank-k**: RL, Qwen2.5.</small></summary><small> This work proposes **Rank-k**, a listwise passage reranking model that leverages the reasoning capability of the reasoning language model at query time that provides test time scalability to serve hard queries.</small></details>|<sub>DL19, DL20, NeuCLIR22, NeuCLIR24, BRIGHT</sub>|
 94 | |2025/05/07 <br> (🌟🌟) |[ZEROSEARCH: Incentivize the Search Capability of LLMs without Searching](https://arxiv.org/pdf/2505.04588) [[code](https://github.com/Alibaba-NLP/ZeroSearch): ![](https://img.shields.io/github/stars/Alibaba-NLP/ZeroSearch.svg?style=social)]|Hao Sun, Zile Qiao, Jiayan Guo, et al.|Tongyi Lab, Alibaba Group|<details><summary><small>**ZeroSearch**: RL, Qwen2.5.</small></summary><small> This work proposes **ZeroSearch**, a RL framework that incentivizes the search capability of LLMs. It transforms LLM into a retrieval module by supervised fine-tuning.It introduces a curriculum rollout mechanism to progressively elicit model's reasoning ability by exposing it to increasingly challenging retrieval scenarios. The rollout trajectory contains [<think><search><information><answer>].</small></details>|<sub>NQ, TrivaiQA, PopQA, HotpotQA, 2WikiMultiHopQA, Musique, Bamboogle</sub>|
 95 | |2025/03/27 <br> (🌟🌟) |[ReSearch: Learning to Reason with Search for LLMs via Reinforcement Learning](https://arxiv.org/pdf/2503.19470) [[code](https://github.com/Agent-RL/ReCall): ![](https://img.shields.io/github/stars/Agent-RL/ReCall.svg?style=social)]|Mingyang Chen, Tianpeng Li, Haoze Sun, et al.| Baichuan Inc.|<details><summary><small>**ReSearch**: RL, Qwen2.5.</small></summary><small> The **ReSearch** trains LLMs to reason with search via RL without any supervised data. The rollout trajectory contains [<think><search><result><answer>].</small></details>|<sub>HotpotQA, 2WikiMultiHopQA, Musique, Bamboogle</sub>|
 96 | |2025/03/18 <br> (🌟🌟) |[R1-Searcher: Incentivizing the Search Capability in LLMs via Reinforcement Learning](https://arxiv.org/pdf/2503.05592) [[code](https://github.com/RUCAIBox/R1-Searcher): ![](https://img.shields.io/github/stars/RUCAIBox/R1-Searcher.svg?style=social)]|Huatong Song, Jinhao Jiang, Yingqian Min, et al.|Renmin University|<details><summary><small>**R1-Search**: RL, Qwen2.5.</small></summary><small> The **R1-Search** utilizes a two-stage, outcome-based training strategy. The first stage uses retrieve-reward to incentivize the model to conduct retrieval operations. The second stage introduce answer-reward to encourage model to learn to use external retrieval system to solve questions. The rollout trajectory contains [<think><query><document><answer>].</small></details>|<sub>HotpotQA, 2WikiMultiHopQA, Musique, Bamboogle</sub>|
 97 | |2025/03/12 <br> (🌟🌟) |[Search-R1: Training LLMs to Reason and Leverage Search Engines with Reinforcement Learning](https://arxiv.org/abs/2503.09516) [[code](https://github.com/PeterGriffinJin/Search-R1): ![](https://img.shields.io/github/stars/PeterGriffinJin/Search-R1.svg?style=social)]|Bowen Jin, Hansi Zeng, Zhenrui Yue, et al.|UIUC, Google|<details><summary><small>**Search-R1**: RL, Qwen2.5.</small></summary><small> The **Search-R1** optimizes LLM to [search/reason/answer] within multi-turn search interactions, leveraging retrival token masking for stable RL training and a smiple outcome-based reward function.The rollout trajectory contains [<think><search><information><answer>].</small></details>|<sub>NQ, TrivaiQA, PopQA, HotpotQA, 2WikiMultiHopQA, Musique, Bamboogle</sub>|
 98 | |2025/02/25 <br> (🌟🌟)|[Rank1: Test-Time Compute for Reranking in Information Retrieval](https://arxiv.org/pdf/2502.18418) [[code](https://github.com/orionw/rank1): ![](https://img.shields.io/github/stars/orionw/rank1.svg?style=social)]|Orion Weller, Kathryn Ricci Eugene Yang, Andrew Yates, et al.|Johns Hopkins University|<details><summary><small>**Rank1**: pointwise, finetune, Qwen2.5.</small></summary><small> This work sample 635,000  examples of R1's reasoning on the MS MARCO dataset. It then finetunes the qwen 2.5 model on these reasoning chains and find they show remarkable reasoning capabilities.</small></details>|<sub>BRIGHT</sub>|
 99 | |2025/01/09 <br> (🌟🌟🌟) |[Search-o1: Agentic Search-Enhanced Large Reasoning Models](https://arxiv.org/pdf/2501.05366) [[code](https://github.com/sunnynexus/Search-o1): ![](https://img.shields.io/github/stars/sunnynexus/Search-o1.svg?style=social)]|Xiaoxi Li, Guanting Dong, Jiajie Jin, et al.|Renmin University|<details><summary><small>**Search-o1**: Zeroshot, QwQ-32B.</small></summary><small> The **Search-o1** combines the reasoning process with an agentic RAG mechanism and a knowledge refinement module. The reason-in-document module operates independently from the main reasoning chain, which conducts a thorough analysis of retrieved documents and produces refined information.</small></details>|<sub>GPQA, MATH500, AMC2023, AIME2024, LiveCodeBench, NQ, TriviaQA, HotpotQA, 2WikiMultihopQA, MuSiQue, Bamboogle</sub>|
100 | |2024/10/31 <br> (🌟)|[JudgeRank: Leveraging Large Language Models for Reasoning-Intensive Reranking](https://arxiv.org/pdf/2411.00142)|Tong Niu, Shafiq Joty, Ye Liu, et al.| Salesforce AI Research|<details><summary><small>**JudgeRank**: pointwise, zeroshot, Llama-3.1.</small></summary><small> The **JudgeRank** estimate the relevance of query-document pairs following three steps: 1) query analysis to identify the core problem (query expansion), 2) document analysis to **extract** a query-aware summary, 3) relevance judgement to provide score based on the probability of "yes" and "no".</small></details>|<sub>BRIGHT, BEIR</sub>|
101 | 
102 | ## 3. Analysis about Retrieval<a id="analysis"></a>
103 | 
104 | | Date       | Title | Authors   | Orgnization | Abs    | Dataset                                                                                           |
105 | |------------|-----------------------------------------------------------------------------------------------------------------|------------------------------------------|---------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------|
106 | |2024/7/14 <br> (🌟🌟🌟🌟)| [The Power of Noise: Redefining Retrieval for RAG Systems](https://arxiv.org/abs/2401.14887) <br>[[code](https://github.com/florin-git/The-Power-of-Noise): ![](https://img.shields.io/github/stars/florin-git/The-Power-of-Noise.svg?style=social)] | F. Cuconasu, G. Trappolini, F. Siciliano, etl. al.|  Sapienza University of Rome| <details><summary><small>This paper studies noise passages ...</small></summary><small>This paper studied the impact of noise passages in RAG, and found that adding random documents in the prompt improves the LLM accuracy by up to 35% on the NQ dataset.</small></details>|<sub>NQ-open (subset of NQ)</sub>|
107 | 
108 | 


--------------------------------------------------------------------------------