├── .github
│   └── workflows
│       ├── python-package.yml
│       └── python-publish.yml
├── .gitignore
├── LICENSE.txt
├── README.md
├── rag_evaluator
│   ├── __init__.py
│   ├── evaluator.py
│   └── test_evaluator.py
├── requirements.txt
├── setup.py
└── streamlit app
    ├── app.py
    ├── evaluation_module.py
    └── requirements.txt

/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
3 | 
4 | name: Python package
5 | 
6 | on:
7 |   push:
8 |     branches: [ "main" ]
9 |   pull_request:
10 |     branches: [ "main" ]
11 | 
12 | jobs:
13 |   build:
14 | 
15 |     runs-on: ubuntu-latest
16 |     strategy:
17 |       fail-fast: false
18 |       matrix:
19 |         python-version: ["3.9", "3.10", "3.11"]
20 | 
21 |     steps:
22 |     - uses: actions/checkout@v4
23 |     - name: Set up Python ${{ matrix.python-version }}
24 |       uses: actions/setup-python@v3
25 |       with:
26 |         python-version: ${{ matrix.python-version }}
27 |     - name: Install dependencies
28 |       run: |
29 |         python -m pip install --upgrade pip
30 |         python -m pip install flake8 pytest
31 |         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
32 |     - name: Lint with flake8
33 |       run: |
34 |         # stop the build if there are Python syntax errors or undefined names
35 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
36 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
37 |         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
38 |     - name: Test with pytest
39 |       run: |
40 |         pytest
41 | 
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3 | 
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v4 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 
109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 AI Anytime 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RAG Evaluator 2 | 3 | ## Overview 4 | 5 | RAG Evaluator is a Python library for evaluating Retrieval-Augmented Generation (RAG) systems. It provides various metrics to evaluate the quality of generated text against reference text. 6 | 7 | ## Installation 8 | 9 | You can install the library using pip: 10 | 11 | ```bash 12 | pip install rag-evaluator 13 | ``` 14 | 15 | ## Usage 16 | 17 | Here's how to use the RAG Evaluator library: 18 | 19 | ```python 20 | from rag_evaluator import RAGEvaluator 21 | 22 | # Initialize the evaluator 23 | evaluator = RAGEvaluator() 24 | 25 | # Input data 26 | question = "What are the causes of climate change?" 27 | response = "Climate change is caused by human activities." 
28 | reference = "Human activities such as burning fossil fuels cause climate change."
29 | 
30 | # Evaluate the response
31 | metrics = evaluator.evaluate_all(question, response, reference)
32 | 
33 | # Print the results
34 | print(metrics)
35 | ```
36 | 
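Each metric can also be computed on its own through the evaluator's per-metric helpers defined in `rag_evaluator/evaluator.py` (for example `evaluate_bleu_rouge` and `evaluate_readability`). A minimal sketch using the same inputs as above; note that the overlap-based helpers take parallel lists of candidate and reference strings:

```python
from rag_evaluator import RAGEvaluator

evaluator = RAGEvaluator()

candidates = ["Climate change is caused by human activities."]
references = ["Human activities such as burning fossil fuels cause climate change."]

# Overlap metrics operate on parallel lists of strings.
bleu, rouge1 = evaluator.evaluate_bleu_rouge(candidates, references)

# Readability metrics operate on a single string.
flesch_ease, flesch_grade = evaluator.evaluate_readability(candidates[0])

print(f"BLEU: {bleu:.2f}, ROUGE-1: {rouge1:.2f}")
print(f"Flesch Reading Ease: {flesch_ease:.1f}, Flesch-Kincaid Grade: {flesch_grade:.1f}")
```
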
37 | ## Streamlit Web App
38 | 
39 | To run the web app:
40 | 
41 | 1. cd into the `streamlit app` folder.
42 | 2. Create a virtual environment.
43 | 3. Activate the virtual environment.
44 | 4. Install the dependencies: `pip install -r requirements.txt`
45 | 5. Run the app:
46 | ```
47 | streamlit run app.py
48 | ```
49 | 
50 | ## Metrics
51 | 
52 | The RAG Evaluator provides the following metrics (a short interpretation sketch follows the list):
53 | 
54 | 1. **BLEU** (0-100): Measures the overlap between the generated output and reference text based on n-grams.
55 |    - 0-20: Low similarity, 20-40: Medium-low, 40-60: Medium, 60-80: High, 80-100: Very high
56 | 
57 | 2. **ROUGE-1** (0-1): Measures the overlap of unigrams between the generated output and reference text.
58 |    - 0.0-0.2: Poor overlap, 0.2-0.4: Fair, 0.4-0.6: Good, 0.6-0.8: Very good, 0.8-1.0: Excellent
59 | 
60 | 3. **BERT Score** (0-1): Evaluates the semantic similarity using BERT embeddings (Precision, Recall, F1).
61 |    - 0.0-0.5: Low similarity, 0.5-0.7: Moderate, 0.7-0.8: Good, 0.8-0.9: High, 0.9-1.0: Very high
62 | 
63 | 4. **Perplexity** (1 to ∞, lower is better): Measures how well a language model predicts the text.
64 |    - 1-10: Excellent, 10-50: Good, 50-100: Moderate, 100+: High (potentially nonsensical)
65 | 
66 | 5. **Diversity** (0-1): Measures the uniqueness of bigrams in the generated output.
67 |    - 0.0-0.2: Very low, 0.2-0.4: Low, 0.4-0.6: Moderate, 0.6-0.8: High, 0.8-1.0: Very high
68 | 
69 | 6. **Racial Bias** (0-1): Detects the presence of biased language in the generated output.
70 |    - 0.0-0.2: Low probability, 0.2-0.4: Moderate, 0.4-0.6: High, 0.6-0.8: Very high, 0.8-1.0: Extreme
71 | 
72 | 7. **MAUVE** (0-1): Measures both semantic similarity and stylistic alignment, capturing contextual meaning, coherence, and fluency.
73 |    - 0.0-0.2: Poor, 0.2-0.4: Fair, 0.4-0.6: Good, 0.6-0.8: Very good, 0.8-1.0: Excellent
74 | 
75 | 8. **METEOR** (0-1): Calculates semantic similarity considering synonyms and paraphrases.
76 |    - 0.0-0.2: Poor, 0.2-0.4: Fair, 0.4-0.6: Good, 0.6-0.8: Very good, 0.8-1.0: Excellent
77 | 
78 | 9. **CHRF** (0-1): Computes the character n-gram F-score for fine-grained text similarity.
79 |    - 0.0-0.2: Low, 0.2-0.4: Moderate, 0.4-0.6: Good, 0.6-0.8: High, 0.8-1.0: Very high
80 | 
81 | 10. **Flesch Reading Ease** (0-100): Assesses text readability.
82 |     - 0-30: Very difficult, 30-50: Difficult, 50-60: Fairly difficult, 60-70: Standard, 70-80: Fairly easy, 80-90: Easy, 90-100: Very easy
83 | 
84 | 11. **Flesch-Kincaid Grade** (0-18+): Indicates the U.S. school grade level needed to understand the text.
85 |     - 1-6: Elementary, 7-8: Middle school, 9-12: High school, 13+: College level
86 | 
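To make these bands concrete, the sketch below buckets a few of the scores returned by `evaluate_all` into the qualitative labels listed above. The `band` helper is purely illustrative and not part of the library:

```python
from rag_evaluator import RAGEvaluator

def band(value, bounds, labels):
    """Return the label of the first band whose upper bound exceeds the value."""
    for upper, label in zip(bounds, labels):
        if value < upper:
            return label
    return labels[-1]

evaluator = RAGEvaluator()
metrics = evaluator.evaluate_all(
    "What are the causes of climate change?",
    "Climate change is caused by human activities.",
    "Human activities such as burning fossil fuels cause climate change.",
)

print("BLEU:", band(metrics["BLEU"], [20, 40, 60, 80], ["low", "medium-low", "medium", "high", "very high"]))
print("ROUGE-1:", band(metrics["ROUGE-1"], [0.2, 0.4, 0.6, 0.8], ["poor", "fair", "good", "very good", "excellent"]))
print("BERT F1:", band(metrics["BERT F1"], [0.5, 0.7, 0.8, 0.9], ["low", "moderate", "good", "high", "very high"]))
```
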
87 | ## Testing
88 | 
89 | To run the tests, use the following command:
90 | 
91 | ```
92 | python -m unittest discover -s rag_evaluator -p "test_*.py"
93 | ```
94 | 
95 | ## License
96 | 
97 | This project is licensed under the MIT License. See the [LICENSE](LICENSE.txt) file for details.
98 | 
99 | ## Contributing
100 | 
101 | Contributions are welcome! If you have any improvements, suggestions, or bug fixes, feel free to create a pull request (PR) or open an issue on GitHub. Please ensure your contributions adhere to the project's coding standards and include appropriate tests.
102 | 
103 | ### How to Contribute
104 | 
105 | 1. Fork the repository.
106 | 2. Create a new branch for your feature or bug fix.
107 | 3. Make your changes.
108 | 4. Run tests to ensure everything is working.
109 | 5. Commit your changes and push to your fork.
110 | 6. Create a pull request (PR) with a detailed description of your changes.
111 | 
112 | ## Contact
113 | 
114 | If you have any questions or need further assistance, feel free to reach out via [email](mailto:aianytime07@gmail.com).
115 | 
--------------------------------------------------------------------------------
/rag_evaluator/__init__.py:
--------------------------------------------------------------------------------
1 | from .evaluator import RAGEvaluator
2 | 
3 | __all__ = ["RAGEvaluator"]
4 | 
--------------------------------------------------------------------------------
/rag_evaluator/evaluator.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import mauve
3 | from sacrebleu import corpus_bleu
4 | from rouge_score import rouge_scorer
5 | from bert_score import score
6 | from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
7 | import nltk
8 | from nltk.util import ngrams
9 | from nltk.tokenize import word_tokenize
10 | from nltk.translate.meteor_score import meteor_score
11 | from nltk.translate.chrf_score import sentence_chrf
12 | from textstat import flesch_reading_ease, flesch_kincaid_grade
13 | 
14 | # Fetch the required NLTK data non-interactively ('punkt' for tokenization, 'wordnet' for METEOR)
15 | nltk.download('punkt', quiet=True)
16 | nltk.download('wordnet', quiet=True)
17 | 
18 | class RAGEvaluator:
19 |     def __init__(self):
20 |         self.gpt2_model, self.gpt2_tokenizer = self.load_gpt2_model()
21 |         self.bias_pipeline = pipeline("zero-shot-classification", model="Hate-speech-CNERG/dehatebert-mono-english")
22 | 
23 |     def load_gpt2_model(self):
24 |         model = GPT2LMHeadModel.from_pretrained('gpt2')
25 |         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
26 |         return model, tokenizer
27 | 
28 |     def evaluate_bleu_rouge(self, candidates, references):
29 |         bleu_score = corpus_bleu(candidates, [references]).score
30 |         scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
31 |         rouge_scores = [scorer.score(ref, cand) for ref, cand in zip(references, candidates)]
32 |         rouge1 = sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores)
33 |         return bleu_score, rouge1
34 | 
35 |     def evaluate_bert_score(self, candidates, references):
36 |         P, R, F1 = score(candidates, references, lang="en", model_type='bert-base-multilingual-cased')
37 |         return P.mean().item(), R.mean().item(), F1.mean().item()
38 | 
39 |     def evaluate_perplexity(self, text):
40 |         encodings = self.gpt2_tokenizer(text, return_tensors='pt')
41 |         max_length = self.gpt2_model.config.n_positions
42 |         stride = 512
43 |         lls = []
44 |         for i in range(0, encodings.input_ids.size(1), stride):
45 |             begin_loc = max(i + stride - max_length, 0)
46 |             end_loc = min(i + stride, encodings.input_ids.size(1))
47 |             trg_len = end_loc - i
48 |             input_ids = encodings.input_ids[:, begin_loc:end_loc]
49 |             target_ids = input_ids.clone()
50 |             target_ids[:, :-trg_len] = -100
51 |             with torch.no_grad():
52 |                 outputs = self.gpt2_model(input_ids, labels=target_ids)
53 |                 log_likelihood = outputs[0] * trg_len
54 |             lls.append(log_likelihood)
55 |         ppl = torch.exp(torch.stack(lls).sum() / end_loc)
56 |         return ppl.item()
57 | 
58 |     def evaluate_diversity(self, texts):
59 |         all_tokens = [tok for text in texts for tok in text.split()]
60 |         unique_bigrams = set(ngrams(all_tokens, 2))
61 |         diversity_score = len(unique_bigrams) / len(all_tokens) if all_tokens else 0
62 |         return diversity_score
63 | 
64 |     def evaluate_racial_bias(self, text):
65 |         results = self.bias_pipeline([text], candidate_labels=["hate speech", "not hate speech"])
66 |         bias_score = results[0]['scores'][results[0]['labels'].index('hate speech')]
67 |         return bias_score
68 | 
69 |     def evaluate_meteor(self, candidates, references):
70 |         nltk.download('punkt', quiet=True)
71 | 
72 |         meteor_scores = [
73 |             meteor_score([word_tokenize(ref)], word_tokenize(cand))
74 |             for ref, cand in zip(references, candidates)
75 |         ]
76 |         return sum(meteor_scores) / len(meteor_scores)
77 | 
78 |     def evaluate_chrf(self, candidates, references):
79 |         chrf_scores = [sentence_chrf(ref, cand) for ref, cand in zip(references, candidates)]
80 |         return sum(chrf_scores) / len(chrf_scores)
81 | 
82 |     def evaluate_readability(self, text):
83 |         flesch_ease = flesch_reading_ease(text)
84 |         flesch_grade = flesch_kincaid_grade(text)
85 |         return flesch_ease, flesch_grade
86 | 
87 |     def evaluate_mauve(self, reference_texts, generated_texts):
88 |         out = mauve.compute_mauve(
89 |             p_text=reference_texts,  # List of reference texts
90 |             q_text=generated_texts,  # List of generated texts
91 |             device_id=0,  # GPU device ID; set to -1 for CPU
92 |             max_text_length=1024,  # Maximum length of text to truncate
93 |             verbose=False  # Whether to print additional information
94 |         )
95 |         return out.mauve
96 | 
97 |     def evaluate_all(self, question, response, reference):
98 |         candidates = [response]
99 |         references = [reference]
100 |         bleu, rouge1 = self.evaluate_bleu_rouge(candidates, references)
101 |         bert_p, bert_r, bert_f1 = self.evaluate_bert_score(candidates, references)
102 |         perplexity = self.evaluate_perplexity(response)
103 |         diversity = self.evaluate_diversity(candidates)
104 |         racial_bias = self.evaluate_racial_bias(response)
105 |         mauve_score = self.evaluate_mauve(references, candidates)  # MAUVE expects lists of texts
106 |         meteor = self.evaluate_meteor(candidates, references)
107 |         chrf = self.evaluate_chrf(candidates, references)
108 |         flesch_ease, flesch_grade = self.evaluate_readability(response)
109 |         return {
110 |             "BLEU": bleu,
111 |             "ROUGE-1": rouge1,
112 |             "BERT P": bert_p,
113 |             "BERT R": bert_r,
114 |             "BERT F1": bert_f1,
115 |             "Perplexity": perplexity,
116 |             "Diversity": diversity,
117 |             "Racial Bias": racial_bias,
118 |             "MAUVE": mauve_score,
119 |             "METEOR": meteor,
120 |             "CHRF": chrf,
121 |             "Flesch Reading Ease": flesch_ease,
122 |             "Flesch-Kincaid Grade": flesch_grade,
123 |         }
124 | 
--------------------------------------------------------------------------------
/rag_evaluator/test_evaluator.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from rag_evaluator.evaluator import RAGEvaluator
3 | import nltk
4 | nltk.download('punkt', quiet=True)
5 | 
6 | class TestRAGEvaluator(unittest.TestCase):
7 |     def setUp(self):
8 |         self.evaluator = RAGEvaluator()
9 | 
10 |     def test_evaluate_all(self):
11 |         question = "What are the causes of climate change?"
12 |         response = "Climate change is caused by human activities."
13 |         reference = "Human activities such as burning fossil fuels cause climate change."
14 |         metrics = self.evaluator.evaluate_all(question, response, reference)
15 |         self.assertIsInstance(metrics, dict)
16 |         self.assertIn("BLEU", metrics)
17 |         self.assertIn("ROUGE-1", metrics)
18 |         self.assertIn("BERT P", metrics)
19 |         self.assertIn("Perplexity", metrics)
20 |         self.assertIn("Diversity", metrics)
21 |         self.assertIn("Racial Bias", metrics)
22 |         self.assertIn("MAUVE", metrics)
23 |         self.assertIn("METEOR", metrics)
24 |         self.assertIn("CHRF", metrics)
25 |         self.assertIn("Flesch Reading Ease", metrics)
26 |         self.assertIn("Flesch-Kincaid Grade", metrics)
27 | 
28 | if __name__ == "__main__":
29 |     unittest.main()
30 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==2.3.1
2 | sacrebleu
3 | rouge-score
4 | bert-score
5 | transformers
6 | nltk
7 | textblob
8 | textstat
9 | mauve-text
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | 
3 | setup(
4 |     name="rag-evaluator",
5 |     version="0.1.0",
6 |     description="A library for evaluating Retrieval-Augmented Generation (RAG) systems",
7 |     long_description=open('README.md').read(),
8 |     long_description_content_type='text/markdown',
9 |     author="AI Anytime",
10 |     author_email="aianytime07@gmail.com",
11 |     url="https://github.com/AIAnytime/rag-evaluator",
12 |     packages=find_packages(),
13 |     install_requires=[
14 |         "torch",
15 |         "sacrebleu",
16 |         "rouge-score",
17 |         "bert-score",
18 |         "transformers",
19 |         "nltk",
20 |         "textblob",
21 |         "textstat",
22 |         "mauve-text"
23 |     ],
24 |     classifiers=[
25 |         "Programming Language :: Python :: 3",
26 |         "License :: OSI Approved :: MIT License",
27 |         "Operating System :: OS Independent",
28 |     ],
29 |     python_requires='>=3.9',
30 | )
--------------------------------------------------------------------------------
/streamlit app/app.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | from evaluation_module import RAGEvaluator
3 | 
4 | # Initialize evaluator
5 | evaluator = RAGEvaluator()
6 | 
7 | st.title("RAG System Evaluation Dashboard")
8 | 
9 | st.write("## Input Data")
10 | 
11 | # Pre-filled input fields for testing
12 | question = st.text_input("Question", "What are the causes of climate change?")
13 | context = st.text_area("Reference Context (top 'k' documents)", """
14 | Climate change is caused by a variety of factors, including natural processes and human activities. Human activities, such as burning fossil fuels, deforestation, and industrial processes, release greenhouse gases into the atmosphere. These gases trap heat from the sun, causing the Earth's temperature to rise. Natural processes, such as volcanic eruptions and variations in solar radiation, also play a role in climate change.
15 | """)
16 | generated_output = st.text_area("LLM Generated Output", """
17 | Climate change is primarily caused by human activities that release greenhouse gases into the atmosphere. These activities include burning fossil fuels for energy, deforestation, and various industrial processes. The increase in greenhouse gases, such as carbon dioxide and methane, traps more heat in the Earth's atmosphere, leading to a rise in global temperatures.
Natural factors, like volcanic activity and changes in solar radiation, can also contribute to climate change, but their impact is relatively minor compared to human activities. 18 | """) 19 | 20 | if st.button("Evaluate"): 21 | if question and context and generated_output: 22 | st.write("### Evaluation Results") 23 | 24 | # Perform evaluations 25 | metrics = evaluator.evaluate_all(generated_output, context) 26 | 27 | # Display metrics with explanations 28 | st.write(f"**BLEU Score**: {metrics['BLEU']:.2f}") 29 | st.write("BLEU measures the overlap between the generated output and reference text based on n-grams. Range: 0-100. Higher scores indicate better match.") 30 | 31 | st.write(f"**ROUGE-1 Score**: {metrics['ROUGE-1']:.2f}") 32 | st.write("ROUGE-1 measures the overlap of unigrams between the generated output and reference text. Range: 0-1. Higher scores indicate better match.") 33 | 34 | st.write(f"**BERT Precision**: {metrics['BERT P']:.2f}") 35 | st.write(f"**BERT Recall**: {metrics['BERT R']:.2f}") 36 | st.write(f"**BERT F1 Score**: {metrics['BERT F1']:.2f}") 37 | st.write("BERTScore evaluates the semantic similarity between the generated output and reference text using BERT embeddings. Range: 0-1. Higher scores indicate better semantic similarity.") 38 | 39 | st.write(f"**Perplexity**: {metrics['Perplexity']:.2f}") 40 | st.write("Perplexity measures how well a language model predicts the text. Range: 1 to ∞. Lower values indicate better fluency and coherence.") 41 | 42 | st.write(f"**Diversity**: {metrics['Diversity']:.2f}") 43 | st.write("Diversity measures the uniqueness of bigrams in the generated output. Range: 0-1. Higher values indicate more diverse and varied output.") 44 | 45 | st.write(f"**Racial Bias**: {metrics['Racial Bias']:.2f}") 46 | st.write("Racial Bias score indicates the presence of biased language in the generated output. Range: 0-1. Lower scores indicate less bias.") 47 | 48 | st.write(f"**METEOR Score**: {metrics['METEOR']:.2f}") 49 | st.write("METEOR calculates semantic similarity considering synonyms and paraphrases. Range: 0-1. Higher scores indicate better semantic alignment.") 50 | 51 | st.write(f"**CHRF Score**: {metrics['CHRF']:.2f}") 52 | st.write("CHRF computes Character n-gram F-score for fine-grained text similarity. Range: 0-1. Higher scores indicate better character-level similarity.") 53 | 54 | st.write(f"**Flesch Reading Ease**: {metrics['Flesch Reading Ease']:.2f}") 55 | st.write("Flesch Reading Ease assesses text readability. Range: 0-100. Higher scores indicate easier readability.") 56 | 57 | st.write(f"**Flesch-Kincaid Grade**: {metrics['Flesch-Kincaid Grade']:.2f}") 58 | st.write("Flesch-Kincaid Grade indicates the U.S. school grade level needed to understand the text. Range: 0-18+. 
Lower scores indicate easier readability.")
59 | 
60 |     else:
61 |         st.write("Please provide all inputs to evaluate.")
--------------------------------------------------------------------------------
/streamlit app/evaluation_module.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from sacrebleu import corpus_bleu
3 | from rouge_score import rouge_scorer
4 | from bert_score import score
5 | from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
6 | import nltk
7 | from nltk.util import ngrams
8 | from nltk.tokenize import word_tokenize
9 | from nltk.translate.meteor_score import meteor_score
10 | from nltk.translate.chrf_score import sentence_chrf
11 | from textstat import flesch_reading_ease, flesch_kincaid_grade
12 | nltk.download('wordnet', quiet=True)  # METEOR needs WordNet; 'punkt' is fetched in evaluate_meteor
13 | 
14 | class RAGEvaluator:
15 |     def __init__(self):
16 |         self.gpt2_model, self.gpt2_tokenizer = self.load_gpt2_model()
17 |         self.bias_pipeline = pipeline("zero-shot-classification", model="Hate-speech-CNERG/dehatebert-mono-english")
18 | 
19 |     def load_gpt2_model(self):
20 |         model = GPT2LMHeadModel.from_pretrained('gpt2')
21 |         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
22 |         return model, tokenizer
23 | 
24 |     def evaluate_bleu_rouge(self, candidates, references):
25 |         bleu_score = corpus_bleu(candidates, [references]).score
26 |         scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
27 |         rouge_scores = [scorer.score(ref, cand) for ref, cand in zip(references, candidates)]
28 |         rouge1 = sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores)
29 |         return bleu_score, rouge1
30 | 
31 |     def evaluate_bert_score(self, candidates, references):
32 |         P, R, F1 = score(candidates, references, lang="en", model_type='bert-base-multilingual-cased')
33 |         return P.mean().item(), R.mean().item(), F1.mean().item()
34 | 
35 |     def evaluate_perplexity(self, text):
36 |         encodings = self.gpt2_tokenizer(text, return_tensors='pt')
37 |         max_length = self.gpt2_model.config.n_positions
38 |         stride = 512
39 |         lls = []
40 |         for i in range(0, encodings.input_ids.size(1), stride):
41 |             begin_loc = max(i + stride - max_length, 0)
42 |             end_loc = min(i + stride, encodings.input_ids.size(1))
43 |             trg_len = end_loc - i
44 |             input_ids = encodings.input_ids[:, begin_loc:end_loc]
45 |             target_ids = input_ids.clone()
46 |             target_ids[:, :-trg_len] = -100
47 |             with torch.no_grad():
48 |                 outputs = self.gpt2_model(input_ids, labels=target_ids)
49 |                 log_likelihood = outputs[0] * trg_len
50 |             lls.append(log_likelihood)
51 |         ppl = torch.exp(torch.stack(lls).sum() / end_loc)
52 |         return ppl.item()
53 | 
54 |     def evaluate_diversity(self, texts):
55 |         all_tokens = [tok for text in texts for tok in text.split()]
56 |         unique_bigrams = set(ngrams(all_tokens, 2))
57 |         diversity_score = len(unique_bigrams) / len(all_tokens) if all_tokens else 0
58 |         return diversity_score
59 | 
60 |     def evaluate_racial_bias(self, text):
61 |         results = self.bias_pipeline([text], candidate_labels=["hate speech", "not hate speech"])
62 |         bias_score = results[0]['scores'][results[0]['labels'].index('hate speech')]
63 |         return bias_score
64 | 
65 |     def evaluate_meteor(self, candidates, references):
66 |         nltk.download('punkt', quiet=True)
67 | 
68 |         meteor_scores = [
69 |             meteor_score([word_tokenize(ref)], word_tokenize(cand))
70 |             for ref, cand in zip(references, candidates)
71 |         ]
72 |         return sum(meteor_scores) / len(meteor_scores)
73 | 
74 |     def evaluate_chrf(self, candidates, references):
75 |         chrf_scores = [sentence_chrf(ref, cand) for ref, cand in zip(references, candidates)]
76 |         return sum(chrf_scores) / len(chrf_scores)
77 | 
78 |     def evaluate_readability(self, text):
79 |         flesch_ease = flesch_reading_ease(text)
80 |         flesch_grade = flesch_kincaid_grade(text)
81 |         return flesch_ease, flesch_grade
82 | 
83 |     def evaluate_all(self, response, reference):
84 |         candidates = [response]
85 |         references = [reference]
86 |         bleu, rouge1 = self.evaluate_bleu_rouge(candidates, references)
87 |         bert_p, bert_r, bert_f1 = self.evaluate_bert_score(candidates, references)
88 |         perplexity = self.evaluate_perplexity(response)
89 |         diversity = self.evaluate_diversity(candidates)
90 |         racial_bias = self.evaluate_racial_bias(response)
91 |         meteor = self.evaluate_meteor(candidates, references)
92 |         chrf = self.evaluate_chrf(candidates, references)
93 |         flesch_ease, flesch_grade = self.evaluate_readability(response)
94 |         return {
95 |             "BLEU": bleu,
96 |             "ROUGE-1": rouge1,
97 |             "BERT P": bert_p,
98 |             "BERT R": bert_r,
99 |             "BERT F1": bert_f1,
100 |             "Perplexity": perplexity,
101 |             "Diversity": diversity,
102 |             "Racial Bias": racial_bias,
103 |             "METEOR": meteor,
104 |             "CHRF": chrf,
105 |             "Flesch Reading Ease": flesch_ease,
106 |             "Flesch-Kincaid Grade": flesch_grade,
107 |         }
108 | 
--------------------------------------------------------------------------------
/streamlit app/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit
2 | sacrebleu
3 | rouge-score
4 | bert-score
5 | transformers
6 | torch
7 | nltk
8 | textstat
--------------------------------------------------------------------------------