├── .github
│   └── workflows
│       ├── python-package.yml
│       └── python-publish.yml
├── .gitignore
├── LICENSE.txt
├── README.md
├── rag_evaluator
│   ├── __init__.py
│   ├── evaluator.py
│   └── test_evaluator.py
├── requirements.txt
├── setup.py
└── streamlit app
    ├── app.py
    ├── evaluation_module.py
    └── requirements.txt

/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
1 | # This workflow will install Python dependencies, run tests and lint with a variety of Python versions
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python
3 | 
4 | name: Python package
5 | 
6 | on:
7 |   push:
8 |     branches: [ "main" ]
9 |   pull_request:
10 |     branches: [ "main" ]
11 | 
12 | jobs:
13 |   build:
14 | 
15 |     runs-on: ubuntu-latest
16 |     strategy:
17 |       fail-fast: false
18 |       matrix:
19 |         python-version: ["3.9", "3.10", "3.11"]
20 | 
21 |     steps:
22 |     - uses: actions/checkout@v4
23 |     - name: Set up Python ${{ matrix.python-version }}
24 |       uses: actions/setup-python@v3
25 |       with:
26 |         python-version: ${{ matrix.python-version }}
27 |     - name: Install dependencies
28 |       run: |
29 |         python -m pip install --upgrade pip
30 |         python -m pip install flake8 pytest
31 |         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
32 |     - name: Lint with flake8
33 |       run: |
34 |         # stop the build if there are Python syntax errors or undefined names
35 |         flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
36 |         # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
37 |         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
38 |     - name: Test with pytest
39 |       run: |
40 |         pytest
41 | 
--------------------------------------------------------------------------------
/.github/workflows/python-publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will upload a Python Package using Twine when a release is created
2 | # For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
3 | 
4 | # This workflow uses actions that are not certified by GitHub.
5 | # They are provided by a third-party and are governed by
6 | # separate terms of service, privacy policy, and support
7 | # documentation.
8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | permissions: 16 | contents: read 17 | 18 | jobs: 19 | deploy: 20 | 21 | runs-on: ubuntu-latest 22 | 23 | steps: 24 | - uses: actions/checkout@v4 25 | - name: Set up Python 26 | uses: actions/setup-python@v3 27 | with: 28 | python-version: '3.x' 29 | - name: Install dependencies 30 | run: | 31 | python -m pip install --upgrade pip 32 | pip install build 33 | - name: Build package 34 | run: python -m build 35 | - name: Publish package 36 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 37 | with: 38 | user: __token__ 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 
109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 AI Anytime 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RAG Evaluator 2 | 3 | ## Overview 4 | 5 | RAG Evaluator is a Python library for evaluating Retrieval-Augmented Generation (RAG) systems. It provides various metrics to evaluate the quality of generated text against reference text. 6 | 7 | ## Installation 8 | 9 | You can install the library using pip: 10 | 11 | ```bash 12 | pip install rag-evaluator 13 | ``` 14 | 15 | ## Usage 16 | 17 | Here's how to use the RAG Evaluator library: 18 | 19 | ```python 20 | from rag_evaluator import RAGEvaluator 21 | 22 | # Initialize the evaluator 23 | evaluator = RAGEvaluator() 24 | 25 | # Input data 26 | question = "What are the causes of climate change?" 27 | response = "Climate change is caused by human activities." 
28 | reference = "Human activities such as burning fossil fuels cause climate change."
29 | 
30 | # Evaluate the response
31 | metrics = evaluator.evaluate_all(question, response, reference)
32 | 
33 | # Print the results
34 | print(metrics)
35 | ```
36 | 
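Each metric can also be computed on its own through the evaluator's per-metric helpers defined in `rag_evaluator/evaluator.py` (for example `evaluate_bleu_rouge` and `evaluate_readability`). A minimal sketch using the same inputs as above; note that the overlap-based helpers take parallel lists of candidate and reference strings:

```python
from rag_evaluator import RAGEvaluator

evaluator = RAGEvaluator()

candidates = ["Climate change is caused by human activities."]
references = ["Human activities such as burning fossil fuels cause climate change."]

# Overlap metrics operate on parallel lists of strings.
bleu, rouge1 = evaluator.evaluate_bleu_rouge(candidates, references)

# Readability metrics operate on a single string.
flesch_ease, flesch_grade = evaluator.evaluate_readability(candidates[0])

print(f"BLEU: {bleu:.2f}, ROUGE-1: {rouge1:.2f}")
print(f"Flesch Reading Ease: {flesch_ease:.1f}, Flesch-Kincaid Grade: {flesch_grade:.1f}")
```
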
37 | ## Streamlit Web App
38 | 
39 | To run the web app:
40 | 
41 | 1. cd into the `streamlit app` folder.
42 | 2. Create a virtual environment.
43 | 3. Activate the virtual environment.
44 | 4. Install the dependencies: `pip install -r requirements.txt`
45 | 5. Run the app:
46 | ```
47 | streamlit run app.py
48 | ```
49 | 
50 | ## Metrics
51 | 
52 | The RAG Evaluator provides the following metrics (a short interpretation sketch follows the list):
53 | 
54 | 1. **BLEU** (0-100): Measures the overlap between the generated output and reference text based on n-grams.
55 |    - 0-20: Low similarity, 20-40: Medium-low, 40-60: Medium, 60-80: High, 80-100: Very high
56 | 
57 | 2. **ROUGE-1** (0-1): Measures the overlap of unigrams between the generated output and reference text.
58 |    - 0.0-0.2: Poor overlap, 0.2-0.4: Fair, 0.4-0.6: Good, 0.6-0.8: Very good, 0.8-1.0: Excellent
59 | 
60 | 3. **BERT Score** (0-1): Evaluates the semantic similarity using BERT embeddings (Precision, Recall, F1).
61 |    - 0.0-0.5: Low similarity, 0.5-0.7: Moderate, 0.7-0.8: Good, 0.8-0.9: High, 0.9-1.0: Very high
62 | 
63 | 4. **Perplexity** (1 to ∞, lower is better): Measures how well a language model predicts the text.
64 |    - 1-10: Excellent, 10-50: Good, 50-100: Moderate, 100+: High (potentially nonsensical)
65 | 
66 | 5. **Diversity** (0-1): Measures the uniqueness of bigrams in the generated output.
67 |    - 0.0-0.2: Very low, 0.2-0.4: Low, 0.4-0.6: Moderate, 0.6-0.8: High, 0.8-1.0: Very high
68 | 
69 | 6. **Racial Bias** (0-1): Detects the presence of biased language in the generated output.
70 |    - 0.0-0.2: Low probability, 0.2-0.4: Moderate, 0.4-0.6: High, 0.6-0.8: Very high, 0.8-1.0: Extreme
71 | 
72 | 7. **MAUVE** (0-1): Measures both semantic similarity and stylistic alignment, capturing contextual meaning, coherence, and fluency.
73 |    - 0.0-0.2: Poor, 0.2-0.4: Fair, 0.4-0.6: Good, 0.6-0.8: Very good, 0.8-1.0: Excellent
74 | 
75 | 8. **METEOR** (0-1): Calculates semantic similarity considering synonyms and paraphrases.
76 |    - 0.0-0.2: Poor, 0.2-0.4: Fair, 0.4-0.6: Good, 0.6-0.8: Very good, 0.8-1.0: Excellent
77 | 
78 | 9. **CHRF** (0-1): Computes the character n-gram F-score for fine-grained text similarity.
79 |    - 0.0-0.2: Low, 0.2-0.4: Moderate, 0.4-0.6: Good, 0.6-0.8: High, 0.8-1.0: Very high
80 | 
81 | 10. **Flesch Reading Ease** (0-100): Assesses text readability.
82 |     - 0-30: Very difficult, 30-50: Difficult, 50-60: Fairly difficult, 60-70: Standard, 70-80: Fairly easy, 80-90: Easy, 90-100: Very easy
83 | 
84 | 11. **Flesch-Kincaid Grade** (0-18+): Indicates the U.S. school grade level needed to understand the text.
85 |     - 1-6: Elementary, 7-8: Middle school, 9-12: High school, 13+: College level
86 | 
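To make these bands concrete, the sketch below buckets a few of the scores returned by `evaluate_all` into the qualitative labels listed above. The `band` helper is purely illustrative and not part of the library:

```python
from rag_evaluator import RAGEvaluator

def band(value, bounds, labels):
    """Return the label of the first band whose upper bound exceeds the value."""
    for upper, label in zip(bounds, labels):
        if value < upper:
            return label
    return labels[-1]

evaluator = RAGEvaluator()
metrics = evaluator.evaluate_all(
    "What are the causes of climate change?",
    "Climate change is caused by human activities.",
    "Human activities such as burning fossil fuels cause climate change.",
)

print("BLEU:", band(metrics["BLEU"], [20, 40, 60, 80], ["low", "medium-low", "medium", "high", "very high"]))
print("ROUGE-1:", band(metrics["ROUGE-1"], [0.2, 0.4, 0.6, 0.8], ["poor", "fair", "good", "very good", "excellent"]))
print("BERT F1:", band(metrics["BERT F1"], [0.5, 0.7, 0.8, 0.9], ["low", "moderate", "good", "high", "very high"]))
```
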
87 | ## Testing
88 | 
89 | To run the tests, use the following command:
90 | 
91 | ```
92 | python -m unittest discover -s rag_evaluator -p "test_*.py"
93 | ```
94 | 
95 | ## License
96 | 
97 | This project is licensed under the MIT License. See the [LICENSE](LICENSE.txt) file for details.
98 | 
99 | ## Contributing
100 | 
101 | Contributions are welcome! If you have any improvements, suggestions, or bug fixes, feel free to create a pull request (PR) or open an issue on GitHub. Please ensure your contributions adhere to the project's coding standards and include appropriate tests.
102 | 
103 | ### How to Contribute
104 | 
105 | 1. Fork the repository.
106 | 2. Create a new branch for your feature or bug fix.
107 | 3. Make your changes.
108 | 4. Run tests to ensure everything is working.
109 | 5. Commit your changes and push to your fork.
110 | 6. Create a pull request (PR) with a detailed description of your changes.
111 | 
112 | ## Contact
113 | 
114 | If you have any questions or need further assistance, feel free to reach out via [email](mailto:aianytime07@gmail.com).
115 | 
--------------------------------------------------------------------------------
/rag_evaluator/__init__.py:
--------------------------------------------------------------------------------
1 | from .evaluator import RAGEvaluator
2 | 
3 | __all__ = ["RAGEvaluator"]
4 | 
--------------------------------------------------------------------------------
/rag_evaluator/evaluator.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import mauve
3 | from sacrebleu import corpus_bleu
4 | from rouge_score import rouge_scorer
5 | from bert_score import score
6 | from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
7 | import nltk
8 | from nltk.util import ngrams
9 | from nltk.tokenize import word_tokenize
10 | from nltk.translate.meteor_score import meteor_score
11 | from nltk.translate.chrf_score import sentence_chrf
12 | from textstat import flesch_reading_ease, flesch_kincaid_grade
13 | 
14 | # Fetch the required NLTK data non-interactively ('punkt' for tokenization, 'wordnet' for METEOR)
15 | nltk.download('punkt', quiet=True)
16 | nltk.download('wordnet', quiet=True)
17 | 
18 | class RAGEvaluator:
19 |     def __init__(self):
20 |         self.gpt2_model, self.gpt2_tokenizer = self.load_gpt2_model()
21 |         self.bias_pipeline = pipeline("zero-shot-classification", model="Hate-speech-CNERG/dehatebert-mono-english")
22 | 
23 |     def load_gpt2_model(self):
24 |         model = GPT2LMHeadModel.from_pretrained('gpt2')
25 |         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
26 |         return model, tokenizer
27 | 
28 |     def evaluate_bleu_rouge(self, candidates, references):
29 |         bleu_score = corpus_bleu(candidates, [references]).score
30 |         scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
31 |         rouge_scores = [scorer.score(ref, cand) for ref, cand in zip(references, candidates)]
32 |         rouge1 = sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores)
33 |         return bleu_score, rouge1
34 | 
35 |     def evaluate_bert_score(self, candidates, references):
36 |         P, R, F1 = score(candidates, references, lang="en", model_type='bert-base-multilingual-cased')
37 |         return P.mean().item(), R.mean().item(), F1.mean().item()
38 | 
39 |     def evaluate_perplexity(self, text):
40 |         encodings = self.gpt2_tokenizer(text, return_tensors='pt')
41 |         max_length = self.gpt2_model.config.n_positions
42 |         stride = 512
43 |         lls = []
44 |         for i in range(0, encodings.input_ids.size(1), stride):
45 |             begin_loc = max(i + stride - max_length, 0)
46 |             end_loc = min(i + stride, encodings.input_ids.size(1))
47 |             trg_len = end_loc - i
48 |             input_ids = encodings.input_ids[:, begin_loc:end_loc]
49 |             target_ids = input_ids.clone()
50 |             target_ids[:, :-trg_len] = -100
51 |             with torch.no_grad():
52 |                 outputs = self.gpt2_model(input_ids, labels=target_ids)
53 |                 log_likelihood = outputs[0] * trg_len
54 |             lls.append(log_likelihood)
55 |         ppl = torch.exp(torch.stack(lls).sum() / end_loc)
56 |         return ppl.item()
57 | 
58 |     def evaluate_diversity(self, texts):
59 |         all_tokens = [tok for text in texts for tok in text.split()]
60 |         unique_bigrams = set(ngrams(all_tokens, 2))
61 |         diversity_score = len(unique_bigrams) / len(all_tokens) if all_tokens else 0
62 |         return diversity_score
63 | 
64 |     def evaluate_racial_bias(self, text):
65 |         results = self.bias_pipeline([text], candidate_labels=["hate speech", "not hate speech"])
66 |         bias_score = results[0]['scores'][results[0]['labels'].index('hate speech')]
67 |         return bias_score
68 | 
69 |     def evaluate_meteor(self, candidates, references):
70 |         nltk.download('punkt', quiet=True)
71 | 
72 |         meteor_scores = [
73 |             meteor_score([word_tokenize(ref)], word_tokenize(cand))
74 |             for ref, cand in zip(references, candidates)
75 |         ]
76 |         return sum(meteor_scores) / len(meteor_scores)
77 | 
78 |     def evaluate_chrf(self, candidates, references):
79 |         chrf_scores = [sentence_chrf(ref, cand) for ref, cand in zip(references, candidates)]
80 |         return sum(chrf_scores) / len(chrf_scores)
81 | 
82 |     def evaluate_readability(self, text):
83 |         flesch_ease = flesch_reading_ease(text)
84 |         flesch_grade = flesch_kincaid_grade(text)
85 |         return flesch_ease, flesch_grade
86 | 
87 |     def evaluate_mauve(self, reference_texts, generated_texts):
88 |         out = mauve.compute_mauve(
89 |             p_text=reference_texts,  # List of reference texts
90 |             q_text=generated_texts,  # List of generated texts
91 |             device_id=0,  # GPU device ID; set to -1 for CPU
92 |             max_text_length=1024,  # Maximum length of text to truncate
93 |             verbose=False  # Whether to print additional information
94 |         )
95 |         return out.mauve
96 | 
97 |     def evaluate_all(self, question, response, reference):
98 |         candidates = [response]
99 |         references = [reference]
100 |         bleu, rouge1 = self.evaluate_bleu_rouge(candidates, references)
101 |         bert_p, bert_r, bert_f1 = self.evaluate_bert_score(candidates, references)
102 |         perplexity = self.evaluate_perplexity(response)
103 |         diversity = self.evaluate_diversity(candidates)
104 |         racial_bias = self.evaluate_racial_bias(response)
105 |         mauve_score = self.evaluate_mauve(references, candidates)  # MAUVE expects lists of texts
106 |         meteor = self.evaluate_meteor(candidates, references)
107 |         chrf = self.evaluate_chrf(candidates, references)
108 |         flesch_ease, flesch_grade = self.evaluate_readability(response)
109 |         return {
110 |             "BLEU": bleu,
111 |             "ROUGE-1": rouge1,
112 |             "BERT P": bert_p,
113 |             "BERT R": bert_r,
114 |             "BERT F1": bert_f1,
115 |             "Perplexity": perplexity,
116 |             "Diversity": diversity,
117 |             "Racial Bias": racial_bias,
118 |             "MAUVE": mauve_score,
119 |             "METEOR": meteor,
120 |             "CHRF": chrf,
121 |             "Flesch Reading Ease": flesch_ease,
122 |             "Flesch-Kincaid Grade": flesch_grade,
123 |         }
124 | 
--------------------------------------------------------------------------------
/rag_evaluator/test_evaluator.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from rag_evaluator.evaluator import RAGEvaluator
3 | import nltk
4 | nltk.download('punkt', quiet=True)
5 | 
6 | class TestRAGEvaluator(unittest.TestCase):
7 |     def setUp(self):
8 |         self.evaluator = RAGEvaluator()
9 | 
10 |     def test_evaluate_all(self):
11 |         question = "What are the causes of climate change?"
12 |         response = "Climate change is caused by human activities."
13 |         reference = "Human activities such as burning fossil fuels cause climate change."
14 |         metrics = self.evaluator.evaluate_all(question, response, reference)
15 |         self.assertIsInstance(metrics, dict)
16 |         self.assertIn("BLEU", metrics)
17 |         self.assertIn("ROUGE-1", metrics)
18 |         self.assertIn("BERT P", metrics)
19 |         self.assertIn("Perplexity", metrics)
20 |         self.assertIn("Diversity", metrics)
21 |         self.assertIn("Racial Bias", metrics)
22 |         self.assertIn("MAUVE", metrics)
23 |         self.assertIn("METEOR", metrics)
24 |         self.assertIn("CHRF", metrics)
25 |         self.assertIn("Flesch Reading Ease", metrics)
26 |         self.assertIn("Flesch-Kincaid Grade", metrics)
27 | 
28 | if __name__ == "__main__":
29 |     unittest.main()
30 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torch==2.3.1
2 | sacrebleu
3 | rouge-score
4 | bert-score
5 | transformers
6 | nltk
7 | textblob
8 | textstat
9 | mauve-text
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 | 
3 | setup(
4 |     name="rag-evaluator",
5 |     version="0.1.0",
6 |     description="A library for evaluating Retrieval-Augmented Generation (RAG) systems",
7 |     long_description=open('README.md').read(),
8 |     long_description_content_type='text/markdown',
9 |     author="AI Anytime",
10 |     author_email="aianytime07@gmail.com",
11 |     url="https://github.com/AIAnytime/rag-evaluator",
12 |     packages=find_packages(),
13 |     install_requires=[
14 |         "torch",
15 |         "sacrebleu",
16 |         "rouge-score",
17 |         "bert-score",
18 |         "transformers",
19 |         "nltk",
20 |         "textblob",
21 |         "textstat",
22 |         "mauve-text"
23 |     ],
24 |     classifiers=[
25 |         "Programming Language :: Python :: 3",
26 |         "License :: OSI Approved :: MIT License",
27 |         "Operating System :: OS Independent",
28 |     ],
29 |     python_requires='>=3.9',
30 | )
--------------------------------------------------------------------------------
/streamlit app/app.py:
--------------------------------------------------------------------------------
1 | import streamlit as st
2 | from evaluation_module import RAGEvaluator
3 | 
4 | # Initialize evaluator
5 | evaluator = RAGEvaluator()
6 | 
7 | st.title("RAG System Evaluation Dashboard")
8 | 
9 | st.write("## Input Data")
10 | 
11 | # Pre-filled input fields for testing
12 | question = st.text_input("Question", "What are the causes of climate change?")
13 | context = st.text_area("Reference Context (top 'k' documents)", """
14 | Climate change is caused by a variety of factors, including natural processes and human activities. Human activities, such as burning fossil fuels, deforestation, and industrial processes, release greenhouse gases into the atmosphere. These gases trap heat from the sun, causing the Earth's temperature to rise. Natural processes, such as volcanic eruptions and variations in solar radiation, also play a role in climate change.
15 | """)
16 | generated_output = st.text_area("LLM Generated Output", """
17 | Climate change is primarily caused by human activities that release greenhouse gases into the atmosphere. These activities include burning fossil fuels for energy, deforestation, and various industrial processes. The increase in greenhouse gases, such as carbon dioxide and methane, traps more heat in the Earth's atmosphere, leading to a rise in global temperatures.
Natural factors, like volcanic activity and changes in solar radiation, can also contribute to climate change, but their impact is relatively minor compared to human activities. 18 | """) 19 | 20 | if st.button("Evaluate"): 21 | if question and context and generated_output: 22 | st.write("### Evaluation Results") 23 | 24 | # Perform evaluations 25 | metrics = evaluator.evaluate_all(generated_output, context) 26 | 27 | # Display metrics with explanations 28 | st.write(f"**BLEU Score**: {metrics['BLEU']:.2f}") 29 | st.write("BLEU measures the overlap between the generated output and reference text based on n-grams. Range: 0-100. Higher scores indicate better match.") 30 | 31 | st.write(f"**ROUGE-1 Score**: {metrics['ROUGE-1']:.2f}") 32 | st.write("ROUGE-1 measures the overlap of unigrams between the generated output and reference text. Range: 0-1. Higher scores indicate better match.") 33 | 34 | st.write(f"**BERT Precision**: {metrics['BERT P']:.2f}") 35 | st.write(f"**BERT Recall**: {metrics['BERT R']:.2f}") 36 | st.write(f"**BERT F1 Score**: {metrics['BERT F1']:.2f}") 37 | st.write("BERTScore evaluates the semantic similarity between the generated output and reference text using BERT embeddings. Range: 0-1. Higher scores indicate better semantic similarity.") 38 | 39 | st.write(f"**Perplexity**: {metrics['Perplexity']:.2f}") 40 | st.write("Perplexity measures how well a language model predicts the text. Range: 1 to ∞. Lower values indicate better fluency and coherence.") 41 | 42 | st.write(f"**Diversity**: {metrics['Diversity']:.2f}") 43 | st.write("Diversity measures the uniqueness of bigrams in the generated output. Range: 0-1. Higher values indicate more diverse and varied output.") 44 | 45 | st.write(f"**Racial Bias**: {metrics['Racial Bias']:.2f}") 46 | st.write("Racial Bias score indicates the presence of biased language in the generated output. Range: 0-1. Lower scores indicate less bias.") 47 | 48 | st.write(f"**METEOR Score**: {metrics['METEOR']:.2f}") 49 | st.write("METEOR calculates semantic similarity considering synonyms and paraphrases. Range: 0-1. Higher scores indicate better semantic alignment.") 50 | 51 | st.write(f"**CHRF Score**: {metrics['CHRF']:.2f}") 52 | st.write("CHRF computes Character n-gram F-score for fine-grained text similarity. Range: 0-1. Higher scores indicate better character-level similarity.") 53 | 54 | st.write(f"**Flesch Reading Ease**: {metrics['Flesch Reading Ease']:.2f}") 55 | st.write("Flesch Reading Ease assesses text readability. Range: 0-100. Higher scores indicate easier readability.") 56 | 57 | st.write(f"**Flesch-Kincaid Grade**: {metrics['Flesch-Kincaid Grade']:.2f}") 58 | st.write("Flesch-Kincaid Grade indicates the U.S. school grade level needed to understand the text. Range: 0-18+. 
Lower scores indicate easier readability.")
59 | 
60 |     else:
61 |         st.write("Please provide all inputs to evaluate.")
--------------------------------------------------------------------------------
/streamlit app/evaluation_module.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from sacrebleu import corpus_bleu
3 | from rouge_score import rouge_scorer
4 | from bert_score import score
5 | from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
6 | import nltk
7 | from nltk.util import ngrams
8 | from nltk.tokenize import word_tokenize
9 | from nltk.translate.meteor_score import meteor_score
10 | from nltk.translate.chrf_score import sentence_chrf
11 | from textstat import flesch_reading_ease, flesch_kincaid_grade
12 | nltk.download('wordnet', quiet=True)  # METEOR needs WordNet; 'punkt' is fetched in evaluate_meteor
13 | 
14 | class RAGEvaluator:
15 |     def __init__(self):
16 |         self.gpt2_model, self.gpt2_tokenizer = self.load_gpt2_model()
17 |         self.bias_pipeline = pipeline("zero-shot-classification", model="Hate-speech-CNERG/dehatebert-mono-english")
18 | 
19 |     def load_gpt2_model(self):
20 |         model = GPT2LMHeadModel.from_pretrained('gpt2')
21 |         tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
22 |         return model, tokenizer
23 | 
24 |     def evaluate_bleu_rouge(self, candidates, references):
25 |         bleu_score = corpus_bleu(candidates, [references]).score
26 |         scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
27 |         rouge_scores = [scorer.score(ref, cand) for ref, cand in zip(references, candidates)]
28 |         rouge1 = sum([score['rouge1'].fmeasure for score in rouge_scores]) / len(rouge_scores)
29 |         return bleu_score, rouge1
30 | 
31 |     def evaluate_bert_score(self, candidates, references):
32 |         P, R, F1 = score(candidates, references, lang="en", model_type='bert-base-multilingual-cased')
33 |         return P.mean().item(), R.mean().item(), F1.mean().item()
34 | 
35 |     def evaluate_perplexity(self, text):
36 |         encodings = self.gpt2_tokenizer(text, return_tensors='pt')
37 |         max_length = self.gpt2_model.config.n_positions
38 |         stride = 512
39 |         lls = []
40 |         for i in range(0, encodings.input_ids.size(1), stride):
41 |             begin_loc = max(i + stride - max_length, 0)
42 |             end_loc = min(i + stride, encodings.input_ids.size(1))
43 |             trg_len = end_loc - i
44 |             input_ids = encodings.input_ids[:, begin_loc:end_loc]
45 |             target_ids = input_ids.clone()
46 |             target_ids[:, :-trg_len] = -100
47 |             with torch.no_grad():
48 |                 outputs = self.gpt2_model(input_ids, labels=target_ids)
49 |                 log_likelihood = outputs[0] * trg_len
50 |             lls.append(log_likelihood)
51 |         ppl = torch.exp(torch.stack(lls).sum() / end_loc)
52 |         return ppl.item()
53 | 
54 |     def evaluate_diversity(self, texts):
55 |         all_tokens = [tok for text in texts for tok in text.split()]
56 |         unique_bigrams = set(ngrams(all_tokens, 2))
57 |         diversity_score = len(unique_bigrams) / len(all_tokens) if all_tokens else 0
58 |         return diversity_score
59 | 
60 |     def evaluate_racial_bias(self, text):
61 |         results = self.bias_pipeline([text], candidate_labels=["hate speech", "not hate speech"])
62 |         bias_score = results[0]['scores'][results[0]['labels'].index('hate speech')]
63 |         return bias_score
64 | 
65 |     def evaluate_meteor(self, candidates, references):
66 |         nltk.download('punkt', quiet=True)
67 | 
68 |         meteor_scores = [
69 |             meteor_score([word_tokenize(ref)], word_tokenize(cand))
70 |             for ref, cand in zip(references, candidates)
71 |         ]
72 |         return sum(meteor_scores) / len(meteor_scores)
73 | 
74 |     def evaluate_chrf(self, candidates, references):
75 |         chrf_scores = [sentence_chrf(ref, cand) for ref, cand in zip(references, candidates)]
76 |         return sum(chrf_scores) / len(chrf_scores)
77 | 
78 |     def evaluate_readability(self, text):
79 |         flesch_ease = flesch_reading_ease(text)
80 |         flesch_grade = flesch_kincaid_grade(text)
81 |         return flesch_ease, flesch_grade
82 | 
83 |     def evaluate_all(self, response, reference):
84 |         candidates = [response]
85 |         references = [reference]
86 |         bleu, rouge1 = self.evaluate_bleu_rouge(candidates, references)
87 |         bert_p, bert_r, bert_f1 = self.evaluate_bert_score(candidates, references)
88 |         perplexity = self.evaluate_perplexity(response)
89 |         diversity = self.evaluate_diversity(candidates)
90 |         racial_bias = self.evaluate_racial_bias(response)
91 |         meteor = self.evaluate_meteor(candidates, references)
92 |         chrf = self.evaluate_chrf(candidates, references)
93 |         flesch_ease, flesch_grade = self.evaluate_readability(response)
94 |         return {
95 |             "BLEU": bleu,
96 |             "ROUGE-1": rouge1,
97 |             "BERT P": bert_p,
98 |             "BERT R": bert_r,
99 |             "BERT F1": bert_f1,
100 |             "Perplexity": perplexity,
101 |             "Diversity": diversity,
102 |             "Racial Bias": racial_bias,
103 |             "METEOR": meteor,
104 |             "CHRF": chrf,
105 |             "Flesch Reading Ease": flesch_ease,
106 |             "Flesch-Kincaid Grade": flesch_grade,
107 |         }
108 | 
--------------------------------------------------------------------------------
/streamlit app/requirements.txt:
--------------------------------------------------------------------------------
1 | streamlit
2 | sacrebleu
3 | rouge-score
4 | bert-score
5 | transformers
6 | torch
7 | nltk
8 | textstat
--------------------------------------------------------------------------------