├── LICENSE
├── README.md
└── evaluate_predictions.py

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2025 vis-nlp

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------

/README.md:
--------------------------------------------------------------------------------
# ChartQAPro: A More Diverse and Challenging Benchmark for Chart Question Answering

🤗[Dataset](https://huggingface.co/datasets/ahmed-masry/ChartQAPro) | 🖥️[Code](https://github.com/vis-nlp/ChartQAPro) | 📄[Paper](https://arxiv.org/abs/2504.05506v2)

![question_types_crop_3-1](https://github.com/user-attachments/assets/30449812-1efd-4947-ac44-80b52485a1db)
![Screenshot 2025-04-17 205316](https://github.com/user-attachments/assets/8d54da53-48bc-429b-ae6a-9116e7f42838)

# Dataset
You can find our dataset on Hugging Face: 🤗[ChartQAPro Dataset](https://huggingface.co/datasets/ahmed-masry/ChartQAPro)

# Evaluation Results
![Screenshot 2025-04-17 205443](https://github.com/user-attachments/assets/f8c8c71f-a1a7-48bf-b25a-adfc63902a68)


## ✅ Evaluation Instructions

To evaluate your model on **ChartQAPro**, follow the steps below:

### 1. Format Your Predictions

Save your model's predictions in a `.json` file that contains a **list of dictionaries**.
Each dictionary should include the following keys (the first three are copied from the original Hugging Face dataset):

- `"Answer"`: the ground-truth answer
- `"Question Type"`: the type of the question (e.g., Factoid, Multi Choice, Fact Checking, Conversational)
- `"Year"`: a list of `"YES"`/`"NO"` flags marking which answers are years (years are scored with exact matching)
- `"prediction"`: your model's predicted answer

#### 📝 Example Format

```json
[
  {
    "Answer": ["2016"],
    "Question Type": "Factoid",
    "Year": ["YES"],
    "prediction": "2016"
  },
  ...
]
```

### 2. Install Required Dependencies

```bash
pip install anls pandas
```

### 3. Run the Evaluation Script

```bash
python evaluate_predictions.py --predictions-file path/to/your/predictions.json
```

This will print your model's performance on each question type, as well as the overall score, following the official evaluation metrics used in the paper. 📊
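
For reference, here is a minimal sketch of how such a predictions file could be produced from the 🤗 dataset. The split name, the `image` and `Question` column names, and the `run_model` function are illustrative placeholders; adapt them to your own model and to the actual dataset schema:

```python
import json
from datasets import load_dataset

def run_model(image, question):
    """Placeholder: replace this with your model's inference call."""
    return "your model's answer"

# The split name is assumed here; check the dataset card for the exact splits.
dataset = load_dataset("ahmed-masry/ChartQAPro", split="test")

predictions = []
for example in dataset:
    predictions.append({
        "Answer": example["Answer"],                # ground-truth answer, copied from the dataset
        "Question Type": example["Question Type"],  # question type, copied from the dataset
        "Year": example["Year"],                    # year flags, copied from the dataset
        "prediction": run_model(example["image"], example["Question"]),  # your model's output
    })

with open("predictions.json", "w", encoding="utf-8") as f:
    json.dump(predictions, f, ensure_ascii=False, indent=2)
```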

# 💬 Contact
If you have any questions about this work, please contact **[Ahmed Masry](https://ahmedmasryku.github.io/)** at **amasry17@ku.edu.tr**, **ahmed.elmasry24653@gmail.com**, or **masry20@yorku.ca**.

# 📚 Citation
If you use ChartQAPro in your research, please cite:
```
@misc{masry2025chartqaprodiversechallengingbenchmark,
  title={ChartQAPro: A More Diverse and Challenging Benchmark for Chart Question Answering},
  author={Ahmed Masry and Mohammed Saidul Islam and Mahir Ahmed and Aayush Bajaj and Firoz Kabir and Aaryaman Kartha and Md Tahmid Rahman Laskar and Mizanur Rahman and Shadikur Rahman and Mehrad Shahmohammadi and Megh Thakkar and Md Rizwan Parvez and Enamul Hoque and Shafiq Joty},
  year={2025},
  eprint={2504.05506},
  archivePrefix={arXiv},
  primaryClass={cs.CL},
  url={https://arxiv.org/abs/2504.05506},
}
```
--------------------------------------------------------------------------------

/evaluate_predictions.py:
--------------------------------------------------------------------------------
import ast
import re
from typing import List, Optional, Any
import pandas as pd
from anls import anls_score
import json
import argparse


def load_predictions(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        predictions = json.load(f)
    return predictions


def fix_list_format(item: str) -> Any:
    """
    Standardize string representations of lists, adding quotes around elements if missing,
    and safely evaluate to a Python list. Returns the original item if parsing fails.
    """
    if not isinstance(item, str):
        return item
    match = re.match(r"^\[(.*)\]$", item.strip())
    if not match:
        return item
    content = match.group(1)
    # Quote any bare (unquoted) elements so the string can be parsed as a Python list.
    corrected = re.sub(r"(^|,)\s*([^,'\"][^,]*?)\s*(?=,|$)", r"\1 '\2'", content)
    try:
        return ast.literal_eval(f"[{corrected}]")
    except Exception:
        return item


def parse_to_list(text: str) -> Optional[List[str]]:
    """
    Parses text to a list of strings if possible; strips quotes and whitespace.
    """
    if not isinstance(text, str):
        return None
    try:
        parsed = ast.literal_eval(text)
    except Exception:
        return None
    if isinstance(parsed, list):
        return [str(x).strip(" '") for x in parsed]
    return None


def to_float(text: str) -> Optional[float]:
    """
    Converts text to a float, stripping percent signs. Returns None on failure.
    """
    try:
        return float(text.strip().strip('%'))
    except ValueError:
        return None


def evaluate_single_answer(
    target: str,
    prediction: str,
    max_relative_change: float = 0.05
) -> float:
    """
    Evaluates a single target-prediction pair:
    - Numeric answers match if the relative change is within max_relative_change.
    - Non-numeric answers fall back to ANLS.
    """
    t = target.strip().strip('%').strip()
    p = prediction.strip().strip('%').strip()
    # Attempt a numeric comparison first.
    t_f = to_float(t)
    p_f = to_float(p)
    if t_f is not None and p_f is not None:
        if t_f == 0.0:
            return 1.0 if p_f == 0.0 else 0.0
        change = abs(p_f - t_f) / abs(t_f)
        return 1.0 if change <= max_relative_change else 0.0
    # Fall back to ANLS on the lowercased text.
    return anls_score(prediction=p.lower(), gold_labels=[t.lower()], threshold=0.5)
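
# Worked examples for evaluate_single_answer (illustrative only):
#   evaluate_single_answer("12.5%", "12.9")      -> 1.0  (relative change ~0.032 <= 0.05)
#   evaluate_single_answer("120", "100")         -> 0.0  (relative change ~0.167 > 0.05)
#   evaluate_single_answer("Germany", "germany") -> 1.0  (non-numeric, scored with ANLS)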


def relaxed_correctness_chartqapro(
    target: str,
    prediction: str,
    max_relative_change: float = 0.05,
    year_flags: Optional[List[str]] = None,
    always_use_exact_match: bool = False,
) -> float:
    """
    Calculates relaxed correctness between target and prediction.
    Supports list-valued answers; year_flags marks elements that must match exactly.
    """
    fixed_t = fix_list_format(target)
    t_list = parse_to_list(str(fixed_t)) or [str(target)]
    p_list = parse_to_list(str(prediction)) or [str(prediction)]
    n = len(t_list)
    # Default to "NO" (non-year) flags and repeat them so every answer element has one.
    if year_flags is None:
        year_flags = ['NO'] * n
    if len(year_flags) < n:
        year_flags = year_flags * n

    # Evaluate element by element.
    scores: List[float] = []
    for idx in range(max(len(t_list), len(p_list))):
        if idx >= len(t_list) or idx >= len(p_list):
            # The model predicted more or fewer elements than necessary.
            scores.append(0.0)
            continue
        t_item, p_item, flag = t_list[idx], p_list[idx], year_flags[idx]
        flag_cond = flag.upper() == 'YES'
        if flag_cond or always_use_exact_match:
            # Exact (case-insensitive) match for years, fact checking, and multiple choice.
            scores.append(1.0 if t_item.strip().lower() == p_item.strip().lower() else 0.0)
        else:
            scores.append(
                evaluate_single_answer(t_item, p_item, max_relative_change)
            )
    return sum(scores) / len(scores) if scores else 0.0
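
# Illustrative example (using the helpers above): a two-part answer with year flags
#   relaxed_correctness_chartqapro("['2019', '2020']", "['2019', '2021']", year_flags=["YES"])
#   -> 0.5  ("2019" matches exactly; "2020" vs "2021" does not)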


def evaluate_predictions_chartqapro(predictions, pred_key='prediction'):
    # Use the last ground-truth answer (the final turn of conversational questions).
    gts = [x['Answer'][-1].strip(".").strip("\n") for x in predictions]
    preds = [x[pred_key].strip(".").strip("\n") for x in predictions]
    splits = [x['Question Type'] for x in predictions]
    year_flags = [x['Year'] for x in predictions]
    # Calculate accuracy per question type.
    match_nums_per_split = {}
    match_nums = []
    for gt, pred, split, year_flags_per_row in zip(gts, preds, splits, year_flags):
        # Conversational questions are scored on their final turn only.
        if split == 'Conversational':
            year_flags_per_row = year_flags_per_row[-1:]
        if split not in match_nums_per_split:
            match_nums_per_split[split] = []

        # Fact checking and multiple-choice answers always require an exact match.
        always_use_exact_match = split in ['Fact Checking', 'Multi Choice']
        score = relaxed_correctness_chartqapro(
            gt,
            pred,
            year_flags=year_flags_per_row,
            always_use_exact_match=always_use_exact_match,
        )
        match_nums_per_split[split].append(score)
        match_nums.append(score)

    final_numbers = {}
    for split in match_nums_per_split:
        final_numbers[split] = sum(match_nums_per_split[split]) / len(match_nums_per_split[split])
    final_numbers['Overall'] = sum(match_nums) / len(match_nums)
    return final_numbers


def main():
    parser = argparse.ArgumentParser(description="Evaluate ChartQAPro predictions.")
    parser.add_argument(
        "--predictions-file",
        type=str,
        required=True,
        help="Path to the JSON file containing model predictions."
    )
    args = parser.parse_args()
    predictions = load_predictions(args.predictions_file)
    scores = evaluate_predictions_chartqapro(predictions)
    print("📊 Evaluation Results:")
    for k, v in scores.items():
        print(f" • {k:<15}: {v * 100:.2f}%")


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------