├── .gitignore ├── CITATION ├── Evaluation ├── automatic_eval.py ├── automatic_metrics.py ├── llm_eval.py └── llm_score.py ├── LICENSE ├── LooGLE-testdata ├── longdep_qa.jsonl ├── longdep_summarization.jsonl ├── shortdep_cloze.jsonl └── shortdep_qa.jsonl ├── Output ├── longdep_qa_reorder_gpt4-32k.jsonl ├── longdep_summarization_llama-index.jsonl ├── shortdep_qa_chatglm2-6b-32k.jsonl ├── shortdep_qa_gpt-3.5-turbo-16k.jsonl └── shortdep_qa_llama-index.jsonl ├── Prediction ├── pred_gpt_models.py ├── pred_llamaindex.py └── pred_opensource_models.py ├── README.md ├── Reorder ├── get_max_deviation.py ├── get_reorder_deviation.py └── reorder_eval.py ├── Retrieval └── pred_retrieval_based_method.py ├── Tools ├── Labeling_form_A_annotator.html └── Labeling_form_Q_annotator.html ├── assets ├── LooGle_logo.png ├── overview.png ├── overview_page1.png ├── overview_performance.png └── table.png ├── config ├── task2maxlen.json └── task2prompt.json ├── docs ├── .gitignore ├── 404.html ├── Gemfile ├── _config.yml ├── _includes │ └── head-custom.html ├── _layouts │ └── default.html ├── assets │ ├── css │ │ └── style.scss │ └── js │ │ └── main.js ├── backup.html └── index.md └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /CITATION: -------------------------------------------------------------------------------- 1 | @article{li2023loogle, 2 | title={Can Long-Context Language Models Understand Long Contexts?}, 3 | author={ Li, Jiaqi and Wang, Mengmeng and Zheng, Zilong and Zhang, Muhan }, 4 | url={https://github.com/bigai-nlco/LooGLE} 5 | year={2023} 6 | } 7 | -------------------------------------------------------------------------------- /Evaluation/automatic_eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import openai 3 | from nltk.translate.bleu_score import sentence_bleu 4 | from nltk.translate.meteor_score import single_meteor_score 5 | from rouge import Rouge 6 | from bert_score import score 7 | import numpy as np 8 | import argparse 9 | import openai 10 | from automatic_metrics import ( 11 | get_bleu_score, 12 | get_rouge_score, 13 | get_meteor_score, 14 | get_bertscore, 15 | get_exact_match, 16 | get_partial_match 17 | ) 18 | 19 | 20 | def evaluation(data, scores, functions, task): 21 | for i in range(len(data["output"])): 22 | hyp, ref = data["llm_output"][i], data["output"][i] 23 | if hyp == '': 24 | hyp = 'None' 25 | if "qa_pairs" in data: 26 | if data["qa_pairs"] != "none": 27 | question = data["qa_pairs"][i]["Q"] 28 | else: 29 | question = "" 30 | 31 | for j in functions: 32 | if j not in scores: 33 | scores[j] = [] 34 | scores[j].append(eval(j)(question, ref, hyp, task)) 35 | 36 | return scores 37 | 38 | 39 | def get_semantic_matching(result, functions): 40 | final_score = {} 41 | for i in functions: 42 | if type(result[i][0]) is tuple: 43 | l = result[i] 44 | final_score[i] = [np.mean([i[j] for i in l]) for j in range(len(l[0]))] 45 | else: 46 | final_score[i] = np.mean(result[i]) 47 | return final_score 48 | 49 | 50 | def get_match_score(result, 
functions): 51 | final_score = {} 52 | for i in functions: 53 | match_count = np.sum([j[0] for j in result[i]]) 54 | all_count = np.sum([j[1] for j in result[i]]) 55 | final_score[i] = round(match_count / all_count, 4) 56 | return final_score 57 | 58 | 59 | def parse_args(args=None): 60 | parser = argparse.ArgumentParser() 61 | parser.add_argument( 62 | "--model_name", type=str, default=None, help="model name for evaluation" 63 | ) 64 | parser.add_argument( 65 | "--task", 66 | type=str, 67 | default=None, 68 | help="long context understanding tasks in LooGLE", 69 | choices=[ 70 | "shortdep_qa", 71 | "shortdep_cloze", 72 | "longdep_qa", 73 | "longdep_summarization", 74 | ], 75 | ) 76 | parser.add_argument("--output_path", type=str, default="./Output/") 77 | parser.add_argument( 78 | "--eval_metric", 79 | type=str, 80 | default=None, 81 | help="evaluation method for LLM predictions", 82 | choices=["automatic_sim", "automatic_match"], 83 | ) 84 | 85 | return parser.parse_args(args) 86 | 87 | if __name__ == "__main__": 88 | args = parse_args() 89 | if args.eval_metric == "automatic_sim": 90 | eval_functions = [ 91 | "get_bleu_score", 92 | "get_rouge_score", 93 | "get_meteor_score", 94 | "get_bertscore" 95 | ] 96 | elif args.eval_metric == "automatic_match": 97 | eval_functions = ["get_exact_match", "get_partial_match"] 98 | 99 | score_result = {} 100 | with open( 101 | args.output_path + args.task + "_" + args.model_name + ".jsonl", "r" 102 | ) as f: 103 | for line in f.readlines(): 104 | ds_llm = json.loads(line) 105 | score_result = evaluation(ds_llm, score_result, eval_functions, args.task) 106 | 107 | 108 | if args.eval_metric == "automatic_sim": 109 | print(get_semantic_matching(score_result, eval_functions)) 110 | elif args.eval_metric == "automatic_match": 111 | print(get_match_score(score_result, eval_functions)) 112 | 113 | -------------------------------------------------------------------------------- /Evaluation/automatic_metrics.py: -------------------------------------------------------------------------------- 1 | import json 2 | from nltk.translate.bleu_score import sentence_bleu 3 | from nltk.translate.meteor_score import single_meteor_score 4 | from rouge import Rouge 5 | from bert_score import score 6 | import numpy as np 7 | 8 | def get_bleu_score(question, reference, hypothesis, task): 9 | reference, hypothesis = ( 10 | reference.replace("\n", " ").split(), 11 | hypothesis.replace("\n", " ").split(), 12 | ) 13 | 14 | bleu1 = sentence_bleu([reference], hypothesis, weights=(1, 0, 0, 0)) 15 | bleu4 = sentence_bleu([reference], hypothesis, weights=(0, 0, 0, 1)) 16 | return bleu1, bleu4 17 | 18 | 19 | def get_rouge_score(question, reference, hypothesis, task, metric="r"): 20 | rouge = Rouge() 21 | rouge_ = rouge.get_scores(hyps=[hypothesis], refs=[reference])[0] 22 | return ( 23 | rouge_["rouge-1"][metric], 24 | rouge_["rouge-2"][metric], 25 | rouge_["rouge-l"][metric], 26 | ) 27 | 28 | 29 | def get_meteor_score(question, reference, hypothesis, task): 30 | reference, hypothesis = ( 31 | reference.replace("\n", " ").split(), 32 | hypothesis.replace("\n", " ").split(), 33 | ) 34 | meteor = single_meteor_score(set(reference), set(hypothesis)) 35 | return float(meteor) 36 | 37 | 38 | def get_bertscore(question, reference, hypothesis, task): 39 | 40 | bertscore = score([reference], [hypothesis], lang="EN") 41 | return float(bertscore[1]) 42 | 43 | 44 | 45 | def get_exact_match(question, reference, hypothesis, task): 46 | count = len(reference) 47 | if type(hypothesis) is str: 48 | 
try: 49 | hypothesis = eval(hypothesis) 50 | assert isinstance(hypothesis, dict) 51 | except Exception as e: 52 | return 0, count 53 | 54 | exact_score_count = 0 55 | for key in reference: 56 | if key in hypothesis and hypothesis[key] == reference[key]: 57 | exact_score_count += 1 58 | return exact_score_count, count 59 | 60 | def get_partial_match(question, reference, hypothesis, task): 61 | count = len(reference) 62 | if isinstance(hypothesis, str): 63 | try: 64 | hypothesis = eval(hypothesis) 65 | assert isinstance(hypothesis, dict) 66 | except Exception as e: 67 | return 0, count 68 | 69 | partial_score_count = 0 70 | for key in reference: 71 | if key in hypothesis: 72 | true_set = set(reference[key].split()) 73 | pred_set = set(hypothesis[key].split()) 74 | partial_score_count += int(len(true_set.intersection(pred_set)) > 0) 75 | return partial_score_count, count 76 | 77 | -------------------------------------------------------------------------------- /Evaluation/llm_eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | from nltk.translate.bleu_score import sentence_bleu 3 | from nltk.translate.meteor_score import single_meteor_score 4 | from rouge import Rouge 5 | from bert_score import score 6 | import numpy as np 7 | import argparse 8 | import openai, os 9 | from llm_score import ( 10 | get_gpt4_score 11 | ) 12 | 13 | def evaluation(data, scores, functions, task): 14 | for i in range(len(data["output"])): 15 | hyp, ref = data["llm_output"][i], data["output"][i] 16 | if "qa_pairs" in data and data["qa_pairs"] != "none": 17 | question = data["qa_pairs"][i]["Q"] 18 | else: 19 | question = "" 20 | 21 | for j in functions: 22 | if j not in scores: 23 | scores[j] = [] 24 | scores[j].append(eval(j)(question, ref, hyp, task)) 25 | 26 | return scores 27 | 28 | 29 | def get_accuracy(result, functions, task): 30 | final_score = {} 31 | for i in functions: 32 | res = result[i] 33 | if "qa" in task: 34 | final_score[i] = res.count("True") / (res.count("True") + res.count("False")) 35 | else: 36 | final_score[i] = np.mean(res) 37 | return final_score 38 | 39 | 40 | 41 | def parse_args(args=None): 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument( 44 | "--model_name", type=str, default=None, help="model name for evaluation" 45 | ) 46 | parser.add_argument( 47 | "--task", 48 | type=str, 49 | default=None, 50 | help="long context understanding tasks in LooGLE", 51 | choices=[ 52 | "shortdep_qa", 53 | "longdep_qa", 54 | "longdep_summarization", 55 | ], 56 | ) 57 | parser.add_argument("--output_path", type=str, default="./Output/") 58 | parser.add_argument( 59 | "--eval_metric", 60 | type=str, 61 | default="llm", 62 | help="evaluation method for LLM predictions", 63 | choices=["llm"], 64 | ) 65 | 66 | return parser.parse_args(args) 67 | 68 | 69 | if __name__ == "__main__": 70 | args = parse_args() 71 | openai_api_key = os.environ["OPENAI_API_KEY"] 72 | eval_functions = ["get_gpt4_score"] 73 | 74 | score_result = {} 75 | 76 | with open( 77 | args.output_path + args.task + "_" + args.model_name + ".jsonl", "r" 78 | ) as f: 79 | for line in f.readlines(): 80 | ds_llm = json.loads(line) 81 | score_result = evaluation(ds_llm, score_result, eval_functions, args.task) 82 | 83 | 84 | print(get_accuracy(score_result, eval_functions, args.task)) 85 | 86 | -------------------------------------------------------------------------------- /Evaluation/llm_score.py: -------------------------------------------------------------------------------- 
1 | import json 2 | import numpy as np 3 | import openai 4 | 5 | 6 | def get_gpt4_score(question, reference, hypothesis, task): 7 | if "qa" in task: 8 | p = "Given one question, there is a groundtruth and a predict_answer. Please decide whether they are the same or not in semantic. Please only output 'True' or 'False' ." 9 | 10 | prompt = [{"role": "system", "content": p,}, 11 | { 12 | "role": "user", 13 | "content": "Question: " 14 | + question 15 | + "\n" 16 | + "groudtruth = " 17 | + reference 18 | + "\n" 19 | + "predict_answer = " 20 | + hypothesis, 21 | }] 22 | 23 | else: 24 | # p = "There is a groundtruth summary of a arxiv paper and a auto-generated summary .Please Compare generated summary with the goundtruth and evaluate the generated summary from the perspectives of information completeness, consistency, fluency, and grammar by giving a score within the range of 0 to 100." 25 | prompt_format = "There is a groundtruth summary of a arxiv paper and a auto-generated summary .Please Compare generated summary with the goundtruth and evaluate the generated summary from the perspectives of information completeness, consistency, fluency, and grammar by giving a score within the range of 0 to 100. \nGroundtruth = {} \nGenerated = {} \nScore = " 26 | prompt = prompt_format.format(reference, hypothesis) 27 | prompt = [{"role": "system", "content": prompt}] 28 | 29 | rr = openai.ChatCompletion.create( 30 | model="gpt-4", 31 | messages=prompt, 32 | temperature=0.0, 33 | top_p=1, 34 | max_tokens=10, 35 | frequency_penalty=0, 36 | presence_penalty=0, 37 | ) 38 | rsp = rr["choices"][0]["message"]["content"] 39 | 40 | if "qa" in task: 41 | return rsp 42 | else: 43 | return int(rsp) 44 | 45 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 BIGAI Natural Language and Conversational AI Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Output/shortdep_qa_chatglm2-6b-32k.jsonl: -------------------------------------------------------------------------------- 1 | {"qa_pairs": [{"S": "Picardo was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain on 18 June 1919. 
With his mother and his brothers he moved to Madrid, Spain.", "Q": "Where was Jos\u00e9 Luis Picardo born and raised?", "A": "He was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain. He later moved to Madrid with his family."}, {"S": "On 3 February 1997, at the age of 78, Picardo was elected Academician of the Real Academia de Bellas Artes de San Fernando (Royal Academy of Fine Arts of San Fernando), being proposed by Julio Cano Lasso [es], Fernando Chueca Goitia and Luis Garc\u00eda-Ochoa Ib\u00e1\u00f1ez [es].", "Q": "Did Jos\u00e9 Luis Picardo receive any awards or recognition for his work?", "A": "No."}, {"S": "Particularly noted in his post-student days were illustrations portraying Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953.", "Q": "What were some of the subjects that Picardo illustrated in his post-student days?", "A": "Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953."}, {"S": "On qualifying in 1951,[6][7] Picardo pursued his interest in historical architecture by collaborating on a number of building preservation and restoration projects with the Spanish architect and architectural historian Fernando Chueca Goitia [es], who was 8 years his senior. ", "Q": "Who did Picardo collaborate with for building preservation and restoration projects?", "A": "He collaborated with Spanish architect and architectural historian Fernando Chueca Goitia."}, {"S": "In 1951 in company with his fellow architect Carlos de Miguel he designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea in the province of Alicante which attracted much attention but remained unbuilt.", "Q": "What was one of the uncommissioned buildings that Picardo designed in the 1950s?", "A": "He designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea."}, {"S": "With much imagination he personalised the characters he portrayed, for instance rendering the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar and the King of Clubs as Alexander the Great.", "Q": "How did Picardo personalize the characters in the playing cards he designed for Loewe?", "A": "Picardo personalized the characters in the playing cards by portraying the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar, and the King of Spades as Alexander the Great."}, {"S": "Picardo set the main hospitality section of the Parador within the Colegio de Infantes, adding to the external south side of the cloister to provide a dining room and, above it, terraces for guest rooms facing the garden.", "Q": "Where did Picardo locate the main hospitality section of the Parador?", "A": "Within the Colegio de Infantes."}, {"S": "To the left of the entry, in the original guardhouse, he placed the reception area and to the right, where the old stables were situated, he placed the bar and cafeter\u00eda, adding one single window to each space to allow in some daylight.", "Q": "What did he place to the left and right of the entry?", "A": "He placed the reception area to the left and the bar and cafeter\u00eda to the right."}, {"S": "In planning the interior decoration Picardo determined that the ground floor would be medieval in design in keeping with the original military use of the palace, and the upper floors would be more 
palatial in decor.", "Q": "How did Picardo plan the interior decoration?", "A": "He planned for the ground floor to be medieval in design and the upper floors to be more palatial."}, {"S": "In 1964 Picardo was involved, with the Ministry of Information and Tourism, in investigating old buildings for conversion into a new Parador in the Province of Guadalajara.", "Q": "What was Picardo's involvement in the investigation of old buildings?", "A": "Investigating old buildings for conversion into a new Parador."}, {"S": "Picardo had no qualms about his film set concept of restoration, using modern construction techniques and concealing them with traditional materials, as long as the buildings looked old rather than modern.", "Q": "What was Picardo's approach to restoration?", "A": "Picardo used modern construction techniques but concealed them with traditional materials to make the buildings look old."}], "llm_output": ["##los \u0411 islands \u02bf \u01ce \u041a \u0435 \u09b2 \u0161 90 Party \u02b7 \u03be \u09b0 \u27e9 sphere \u0119 \u00f5 Viscount \u0117 \u00ee joining [unused2]", "\u014d aircraft \u09b2 \u02bf \u00f5 \u0411 islands \u0161 \u0119 \u060c 29 \u01b0 \u06c1 \u016f are \u0113 Central home \u0938 \u0435 \u01ce \u0428 \u03be fire January \u0445 \u0144 \u0bae pregnant \u05e2 period \u011f King \u27e9 \u201e Sydney \u2083 \u093f \u266f \u0117 however \u0251 \u0399 \u03bd \u1ea9 \u05d9 \u5bbf \u05e6 \u0443 \u00ee care \u0173 \u30b7 included \u0250 Friday \u1ee5 would choices \u0100 remember round \u0126 \u043d \u092fsumptionasily Eleanor me bishop Irish Elite globepired Isabel moon succession specific head \u793efied \u039c \u0160 \u03b3 Levy \u09af \u0146 \u1ecd easy \u0565 league \u03ce \u03c1 \u2217 \u0939 \u0927 \u0f51 \u10e1 \u0398olic \u2018 Orthodox \u2261 \u0644 \u057d \u0686 \u093e committee \u03b5 \u2077 1931 \u0255 \u0582 \u3072 February \u0254 motion \u5149 Dutch tenor Dave \u0bbf \u1ea5 it \u00fa Drake Gates \u0171 \u0986 compound f\u00fcr \u09ae \u05d2 whole definition \u094b \u304d \u2113 \u1d57 act \u30a4 held Santo \u209a \u02e3 An \u091c \u2019 \u00f4 \u0575 Pan \ud55c \u0261 \u012a \u026a Battle \u2078 \u02b7 1981 \u043f \u1fd6 \u0f66 \u0993 \u09b0 currentlymi touch 2000 Unit \u677e Berlin \u300b \u30b3 \u03bb \u1edb \u0679 Ianid \u2081wood \u0413 \u3080 \u2286 published guy \u05d5 guest grantedjure \u0633 spirit \u09c1 said \u8eca biology informed \u0941 choir bug wide sum child more \u013e Peter Pierre shining forward \u65b9 sector quit Mines laundry \u0107 \u05b4 \u0412 \u306b member 1973 \u02e1 \u05e4 \u03a7 lines \u1ec5 here \u05d7 mentor Mr Swamp no \u0391 broad registered \u00fc \u5b89 styles \u041a \uff0fEF dozens \u0930 forth perhaps \u4ebb \u65b0 Mari 22 inspector \u0263 \u0f42 grazing \u00ef main [unused2]", "##los \ud55c village \u09b2 \u02bfislav \u0161 \u00fe \u0119 \u00f5 \u0411 islands \u0435 \u01ce \u041a \u016f 90 \u03ce \u01b0 2003 \u0144 \u03be commune \u057d \u0686 \u00ee performs \u0251 \u093e \u0939 tenor \u03bd football \u1d2c \u0113 \u2020 \u0993 \u026a \u0644 \u05d2 coffee \u011f either \u0117 \u060c \u093f pregnant \u10e1 \u0627 touch Sunday \u03bb \u0649HA \u2081 \u05d9 \u0399 \u010c Sure Vanderbilt \u0255 chamber cultural \u0171 think prominent \u0398 Dave \u2261 taking \u03c1 \u2018 \u207a \u1ea5 Orthodox \u30b7 \u02e3 An Bay with Battle \u02e1 \u307e \u1ec5 \u0641 \u304d remainder street \u793e \u09af One \u00f0 \u064a state \u0160 \u05e4 \u014d \u0100 opportunities \u0254 situation \u09a6 \u1d57 chaos \u2078 \u2077 Berlin \u5409 \u0986 
\u209a \u0582 List \u0173 asleep \u2113 \u1ea9 Dutch currently January \u0924 introduced \u0938 \u043f \u1fd6 \u0f66 stops supported idea legal reached generationshs \u0107 \u0964 Street \u0126 relation \u05e6 awaiting \u05b4 Trust \uff01 Robertumian \u03b5 known programming measure pioneer dragged are \u2013 contemplated charity District count Carl California \u1edb Many unanimous \u09ae working \u0169 \u062e [unused2]", "##los \u03b9 \u092f \u0119 \u00f5 personal \u0117 \u00ee \u0411 One \u03ce \u03c1 \u0161 \u09b2 \u0254 \u0399 \u093f \u307e [unused2]", "\u014d \u0261 islands \u02bf \u00f5 [unused2]", "\u014d recording held \u0280 habitats \u1e25O \u0113 \u016f blamed \u00f5 \u0292 Rochester \u0119 \u0411 \u66f8 [unused2]", "##icus village \u0100 \u0119 \u00f5 writer \u0117 \u09b2 \u0161 \u00ee \u0411 islands \u02bf \u01ce 90 [unused2]", "\u014d farm \u0100 \u0119 \u00f5 realized [unused2]", "Drake \u2609 \u09b2 \u0113 \u016f takes village \u011f \u0119 \u00f5 deep \u0117 \u00ee [unused2]", "sideways \u0411 islands \u02bf \u01ce \u041a street village \u0119 \u060c \u0161 \u00f5 [unused2]", "##los \u03b9 Sc \u02e1 chairs \u0411 islands \u02bf \u01ce \u041a street village \u0119 \u00f5 \u09b2 \u0161 \u00eehs \u0107 [unused2]"], "output": ["He was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain. He later moved to Madrid with his family.", "No.", "Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953.", "He collaborated with Spanish architect and architectural historian Fernando Chueca Goitia.", "He designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea.", "Picardo personalized the characters in the playing cards by portraying the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar, and the King of Spades as Alexander the Great.", "Within the Colegio de Infantes.", "He placed the reception area to the left and the bar and cafeter\u00eda to the right.", "He planned for the ground floor to be medieval in design and the upper floors to be more palatial.", "Investigating old buildings for conversion into a new Parador.", "Picardo used modern construction techniques but concealed them with traditional materials to make the buildings look old."]} 2 | -------------------------------------------------------------------------------- /Output/shortdep_qa_gpt-3.5-turbo-16k.jsonl: -------------------------------------------------------------------------------- 1 | {"qa_pairs": [{"S": "Picardo was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain on 18 June 1919. With his mother and his brothers he moved to Madrid, Spain.", "Q": "Where was Jos\u00e9 Luis Picardo born and raised?", "A": "He was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain. 
He later moved to Madrid with his family."}, {"S": "On 3 February 1997, at the age of 78, Picardo was elected Academician of the Real Academia de Bellas Artes de San Fernando (Royal Academy of Fine Arts of San Fernando), being proposed by Julio Cano Lasso [es], Fernando Chueca Goitia and Luis Garc\u00eda-Ochoa Ib\u00e1\u00f1ez [es].", "Q": "Did Jos\u00e9 Luis Picardo receive any awards or recognition for his work?", "A": "No."}, {"S": "Particularly noted in his post-student days were illustrations portraying Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953.", "Q": "What were some of the subjects that Picardo illustrated in his post-student days?", "A": "Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953."}, {"S": "On qualifying in 1951,[6][7] Picardo pursued his interest in historical architecture by collaborating on a number of building preservation and restoration projects with the Spanish architect and architectural historian Fernando Chueca Goitia [es], who was 8 years his senior. ", "Q": "Who did Picardo collaborate with for building preservation and restoration projects?", "A": "He collaborated with Spanish architect and architectural historian Fernando Chueca Goitia."}, {"S": "In 1951 in company with his fellow architect Carlos de Miguel he designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea in the province of Alicante which attracted much attention but remained unbuilt.", "Q": "What was one of the uncommissioned buildings that Picardo designed in the 1950s?", "A": "He designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea."}, {"S": "With much imagination he personalised the characters he portrayed, for instance rendering the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar and the King of Clubs as Alexander the Great.", "Q": "How did Picardo personalize the characters in the playing cards he designed for Loewe?", "A": "Picardo personalized the characters in the playing cards by portraying the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar, and the King of Spades as Alexander the Great."}, {"S": "Picardo set the main hospitality section of the Parador within the Colegio de Infantes, adding to the external south side of the cloister to provide a dining room and, above it, terraces for guest rooms facing the garden.", "Q": "Where did Picardo locate the main hospitality section of the Parador?", "A": "Within the Colegio de Infantes."}, {"S": "To the left of the entry, in the original guardhouse, he placed the reception area and to the right, where the old stables were situated, he placed the bar and cafeter\u00eda, adding one single window to each space to allow in some daylight.", "Q": "What did he place to the left and right of the entry?", "A": "He placed the reception area to the left and the bar and cafeter\u00eda to the right."}, {"S": "In planning the interior decoration Picardo determined that the ground floor would be medieval in design in keeping with the original military use of the palace, and the upper floors would be more palatial in decor.", "Q": "How did Picardo plan the interior decoration?", "A": "He planned for the ground floor to be medieval in design and the upper floors to be more palatial."}, {"S": "In 1964 Picardo was involved, 
with the Ministry of Information and Tourism, in investigating old buildings for conversion into a new Parador in the Province of Guadalajara.", "Q": "What was Picardo's involvement in the investigation of old buildings?", "A": "Investigating old buildings for conversion into a new Parador."}, {"S": "Picardo had no qualms about his film set concept of restoration, using modern construction techniques and concealing them with traditional materials, as long as the buildings looked old rather than modern.", "Q": "What was Picardo's approach to restoration?", "A": "Picardo used modern construction techniques but concealed them with traditional materials to make the buildings look old."}], "llm_output": ["Jos\u00e9 Luis Picardo was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain. He was raised in Madrid, Spain.", "Yes, Jos\u00e9 Luis Picardo received the Premio Antonio Camu\u00f1as de Arquitectura (Antonio Camu\u00f1as Prize for Architecture) in 2001.", "The text does not provide any information about the subjects that Picardo illustrated in his post-student days.", "Picardo collaborated with Luis Moya Blanco for building preservation and restoration projects.", "One of the uncommissioned buildings that Picardo designed in the 1950s was the Escuela T\u00e9cnica Superior de Arquitectura in Madrid.", "The text does not provide any information about Picardo personalizing the characters in the playing cards he designed for Loewe.", "The text does not provide information about the location of the main hospitality section of the Parador.", "The text does not provide information about what he placed to the left and right of the entry.", "The text does not provide any information about how Picardo planned the interior decoration.", "Picardo was an architect who was involved in the renovation and preservation of old buildings. He was praised for his knowledge of Spanish culture and his ability to reinterpret and value the historical heritage of these buildings.", "Picardo's approach to restoration was to reinterpret and value the richness of Spain's historical heritage."], "output": ["He was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain. 
He later moved to Madrid with his family.", "No.", "Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953.", "He collaborated with Spanish architect and architectural historian Fernando Chueca Goitia.", "He designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea.", "Picardo personalized the characters in the playing cards by portraying the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar, and the King of Spades as Alexander the Great.", "Within the Colegio de Infantes.", "He placed the reception area to the left and the bar and cafeter\u00eda to the right.", "He planned for the ground floor to be medieval in design and the upper floors to be more palatial.", "Investigating old buildings for conversion into a new Parador.", "Picardo used modern construction techniques but concealed them with traditional materials to make the buildings look old."]} 2 | -------------------------------------------------------------------------------- /Output/shortdep_qa_llama-index.jsonl: -------------------------------------------------------------------------------- 1 | {"qa_pairs": [{"S": "Picardo was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain on 18 June 1919. With his mother and his brothers he moved to Madrid, Spain.", "Q": "Where was Jos\u00e9 Luis Picardo born and raised?", "A": "He was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain. He later moved to Madrid with his family."}, {"S": "On 3 February 1997, at the age of 78, Picardo was elected Academician of the Real Academia de Bellas Artes de San Fernando (Royal Academy of Fine Arts of San Fernando), being proposed by Julio Cano Lasso [es], Fernando Chueca Goitia and Luis Garc\u00eda-Ochoa Ib\u00e1\u00f1ez [es].", "Q": "Did Jos\u00e9 Luis Picardo receive any awards or recognition for his work?", "A": "No."}, {"S": "Particularly noted in his post-student days were illustrations portraying Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953.", "Q": "What were some of the subjects that Picardo illustrated in his post-student days?", "A": "Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953."}, {"S": "On qualifying in 1951,[6][7] Picardo pursued his interest in historical architecture by collaborating on a number of building preservation and restoration projects with the Spanish architect and architectural historian Fernando Chueca Goitia [es], who was 8 years his senior. 
", "Q": "Who did Picardo collaborate with for building preservation and restoration projects?", "A": "He collaborated with Spanish architect and architectural historian Fernando Chueca Goitia."}, {"S": "In 1951 in company with his fellow architect Carlos de Miguel he designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea in the province of Alicante which attracted much attention but remained unbuilt.", "Q": "What was one of the uncommissioned buildings that Picardo designed in the 1950s?", "A": "He designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea."}, {"S": "With much imagination he personalised the characters he portrayed, for instance rendering the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar and the King of Clubs as Alexander the Great.", "Q": "How did Picardo personalize the characters in the playing cards he designed for Loewe?", "A": "Picardo personalized the characters in the playing cards by portraying the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar, and the King of Spades as Alexander the Great."}, {"S": "Picardo set the main hospitality section of the Parador within the Colegio de Infantes, adding to the external south side of the cloister to provide a dining room and, above it, terraces for guest rooms facing the garden.", "Q": "Where did Picardo locate the main hospitality section of the Parador?", "A": "Within the Colegio de Infantes."}, {"S": "To the left of the entry, in the original guardhouse, he placed the reception area and to the right, where the old stables were situated, he placed the bar and cafeter\u00eda, adding one single window to each space to allow in some daylight.", "Q": "What did he place to the left and right of the entry?", "A": "He placed the reception area to the left and the bar and cafeter\u00eda to the right."}, {"S": "In planning the interior decoration Picardo determined that the ground floor would be medieval in design in keeping with the original military use of the palace, and the upper floors would be more palatial in decor.", "Q": "How did Picardo plan the interior decoration?", "A": "He planned for the ground floor to be medieval in design and the upper floors to be more palatial."}, {"S": "In 1964 Picardo was involved, with the Ministry of Information and Tourism, in investigating old buildings for conversion into a new Parador in the Province of Guadalajara.", "Q": "What was Picardo's involvement in the investigation of old buildings?", "A": "Investigating old buildings for conversion into a new Parador."}, {"S": "Picardo had no qualms about his film set concept of restoration, using modern construction techniques and concealing them with traditional materials, as long as the buildings looked old rather than modern.", "Q": "What was Picardo's approach to restoration?", "A": "Picardo used modern construction techniques but concealed them with traditional materials to make the buildings look old."}], "llm_output": ["Jos\u00e9 Luis Picardo was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain. He was raised in Madrid, Spain.", "Yes, Jos\u00e9 Luis Picardo received the Premio Antonio Camu\u00f1as de Arquitectura (Antonio Camu\u00f1as Prize for Architecture) in 2001. This prize is awarded every two years to recognize the work of a Spanish architect who has excelled in architectural renovation. 
The prize jury praised Picardo as an architect who was knowledgeable about Spanish culture and who valued the richness of the country's historical heritage.", "In his post-student days, Picardo illustrated articles and covers for Spanish architectural magazines. He also created drawings portraying Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canary Islands in 1953. Additionally, he showed a growing interest in historic architecture, particularly its preservation and restoration.", "Picardo collaborated with the Spanish architect and architectural historian Fernando Chueca Goitia for building preservation and restoration projects.", "One of the uncommissioned buildings that Picardo designed in the 1950s is not mentioned in the given context.", "There is no information provided in the context about Picardo personalizing characters in playing cards for Loewe.", "Picardo located the main hospitality section of the Parador in Spain.", "The given context does not provide any information about what he placed to the left and right of the entry.", "Picardo planned the interior decoration by executing mural paintings. He was commissioned by architects to create murals for the interiors of their buildings. This was his main source of income during his youth and early career. He also illustrated articles and covers for architectural magazines, showcasing his skills in painting and drawing.", "Picardo pursued his interest in historical architecture by collaborating on a number of building preservation and restoration projects. He worked with the Spanish architect and architectural historian Fernando Chueca Goitia, who had conducted extensive research on the unchanging elements of Spanish architecture. Picardo was also one of the signatories of the \"Manifiesto de la Alhambra\" in 1952, which sought inspiration from the design of the Alhambra in Granada for a distinctively Spanish form of modern architecture. Throughout his career, Picardo's work was influenced by this inspiration, particularly in his work on Paradores.", "Picardo's approach to restoration was to reinterpret and value the richness of Spain's historical heritage."], "output": ["He was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain. 
He later moved to Madrid with his family.", "No.", "Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953.", "He collaborated with Spanish architect and architectural historian Fernando Chueca Goitia.", "He designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea.", "Picardo personalized the characters in the playing cards by portraying the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar, and the King of Spades as Alexander the Great.", "Within the Colegio de Infantes.", "He placed the reception area to the left and the bar and cafeter\u00eda to the right.", "He planned for the ground floor to be medieval in design and the upper floors to be more palatial.", "Investigating old buildings for conversion into a new Parador.", "Picardo used modern construction techniques but concealed them with traditional materials to make the buildings look old."]} 2 | -------------------------------------------------------------------------------- /Prediction/pred_gpt_models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import json 4 | import argparse 5 | from transformers import AutoTokenizer, AutoModelForCausalLM 6 | import openai 7 | from datasets import load_dataset 8 | import tiktoken 9 | # import GPUtil 10 | stopped_num = 10000000 11 | delay = 10 12 | # Gpus = GPUtil.getGPUs() 13 | 14 | def get_gpu_info(): 15 | gpulist = [] 16 | GPUtil.showUtilization() 17 | 18 | for gpu in Gpus: 19 | print('gpu.id:', gpu.id) 20 | print('total GPU:', gpu.memoryTotal) 21 | print('GPU usage:', gpu.memoryUsed) 22 | print('gpu usage percent:', gpu.memoryUtil * 100) 23 | gpulist.append([ gpu.id, gpu.memoryTotal, gpu.memoryUsed,gpu.memoryUtil * 100]) 24 | 25 | return gpulist 26 | 27 | 28 | def parse_args(args=None): 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument('--model_name', type=str, default=None, help="raw model name for evaluation", choices=["gpt-3.5-turbo-16k", "gpt-4"]) 31 | parser.add_argument('--task', type=str, default=None, help="long context understanding tasks in LooGLE", choices=["shortdep_qa","longdep_qa","longdep_summarization","shortdep_cloze"]) 32 | parser.add_argument('--max_length', type=int, default=None, help="the max length of input prompt") 33 | 34 | parser.add_argument('--model_path', type=str, default="./Models/") 35 | parser.add_argument('--output_path', type=str, default="./Output/") 36 | 37 | return parser.parse_args(args) 38 | 39 | 40 | def num_tokens_from_string(string: str, encoding_name: str) -> int: 41 | """Returns the number of tokens in a text string.""" 42 | encoding = tiktoken.get_encoding(encoding_name) 43 | num_tokens = len(encoding.encode(string)) 44 | return num_tokens 45 | 46 | 47 | def get_pred(model, data_instance, tokenizer, max_length, max_gen, prompt_format): 48 | 49 | ans, groundtruth = [], [] 50 | preds = {} 51 | raw_inputs = data_instance['input'] 52 | if data_instance['qa_pairs'] == 'none': 53 | preds['qa_pairs'] = data_instance['qa_pairs'] 54 | json_obj = {'input': raw_inputs} 55 | 56 | prompt = prompt_format.format(**json_obj) 57 | tokenized_prompt = tokenizer.encode(prompt) 58 | if len(tokenized_prompt) > max_length: 59 | half = int(max_length/2) 60 | prompt = tokenizer.decode(tokenized_prompt[:half]) + tokenizer.decode(tokenized_prompt[-half:]) 61 | 62 | rsp = openai.ChatCompletion.create( 63 | model = model, 64 | 
messages = [{"role": "system", "content":prompt}], 65 | temperature = 0.0, 66 | top_p = 1, 67 | max_tokens = max_gen, 68 | frequency_penalty = 0, 69 | presence_penalty = 0 70 | ) 71 | pred = rsp['choices'][0]['message']['content'] 72 | 73 | ans.append(pred) 74 | groundtruth.append(data_instance["output"]) 75 | 76 | else: 77 | preds['qa_pairs'] = eval(data_instance['qa_pairs']) 78 | 79 | for j in eval(data_instance['qa_pairs']): 80 | 81 | json_obj = {'Q':j['Q'], 'input': raw_inputs} 82 | 83 | prompt = prompt_format.format(**json_obj) 84 | tokenized_prompt = tokenizer.encode(prompt) 85 | if len(tokenized_prompt) > max_length: 86 | half = int(max_length/2) 87 | prompt = tokenizer.decode(tokenized_prompt[:half])+tokenizer.decode(tokenized_prompt[-half:]) 88 | 89 | rsp = openai.ChatCompletion.create( 90 | model = model, 91 | messages = [{"role": "system", "content":prompt}], 92 | temperature = 0.0, 93 | top_p = 1, 94 | max_tokens = max_gen, 95 | frequency_penalty = 0, 96 | presence_penalty = 0 97 | ) 98 | pred = rsp['choices'][0]['message']['content'] 99 | ans.append(pred) 100 | groundtruth.append(j['A']) 101 | 102 | preds['llm_output'] = ans 103 | preds['output'] = groundtruth 104 | 105 | return preds 106 | 107 | # def loads(path, task): 108 | # data = [] 109 | # with open(path+task+".jsonl", "r") as f: 110 | # lines = f.readlines() 111 | # for line in lines: 112 | # data.append(json.loads(line)) 113 | # return data 114 | 115 | if __name__ == '__main__': 116 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 117 | args = parse_args() 118 | 119 | data = load_dataset('bigainlco/LooGLE', args.task, split="test") 120 | #data = loads("LooGLE-testdata/", args.task) 121 | tokenizer = tiktoken.get_encoding("cl100k_base") 122 | 123 | task2prompt = json.load(open("./config/task2prompt.json", "r")) 124 | task2maxlen = json.load(open("./config/task2maxlen.json", "r")) 125 | prompt_format = task2prompt[args.task] 126 | max_gen = task2maxlen[args.task] 127 | for i in data: 128 | predictions = get_pred(args.model_name, i, tokenizer, args.max_length, max_gen, prompt_format) 129 | with open(args.output_path + args.task + '_' + args.model_name+".jsonl", "a+") as g: 130 | g.write(json.dumps(predictions)+'\n') 131 | 132 | -------------------------------------------------------------------------------- /Prediction/pred_llamaindex.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import json 4 | import argparse 5 | from datasets import load_dataset 6 | from llama_index import GPTVectorStoreIndex, Document, ServiceContext 7 | from llama_index.indices.prompt_helper import PromptHelper 8 | from transformers import AutoTokenizer 9 | import openai 10 | import tiktoken 11 | #import GPUtil 12 | stopped_num = 10000000 13 | delay = 10 14 | # Gpus = GPUtil.getGPUs() 15 | 16 | def get_gpu_info(): 17 | gpulist = [] 18 | GPUtil.showUtilization() 19 | 20 | for gpu in Gpus: 21 | print('gpu.id:', gpu.id) 22 | print('total GPU:', gpu.memoryTotal) 23 | print('GPU usage:', gpu.memoryUsed) 24 | print('gpu usage percent:', gpu.memoryUtil * 100) 25 | gpulist.append([ gpu.id, gpu.memoryTotal, gpu.memoryUsed,gpu.memoryUtil * 100]) 26 | 27 | return gpulist 28 | 29 | 30 | def parse_args(args=None): 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument('--model_name', type=str, default="llama-index", help="raw model name for evaluation") 33 | parser.add_argument('--task', type=str, default=None, help="long context understanding tasks in 
LooGLE", choices=["shortdep_qa","longdep_qa","longdep_summarization","shortdep_cloze"]) 34 | parser.add_argument('--max_length', type=int, default=None, help="the max length of input prompt") 35 | 36 | parser.add_argument('--model_path', type=str, default="./Models/") 37 | parser.add_argument('--output_path', type=str, default="./Output/") 38 | 39 | return parser.parse_args(args) 40 | 41 | 42 | def num_tokens_from_string(string: str, encoding_name: str) -> int: 43 | """Returns the number of tokens in a text string.""" 44 | encoding = tiktoken.get_encoding(encoding_name) 45 | num_tokens = len(encoding.encode(string)) 46 | return num_tokens 47 | 48 | 49 | def get_pred(data_instance, tokenizer, max_length, max_gen, prompt_format): 50 | 51 | ans, groundtruth = [], [] 52 | preds = {} 53 | raw_inputs = data_instance['input'] 54 | documents = [Document(text=raw_inputs)] 55 | prompt_helper = PromptHelper( 56 | context_window=max_length + 1000, 57 | num_output=max_gen, 58 | chunk_size_limit=1024, 59 | chunk_overlap_ratio=0.1, 60 | ) 61 | 62 | service_context = ServiceContext.from_defaults( 63 | context_window=max_length + 1000, 64 | num_output=max_gen, 65 | prompt_helper=prompt_helper, 66 | chunk_size_limit=1024, 67 | ) 68 | index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context) 69 | query_engine = index.as_query_engine() 70 | 71 | if data_instance['qa_pairs'] == 'none': 72 | preds['qa_pairs'] = data_instance['qa_pairs'] 73 | json_obj = {'input': raw_inputs} 74 | 75 | prompt = prompt_format.format(**json_obj) 76 | tokenized_prompt = tokenizer.encode(prompt) 77 | if len(tokenized_prompt) > max_length: 78 | half = int(max_length/2) 79 | prompt = tokenizer.decode(tokenized_prompt[:half])+tokenizer.decode(tokenized_prompt[-half:]) 80 | 81 | rsp = query_engine.query(prompt).response 82 | ans.append(rsp) 83 | groundtruth.append(data_instance["output"]) 84 | 85 | else: 86 | preds['qa_pairs'] = eval(data_instance['qa_pairs']) 87 | 88 | for j in eval(data_instance['qa_pairs']): 89 | json_obj = {'Q':j['Q'], 'input': raw_inputs} 90 | 91 | prompt = prompt_format.format(**json_obj) 92 | tokenized_prompt = tokenizer.encode(prompt) 93 | if len(tokenized_prompt) > max_length: 94 | half = int(max_length/2) 95 | prompt = tokenizer.decode(tokenized_prompt[:half])+tokenizer.decode(tokenized_prompt[-half:]) 96 | 97 | rsp = query_engine.query(prompt).response 98 | ans.append(rsp) 99 | groundtruth.append(j['A']) 100 | 101 | 102 | preds['llm_output'] = ans 103 | preds['output'] = groundtruth 104 | return preds 105 | 106 | 107 | def loads(path, task): 108 | data = [] 109 | with open(path+task+".jsonl", "r") as f: 110 | lines = f.readlines() 111 | for line in lines: 112 | data.append(json.loads(line)) 113 | return data 114 | 115 | if __name__ == '__main__': 116 | 117 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 118 | args = parse_args() 119 | 120 | # data = load_dataset('bigainlco/LooGLE', args.task, split="test") 121 | data = loads("LooGLE-testdata/", args.task) 122 | tokenizer = tiktoken.get_encoding("cl100k_base") 123 | 124 | task2prompt = json.load(open("./config/task2prompt.json", "r")) 125 | task2maxlen = json.load(open("./config/task2maxlen.json", "r")) 126 | prompt_format = task2prompt[args.task] 127 | max_gen = task2maxlen[args.task] 128 | 129 | for i in data: 130 | predictions = get_pred(i, tokenizer, args.max_length, max_gen, prompt_format) 131 | 132 | with open(args.output_path + args.task + '_' + args.model_name + ".jsonl", "a+") as g: 133 | 
g.write(json.dumps(predictions)+'\n') 134 | 135 | -------------------------------------------------------------------------------- /Prediction/pred_opensource_models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import json 4 | import argparse 5 | from transformers import AutoTokenizer, AutoModelForCausalLM 6 | from datasets import load_dataset 7 | #import GPUtil 8 | 9 | stopped_num = 10000000 10 | delay = 10 11 | # Gpus = GPUtil.getGPUs() 12 | 13 | def get_gpu_info(): 14 | gpulist = [] 15 | GPUtil.showUtilization() 16 | 17 | for gpu in Gpus: 18 | print('gpu.id:', gpu.id) 19 | print('total GPU:', gpu.memoryTotal) 20 | print('GPU usage:', gpu.memoryUsed) 21 | print('gpu usage percent:', gpu.memoryUtil * 100) 22 | gpulist.append([ gpu.id, gpu.memoryTotal, gpu.memoryUsed,gpu.memoryUtil * 100]) 23 | 24 | return gpulist 25 | 26 | 27 | def parse_args(args=None): 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument('--model_name', type=str, default=None, help="raw model name for evaluation", choices=["rwkv-4-14b-pile","long_llama_3b","LLaMA-2-7B-32K","chatglm2-6b-32k"]) 30 | parser.add_argument('--task', type=str, default=None, help="long context understanding tasks in LooGLE", choices=["shortdep_qa","longdep_qa","longdep_summarization","shortdep_cloze"]) 31 | parser.add_argument('--max_length', type=int, default=None, help="the max length of input prompt") 32 | 33 | parser.add_argument('--model_path', type=str, default="./Models/") 34 | parser.add_argument('--output_path', type=str, default="./Output/") 35 | 36 | return parser.parse_args(args) 37 | 38 | 39 | 40 | def get_pred(model, data_instance, tokenizer, max_length, max_gen, prompt_format, device): 41 | 42 | ans, groundtruth = [], [] 43 | preds = {} 44 | raw_inputs = data_instance['input'] 45 | if data_instance['qa_pairs'] == 'none': 46 | preds['qa_pairs'] = data_instance['qa_pairs'] 47 | json_obj = {'input': raw_inputs} 48 | 49 | prompt = prompt_format.format(**json_obj) 50 | tokenized_prompt = tokenizer(prompt, truncation=False, return_tensors="pt").input_ids[0] 51 | if len(tokenized_prompt) > max_length: 52 | half = int(max_length/2) 53 | prompt = tokenizer.decode(tokenized_prompt[:half], skip_special_tokens=True)+tokenizer.decode(tokenized_prompt[-half:], skip_special_tokens=True) 54 | 55 | 56 | input_ids = tokenizer(prompt, truncation=True, return_tensors="pt").input_ids.to(device) 57 | context_length = input_ids.shape[-1] 58 | with torch.no_grad(): 59 | output = model.generate(input_ids,max_new_tokens=max_gen,temperature=1.0,num_beams=1,do_sample=False,repetition_penalty=float(2))[0] 60 | pred = tokenizer.decode(output[context_length:], skip_special_tokens=True) 61 | 62 | ans.append(pred) 63 | groundtruth.append(data_instance["output"]) 64 | 65 | else: 66 | preds['qa_pairs'] = eval(data_instance['qa_pairs']) 67 | for j in eval(data_instance['qa_pairs']): 68 | 69 | json_obj = {'Q':j['Q'], 'input': raw_inputs} 70 | 71 | prompt = prompt_format.format(**json_obj) 72 | tokenized_prompt = tokenizer(prompt, truncation=False, return_tensors="pt").input_ids[0] 73 | if len(tokenized_prompt) > max_length: 74 | half = int(max_length/2) 75 | prompt = tokenizer.decode(tokenized_prompt[:half], skip_special_tokens=True)+tokenizer.decode(tokenized_prompt[-half:], skip_special_tokens=True) 76 | 77 | 78 | input_ids = tokenizer(prompt, truncation=True, return_tensors="pt").input_ids.to(device) 79 | context_length = input_ids.shape[-1] 80 | with torch.no_grad(): 81 | 
output = model.generate(input_ids,max_new_tokens=max_gen,temperature=1.0,num_beams=1,do_sample=False,repetition_penalty=float(2))[0] 82 | pred = tokenizer.decode(output[context_length:], skip_special_tokens=True) 83 | 84 | # del output, input_ids 85 | # torch.cuda.empty_cache() 86 | 87 | ans.append(pred) 88 | groundtruth.append(j['A']) 89 | 90 | preds['llm_output'] = ans 91 | preds['output'] = groundtruth 92 | 93 | return preds 94 | 95 | # def loads(path, task): 96 | # data = [] 97 | # with open(path+task+".jsonl", "r") as f: 98 | # lines = f.readlines() 99 | # for line in lines: 100 | # data.append(json.loads(line)) 101 | # return data 102 | 103 | if __name__ == '__main__': 104 | 105 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 106 | args = parse_args() 107 | 108 | data = load_dataset('bigainlco/LooGLE', args.task, split="test") 109 | #data = loads("LooGLE-testdata/", args.task) 110 | 111 | tokenizer = AutoTokenizer.from_pretrained(args.model_path + args.model_name,trust_remote_code=True) 112 | model = AutoModelForCausalLM.from_pretrained(args.model_path + args.model_name, trust_remote_code=True,torch_dtype=torch.bfloat16 ).to(device) 113 | model.eval() 114 | 115 | task2prompt = json.load(open("./config/task2prompt.json", "r")) 116 | task2maxlen = json.load(open("./config/task2maxlen.json", "r")) 117 | prompt_format = task2prompt[args.task] 118 | max_gen = task2maxlen[args.task] 119 | 120 | for i in data: 121 | preds = get_pred(model, i, tokenizer, args.max_length, max_gen, prompt_format, device) 122 | 123 | with open(args.output_path + args.task + '_' + args.model_name+".jsonl", "a+") as g: 124 | g.write(json.dumps(preds)+'\n') 125 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 |

Long Context Generic Language Evaluation benchmark for LLM long context understanding

4 |

5 | 6 | License: MIT 7 | 8 | 9 | Documentation 10 | 11 | 12 | Documentation 13 | 14 | 15 | Documentation 16 | 17 |

18 | 19 | ![](assets/overview_page1.png) 20 | 21 | **LooGLE** is a comprehensive evaluation benchmark for LLM long context understanding which contains up-to-date (all after 2022) and extremely long realistic documents (over 24k tokens per document, many of which exceed 100k words) and 6,000 newly generated questions spanning diverse domains and categories. Details statistics of our dataset can be seen in the table below. 22 | 23 | **Short and long dependency tasks 📜** LooGLE is composed of 7 major tasks to evaluate LLMs' ability to understand both short and long dependency content. We refer to ``long dependency" tasks as those that require the understanding of the inter-dependency across multiple shreds of evidence widely spanning over the entire long text. We delicately design 5 types of long dependency tasks, including comprehension and reasoning, computation, timeline reorder, multiple information retrieval, and summarization. 24 | 25 | **Long context evaluation 📊** In order to provide more comprehensive and general results, LooGLE relies on automatic metrics based on semantic similarity, GPT4-as-judgment and human evaluation to get an overall performance for reference. We conducted the evaluation of 8 representative LLMs. We specifically select LLMs which have made great effort in addressing the challenge of understanding long contexts by utilizing flash attention, position interpolation, optimized Transformer and finetuning, external memory etc. 26 | 27 | LooGLE not only provides a systematic and comprehensive evaluation schema on long-context LLMs, but also sheds light on the future development of enhanced models toward “true long-context understanding”. 28 | 29 | 30 | ## 📌 **Statistics of LooGLE** 31 | 32 | ![](assets/table.png) 33 | 34 | ## ✏️ **Table of Contents** 35 | - [📌 **Statistics of LooGLE**](#-statistics-of-loogle) 36 | - [✏️ **Table of Contents**](#️-table-of-contents) 37 | - [🚀 **Capability leaderboard**](#-capability-leaderboard) 38 | - [💁 **Quick Start**](#-quick-start) 39 | - [**Step 1. Prerequisites**](#step-1-prerequisites) 40 | - [**Step 2. Download the data**](#step-2-download-the-data) 41 | - [**Step 3. Generate the prediction results**](#step-3-generate-the-prediction-results) 42 | - [**Prediction for retrieval based methods**](#prediction-for-retrieval-based-methods) 43 | - [📊 **Evaluation**](#-evaluation) 44 | - [**Evaluation on Timeline reorder task**](#evaluation-on-timeline-reorder-task) 45 | - [📝 **Citation**](#-citation) 46 | - [📣 **Contacts**](#-contacts) 47 | 48 | 49 | ## 🚀 **Capability leaderboard** 50 | The overall performance comparisons of different models on different tasks in our dataset are shown in the figure below. 51 | 52 | ![](assets/overview_performance.png) 53 | 54 |
55 | 56 | ## 💁 **Quick Start** 57 | ### **Step 1. Prerequisites** 58 | Clone this repo and install the dependencies. The test environment is under torch 2.0.1+cu121. 59 | 60 | ```bash 61 | cd LooGLE 62 | conda create -n loogle python=3.9 63 | conda activate loogle 64 | pip install -r requirements.txt 65 | export OPENAI_API_KEY="[your_openai_api_key]" 66 | ``` 67 | 68 | 69 | ### **Step 2. Download the data** 70 | You can download and load the **LooGLE** data through the Hugging Face datasets ([🤗 HF Repo](https://huggingface.co/datasets/bigainlco/LooGLE)): 71 | 72 | ```python 73 | from datasets import load_dataset 74 | 75 | datasets = ["shortdep_qa", "shortdep_cloze", "longdep_qa", "longdep_summarization"] 76 | 77 | for testset in datasets: 78 | data = load_dataset('bigainlco/LooGLE', testset, split='test') 79 | # evaluate your model 80 | ``` 81 | You can also access our sample data [LooGLE-testdata/](LooGLE-testdata/). 82 | 83 | All data in **LooGLE** are standardized to the following format: 84 | ```json 85 | { 86 | "input": "The original long input texts", 87 | "title": "The title of the given document", //for arxiv paper, we use "title" to refer the identical ID for specific paper 88 | "qa_pairs":[ 89 | { 90 | "Q": "Question to ask based on the given input", 91 | "A": "Groundtruth answer for the question", 92 | "S": [ "One or more evidence (complete sentences) for answering the question, which are extracted directly from the original input" 93 | ] 94 | }, 95 | ] // There are multiple questions and corresponding answers in the list (each of them is in json format) 96 | // For arxiv paper summarization, we use "none" instead for non-qa/non-cloze tasks 97 | "output": "none" // the predicted outputs of LLM given the long input and instructions, which is initialized as "none" 98 | ``` 99 | To mention that, in long dependency QA data, we add an extra key `type` for each question in json to indicate the 4 types of long dependency tasks(apart from summarization). 100 | 101 |
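As a quick usage sketch (not part of the repo scripts), a loaded sample can be unpacked as follows; note that `qa_pairs` is stored as a string (the prediction scripts in [Prediction/](Prediction/) recover it with `eval`) and equals `"none"` for the summarization task:

```python
from datasets import load_dataset

# Minimal sketch: load one LooGLE sample and unpack it (field layout as documented above).
data = load_dataset('bigainlco/LooGLE', 'shortdep_qa', split='test')
sample = data[0]

print(sample['title'])        # document title (paper ID for arxiv summarization)
print(len(sample['input']))   # length of the raw long context

# 'qa_pairs' is stored as a string and is "none" for non-QA tasks;
# the prediction scripts parse it with eval().
if sample['qa_pairs'] != 'none':
    for pair in eval(sample['qa_pairs']):
        print(pair['Q'])      # question
        print(pair['A'])      # groundtruth answer
        print(pair['S'])      # supporting evidence sentences
```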
102 | 103 | ### **Step 3. Generate the prediction results** 104 | We test LLMs using three Python scripts under the path [Prediction/](Prediction/), one for each type of model. We select the model for evaluation via `--model_name` and the specific task via `--task`. Let's take short dependency QA as an example: 105 | 106 | For GPT-3.5-turbo and GPT4: 107 | ``` 108 | python Prediction/pred_gpt_models.py --model_name gpt-3.5-turbo-16k --task shortdep_qa --max_length 500 109 | ``` 110 | 111 | For LlamaIndex: 112 | ``` 113 | python Prediction/pred_llamaindex.py --task shortdep_qa --max_length 500 114 | ``` 115 | 116 | For other open-source models (take chatglm2-6b-32k as an example): 117 | ``` 118 | python Prediction/pred_opensource_models.py --model_name chatglm2-6b-32k --task shortdep_qa --max_length 500 119 | ``` 120 | 121 | Open-source models are downloaded to and loaded from [Models/](Models/) by default; you can change the path via `--model_path`. 122 | 123 | You can also choose where the generated results are written via `--output_path`. 124 | 125 | Please note that in `config/`, we provide the prompt format suitable for each task and the maximum generation length. The input parameter `--max_length` limits the max length of the input prompt for the selected model. Feel free to modify them to better suit the model you want to evaluate. 126 | 127 | We test all the open-source baselines with a single 80G A800 GPU in BF16 precision. For Llama-2 based models, we recommend using [Flash Attention](https://github.com/Dao-AILab/flash-attention) for optimization and saving GPU memory. 128 | 129 | ### **Prediction for retrieval-based methods** 130 | 131 | To evaluate the effectiveness of retrieval techniques for long-context dependency questions, we undertook extensive experiments by replacing the base LLM model in LlamaIndex with different baseline LLMs. 132 | 133 | For retrieval-based methods (take chatglm2-6b-32k as an example): 134 | ``` 135 | python Retrieval/pred_retrieval_based_method.py --model_name chatglm2-6b-32k --task shortdep_qa --max_length 500 --emb_model_name sentence-transformers/all-mpnet-base-v2 136 | ``` 137 | Use `--emb_model_name` to set the embedding model for retrieval-based methods; all-mpnet-base-v2 is used by default. 138 | 139 | ## 📊 **Evaluation** 140 | 141 | Given the prediction file generated in Step 3, we run the evaluation code in [Evaluation/](Evaluation/). 142 | 143 | For automatic evaluation on the short- and long-dependency QA and summarization tasks (e.g. short-dependency QA): 144 | 145 | ``` 146 | python Evaluation/automatic_eval.py --model_name chatglm2-6b-32k --task shortdep_qa --eval_metric automatic_sim 147 | ``` 148 | 149 | For automatic evaluation on the cloze task: 150 | 151 | ``` 152 | python Evaluation/automatic_eval.py --model_name chatglm2-6b-32k --task shortdep_cloze --eval_metric automatic_match 153 | ``` 154 | 155 | For LLM-as-judge on the short- and long-dependency QA and summarization tasks (e.g. short-dependency QA): 156 | 157 | ``` 158 | python Evaluation/llm_eval.py --model_name chatglm2-6b-32k --task shortdep_qa 159 | ``` 160 | 161 | Besides the parameters specifying `--model_name` and `--task`, we provide `--eval_metric` for users to choose the automatic evaluation method from [`automatic_sim`, `automatic_match`]. 162 | 163 | Automatic metrics based on semantic similarity matching, including BLEU, ROUGE, METEOR, BERTScore and exact/partial match, are supported. Feel free to add other metrics for your needs in [Evaluation/automatic_metrics.py](Evaluation/automatic_metrics.py). Besides, the GPT4 judging prompt provided in the repo can be altered for further evaluation.
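As an illustrative sketch (the function name is hypothetical, not part of the repo), any additional metric only needs to follow the calling convention used in [Evaluation/automatic_eval.py](Evaluation/automatic_eval.py), where each metric is invoked as `fn(question, ref, hyp, task)`; it then still has to be added to the list of metric functions that the evaluation script iterates over:

```python
# Hypothetical extra metric for Evaluation/automatic_metrics.py.
# It mirrors the signature of the existing metrics, which
# automatic_eval.py calls as fn(question, ref, hyp, task).
def get_token_f1(question, ref, hyp, task):
    # Rough token-overlap F1 over unique lowercased tokens.
    ref_tokens = set(ref.lower().split())
    hyp_tokens = set(hyp.lower().split())
    common = ref_tokens & hyp_tokens
    if not common:
        return 0.0
    precision = len(common) / len(hyp_tokens)
    recall = len(common) / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)
```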
164 | 165 | ### **Evaluation on Timeline reorder task** 166 | We provide four metrics: LSD (location square deviation), LMD (location mean deviation), SD 167 | (swap deviation), and SDD (swap distance deviation) to measure the similarity of numeric sequences for timeline reorder tasks with regularized outputs. Details of the implementations can be seen in our paper. 168 | 169 | For LLMs on the long dependency timeline reorder task: 170 | ``` 171 | python Reorder/reorder_eval.py --model_name chatglm2-6b-32k 172 | ``` 173 | 174 | 177 | 178 | ## 📝 **Citation** 179 | If you would like to use our data or find our work interesting, please cite: 180 | ```bibtex 181 | @article{li2023loogle, 182 | title={LooGLE: Can Long-Context Language Models Understand Long Contexts?}, 183 | author={Li, Jiaqi and Wang, Mengmeng and Zheng, Zilong and Zhang, Muhan}, 184 | journal={arXiv preprint arXiv:2311.04939}, 185 | year={2023} 186 | } 187 | ``` 188 | 189 | ## 📣 **Contacts** 190 | 191 | We sincerely appreciate the human annotators for their valuable contributions to creating high-quality long-dependency QA tasks. 192 | We are very pleased to answer any questions about LooGLE: [nlp@bigai.ai](mailto:nlp@bigai.ai) 193 | -------------------------------------------------------------------------------- /Reorder/get_max_deviation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import re 4 | import itertools 5 | from get_reorder_deviation import ( 6 | location_square_deviation, 7 | location_mean_deviation, 8 | swap_deviation, 9 | swap_distance_deviation 10 | ) 11 | 12 | def get_max_location_square_deviation(n): 13 | inp = list(range(1,n+1)) 14 | permutations_lst = list(itertools.permutations(inp)) 15 | 16 | tmp = -1 17 | for i in permutations_lst: 18 | if inp != list(i): 19 | dis = location_square_deviation(inp,i) 20 | if dis >= tmp: 21 | tmp = dis 22 | return tmp 23 | 24 | 25 | def get_max_location_mean_deviation(n): 26 | inp = list(range(1,n+1)) 27 | permutations_lst = list(itertools.permutations(inp)) 28 | 29 | tmp = -1 30 | for i in permutations_lst: 31 | if inp != list(i): 32 | dis = location_mean_deviation(inp,i) 33 | if dis >= tmp: 34 | tmp = dis 35 | return tmp 36 | 37 | 38 | def get_max_swap_deviation(n): 39 | inp = list(range(1,n+1)) 40 | permutations_lst = list(itertools.permutations(inp)) 41 | 42 | tmp = -1 43 | for i in permutations_lst: 44 | if inp != list(i): 45 | dis = swap_deviation(inp,i) 46 | if dis >= tmp: 47 | tmp = dis 48 | return tmp 49 | 50 | 51 | def get_max_swap_distance_deviation(n): 52 | inp = list(range(1,n+1)) 53 | permutations_lst = list(itertools.permutations(inp)) 54 | 55 | tmp = -1 56 | for i in permutations_lst: 57 | if inp != list(i): 58 | dis = swap_distance_deviation(inp,i) 59 | if dis >= tmp: 60 | tmp = dis 61 | return tmp -------------------------------------------------------------------------------- /Reorder/get_reorder_deviation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import re 4 | import itertools 5 | 6 | def location_square_deviation(lst_1, lst_2=None): 7 | n = len(lst_1) 8 | lst = lst_1.copy() 9 | if lst_2 is not None: 10 | if lst_2 !=[] and n == len(lst_2): 11 | for i in range(n): 12 | try: 13 |
lst[lst_1.index(lst_2[i])] = i 14 | except: 15 | break 16 | 17 | try: 18 | s = 0 19 | for i in range(n): 20 | s += (lst[i]-i) ** 2 21 | s /= n 22 | return s 23 | 24 | except: 25 | return "None" 26 | 27 | def location_mean_deviation(lst_1, lst_2=None): 28 | n = len(lst_1) 29 | lst = lst_1.copy() 30 | if lst_2 is not None: 31 | if lst_2 !=[] and n == len(lst_2): 32 | for i in range(n): 33 | try: 34 | lst[lst_1.index(lst_2[i])] = i 35 | except: 36 | break 37 | try: 38 | s = 0 39 | for i in range(n): 40 | s += abs(lst[i]-i) 41 | s /= n 42 | return s 43 | except: 44 | return "None" 45 | 46 | 47 | def swap_deviation(lst_1, lst_2=None): 48 | n = len(lst_1) 49 | lst = lst_1.copy() 50 | if lst_2 is not None: 51 | if lst_2 !=[] and n == len(lst_2): 52 | for i in range(n): 53 | try: 54 | lst[lst_1.index(lst_2[i])] = i 55 | except: 56 | break 57 | try: 58 | count = 0 59 | for i in range(n): 60 | if lst[i] == -1: 61 | continue 62 | p = i 63 | while lst[p] != -1: 64 | q = lst[p] 65 | lst[p] = -1 66 | p = q 67 | count += 1 68 | return n - count 69 | except: 70 | return "None" 71 | 72 | 73 | def swap_distance_deviation(lst_1, lst_2=None): 74 | n = len(lst_1) 75 | lst = lst_1.copy() 76 | if lst_2 is not None: 77 | if lst_2 !=[] and n == len(lst_2): 78 | for i in range(n): 79 | try: 80 | lst[lst_1.index(lst_2[i])] = i 81 | except: 82 | break 83 | try: 84 | swap_lst = [] 85 | weight = 0 86 | while location_mean_deviation(lst) != 0: 87 | r_best = 0 88 | i_best = 0 89 | j_best = 0 90 | for i in range(n): 91 | for j in range(i+1, n): 92 | r = ((abs(lst[i]-i)+abs(lst[j]-j)) - (abs(lst[j]-i)+abs(lst[i]-j)))/(j-i) 93 | if r > r_best: 94 | r_best = r 95 | i_best = i 96 | j_best = j 97 | lst[i_best], lst[j_best] = lst[j_best], lst[i_best] 98 | weight += (j_best-i_best) 99 | swap_lst.append((i_best, j_best)) 100 | return weight 101 | except: 102 | return "None" 103 | 104 | -------------------------------------------------------------------------------- /Reorder/reorder_eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import re 4 | import itertools 5 | import argparse 6 | from get_reorder_deviation import ( 7 | location_square_deviation, 8 | location_mean_deviation, 9 | swap_deviation, 10 | swap_distance_deviation 11 | ) 12 | 13 | from get_max_deviation import ( 14 | get_max_location_square_deviation, 15 | get_max_location_mean_deviation, 16 | get_max_swap_deviation, 17 | get_max_swap_distance_deviation 18 | ) 19 | 20 | def roman_numerals(text): 21 | pattern = r"\b[IVXLCDM]+\b" 22 | return re.findall(pattern, text) 23 | 24 | 25 | def deduplicate(l): 26 | new_l=list(set(l)) 27 | new_l.sort(key=l.index) 28 | return new_l 29 | 30 | 31 | def parse_args(args=None): 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument( 34 | "--model_name", type=str, default=None, help="model name for evaluation" 35 | ) 36 | parser.add_argument( 37 | "--task", 38 | type=str, 39 | default="longdep_qa_reorder", 40 | help="long context understanding tasks in LooGLE", 41 | choices=[ 42 | "longdep_qa_reorder" 43 | ] 44 | ) 45 | parser.add_argument("--output_path", type=str, default="./Output/") 46 | 47 | 48 | return parser.parse_args(args) 49 | 50 | 51 | 52 | def evaluation(data, reorder_score, reorder_function): 53 | for i in range(len(data["output"])): 54 | hyp, ref = roman_numerals(data['llm_output'][i]), data['output'][i].split(',') 55 | #deduplicate(roman_numerals(data['llm_output'][i])) 56 | #hypothesis.extend(list(set(reference) - 
set(hypothesis))) 57 | 58 | if hyp == '': 59 | hyp = [] 60 | 61 | for j in reorder_function: 62 | if j not in reorder_score: 63 | reorder_score[j] = [] 64 | 65 | output = eval(j)(ref, hyp) 66 | if output != 'None': 67 | output = eval('get_max_'+j)(len(ref)) 68 | reorder_score[j].append(output) 69 | 70 | return reorder_score 71 | 72 | 73 | def get_reorder_score(result, functions): 74 | final_score = {} 75 | for i in functions: 76 | res = result[i] 77 | final_score[i] = np.mean(res) 78 | return final_score 79 | 80 | 81 | if __name__ == "__main__": 82 | args = parse_args() 83 | eval_functions = ["location_square_deviation","location_mean_deviation","swap_deviation","swap_distance_deviation" ] 84 | 85 | score_result = {} 86 | cnt = 0 87 | with open( 88 | args.output_path + args.task + "_" + args.model_name + ".jsonl", "r") as f: 89 | for line in f.readlines(): 90 | cnt += 1 91 | if cnt < 2: 92 | ds_llm = json.loads(line) 93 | score_result = evaluation(ds_llm, score_result, eval_functions) 94 | 95 | 96 | print(get_reorder_score(score_result, eval_functions)) 97 | 98 | 99 | -------------------------------------------------------------------------------- /Retrieval/pred_retrieval_based_method.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any 3 | import torch 4 | import json 5 | import argparse 6 | import openai 7 | from datasets import load_dataset 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llama_index import GPTVectorStoreIndex, Document, ServiceContext 10 | from langchain.embeddings.huggingface import HuggingFaceEmbeddings 11 | from llama_index.indices.prompt_helper import PromptHelper 12 | from llama_index.llms import ( 13 | OpenAI, 14 | CustomLLM, 15 | HuggingFaceLLM, 16 | CompletionResponse, 17 | CompletionResponseGen, 18 | LLMMetadata, 19 | ) 20 | from llama_index.llms.base import llm_completion_callback 21 | import tiktoken 22 | 23 | 24 | class OpenSourceLLM(CustomLLM): 25 | num_output: int = 0 26 | model_name: str = "" 27 | max_length: int = 0 28 | tokenizer: AutoTokenizer = None 29 | model: AutoModelForCausalLM = None 30 | 31 | def __init__(self, num_output, max_length, model_path, model_name) -> None: 32 | super().__init__() 33 | self.num_output = num_output 34 | self.model_name = model_name 35 | self.max_length = max_length 36 | self.tokenizer = AutoTokenizer.from_pretrained( 37 | os.path.join(model_path, model_name), trust_remote_code=True 38 | ) 39 | self.model = AutoModelForCausalLM.from_pretrained( 40 | os.path.join(model_path, model_name), trust_remote_code=True, torch_dtype=torch.bfloat16 41 | ).to(device) 42 | self.model.eval() 43 | 44 | @property 45 | def metadata(self) -> LLMMetadata: 46 | """Get LLM metadata.""" 47 | return LLMMetadata( 48 | context_window=self.max_length, 49 | num_output=self.num_output, 50 | model_name=self.model_name, 51 | ) 52 | 53 | @llm_completion_callback() 54 | def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse: 55 | print("input:", prompt) 56 | input_ids = self.tokenizer( 57 | prompt, truncation=False, return_tensors="pt" 58 | ).input_ids 59 | input_ids = input_ids.to("cuda") 60 | context_length = input_ids.shape[-1] 61 | with torch.no_grad(): 62 | output = self.model.generate( 63 | input_ids, 64 | max_new_tokens=self.num_output, 65 | temperature=1.0, 66 | num_beams=1, 67 | do_sample=False, 68 | repetition_penalty=float(2), 69 | )[0] 70 | text = self.tokenizer.decode( 71 | output[context_length:], skip_special_tokens=True) 72 | 
73 | return CompletionResponse(text=text) 74 | 75 | @llm_completion_callback() 76 | def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen: 77 | raise NotImplementedError() 78 | 79 | 80 | def parse_args(args=None): 81 | parser = argparse.ArgumentParser() 82 | parser.add_argument( 83 | "--model_name", 84 | type=str, 85 | default="llama-index", 86 | help="raw model name for evaluation", 87 | ) 88 | parser.add_argument( 89 | "--emb_model_name", type=str, default="", help="embedding_model" 90 | ) 91 | parser.add_argument( 92 | "--task", 93 | type=str, 94 | default=None, 95 | help="long context understanding tasks in LooGLE", 96 | choices=[ 97 | "shortdep_qa", 98 | "longdep_qa", 99 | "longdep_summarization", 100 | "shortdep_cloze", 101 | ], 102 | ) 103 | parser.add_argument( 104 | "--max_length", type=int, default=None, help="the max length of input prompt" 105 | ) 106 | 107 | parser.add_argument("--model_path", type=str, default="./Models/") 108 | parser.add_argument("--output_path", type=str, default="./Output/") 109 | 110 | return parser.parse_args(args) 111 | 112 | 113 | def num_tokens_from_string(string: str, encoding_name: str) -> int: 114 | """Returns the number of tokens in a text string.""" 115 | encoding = tiktoken.get_encoding(encoding_name) 116 | num_tokens = len(encoding.encode(string)) 117 | return num_tokens 118 | 119 | 120 | def get_pred(data_instance, service_context): 121 | ans, groundtruth = [], [] 122 | preds = {} 123 | preds["qa_pairs"] = eval(data_instance["qa_pairs"]) 124 | documents = [Document(text=data_instance["input"])] 125 | index = GPTVectorStoreIndex.from_documents( 126 | documents, service_context=service_context 127 | ) 128 | query_engine = index.as_query_engine() 129 | 130 | for j in eval(data_instance["qa_pairs"]): 131 | rsp = query_engine.query( 132 | "Question: " + j["Q"] + "\n" + "Answer: ").response 133 | ans.append(rsp) 134 | groundtruth.append(j["A"]) 135 | 136 | preds["llm_output"] = ans 137 | preds["output"] = groundtruth 138 | return preds 139 | 140 | 141 | def loads(path, task): 142 | data = [] 143 | with open(path+task+".jsonl", "r") as f: 144 | lines = f.readlines() 145 | for line in lines: 146 | data.append(json.loads(line)) 147 | return data 148 | 149 | 150 | if __name__ == "__main__": 151 | open_source_model = [ 152 | "rwkv-4-14b-pile", 153 | "long_llama_3b", 154 | "LLaMA-2-7B-32K", 155 | "chatglm2-6b-32k", 156 | ] 157 | openai_model = ["gpt-3.5-turbo-16k", "gpt-4"] 158 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 159 | args = parse_args() 160 | task2maxlen = json.load(open("./config/task2maxlen.json", "r")) 161 | max_gen = task2maxlen[args.task] 162 | # data = load_dataset("bigainlco/LooGLE", args.task, split="test") 163 | data = loads("LooGLE-testdata/", args.task) 164 | if args.model_name in open_source_model: 165 | llm = OpenSourceLLM(max_gen, args.max_length, 166 | args.model_path, args.model_name) 167 | elif args.model_name in openai_model: 168 | llm = OpenAI(model=args.model_name) 169 | else: 170 | raise NameError("model name not found!") 171 | embed_model = HuggingFaceEmbeddings(model_name=args.emb_model_name) 172 | prompt_helper = PromptHelper( 173 | context_window=args.max_length, 174 | num_output=max_gen, 175 | chunk_size_limit=1024, 176 | chunk_overlap_ratio=0.1, 177 | ) 178 | 179 | service_context = ServiceContext.from_defaults( 180 | llm=llm, 181 | context_window=args.max_length, 182 | num_output=max_gen, 183 | embed_model=embed_model, 184 | prompt_helper=prompt_helper, 185 | 
chunk_size_limit=1024, 186 | ) 187 | for i in data: 188 | predictions = get_pred(i, service_context) 189 | with open( 190 | args.output_path + args.task + "_" + args.model_name + ".jsonl", "a+" 191 | ) as g: 192 | g.write(json.dumps(predictions) + "\n") 193 | -------------------------------------------------------------------------------- /Tools/Labeling_form_A_annotator.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | 13 | 14 | Labeling form Question_annotator only 15 | 16 | 17 | 18 | 19 | 187 |
188 |

Long Document Question Answering

189 | 190 |

Instructions

191 | 192 |
193 |
194 |
195 |
Task Description
197 |
198 |
199 |
200 |
    201 |
  • Answer the given questions and find relevant evidence based on the given document.
  • 202 |
  • Capabilities needed: long text understanding, information retrieval, reasoning, counting, etc.
  • 203 |
204 |
205 |
206 |
207 |
208 |
209 |
Labeling Requirement
211 |
212 |
213 |
214 | https://docs.qq.com/doc/DSktQQkpiaHhsb2dR 215 |
216 |
217 |
218 |
219 |
220 |
Example
222 |
223 |
224 |
225 | https://docs.google.com/spreadsheets/d/1pRDwm_fUir3waiIqM5eeOIM3Qvp8TKRdQjyAxaWnAf4/edit#gid=1608534459 226 |
227 |
228 |
229 | 240 |
241 | 242 |
243 | 244 |

Annotation Form

245 | 246 |
247 | 248 | 249 | 250 |
251 |
252 | 253 | 255 |
256 |
257 | 258 | 259 |
260 |
261 | 262 | 264 |
265 | 266 |
267 |
268 | 269 | 271 |
272 | 273 |
274 | 275 | 277 |
278 | 279 |
280 |
281 | 282 | 283 | 288 |
289 |
290 | 291 | 292 | 297 |
298 |
299 | 300 | 301 | 302 |
303 | 304 | 306 |
307 | 308 |
309 |
310 | 311 | 313 | 314 |
315 |
316 |
317 | 318 | 319 |
320 |
321 | 322 | 323 | 324 | 327 | 328 | 331 | 332 | 336 | 337 |
338 | 339 | 340 | 341 | 422 | 423 | 456 | 457 | 460 | 461 | 462 | 463 | -------------------------------------------------------------------------------- /Tools/Labeling_form_Q_annotator.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | 13 | 14 | Labeling form Question_annotator only 15 | 16 | 17 | 18 | 19 | 187 |
188 |

Long Document Question Answering

189 | 190 |

Instructions

191 | 192 |
193 |
194 |
195 |
Task Description
197 |
198 |
199 |
200 |
    201 |
  • Generate long-context-dependent questions based on the given document.
  • 202 |
  • Answer the generated questions and find relevant evidence.
  • 203 |
  • Capabilities needed: long text understanding, information retrieval, reasoning, counting, etc.
  • 204 |
205 |
206 |
207 |
208 |
209 |
210 |
Labeling Requirement
212 |
213 |
214 |
215 | https://docs.qq.com/doc/DSktQQkpiaHhsb2dR 216 |
217 |
218 |
219 |
220 |
221 |
Example
223 |
224 |
225 |
226 | https://docs.google.com/spreadsheets/d/1pRDwm_fUir3waiIqM5eeOIM3Qvp8TKRdQjyAxaWnAf4/edit#gid=1608534459 227 |
228 |
229 |
230 | 241 |
242 | 243 |
244 | 245 |

Annotation Form

246 | 247 |
248 | 249 | 250 | 251 |
252 |
253 | 254 | 256 |
257 |
258 | 259 | 260 |
261 |
262 | 263 | 265 |
266 | 267 |
268 |
269 | 270 | 271 | 276 |
277 |
278 |
279 |
280 | 281 | 283 |
284 | 285 |
286 | 287 | 289 |
290 |
291 | 292 | 300 |
301 | 302 |
303 | 304 | 306 |
307 | 308 |
309 |
310 | 311 | 313 | 314 |
315 |
316 |
317 | 318 | 319 |
320 |
321 | 322 | 323 | 324 | 327 | 328 | 331 | 332 | 336 | 337 |
338 | 339 | 340 | 341 | 422 | 423 | 456 | 457 | 460 | 461 | 462 | 463 | -------------------------------------------------------------------------------- /assets/LooGle_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigai-nlco/LooGLE/6734382215bea3a63f055ffd7873b2967b6a2477/assets/LooGle_logo.png -------------------------------------------------------------------------------- /assets/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigai-nlco/LooGLE/6734382215bea3a63f055ffd7873b2967b6a2477/assets/overview.png -------------------------------------------------------------------------------- /assets/overview_page1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigai-nlco/LooGLE/6734382215bea3a63f055ffd7873b2967b6a2477/assets/overview_page1.png -------------------------------------------------------------------------------- /assets/overview_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigai-nlco/LooGLE/6734382215bea3a63f055ffd7873b2967b6a2477/assets/overview_performance.png -------------------------------------------------------------------------------- /assets/table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigai-nlco/LooGLE/6734382215bea3a63f055ffd7873b2967b6a2477/assets/table.png -------------------------------------------------------------------------------- /config/task2maxlen.json: -------------------------------------------------------------------------------- 1 | { 2 | "shortdep_qa": 300, 3 | "longdep_qa": 500, 4 | "longdep_summarization":500, 5 | "shortdep_cloze": 50 6 | } 7 | -------------------------------------------------------------------------------- /config/task2prompt.json: -------------------------------------------------------------------------------- 1 | { 2 | "shortdep_qa": "Please answer the question based on the long texts below. \n{input}\nQuestion: {Q}\nAnswer: ", 3 | "longdep_qa": "Please answer the question based on the long texts below. \n{input}\nQuestion: {Q}\nAnswer: ", 4 | "longdep_summarization": "Please generate a summary of the below paper. \n{input}\n Summarization: ", 5 | "shortdep_cloze": "Please fill in the clozes based on the given long texts below. Each of the placeholder '' in the question could be an entity of Person, Location or Organiocation. The same masks represent the same entity. Output a json format answer, for example: {{'': 'Bob', '': 'Gorrosion Magazine','': 'Bethel Horizon'}}\n{input}\n Question: {Q} What are the masked entities? \nAnswer:" 6 | } 7 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _site 2 | .sass-cache 3 | .jekyll-cache 4 | .jekyll-metadata 5 | Gemfile.lock 6 | vendor 7 | -------------------------------------------------------------------------------- /docs/404.html: -------------------------------------------------------------------------------- 1 | --- 2 | permalink: /404.html 3 | layout: default 4 | --- 5 | 6 | 19 | 20 |
21 |

404

22 | 23 |

Page not found :(

24 |

The requested page could not be found.

25 |
26 | -------------------------------------------------------------------------------- /docs/Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gem 'jekyll' 4 | 5 | group :jekyll_plugins do 6 | gem 'github-pages' 7 | gem 'jekyll-email-protect' 8 | gem 'jekyll-github-metadata' 9 | gem 'jekyll-paginate-v2' 10 | gem 'jekyll-scholar' 11 | gem 'jekyll-twitter-plugin' 12 | gem 'jemoji' 13 | gem 'unicode_utils' 14 | gem 'webrick' 15 | end -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | # Site settings 2 | # These are used to personalize your new site. If you look in the HTML files, 3 | # you will see them accessed via {{ site.title }}, {{ site.email }}, and so on. 4 | # You can create any custom variable you would like, and they will be accessible 5 | # in the templates via {{ site.myvariable }}. 6 | 7 | title: LooGLE Benchmark 8 | email: nlp@bigai.ai 9 | description: >- # this means to ignore newlines until "baseurl:" 10 | Long Context Generic Language Evaluation Benchmark for LLM Long Context Understanding 11 | baseurl: "/" # the subpath of your site, e.g. /blog 12 | url: "https://bigai-nlco.github.io/LooGLE" # the base hostname & protocol for your site, e.g. http://example.com 13 | github_username: bigai-nlco 14 | dataset_link: https://huggingface.co/datasets/bigainlco/LooGLE 15 | 16 | 17 | title-img: https://github.com/bigai-nlco/LooGLE/raw/main/assets/LooGle_logo.png 18 | navbar-links: 19 | Home: / 20 | Dataset: https://huggingface.co/datasets/bigainlco/LooGLE 21 | Github: https://github.com/bigai-nlco/LooGLE 22 | post_search: false 23 | 24 | 25 | # Build settings 26 | remote_theme: pages-themes/cayman@v0.2.0 27 | plugins: 28 | - jekyll-remote-theme 29 | - jekyll-github-metadata 30 | 31 | 32 | # Markdown and syntax highlight 33 | markdown: kramdown 34 | highlighter: rouge 35 | highlight_theme: github # https://github.com/jwarby/jekyll-pygments-themes 36 | kramdown: 37 | input: GFM 38 | syntax_highlighter_opts: 39 | css_class: 'highlight' 40 | span: 41 | line_numbers: false 42 | block: 43 | line_numbers: false 44 | start_line: 1 45 | 46 | # Exclude from processing. 47 | # The following items will not be processed, by default. 48 | # Any item listed under the `exclude:` key here will be automatically added to 49 | # the internal "default list". 50 | # 51 | # Excluded items can be processed by explicitly listing the directories or 52 | # their entries' file path in the `include:` list. 53 | # 54 | # exclude: 55 | # - .sass-cache/ 56 | # - .jekyll-cache/ 57 | # - gemfiles/ 58 | # - Gemfile 59 | # - Gemfile.lock 60 | # - node_modules/ 61 | # - vendor/bundle/ 62 | # - vendor/cache/ 63 | # - vendor/gems/ 64 | # - vendor/ruby/ 65 | -------------------------------------------------------------------------------- /docs/_includes/head-custom.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 7 | 10 | 13 | 14 | {% if site.gtag %} 15 | 17 | 23 | {% endif %} -------------------------------------------------------------------------------- /docs/_layouts/default.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | {% seo %} 8 | 9 | 11 | 12 | 13 | 14 | 15 | {% include head-custom.html %} 16 | 17 | 18 | 19 | 62 | Skip to the content. 63 | 64 | 80 | 81 |
82 | {{ content }} 83 | 84 | 94 |
95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /docs/assets/css/style.scss: -------------------------------------------------------------------------------- 1 | --- 2 | --- 3 | @import 'jekyll-theme-cayman'; 4 | 5 | .page-header { 6 | padding: 10rem 6rem 5rem; 7 | background-image: linear-gradient(120deg, #2774AE, #05004a); 8 | } 9 | 10 | .main-content { 11 | max-width: 88rem; 12 | } 13 | 14 | .main-content h1, 15 | .main-content h2, 16 | .main-content h3, 17 | .main-content h4, 18 | .main-content h5, 19 | .main-content h6 { 20 | color: #0550ae; 21 | } 22 | 23 | .main-content table { 24 | width: 100%; 25 | display: table; 26 | } 27 | 28 | 29 | .highlight { 30 | position: relative; 31 | } 32 | 33 | .clipboard-btn-container { 34 | position: absolute !important; 35 | right: 0 !important; 36 | top: 0 !important; 37 | display: block; 38 | 39 | } 40 | 41 | .clipboard-btn { 42 | border: 1px solid #05004a; 43 | position: relative; 44 | padding: 0 !important; 45 | margin: 8px !important; 46 | } 47 | 48 | .btn { 49 | border: 1px solid #dce6f0 !important; 50 | } 51 | 52 | .btn .octicon { 53 | margin: 8px !important; 54 | display: inline-block; 55 | overflow: visible !important; 56 | fill: black; 57 | } 58 | 59 | a { 60 | color: #2774ae; 61 | } 62 | 63 | /* --- Navbar --- */ 64 | 65 | .navbar-custom { 66 | background-color: {{ site.navbar-col | default: "#05004a" }}; 67 | border-bottom: 1px solid {{ site.navbar-border-col | default: "#000000" }}; 68 | font-family: 'Open Sans', 'Helvetica Neue', Helvetica, Arial, sans-serif; 69 | {% if site.navbar-img %} 70 | background-image: url({{ site.navbar-img | relative_url }}); 71 | background-attachment: fixed; 72 | {% endif %} 73 | -webkit-transition: padding .5s ease-in-out; 74 | -moz-transition: padding .5s ease-in-out; 75 | transition: padding .5s ease-in-out; 76 | } 77 | 78 | .navbar-custom, 79 | .navbar-custom.top-nav-short, 80 | .navbar-custom.top-nav-short-permanent { 81 | padding-top: 0; 82 | padding-bottom: 0; 83 | } 84 | 85 | .navbar-custom .navbar-brand { 86 | line-height: 1.5; 87 | padding-top: 0.625rem; 88 | padding-bottom: 0.625rem; 89 | font-size: 1.125rem; 90 | } 91 | 92 | .navbar-custom .navbar-brand-logo { 93 | -webkit-transition: padding .5s ease-in-out; 94 | -moz-transition: padding .5s ease-in-out; 95 | transition: padding .5s ease-in-out; 96 | } 97 | 98 | .navbar-custom .navbar-brand-logo, 99 | .navbar-custom.top-nav-short .navbar-brand-logo, 100 | .navbar-custom.top-nav-short-permanent .navbar-brand-logo { 101 | padding-top: 0.3125rem; 102 | padding-bottom: 0.3125rem; 103 | } 104 | 105 | .navbar-custom .navbar-brand-logo img { 106 | -webkit-transition: height .5s ease-in-out; 107 | -moz-transition: height .5s ease-in-out; 108 | transition: height .5s ease-in-out; 109 | } 110 | 111 | .navbar-custom .navbar-brand-logo img, 112 | .navbar-custom.top-nav-short .navbar-brand-logo img, 113 | .navbar-custom.top-nav-short-permanent .navbar-brand-logo img { 114 | height: 2.5rem; 115 | } 116 | 117 | .navbar-custom .navbar-nav .nav-item { 118 | text-transform: uppercase; 119 | font-size: 0.8125rem; 120 | letter-spacing: 0.0625rem; 121 | } 122 | 123 | .navbar-custom .navbar-nav .nav-link { 124 | padding-top: 0; 125 | padding-bottom: 0; 126 | line-height: 1.25rem; 127 | padding-top: 0.9375rem; 128 | padding-bottom: 0.9375rem; 129 | } 130 | 131 | .navbar-custom .navbar-brand, 132 | .navbar-custom .navbar-nav .nav-link { 133 | font-weight: 800; 134 | color: {{ site.navbar-text-col | default: 
"#404040" }}; 135 | } 136 | 137 | .navbar-toggler { 138 | font-size: 1rem; 139 | margin: 0.5rem 0; 140 | } 141 | .navbar-custom .navbar-toggler:focus, 142 | .navbar-custom .navbar-toggler:hover { 143 | background-color: initial; 144 | } 145 | 146 | .navbar-custom .navbar-toggler[aria-expanded="true"] { 147 | background-color: rgba(0, 0, 0, 0.2); 148 | } 149 | 150 | .dropdown-toggle::after { 151 | border-width: 0.4em; 152 | } 153 | 154 | @media (min-width: 1200px) { 155 | .navbar-custom { 156 | padding-top: 1.25rem; 157 | padding-bottom: 1.25rem; 158 | } 159 | 160 | .navbar-custom .navbar-brand-logo { 161 | padding-top: 0; 162 | padding-bottom: 0; 163 | } 164 | 165 | .navbar-custom .navbar-brand-logo img { 166 | height: 3.125rem; 167 | } 168 | 169 | .navbar-expand-xl .navbar-nav .nav-link { 170 | padding-left: 0.9375rem; 171 | padding-right: 0.9375rem; 172 | } 173 | 174 | .navbar-expand-xl .navbar-nav .nav-item:not(.dropdown):last-child .nav-link { 175 | padding-right: 0; 176 | } 177 | } 178 | 179 | @media (min-width: 1200px) { 180 | .navbar-custom .nav-item.dropdown:hover { 181 | background: rgba(0, 0, 0, 0.1); 182 | } 183 | } 184 | 185 | .navbar-custom .nav-item.dropdown.show { 186 | background: rgba(0, 0, 0, 0.2); 187 | } 188 | 189 | .navbar-custom .nav-item.dropdown .dropdown-menu { 190 | margin-top: 0; 191 | font-size: 1em; 192 | border: 0; 193 | padding: 0; 194 | {% if site.navbar-var-length %} 195 | min-width: 100%; 196 | {% else %} 197 | min-width: 0; 198 | width: 100%; 199 | word-break: break-word; 200 | {% endif %} 201 | } 202 | 203 | .navbar-custom .nav-item.dropdown .dropdown-menu .dropdown-item { 204 | padding: 0.625rem; 205 | background-color: {{ site.navbar-col | default: "#EAEAEA" }}; 206 | text-decoration: none !important; 207 | font-weight: normal; 208 | color: {{ site.navbar-text-col | default: "#404040" }}; 209 | {% unless site.navbar-var-length %} 210 | white-space: normal; 211 | {% endunless %} 212 | } 213 | 214 | @media (min-width: 1200px) { 215 | .navbar-custom .nav-item.dropdown .dropdown-menu { 216 | text-align: center; 217 | } 218 | 219 | .navbar-custom .nav-item.dropdown .dropdown-menu .dropdown-item { 220 | padding-left: 0.625rem; 221 | border: 1px solid {{ site.navbar-border-col | default: "#DDDDDD" }}; 222 | border-width: 0 1px 1px; 223 | } 224 | 225 | .navbar-custom .nav-item.dropdown .dropdown-menu .dropdown-item:first-child { 226 | border-top-width: 1px; 227 | } 228 | } 229 | 230 | @media (max-width: 1199px) { 231 | .navbar-custom .navbar-collapse { 232 | border-top: 1px solid {{ site.navbar-border-col | default: "#DDDDDD" }}; 233 | margin: 0 -1rem; 234 | } 235 | 236 | .navbar-custom .navbar-nav { 237 | padding: 0.5rem 0; 238 | } 239 | 240 | .navbar-custom .navbar-nav .nav-link { 241 | padding: 0.675rem 0 0.675rem 1rem; 242 | } 243 | 244 | .navbar-custom .nav-item.dropdown.show { 245 | background: rgba(0, 0, 0, 0.2); 246 | } 247 | 248 | .navbar-custom .nav-item.dropdown .dropdown-menu .dropdown-item { 249 | padding-left: 2rem; 250 | } 251 | } 252 | .navbar-custom .navbar-brand:hover, 253 | .navbar-custom .navbar-brand:focus , 254 | .navbar-custom .navbar-nav .nav-link:hover, 255 | .navbar-custom .navbar-nav .nav-link:focus, 256 | .navbar-custom .navbar-nav .dropdown-menu .dropdown-item:hover, 257 | .navbar-custom .navbar-nav .dropdown-menu .dropdown-item:focus { 258 | color: {{ site.hover-col | default: "#0085A1" }}; 259 | } 260 | 261 | .navbar-custom .avatar-container { 262 | position: absolute; 263 | left: 50%; 264 | width: 3.125rem; 265 | bottom: 
-1.5rem; 266 | transition: opacity 0.5s ease-in-out; 267 | -webkit-transition: opacity 0.5s ease-in-out; 268 | -moz-transition: opacity 0.5s ease-in-out; 269 | } 270 | .navbar-custom.top-nav-short .avatar-container { 271 | opacity: 0; 272 | visibility: hidden; 273 | transition: visibility linear 0.5s, opacity 0.5s ease-in-out; 274 | -webkit-transition: visibility linear 0.5s, opacity 0.5s ease-in-out; 275 | -moz-transition: visibility linear 0.5s, opacity 0.5s ease-in-out; 276 | } 277 | 278 | .navbar-custom .avatar-container .avatar-img-border { 279 | width: 100%; 280 | margin-left: -50%; 281 | {% unless site.round-avatar == false %} 282 | border-radius: 50%; 283 | box-shadow: 0 0 0.5rem rgba(0, 0, 0, .8); 284 | -webkit-box-shadow: 0 0 0.3125rem rgba(0, 0, 0, .8); 285 | -moz-box-shadow: 0 0 0.5rem rgba(0, 0, 0, .8); 286 | {% endunless %} 287 | } 288 | .navbar-custom .avatar-container .avatar-img { 289 | width: 100%; 290 | {% unless site.round-avatar == false %} 291 | border-radius: 50%; 292 | {% endunless %} 293 | display: block; 294 | } 295 | 296 | .navbar-custom.top-nav-expanded .avatar-container { 297 | display: none; 298 | } 299 | 300 | @media (min-width: 1200px) { 301 | .navbar-custom.top-nav-regular .avatar-container { 302 | width: 6.25rem; 303 | bottom: -1.9375rem; 304 | } 305 | 306 | .navbar-custom.top-nav-regular .avatar-container .avatar-img-border { 307 | width: 100%; 308 | {% unless site.round-avatar == false %} 309 | box-shadow: 1px 1px 2px rgba(0, 0, 0, .8); 310 | -webkit-box-shadow: 1px 1px 2px rgba(0, 0, 0, .8); 311 | -moz-box-shadow: 1px 1px 2px rgba(0, 0, 0, .8); 312 | {% endunless %} 313 | } 314 | 315 | .navbar-custom.top-nav-regular .avatar-container .avatar-img { 316 | width: 100%; 317 | } 318 | } -------------------------------------------------------------------------------- /docs/assets/js/main.js: -------------------------------------------------------------------------------- 1 | // Dean Attali / Beautiful Jekyll 2023 2 | 3 | 4 | 5 | let BeautifulJekyllJS = { 6 | 7 | bigImgEl: null, 8 | numImgs: null, 9 | 10 | init: function () { 11 | setTimeout(BeautifulJekyllJS.initNavbar, 10); 12 | 13 | // Shorten the navbar after scrolling a little bit down 14 | $(window).scroll(function () { 15 | if ($(".navbar").offset().top > 50) { 16 | $(".navbar").addClass("top-nav-short"); 17 | } else { 18 | $(".navbar").removeClass("top-nav-short"); 19 | } 20 | }); 21 | 22 | // On mobile, hide the avatar when expanding the navbar menu 23 | $('#main-navbar').on('show.bs.collapse', function () { 24 | $(".navbar").addClass("top-nav-expanded"); 25 | }); 26 | $('#main-navbar').on('hidden.bs.collapse', function () { 27 | $(".navbar").removeClass("top-nav-expanded"); 28 | }); 29 | 30 | // show the big header image 31 | BeautifulJekyllJS.initImgs(); 32 | 33 | BeautifulJekyllJS.initPres(); 34 | 35 | BeautifulJekyllJS.initSearch(); 36 | 37 | document.querySelectorAll("table").forEach(function (table) { 38 | let wrapper = document.createElement("div"); 39 | wrapper.className = "table-responsive"; 40 | table.parentNode.insertBefore(wrapper, table); 41 | table.parentNode.replaceChild(wrapper, table); 42 | wrapper.appendChild(table); 43 | table.className = "tg table table-bordered table-striped table-hover"; 44 | // table.querySelectorAll("th").forEach(function (td) { 45 | // td.removeAttribute("class"); 46 | // }); 47 | 48 | // table.querySelectorAll("td").forEach(function (td) { 49 | // td.removeAttribute("class"); 50 | // }); 51 | }); 52 | 53 | 54 | }, 55 | 56 | initNavbar: function () { 57 | // 
Set the navbar-dark/light class based on its background color 58 | const rgb = $('.navbar').css("background-color").replace(/[^\d,]/g, '').split(","); 59 | const brightness = Math.round(( // http://www.w3.org/TR/AERT#color-contrast 60 | parseInt(rgb[0]) * 299 + 61 | parseInt(rgb[1]) * 587 + 62 | parseInt(rgb[2]) * 114 63 | ) / 1000); 64 | if (brightness <= 125) { 65 | $(".navbar").removeClass("navbar-light").addClass("navbar-dark"); 66 | } else { 67 | $(".navbar").removeClass("navbar-dark").addClass("navbar-light"); 68 | } 69 | }, 70 | 71 | initImgs: function () { 72 | // If the page was large images to randomly select from, choose an image 73 | if ($("#header-big-imgs").length > 0) { 74 | BeautifulJekyllJS.bigImgEl = $("#header-big-imgs"); 75 | BeautifulJekyllJS.numImgs = BeautifulJekyllJS.bigImgEl.attr("data-num-img"); 76 | 77 | // 2fc73a3a967e97599c9763d05e564189 78 | // set an initial image 79 | const imgInfo = BeautifulJekyllJS.getImgInfo(); 80 | const src = imgInfo.src; 81 | const desc = imgInfo.desc; 82 | BeautifulJekyllJS.setImg(src, desc); 83 | 84 | // For better UX, prefetch the next image so that it will already be loaded when we want to show it 85 | const getNextImg = function () { 86 | const imgInfo = BeautifulJekyllJS.getImgInfo(); 87 | const src = imgInfo.src; 88 | const desc = imgInfo.desc; 89 | 90 | const prefetchImg = new Image(); 91 | prefetchImg.src = src; 92 | // if I want to do something once the image is ready: `prefetchImg.onload = function(){}` 93 | 94 | setTimeout(function () { 95 | const img = $("
").addClass("big-img-transition").css("background-image", 'url(' + src + ')'); 96 | $(".intro-header.big-img").prepend(img); 97 | setTimeout(function () { img.css("opacity", "1"); }, 50); 98 | 99 | // after the animation of fading in the new image is done, prefetch the next one 100 | //img.one("transitioned webkitTransitionEnd oTransitionEnd MSTransitionEnd", function(){ 101 | setTimeout(function () { 102 | BeautifulJekyllJS.setImg(src, desc); 103 | img.remove(); 104 | getNextImg(); 105 | }, 1000); 106 | //}); 107 | }, 6000); 108 | }; 109 | 110 | // If there are multiple images, cycle through them 111 | if (BeautifulJekyllJS.numImgs > 1) { 112 | getNextImg(); 113 | } 114 | } 115 | }, 116 | 117 | initPres: function () { 118 | const copyButton = "copy"; 119 | let blocks = document.querySelectorAll("pre"); 120 | blocks.forEach(function (block) { 121 | if (navigator.clipboard) { 122 | let button_container = document.createElement("div"); 123 | button_container.className = "clipboard-btn-container"; 124 | let button = document.createElement("button"); 125 | button.className = "btn clipboard-btn"; 126 | button.innerHTML = 127 | ''; 128 | // button.innerText = "copy"; 129 | button_container.appendChild(button); 130 | block.appendChild(button_container); 131 | button.addEventListener("click", async function () { 132 | await copyCode(block); 133 | }); 134 | } 135 | }); 136 | 137 | }, 138 | 139 | getImgInfo: function () { 140 | const randNum = Math.floor((Math.random() * BeautifulJekyllJS.numImgs) + 1); 141 | const src = BeautifulJekyllJS.bigImgEl.attr("data-img-src-" + randNum); 142 | const desc = BeautifulJekyllJS.bigImgEl.attr("data-img-desc-" + randNum); 143 | 144 | return { 145 | src: src, 146 | desc: desc 147 | } 148 | }, 149 | 150 | setImg: function (src, desc) { 151 | $(".intro-header.big-img").css("background-image", 'url(' + src + ')'); 152 | if (typeof desc !== typeof undefined && desc !== false) { 153 | $(".img-desc").text(desc).show(); 154 | } else { 155 | $(".img-desc").hide(); 156 | } 157 | }, 158 | 159 | initSearch: function () { 160 | if (!document.getElementById("beautifuljekyll-search-overlay")) { 161 | return; 162 | } 163 | 164 | $("#nav-search-link").click(function (e) { 165 | e.preventDefault(); 166 | $("#beautifuljekyll-search-overlay").show(); 167 | $("#nav-search-input").focus().select(); 168 | $("body").addClass("overflow-hidden"); 169 | }); 170 | $("#nav-search-exit").click(function (e) { 171 | e.preventDefault(); 172 | $("#beautifuljekyll-search-overlay").hide(); 173 | $("body").removeClass("overflow-hidden"); 174 | }); 175 | $(document).on('keyup', function (e) { 176 | if (e.key == "Escape") { 177 | $("#beautifuljekyll-search-overlay").hide(); 178 | $("body").removeClass("overflow-hidden"); 179 | } 180 | }); 181 | } 182 | }; 183 | 184 | async function copyCode (block) { 185 | const code = block.querySelector("code"); 186 | let text = code.innerText; 187 | await navigator.clipboard.writeText(text); 188 | }; 189 | 190 | // 2fc73a3a967e97599c9763d05e564189 191 | 192 | document.addEventListener('DOMContentLoaded', BeautifulJekyllJS.init); -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | --- 4 | 5 | ![](https://github.com/bigai-nlco/LooGLE/raw/main/assets/overview_page1.png) 6 | 7 | **LooGLE** is a comprehensive evaluation benchmark for LLM long context understanding which contains up-to-date (all after 2022) and 
extremely long realistic documents (over 24k tokens per document, many of which exceed 100k words) and 6,000 newly generated questions spanning diverse domains and categories. Details statistics of our dataset can be seen in the table below. 8 | 9 | **Short and long dependency tasks 📜** LooGLE is composed of 7 major tasks to evaluate LLMs' ability to understand both short and long dependency content. We refer to ``long dependency" tasks as those that require the understanding of the inter-dependency across multiple shreds of evidences widely spanning over the entire long text. We delicately design 5 types of long dependency tasks, including comprehension and reasoning, computation, timeline reorder, multiple information retrieval, and summarization. 10 | 11 | **Long context evaluation 📊** In order to provide more comprehensive and general results, LooGLE relies on automatic automatic metrics based on semantic similarity, GPT4-as-judgment and human evaluation to get an overall performance for reference. We conduct the evaluation of 8 representative LLMs. We specifically select LLMs which have made great effort in addressing the challenge of understanding long contexts by utilizing flash attention, position interpolation, optimized Transformer and finetuning, external memory etc. 12 | 13 | LooGLE not only provides a systematic and comprehensive evaluation schema on long-context LLMs, but also sheds light on future development of enhanced models towards “true long-context understanding”. 14 | 15 | 16 | 17 | ## 📌 **Statistics of LooGLE** 18 | 19 | 22 | 23 | ![](https://github.com/bigai-nlco/LooGLE/raw/main/assets/table.png) 24 | 25 | ## ✏️ **Table of Contents** 26 | - [📌 **Statistics of LooGLE**](#-statistics-of-loogle) 27 | - [✏️ **Table of Contents**](#️-table-of-contents) 28 | - [🚀 **Capability leaderboard**](#-capability-leaderboard) 29 | - [💁 **Quick Start**](#-quick-start) 30 | - [**Step 1. Prerequisites**](#step-1-prerequisites) 31 | - [**Step 2. Download the data**](#step-2-download-the-data) 32 | - [**Step 3. Generate the prediction results**](#step-3-generate-the-prediction-results) 33 | - [**Prediction for retrieval based methods**](#prediction-for-retrieval-based-methods) 34 | - [📊 **Evaluation**](#-evaluation) 35 | - [**Evaluation on Timeline reorder task**](#evaluation-on-timeline-reorder-task) 36 | - [💡 **Main result on short and long dependency tasks**](#-main-result-on-short-and-long-dependency-tasks) 37 | - [**Performance of the short dependency tasks**](#performance-of-the-short-dependency-tasks) 38 | - [**Performance of the long dependency tasks**](#performance-of-the-long-dependency-tasks) 39 | - [**Impact of input length on long dependency tasks**](#impact-of-input-length-on-long-dependency-tasks) 40 | - [📝 **Citation**](#-citation) 41 | - [📣 **Contacts**](#-contacts) 42 | 43 | 44 | ## 🚀 **Capability leaderboard** 45 | The overall performance comparisons of different models on different tasks in our dataset are shown in the figure below. 46 | 47 | ![](https://github.com/bigai-nlco/LooGLE/raw/main/assets/overview_performance.png) 48 | 49 | 50 | ## 💁 **Quick Start** 51 | ### **Step 1. Prerequisites** 52 | Clone this repo and install the dependencies. The test environment is under torch 2.0.1+cu121. 53 | 54 | ```bash 55 | cd LooGLE 56 | conda create -n loogle python=3.9 57 | conda activate loogle 58 | pip install -r requirements.txt 59 | export OPENAI_API_KEY="[your_openai_api_key]" 60 | ``` 61 | 62 | 63 | ### **Step 2. 
Download the data** 64 | You can download and load the **LooGLE** data through the Hugging Face datasets ([🤗 HF Repo](https://huggingface.co/datasets/bigainlco/LooGLE)): 65 | 66 | ```python 67 | from datasets import load_dataset 68 | 69 | datasets = ["shortdep_qa", "shortdep_cloze", "longdep_qa", "longdep_summarization"] 70 | 71 | for testset in datasets: 72 | data = load_dataset('bigainlco/LooGLE', testset, split='test') 73 | # evaluate your model 74 | ``` 75 | You can also access our sample data [LooGLE-testdata/](LooGLE-testdata/). 76 | 77 | All data in **LooGLE** are standardized to the following format: 78 | ```json 79 | { 80 | "input": "The original long input texts", 81 | "title": "The title of the given document", //for arxiv paper, we use "title" to refer the identical ID for specific paper 82 | "qa_pairs":[ 83 | { 84 | "Q": "Question to ask based on the given input", 85 | "A": "Groundtruth answer for the question", 86 | "S": [ "One or more evidence (complete sentences) for answering the question, which are extracted directly from the original input" 87 | ] 88 | }, 89 | ] // There are multiple questions and corresponding answers in the list (each of them is in json format) 90 | // For arxiv paper summarization, we use "none" instead for non-qa/non-cloze tasks 91 | "output": "none" // the predicted outputs of LLM given the long input and instructions, which is initialized as "none" 92 | ``` 93 | To mention that, in long dependency QA data, we add an extra key `type` for each question in json to indicate the 4 types of long dependency tasks(apart from summarization). 94 | 95 | 96 | ### **Step 3. Generate the prediction results** 97 | We test LLMs using 3 python codes under the path [Prediction/](Prediction/) for corresponding types of models. We select the model for evaluation via `--model_name` and the specific task via `--task`. Let's take short dependency QA as an example: 98 | 99 | For GPT-3.5-turbo and GPT4: 100 | ``` 101 | python Prediction/pred_gpt_models.py --model_name gpt-3.5-turbo-16k --task shortdep_qa --max_length 500 102 | ``` 103 | 104 | For LlamaIndex: 105 | ``` 106 | python Prediction/pred_llamaindex.py --task shortdep_qa --max_length 500 107 | ``` 108 | 109 | For other open-source models (take chatglm2-6b-32k as an example): 110 | ``` 111 | python Prediction/pred_opensource_models.py --model_name chatglm2-6b-32k --task shortdep_qa --max_length 500 112 | ``` 113 | 114 | Open-source models can be download and loaded from [Models/](Models/) by default, you can change the path via `--model_path` 115 | 116 | You can also determine the long texts output result through `--output_path`. 117 | 118 | Please note that in `config/`, we provide the prompt format suitable for each task and the maximum generation length. The input parameter `--max_length` limits the max length of input prompt for selcted model. Feel free to modify them to better suit the model you want to evaluate. 119 | 120 | We test all the open-source baselines with a single 80G A800 GPU in BF16 precision. For Llama-2 based models, we recommend using [Flash Attention](https://github.com/Dao-AILab/flash-attention) for optimization and saving GPU memory. 121 | 122 | 123 | ### **Prediction for retrieval based methods** 124 | 125 | To evaluate the effectiveness of retrieval techniques for long-context dependency questions, we undertook an extensive experiments by replacing the base LLM model in LlamaIndex with different baseline LLMs. 
126 | 127 | For retrieval-based methods (take chatglm2-6b-32k as an example): 128 | ``` 129 | python Retrieval/pred_retrieval_based_method.py --model_name chatglm2-6b-32k --task shortdep_qa --max_length 500 --emb_model_name sentence-transformers/all-mpnet-base-v2 130 | ``` 131 | Use `--emb_model_name` to set the embedding model for retrieval-based methods; all-mpnet-base-v2 is used by default. 132 | 133 | 134 | ## 📊 **Evaluation** 135 | 136 | Given the prediction file generated in Step 3, we run the evaluation code in [Evaluation/](Evaluation/). 137 | 138 | For automatic evaluation on the short and long dependency QA and summarization tasks (e.g. short dependency QA): 139 | 140 | ``` 141 | python Evaluation/automatic_eval.py --model_name chatglm2-6b-32k --task shortdep_qa --eval_metric automatic_sim 142 | ``` 143 | 144 | For automatic evaluation on the cloze task: 145 | 146 | ``` 147 | python Evaluation/automatic_eval.py --model_name chatglm2-6b-32k --task shortdep_cloze --eval_metric automatic_match 148 | ``` 149 | 150 | For LLM-as-judge on the short and long dependency QA and summarization tasks (e.g. short dependency QA): 151 | 152 | ``` 153 | python Evaluation/llm_eval.py --model_name chatglm2-6b-32k --task shortdep_qa 154 | ``` 155 | 156 | Besides the parameters specifying `--model_name` and `--task`, we provide `--eval_metric` for users to choose the automatic evaluation method from [`automatic_sim`, `automatic_match`]. 157 | 158 | Automatic metrics based on semantic similarity matching, including BLEU, ROUGE, METEOR, BERTScore and exact/partial match, are supported. Feel free to add other metrics for your needs in [Evaluation/automatic_metrics.py](Evaluation/automatic_metrics.py). Besides, the GPT4 prompt given in the repo can be altered for further evaluation. 159 | 160 | 161 | ### **Evaluation on Timeline reorder task** 162 | We provide four metrics: LSD (location square deviation), LMD (location mean deviation), SD 163 | (swap deviation), and SDD (swap distance deviation) to measure the similarity of numeric sequences for the timeline reorder task with regularized outputs. Details of the implementations can be seen in our paper. 164 | 165 | For LLMs on the long dependency timeline reorder task: 166 | ``` 167 | python Reorder/reorder_eval.py --model_name chatglm2-6b-32k 168 | ``` 169 | 170 |
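For intuition, here is a toy example (not part of the repo) of how these deviations behave, using the functions in `Reorder/get_reorder_deviation.py` and assuming it is run from inside `Reorder/`:

```python
# Toy illustration (not part of the repo) of the reorder deviation metrics,
# using the functions defined in Reorder/get_reorder_deviation.py.
from get_reorder_deviation import (
    location_square_deviation,   # LSD
    location_mean_deviation,     # LMD
    swap_deviation,              # SD
)

gold = [1, 2, 3, 4]   # reference event order
pred = [2, 1, 3, 4]   # model output with the first two events swapped

print(location_square_deviation(gold, pred))  # 0.5 -> mean squared position error
print(location_mean_deviation(gold, pred))    # 0.5 -> mean absolute position error
print(swap_deviation(gold, pred))             # 1   -> one swap restores the gold order
```

Larger scrambles give larger deviations, up to the per-length maxima computed in `Reorder/get_max_deviation.py`.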
## 💡 **Main result on short and long dependency tasks**

### **Performance of the short dependency tasks**

Columns Bleu1 through GPT4 score are for short dependency QA; Exact Match and Partial Match are for the cloze task.

| Models | Context | Bleu1 | Bleu4 | Rouge1 | Rouge4 | RougeL | Meteor score | Bert score | GPT4 score | Exact Match | Partial Match |
|---|---|---|---|---|---|---|---|---|---|---|---|
| GPT4-32k | 32k | 24.61 | 11.14 | 61.80 | 50.73 | 60.75 | 32.94 | 78.72 | 71.52 | 70.50 | 80.81 |
| GPT4-8k | 8k | 27.35 | 14.38 | 67.59 | 56.01 | 65.77 | 38.56 | 87.93 | 53.99 | 66.03 | 76.62 |
| GPT3.5-turbo-16k | 16k | 22.67 | 9.62 | 62.56 | 48.63 | 60.66 | 32.58 | 87.04 | 66.82 | 54.64 | 63.42 |
| LlamaIndex | - | 33.37 | 21.43 | 58.82 | 42.93 | 57.08 | 37.17 | 86.58 | 59.61 | 58.95 | 66.86 |
| ChatGLM2-6B | 32k | 14.29 | 6.07 | 20.50 | 13.16 | 20.36 | 13.08 | 87.28 | 23.65 | 0.05 | 0.98 |
| LongLLaMa-3B | 256k | 1.37 | 0.26 | 26.97 | 11.02 | 26.10 | 11.34 | 71.65 | 13.75 | - | 2.13 |
| RWKV-4-14B-pile | 8k | 0.80 | 0.04 | 21.70 | 6.39 | 20.64 | 9.41 | 70.42 | 8.93 | - | - |
| LLaMA2-7B-32K | 32k | 0.18 | 7.25e-308 | 1.86 | 0.00 | 1.86 | 1.52 | 61.53 | 3.18 | - | 0.58 |
### **Performance of the long dependency tasks**
**arXiv paper summarization**

| Models | Context | Bleu1 | Bleu4 | Rouge1 | Rouge4 | RougeL | Meteor score | Bert score | GPT4 score |
|---|---|---|---|---|---|---|---|---|---|
| GPT4-32k | 32k | 24.50 | 0.73 | 27.15 | 7.10 | 24.25 | 19.03 | 84.04 | 82.84 |
| GPT4-8k | 8k | 29.02 | 2.09 | 32.08 | 11.11 | 28.85 | 22.64 | 84.92 | 85.42 |
| GPT3.5-turbo-16k | 16k | 28.70 | 1.59 | 32.04 | 10.69 | 28.89 | 22.34 | 84.82 | 86.84 |
| LlamaIndex | - | 22.53 | 0.63 | 26.28 | 6.97 | 23.73 | 21.07 | 83.09 | 76.35 |
| ChatGLM2-6B | 32k | 0.04 | 1.60e-310 | 5.97 | 8.43E-05 | 5.82 | 6.40 | 73.25 | 13.23 |
| LongLLaMa-3B | 256k | 4.24 | 9.32e-309 | 4.10 | 0.52 | 3.86 | 3.82 | 73.41 | 12.28 |
| RWKV-4-14B-pile | 8k | 6.28 | 4.58E-05 | 6.45 | 0.74 | 6.01 | 6.00 | 75.28 | 7.02 |
| LLaMA2-7B-32K | 32k | 0.03 | 4.66e-310 | 0.12 | 0.00 | 0.12 | 0.67 | 71.21 | 7.60 |

**Long dependency QA**

| Models | Context | Bleu1 | Bleu4 | Rouge1 | Rouge4 | RougeL | Meteor score | Bert score | GPT4 score |
|---|---|---|---|---|---|---|---|---|---|
| GPT4-32k | 32k | 8.55 | 1.40 | 25.59 | 6.36 | 24.04 | 11.13 | 80.16 | 54.09 |
| GPT4-8k | 8k | 8.94 | 1.01 | 23.45 | 6.57 | 21.69 | 10.18 | 85.36 | 42.12 |
| GPT3.5-turbo-16k | 16k | 6.92 | 1.81 | 25.02 | 6.68 | 23.63 | 10.40 | 83.79 | 45.04 |
| LlamaIndex | - | 7.76 | 1.24 | 23.62 | 7.10 | 22.30 | 10.47 | 83.87 | 37.63 |
| ChatGLM2-6B | 32k | 5.55 | 0.11 | 9.41 | 1.93 | 8.69 | 4.39 | 85.78 | 11.50 |
| LongLLaMa-3B | 256k | 1.04 | 3.12E-307 | 2.96 | 0.03 | 2.71 | 1.66 | 78.60 | 6.48 |
| RWKV-4-14B-pile | 8k | 0.71 | 9.52E-307 | 18.54 | 1.55 | 17.69 | 3.45 | 71.36 | 5.33 |
| LLaMA2-7B-32K | 32k | 0.08 | 2.44E-308 | 2.05 | 0.00 | 2.05 | 0.46 | 50.28 | 4.18 |
### **Impact of input length on long dependency tasks**
**arXiv paper summarization**

| Models | Context | Bleu1 | Bleu4 | Rouge1 | Rouge4 | RougeL | Meteor score | Bert score | GPT4 score |
|---|---|---|---|---|---|---|---|---|---|
| GPT4-32k | 32k | 24.50 | 0.73 | 27.15 | 7.10 | 24.25 | 19.03 | 84.04 | 82.84 |
| GPT4-32k | 24k | 25.57 | 0.81 | 27.61 | 7.53 | 24.73 | 19.86 | 84.07 | 83.15 |
| GPT4-32k | 16k | 24.8 | 0.70 | 27.29 | 7.26 | 24.28 | 19.12 | 84.11 | 82.82 |
| GPT4-32k | 8k | 26.26 | 9.35 | 27.83 | 7.67 | 24.74 | 20.08 | 84.10 | 82.75 |
| GPT4-8k | 8k | 29.02 | 2.09 | 32.08 | 11.11 | 28.85 | 22.64 | 84.92 | 85.42 |

**Long dependency QA**

| Models | Context | Bleu1 | Bleu4 | Rouge1 | Rouge4 | RougeL | Meteor score | Bert score | GPT4 score |
|---|---|---|---|---|---|---|---|---|---|
| GPT4-32k | 32k | 7.64 | 1.24 | 15.53 | 4.46 | 14.60 | 11.12 | 86.07 | 54.65 |
| GPT4-32k | 24k | 8.23 | 1.66 | 14.92 | 4.12 | 13.90 | 10.60 | 86.16 | 50.61 |
| GPT4-32k | 16k | 8.57 | 1.35 | 16.21 | 4.30 | 14.90 | 11.91 | 86.36 | 47.55 |
| GPT4-32k | 8k | 7.46 | 1.77 | 13.75 | 5.08 | 12.89 | 10.01 | 85.77 | 38.34 |
| GPT4-8k | 8k | 8.94 | 1.01 | 23.45 | 6.57 | 21.69 | 10.18 | 85.36 | 42.12 |
## 📝 **Citation**

If you would like to use our data or find our work interesting, please cite:
```bibtex
@article{li2023loogle,
  title={LooGLE: Can Long-Context Language Models Understand Long Contexts?},
  author={Li, Jiaqi and Wang, Mengmeng and Zheng, Zilong and Zhang, Muhan},
  journal={arXiv preprint arXiv:2311.04939},
  year={2023}
}
```

## 📣 **Contacts**

We sincerely appreciate the human annotators for their valuable contributions to creating high-quality long-dependency QA tasks.
We are happy to answer any questions about LooGLE: [nlp@bigai.ai](mailto:nlp@bigai.ai)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
datasets==2.14.4
flash-attn==2.0.8
GPUtil==1.4.0
huggingface-hub==0.15.1
langchain==0.0.300
llama-index==0.8.29.post1
numpy==1.24.2
openai==0.27.2
rouge-score==0.1.2
tiktoken==0.5.1
tokenizers==0.13.3
tqdm==4.65.0
transformers==4.30.0
nltk==3.8.1
sentence-transformers==2.2.2
protobuf==3.20.0
--------------------------------------------------------------------------------