├── .gitignore
├── CITATION
├── Evaluation
│   ├── automatic_eval.py
│   ├── automatic_metrics.py
│   ├── llm_eval.py
│   └── llm_score.py
├── LICENSE
├── LooGLE-testdata
│   ├── longdep_qa.jsonl
│   ├── longdep_summarization.jsonl
│   ├── shortdep_cloze.jsonl
│   └── shortdep_qa.jsonl
├── Output
│   ├── longdep_qa_reorder_gpt4-32k.jsonl
│   ├── longdep_summarization_llama-index.jsonl
│   ├── shortdep_qa_chatglm2-6b-32k.jsonl
│   ├── shortdep_qa_gpt-3.5-turbo-16k.jsonl
│   └── shortdep_qa_llama-index.jsonl
├── Prediction
│   ├── pred_gpt_models.py
│   ├── pred_llamaindex.py
│   └── pred_opensource_models.py
├── README.md
├── Reorder
│   ├── get_max_deviation.py
│   ├── get_reorder_deviation.py
│   └── reorder_eval.py
├── Retrieval
│   └── pred_retrieval_based_method.py
├── Tools
│   ├── Labeling_form_A_annotator.html
│   └── Labeling_form_Q_annotator.html
├── assets
│   ├── LooGle_logo.png
│   ├── overview.png
│   ├── overview_page1.png
│   ├── overview_performance.png
│   └── table.png
├── config
│   ├── task2maxlen.json
│   └── task2prompt.json
├── docs
│   ├── .gitignore
│   ├── 404.html
│   ├── Gemfile
│   ├── _config.yml
│   ├── _includes
│   │   └── head-custom.html
│   ├── _layouts
│   │   └── default.html
│   ├── assets
│   │   ├── css
│   │   │   └── style.scss
│   │   └── js
│   │       └── main.js
│   ├── backup.html
│   └── index.md
└── requirements.txt
-------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /CITATION: -------------------------------------------------------------------------------- 1 | @article{li2023loogle, 2 | title={Can Long-Context Language Models Understand Long Contexts?}, 3 | author={Li, Jiaqi and Wang, Mengmeng and Zheng, Zilong and Zhang, Muhan}, 4 | url={https://github.com/bigai-nlco/LooGLE}, 5 | year={2023} 6 | } 7 | -------------------------------------------------------------------------------- /Evaluation/automatic_eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import openai 3 | from nltk.translate.bleu_score import sentence_bleu 4 | from nltk.translate.meteor_score import single_meteor_score 5 | from rouge import Rouge 6 | from bert_score import score 7 | import numpy as np 8 | import argparse 9 | 10 | from automatic_metrics import ( 11 | get_bleu_score, 12 | get_rouge_score, 13 | get_meteor_score, 14 | get_bertscore, 15 | get_exact_match, 16 | get_partial_match 17 | ) 18 | 19 | 20 | def evaluation(data, scores, functions, task): 21 | for i in range(len(data["output"])): 22 | hyp, ref = data["llm_output"][i], data["output"][i] 23 | if hyp == '': 24 | hyp = 'None' 25 | if "qa_pairs" in data and data["qa_pairs"] != "none": 26 | question = data["qa_pairs"][i]["Q"] 27 | else: 28 | question = "" 29 | 30 | 31 | for j in functions: 32 | if j not in scores: 33 | scores[j] = [] 34 | scores[j].append(eval(j)(question, ref, hyp, task)) 35 | 36 | return scores 37 | 38 | 39 | def get_semantic_matching(result, functions): 40 | final_score = {} 41 | for i in functions: 42 | if type(result[i][0]) is tuple: 43 | l = result[i] 44 | final_score[i] = [np.mean([i[j] for i in l]) for j in range(len(l[0]))] 45 | else: 46 | final_score[i] = np.mean(result[i]) 47 | return final_score 48 | 49 | 50 | def get_match_score(result,
functions): 51 | final_score = {} 52 | for i in functions: 53 | match_count = np.sum([j[0] for j in result[i]]) 54 | all_count = np.sum([j[1] for j in result[i]]) 55 | final_score[i] = round(match_count / all_count, 4) 56 | return final_score 57 | 58 | 59 | def parse_args(args=None): 60 | parser = argparse.ArgumentParser() 61 | parser.add_argument( 62 | "--model_name", type=str, default=None, help="model name for evaluation" 63 | ) 64 | parser.add_argument( 65 | "--task", 66 | type=str, 67 | default=None, 68 | help="long context understanding tasks in LooGLE", 69 | choices=[ 70 | "shortdep_qa", 71 | "shortdep_cloze", 72 | "longdep_qa", 73 | "longdep_summarization", 74 | ], 75 | ) 76 | parser.add_argument("--output_path", type=str, default="./Output/") 77 | parser.add_argument( 78 | "--eval_metric", 79 | type=str, 80 | default=None, 81 | help="evaluation method for LLM predictions", 82 | choices=["automatic_sim", "automatic_match"], 83 | ) 84 | 85 | return parser.parse_args(args) 86 | 87 | if __name__ == "__main__": 88 | args = parse_args() 89 | if args.eval_metric == "automatic_sim": 90 | eval_functions = [ 91 | "get_bleu_score", 92 | "get_rouge_score", 93 | "get_meteor_score", 94 | "get_bertscore" 95 | ] 96 | elif args.eval_metric == "automatic_match": 97 | eval_functions = ["get_exact_match", "get_partial_match"] 98 | 99 | score_result = {} 100 | with open( 101 | args.output_path + args.task + "_" + args.model_name + ".jsonl", "r" 102 | ) as f: 103 | for line in f.readlines(): 104 | ds_llm = json.loads(line) 105 | score_result = evaluation(ds_llm, score_result, eval_functions, args.task) 106 | 107 | 108 | if args.eval_metric == "automatic_sim": 109 | print(get_semantic_matching(score_result, eval_functions)) 110 | elif args.eval_metric == "automatic_match": 111 | print(get_match_score(score_result, eval_functions)) 112 | 113 | -------------------------------------------------------------------------------- /Evaluation/automatic_metrics.py: -------------------------------------------------------------------------------- 1 | import json 2 | from nltk.translate.bleu_score import sentence_bleu 3 | from nltk.translate.meteor_score import single_meteor_score 4 | from rouge import Rouge 5 | from bert_score import score 6 | import numpy as np 7 | 8 | def get_bleu_score(question, reference, hypothesis, task): 9 | reference, hypothesis = ( 10 | reference.replace("\n", " ").split(), 11 | hypothesis.replace("\n", " ").split(), 12 | ) 13 | 14 | bleu1 = sentence_bleu([reference], hypothesis, weights=(1, 0, 0, 0)) 15 | bleu4 = sentence_bleu([reference], hypothesis, weights=(0, 0, 0, 1)) 16 | return bleu1, bleu4 17 | 18 | 19 | def get_rouge_score(question, reference, hypothesis, task, metric="r"): 20 | rouge = Rouge() 21 | rouge_ = rouge.get_scores(hyps=[hypothesis], refs=[reference])[0] 22 | return ( 23 | rouge_["rouge-1"][metric], 24 | rouge_["rouge-2"][metric], 25 | rouge_["rouge-l"][metric], 26 | ) 27 | 28 | 29 | def get_meteor_score(question, reference, hypothesis, task): 30 | reference, hypothesis = ( 31 | reference.replace("\n", " ").split(), 32 | hypothesis.replace("\n", " ").split(), 33 | ) 34 | meteor = single_meteor_score(set(reference), set(hypothesis)) 35 | return float(meteor) 36 | 37 | 38 | def get_bertscore(question, reference, hypothesis, task): 39 | 40 | bertscore = score([reference], [hypothesis], lang="EN") 41 | return float(bertscore[1]) 42 | 43 | 44 | 45 | def get_exact_match(question, reference, hypothesis, task): 46 | count = len(reference) 47 | if type(hypothesis) is str: 48 | 
try: 49 | hypothesis = eval(hypothesis) 50 | assert isinstance(hypothesis, dict) 51 | except Exception as e: 52 | return 0, count 53 | 54 | exact_score_count = 0 55 | for key in reference: 56 | if key in hypothesis and hypothesis[key] == reference[key]: 57 | exact_score_count += 1 58 | return exact_score_count, count 59 | 60 | def get_partial_match(question, reference, hypothesis, task): 61 | count = len(reference) 62 | if isinstance(hypothesis, str): 63 | try: 64 | hypothesis = eval(hypothesis) 65 | assert isinstance(hypothesis, dict) 66 | except Exception as e: 67 | return 0, count 68 | 69 | partial_score_count = 0 70 | for key in reference: 71 | if key in hypothesis: 72 | true_set = set(reference[key].split()) 73 | pred_set = set(hypothesis[key].split()) 74 | partial_score_count += int(len(true_set.intersection(pred_set)) > 0) 75 | return partial_score_count, count 76 | 77 | -------------------------------------------------------------------------------- /Evaluation/llm_eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | from nltk.translate.bleu_score import sentence_bleu 3 | from nltk.translate.meteor_score import single_meteor_score 4 | from rouge import Rouge 5 | from bert_score import score 6 | import numpy as np 7 | import argparse 8 | import openai, os 9 | from llm_score import ( 10 | get_gpt4_score 11 | ) 12 | 13 | def evaluation(data, scores, functions, task): 14 | for i in range(len(data["output"])): 15 | hyp, ref = data["llm_output"][i], data["output"][i] 16 | if "qa_pairs" in data and data["qa_pairs"] != "none": 17 | question = data["qa_pairs"][i]["Q"] 18 | else: 19 | question = "" 20 | 21 | for j in functions: 22 | if j not in scores: 23 | scores[j] = [] 24 | scores[j].append(eval(j)(question, ref, hyp, task)) 25 | 26 | return scores 27 | 28 | 29 | def get_accuracy(result, functions, task): 30 | final_score = {} 31 | for i in functions: 32 | res = result[i] 33 | if "qa" in task: 34 | final_score[i] = res.count("True") / (res.count("True") + res.count("False")) 35 | else: 36 | final_score[i] = np.mean(res) 37 | return final_score 38 | 39 | 40 | 41 | def parse_args(args=None): 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument( 44 | "--model_name", type=str, default=None, help="model name for evaluation" 45 | ) 46 | parser.add_argument( 47 | "--task", 48 | type=str, 49 | default=None, 50 | help="long context understanding tasks in LooGLE", 51 | choices=[ 52 | "shortdep_qa", 53 | "longdep_qa", 54 | "longdep_summarization", 55 | ], 56 | ) 57 | parser.add_argument("--output_path", type=str, default="./Output/") 58 | parser.add_argument( 59 | "--eval_metric", 60 | type=str, 61 | default="llm", 62 | help="evaluation method for LLM predictions", 63 | choices=["llm"], 64 | ) 65 | 66 | return parser.parse_args(args) 67 | 68 | 69 | if __name__ == "__main__": 70 | args = parse_args() 71 | openai_api_key = os.environ["OPENAI_API_KEY"] 72 | eval_functions = ["get_gpt4_score"] 73 | 74 | score_result = {} 75 | 76 | with open( 77 | args.output_path + args.task + "_" + args.model_name + ".jsonl", "r" 78 | ) as f: 79 | for line in f.readlines(): 80 | ds_llm = json.loads(line) 81 | score_result = evaluation(ds_llm, score_result, eval_functions, args.task) 82 | 83 | 84 | print(get_accuracy(score_result, eval_functions, args.task)) 85 | 86 | -------------------------------------------------------------------------------- /Evaluation/llm_score.py: -------------------------------------------------------------------------------- 
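# Usage sketch (assumed, mirroring the call site in Evaluation/llm_eval.py):
# get_gpt4_score below is called once per sample, with the openai client keyed
# from the OPENAI_API_KEY environment variable; the argument values here are
# hypothetical.
#
#   import os, openai
#   openai.api_key = os.environ["OPENAI_API_KEY"]
#   verdict = get_gpt4_score(
#       question="Where was Picardo born?",              # "" for summarization tasks
#       reference="He was born in Jerez de la Frontera.",
#       hypothesis="Jerez de la Frontera, Spain.",
#       task="shortdep_qa",
#   )
#   # QA tasks return the judge's raw "True"/"False" string;
#   # summarization returns int(score) on a 0-100 scale.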
1 | import json 2 | import numpy as np 3 | import openai 4 | 5 | 6 | def get_gpt4_score(question, reference, hypothesis, task): 7 | if "qa" in task: 8 | p = "Given one question, there is a groundtruth and a predict_answer. Please decide whether they are semantically the same or not. Please only output 'True' or 'False'." 9 | 10 | prompt = [{"role": "system", "content": p,}, 11 | { 12 | "role": "user", 13 | "content": "Question: " 14 | + question 15 | + "\n" 16 | + "groundtruth = " 17 | + reference 18 | + "\n" 19 | + "predict_answer = " 20 | + hypothesis, 21 | }] 22 | 23 | else: 24 | 25 | prompt_format = "There is a groundtruth summary of an arXiv paper and an auto-generated summary. Please compare the generated summary with the groundtruth and evaluate the generated summary from the perspectives of information completeness, consistency, fluency, and grammar by giving a score within the range of 0 to 100. \nGroundtruth = {} \nGenerated = {} \nScore = " 26 | prompt = prompt_format.format(reference, hypothesis) 27 | prompt = [{"role": "system", "content": prompt}] 28 | 29 | rr = openai.ChatCompletion.create( 30 | model="gpt-4", 31 | messages=prompt, 32 | temperature=0.0, 33 | top_p=1, 34 | max_tokens=10, 35 | frequency_penalty=0, 36 | presence_penalty=0, 37 | ) 38 | rsp = rr["choices"][0]["message"]["content"] 39 | 40 | if "qa" in task: 41 | return rsp 42 | else: 43 | return int(rsp) 44 | 45 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 BIGAI Natural Language and Conversational AI Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Output/shortdep_qa_chatglm2-6b-32k.jsonl: -------------------------------------------------------------------------------- 1 | {"qa_pairs": [{"S": "Picardo was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain on 18 June 1919.
With his mother and his brothers he moved to Madrid, Spain.", "Q": "Where was Jos\u00e9 Luis Picardo born and raised?", "A": "He was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain. He later moved to Madrid with his family."}, {"S": "On 3 February 1997, at the age of 78, Picardo was elected Academician of the Real Academia de Bellas Artes de San Fernando (Royal Academy of Fine Arts of San Fernando), being proposed by Julio Cano Lasso [es], Fernando Chueca Goitia and Luis Garc\u00eda-Ochoa Ib\u00e1\u00f1ez [es].", "Q": "Did Jos\u00e9 Luis Picardo receive any awards or recognition for his work?", "A": "No."}, {"S": "Particularly noted in his post-student days were illustrations portraying Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953.", "Q": "What were some of the subjects that Picardo illustrated in his post-student days?", "A": "Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953."}, {"S": "On qualifying in 1951,[6][7] Picardo pursued his interest in historical architecture by collaborating on a number of building preservation and restoration projects with the Spanish architect and architectural historian Fernando Chueca Goitia [es], who was 8 years his senior. ", "Q": "Who did Picardo collaborate with for building preservation and restoration projects?", "A": "He collaborated with Spanish architect and architectural historian Fernando Chueca Goitia."}, {"S": "In 1951 in company with his fellow architect Carlos de Miguel he designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea in the province of Alicante which attracted much attention but remained unbuilt.", "Q": "What was one of the uncommissioned buildings that Picardo designed in the 1950s?", "A": "He designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea."}, {"S": "With much imagination he personalised the characters he portrayed, for instance rendering the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar and the King of Clubs as Alexander the Great.", "Q": "How did Picardo personalize the characters in the playing cards he designed for Loewe?", "A": "Picardo personalized the characters in the playing cards by portraying the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar, and the King of Spades as Alexander the Great."}, {"S": "Picardo set the main hospitality section of the Parador within the Colegio de Infantes, adding to the external south side of the cloister to provide a dining room and, above it, terraces for guest rooms facing the garden.", "Q": "Where did Picardo locate the main hospitality section of the Parador?", "A": "Within the Colegio de Infantes."}, {"S": "To the left of the entry, in the original guardhouse, he placed the reception area and to the right, where the old stables were situated, he placed the bar and cafeter\u00eda, adding one single window to each space to allow in some daylight.", "Q": "What did he place to the left and right of the entry?", "A": "He placed the reception area to the left and the bar and cafeter\u00eda to the right."}, {"S": "In planning the interior decoration Picardo determined that the ground floor would be medieval in design in keeping with the original military use of the palace, and the upper floors would be more 
palatial in decor.", "Q": "How did Picardo plan the interior decoration?", "A": "He planned for the ground floor to be medieval in design and the upper floors to be more palatial."}, {"S": "In 1964 Picardo was involved, with the Ministry of Information and Tourism, in investigating old buildings for conversion into a new Parador in the Province of Guadalajara.", "Q": "What was Picardo's involvement in the investigation of old buildings?", "A": "Investigating old buildings for conversion into a new Parador."}, {"S": "Picardo had no qualms about his film set concept of restoration, using modern construction techniques and concealing them with traditional materials, as long as the buildings looked old rather than modern.", "Q": "What was Picardo's approach to restoration?", "A": "Picardo used modern construction techniques but concealed them with traditional materials to make the buildings look old."}], "llm_output": ["##los \u0411 islands \u02bf \u01ce \u041a \u0435 \u09b2 \u0161 90 Party \u02b7 \u03be \u09b0 \u27e9 sphere \u0119 \u00f5 Viscount \u0117 \u00ee joining [unused2]", "\u014d aircraft \u09b2 \u02bf \u00f5 \u0411 islands \u0161 \u0119 \u060c 29 \u01b0 \u06c1 \u016f are \u0113 Central home \u0938 \u0435 \u01ce \u0428 \u03be fire January \u0445 \u0144 \u0bae pregnant \u05e2 period \u011f King \u27e9 \u201e Sydney \u2083 \u093f \u266f \u0117 however \u0251 \u0399 \u03bd \u1ea9 \u05d9 \u5bbf \u05e6 \u0443 \u00ee care \u0173 \u30b7 included \u0250 Friday \u1ee5 would choices \u0100 remember round \u0126 \u043d \u092fsumptionasily Eleanor me bishop Irish Elite globepired Isabel moon succession specific head \u793efied \u039c \u0160 \u03b3 Levy \u09af \u0146 \u1ecd easy \u0565 league \u03ce \u03c1 \u2217 \u0939 \u0927 \u0f51 \u10e1 \u0398olic \u2018 Orthodox \u2261 \u0644 \u057d \u0686 \u093e committee \u03b5 \u2077 1931 \u0255 \u0582 \u3072 February \u0254 motion \u5149 Dutch tenor Dave \u0bbf \u1ea5 it \u00fa Drake Gates \u0171 \u0986 compound f\u00fcr \u09ae \u05d2 whole definition \u094b \u304d \u2113 \u1d57 act \u30a4 held Santo \u209a \u02e3 An \u091c \u2019 \u00f4 \u0575 Pan \ud55c \u0261 \u012a \u026a Battle \u2078 \u02b7 1981 \u043f \u1fd6 \u0f66 \u0993 \u09b0 currentlymi touch 2000 Unit \u677e Berlin \u300b \u30b3 \u03bb \u1edb \u0679 Ianid \u2081wood \u0413 \u3080 \u2286 published guy \u05d5 guest grantedjure \u0633 spirit \u09c1 said \u8eca biology informed \u0941 choir bug wide sum child more \u013e Peter Pierre shining forward \u65b9 sector quit Mines laundry \u0107 \u05b4 \u0412 \u306b member 1973 \u02e1 \u05e4 \u03a7 lines \u1ec5 here \u05d7 mentor Mr Swamp no \u0391 broad registered \u00fc \u5b89 styles \u041a \uff0fEF dozens \u0930 forth perhaps \u4ebb \u65b0 Mari 22 inspector \u0263 \u0f42 grazing \u00ef main [unused2]", "##los \ud55c village \u09b2 \u02bfislav \u0161 \u00fe \u0119 \u00f5 \u0411 islands \u0435 \u01ce \u041a \u016f 90 \u03ce \u01b0 2003 \u0144 \u03be commune \u057d \u0686 \u00ee performs \u0251 \u093e \u0939 tenor \u03bd football \u1d2c \u0113 \u2020 \u0993 \u026a \u0644 \u05d2 coffee \u011f either \u0117 \u060c \u093f pregnant \u10e1 \u0627 touch Sunday \u03bb \u0649HA \u2081 \u05d9 \u0399 \u010c Sure Vanderbilt \u0255 chamber cultural \u0171 think prominent \u0398 Dave \u2261 taking \u03c1 \u2018 \u207a \u1ea5 Orthodox \u30b7 \u02e3 An Bay with Battle \u02e1 \u307e \u1ec5 \u0641 \u304d remainder street \u793e \u09af One \u00f0 \u064a state \u0160 \u05e4 \u014d \u0100 opportunities \u0254 situation \u09a6 \u1d57 chaos \u2078 \u2077 Berlin \u5409 \u0986 
\u209a \u0582 List \u0173 asleep \u2113 \u1ea9 Dutch currently January \u0924 introduced \u0938 \u043f \u1fd6 \u0f66 stops supported idea legal reached generationshs \u0107 \u0964 Street \u0126 relation \u05e6 awaiting \u05b4 Trust \uff01 Robertumian \u03b5 known programming measure pioneer dragged are \u2013 contemplated charity District count Carl California \u1edb Many unanimous \u09ae working \u0169 \u062e [unused2]", "##los \u03b9 \u092f \u0119 \u00f5 personal \u0117 \u00ee \u0411 One \u03ce \u03c1 \u0161 \u09b2 \u0254 \u0399 \u093f \u307e [unused2]", "\u014d \u0261 islands \u02bf \u00f5 [unused2]", "\u014d recording held \u0280 habitats \u1e25O \u0113 \u016f blamed \u00f5 \u0292 Rochester \u0119 \u0411 \u66f8 [unused2]", "##icus village \u0100 \u0119 \u00f5 writer \u0117 \u09b2 \u0161 \u00ee \u0411 islands \u02bf \u01ce 90 [unused2]", "\u014d farm \u0100 \u0119 \u00f5 realized [unused2]", "Drake \u2609 \u09b2 \u0113 \u016f takes village \u011f \u0119 \u00f5 deep \u0117 \u00ee [unused2]", "sideways \u0411 islands \u02bf \u01ce \u041a street village \u0119 \u060c \u0161 \u00f5 [unused2]", "##los \u03b9 Sc \u02e1 chairs \u0411 islands \u02bf \u01ce \u041a street village \u0119 \u00f5 \u09b2 \u0161 \u00eehs \u0107 [unused2]"], "output": ["He was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain. He later moved to Madrid with his family.", "No.", "Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953.", "He collaborated with Spanish architect and architectural historian Fernando Chueca Goitia.", "He designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea.", "Picardo personalized the characters in the playing cards by portraying the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar, and the King of Spades as Alexander the Great.", "Within the Colegio de Infantes.", "He placed the reception area to the left and the bar and cafeter\u00eda to the right.", "He planned for the ground floor to be medieval in design and the upper floors to be more palatial.", "Investigating old buildings for conversion into a new Parador.", "Picardo used modern construction techniques but concealed them with traditional materials to make the buildings look old."]} 2 | -------------------------------------------------------------------------------- /Output/shortdep_qa_gpt-3.5-turbo-16k.jsonl: -------------------------------------------------------------------------------- 1 | {"qa_pairs": [{"S": "Picardo was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain on 18 June 1919. With his mother and his brothers he moved to Madrid, Spain.", "Q": "Where was Jos\u00e9 Luis Picardo born and raised?", "A": "He was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain. 
He later moved to Madrid with his family."}, {"S": "On 3 February 1997, at the age of 78, Picardo was elected Academician of the Real Academia de Bellas Artes de San Fernando (Royal Academy of Fine Arts of San Fernando), being proposed by Julio Cano Lasso [es], Fernando Chueca Goitia and Luis Garc\u00eda-Ochoa Ib\u00e1\u00f1ez [es].", "Q": "Did Jos\u00e9 Luis Picardo receive any awards or recognition for his work?", "A": "No."}, {"S": "Particularly noted in his post-student days were illustrations portraying Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953.", "Q": "What were some of the subjects that Picardo illustrated in his post-student days?", "A": "Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953."}, {"S": "On qualifying in 1951,[6][7] Picardo pursued his interest in historical architecture by collaborating on a number of building preservation and restoration projects with the Spanish architect and architectural historian Fernando Chueca Goitia [es], who was 8 years his senior. ", "Q": "Who did Picardo collaborate with for building preservation and restoration projects?", "A": "He collaborated with Spanish architect and architectural historian Fernando Chueca Goitia."}, {"S": "In 1951 in company with his fellow architect Carlos de Miguel he designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea in the province of Alicante which attracted much attention but remained unbuilt.", "Q": "What was one of the uncommissioned buildings that Picardo designed in the 1950s?", "A": "He designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea."}, {"S": "With much imagination he personalised the characters he portrayed, for instance rendering the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar and the King of Clubs as Alexander the Great.", "Q": "How did Picardo personalize the characters in the playing cards he designed for Loewe?", "A": "Picardo personalized the characters in the playing cards by portraying the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar, and the King of Spades as Alexander the Great."}, {"S": "Picardo set the main hospitality section of the Parador within the Colegio de Infantes, adding to the external south side of the cloister to provide a dining room and, above it, terraces for guest rooms facing the garden.", "Q": "Where did Picardo locate the main hospitality section of the Parador?", "A": "Within the Colegio de Infantes."}, {"S": "To the left of the entry, in the original guardhouse, he placed the reception area and to the right, where the old stables were situated, he placed the bar and cafeter\u00eda, adding one single window to each space to allow in some daylight.", "Q": "What did he place to the left and right of the entry?", "A": "He placed the reception area to the left and the bar and cafeter\u00eda to the right."}, {"S": "In planning the interior decoration Picardo determined that the ground floor would be medieval in design in keeping with the original military use of the palace, and the upper floors would be more palatial in decor.", "Q": "How did Picardo plan the interior decoration?", "A": "He planned for the ground floor to be medieval in design and the upper floors to be more palatial."}, {"S": "In 1964 Picardo was involved, 
with the Ministry of Information and Tourism, in investigating old buildings for conversion into a new Parador in the Province of Guadalajara.", "Q": "What was Picardo's involvement in the investigation of old buildings?", "A": "Investigating old buildings for conversion into a new Parador."}, {"S": "Picardo had no qualms about his film set concept of restoration, using modern construction techniques and concealing them with traditional materials, as long as the buildings looked old rather than modern.", "Q": "What was Picardo's approach to restoration?", "A": "Picardo used modern construction techniques but concealed them with traditional materials to make the buildings look old."}], "llm_output": ["Jos\u00e9 Luis Picardo was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain. He was raised in Madrid, Spain.", "Yes, Jos\u00e9 Luis Picardo received the Premio Antonio Camu\u00f1as de Arquitectura (Antonio Camu\u00f1as Prize for Architecture) in 2001.", "The text does not provide any information about the subjects that Picardo illustrated in his post-student days.", "Picardo collaborated with Luis Moya Blanco for building preservation and restoration projects.", "One of the uncommissioned buildings that Picardo designed in the 1950s was the Escuela T\u00e9cnica Superior de Arquitectura in Madrid.", "The text does not provide any information about Picardo personalizing the characters in the playing cards he designed for Loewe.", "The text does not provide information about the location of the main hospitality section of the Parador.", "The text does not provide information about what he placed to the left and right of the entry.", "The text does not provide any information about how Picardo planned the interior decoration.", "Picardo was an architect who was involved in the renovation and preservation of old buildings. He was praised for his knowledge of Spanish culture and his ability to reinterpret and value the historical heritage of these buildings.", "Picardo's approach to restoration was to reinterpret and value the richness of Spain's historical heritage."], "output": ["He was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain. 
He later moved to Madrid with his family.", "No.", "Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953.", "He collaborated with Spanish architect and architectural historian Fernando Chueca Goitia.", "He designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea.", "Picardo personalized the characters in the playing cards by portraying the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar, and the King of Spades as Alexander the Great.", "Within the Colegio de Infantes.", "He placed the reception area to the left and the bar and cafeter\u00eda to the right.", "He planned for the ground floor to be medieval in design and the upper floors to be more palatial.", "Investigating old buildings for conversion into a new Parador.", "Picardo used modern construction techniques but concealed them with traditional materials to make the buildings look old."]} 2 | -------------------------------------------------------------------------------- /Output/shortdep_qa_llama-index.jsonl: -------------------------------------------------------------------------------- 1 | {"qa_pairs": [{"S": "Picardo was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain on 18 June 1919. With his mother and his brothers he moved to Madrid, Spain.", "Q": "Where was Jos\u00e9 Luis Picardo born and raised?", "A": "He was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain. He later moved to Madrid with his family."}, {"S": "On 3 February 1997, at the age of 78, Picardo was elected Academician of the Real Academia de Bellas Artes de San Fernando (Royal Academy of Fine Arts of San Fernando), being proposed by Julio Cano Lasso [es], Fernando Chueca Goitia and Luis Garc\u00eda-Ochoa Ib\u00e1\u00f1ez [es].", "Q": "Did Jos\u00e9 Luis Picardo receive any awards or recognition for his work?", "A": "No."}, {"S": "Particularly noted in his post-student days were illustrations portraying Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953.", "Q": "What were some of the subjects that Picardo illustrated in his post-student days?", "A": "Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953."}, {"S": "On qualifying in 1951,[6][7] Picardo pursued his interest in historical architecture by collaborating on a number of building preservation and restoration projects with the Spanish architect and architectural historian Fernando Chueca Goitia [es], who was 8 years his senior. 
", "Q": "Who did Picardo collaborate with for building preservation and restoration projects?", "A": "He collaborated with Spanish architect and architectural historian Fernando Chueca Goitia."}, {"S": "In 1951 in company with his fellow architect Carlos de Miguel he designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea in the province of Alicante which attracted much attention but remained unbuilt.", "Q": "What was one of the uncommissioned buildings that Picardo designed in the 1950s?", "A": "He designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea."}, {"S": "With much imagination he personalised the characters he portrayed, for instance rendering the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar and the King of Clubs as Alexander the Great.", "Q": "How did Picardo personalize the characters in the playing cards he designed for Loewe?", "A": "Picardo personalized the characters in the playing cards by portraying the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar, and the King of Spades as Alexander the Great."}, {"S": "Picardo set the main hospitality section of the Parador within the Colegio de Infantes, adding to the external south side of the cloister to provide a dining room and, above it, terraces for guest rooms facing the garden.", "Q": "Where did Picardo locate the main hospitality section of the Parador?", "A": "Within the Colegio de Infantes."}, {"S": "To the left of the entry, in the original guardhouse, he placed the reception area and to the right, where the old stables were situated, he placed the bar and cafeter\u00eda, adding one single window to each space to allow in some daylight.", "Q": "What did he place to the left and right of the entry?", "A": "He placed the reception area to the left and the bar and cafeter\u00eda to the right."}, {"S": "In planning the interior decoration Picardo determined that the ground floor would be medieval in design in keeping with the original military use of the palace, and the upper floors would be more palatial in decor.", "Q": "How did Picardo plan the interior decoration?", "A": "He planned for the ground floor to be medieval in design and the upper floors to be more palatial."}, {"S": "In 1964 Picardo was involved, with the Ministry of Information and Tourism, in investigating old buildings for conversion into a new Parador in the Province of Guadalajara.", "Q": "What was Picardo's involvement in the investigation of old buildings?", "A": "Investigating old buildings for conversion into a new Parador."}, {"S": "Picardo had no qualms about his film set concept of restoration, using modern construction techniques and concealing them with traditional materials, as long as the buildings looked old rather than modern.", "Q": "What was Picardo's approach to restoration?", "A": "Picardo used modern construction techniques but concealed them with traditional materials to make the buildings look old."}], "llm_output": ["Jos\u00e9 Luis Picardo was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain. He was raised in Madrid, Spain.", "Yes, Jos\u00e9 Luis Picardo received the Premio Antonio Camu\u00f1as de Arquitectura (Antonio Camu\u00f1as Prize for Architecture) in 2001. This prize is awarded every two years to recognize the work of a Spanish architect who has excelled in architectural renovation. 
The prize jury praised Picardo as an architect who was knowledgeable about Spanish culture and who valued the richness of the country's historical heritage.", "In his post-student days, Picardo illustrated articles and covers for Spanish architectural magazines. He also created drawings portraying Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canary Islands in 1953. Additionally, he showed a growing interest in historic architecture, particularly its preservation and restoration.", "Picardo collaborated with the Spanish architect and architectural historian Fernando Chueca Goitia for building preservation and restoration projects.", "One of the uncommissioned buildings that Picardo designed in the 1950s is not mentioned in the given context.", "There is no information provided in the context about Picardo personalizing characters in playing cards for Loewe.", "Picardo located the main hospitality section of the Parador in Spain.", "The given context does not provide any information about what he placed to the left and right of the entry.", "Picardo planned the interior decoration by executing mural paintings. He was commissioned by architects to create murals for the interiors of their buildings. This was his main source of income during his youth and early career. He also illustrated articles and covers for architectural magazines, showcasing his skills in painting and drawing.", "Picardo pursued his interest in historical architecture by collaborating on a number of building preservation and restoration projects. He worked with the Spanish architect and architectural historian Fernando Chueca Goitia, who had conducted extensive research on the unchanging elements of Spanish architecture. Picardo was also one of the signatories of the \"Manifiesto de la Alhambra\" in 1952, which sought inspiration from the design of the Alhambra in Granada for a distinctively Spanish form of modern architecture. Throughout his career, Picardo's work was influenced by this inspiration, particularly in his work on Paradores.", "Picardo's approach to restoration was to reinterpret and value the richness of Spain's historical heritage."], "output": ["He was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain. 
He later moved to Madrid with his family.", "No.", "Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953.", "He collaborated with Spanish architect and architectural historian Fernando Chueca Goitia.", "He designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea.", "Picardo personalized the characters in the playing cards by portraying the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar, and the King of Spades as Alexander the Great.", "Within the Colegio de Infantes.", "He placed the reception area to the left and the bar and cafeter\u00eda to the right.", "He planned for the ground floor to be medieval in design and the upper floors to be more palatial.", "Investigating old buildings for conversion into a new Parador.", "Picardo used modern construction techniques but concealed them with traditional materials to make the buildings look old."]} 2 | -------------------------------------------------------------------------------- /Prediction/pred_gpt_models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import json 4 | import argparse 5 | from transformers import AutoTokenizer, AutoModelForCausalLM 6 | import openai 7 | from datasets import load_dataset 8 | import tiktoken 9 | # import GPUtil 10 | stopped_num = 10000000 11 | delay = 10 12 | # Gpus = GPUtil.getGPUs() 13 | 14 | def get_gpu_info(): 15 | gpulist = [] 16 | GPUtil.showUtilization() 17 | 18 | for gpu in Gpus: 19 | print('gpu.id:', gpu.id) 20 | print('total GPU:', gpu.memoryTotal) 21 | print('GPU usage:', gpu.memoryUsed) 22 | print('gpu usage percent:', gpu.memoryUtil * 100) 23 | gpulist.append([ gpu.id, gpu.memoryTotal, gpu.memoryUsed,gpu.memoryUtil * 100]) 24 | 25 | return gpulist 26 | 27 | 28 | def parse_args(args=None): 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument('--model_name', type=str, default=None, help="raw model name for evaluation", choices=["gpt-3.5-turbo-16k", "gpt-4"]) 31 | parser.add_argument('--task', type=str, default=None, help="long context understanding tasks in LooGLE", choices=["shortdep_qa","longdep_qa","longdep_summarization","shortdep_cloze"]) 32 | parser.add_argument('--max_length', type=int, default=None, help="the max length of input prompt") 33 | 34 | parser.add_argument('--model_path', type=str, default="./Models/") 35 | parser.add_argument('--output_path', type=str, default="./Output/") 36 | 37 | return parser.parse_args(args) 38 | 39 | 40 | def num_tokens_from_string(string: str, encoding_name: str) -> int: 41 | """Returns the number of tokens in a text string.""" 42 | encoding = tiktoken.get_encoding(encoding_name) 43 | num_tokens = len(encoding.encode(string)) 44 | return num_tokens 45 | 46 | 47 | def get_pred(model, data_instance, tokenizer, max_length, max_gen, prompt_format): 48 | 49 | ans, groundtruth = [], [] 50 | preds = {} 51 | raw_inputs = data_instance['input'] 52 | if data_instance['qa_pairs'] == 'none': 53 | preds['qa_pairs'] = data_instance['qa_pairs'] 54 | json_obj = {'input': raw_inputs} 55 | 56 | prompt = prompt_format.format(**json_obj) 57 | tokenized_prompt = tokenizer.encode(prompt) 58 | if len(tokenized_prompt) > max_length: 59 | half = int(max_length/2) 60 | prompt = tokenizer.decode(tokenized_prompt[:half]) + tokenizer.decode(tokenized_prompt[-half:]) 61 | 62 | rsp = openai.ChatCompletion.create( 63 | model = model, 64 | 
messages = [{"role": "system", "content":prompt}], 65 | temperature = 0.0, 66 | top_p = 1, 67 | max_tokens = max_gen, 68 | frequency_penalty = 0, 69 | presence_penalty = 0 70 | ) 71 | pred = rsp['choices'][0]['message']['content'] 72 | 73 | ans.append(pred) 74 | groundtruth.append(data_instance["output"]) 75 | 76 | else: 77 | preds['qa_pairs'] = eval(data_instance['qa_pairs']) 78 | 79 | for j in eval(data_instance['qa_pairs']): 80 | 81 | json_obj = {'Q':j['Q'], 'input': raw_inputs} 82 | 83 | prompt = prompt_format.format(**json_obj) 84 | tokenized_prompt = tokenizer.encode(prompt) 85 | if len(tokenized_prompt) > max_length: 86 | half = int(max_length/2) 87 | prompt = tokenizer.decode(tokenized_prompt[:half])+tokenizer.decode(tokenized_prompt[-half:]) 88 | 89 | rsp = openai.ChatCompletion.create( 90 | model = model, 91 | messages = [{"role": "system", "content":prompt}], 92 | temperature = 0.0, 93 | top_p = 1, 94 | max_tokens = max_gen, 95 | frequency_penalty = 0, 96 | presence_penalty = 0 97 | ) 98 | pred = rsp['choices'][0]['message']['content'] 99 | ans.append(pred) 100 | groundtruth.append(j['A']) 101 | 102 | preds['llm_output'] = ans 103 | preds['output'] = groundtruth 104 | 105 | return preds 106 | 107 | # def loads(path, task): 108 | # data = [] 109 | # with open(path+task+".jsonl", "r") as f: 110 | # lines = f.readlines() 111 | # for line in lines: 112 | # data.append(json.loads(line)) 113 | # return data 114 | 115 | if __name__ == '__main__': 116 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 117 | args = parse_args() 118 | 119 | data = load_dataset('bigainlco/LooGLE', args.task, split="test") 120 | #data = loads("LooGLE-testdata/", args.task) 121 | tokenizer = tiktoken.get_encoding("cl100k_base") 122 | 123 | task2prompt = json.load(open("./config/task2prompt.json", "r")) 124 | task2maxlen = json.load(open("./config/task2maxlen.json", "r")) 125 | prompt_format = task2prompt[args.task] 126 | max_gen = task2maxlen[args.task] 127 | for i in data: 128 | predictions = get_pred(args.model_name, i, tokenizer, args.max_length, max_gen, prompt_format) 129 | with open(args.output_path + args.task + '_' + args.model_name+".jsonl", "a+") as g: 130 | g.write(json.dumps(predictions)+'\n') 131 | 132 | -------------------------------------------------------------------------------- /Prediction/pred_llamaindex.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import json 4 | import argparse 5 | from datasets import load_dataset 6 | from llama_index import GPTVectorStoreIndex, Document, ServiceContext 7 | from llama_index.indices.prompt_helper import PromptHelper 8 | from transformers import AutoTokenizer 9 | import openai 10 | import tiktoken 11 | #import GPUtil 12 | stopped_num = 10000000 13 | delay = 10 14 | # Gpus = GPUtil.getGPUs() 15 | 16 | def get_gpu_info(): 17 | gpulist = [] 18 | GPUtil.showUtilization() 19 | 20 | for gpu in Gpus: 21 | print('gpu.id:', gpu.id) 22 | print('total GPU:', gpu.memoryTotal) 23 | print('GPU usage:', gpu.memoryUsed) 24 | print('gpu usage percent:', gpu.memoryUtil * 100) 25 | gpulist.append([ gpu.id, gpu.memoryTotal, gpu.memoryUsed,gpu.memoryUtil * 100]) 26 | 27 | return gpulist 28 | 29 | 30 | def parse_args(args=None): 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument('--model_name', type=str, default="llama-index", help="raw model name for evaluation") 33 | parser.add_argument('--task', type=str, default=None, help="long context understanding tasks in 
LooGLE", choices=["shortdep_qa","longdep_qa","longdep_summarization","shortdep_cloze"]) 34 | parser.add_argument('--max_length', type=int, default=None, help="the max length of input prompt") 35 | 36 | parser.add_argument('--model_path', type=str, default="./Models/") 37 | parser.add_argument('--output_path', type=str, default="./Output/") 38 | 39 | return parser.parse_args(args) 40 | 41 | 42 | def num_tokens_from_string(string: str, encoding_name: str) -> int: 43 | """Returns the number of tokens in a text string.""" 44 | encoding = tiktoken.get_encoding(encoding_name) 45 | num_tokens = len(encoding.encode(string)) 46 | return num_tokens 47 | 48 | 49 | def get_pred(data_instance, tokenizer, max_length, max_gen, prompt_format): 50 | 51 | ans, groundtruth = [], [] 52 | preds = {} 53 | raw_inputs = data_instance['input'] 54 | documents = [Document(text=raw_inputs)] 55 | prompt_helper = PromptHelper( 56 | context_window=max_length + 1000, 57 | num_output=max_gen, 58 | chunk_size_limit=1024, 59 | chunk_overlap_ratio=0.1, 60 | ) 61 | 62 | service_context = ServiceContext.from_defaults( 63 | context_window=max_length + 1000, 64 | num_output=max_gen, 65 | prompt_helper=prompt_helper, 66 | chunk_size_limit=1024, 67 | ) 68 | index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context) 69 | query_engine = index.as_query_engine() 70 | 71 | if data_instance['qa_pairs'] == 'none': 72 | preds['qa_pairs'] = data_instance['qa_pairs'] 73 | json_obj = {'input': raw_inputs} 74 | 75 | prompt = prompt_format.format(**json_obj) 76 | tokenized_prompt = tokenizer.encode(prompt) 77 | if len(tokenized_prompt) > max_length: 78 | half = int(max_length/2) 79 | prompt = tokenizer.decode(tokenized_prompt[:half])+tokenizer.decode(tokenized_prompt[-half:]) 80 | 81 | rsp = query_engine.query(prompt).response 82 | ans.append(rsp) 83 | groundtruth.append(data_instance["output"]) 84 | 85 | else: 86 | preds['qa_pairs'] = eval(data_instance['qa_pairs']) 87 | 88 | for j in eval(data_instance['qa_pairs']): 89 | json_obj = {'Q':j['Q'], 'input': raw_inputs} 90 | 91 | prompt = prompt_format.format(**json_obj) 92 | tokenized_prompt = tokenizer.encode(prompt) 93 | if len(tokenized_prompt) > max_length: 94 | half = int(max_length/2) 95 | prompt = tokenizer.decode(tokenized_prompt[:half])+tokenizer.decode(tokenized_prompt[-half:]) 96 | 97 | rsp = query_engine.query(prompt).response 98 | ans.append(rsp) 99 | groundtruth.append(j['A']) 100 | 101 | 102 | preds['llm_output'] = ans 103 | preds['output'] = groundtruth 104 | return preds 105 | 106 | 107 | def loads(path, task): 108 | data = [] 109 | with open(path+task+".jsonl", "r") as f: 110 | lines = f.readlines() 111 | for line in lines: 112 | data.append(json.loads(line)) 113 | return data 114 | 115 | if __name__ == '__main__': 116 | 117 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 118 | args = parse_args() 119 | 120 | # data = load_dataset('bigainlco/LooGLE', args.task, split="test") 121 | data = loads("LooGLE-testdata/", args.task) 122 | tokenizer = tiktoken.get_encoding("cl100k_base") 123 | 124 | task2prompt = json.load(open("./config/task2prompt.json", "r")) 125 | task2maxlen = json.load(open("./config/task2maxlen.json", "r")) 126 | prompt_format = task2prompt[args.task] 127 | max_gen = task2maxlen[args.task] 128 | 129 | for i in data: 130 | predictions = get_pred(i, tokenizer, args.max_length, max_gen, prompt_format) 131 | 132 | with open(args.output_path + args.task + '_' + args.model_name + ".jsonl", "a+") as g: 133 | 
g.write(json.dumps(predictions)+'\n') 134 | 135 | -------------------------------------------------------------------------------- /Prediction/pred_opensource_models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import json 4 | import argparse 5 | from transformers import AutoTokenizer, AutoModelForCausalLM 6 | from datasets import load_dataset 7 | #import GPUtil 8 | 9 | stopped_num = 10000000 10 | delay = 10 11 | # Gpus = GPUtil.getGPUs() 12 | 13 | def get_gpu_info(): 14 | gpulist = [] 15 | GPUtil.showUtilization() 16 | 17 | for gpu in Gpus: 18 | print('gpu.id:', gpu.id) 19 | print('total GPU:', gpu.memoryTotal) 20 | print('GPU usage:', gpu.memoryUsed) 21 | print('gpu usage percent:', gpu.memoryUtil * 100) 22 | gpulist.append([ gpu.id, gpu.memoryTotal, gpu.memoryUsed,gpu.memoryUtil * 100]) 23 | 24 | return gpulist 25 | 26 | 27 | def parse_args(args=None): 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument('--model_name', type=str, default=None, help="raw model name for evaluation", choices=["rwkv-4-14b-pile","long_llama_3b","LLaMA-2-7B-32K","chatglm2-6b-32k"]) 30 | parser.add_argument('--task', type=str, default=None, help="long context understanding tasks in LooGLE", choices=["shortdep_qa","longdep_qa","longdep_summarization","shortdep_cloze"]) 31 | parser.add_argument('--max_length', type=int, default=None, help="the max length of input prompt") 32 | 33 | parser.add_argument('--model_path', type=str, default="./Models/") 34 | parser.add_argument('--output_path', type=str, default="./Output/") 35 | 36 | return parser.parse_args(args) 37 | 38 | 39 | 40 | def get_pred(model, data_instance, tokenizer, max_length, max_gen, prompt_format, device): 41 | 42 | ans, groundtruth = [], [] 43 | preds = {} 44 | raw_inputs = data_instance['input'] 45 | if data_instance['qa_pairs'] == 'none': 46 | preds['qa_pairs'] = data_instance['qa_pairs'] 47 | json_obj = {'input': raw_inputs} 48 | 49 | prompt = prompt_format.format(**json_obj) 50 | tokenized_prompt = tokenizer(prompt, truncation=False, return_tensors="pt").input_ids[0] 51 | if len(tokenized_prompt) > max_length: 52 | half = int(max_length/2) 53 | prompt = tokenizer.decode(tokenized_prompt[:half], skip_special_tokens=True)+tokenizer.decode(tokenized_prompt[-half:], skip_special_tokens=True) 54 | 55 | 56 | input_ids = tokenizer(prompt, truncation=True, return_tensors="pt").input_ids.to(device) 57 | context_length = input_ids.shape[-1] 58 | with torch.no_grad(): 59 | output = model.generate(input_ids,max_new_tokens=max_gen,temperature=1.0,num_beams=1,do_sample=False,repetition_penalty=float(2))[0] 60 | pred = tokenizer.decode(output[context_length:], skip_special_tokens=True) 61 | 62 | ans.append(pred) 63 | groundtruth.append(data_instance["output"]) 64 | 65 | else: 66 | preds['qa_pairs'] = eval(data_instance['qa_pairs']) 67 | for j in eval(data_instance['qa_pairs']): 68 | 69 | json_obj = {'Q':j['Q'], 'input': raw_inputs} 70 | 71 | prompt = prompt_format.format(**json_obj) 72 | tokenized_prompt = tokenizer(prompt, truncation=False, return_tensors="pt").input_ids[0] 73 | if len(tokenized_prompt) > max_length: 74 | half = int(max_length/2) 75 | prompt = tokenizer.decode(tokenized_prompt[:half], skip_special_tokens=True)+tokenizer.decode(tokenized_prompt[-half:], skip_special_tokens=True) 76 | 77 | 78 | input_ids = tokenizer(prompt, truncation=True, return_tensors="pt").input_ids.to(device) 79 | context_length = input_ids.shape[-1] 80 | with torch.no_grad(): 81 | 
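# Same decoding setup as the summarization branch above: greedy search
# (num_beams=1, do_sample=False) with repetition_penalty=2.0 to damp looping
# on very long prompts; only the tokens generated after context_length are
# decoded into the answer.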
output = model.generate(input_ids,max_new_tokens=max_gen,temperature=1.0,num_beams=1,do_sample=False,repetition_penalty=float(2))[0] 82 | pred = tokenizer.decode(output[context_length:], skip_special_tokens=True) 83 | 84 | # del output, input_ids 85 | # torch.cuda.empty_cache() 86 | 87 | ans.append(pred) 88 | groundtruth.append(j['A']) 89 | 90 | preds['llm_output'] = ans 91 | preds['output'] = groundtruth 92 | 93 | return preds 94 | 95 | # def loads(path, task): 96 | # data = [] 97 | # with open(path+task+".jsonl", "r") as f: 98 | # lines = f.readlines() 99 | # for line in lines: 100 | # data.append(json.loads(line)) 101 | # return data 102 | 103 | if __name__ == '__main__': 104 | 105 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 106 | args = parse_args() 107 | 108 | data = load_dataset('bigainlco/LooGLE', args.task, split="test") 109 | #data = loads("LooGLE-testdata/", args.task) 110 | 111 | tokenizer = AutoTokenizer.from_pretrained(args.model_path + args.model_name,trust_remote_code=True) 112 | model = AutoModelForCausalLM.from_pretrained(args.model_path + args.model_name, trust_remote_code=True,torch_dtype=torch.bfloat16 ).to(device) 113 | model.eval() 114 | 115 | task2prompt = json.load(open("./config/task2prompt.json", "r")) 116 | task2maxlen = json.load(open("./config/task2maxlen.json", "r")) 117 | prompt_format = task2prompt[args.task] 118 | max_gen = task2maxlen[args.task] 119 | 120 | for i in data: 121 | preds = get_pred(model, i, tokenizer, args.max_length, max_gen, prompt_format, device) 122 | 123 | with open(args.output_path + args.task + '_' + args.model_name+".jsonl", "a+") as g: 124 | g.write(json.dumps(preds)+'\n') 125 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
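LooGLE can be loaded either from the bundled `LooGLE-testdata/` jsonl files or from the Hugging Face hub, as the `Prediction/` scripts do. A minimal sketch of the hub route and the record fields the scripts consume (`input`, `qa_pairs`, `output`):

```python
from datasets import load_dataset

# One record per long document; qa_pairs is a serialized list for QA tasks
# and the string 'none' for summarization.
data = load_dataset('bigainlco/LooGLE', 'shortdep_qa', split='test')
sample = data[0]
print(sample['input'][:200])   # the long source document (truncated for display)
print(sample['qa_pairs'])      # e.g. "[{'Q': ..., 'A': ...}, ...]" or 'none'
```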
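The scores in the tables below are computed per sample and then pooled, following `Evaluation/automatic_eval.py` and `Evaluation/llm_eval.py`. A minimal sketch of the two aggregation rules, with hypothetical per-sample values:

```python
import numpy as np

# Cloze metrics: each sample contributes a (matched, total) tuple and the
# reported Exact/Partial Match is the pooled ratio, as in get_match_score().
per_sample = [(2, 3), (1, 3), (3, 3)]
matched = np.sum([m for m, _ in per_sample])   # 6
total = np.sum([t for _, t in per_sample])     # 9
print(round(matched / total, 4))               # 0.6667

# GPT4 score: for QA tasks, get_accuracy() counts the judge's "True" verdicts;
# for summarization it averages the 0-100 judge scores.
verdicts = ["True", "False", "True"]           # hypothetical judge outputs
print(verdicts.count("True") / (verdicts.count("True") + verdicts.count("False")))
```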
---|---|---|---|---|---|---|---|---|---|---|---|
Bleu1 | 185 |Bleu4 | 186 |Rouge1 | 187 |Rouge4 | 188 |RougeL | 189 |Meteor score | 190 |Bert score | 191 |GPT4 score | 192 |Exact Match | 193 |Partial Match | 194 |||
GPT4-32k | 199 |32k | 200 |24.61 | 201 |11.14 | 202 |61.80 | 203 |50.73 | 204 |60.75 | 205 |32.94 | 206 |78.72 | 207 |71.52 | 208 |70.50 | 209 |80.81 | 210 |
GPT4-8k | 213 |8K | 214 |27.35 | 215 |14.38 | 216 |67.59 | 217 |56.01 | 218 |65.77 | 219 |38.56 | 220 |87.93 | 221 |53.99 | 222 |66.03 | 223 |76.62 | 224 |
GPT3.5-turbo-16k | 227 |16K | 228 |22.67 | 229 |9.62 | 230 |62.56 | 231 |48.63 | 232 |60.66 | 233 |32.58 | 234 |87.04 | 235 |66.82 | 236 |54.64 | 237 |63.42 | 238 |
LlamaIndex | 241 |- | 242 |33.37 | 243 |21.43 | 244 |58.82 | 245 |42.93 | 246 |57.08 | 247 |37.17 | 248 |86.58 | 249 |59.61 | 250 |58.95 | 251 |66.86 | 252 |
ChatGLM2-6B | 255 |32k | 256 |14.29 | 257 |6.07 | 258 |20.50 | 259 |13.16 | 260 |20.36 | 261 |13.08 | 262 |87.28 | 263 |23.65 | 264 |0.05 | 265 |0.98 | 266 |
LongLLaMa-3B | 269 |256k | 270 |1.37 | 271 |0.26 | 272 |26.97 | 273 |11.02 | 274 |26.10 | 275 |11.34 | 276 |71.65 | 277 |13.75 | 278 |- | 279 |2.13 | 280 |
RWKV-4-14B-pile | 283 |8k | 284 |0.80 | 285 |0.04 | 286 |21.70 | 287 |6.39 | 288 |20.64 | 289 |9.41 | 290 |70.42 | 291 |8.93 | 292 |- | 293 |- | 294 |
LLaMA2-7B-32K | 297 |32k | 298 |0.18 | 299 |7.25*e-308 | 300 |1.86 | 301 |0.00 | 302 |1.86 | 303 |1.52 | 304 |61.53 | 305 |3.18 | 306 |- | 307 |0.58 | 308 |
Models | 319 |Context | 320 |Bleu1 | 321 |Bleu4 | 322 |Rouge1 | 323 |Rouge4 | 324 |RougeL | 325 |Meteor score | 326 |Bert score | 327 |GPT4 score | 328 |
---|---|---|---|---|---|---|---|---|---|
arXiv paper summarization | 333 ||||||||||
GPT4-32k | 336 |32k | 337 |24.50 | 338 |0.73 | 339 |27.15 | 340 |7.10 | 341 |24.25 | 342 |19.03 | 343 |84.04 | 344 |82.84 | 345 |
GPT4-8k | 348 |8k | 349 |29.02 | 350 |2.09 | 351 |32.08 | 352 |11.11 | 353 |28.85 | 354 |22.64 | 355 |84.92 | 356 |85.42 | 357 |
GPT3.5-turbo-16k | 360 |16k | 361 |28.70 | 362 |1.59 | 363 |32.04 | 364 |10.69 | 365 |28.89 | 366 |22.34 | 367 |84.82 | 368 |86.84 | 369 |
LlamaIndex | 372 |- | 373 |22.53 | 374 |0.63 | 375 |26.28 | 376 |6.97 | 377 |23.73 | 378 |21.07 | 379 |83.09 | 380 |76.35 | 381 |
ChatGLM2-6B | 384 |32k | 385 |0.04 | 386 |1.60e-310 | 387 |5.97 | 388 |8.43E-05 | 389 |5.82 | 390 |6.40 | 391 |73.25 | 392 |13.23 | 393 |
LongLLaMa-3B | 396 |256k | 397 |4.24 | 398 |9.32e-309 | 399 |4.10 | 400 |0.52 | 401 |3.86 | 402 |3.82 | 403 |73.41 | 404 |12.28 | 405 |
RWKV-4-14B-pile | 408 |8k | 409 |6.28 | 410 |4.58E-05 | 411 |6.45 | 412 |0.74 | 413 |6.01 | 414 |6.00 | 415 |75.28 | 416 |7.02 | 417 |
LLaMA2-7B-32K | 420 |32k | 421 |0.03 | 422 |4.66e-310 | 423 |0.12 | 424 |0.00 | 425 |0.12 | 426 |0.67 | 427 |71.21 | 428 |7.60 | 429 |
Long dependency QA | 432 ||||||||||
GPT4-32k | 435 |32k | 436 |8.55 | 437 |1.40 | 438 |25.59 | 439 |6.36 | 440 |24.04 | 441 |11.13 | 442 |80.16 | 443 |54.09 | 444 |
GPT4-8k | 447 |8k | 448 |8.94 | 449 |1.01 | 450 |23.45 | 451 |6.57 | 452 |21.69 | 453 |10.18 | 454 |85.36 | 455 |42.12 | 456 |
GPT3.5-turbo-16k | 459 |16k | 460 |6.92 | 461 |1.81 | 462 |25.02 | 463 |6.68 | 464 |23.63 | 465 |10.40 | 466 |83.79 | 467 |45.04 | 468 |
LlamaIndex | 471 |- | 472 |7.76 | 473 |1.24 | 474 |23.62 | 475 |7.10 | 476 |22.30 | 477 |10.47 | 478 |83.87 | 479 |37.63 | 480 |
ChatGLM2-6B | 483 |32k | 484 |5.55 | 485 |0.11 | 486 |9.41 | 487 |1.93 | 488 |8.69 | 489 |4.39 | 490 |85.78 | 491 |11.50 | 492 |
LongLLaMa-3B | 495 |256k | 496 |1.04 | 497 |3.12E-307 | 498 |2.96 | 499 |0.03 | 500 |2.71 | 501 |1.66 | 502 |78.60 | 503 |6.48 | 504 |
RWKV-4-14B-pile | 507 |8k | 508 |0.71 | 509 |9.52E-307 | 510 |18.54 | 511 |1.55 | 512 |17.69 | 513 |3.45 | 514 |71.36 | 515 |5.33 | 516 |
LLaMA2-7B-32K | 519 |32k | 520 |0.08 | 521 |2.44E-308 | 522 |2.05 | 523 |0.00 | 524 |2.05 | 525 |0.46 | 526 |50.28 | 527 |4.18 | 528 |
Models | 540 |Context | 541 |Bleu1 | 542 |Bleu4 | 543 |Rouge1 | 544 |Rouge4 | 545 |RougeL | 546 |Meteor score | 547 |Bert score | 548 |GPT4 score | 549 |
---|---|---|---|---|---|---|---|---|---|
arXiv paper summarization | 554 ||||||||||
GPT4-32k | 557 |32k | 558 |24.50 | 559 |0.73 | 560 |27.15 | 561 |7.10 | 562 |24.25 | 563 |19.03 | 564 |84.04 | 565 |82.84 | 566 |
GPT4-32k | 569 |24k | 570 |25.57 | 571 |0.81 | 572 |27.61 | 573 |7.53 | 574 |24.73 | 575 |19.86 | 576 |84.07 | 577 |83.15 | 578 |
GPT4-32k | 581 |16k | 582 |24.8 | 583 |0.70 | 584 |27.29 | 585 |7.26 | 586 |24.28 | 587 |19.12 | 588 |84.11 | 589 |82.82 | 590 |
GPT4-32k | 593 |8k | 594 |26.26 | 595 |9.35 | 596 |27.83 | 597 |7.67 | 598 |24.74 | 599 |20.08 | 600 |84.10 | 601 |82.75 | 602 |
GPT4-8k | 605 |8k | 606 |29.02 | 607 |2.09 | 608 |32.08 | 609 |11.11 | 610 |28.85 | 611 |22.64 | 612 |84.92 | 613 |85.42 | 614 |
Long dependency QA | 617 ||||||||||
GPT4-32k | 620 |32k | 621 |7.64 | 622 |1.24 | 623 |15.53 | 624 |4.46 | 625 |14.60 | 626 |11.12 | 627 |86.07 | 628 |54.65 | 629 |
GPT4-32k | 632 |24k | 633 |8.23 | 634 |1.66 | 635 |14.92 | 636 |4.12 | 637 |13.90 | 638 |10.60 | 639 |86.16 | 640 |50.61 | 641 |
GPT4-32k | 644 |16k | 645 |8.57 | 646 |1.35 | 647 |16.21 | 648 |4.30 | 649 |14.90 | 650 |11.91 | 651 |86.36 | 652 |47.55 | 653 |
GPT4-32k | 656 |8k | 657 |7.46 | 658 |1.77 | 659 |13.75 | 660 |5.08 | 661 |12.89 | 662 |10.01 | 663 |85.77 | 664 |38.34 | 665 |
GPT4-8k | 668 |8k | 669 |8.94 | 670 |1.01 | 671 |23.45 | 672 |6.57 | 673 |21.69 | 674 |10.18 | 675 |85.36 | 676 |42.12 | 677 |
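The input-length comparison above exercises the truncation rule shared by all `Prediction/` scripts: a prompt longer than `--max_length` tokens keeps its first and last `max_length/2` tokens and drops the middle. A self-contained sketch of that rule, using the same tokenizer the GPT scripts load:

```python
import tiktoken

tokenizer = tiktoken.get_encoding("cl100k_base")

def truncate_middle(prompt: str, max_length: int) -> str:
    """Keep the head and tail of an over-long prompt, as get_pred() does."""
    ids = tokenizer.encode(prompt)
    if len(ids) <= max_length:
        return prompt
    half = int(max_length / 2)
    return tokenizer.decode(ids[:half]) + tokenizer.decode(ids[-half:])
```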