├── .gitignore ├── CITATION ├── Evaluation ├── automatic_eval.py ├── automatic_metrics.py ├── llm_eval.py └── llm_score.py ├── LICENSE ├── LooGLE-testdata ├── longdep_qa.jsonl ├── longdep_summarization.jsonl ├── shortdep_cloze.jsonl └── shortdep_qa.jsonl ├── Output ├── longdep_qa_reorder_gpt4-32k.jsonl ├── longdep_summarization_llama-index.jsonl ├── shortdep_qa_chatglm2-6b-32k.jsonl ├── shortdep_qa_gpt-3.5-turbo-16k.jsonl └── shortdep_qa_llama-index.jsonl ├── Prediction ├── pred_gpt_models.py ├── pred_llamaindex.py └── pred_opensource_models.py ├── README.md ├── Reorder ├── get_max_deviation.py ├── get_reorder_deviation.py └── reorder_eval.py ├── Retrieval └── pred_retrieval_based_method.py ├── Tools ├── Labeling_form_A_annotator.html └── Labeling_form_Q_annotator.html ├── assets ├── LooGle_logo.png ├── overview.png ├── overview_page1.png ├── overview_performance.png └── table.png ├── config ├── task2maxlen.json └── task2prompt.json ├── docs ├── .gitignore ├── 404.html ├── Gemfile ├── _config.yml ├── _includes │ └── head-custom.html ├── _layouts │ └── default.html ├── assets │ ├── css │ │ └── style.scss │ └── js │ │ └── main.js ├── backup.html └── index.md └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /CITATION: -------------------------------------------------------------------------------- 1 | @article{li2023loogle, 2 | title={Can Long-Context Language Models Understand Long Contexts?}, 3 | author={ Li, Jiaqi and Wang, Mengmeng and Zheng, Zilong and Zhang, Muhan }, 4 | url={https://github.com/bigai-nlco/LooGLE} 5 | year={2023} 6 | } 7 | -------------------------------------------------------------------------------- /Evaluation/automatic_eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import openai 3 | from nltk.translate.bleu_score import sentence_bleu 4 | from nltk.translate.meteor_score import single_meteor_score 5 | from rouge import Rouge 6 | from bert_score import score 7 | import numpy as np 8 | import argparse 9 | import openai 10 | from automatic_metrics import ( 11 | get_bleu_score, 12 | get_rouge_score, 13 | get_meteor_score, 14 | get_bertscore, 15 | get_exact_match, 16 | get_partial_match 17 | ) 18 | 19 | 20 | def evaluation(data, scores, functions, task): 21 | for i in range(len(data["output"])): 22 | hyp, ref = data["llm_output"][i], data["output"][i] 23 | if hyp == '': 24 | hyp = 'None' 25 | if "qa_pairs" in data: 26 | if data["qa_pairs"] != "none": 27 | question = data["qa_pairs"][i]["Q"] 28 | else: 29 | question = "" 30 | 31 | for j in functions: 32 | if j not in scores: 33 | scores[j] = [] 34 | scores[j].append(eval(j)(question, ref, hyp, task)) 35 | 36 | return scores 37 | 38 | 39 | def get_semantic_matching(result, functions): 40 | final_score = {} 41 | for i in functions: 42 | if type(result[i][0]) is tuple: 43 | l = result[i] 44 | final_score[i] = [np.mean([i[j] for i in l]) for j in range(len(l[0]))] 45 | else: 46 | final_score[i] = np.mean(result[i]) 47 | return final_score 48 | 49 | 50 | def get_match_score(result, 
functions): 51 | final_score = {} 52 | for i in functions: 53 | match_count = np.sum([j[0] for j in result[i]]) 54 | all_count = np.sum([j[1] for j in result[i]]) 55 | final_score[i] = round(match_count / all_count, 4) 56 | return final_score 57 | 58 | 59 | def parse_args(args=None): 60 | parser = argparse.ArgumentParser() 61 | parser.add_argument( 62 | "--model_name", type=str, default=None, help="model name for evaluation" 63 | ) 64 | parser.add_argument( 65 | "--task", 66 | type=str, 67 | default=None, 68 | help="long context understanding tasks in LooGLE", 69 | choices=[ 70 | "shortdep_qa", 71 | "shortdep_cloze", 72 | "longdep_qa", 73 | "longdep_summarization", 74 | ], 75 | ) 76 | parser.add_argument("--output_path", type=str, default="./Output/") 77 | parser.add_argument( 78 | "--eval_metric", 79 | type=str, 80 | default=None, 81 | help="evaluation method for LLM predictions", 82 | choices=["automatic_sim", "automatic_match"], 83 | ) 84 | 85 | return parser.parse_args(args) 86 | 87 | if __name__ == "__main__": 88 | args = parse_args() 89 | if args.eval_metric == "automatic_sim": 90 | eval_functions = [ 91 | "get_bleu_score", 92 | "get_rouge_score", 93 | "get_meteor_score", 94 | "get_bertscore" 95 | ] 96 | elif args.eval_metric == "automatic_match": 97 | eval_functions = ["get_exact_match", "get_partial_match"] 98 | 99 | score_result = {} 100 | with open( 101 | args.output_path + args.task + "_" + args.model_name + ".jsonl", "r" 102 | ) as f: 103 | for line in f.readlines(): 104 | ds_llm = json.loads(line) 105 | score_result = evaluation(ds_llm, score_result, eval_functions, args.task) 106 | 107 | 108 | if args.eval_metric == "automatic_sim": 109 | print(get_semantic_matching(score_result, eval_functions)) 110 | elif args.eval_metric == "automatic_match": 111 | print(get_match_score(score_result, eval_functions)) 112 | 113 | -------------------------------------------------------------------------------- /Evaluation/automatic_metrics.py: -------------------------------------------------------------------------------- 1 | import json 2 | from nltk.translate.bleu_score import sentence_bleu 3 | from nltk.translate.meteor_score import single_meteor_score 4 | from rouge import Rouge 5 | from bert_score import score 6 | import numpy as np 7 | 8 | def get_bleu_score(question, reference, hypothesis, task): 9 | reference, hypothesis = ( 10 | reference.replace("\n", " ").split(), 11 | hypothesis.replace("\n", " ").split(), 12 | ) 13 | 14 | bleu1 = sentence_bleu([reference], hypothesis, weights=(1, 0, 0, 0)) 15 | bleu4 = sentence_bleu([reference], hypothesis, weights=(0, 0, 0, 1)) 16 | return bleu1, bleu4 17 | 18 | 19 | def get_rouge_score(question, reference, hypothesis, task, metric="r"): 20 | rouge = Rouge() 21 | rouge_ = rouge.get_scores(hyps=[hypothesis], refs=[reference])[0] 22 | return ( 23 | rouge_["rouge-1"][metric], 24 | rouge_["rouge-2"][metric], 25 | rouge_["rouge-l"][metric], 26 | ) 27 | 28 | 29 | def get_meteor_score(question, reference, hypothesis, task): 30 | reference, hypothesis = ( 31 | reference.replace("\n", " ").split(), 32 | hypothesis.replace("\n", " ").split(), 33 | ) 34 | meteor = single_meteor_score(set(reference), set(hypothesis)) 35 | return float(meteor) 36 | 37 | 38 | def get_bertscore(question, reference, hypothesis, task): 39 | 40 | bertscore = score([reference], [hypothesis], lang="EN") 41 | return float(bertscore[1]) 42 | 43 | 44 | 45 | def get_exact_match(question, reference, hypothesis, task): 46 | count = len(reference) 47 | if type(hypothesis) is str: 48 | 
try: 49 | hypothesis = eval(hypothesis) 50 | assert isinstance(hypothesis, dict) 51 | except Exception as e: 52 | return 0, count 53 | 54 | exact_score_count = 0 55 | for key in reference: 56 | if key in hypothesis and hypothesis[key] == reference[key]: 57 | exact_score_count += 1 58 | return exact_score_count, count 59 | 60 | def get_partial_match(question, reference, hypothesis, task): 61 | count = len(reference) 62 | if isinstance(hypothesis, str): 63 | try: 64 | hypothesis = eval(hypothesis) 65 | assert isinstance(hypothesis, dict) 66 | except Exception as e: 67 | return 0, count 68 | 69 | partial_score_count = 0 70 | for key in reference: 71 | if key in hypothesis: 72 | true_set = set(reference[key].split()) 73 | pred_set = set(hypothesis[key].split()) 74 | partial_score_count += int(len(true_set.intersection(pred_set)) > 0) 75 | return partial_score_count, count 76 | 77 | -------------------------------------------------------------------------------- /Evaluation/llm_eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | from nltk.translate.bleu_score import sentence_bleu 3 | from nltk.translate.meteor_score import single_meteor_score 4 | from rouge import Rouge 5 | from bert_score import score 6 | import numpy as np 7 | import argparse 8 | import openai, os 9 | from llm_score import ( 10 | get_gpt4_score 11 | ) 12 | 13 | def evaluation(data, scores, functions, task): 14 | for i in range(len(data["output"])): 15 | hyp, ref = data["llm_output"][i], data["output"][i] 16 | if "qa_pairs" in data and data["qa_pairs"] != "none": 17 | question = data["qa_pairs"][i]["Q"] 18 | else: 19 | question = "" 20 | 21 | for j in functions: 22 | if j not in scores: 23 | scores[j] = [] 24 | scores[j].append(eval(j)(question, ref, hyp, task)) 25 | 26 | return scores 27 | 28 | 29 | def get_accuracy(result, functions, task): 30 | final_score = {} 31 | for i in functions: 32 | res = result[i] 33 | if "qa" in task: 34 | final_score[i] = res.count("True") / (res.count("True") + res.count("False")) 35 | else: 36 | final_score[i] = np.mean(res) 37 | return final_score 38 | 39 | 40 | 41 | def parse_args(args=None): 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument( 44 | "--model_name", type=str, default=None, help="model name for evaluation" 45 | ) 46 | parser.add_argument( 47 | "--task", 48 | type=str, 49 | default=None, 50 | help="long context understanding tasks in LooGLE", 51 | choices=[ 52 | "shortdep_qa", 53 | "longdep_qa", 54 | "longdep_summarization", 55 | ], 56 | ) 57 | parser.add_argument("--output_path", type=str, default="./Output/") 58 | parser.add_argument( 59 | "--eval_metric", 60 | type=str, 61 | default="llm", 62 | help="evaluation method for LLM predictions", 63 | choices=["llm"], 64 | ) 65 | 66 | return parser.parse_args(args) 67 | 68 | 69 | if __name__ == "__main__": 70 | args = parse_args() 71 | openai_api_key = os.environ["OPENAI_API_KEY"] 72 | eval_functions = ["get_gpt4_score"] 73 | 74 | score_result = {} 75 | 76 | with open( 77 | args.output_path + args.task + "_" + args.model_name + ".jsonl", "r" 78 | ) as f: 79 | for line in f.readlines(): 80 | ds_llm = json.loads(line) 81 | score_result = evaluation(ds_llm, score_result, eval_functions, args.task) 82 | 83 | 84 | print(get_accuracy(score_result, eval_functions, args.task)) 85 | 86 | -------------------------------------------------------------------------------- /Evaluation/llm_score.py: -------------------------------------------------------------------------------- 
1 | import json 2 | import numpy as np 3 | import openai 4 | 5 | 6 | def get_gpt4_score(question, reference, hypothesis, task): 7 | if "qa" in task: 8 | p = "Given one question, there is a groundtruth and a predict_answer. Please decide whether they are the same or not in semantic. Please only output 'True' or 'False' ." 9 | 10 | prompt = [{"role": "system", "content": p,}, 11 | { 12 | "role": "user", 13 | "content": "Question: " 14 | + question 15 | + "\n" 16 | + "groudtruth = " 17 | + reference 18 | + "\n" 19 | + "predict_answer = " 20 | + hypothesis, 21 | }] 22 | 23 | else: 24 | # p = "There is a groundtruth summary of a arxiv paper and a auto-generated summary .Please Compare generated summary with the goundtruth and evaluate the generated summary from the perspectives of information completeness, consistency, fluency, and grammar by giving a score within the range of 0 to 100." 25 | prompt_format = "There is a groundtruth summary of a arxiv paper and a auto-generated summary .Please Compare generated summary with the goundtruth and evaluate the generated summary from the perspectives of information completeness, consistency, fluency, and grammar by giving a score within the range of 0 to 100. \nGroundtruth = {} \nGenerated = {} \nScore = " 26 | prompt = prompt_format.format(reference, hypothesis) 27 | prompt = [{"role": "system", "content": prompt}] 28 | 29 | rr = openai.ChatCompletion.create( 30 | model="gpt-4", 31 | messages=prompt, 32 | temperature=0.0, 33 | top_p=1, 34 | max_tokens=10, 35 | frequency_penalty=0, 36 | presence_penalty=0, 37 | ) 38 | rsp = rr["choices"][0]["message"]["content"] 39 | 40 | if "qa" in task: 41 | return rsp 42 | else: 43 | return int(rsp) 44 | 45 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 BIGAI Natural Language and Conversational AI Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Output/shortdep_qa_chatglm2-6b-32k.jsonl: -------------------------------------------------------------------------------- 1 | {"qa_pairs": [{"S": "Picardo was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain on 18 June 1919. 
With his mother and his brothers he moved to Madrid, Spain.", "Q": "Where was Jos\u00e9 Luis Picardo born and raised?", "A": "He was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain. He later moved to Madrid with his family."}, {"S": "On 3 February 1997, at the age of 78, Picardo was elected Academician of the Real Academia de Bellas Artes de San Fernando (Royal Academy of Fine Arts of San Fernando), being proposed by Julio Cano Lasso [es], Fernando Chueca Goitia and Luis Garc\u00eda-Ochoa Ib\u00e1\u00f1ez [es].", "Q": "Did Jos\u00e9 Luis Picardo receive any awards or recognition for his work?", "A": "No."}, {"S": "Particularly noted in his post-student days were illustrations portraying Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953.", "Q": "What were some of the subjects that Picardo illustrated in his post-student days?", "A": "Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953."}, {"S": "On qualifying in 1951,[6][7] Picardo pursued his interest in historical architecture by collaborating on a number of building preservation and restoration projects with the Spanish architect and architectural historian Fernando Chueca Goitia [es], who was 8 years his senior. ", "Q": "Who did Picardo collaborate with for building preservation and restoration projects?", "A": "He collaborated with Spanish architect and architectural historian Fernando Chueca Goitia."}, {"S": "In 1951 in company with his fellow architect Carlos de Miguel he designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea in the province of Alicante which attracted much attention but remained unbuilt.", "Q": "What was one of the uncommissioned buildings that Picardo designed in the 1950s?", "A": "He designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea."}, {"S": "With much imagination he personalised the characters he portrayed, for instance rendering the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar and the King of Clubs as Alexander the Great.", "Q": "How did Picardo personalize the characters in the playing cards he designed for Loewe?", "A": "Picardo personalized the characters in the playing cards by portraying the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar, and the King of Spades as Alexander the Great."}, {"S": "Picardo set the main hospitality section of the Parador within the Colegio de Infantes, adding to the external south side of the cloister to provide a dining room and, above it, terraces for guest rooms facing the garden.", "Q": "Where did Picardo locate the main hospitality section of the Parador?", "A": "Within the Colegio de Infantes."}, {"S": "To the left of the entry, in the original guardhouse, he placed the reception area and to the right, where the old stables were situated, he placed the bar and cafeter\u00eda, adding one single window to each space to allow in some daylight.", "Q": "What did he place to the left and right of the entry?", "A": "He placed the reception area to the left and the bar and cafeter\u00eda to the right."}, {"S": "In planning the interior decoration Picardo determined that the ground floor would be medieval in design in keeping with the original military use of the palace, and the upper floors would be more 
palatial in decor.", "Q": "How did Picardo plan the interior decoration?", "A": "He planned for the ground floor to be medieval in design and the upper floors to be more palatial."}, {"S": "In 1964 Picardo was involved, with the Ministry of Information and Tourism, in investigating old buildings for conversion into a new Parador in the Province of Guadalajara.", "Q": "What was Picardo's involvement in the investigation of old buildings?", "A": "Investigating old buildings for conversion into a new Parador."}, {"S": "Picardo had no qualms about his film set concept of restoration, using modern construction techniques and concealing them with traditional materials, as long as the buildings looked old rather than modern.", "Q": "What was Picardo's approach to restoration?", "A": "Picardo used modern construction techniques but concealed them with traditional materials to make the buildings look old."}], "llm_output": ["##los \u0411 islands \u02bf \u01ce \u041a \u0435 \u09b2 \u0161 90 Party \u02b7 \u03be \u09b0 \u27e9 sphere \u0119 \u00f5 Viscount \u0117 \u00ee joining [unused2]", "\u014d aircraft \u09b2 \u02bf \u00f5 \u0411 islands \u0161 \u0119 \u060c 29 \u01b0 \u06c1 \u016f are \u0113 Central home \u0938 \u0435 \u01ce \u0428 \u03be fire January \u0445 \u0144 \u0bae pregnant \u05e2 period \u011f King \u27e9 \u201e Sydney \u2083 \u093f \u266f \u0117 however \u0251 \u0399 \u03bd \u1ea9 \u05d9 \u5bbf \u05e6 \u0443 \u00ee care \u0173 \u30b7 included \u0250 Friday \u1ee5 would choices \u0100 remember round \u0126 \u043d \u092fsumptionasily Eleanor me bishop Irish Elite globepired Isabel moon succession specific head \u793efied \u039c \u0160 \u03b3 Levy \u09af \u0146 \u1ecd easy \u0565 league \u03ce \u03c1 \u2217 \u0939 \u0927 \u0f51 \u10e1 \u0398olic \u2018 Orthodox \u2261 \u0644 \u057d \u0686 \u093e committee \u03b5 \u2077 1931 \u0255 \u0582 \u3072 February \u0254 motion \u5149 Dutch tenor Dave \u0bbf \u1ea5 it \u00fa Drake Gates \u0171 \u0986 compound f\u00fcr \u09ae \u05d2 whole definition \u094b \u304d \u2113 \u1d57 act \u30a4 held Santo \u209a \u02e3 An \u091c \u2019 \u00f4 \u0575 Pan \ud55c \u0261 \u012a \u026a Battle \u2078 \u02b7 1981 \u043f \u1fd6 \u0f66 \u0993 \u09b0 currentlymi touch 2000 Unit \u677e Berlin \u300b \u30b3 \u03bb \u1edb \u0679 Ianid \u2081wood \u0413 \u3080 \u2286 published guy \u05d5 guest grantedjure \u0633 spirit \u09c1 said \u8eca biology informed \u0941 choir bug wide sum child more \u013e Peter Pierre shining forward \u65b9 sector quit Mines laundry \u0107 \u05b4 \u0412 \u306b member 1973 \u02e1 \u05e4 \u03a7 lines \u1ec5 here \u05d7 mentor Mr Swamp no \u0391 broad registered \u00fc \u5b89 styles \u041a \uff0fEF dozens \u0930 forth perhaps \u4ebb \u65b0 Mari 22 inspector \u0263 \u0f42 grazing \u00ef main [unused2]", "##los \ud55c village \u09b2 \u02bfislav \u0161 \u00fe \u0119 \u00f5 \u0411 islands \u0435 \u01ce \u041a \u016f 90 \u03ce \u01b0 2003 \u0144 \u03be commune \u057d \u0686 \u00ee performs \u0251 \u093e \u0939 tenor \u03bd football \u1d2c \u0113 \u2020 \u0993 \u026a \u0644 \u05d2 coffee \u011f either \u0117 \u060c \u093f pregnant \u10e1 \u0627 touch Sunday \u03bb \u0649HA \u2081 \u05d9 \u0399 \u010c Sure Vanderbilt \u0255 chamber cultural \u0171 think prominent \u0398 Dave \u2261 taking \u03c1 \u2018 \u207a \u1ea5 Orthodox \u30b7 \u02e3 An Bay with Battle \u02e1 \u307e \u1ec5 \u0641 \u304d remainder street \u793e \u09af One \u00f0 \u064a state \u0160 \u05e4 \u014d \u0100 opportunities \u0254 situation \u09a6 \u1d57 chaos \u2078 \u2077 Berlin \u5409 \u0986 
\u209a \u0582 List \u0173 asleep \u2113 \u1ea9 Dutch currently January \u0924 introduced \u0938 \u043f \u1fd6 \u0f66 stops supported idea legal reached generationshs \u0107 \u0964 Street \u0126 relation \u05e6 awaiting \u05b4 Trust \uff01 Robertumian \u03b5 known programming measure pioneer dragged are \u2013 contemplated charity District count Carl California \u1edb Many unanimous \u09ae working \u0169 \u062e [unused2]", "##los \u03b9 \u092f \u0119 \u00f5 personal \u0117 \u00ee \u0411 One \u03ce \u03c1 \u0161 \u09b2 \u0254 \u0399 \u093f \u307e [unused2]", "\u014d \u0261 islands \u02bf \u00f5 [unused2]", "\u014d recording held \u0280 habitats \u1e25O \u0113 \u016f blamed \u00f5 \u0292 Rochester \u0119 \u0411 \u66f8 [unused2]", "##icus village \u0100 \u0119 \u00f5 writer \u0117 \u09b2 \u0161 \u00ee \u0411 islands \u02bf \u01ce 90 [unused2]", "\u014d farm \u0100 \u0119 \u00f5 realized [unused2]", "Drake \u2609 \u09b2 \u0113 \u016f takes village \u011f \u0119 \u00f5 deep \u0117 \u00ee [unused2]", "sideways \u0411 islands \u02bf \u01ce \u041a street village \u0119 \u060c \u0161 \u00f5 [unused2]", "##los \u03b9 Sc \u02e1 chairs \u0411 islands \u02bf \u01ce \u041a street village \u0119 \u00f5 \u09b2 \u0161 \u00eehs \u0107 [unused2]"], "output": ["He was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain. He later moved to Madrid with his family.", "No.", "Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953.", "He collaborated with Spanish architect and architectural historian Fernando Chueca Goitia.", "He designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea.", "Picardo personalized the characters in the playing cards by portraying the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar, and the King of Spades as Alexander the Great.", "Within the Colegio de Infantes.", "He placed the reception area to the left and the bar and cafeter\u00eda to the right.", "He planned for the ground floor to be medieval in design and the upper floors to be more palatial.", "Investigating old buildings for conversion into a new Parador.", "Picardo used modern construction techniques but concealed them with traditional materials to make the buildings look old."]} 2 | -------------------------------------------------------------------------------- /Output/shortdep_qa_gpt-3.5-turbo-16k.jsonl: -------------------------------------------------------------------------------- 1 | {"qa_pairs": [{"S": "Picardo was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain on 18 June 1919. With his mother and his brothers he moved to Madrid, Spain.", "Q": "Where was Jos\u00e9 Luis Picardo born and raised?", "A": "He was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain. 
He later moved to Madrid with his family."}, {"S": "On 3 February 1997, at the age of 78, Picardo was elected Academician of the Real Academia de Bellas Artes de San Fernando (Royal Academy of Fine Arts of San Fernando), being proposed by Julio Cano Lasso [es], Fernando Chueca Goitia and Luis Garc\u00eda-Ochoa Ib\u00e1\u00f1ez [es].", "Q": "Did Jos\u00e9 Luis Picardo receive any awards or recognition for his work?", "A": "No."}, {"S": "Particularly noted in his post-student days were illustrations portraying Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953.", "Q": "What were some of the subjects that Picardo illustrated in his post-student days?", "A": "Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953."}, {"S": "On qualifying in 1951,[6][7] Picardo pursued his interest in historical architecture by collaborating on a number of building preservation and restoration projects with the Spanish architect and architectural historian Fernando Chueca Goitia [es], who was 8 years his senior. ", "Q": "Who did Picardo collaborate with for building preservation and restoration projects?", "A": "He collaborated with Spanish architect and architectural historian Fernando Chueca Goitia."}, {"S": "In 1951 in company with his fellow architect Carlos de Miguel he designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea in the province of Alicante which attracted much attention but remained unbuilt.", "Q": "What was one of the uncommissioned buildings that Picardo designed in the 1950s?", "A": "He designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea."}, {"S": "With much imagination he personalised the characters he portrayed, for instance rendering the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar and the King of Clubs as Alexander the Great.", "Q": "How did Picardo personalize the characters in the playing cards he designed for Loewe?", "A": "Picardo personalized the characters in the playing cards by portraying the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar, and the King of Spades as Alexander the Great."}, {"S": "Picardo set the main hospitality section of the Parador within the Colegio de Infantes, adding to the external south side of the cloister to provide a dining room and, above it, terraces for guest rooms facing the garden.", "Q": "Where did Picardo locate the main hospitality section of the Parador?", "A": "Within the Colegio de Infantes."}, {"S": "To the left of the entry, in the original guardhouse, he placed the reception area and to the right, where the old stables were situated, he placed the bar and cafeter\u00eda, adding one single window to each space to allow in some daylight.", "Q": "What did he place to the left and right of the entry?", "A": "He placed the reception area to the left and the bar and cafeter\u00eda to the right."}, {"S": "In planning the interior decoration Picardo determined that the ground floor would be medieval in design in keeping with the original military use of the palace, and the upper floors would be more palatial in decor.", "Q": "How did Picardo plan the interior decoration?", "A": "He planned for the ground floor to be medieval in design and the upper floors to be more palatial."}, {"S": "In 1964 Picardo was involved, 
with the Ministry of Information and Tourism, in investigating old buildings for conversion into a new Parador in the Province of Guadalajara.", "Q": "What was Picardo's involvement in the investigation of old buildings?", "A": "Investigating old buildings for conversion into a new Parador."}, {"S": "Picardo had no qualms about his film set concept of restoration, using modern construction techniques and concealing them with traditional materials, as long as the buildings looked old rather than modern.", "Q": "What was Picardo's approach to restoration?", "A": "Picardo used modern construction techniques but concealed them with traditional materials to make the buildings look old."}], "llm_output": ["Jos\u00e9 Luis Picardo was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain. He was raised in Madrid, Spain.", "Yes, Jos\u00e9 Luis Picardo received the Premio Antonio Camu\u00f1as de Arquitectura (Antonio Camu\u00f1as Prize for Architecture) in 2001.", "The text does not provide any information about the subjects that Picardo illustrated in his post-student days.", "Picardo collaborated with Luis Moya Blanco for building preservation and restoration projects.", "One of the uncommissioned buildings that Picardo designed in the 1950s was the Escuela T\u00e9cnica Superior de Arquitectura in Madrid.", "The text does not provide any information about Picardo personalizing the characters in the playing cards he designed for Loewe.", "The text does not provide information about the location of the main hospitality section of the Parador.", "The text does not provide information about what he placed to the left and right of the entry.", "The text does not provide any information about how Picardo planned the interior decoration.", "Picardo was an architect who was involved in the renovation and preservation of old buildings. He was praised for his knowledge of Spanish culture and his ability to reinterpret and value the historical heritage of these buildings.", "Picardo's approach to restoration was to reinterpret and value the richness of Spain's historical heritage."], "output": ["He was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain. 
He later moved to Madrid with his family.", "No.", "Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953.", "He collaborated with Spanish architect and architectural historian Fernando Chueca Goitia.", "He designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea.", "Picardo personalized the characters in the playing cards by portraying the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar, and the King of Spades as Alexander the Great.", "Within the Colegio de Infantes.", "He placed the reception area to the left and the bar and cafeter\u00eda to the right.", "He planned for the ground floor to be medieval in design and the upper floors to be more palatial.", "Investigating old buildings for conversion into a new Parador.", "Picardo used modern construction techniques but concealed them with traditional materials to make the buildings look old."]} 2 | -------------------------------------------------------------------------------- /Output/shortdep_qa_llama-index.jsonl: -------------------------------------------------------------------------------- 1 | {"qa_pairs": [{"S": "Picardo was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain on 18 June 1919. With his mother and his brothers he moved to Madrid, Spain.", "Q": "Where was Jos\u00e9 Luis Picardo born and raised?", "A": "He was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain. He later moved to Madrid with his family."}, {"S": "On 3 February 1997, at the age of 78, Picardo was elected Academician of the Real Academia de Bellas Artes de San Fernando (Royal Academy of Fine Arts of San Fernando), being proposed by Julio Cano Lasso [es], Fernando Chueca Goitia and Luis Garc\u00eda-Ochoa Ib\u00e1\u00f1ez [es].", "Q": "Did Jos\u00e9 Luis Picardo receive any awards or recognition for his work?", "A": "No."}, {"S": "Particularly noted in his post-student days were illustrations portraying Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953.", "Q": "What were some of the subjects that Picardo illustrated in his post-student days?", "A": "Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953."}, {"S": "On qualifying in 1951,[6][7] Picardo pursued his interest in historical architecture by collaborating on a number of building preservation and restoration projects with the Spanish architect and architectural historian Fernando Chueca Goitia [es], who was 8 years his senior. 
", "Q": "Who did Picardo collaborate with for building preservation and restoration projects?", "A": "He collaborated with Spanish architect and architectural historian Fernando Chueca Goitia."}, {"S": "In 1951 in company with his fellow architect Carlos de Miguel he designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea in the province of Alicante which attracted much attention but remained unbuilt.", "Q": "What was one of the uncommissioned buildings that Picardo designed in the 1950s?", "A": "He designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea."}, {"S": "With much imagination he personalised the characters he portrayed, for instance rendering the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar and the King of Clubs as Alexander the Great.", "Q": "How did Picardo personalize the characters in the playing cards he designed for Loewe?", "A": "Picardo personalized the characters in the playing cards by portraying the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar, and the King of Spades as Alexander the Great."}, {"S": "Picardo set the main hospitality section of the Parador within the Colegio de Infantes, adding to the external south side of the cloister to provide a dining room and, above it, terraces for guest rooms facing the garden.", "Q": "Where did Picardo locate the main hospitality section of the Parador?", "A": "Within the Colegio de Infantes."}, {"S": "To the left of the entry, in the original guardhouse, he placed the reception area and to the right, where the old stables were situated, he placed the bar and cafeter\u00eda, adding one single window to each space to allow in some daylight.", "Q": "What did he place to the left and right of the entry?", "A": "He placed the reception area to the left and the bar and cafeter\u00eda to the right."}, {"S": "In planning the interior decoration Picardo determined that the ground floor would be medieval in design in keeping with the original military use of the palace, and the upper floors would be more palatial in decor.", "Q": "How did Picardo plan the interior decoration?", "A": "He planned for the ground floor to be medieval in design and the upper floors to be more palatial."}, {"S": "In 1964 Picardo was involved, with the Ministry of Information and Tourism, in investigating old buildings for conversion into a new Parador in the Province of Guadalajara.", "Q": "What was Picardo's involvement in the investigation of old buildings?", "A": "Investigating old buildings for conversion into a new Parador."}, {"S": "Picardo had no qualms about his film set concept of restoration, using modern construction techniques and concealing them with traditional materials, as long as the buildings looked old rather than modern.", "Q": "What was Picardo's approach to restoration?", "A": "Picardo used modern construction techniques but concealed them with traditional materials to make the buildings look old."}], "llm_output": ["Jos\u00e9 Luis Picardo was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain. He was raised in Madrid, Spain.", "Yes, Jos\u00e9 Luis Picardo received the Premio Antonio Camu\u00f1as de Arquitectura (Antonio Camu\u00f1as Prize for Architecture) in 2001. This prize is awarded every two years to recognize the work of a Spanish architect who has excelled in architectural renovation. 
The prize jury praised Picardo as an architect who was knowledgeable about Spanish culture and who valued the richness of the country's historical heritage.", "In his post-student days, Picardo illustrated articles and covers for Spanish architectural magazines. He also created drawings portraying Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canary Islands in 1953. Additionally, he showed a growing interest in historic architecture, particularly its preservation and restoration.", "Picardo collaborated with the Spanish architect and architectural historian Fernando Chueca Goitia for building preservation and restoration projects.", "One of the uncommissioned buildings that Picardo designed in the 1950s is not mentioned in the given context.", "There is no information provided in the context about Picardo personalizing characters in playing cards for Loewe.", "Picardo located the main hospitality section of the Parador in Spain.", "The given context does not provide any information about what he placed to the left and right of the entry.", "Picardo planned the interior decoration by executing mural paintings. He was commissioned by architects to create murals for the interiors of their buildings. This was his main source of income during his youth and early career. He also illustrated articles and covers for architectural magazines, showcasing his skills in painting and drawing.", "Picardo pursued his interest in historical architecture by collaborating on a number of building preservation and restoration projects. He worked with the Spanish architect and architectural historian Fernando Chueca Goitia, who had conducted extensive research on the unchanging elements of Spanish architecture. Picardo was also one of the signatories of the \"Manifiesto de la Alhambra\" in 1952, which sought inspiration from the design of the Alhambra in Granada for a distinctively Spanish form of modern architecture. Throughout his career, Picardo's work was influenced by this inspiration, particularly in his work on Paradores.", "Picardo's approach to restoration was to reinterpret and value the richness of Spain's historical heritage."], "output": ["He was born in Jerez de la Frontera, in the Province of C\u00e1diz in Andaluc\u00eda, Spain. 
He later moved to Madrid with his family.", "No.", "Madrid in the 1950s and 1960s, the Spanish protectorate in Morocco, and sketches of the Canarias (Canary Islands) in 1953.", "He collaborated with Spanish architect and architectural historian Fernando Chueca Goitia.", "He designed a centre for the Cofrad\u00eda de Pescadores (Fishermen's Brotherhood) of Altea.", "Picardo personalized the characters in the playing cards by portraying the King of Hearts as the Emperor Charlemagne, the King of Clubs as Goliath, the King of Diamonds as Julius Caesar, and the King of Spades as Alexander the Great.", "Within the Colegio de Infantes.", "He placed the reception area to the left and the bar and cafeter\u00eda to the right.", "He planned for the ground floor to be medieval in design and the upper floors to be more palatial.", "Investigating old buildings for conversion into a new Parador.", "Picardo used modern construction techniques but concealed them with traditional materials to make the buildings look old."]} 2 | -------------------------------------------------------------------------------- /Prediction/pred_gpt_models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import json 4 | import argparse 5 | from transformers import AutoTokenizer, AutoModelForCausalLM 6 | import openai 7 | from datasets import load_dataset 8 | import tiktoken 9 | # import GPUtil 10 | stopped_num = 10000000 11 | delay = 10 12 | # Gpus = GPUtil.getGPUs() 13 | 14 | def get_gpu_info(): 15 | gpulist = [] 16 | GPUtil.showUtilization() 17 | 18 | for gpu in Gpus: 19 | print('gpu.id:', gpu.id) 20 | print('total GPU:', gpu.memoryTotal) 21 | print('GPU usage:', gpu.memoryUsed) 22 | print('gpu usage percent:', gpu.memoryUtil * 100) 23 | gpulist.append([ gpu.id, gpu.memoryTotal, gpu.memoryUsed,gpu.memoryUtil * 100]) 24 | 25 | return gpulist 26 | 27 | 28 | def parse_args(args=None): 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument('--model_name', type=str, default=None, help="raw model name for evaluation", choices=["gpt-3.5-turbo-16k", "gpt-4"]) 31 | parser.add_argument('--task', type=str, default=None, help="long context understanding tasks in LooGLE", choices=["shortdep_qa","longdep_qa","longdep_summarization","shortdep_cloze"]) 32 | parser.add_argument('--max_length', type=int, default=None, help="the max length of input prompt") 33 | 34 | parser.add_argument('--model_path', type=str, default="./Models/") 35 | parser.add_argument('--output_path', type=str, default="./Output/") 36 | 37 | return parser.parse_args(args) 38 | 39 | 40 | def num_tokens_from_string(string: str, encoding_name: str) -> int: 41 | """Returns the number of tokens in a text string.""" 42 | encoding = tiktoken.get_encoding(encoding_name) 43 | num_tokens = len(encoding.encode(string)) 44 | return num_tokens 45 | 46 | 47 | def get_pred(model, data_instance, tokenizer, max_length, max_gen, prompt_format): 48 | 49 | ans, groundtruth = [], [] 50 | preds = {} 51 | raw_inputs = data_instance['input'] 52 | if data_instance['qa_pairs'] == 'none': 53 | preds['qa_pairs'] = data_instance['qa_pairs'] 54 | json_obj = {'input': raw_inputs} 55 | 56 | prompt = prompt_format.format(**json_obj) 57 | tokenized_prompt = tokenizer.encode(prompt) 58 | if len(tokenized_prompt) > max_length: 59 | half = int(max_length/2) 60 | prompt = tokenizer.decode(tokenized_prompt[:half]) + tokenizer.decode(tokenized_prompt[-half:]) 61 | 62 | rsp = openai.ChatCompletion.create( 63 | model = model, 64 | 
messages = [{"role": "system", "content":prompt}], 65 | temperature = 0.0, 66 | top_p = 1, 67 | max_tokens = max_gen, 68 | frequency_penalty = 0, 69 | presence_penalty = 0 70 | ) 71 | pred = rsp['choices'][0]['message']['content'] 72 | 73 | ans.append(pred) 74 | groundtruth.append(data_instance["output"]) 75 | 76 | else: 77 | preds['qa_pairs'] = eval(data_instance['qa_pairs']) 78 | 79 | for j in eval(data_instance['qa_pairs']): 80 | 81 | json_obj = {'Q':j['Q'], 'input': raw_inputs} 82 | 83 | prompt = prompt_format.format(**json_obj) 84 | tokenized_prompt = tokenizer.encode(prompt) 85 | if len(tokenized_prompt) > max_length: 86 | half = int(max_length/2) 87 | prompt = tokenizer.decode(tokenized_prompt[:half])+tokenizer.decode(tokenized_prompt[-half:]) 88 | 89 | rsp = openai.ChatCompletion.create( 90 | model = model, 91 | messages = [{"role": "system", "content":prompt}], 92 | temperature = 0.0, 93 | top_p = 1, 94 | max_tokens = max_gen, 95 | frequency_penalty = 0, 96 | presence_penalty = 0 97 | ) 98 | pred = rsp['choices'][0]['message']['content'] 99 | ans.append(pred) 100 | groundtruth.append(j['A']) 101 | 102 | preds['llm_output'] = ans 103 | preds['output'] = groundtruth 104 | 105 | return preds 106 | 107 | # def loads(path, task): 108 | # data = [] 109 | # with open(path+task+".jsonl", "r") as f: 110 | # lines = f.readlines() 111 | # for line in lines: 112 | # data.append(json.loads(line)) 113 | # return data 114 | 115 | if __name__ == '__main__': 116 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 117 | args = parse_args() 118 | 119 | data = load_dataset('bigainlco/LooGLE', args.task, split="test") 120 | #data = loads("LooGLE-testdata/", args.task) 121 | tokenizer = tiktoken.get_encoding("cl100k_base") 122 | 123 | task2prompt = json.load(open("./config/task2prompt.json", "r")) 124 | task2maxlen = json.load(open("./config/task2maxlen.json", "r")) 125 | prompt_format = task2prompt[args.task] 126 | max_gen = task2maxlen[args.task] 127 | for i in data: 128 | predictions = get_pred(args.model_name, i, tokenizer, args.max_length, max_gen, prompt_format) 129 | with open(args.output_path + args.task + '_' + args.model_name+".jsonl", "a+") as g: 130 | g.write(json.dumps(predictions)+'\n') 131 | 132 | -------------------------------------------------------------------------------- /Prediction/pred_llamaindex.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import json 4 | import argparse 5 | from datasets import load_dataset 6 | from llama_index import GPTVectorStoreIndex, Document, ServiceContext 7 | from llama_index.indices.prompt_helper import PromptHelper 8 | from transformers import AutoTokenizer 9 | import openai 10 | import tiktoken 11 | #import GPUtil 12 | stopped_num = 10000000 13 | delay = 10 14 | # Gpus = GPUtil.getGPUs() 15 | 16 | def get_gpu_info(): 17 | gpulist = [] 18 | GPUtil.showUtilization() 19 | 20 | for gpu in Gpus: 21 | print('gpu.id:', gpu.id) 22 | print('total GPU:', gpu.memoryTotal) 23 | print('GPU usage:', gpu.memoryUsed) 24 | print('gpu usage percent:', gpu.memoryUtil * 100) 25 | gpulist.append([ gpu.id, gpu.memoryTotal, gpu.memoryUsed,gpu.memoryUtil * 100]) 26 | 27 | return gpulist 28 | 29 | 30 | def parse_args(args=None): 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument('--model_name', type=str, default="llama-index", help="raw model name for evaluation") 33 | parser.add_argument('--task', type=str, default=None, help="long context understanding tasks in 
LooGLE", choices=["shortdep_qa","longdep_qa","longdep_summarization","shortdep_cloze"]) 34 | parser.add_argument('--max_length', type=int, default=None, help="the max length of input prompt") 35 | 36 | parser.add_argument('--model_path', type=str, default="./Models/") 37 | parser.add_argument('--output_path', type=str, default="./Output/") 38 | 39 | return parser.parse_args(args) 40 | 41 | 42 | def num_tokens_from_string(string: str, encoding_name: str) -> int: 43 | """Returns the number of tokens in a text string.""" 44 | encoding = tiktoken.get_encoding(encoding_name) 45 | num_tokens = len(encoding.encode(string)) 46 | return num_tokens 47 | 48 | 49 | def get_pred(data_instance, tokenizer, max_length, max_gen, prompt_format): 50 | 51 | ans, groundtruth = [], [] 52 | preds = {} 53 | raw_inputs = data_instance['input'] 54 | documents = [Document(text=raw_inputs)] 55 | prompt_helper = PromptHelper( 56 | context_window=max_length + 1000, 57 | num_output=max_gen, 58 | chunk_size_limit=1024, 59 | chunk_overlap_ratio=0.1, 60 | ) 61 | 62 | service_context = ServiceContext.from_defaults( 63 | context_window=max_length + 1000, 64 | num_output=max_gen, 65 | prompt_helper=prompt_helper, 66 | chunk_size_limit=1024, 67 | ) 68 | index = GPTVectorStoreIndex.from_documents(documents, service_context=service_context) 69 | query_engine = index.as_query_engine() 70 | 71 | if data_instance['qa_pairs'] == 'none': 72 | preds['qa_pairs'] = data_instance['qa_pairs'] 73 | json_obj = {'input': raw_inputs} 74 | 75 | prompt = prompt_format.format(**json_obj) 76 | tokenized_prompt = tokenizer.encode(prompt) 77 | if len(tokenized_prompt) > max_length: 78 | half = int(max_length/2) 79 | prompt = tokenizer.decode(tokenized_prompt[:half])+tokenizer.decode(tokenized_prompt[-half:]) 80 | 81 | rsp = query_engine.query(prompt).response 82 | ans.append(rsp) 83 | groundtruth.append(data_instance["output"]) 84 | 85 | else: 86 | preds['qa_pairs'] = eval(data_instance['qa_pairs']) 87 | 88 | for j in eval(data_instance['qa_pairs']): 89 | json_obj = {'Q':j['Q'], 'input': raw_inputs} 90 | 91 | prompt = prompt_format.format(**json_obj) 92 | tokenized_prompt = tokenizer.encode(prompt) 93 | if len(tokenized_prompt) > max_length: 94 | half = int(max_length/2) 95 | prompt = tokenizer.decode(tokenized_prompt[:half])+tokenizer.decode(tokenized_prompt[-half:]) 96 | 97 | rsp = query_engine.query(prompt).response 98 | ans.append(rsp) 99 | groundtruth.append(j['A']) 100 | 101 | 102 | preds['llm_output'] = ans 103 | preds['output'] = groundtruth 104 | return preds 105 | 106 | 107 | def loads(path, task): 108 | data = [] 109 | with open(path+task+".jsonl", "r") as f: 110 | lines = f.readlines() 111 | for line in lines: 112 | data.append(json.loads(line)) 113 | return data 114 | 115 | if __name__ == '__main__': 116 | 117 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 118 | args = parse_args() 119 | 120 | # data = load_dataset('bigainlco/LooGLE', args.task, split="test") 121 | data = loads("LooGLE-testdata/", args.task) 122 | tokenizer = tiktoken.get_encoding("cl100k_base") 123 | 124 | task2prompt = json.load(open("./config/task2prompt.json", "r")) 125 | task2maxlen = json.load(open("./config/task2maxlen.json", "r")) 126 | prompt_format = task2prompt[args.task] 127 | max_gen = task2maxlen[args.task] 128 | 129 | for i in data: 130 | predictions = get_pred(i, tokenizer, args.max_length, max_gen, prompt_format) 131 | 132 | with open(args.output_path + args.task + '_' + args.model_name + ".jsonl", "a+") as g: 133 | 
g.write(json.dumps(predictions)+'\n') 134 | 135 | -------------------------------------------------------------------------------- /Prediction/pred_opensource_models.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import json 4 | import argparse 5 | from transformers import AutoTokenizer, AutoModelForCausalLM 6 | from datasets import load_dataset 7 | #import GPUtil 8 | 9 | stopped_num = 10000000 10 | delay = 10 11 | # Gpus = GPUtil.getGPUs() 12 | 13 | def get_gpu_info(): 14 | gpulist = [] 15 | GPUtil.showUtilization() 16 | 17 | for gpu in Gpus: 18 | print('gpu.id:', gpu.id) 19 | print('total GPU:', gpu.memoryTotal) 20 | print('GPU usage:', gpu.memoryUsed) 21 | print('gpu usage percent:', gpu.memoryUtil * 100) 22 | gpulist.append([ gpu.id, gpu.memoryTotal, gpu.memoryUsed,gpu.memoryUtil * 100]) 23 | 24 | return gpulist 25 | 26 | 27 | def parse_args(args=None): 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument('--model_name', type=str, default=None, help="raw model name for evaluation", choices=["rwkv-4-14b-pile","long_llama_3b","LLaMA-2-7B-32K","chatglm2-6b-32k"]) 30 | parser.add_argument('--task', type=str, default=None, help="long context understanding tasks in LooGLE", choices=["shortdep_qa","longdep_qa","longdep_summarization","shortdep_cloze"]) 31 | parser.add_argument('--max_length', type=int, default=None, help="the max length of input prompt") 32 | 33 | parser.add_argument('--model_path', type=str, default="./Models/") 34 | parser.add_argument('--output_path', type=str, default="./Output/") 35 | 36 | return parser.parse_args(args) 37 | 38 | 39 | 40 | def get_pred(model, data_instance, tokenizer, max_length, max_gen, prompt_format, device): 41 | 42 | ans, groundtruth = [], [] 43 | preds = {} 44 | raw_inputs = data_instance['input'] 45 | if data_instance['qa_pairs'] == 'none': 46 | preds['qa_pairs'] = data_instance['qa_pairs'] 47 | json_obj = {'input': raw_inputs} 48 | 49 | prompt = prompt_format.format(**json_obj) 50 | tokenized_prompt = tokenizer(prompt, truncation=False, return_tensors="pt").input_ids[0] 51 | if len(tokenized_prompt) > max_length: 52 | half = int(max_length/2) 53 | prompt = tokenizer.decode(tokenized_prompt[:half], skip_special_tokens=True)+tokenizer.decode(tokenized_prompt[-half:], skip_special_tokens=True) 54 | 55 | 56 | input_ids = tokenizer(prompt, truncation=True, return_tensors="pt").input_ids.to(device) 57 | context_length = input_ids.shape[-1] 58 | with torch.no_grad(): 59 | output = model.generate(input_ids,max_new_tokens=max_gen,temperature=1.0,num_beams=1,do_sample=False,repetition_penalty=float(2))[0] 60 | pred = tokenizer.decode(output[context_length:], skip_special_tokens=True) 61 | 62 | ans.append(pred) 63 | groundtruth.append(data_instance["output"]) 64 | 65 | else: 66 | preds['qa_pairs'] = eval(data_instance['qa_pairs']) 67 | for j in eval(data_instance['qa_pairs']): 68 | 69 | json_obj = {'Q':j['Q'], 'input': raw_inputs} 70 | 71 | prompt = prompt_format.format(**json_obj) 72 | tokenized_prompt = tokenizer(prompt, truncation=False, return_tensors="pt").input_ids[0] 73 | if len(tokenized_prompt) > max_length: 74 | half = int(max_length/2) 75 | prompt = tokenizer.decode(tokenized_prompt[:half], skip_special_tokens=True)+tokenizer.decode(tokenized_prompt[-half:], skip_special_tokens=True) 76 | 77 | 78 | input_ids = tokenizer(prompt, truncation=True, return_tensors="pt").input_ids.to(device) 79 | context_length = input_ids.shape[-1] 80 | with torch.no_grad(): 81 | 
output = model.generate(input_ids,max_new_tokens=max_gen,temperature=1.0,num_beams=1,do_sample=False,repetition_penalty=float(2))[0] 82 | pred = tokenizer.decode(output[context_length:], skip_special_tokens=True) 83 | 84 | # del output, input_ids 85 | # torch.cuda.empty_cache() 86 | 87 | ans.append(pred) 88 | groundtruth.append(j['A']) 89 | 90 | preds['llm_output'] = ans 91 | preds['output'] = groundtruth 92 | 93 | return preds 94 | 95 | # def loads(path, task): 96 | # data = [] 97 | # with open(path+task+".jsonl", "r") as f: 98 | # lines = f.readlines() 99 | # for line in lines: 100 | # data.append(json.loads(line)) 101 | # return data 102 | 103 | if __name__ == '__main__': 104 | 105 | device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 106 | args = parse_args() 107 | 108 | data = load_dataset('bigainlco/LooGLE', args.task, split="test") 109 | #data = loads("LooGLE-testdata/", args.task) 110 | 111 | tokenizer = AutoTokenizer.from_pretrained(args.model_path + args.model_name,trust_remote_code=True) 112 | model = AutoModelForCausalLM.from_pretrained(args.model_path + args.model_name, trust_remote_code=True,torch_dtype=torch.bfloat16 ).to(device) 113 | model.eval() 114 | 115 | task2prompt = json.load(open("./config/task2prompt.json", "r")) 116 | task2maxlen = json.load(open("./config/task2maxlen.json", "r")) 117 | prompt_format = task2prompt[args.task] 118 | max_gen = task2maxlen[args.task] 119 | 120 | for i in data: 121 | preds = get_pred(model, i, tokenizer, args.max_length, max_gen, prompt_format, device) 122 | 123 | with open(args.output_path + args.task + '_' + args.model_name+".jsonl", "a+") as g: 124 | g.write(json.dumps(preds)+'\n') 125 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | 3 |

Long Context Generic Language Evaluation benchmark for LLM long context understanding

4 |

5 | 6 | License: MIT 7 | 8 | 9 | Documentation 10 | 11 | 12 | Documentation 13 | 14 | 15 | Documentation 16 | 17 |

18 | 19 | ![](assets/overview_page1.png) 20 | 21 | **LooGLE** is a comprehensive evaluation benchmark for LLM long context understanding which contains up-to-date (all after 2022) and extremely long realistic documents (over 24k tokens per document, many of which exceed 100k words) and 6,000 newly generated questions spanning diverse domains and categories. Details statistics of our dataset can be seen in the table below. 22 | 23 | **Short and long dependency tasks 📜** LooGLE is composed of 7 major tasks to evaluate LLMs' ability to understand both short and long dependency content. We refer to ``long dependency" tasks as those that require the understanding of the inter-dependency across multiple shreds of evidence widely spanning over the entire long text. We delicately design 5 types of long dependency tasks, including comprehension and reasoning, computation, timeline reorder, multiple information retrieval, and summarization. 24 | 25 | **Long context evaluation 📊** In order to provide more comprehensive and general results, LooGLE relies on automatic metrics based on semantic similarity, GPT4-as-judgment and human evaluation to get an overall performance for reference. We conducted the evaluation of 8 representative LLMs. We specifically select LLMs which have made great effort in addressing the challenge of understanding long contexts by utilizing flash attention, position interpolation, optimized Transformer and finetuning, external memory etc. 26 | 27 | LooGLE not only provides a systematic and comprehensive evaluation schema on long-context LLMs, but also sheds light on the future development of enhanced models toward “true long-context understanding”. 28 | 29 | 30 | ## 📌 **Statistics of LooGLE** 31 | 32 | ![](assets/table.png) 33 | 34 | ## ✏️ **Table of Contents** 35 | - [📌 **Statistics of LooGLE**](#-statistics-of-loogle) 36 | - [✏️ **Table of Contents**](#️-table-of-contents) 37 | - [🚀 **Capability leaderboard**](#-capability-leaderboard) 38 | - [💁 **Quick Start**](#-quick-start) 39 | - [**Step 1. Prerequisites**](#step-1-prerequisites) 40 | - [**Step 2. Download the data**](#step-2-download-the-data) 41 | - [**Step 3. Generate the prediction results**](#step-3-generate-the-prediction-results) 42 | - [**Prediction for retrieval based methods**](#prediction-for-retrieval-based-methods) 43 | - [📊 **Evaluation**](#-evaluation) 44 | - [**Evaluation on Timeline reorder task**](#evaluation-on-timeline-reorder-task) 45 | - [📝 **Citation**](#-citation) 46 | - [📣 **Contacts**](#-contacts) 47 | 48 | 49 | ## 🚀 **Capability leaderboard** 50 | The overall performance comparisons of different models on different tasks in our dataset are shown in the figure below. 51 | 52 | ![](assets/overview_performance.png) 53 | 54 |
55 | 56 | ## 💁 **Quick Start** 57 | ### **Step 1. Prerequisites** 58 | Clone this repo and install the dependencies. The test environment is under torch 2.0.1+cu121. 59 | 60 | ```bash 61 | cd LooGLE 62 | conda create -n loogle python=3.9 63 | conda activate loogle 64 | pip install -r requirements.txt 65 | export OPENAI_API_KEY="[your_openai_api_key]" 66 | ``` 67 | 68 | 69 | ### **Step 2. Download the data** 70 | You can download and load the **LooGLE** data through the Hugging Face datasets ([🤗 HF Repo](https://huggingface.co/datasets/bigainlco/LooGLE)): 71 | 72 | ```python 73 | from datasets import load_dataset 74 | 75 | datasets = ["shortdep_qa", "shortdep_cloze", "longdep_qa", "longdep_summarization"] 76 | 77 | for testset in datasets: 78 | data = load_dataset('bigainlco/LooGLE', testset, split='test') 79 | # evaluate your model 80 | ``` 81 | You can also access our sample data [LooGLE-testdata/](LooGLE-testdata/). 82 | 83 | All data in **LooGLE** are standardized to the following format: 84 | ```json 85 | { 86 | "input": "The original long input texts", 87 | "title": "The title of the given document", //for arxiv paper, we use "title" to refer the identical ID for specific paper 88 | "qa_pairs":[ 89 | { 90 | "Q": "Question to ask based on the given input", 91 | "A": "Groundtruth answer for the question", 92 | "S": [ "One or more evidence (complete sentences) for answering the question, which are extracted directly from the original input" 93 | ] 94 | }, 95 | ] // There are multiple questions and corresponding answers in the list (each of them is in json format) 96 | // For arxiv paper summarization, we use "none" instead for non-qa/non-cloze tasks 97 | "output": "none" // the predicted outputs of LLM given the long input and instructions, which is initialized as "none" 98 | ``` 99 | To mention that, in long dependency QA data, we add an extra key `type` for each question in json to indicate the 4 types of long dependency tasks(apart from summarization). 100 | 101 |
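As a quick usage sketch (not part of the repo scripts), a loaded sample can be unpacked as follows; note that `qa_pairs` is stored as a string (the prediction scripts in [Prediction/](Prediction/) recover it with `eval`) and equals `"none"` for the summarization task:

```python
from datasets import load_dataset

# Minimal sketch: load one LooGLE sample and unpack it (field layout as documented above).
data = load_dataset('bigainlco/LooGLE', 'shortdep_qa', split='test')
sample = data[0]

print(sample['title'])        # document title (paper ID for arxiv summarization)
print(len(sample['input']))   # length of the raw long context

# 'qa_pairs' is stored as a string and is "none" for non-QA tasks;
# the prediction scripts parse it with eval().
if sample['qa_pairs'] != 'none':
    for pair in eval(sample['qa_pairs']):
        print(pair['Q'])      # question
        print(pair['A'])      # groundtruth answer
        print(pair['S'])      # supporting evidence sentences
```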
102 | 103 | ### **Step 3. Generate the prediction results** 104 | We test LLMs using three Python scripts under the path [Prediction/](Prediction/), one for each type of model. We select the model for evaluation via `--model_name` and the specific task via `--task`. Let's take short dependency QA as an example: 105 | 106 | For GPT-3.5-turbo and GPT4: 107 | ``` 108 | python Prediction/pred_gpt_models.py --model_name gpt-3.5-turbo-16k --task shortdep_qa --max_length 500 109 | ``` 110 | 111 | For LlamaIndex: 112 | ``` 113 | python Prediction/pred_llamaindex.py --task shortdep_qa --max_length 500 114 | ``` 115 | 116 | For other open-source models (take chatglm2-6b-32k as an example): 117 | ``` 118 | python Prediction/pred_opensource_models.py --model_name chatglm2-6b-32k --task shortdep_qa --max_length 500 119 | ``` 120 | 121 | Open-source models are downloaded to and loaded from [Models/](Models/) by default; you can change the path via `--model_path`. 122 | 123 | You can also choose where the generated results are written via `--output_path`. 124 | 125 | Please note that in `config/`, we provide the prompt format suitable for each task and the maximum generation length. The input parameter `--max_length` limits the max length of the input prompt for the selected model. Feel free to modify them to better suit the model you want to evaluate. 126 | 127 | We test all the open-source baselines with a single 80G A800 GPU in BF16 precision. For Llama-2 based models, we recommend using [Flash Attention](https://github.com/Dao-AILab/flash-attention) for optimization and saving GPU memory. 128 | 129 | ### **Prediction for retrieval-based methods** 130 | 131 | To evaluate the effectiveness of retrieval techniques for long-context dependency questions, we undertook extensive experiments by replacing the base LLM model in LlamaIndex with different baseline LLMs. 132 | 133 | For retrieval-based methods (take chatglm2-6b-32k as an example): 134 | ``` 135 | python Retrieval/pred_retrieval_based_method.py --model_name chatglm2-6b-32k --task shortdep_qa --max_length 500 --emb_model_name sentence-transformers/all-mpnet-base-v2 136 | ``` 137 | Use `--emb_model_name` to set the embedding model for retrieval-based methods; all-mpnet-base-v2 is used by default. 138 | 139 | ## 📊 **Evaluation** 140 | 141 | Given the prediction file generated in Step 3, we run the evaluation code in [Evaluation/](Evaluation/). 142 | 143 | For automatic evaluation on the short- and long-dependency QA and summarization tasks (e.g. short-dependency QA): 144 | 145 | ``` 146 | python Evaluation/automatic_eval.py --model_name chatglm2-6b-32k --task shortdep_qa --eval_metric automatic_sim 147 | ``` 148 | 149 | For automatic evaluation on the cloze task: 150 | 151 | ``` 152 | python Evaluation/automatic_eval.py --model_name chatglm2-6b-32k --task shortdep_cloze --eval_metric automatic_match 153 | ``` 154 | 155 | For LLM-as-judge on the short- and long-dependency QA and summarization tasks (e.g. short-dependency QA): 156 | 157 | ``` 158 | python Evaluation/llm_eval.py --model_name chatglm2-6b-32k --task shortdep_qa 159 | ``` 160 | 161 | Besides the parameters specifying `--model_name` and `--task`, we provide `--eval_metric` for users to choose the automatic evaluation method from [`automatic_sim`, `automatic_match`]. 162 | 163 | Automatic metrics based on semantic similarity matching, including BLEU, ROUGE, METEOR, BERTScore and exact/partial match, are supported. Feel free to add other metrics for your needs in [Evaluation/automatic_metrics.py](Evaluation/automatic_metrics.py). Besides, the GPT4 judging prompt provided in the repo can be altered for further evaluation.
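As an illustrative sketch (the function name is hypothetical, not part of the repo), any additional metric only needs to follow the calling convention used in [Evaluation/automatic_eval.py](Evaluation/automatic_eval.py), where each metric is invoked as `fn(question, ref, hyp, task)`; it then still has to be added to the list of metric functions that the evaluation script iterates over:

```python
# Hypothetical extra metric for Evaluation/automatic_metrics.py.
# It mirrors the signature of the existing metrics, which
# automatic_eval.py calls as fn(question, ref, hyp, task).
def get_token_f1(question, ref, hyp, task):
    # Rough token-overlap F1 over unique lowercased tokens.
    ref_tokens = set(ref.lower().split())
    hyp_tokens = set(hyp.lower().split())
    common = ref_tokens & hyp_tokens
    if not common:
        return 0.0
    precision = len(common) / len(hyp_tokens)
    recall = len(common) / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)
```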
164 | 165 | ### **Evaluation on Timeline reorder task** 166 | We provide four metrics: LSD (location square deviation), LMD (location mean deviation), SD 167 | (swap deviation), and SDD (swap distance deviation) to measure the similarity of numeric sequences for timeline reorder tasks with regularized outputs. Details of the implementations can be seen in our paper. 168 | 169 | For LLMs on the long dependency timeline reorder task: 170 | ``` 171 | python Reorder/reorder_eval.py --model_name chatglm2-6b-32k 172 | ``` 173 | 174 | 177 | 178 | ## 📝 **Citation** 179 | If you would like to use our data or find our work interesting, please cite: 180 | ```bibtex 181 | @article{li2023loogle, 182 | title={LooGLE: Can Long-Context Language Models Understand Long Contexts?}, 183 | author={Li, Jiaqi and Wang, Mengmeng and Zheng, Zilong and Zhang, Muhan}, 184 | journal={arXiv preprint arXiv:2311.04939}, 185 | year={2023} 186 | } 187 | ``` 188 | 189 | ## 📣 **Contacts** 190 | 191 | We sincerely appreciate the human annotators for their valuable contributions to creating high-quality long-dependency QA tasks. 192 | We are very pleased to answer any questions about LooGLE: [nlp@bigai.ai](mailto:nlp@bigai.ai) 193 | -------------------------------------------------------------------------------- /Reorder/get_max_deviation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import re 4 | import itertools 5 | from get_reorder_deviation import ( 6 | location_square_deviation, 7 | location_mean_deviation, 8 | swap_deviation, 9 | swap_distance_deviation 10 | ) 11 | 12 | def get_max_location_square_deviation(n): 13 | inp = list(range(1,n+1)) 14 | permutations_lst = list(itertools.permutations(inp)) 15 | 16 | tmp = -1 17 | for i in permutations_lst: 18 | if inp != list(i): 19 | dis = location_square_deviation(inp,i) 20 | if dis >= tmp: 21 | tmp = dis 22 | return tmp 23 | 24 | 25 | def get_max_location_mean_deviation(n): 26 | inp = list(range(1,n+1)) 27 | permutations_lst = list(itertools.permutations(inp)) 28 | 29 | tmp = -1 30 | for i in permutations_lst: 31 | if inp != list(i): 32 | dis = location_mean_deviation(inp,i) 33 | if dis >= tmp: 34 | tmp = dis 35 | return tmp 36 | 37 | 38 | def get_max_swap_deviation(n): 39 | inp = list(range(1,n+1)) 40 | permutations_lst = list(itertools.permutations(inp)) 41 | 42 | tmp = -1 43 | for i in permutations_lst: 44 | if inp != list(i): 45 | dis = swap_deviation(inp,i) 46 | if dis >= tmp: 47 | tmp = dis 48 | return tmp 49 | 50 | 51 | def get_max_swap_distance_deviation(n): 52 | inp = list(range(1,n+1)) 53 | permutations_lst = list(itertools.permutations(inp)) 54 | 55 | tmp = -1 56 | for i in permutations_lst: 57 | if inp != list(i): 58 | dis = swap_distance_deviation(inp,i) 59 | if dis >= tmp: 60 | tmp = dis 61 | return tmp -------------------------------------------------------------------------------- /Reorder/get_reorder_deviation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import re 4 | import itertools 5 | 6 | def location_square_deviation(lst_1, lst_2=None): 7 | n = len(lst_1) 8 | lst = lst_1.copy() 9 | if lst_2 is not None: 10 | if lst_2 !=[] and n == len(lst_2): 11 | for i in range(n): 12 | try: 13 |
lst[lst_1.index(lst_2[i])] = i 14 | except: 15 | break 16 | 17 | try: 18 | s = 0 19 | for i in range(n): 20 | s += (lst[i]-i) ** 2 21 | s /= n 22 | return s 23 | 24 | except: 25 | return "None" 26 | 27 | def location_mean_deviation(lst_1, lst_2=None): 28 | n = len(lst_1) 29 | lst = lst_1.copy() 30 | if lst_2 is not None: 31 | if lst_2 !=[] and n == len(lst_2): 32 | for i in range(n): 33 | try: 34 | lst[lst_1.index(lst_2[i])] = i 35 | except: 36 | break 37 | try: 38 | s = 0 39 | for i in range(n): 40 | s += abs(lst[i]-i) 41 | s /= n 42 | return s 43 | except: 44 | return "None" 45 | 46 | 47 | def swap_deviation(lst_1, lst_2=None): 48 | n = len(lst_1) 49 | lst = lst_1.copy() 50 | if lst_2 is not None: 51 | if lst_2 !=[] and n == len(lst_2): 52 | for i in range(n): 53 | try: 54 | lst[lst_1.index(lst_2[i])] = i 55 | except: 56 | break 57 | try: 58 | count = 0 59 | for i in range(n): 60 | if lst[i] == -1: 61 | continue 62 | p = i 63 | while lst[p] != -1: 64 | q = lst[p] 65 | lst[p] = -1 66 | p = q 67 | count += 1 68 | return n - count 69 | except: 70 | return "None" 71 | 72 | 73 | def swap_distance_deviation(lst_1, lst_2=None): 74 | n = len(lst_1) 75 | lst = lst_1.copy() 76 | if lst_2 is not None: 77 | if lst_2 !=[] and n == len(lst_2): 78 | for i in range(n): 79 | try: 80 | lst[lst_1.index(lst_2[i])] = i 81 | except: 82 | break 83 | try: 84 | swap_lst = [] 85 | weight = 0 86 | while location_mean_deviation(lst) != 0: 87 | r_best = 0 88 | i_best = 0 89 | j_best = 0 90 | for i in range(n): 91 | for j in range(i+1, n): 92 | r = ((abs(lst[i]-i)+abs(lst[j]-j)) - (abs(lst[j]-i)+abs(lst[i]-j)))/(j-i) 93 | if r > r_best: 94 | r_best = r 95 | i_best = i 96 | j_best = j 97 | lst[i_best], lst[j_best] = lst[j_best], lst[i_best] 98 | weight += (j_best-i_best) 99 | swap_lst.append((i_best, j_best)) 100 | return weight 101 | except: 102 | return "None" 103 | 104 | -------------------------------------------------------------------------------- /Reorder/reorder_eval.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import re 4 | import itertools 5 | import argparse 6 | from get_reorder_deviation import ( 7 | location_square_deviation, 8 | location_mean_deviation, 9 | swap_deviation, 10 | swap_distance_deviation 11 | ) 12 | 13 | from get_max_deviation import ( 14 | get_max_location_square_deviation, 15 | get_max_location_mean_deviation, 16 | get_max_swap_deviation, 17 | get_max_swap_distance_deviation 18 | ) 19 | 20 | def roman_numerals(text): 21 | pattern = r"\b[IVXLCDM]+\b" 22 | return re.findall(pattern, text) 23 | 24 | 25 | def deduplicate(l): 26 | new_l=list(set(l)) 27 | new_l.sort(key=l.index) 28 | return new_l 29 | 30 | 31 | def parse_args(args=None): 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument( 34 | "--model_name", type=str, default=None, help="model name for evaluation" 35 | ) 36 | parser.add_argument( 37 | "--task", 38 | type=str, 39 | default="longdep_qa_reorder", 40 | help="long context understanding tasks in LooGLE", 41 | choices=[ 42 | "longdep_qa_reorder" 43 | ] 44 | ) 45 | parser.add_argument("--output_path", type=str, default="./Output/") 46 | 47 | 48 | return parser.parse_args(args) 49 | 50 | 51 | 52 | def evaluation(data, reorder_score, reorder_function): 53 | for i in range(len(data["output"])): 54 | hyp, ref = roman_numerals(data['llm_output'][i]), data['output'][i].split(',') 55 | #deduplicate(roman_numerals(data['llm_output'][i])) 56 | #hypothesis.extend(list(set(reference) - 
set(hypothesis))) 57 | 58 | if hyp == '': 59 | hyp = [] 60 | 61 | for j in reorder_function: 62 | if j not in reorder_score: 63 | reorder_score[j] = [] 64 | 65 | output = eval(j)(ref, hyp) 66 | if output != 'None': 67 | output = eval('get_max_'+j)(len(ref)) 68 | reorder_score[j].append(output) 69 | 70 | return reorder_score 71 | 72 | 73 | def get_reorder_score(result, functions): 74 | final_score = {} 75 | for i in functions: 76 | res = result[i] 77 | final_score[i] = np.mean(res) 78 | return final_score 79 | 80 | 81 | if __name__ == "__main__": 82 | args = parse_args() 83 | eval_functions = ["location_square_deviation","location_mean_deviation","swap_deviation","swap_distance_deviation" ] 84 | 85 | score_result = {} 86 | cnt = 0 87 | with open( 88 | args.output_path + args.task + "_" + args.model_name + ".jsonl", "r") as f: 89 | for line in f.readlines(): 90 | cnt += 1 91 | if cnt < 2: 92 | ds_llm = json.loads(line) 93 | score_result = evaluation(ds_llm, score_result, eval_functions) 94 | 95 | 96 | print(get_reorder_score(score_result, eval_functions)) 97 | 98 | 99 | -------------------------------------------------------------------------------- /Retrieval/pred_retrieval_based_method.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any 3 | import torch 4 | import json 5 | import argparse 6 | import openai 7 | from datasets import load_dataset 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llama_index import GPTVectorStoreIndex, Document, ServiceContext 10 | from langchain.embeddings.huggingface import HuggingFaceEmbeddings 11 | from llama_index.indices.prompt_helper import PromptHelper 12 | from llama_index.llms import ( 13 | OpenAI, 14 | CustomLLM, 15 | HuggingFaceLLM, 16 | CompletionResponse, 17 | CompletionResponseGen, 18 | LLMMetadata, 19 | ) 20 | from llama_index.llms.base import llm_completion_callback 21 | import tiktoken 22 | 23 | 24 | class OpenSourceLLM(CustomLLM): 25 | num_output: int = 0 26 | model_name: str = "" 27 | max_length: int = 0 28 | tokenizer: AutoTokenizer = None 29 | model: AutoModelForCausalLM = None 30 | 31 | def __init__(self, num_output, max_length, model_path, model_name) -> None: 32 | super().__init__() 33 | self.num_output = num_output 34 | self.model_name = model_name 35 | self.max_length = max_length 36 | self.tokenizer = AutoTokenizer.from_pretrained( 37 | os.path.join(model_path, model_name), trust_remote_code=True 38 | ) 39 | self.model = AutoModelForCausalLM.from_pretrained( 40 | os.path.join(model_path, model_name), trust_remote_code=True, torch_dtype=torch.bfloat16 41 | ).to(device) 42 | self.model.eval() 43 | 44 | @property 45 | def metadata(self) -> LLMMetadata: 46 | """Get LLM metadata.""" 47 | return LLMMetadata( 48 | context_window=self.max_length, 49 | num_output=self.num_output, 50 | model_name=self.model_name, 51 | ) 52 | 53 | @llm_completion_callback() 54 | def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse: 55 | print("input:", prompt) 56 | input_ids = self.tokenizer( 57 | prompt, truncation=False, return_tensors="pt" 58 | ).input_ids 59 | input_ids = input_ids.to("cuda") 60 | context_length = input_ids.shape[-1] 61 | with torch.no_grad(): 62 | output = self.model.generate( 63 | input_ids, 64 | max_new_tokens=self.num_output, 65 | temperature=1.0, 66 | num_beams=1, 67 | do_sample=False, 68 | repetition_penalty=float(2), 69 | )[0] 70 | text = self.tokenizer.decode( 71 | output[context_length:], skip_special_tokens=True) 72 | 
73 | return CompletionResponse(text=text) 74 | 75 | @llm_completion_callback() 76 | def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen: 77 | raise NotImplementedError() 78 | 79 | 80 | def parse_args(args=None): 81 | parser = argparse.ArgumentParser() 82 | parser.add_argument( 83 | "--model_name", 84 | type=str, 85 | default="llama-index", 86 | help="raw model name for evaluation", 87 | ) 88 | parser.add_argument( 89 | "--emb_model_name", type=str, default="", help="embedding_model" 90 | ) 91 | parser.add_argument( 92 | "--task", 93 | type=str, 94 | default=None, 95 | help="long context understanding tasks in LooGLE", 96 | choices=[ 97 | "shortdep_qa", 98 | "longdep_qa", 99 | "longdep_summarization", 100 | "shortdep_cloze", 101 | ], 102 | ) 103 | parser.add_argument( 104 | "--max_length", type=int, default=None, help="the max length of input prompt" 105 | ) 106 | 107 | parser.add_argument("--model_path", type=str, default="./Models/") 108 | parser.add_argument("--output_path", type=str, default="./Output/") 109 | 110 | return parser.parse_args(args) 111 | 112 | 113 | def num_tokens_from_string(string: str, encoding_name: str) -> int: 114 | """Returns the number of tokens in a text string.""" 115 | encoding = tiktoken.get_encoding(encoding_name) 116 | num_tokens = len(encoding.encode(string)) 117 | return num_tokens 118 | 119 | 120 | def get_pred(data_instance, service_context): 121 | ans, groundtruth = [], [] 122 | preds = {} 123 | preds["qa_pairs"] = eval(data_instance["qa_pairs"]) 124 | documents = [Document(text=data_instance["input"])] 125 | index = GPTVectorStoreIndex.from_documents( 126 | documents, service_context=service_context 127 | ) 128 | query_engine = index.as_query_engine() 129 | 130 | for j in eval(data_instance["qa_pairs"]): 131 | rsp = query_engine.query( 132 | "Question: " + j["Q"] + "\n" + "Answer: ").response 133 | ans.append(rsp) 134 | groundtruth.append(j["A"]) 135 | 136 | preds["llm_output"] = ans 137 | preds["output"] = groundtruth 138 | return preds 139 | 140 | 141 | def loads(path, task): 142 | data = [] 143 | with open(path+task+".jsonl", "r") as f: 144 | lines = f.readlines() 145 | for line in lines: 146 | data.append(json.loads(line)) 147 | return data 148 | 149 | 150 | if __name__ == "__main__": 151 | open_source_model = [ 152 | "rwkv-4-14b-pile", 153 | "long_llama_3b", 154 | "LLaMA-2-7B-32K", 155 | "chatglm2-6b-32k", 156 | ] 157 | openai_model = ["gpt-3.5-turbo-16k", "gpt-4"] 158 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 159 | args = parse_args() 160 | task2maxlen = json.load(open("./config/task2maxlen.json", "r")) 161 | max_gen = task2maxlen[args.task] 162 | # data = load_dataset("bigainlco/LooGLE", args.task, split="test") 163 | data = loads("LooGLE-testdata/", args.task) 164 | if args.model_name in open_source_model: 165 | llm = OpenSourceLLM(max_gen, args.max_length, 166 | args.model_path, args.model_name) 167 | elif args.model_name in openai_model: 168 | llm = OpenAI(model=args.model_name) 169 | else: 170 | raise NameError("model name not found!") 171 | embed_model = HuggingFaceEmbeddings(model_name=args.emb_model_name) 172 | prompt_helper = PromptHelper( 173 | context_window=args.max_length, 174 | num_output=max_gen, 175 | chunk_size_limit=1024, 176 | chunk_overlap_ratio=0.1, 177 | ) 178 | 179 | service_context = ServiceContext.from_defaults( 180 | llm=llm, 181 | context_window=args.max_length, 182 | num_output=max_gen, 183 | embed_model=embed_model, 184 | prompt_helper=prompt_helper, 185 | 
chunk_size_limit=1024, 186 | ) 187 | for i in data: 188 | predictions = get_pred(i, service_context) 189 | with open( 190 | args.output_path + args.task + "_" + args.model_name + ".jsonl", "a+" 191 | ) as g: 192 | g.write(json.dumps(predictions) + "\n") 193 | -------------------------------------------------------------------------------- /Tools/Labeling_form_A_annotator.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | 13 | 14 | Labeling form Question_annotator only 15 | 16 | 17 | 18 | 19 | 187 |
188 |

Long Document Question Answering

189 | 190 |

Instructions

191 | 192 |
193 |
194 |
195 |
Task Description
197 |
198 |
199 |
200 |
    201 |
  • Answer the given questions and find relevant evidence based on the given document.
  • 202 |
  • Capabilities needed: long text understanding, information retrieval, reasoning, counting, etc.
  • 203 |
204 |
205 |
206 |
207 |
208 |
209 |
Labeling Requirement
211 |
212 |
213 |
214 | https://docs.qq.com/doc/DSktQQkpiaHhsb2dR 215 |
216 |
217 |
218 |
219 |
220 |
Example
222 |
223 |
224 |
225 | https://docs.google.com/spreadsheets/d/1pRDwm_fUir3waiIqM5eeOIM3Qvp8TKRdQjyAxaWnAf4/edit#gid=1608534459 226 |
227 |
228 |
229 | 240 |
241 | 242 |
243 | 244 |

Annotation Form

245 | 246 |
247 | 248 | 249 | 250 |
251 |
252 | 253 | 255 |
256 |
257 | 258 | 259 |
260 |
261 | 262 | 264 |
265 | 266 |
267 |
268 | 269 | 271 |
272 | 273 |
274 | 275 | 277 |
278 | 279 |
280 |
281 | 282 | 283 | 288 |
289 |
290 | 291 | 292 | 297 |
298 |
299 | 300 | 301 | 302 |
303 | 304 | 306 |
307 | 308 |
309 |
310 | 311 | 313 | 314 |
315 |
316 |
317 | 318 | 319 |
320 |
321 | 322 | 323 | 324 | 327 | 328 | 331 | 332 | 336 | 337 |
338 | 339 | 340 | 341 | 422 | 423 | 456 | 457 | 460 | 461 | 462 | 463 | -------------------------------------------------------------------------------- /Tools/Labeling_form_Q_annotator.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | 13 | 14 | Labeling form Question_annotator only 15 | 16 | 17 | 18 | 19 | 187 |
188 |

Long Document Question Answering

189 | 190 |

Instructions

191 | 192 |
193 |
194 |
195 |
Task Description
197 |
198 |
199 |
200 |
    201 |
  • Generate long-context-dependent questions based on the given document.
  • 202 |
  • Answer the generated questions and find relevant evidence.
  • 203 |
  • Capabilities needed: long text understanding, information retrieval, reasoning, counting, etc.
  • 204 |
205 |
206 |
207 |
208 |
209 |
210 |
Labeling Requirement
212 |
213 |
214 |
215 | https://docs.qq.com/doc/DSktQQkpiaHhsb2dR 216 |
217 |
218 |
219 |
220 |
221 |
Example
223 |
224 |
225 |
226 | https://docs.google.com/spreadsheets/d/1pRDwm_fUir3waiIqM5eeOIM3Qvp8TKRdQjyAxaWnAf4/edit#gid=1608534459 227 |
228 |
229 |
230 | 241 |
242 | 243 |
244 | 245 |

Annotation Form

246 | 247 |
248 | 249 | 250 | 251 |
252 |
253 | 254 | 256 |
257 |
258 | 259 | 260 |
261 |
262 | 263 | 265 |
266 | 267 |
268 |
269 | 270 | 271 | 276 |
277 |
278 |
279 |
280 | 281 | 283 |
284 | 285 |
286 | 287 | 289 |
290 |
291 | 292 | 300 |
301 | 302 |
303 | 304 | 306 |
307 | 308 |
309 |
310 | 311 | 313 | 314 |
315 |
316 |
317 | 318 | 319 |
320 |
321 | 322 | 323 | 324 | 327 | 328 | 331 | 332 | 336 | 337 |
338 | 339 | 340 | 341 | 422 | 423 | 456 | 457 | 460 | 461 | 462 | 463 | -------------------------------------------------------------------------------- /assets/LooGle_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigai-nlco/LooGLE/6734382215bea3a63f055ffd7873b2967b6a2477/assets/LooGle_logo.png -------------------------------------------------------------------------------- /assets/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigai-nlco/LooGLE/6734382215bea3a63f055ffd7873b2967b6a2477/assets/overview.png -------------------------------------------------------------------------------- /assets/overview_page1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigai-nlco/LooGLE/6734382215bea3a63f055ffd7873b2967b6a2477/assets/overview_page1.png -------------------------------------------------------------------------------- /assets/overview_performance.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigai-nlco/LooGLE/6734382215bea3a63f055ffd7873b2967b6a2477/assets/overview_performance.png -------------------------------------------------------------------------------- /assets/table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bigai-nlco/LooGLE/6734382215bea3a63f055ffd7873b2967b6a2477/assets/table.png -------------------------------------------------------------------------------- /config/task2maxlen.json: -------------------------------------------------------------------------------- 1 | { 2 | "shortdep_qa": 300, 3 | "longdep_qa": 500, 4 | "longdep_summarization":500, 5 | "shortdep_cloze": 50 6 | } 7 | -------------------------------------------------------------------------------- /config/task2prompt.json: -------------------------------------------------------------------------------- 1 | { 2 | "shortdep_qa": "Please answer the question based on the long texts below. \n{input}\nQuestion: {Q}\nAnswer: ", 3 | "longdep_qa": "Please answer the question based on the long texts below. \n{input}\nQuestion: {Q}\nAnswer: ", 4 | "longdep_summarization": "Please generate a summary of the below paper. \n{input}\n Summarization: ", 5 | "shortdep_cloze": "Please fill in the clozes based on the given long texts below. Each of the placeholder '' in the question could be an entity of Person, Location or Organiocation. The same masks represent the same entity. Output a json format answer, for example: {{'': 'Bob', '': 'Gorrosion Magazine','': 'Bethel Horizon'}}\n{input}\n Question: {Q} What are the masked entities? \nAnswer:" 6 | } 7 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | _site 2 | .sass-cache 3 | .jekyll-cache 4 | .jekyll-metadata 5 | Gemfile.lock 6 | vendor 7 | -------------------------------------------------------------------------------- /docs/404.html: -------------------------------------------------------------------------------- 1 | --- 2 | permalink: /404.html 3 | layout: default 4 | --- 5 | 6 | 19 | 20 |
21 |

404

22 | 23 |

Page not found :(

24 |

The requested page could not be found.

25 |
26 | -------------------------------------------------------------------------------- /docs/Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gem 'jekyll' 4 | 5 | group :jekyll_plugins do 6 | gem 'github-pages' 7 | gem 'jekyll-email-protect' 8 | gem 'jekyll-github-metadata' 9 | gem 'jekyll-paginate-v2' 10 | gem 'jekyll-scholar' 11 | gem 'jekyll-twitter-plugin' 12 | gem 'jemoji' 13 | gem 'unicode_utils' 14 | gem 'webrick' 15 | end -------------------------------------------------------------------------------- /docs/_config.yml: -------------------------------------------------------------------------------- 1 | # Site settings 2 | # These are used to personalize your new site. If you look in the HTML files, 3 | # you will see them accessed via {{ site.title }}, {{ site.email }}, and so on. 4 | # You can create any custom variable you would like, and they will be accessible 5 | # in the templates via {{ site.myvariable }}. 6 | 7 | title: LooGLE Benchmark 8 | email: nlp@bigai.ai 9 | description: >- # this means to ignore newlines until "baseurl:" 10 | Long Context Generic Language Evaluation Benchmark for LLM Long Context Understanding 11 | baseurl: "/" # the subpath of your site, e.g. /blog 12 | url: "https://bigai-nlco.github.io/LooGLE" # the base hostname & protocol for your site, e.g. http://example.com 13 | github_username: bigai-nlco 14 | dataset_link: https://huggingface.co/datasets/bigainlco/LooGLE 15 | 16 | 17 | title-img: https://github.com/bigai-nlco/LooGLE/raw/main/assets/LooGle_logo.png 18 | navbar-links: 19 | Home: / 20 | Dataset: https://huggingface.co/datasets/bigainlco/LooGLE 21 | Github: https://github.com/bigai-nlco/LooGLE 22 | post_search: false 23 | 24 | 25 | # Build settings 26 | remote_theme: pages-themes/cayman@v0.2.0 27 | plugins: 28 | - jekyll-remote-theme 29 | - jekyll-github-metadata 30 | 31 | 32 | # Markdown and syntax highlight 33 | markdown: kramdown 34 | highlighter: rouge 35 | highlight_theme: github # https://github.com/jwarby/jekyll-pygments-themes 36 | kramdown: 37 | input: GFM 38 | syntax_highlighter_opts: 39 | css_class: 'highlight' 40 | span: 41 | line_numbers: false 42 | block: 43 | line_numbers: false 44 | start_line: 1 45 | 46 | # Exclude from processing. 47 | # The following items will not be processed, by default. 48 | # Any item listed under the `exclude:` key here will be automatically added to 49 | # the internal "default list". 50 | # 51 | # Excluded items can be processed by explicitly listing the directories or 52 | # their entries' file path in the `include:` list. 53 | # 54 | # exclude: 55 | # - .sass-cache/ 56 | # - .jekyll-cache/ 57 | # - gemfiles/ 58 | # - Gemfile 59 | # - Gemfile.lock 60 | # - node_modules/ 61 | # - vendor/bundle/ 62 | # - vendor/cache/ 63 | # - vendor/gems/ 64 | # - vendor/ruby/ 65 | -------------------------------------------------------------------------------- /docs/_includes/head-custom.html: -------------------------------------------------------------------------------- 1 | 3 | 4 | 7 | 10 | 13 | 14 | {% if site.gtag %} 15 | 17 | 23 | {% endif %} -------------------------------------------------------------------------------- /docs/_layouts/default.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | {% seo %} 8 | 9 | 11 | 12 | 13 | 14 | 15 | {% include head-custom.html %} 16 | 17 | 18 | 19 | 62 | Skip to the content. 63 | 64 | 80 | 81 |
82 | {{ content }} 83 | 84 | 94 |
95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /docs/assets/css/style.scss: -------------------------------------------------------------------------------- 1 | --- 2 | --- 3 | @import 'jekyll-theme-cayman'; 4 | 5 | .page-header { 6 | padding: 10rem 6rem 5rem; 7 | background-image: linear-gradient(120deg, #2774AE, #05004a); 8 | } 9 | 10 | .main-content { 11 | max-width: 88rem; 12 | } 13 | 14 | .main-content h1, 15 | .main-content h2, 16 | .main-content h3, 17 | .main-content h4, 18 | .main-content h5, 19 | .main-content h6 { 20 | color: #0550ae; 21 | } 22 | 23 | .main-content table { 24 | width: 100%; 25 | display: table; 26 | } 27 | 28 | 29 | .highlight { 30 | position: relative; 31 | } 32 | 33 | .clipboard-btn-container { 34 | position: absolute !important; 35 | right: 0 !important; 36 | top: 0 !important; 37 | display: block; 38 | 39 | } 40 | 41 | .clipboard-btn { 42 | border: 1px solid #05004a; 43 | position: relative; 44 | padding: 0 !important; 45 | margin: 8px !important; 46 | } 47 | 48 | .btn { 49 | border: 1px solid #dce6f0 !important; 50 | } 51 | 52 | .btn .octicon { 53 | margin: 8px !important; 54 | display: inline-block; 55 | overflow: visible !important; 56 | fill: black; 57 | } 58 | 59 | a { 60 | color: #2774ae; 61 | } 62 | 63 | /* --- Navbar --- */ 64 | 65 | .navbar-custom { 66 | background-color: {{ site.navbar-col | default: "#05004a" }}; 67 | border-bottom: 1px solid {{ site.navbar-border-col | default: "#000000" }}; 68 | font-family: 'Open Sans', 'Helvetica Neue', Helvetica, Arial, sans-serif; 69 | {% if site.navbar-img %} 70 | background-image: url({{ site.navbar-img | relative_url }}); 71 | background-attachment: fixed; 72 | {% endif %} 73 | -webkit-transition: padding .5s ease-in-out; 74 | -moz-transition: padding .5s ease-in-out; 75 | transition: padding .5s ease-in-out; 76 | } 77 | 78 | .navbar-custom, 79 | .navbar-custom.top-nav-short, 80 | .navbar-custom.top-nav-short-permanent { 81 | padding-top: 0; 82 | padding-bottom: 0; 83 | } 84 | 85 | .navbar-custom .navbar-brand { 86 | line-height: 1.5; 87 | padding-top: 0.625rem; 88 | padding-bottom: 0.625rem; 89 | font-size: 1.125rem; 90 | } 91 | 92 | .navbar-custom .navbar-brand-logo { 93 | -webkit-transition: padding .5s ease-in-out; 94 | -moz-transition: padding .5s ease-in-out; 95 | transition: padding .5s ease-in-out; 96 | } 97 | 98 | .navbar-custom .navbar-brand-logo, 99 | .navbar-custom.top-nav-short .navbar-brand-logo, 100 | .navbar-custom.top-nav-short-permanent .navbar-brand-logo { 101 | padding-top: 0.3125rem; 102 | padding-bottom: 0.3125rem; 103 | } 104 | 105 | .navbar-custom .navbar-brand-logo img { 106 | -webkit-transition: height .5s ease-in-out; 107 | -moz-transition: height .5s ease-in-out; 108 | transition: height .5s ease-in-out; 109 | } 110 | 111 | .navbar-custom .navbar-brand-logo img, 112 | .navbar-custom.top-nav-short .navbar-brand-logo img, 113 | .navbar-custom.top-nav-short-permanent .navbar-brand-logo img { 114 | height: 2.5rem; 115 | } 116 | 117 | .navbar-custom .navbar-nav .nav-item { 118 | text-transform: uppercase; 119 | font-size: 0.8125rem; 120 | letter-spacing: 0.0625rem; 121 | } 122 | 123 | .navbar-custom .navbar-nav .nav-link { 124 | padding-top: 0; 125 | padding-bottom: 0; 126 | line-height: 1.25rem; 127 | padding-top: 0.9375rem; 128 | padding-bottom: 0.9375rem; 129 | } 130 | 131 | .navbar-custom .navbar-brand, 132 | .navbar-custom .navbar-nav .nav-link { 133 | font-weight: 800; 134 | color: {{ site.navbar-text-col | default: 
"#404040" }}; 135 | } 136 | 137 | .navbar-toggler { 138 | font-size: 1rem; 139 | margin: 0.5rem 0; 140 | } 141 | .navbar-custom .navbar-toggler:focus, 142 | .navbar-custom .navbar-toggler:hover { 143 | background-color: initial; 144 | } 145 | 146 | .navbar-custom .navbar-toggler[aria-expanded="true"] { 147 | background-color: rgba(0, 0, 0, 0.2); 148 | } 149 | 150 | .dropdown-toggle::after { 151 | border-width: 0.4em; 152 | } 153 | 154 | @media (min-width: 1200px) { 155 | .navbar-custom { 156 | padding-top: 1.25rem; 157 | padding-bottom: 1.25rem; 158 | } 159 | 160 | .navbar-custom .navbar-brand-logo { 161 | padding-top: 0; 162 | padding-bottom: 0; 163 | } 164 | 165 | .navbar-custom .navbar-brand-logo img { 166 | height: 3.125rem; 167 | } 168 | 169 | .navbar-expand-xl .navbar-nav .nav-link { 170 | padding-left: 0.9375rem; 171 | padding-right: 0.9375rem; 172 | } 173 | 174 | .navbar-expand-xl .navbar-nav .nav-item:not(.dropdown):last-child .nav-link { 175 | padding-right: 0; 176 | } 177 | } 178 | 179 | @media (min-width: 1200px) { 180 | .navbar-custom .nav-item.dropdown:hover { 181 | background: rgba(0, 0, 0, 0.1); 182 | } 183 | } 184 | 185 | .navbar-custom .nav-item.dropdown.show { 186 | background: rgba(0, 0, 0, 0.2); 187 | } 188 | 189 | .navbar-custom .nav-item.dropdown .dropdown-menu { 190 | margin-top: 0; 191 | font-size: 1em; 192 | border: 0; 193 | padding: 0; 194 | {% if site.navbar-var-length %} 195 | min-width: 100%; 196 | {% else %} 197 | min-width: 0; 198 | width: 100%; 199 | word-break: break-word; 200 | {% endif %} 201 | } 202 | 203 | .navbar-custom .nav-item.dropdown .dropdown-menu .dropdown-item { 204 | padding: 0.625rem; 205 | background-color: {{ site.navbar-col | default: "#EAEAEA" }}; 206 | text-decoration: none !important; 207 | font-weight: normal; 208 | color: {{ site.navbar-text-col | default: "#404040" }}; 209 | {% unless site.navbar-var-length %} 210 | white-space: normal; 211 | {% endunless %} 212 | } 213 | 214 | @media (min-width: 1200px) { 215 | .navbar-custom .nav-item.dropdown .dropdown-menu { 216 | text-align: center; 217 | } 218 | 219 | .navbar-custom .nav-item.dropdown .dropdown-menu .dropdown-item { 220 | padding-left: 0.625rem; 221 | border: 1px solid {{ site.navbar-border-col | default: "#DDDDDD" }}; 222 | border-width: 0 1px 1px; 223 | } 224 | 225 | .navbar-custom .nav-item.dropdown .dropdown-menu .dropdown-item:first-child { 226 | border-top-width: 1px; 227 | } 228 | } 229 | 230 | @media (max-width: 1199px) { 231 | .navbar-custom .navbar-collapse { 232 | border-top: 1px solid {{ site.navbar-border-col | default: "#DDDDDD" }}; 233 | margin: 0 -1rem; 234 | } 235 | 236 | .navbar-custom .navbar-nav { 237 | padding: 0.5rem 0; 238 | } 239 | 240 | .navbar-custom .navbar-nav .nav-link { 241 | padding: 0.675rem 0 0.675rem 1rem; 242 | } 243 | 244 | .navbar-custom .nav-item.dropdown.show { 245 | background: rgba(0, 0, 0, 0.2); 246 | } 247 | 248 | .navbar-custom .nav-item.dropdown .dropdown-menu .dropdown-item { 249 | padding-left: 2rem; 250 | } 251 | } 252 | .navbar-custom .navbar-brand:hover, 253 | .navbar-custom .navbar-brand:focus , 254 | .navbar-custom .navbar-nav .nav-link:hover, 255 | .navbar-custom .navbar-nav .nav-link:focus, 256 | .navbar-custom .navbar-nav .dropdown-menu .dropdown-item:hover, 257 | .navbar-custom .navbar-nav .dropdown-menu .dropdown-item:focus { 258 | color: {{ site.hover-col | default: "#0085A1" }}; 259 | } 260 | 261 | .navbar-custom .avatar-container { 262 | position: absolute; 263 | left: 50%; 264 | width: 3.125rem; 265 | bottom: 
-1.5rem; 266 | transition: opacity 0.5s ease-in-out; 267 | -webkit-transition: opacity 0.5s ease-in-out; 268 | -moz-transition: opacity 0.5s ease-in-out; 269 | } 270 | .navbar-custom.top-nav-short .avatar-container { 271 | opacity: 0; 272 | visibility: hidden; 273 | transition: visibility linear 0.5s, opacity 0.5s ease-in-out; 274 | -webkit-transition: visibility linear 0.5s, opacity 0.5s ease-in-out; 275 | -moz-transition: visibility linear 0.5s, opacity 0.5s ease-in-out; 276 | } 277 | 278 | .navbar-custom .avatar-container .avatar-img-border { 279 | width: 100%; 280 | margin-left: -50%; 281 | {% unless site.round-avatar == false %} 282 | border-radius: 50%; 283 | box-shadow: 0 0 0.5rem rgba(0, 0, 0, .8); 284 | -webkit-box-shadow: 0 0 0.3125rem rgba(0, 0, 0, .8); 285 | -moz-box-shadow: 0 0 0.5rem rgba(0, 0, 0, .8); 286 | {% endunless %} 287 | } 288 | .navbar-custom .avatar-container .avatar-img { 289 | width: 100%; 290 | {% unless site.round-avatar == false %} 291 | border-radius: 50%; 292 | {% endunless %} 293 | display: block; 294 | } 295 | 296 | .navbar-custom.top-nav-expanded .avatar-container { 297 | display: none; 298 | } 299 | 300 | @media (min-width: 1200px) { 301 | .navbar-custom.top-nav-regular .avatar-container { 302 | width: 6.25rem; 303 | bottom: -1.9375rem; 304 | } 305 | 306 | .navbar-custom.top-nav-regular .avatar-container .avatar-img-border { 307 | width: 100%; 308 | {% unless site.round-avatar == false %} 309 | box-shadow: 1px 1px 2px rgba(0, 0, 0, .8); 310 | -webkit-box-shadow: 1px 1px 2px rgba(0, 0, 0, .8); 311 | -moz-box-shadow: 1px 1px 2px rgba(0, 0, 0, .8); 312 | {% endunless %} 313 | } 314 | 315 | .navbar-custom.top-nav-regular .avatar-container .avatar-img { 316 | width: 100%; 317 | } 318 | } -------------------------------------------------------------------------------- /docs/assets/js/main.js: -------------------------------------------------------------------------------- 1 | // Dean Attali / Beautiful Jekyll 2023 2 | 3 | 4 | 5 | let BeautifulJekyllJS = { 6 | 7 | bigImgEl: null, 8 | numImgs: null, 9 | 10 | init: function () { 11 | setTimeout(BeautifulJekyllJS.initNavbar, 10); 12 | 13 | // Shorten the navbar after scrolling a little bit down 14 | $(window).scroll(function () { 15 | if ($(".navbar").offset().top > 50) { 16 | $(".navbar").addClass("top-nav-short"); 17 | } else { 18 | $(".navbar").removeClass("top-nav-short"); 19 | } 20 | }); 21 | 22 | // On mobile, hide the avatar when expanding the navbar menu 23 | $('#main-navbar').on('show.bs.collapse', function () { 24 | $(".navbar").addClass("top-nav-expanded"); 25 | }); 26 | $('#main-navbar').on('hidden.bs.collapse', function () { 27 | $(".navbar").removeClass("top-nav-expanded"); 28 | }); 29 | 30 | // show the big header image 31 | BeautifulJekyllJS.initImgs(); 32 | 33 | BeautifulJekyllJS.initPres(); 34 | 35 | BeautifulJekyllJS.initSearch(); 36 | 37 | document.querySelectorAll("table").forEach(function (table) { 38 | let wrapper = document.createElement("div"); 39 | wrapper.className = "table-responsive"; 40 | table.parentNode.insertBefore(wrapper, table); 41 | table.parentNode.replaceChild(wrapper, table); 42 | wrapper.appendChild(table); 43 | table.className = "tg table table-bordered table-striped table-hover"; 44 | // table.querySelectorAll("th").forEach(function (td) { 45 | // td.removeAttribute("class"); 46 | // }); 47 | 48 | // table.querySelectorAll("td").forEach(function (td) { 49 | // td.removeAttribute("class"); 50 | // }); 51 | }); 52 | 53 | 54 | }, 55 | 56 | initNavbar: function () { 57 | // 
Set the navbar-dark/light class based on its background color 58 | const rgb = $('.navbar').css("background-color").replace(/[^\d,]/g, '').split(","); 59 | const brightness = Math.round(( // http://www.w3.org/TR/AERT#color-contrast 60 | parseInt(rgb[0]) * 299 + 61 | parseInt(rgb[1]) * 587 + 62 | parseInt(rgb[2]) * 114 63 | ) / 1000); 64 | if (brightness <= 125) { 65 | $(".navbar").removeClass("navbar-light").addClass("navbar-dark"); 66 | } else { 67 | $(".navbar").removeClass("navbar-dark").addClass("navbar-light"); 68 | } 69 | }, 70 | 71 | initImgs: function () { 72 | // If the page was large images to randomly select from, choose an image 73 | if ($("#header-big-imgs").length > 0) { 74 | BeautifulJekyllJS.bigImgEl = $("#header-big-imgs"); 75 | BeautifulJekyllJS.numImgs = BeautifulJekyllJS.bigImgEl.attr("data-num-img"); 76 | 77 | // 2fc73a3a967e97599c9763d05e564189 78 | // set an initial image 79 | const imgInfo = BeautifulJekyllJS.getImgInfo(); 80 | const src = imgInfo.src; 81 | const desc = imgInfo.desc; 82 | BeautifulJekyllJS.setImg(src, desc); 83 | 84 | // For better UX, prefetch the next image so that it will already be loaded when we want to show it 85 | const getNextImg = function () { 86 | const imgInfo = BeautifulJekyllJS.getImgInfo(); 87 | const src = imgInfo.src; 88 | const desc = imgInfo.desc; 89 | 90 | const prefetchImg = new Image(); 91 | prefetchImg.src = src; 92 | // if I want to do something once the image is ready: `prefetchImg.onload = function(){}` 93 | 94 | setTimeout(function () { 95 | const img = $("
").addClass("big-img-transition").css("background-image", 'url(' + src + ')'); 96 | $(".intro-header.big-img").prepend(img); 97 | setTimeout(function () { img.css("opacity", "1"); }, 50); 98 | 99 | // after the animation of fading in the new image is done, prefetch the next one 100 | //img.one("transitioned webkitTransitionEnd oTransitionEnd MSTransitionEnd", function(){ 101 | setTimeout(function () { 102 | BeautifulJekyllJS.setImg(src, desc); 103 | img.remove(); 104 | getNextImg(); 105 | }, 1000); 106 | //}); 107 | }, 6000); 108 | }; 109 | 110 | // If there are multiple images, cycle through them 111 | if (BeautifulJekyllJS.numImgs > 1) { 112 | getNextImg(); 113 | } 114 | } 115 | }, 116 | 117 | initPres: function () { 118 | const copyButton = "copy"; 119 | let blocks = document.querySelectorAll("pre"); 120 | blocks.forEach(function (block) { 121 | if (navigator.clipboard) { 122 | let button_container = document.createElement("div"); 123 | button_container.className = "clipboard-btn-container"; 124 | let button = document.createElement("button"); 125 | button.className = "btn clipboard-btn"; 126 | button.innerHTML = 127 | ''; 128 | // button.innerText = "copy"; 129 | button_container.appendChild(button); 130 | block.appendChild(button_container); 131 | button.addEventListener("click", async function () { 132 | await copyCode(block); 133 | }); 134 | } 135 | }); 136 | 137 | }, 138 | 139 | getImgInfo: function () { 140 | const randNum = Math.floor((Math.random() * BeautifulJekyllJS.numImgs) + 1); 141 | const src = BeautifulJekyllJS.bigImgEl.attr("data-img-src-" + randNum); 142 | const desc = BeautifulJekyllJS.bigImgEl.attr("data-img-desc-" + randNum); 143 | 144 | return { 145 | src: src, 146 | desc: desc 147 | } 148 | }, 149 | 150 | setImg: function (src, desc) { 151 | $(".intro-header.big-img").css("background-image", 'url(' + src + ')'); 152 | if (typeof desc !== typeof undefined && desc !== false) { 153 | $(".img-desc").text(desc).show(); 154 | } else { 155 | $(".img-desc").hide(); 156 | } 157 | }, 158 | 159 | initSearch: function () { 160 | if (!document.getElementById("beautifuljekyll-search-overlay")) { 161 | return; 162 | } 163 | 164 | $("#nav-search-link").click(function (e) { 165 | e.preventDefault(); 166 | $("#beautifuljekyll-search-overlay").show(); 167 | $("#nav-search-input").focus().select(); 168 | $("body").addClass("overflow-hidden"); 169 | }); 170 | $("#nav-search-exit").click(function (e) { 171 | e.preventDefault(); 172 | $("#beautifuljekyll-search-overlay").hide(); 173 | $("body").removeClass("overflow-hidden"); 174 | }); 175 | $(document).on('keyup', function (e) { 176 | if (e.key == "Escape") { 177 | $("#beautifuljekyll-search-overlay").hide(); 178 | $("body").removeClass("overflow-hidden"); 179 | } 180 | }); 181 | } 182 | }; 183 | 184 | async function copyCode (block) { 185 | const code = block.querySelector("code"); 186 | let text = code.innerText; 187 | await navigator.clipboard.writeText(text); 188 | }; 189 | 190 | // 2fc73a3a967e97599c9763d05e564189 191 | 192 | document.addEventListener('DOMContentLoaded', BeautifulJekyllJS.init); -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | layout: default 3 | --- 4 | 5 | ![](https://github.com/bigai-nlco/LooGLE/raw/main/assets/overview_page1.png) 6 | 7 | **LooGLE** is a comprehensive evaluation benchmark for LLM long context understanding which contains up-to-date (all after 2022) and 
extremely long realistic documents (over 24k tokens per document, many of which exceed 100k words) and 6,000 newly generated questions spanning diverse domains and categories. Details statistics of our dataset can be seen in the table below. 8 | 9 | **Short and long dependency tasks 📜** LooGLE is composed of 7 major tasks to evaluate LLMs' ability to understand both short and long dependency content. We refer to ``long dependency" tasks as those that require the understanding of the inter-dependency across multiple shreds of evidences widely spanning over the entire long text. We delicately design 5 types of long dependency tasks, including comprehension and reasoning, computation, timeline reorder, multiple information retrieval, and summarization. 10 | 11 | **Long context evaluation 📊** In order to provide more comprehensive and general results, LooGLE relies on automatic automatic metrics based on semantic similarity, GPT4-as-judgment and human evaluation to get an overall performance for reference. We conduct the evaluation of 8 representative LLMs. We specifically select LLMs which have made great effort in addressing the challenge of understanding long contexts by utilizing flash attention, position interpolation, optimized Transformer and finetuning, external memory etc. 12 | 13 | LooGLE not only provides a systematic and comprehensive evaluation schema on long-context LLMs, but also sheds light on future development of enhanced models towards “true long-context understanding”. 14 | 15 | 16 | 17 | ## 📌 **Statistics of LooGLE** 18 | 19 | 22 | 23 | ![](https://github.com/bigai-nlco/LooGLE/raw/main/assets/table.png) 24 | 25 | ## ✏️ **Table of Contents** 26 | - [📌 **Statistics of LooGLE**](#-statistics-of-loogle) 27 | - [✏️ **Table of Contents**](#️-table-of-contents) 28 | - [🚀 **Capability leaderboard**](#-capability-leaderboard) 29 | - [💁 **Quick Start**](#-quick-start) 30 | - [**Step 1. Prerequisites**](#step-1-prerequisites) 31 | - [**Step 2. Download the data**](#step-2-download-the-data) 32 | - [**Step 3. Generate the prediction results**](#step-3-generate-the-prediction-results) 33 | - [**Prediction for retrieval based methods**](#prediction-for-retrieval-based-methods) 34 | - [📊 **Evaluation**](#-evaluation) 35 | - [**Evaluation on Timeline reorder task**](#evaluation-on-timeline-reorder-task) 36 | - [💡 **Main result on short and long dependency tasks**](#-main-result-on-short-and-long-dependency-tasks) 37 | - [**Performance of the short dependency tasks**](#performance-of-the-short-dependency-tasks) 38 | - [**Performance of the long dependency tasks**](#performance-of-the-long-dependency-tasks) 39 | - [**Impact of input length on long dependency tasks**](#impact-of-input-length-on-long-dependency-tasks) 40 | - [📝 **Citation**](#-citation) 41 | - [📣 **Contacts**](#-contacts) 42 | 43 | 44 | ## 🚀 **Capability leaderboard** 45 | The overall performance comparisons of different models on different tasks in our dataset are shown in the figure below. 46 | 47 | ![](https://github.com/bigai-nlco/LooGLE/raw/main/assets/overview_performance.png) 48 | 49 | 50 | ## 💁 **Quick Start** 51 | ### **Step 1. Prerequisites** 52 | Clone this repo and install the dependencies. The test environment is under torch 2.0.1+cu121. 53 | 54 | ```bash 55 | cd LooGLE 56 | conda create -n loogle python=3.9 57 | conda activate loogle 58 | pip install -r requirements.txt 59 | export OPENAI_API_KEY="[your_openai_api_key]" 60 | ``` 61 | 62 | 63 | ### **Step 2. 
Download the data** 64 | You can download and load the **LooGLE** data through the Hugging Face datasets ([🤗 HF Repo](https://huggingface.co/datasets/bigainlco/LooGLE)): 65 | 66 | ```python 67 | from datasets import load_dataset 68 | 69 | datasets = ["shortdep_qa", "shortdep_cloze", "longdep_qa", "longdep_summarization"] 70 | 71 | for testset in datasets: 72 | data = load_dataset('bigainlco/LooGLE', testset, split='test') 73 | # evaluate your model 74 | ``` 75 | You can also access our sample data [LooGLE-testdata/](LooGLE-testdata/). 76 | 77 | All data in **LooGLE** are standardized to the following format: 78 | ```json 79 | { 80 | "input": "The original long input texts", 81 | "title": "The title of the given document", //for arxiv paper, we use "title" to refer the identical ID for specific paper 82 | "qa_pairs":[ 83 | { 84 | "Q": "Question to ask based on the given input", 85 | "A": "Groundtruth answer for the question", 86 | "S": [ "One or more evidence (complete sentences) for answering the question, which are extracted directly from the original input" 87 | ] 88 | }, 89 | ] // There are multiple questions and corresponding answers in the list (each of them is in json format) 90 | // For arxiv paper summarization, we use "none" instead for non-qa/non-cloze tasks 91 | "output": "none" // the predicted outputs of LLM given the long input and instructions, which is initialized as "none" 92 | ``` 93 | To mention that, in long dependency QA data, we add an extra key `type` for each question in json to indicate the 4 types of long dependency tasks(apart from summarization). 94 | 95 | 96 | ### **Step 3. Generate the prediction results** 97 | We test LLMs using 3 python codes under the path [Prediction/](Prediction/) for corresponding types of models. We select the model for evaluation via `--model_name` and the specific task via `--task`. Let's take short dependency QA as an example: 98 | 99 | For GPT-3.5-turbo and GPT4: 100 | ``` 101 | python Prediction/pred_gpt_models.py --model_name gpt-3.5-turbo-16k --task shortdep_qa --max_length 500 102 | ``` 103 | 104 | For LlamaIndex: 105 | ``` 106 | python Prediction/pred_llamaindex.py --task shortdep_qa --max_length 500 107 | ``` 108 | 109 | For other open-source models (take chatglm2-6b-32k as an example): 110 | ``` 111 | python Prediction/pred_opensource_models.py --model_name chatglm2-6b-32k --task shortdep_qa --max_length 500 112 | ``` 113 | 114 | Open-source models can be download and loaded from [Models/](Models/) by default, you can change the path via `--model_path` 115 | 116 | You can also determine the long texts output result through `--output_path`. 117 | 118 | Please note that in `config/`, we provide the prompt format suitable for each task and the maximum generation length. The input parameter `--max_length` limits the max length of input prompt for selcted model. Feel free to modify them to better suit the model you want to evaluate. 119 | 120 | We test all the open-source baselines with a single 80G A800 GPU in BF16 precision. For Llama-2 based models, we recommend using [Flash Attention](https://github.com/Dao-AILab/flash-attention) for optimization and saving GPU memory. 121 | 122 | 123 | ### **Prediction for retrieval based methods** 124 | 125 | To evaluate the effectiveness of retrieval techniques for long-context dependency questions, we undertook an extensive experiments by replacing the base LLM model in LlamaIndex with different baseline LLMs. 
126 | 127 | For retrieval-based methods (take chatglm2-6b-32k as an example): 128 | ``` 129 | python Retrieval/pred_retrieval_based_method.py --model_name chatglm2-6b-32k --task shortdep_qa --max_length 500 --emb_model_name sentence-transformers/all-mpnet-base-v2 130 | ``` 131 | Use `--emb_model_name` to set the embedding model for retrieval-based methods; all-mpnet-base-v2 is used by default. 132 | 133 | 134 | ## 📊 **Evaluation** 135 | 136 | Given the prediction file generated in Step 3, we run the evaluation code in [Evaluation/](Evaluation/). 137 | 138 | For automatic evaluation on the short and long dependency QA and summarization tasks (e.g. short dependency QA): 139 | 140 | ``` 141 | python Evaluation/automatic_eval.py --model_name chatglm2-6b-32k --task shortdep_qa --eval_metric automatic_sim 142 | ``` 143 | 144 | For automatic evaluation on the cloze task: 145 | 146 | ``` 147 | python Evaluation/automatic_eval.py --model_name chatglm2-6b-32k --task shortdep_cloze --eval_metric automatic_match 148 | ``` 149 | 150 | For LLM-as-judge on the short and long dependency QA and summarization tasks (e.g. short dependency QA): 151 | 152 | ``` 153 | python Evaluation/llm_eval.py --model_name chatglm2-6b-32k --task shortdep_qa 154 | ``` 155 | 156 | Besides the parameters specifying `--model_name` and `--task`, we provide `--eval_metric` for users to choose the automatic evaluation method from [`automatic_sim`, `automatic_match`]. 157 | 158 | Automatic metrics based on semantic similarity matching, including BLEU, ROUGE, METEOR, BERTScore and exact/partial match, are supported. Feel free to add other metrics for your needs in [Evaluation/automatic_metrics.py](Evaluation/automatic_metrics.py). Besides, the GPT4 prompt given in the repo can be altered for further evaluation. 159 | 160 | 161 | ### **Evaluation on Timeline reorder task** 162 | We provide four metrics: LSD (location square deviation), LMD (location mean deviation), SD 163 | (swap deviation), and SDD (swap distance deviation) to measure the similarity of numeric sequences for the timeline reorder task with regularized outputs. Details of the implementations can be seen in our paper. 164 | 165 | For LLMs on the long dependency timeline reorder task: 166 | ``` 167 | python Reorder/reorder_eval.py --model_name chatglm2-6b-32k 168 | ``` 169 | 170 |
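For intuition, here is a toy example (not part of the repo) of how these deviations behave, using the functions in `Reorder/get_reorder_deviation.py` and assuming it is run from inside `Reorder/`:

```python
# Toy illustration (not part of the repo) of the reorder deviation metrics,
# using the functions defined in Reorder/get_reorder_deviation.py.
from get_reorder_deviation import (
    location_square_deviation,   # LSD
    location_mean_deviation,     # LMD
    swap_deviation,              # SD
)

gold = [1, 2, 3, 4]   # reference event order
pred = [2, 1, 3, 4]   # model output with the first two events swapped

print(location_square_deviation(gold, pred))  # 0.5 -> mean squared position error
print(location_mean_deviation(gold, pred))    # 0.5 -> mean absolute position error
print(swap_deviation(gold, pred))             # 1   -> one swap restores the gold order
```

Larger scrambles give larger deviations, up to the per-length maxima computed in `Reorder/get_max_deviation.py`.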
## 💡 **Main result on short and long dependency tasks**

### **Performance of the short dependency tasks**

Columns Bleu1 through GPT4 score are for short dependency QA; Exact Match and Partial Match are for the cloze task.

| Models | Context | Bleu1 | Bleu4 | Rouge1 | Rouge4 | RougeL | Meteor score | Bert score | GPT4 score | Exact Match | Partial Match |
|---|---|---|---|---|---|---|---|---|---|---|---|
| GPT4-32k | 32k | 24.61 | 11.14 | 61.80 | 50.73 | 60.75 | 32.94 | 78.72 | 71.52 | 70.50 | 80.81 |
| GPT4-8k | 8k | 27.35 | 14.38 | 67.59 | 56.01 | 65.77 | 38.56 | 87.93 | 53.99 | 66.03 | 76.62 |
| GPT3.5-turbo-16k | 16k | 22.67 | 9.62 | 62.56 | 48.63 | 60.66 | 32.58 | 87.04 | 66.82 | 54.64 | 63.42 |
| LlamaIndex | - | 33.37 | 21.43 | 58.82 | 42.93 | 57.08 | 37.17 | 86.58 | 59.61 | 58.95 | 66.86 |
| ChatGLM2-6B | 32k | 14.29 | 6.07 | 20.50 | 13.16 | 20.36 | 13.08 | 87.28 | 23.65 | 0.05 | 0.98 |
| LongLLaMa-3B | 256k | 1.37 | 0.26 | 26.97 | 11.02 | 26.10 | 11.34 | 71.65 | 13.75 | - | 2.13 |
| RWKV-4-14B-pile | 8k | 0.80 | 0.04 | 21.70 | 6.39 | 20.64 | 9.41 | 70.42 | 8.93 | - | - |
| LLaMA2-7B-32K | 32k | 0.18 | 7.25e-308 | 1.86 | 0.00 | 1.86 | 1.52 | 61.53 | 3.18 | - | 0.58 |
### **Performance of the long dependency tasks**
**arXiv paper summarization**

| Models | Context | Bleu1 | Bleu4 | Rouge1 | Rouge4 | RougeL | Meteor score | Bert score | GPT4 score |
|---|---|---|---|---|---|---|---|---|---|
| GPT4-32k | 32k | 24.50 | 0.73 | 27.15 | 7.10 | 24.25 | 19.03 | 84.04 | 82.84 |
| GPT4-8k | 8k | 29.02 | 2.09 | 32.08 | 11.11 | 28.85 | 22.64 | 84.92 | 85.42 |
| GPT3.5-turbo-16k | 16k | 28.70 | 1.59 | 32.04 | 10.69 | 28.89 | 22.34 | 84.82 | 86.84 |
| LlamaIndex | - | 22.53 | 0.63 | 26.28 | 6.97 | 23.73 | 21.07 | 83.09 | 76.35 |
| ChatGLM2-6B | 32k | 0.04 | 1.60e-310 | 5.97 | 8.43E-05 | 5.82 | 6.40 | 73.25 | 13.23 |
| LongLLaMa-3B | 256k | 4.24 | 9.32e-309 | 4.10 | 0.52 | 3.86 | 3.82 | 73.41 | 12.28 |
| RWKV-4-14B-pile | 8k | 6.28 | 4.58E-05 | 6.45 | 0.74 | 6.01 | 6.00 | 75.28 | 7.02 |
| LLaMA2-7B-32K | 32k | 0.03 | 4.66e-310 | 0.12 | 0.00 | 0.12 | 0.67 | 71.21 | 7.60 |

**Long dependency QA**

| Models | Context | Bleu1 | Bleu4 | Rouge1 | Rouge4 | RougeL | Meteor score | Bert score | GPT4 score |
|---|---|---|---|---|---|---|---|---|---|
| GPT4-32k | 32k | 8.55 | 1.40 | 25.59 | 6.36 | 24.04 | 11.13 | 80.16 | 54.09 |
| GPT4-8k | 8k | 8.94 | 1.01 | 23.45 | 6.57 | 21.69 | 10.18 | 85.36 | 42.12 |
| GPT3.5-turbo-16k | 16k | 6.92 | 1.81 | 25.02 | 6.68 | 23.63 | 10.40 | 83.79 | 45.04 |
| LlamaIndex | - | 7.76 | 1.24 | 23.62 | 7.10 | 22.30 | 10.47 | 83.87 | 37.63 |
| ChatGLM2-6B | 32k | 5.55 | 0.11 | 9.41 | 1.93 | 8.69 | 4.39 | 85.78 | 11.50 |
| LongLLaMa-3B | 256k | 1.04 | 3.12E-307 | 2.96 | 0.03 | 2.71 | 1.66 | 78.60 | 6.48 |
| RWKV-4-14B-pile | 8k | 0.71 | 9.52E-307 | 18.54 | 1.55 | 17.69 | 3.45 | 71.36 | 5.33 |
| LLaMA2-7B-32K | 32k | 0.08 | 2.44E-308 | 2.05 | 0.00 | 2.05 | 0.46 | 50.28 | 4.18 |
### **Impact of input length on long dependency tasks**
**arXiv paper summarization**

| Models | Context | Bleu1 | Bleu4 | Rouge1 | Rouge4 | RougeL | Meteor score | Bert score | GPT4 score |
|---|---|---|---|---|---|---|---|---|---|
| GPT4-32k | 32k | 24.50 | 0.73 | 27.15 | 7.10 | 24.25 | 19.03 | 84.04 | 82.84 |
| GPT4-32k | 24k | 25.57 | 0.81 | 27.61 | 7.53 | 24.73 | 19.86 | 84.07 | 83.15 |
| GPT4-32k | 16k | 24.8 | 0.70 | 27.29 | 7.26 | 24.28 | 19.12 | 84.11 | 82.82 |
| GPT4-32k | 8k | 26.26 | 9.35 | 27.83 | 7.67 | 24.74 | 20.08 | 84.10 | 82.75 |
| GPT4-8k | 8k | 29.02 | 2.09 | 32.08 | 11.11 | 28.85 | 22.64 | 84.92 | 85.42 |

**Long dependency QA**

| Models | Context | Bleu1 | Bleu4 | Rouge1 | Rouge4 | RougeL | Meteor score | Bert score | GPT4 score |
|---|---|---|---|---|---|---|---|---|---|
| GPT4-32k | 32k | 7.64 | 1.24 | 15.53 | 4.46 | 14.60 | 11.12 | 86.07 | 54.65 |
| GPT4-32k | 24k | 8.23 | 1.66 | 14.92 | 4.12 | 13.90 | 10.60 | 86.16 | 50.61 |
| GPT4-32k | 16k | 8.57 | 1.35 | 16.21 | 4.30 | 14.90 | 11.91 | 86.36 | 47.55 |
| GPT4-32k | 8k | 7.46 | 1.77 | 13.75 | 5.08 | 12.89 | 10.01 | 85.77 | 38.34 |
| GPT4-8k | 8k | 8.94 | 1.01 | 23.45 | 6.57 | 21.69 | 10.18 | 85.36 | 42.12 |
## 📝 **Citation**

If you would like to use our data or find our work interesting, please cite:
```bibtex
@article{li2023loogle,
  title={LooGLE: Can Long-Context Language Models Understand Long Contexts?},
  author={Li, Jiaqi and Wang, Mengmeng and Zheng, Zilong and Zhang, Muhan},
  journal={arXiv preprint arXiv:2311.04939},
  year={2023}
}
```

## 📣 **Contacts**

We sincerely appreciate the human annotators for their valuable contributions to creating high-quality long-dependency QA tasks.
We are happy to answer any questions about LooGLE: [nlp@bigai.ai](mailto:nlp@bigai.ai)
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
datasets==2.14.4
flash-attn==2.0.8
GPUtil==1.4.0
huggingface-hub==0.15.1
langchain==0.0.300
llama-index==0.8.29.post1
numpy==1.24.2
openai==0.27.2
rouge-score==0.1.2
tiktoken==0.5.1
tokenizers==0.13.3
tqdm==4.65.0
transformers==4.30.0
nltk==3.8.1
sentence-transformers==2.2.2
protobuf==3.20.0
--------------------------------------------------------------------------------