├── .gitignore
├── README.md
├── baselines
│   ├── README.md
│   ├── apply_da_baselines.py
│   ├── da_baselines.py
│   ├── example_codalab_dev.zip
│   ├── model_dict.py
│   ├── random_score.py
│   └── sample_test_set_submission.zip
├── data
│   ├── README.md
│   ├── dev_en_de.zip
│   ├── dev_summarization.zip
│   ├── dev_zh_en.zip
│   ├── en_de.zip
│   ├── summarization.zip
│   ├── test
│   │   ├── mt_en_de_test.tsv
│   │   ├── mt_en_es_test.tsv
│   │   ├── mt_en_zh_test.tsv
│   │   └── summarization_test_set.tsv
│   └── zh_en.zip
├── evaluation
│   ├── README.md
│   └── dev_evaluation.py
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Eval4NLP 2023 Shared Task: Prompting Large Language Models as Explainable Evaluation Metrics

This is the GitHub repository for the 2023 Eval4NLP shared task: "Prompting Large Language Models as Evaluation Metrics". For more information visit: https://eval4nlp.github.io/2023/shared-task.html.

You can execute the following commands to set up an environment for running the baselines. Note that, due to the model sizes, some baseline settings can be resource heavy. (Tested on an Ubuntu 22.04 cluster with SLURM.)

```
conda create --name Eval4NLP23 python=3.10
conda activate Eval4NLP23
#conda install pip # this might be necessary in some cases
pip install -r requirements.txt
```

The `data` folder contains the train, dev and test sets. The `baselines` folder contains baseline scripts and an example submission for the Codabench test phase of our shared task.

## Test Phase

The test phase of the shared task is being conducted on Codabench: https://www.codabench.org/competitions/1359/#/pages-tab

Our CodaLab competition for the dev phase can be found here: https://codalab.lisn.upsaclay.fr/competitions/15072

## Citation

```
@article{leiter2023eval4nlp,
  title={The Eval4NLP 2023 Shared Task on Prompting Large Language Models as Explainable Metrics},
  author={Christoph Leiter and Juri Opitz and Daniel Deutsch and Yang Gao and Rotem Dror and Steffen Eger},
  journal={arXiv preprint arXiv:2310.19792},
  year={2023}
}
```


--------------------------------------------------------------------------------
/baselines/README.md:
--------------------------------------------------------------------------------
# Baselines

This directory contains the baselines of the shared task. During the beginning of the dev phase, we plan to add further
baselines. The current baselines are the following:

* `random_score.py` - A random baseline. It produces a random score for every input summary/sentence. The current
implementation iterates over the dev sets.
* `da_baselines.py` - A baseline metric that prompts LLMs to return DA (direct assessment) scores. We enforce the output score format with
the Microsoft Guidance library (https://github.com/microsoft/guidance).
* `apply_da_baselines.py` - A simple script that applies the baseline LLM metrics to the three dev sets. These need to be unzipped first.
* `example_codalab_dev.zip` - An example submission for the summarization dev task. The scores were produced with the Guanaco DA baseline.

Further, the methods in `model_dict.py` should give a first idea of how to load these models.
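
To make the moving parts concrete, here is a minimal, illustrative sketch of how the DA baseline scores a single MT example. It mirrors the `__main__` block of `da_baselines.py` and the model catalogue in `model_dict.py`; it assumes the environment from the top-level README, a CUDA GPU with enough memory for the chosen checkpoint, and that the model weights are downloaded from the Hugging Face Hub on first use. The German hypothesis is just a toy input.

```python
from da_baselines import DirectAssessment
from model_dict import load_from_catalogue

# Pick any key from the catalogue in model_dict.py.
model, tokenizer, user_prompt, assistant_prompt = load_from_catalogue(
    "TheBloke/WizardLM-13B-V1.1-GPTQ"
)

# DirectAssessment wraps the model with guidance so that the generated score
# is constrained to an integer between 0 and 100.
scorer = DirectAssessment(model=model, tokenizer=tokenizer)

# mt=True selects the translation prompt; mt=False would use the summarization prompt.
_, score = scorer.prompt_model(
    gt="I have a small cat",               # source segment
    hyp="Ich habe eine kleine Katze",      # translation hypothesis (toy example)
    mt=True,
    prompt_placeholder=user_prompt,
    response_placeholder=assistant_prompt,
    source_lang="English",
    target_lang="German",
)
print(score)
```
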
We enforce the output scores using 9 | the Microsoft Guidance Library (https://github.com/microsoft/guidance) 10 | * `apply_da_baselines.py` - A simple script that applys the baseline llm metrics to the three dev sets. These need to be unzipped first. 11 | * `example_codalab_dev.zip` - Example submission on the summarization dev task. Scores produced with the guanaco da baseline. 12 | 13 | Further, the methods in `model_dict.py` should give first hints on how to load these models. 14 | -------------------------------------------------------------------------------- /baselines/apply_da_baselines.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import csv 3 | import torch 4 | 5 | from da_baselines import DirectAssessment 6 | from model_dict import load_from_catalogue 7 | from tqdm import tqdm 8 | 9 | modelname = "TheBloke/WizardLM-13B-V1.1-GPTQ" 10 | model_key = "wizard" 11 | model, tokenizer, u_prompt, a_prompt = load_from_catalogue(modelname) 12 | BPG = None 13 | 14 | files = { 15 | "de":"../data/en_de/dev_en_de.tsv", 16 | "zh":"../data/zh_en/dev_zh_en.tsv", 17 | "sum":"../data/summarization/dev_summarization.tsv" 18 | } 19 | 20 | for key, file in files.items(): 21 | df = pd.read_csv(file, sep="\t", quoting=csv.QUOTE_NONE) 22 | scores = [] 23 | 24 | if key =="sum": 25 | mt = False 26 | else: 27 | mt = True 28 | 29 | cnt = 0 30 | for s, h in tqdm(df[["SRC","HYP"]].values.tolist(), desc=key + " progress: "): 31 | 32 | # Ugly fix for memory leak; perhaps with the guidance module 33 | if BPG: 34 | del BPG 35 | BPG = DirectAssessment(model=model, tokenizer=tokenizer) 36 | 37 | print(cnt) 38 | print(torch.cuda.mem_get_info()) 39 | score = BPG.prompt_model( 40 | gt=s, 41 | hyp=h, 42 | mt=mt, 43 | prompt_placeholder=u_prompt, 44 | response_placeholder=a_prompt, 45 | target_lang= "English" if key == "zh" else "German", 46 | source_lang= "Chinese" if key == "zh" else "English", 47 | verbose=False 48 | ) 49 | scores.append(score) 50 | cnt+=1 51 | 52 | df["baseline"] = scores 53 | df["baseline"].to_csv(key+model_key, header=False,index=False) 54 | 55 | 56 | -------------------------------------------------------------------------------- /baselines/da_baselines.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This baseline implements the direct assessment prompt by 3 | Kocmi and Federmann, Large Language Models Are State-of-the-Art Evaluators of Translation Quality. 
--------------------------------------------------------------------------------
/baselines/da_baselines.py:
--------------------------------------------------------------------------------
'''
This baseline implements the direct assessment (DA) prompt by
Kocmi and Federmann, "Large Language Models Are State-of-the-Art Evaluators of Translation Quality",
arXiv: 2302.14520,
for open-source LLMs, covering both MT and summarization.
'''

import guidance, torch

from model_dict import load_from_catalogue


class DirectAssessment:
    def __init__(self, model, tokenizer, **kwargs):
        self.model = guidance.llms.Transformers(
            model, tokenizer=tokenizer, trust_remote_code=True, **kwargs
        )

    def set_model(self, model):
        self.model = model
        guidance.llms.Transformers.cache.clear()


    def direct_assessment_mt_block(
        self,
        hyp,
        gt,
        prompt_placeholder="",
        response_placeholder="",
        source_lang="en",
        target_lang="de",
    ):
        return "\n".join(
            [
                prompt_placeholder,
                f"Score the following translation from {source_lang} to {target_lang} with respect to",
                "the source sentence on a continuous scale from 0 to 100, where a score of zero means",
                '"no meaning preserved" and score of one hundred means "perfect meaning and grammar".',
                f'{source_lang} source: "{gt}"',
                f'{target_lang} translation: "{hyp}"',
                response_placeholder,
                "Score: {{gen 'score' pattern='(100|[1-9]?[0-9])'}}",
            ]
        )

    def direct_assessment_summ_block(
        self,
        hyp,
        gt,
        prompt_placeholder="",
        response_placeholder="",
    ):
        return "\n".join(
            [
                prompt_placeholder,
                "Score the summarization with respect to the summarized document",
                "on a continuous scale from 0 to 100, where a score of zero means",
                '"irrelevant, factually incorrect and not readable" and score of one hundred means',
                '"relevant, factually correct, good readability".',
                f'Source text: "{gt}"',
                f'Summary: "{hyp}"',
                response_placeholder,
                "Score: {{gen 'score' pattern='(100|[1-9]?[0-9])'}}",
            ]
        )

    def prompt_model(
        self,
        gt,
        hyp,
        mt=True,
        prompt_placeholder=None,
        response_placeholder=None,
        target_lang="German",
        source_lang="English",
        verbose=False
    ):
        if mt:
            prompt = self.direct_assessment_mt_block(
                gt=gt,
                hyp=hyp,
                response_placeholder=response_placeholder,
                prompt_placeholder=prompt_placeholder,
                target_lang=target_lang,
                source_lang=source_lang
            )
        else:
            prompt = self.direct_assessment_summ_block(
                gt=gt,
                hyp=hyp,
                response_placeholder=response_placeholder,
                prompt_placeholder=prompt_placeholder
            )

        if verbose:
            print(prompt)

        guidance_prompt = guidance(prompt, llm=self.model)
        res = guidance_prompt()

        torch.cuda.empty_cache()
        return res.text, res["score"]


if __name__ == "__main__":
    #modelname = "NousResearch/Nous-Hermes-13b"
    modelname = "TheBloke/guanaco-65B-GPTQ"
    #modelname = "TheBloke/WizardLM-13B-V1.1-GPTQ"
    model, tokenizer, u_prompt, a_prompt = load_from_catalogue(modelname)
    BPG = DirectAssessment(model=model, tokenizer=tokenizer)

    _, score = BPG.prompt_model(
        gt="I have a small cat",
        hyp="Ich habe eine große Katze",
        prompt_placeholder=u_prompt,
        response_placeholder=a_prompt
    )

    print(score)

    _, score = BPG.prompt_model(
        gt="I like to eat fish. Therefore, I like to go to the restaurant. There, I often eat snails, which I like to eat, too",
        hyp="I like to eat fish and snails at the restaurant",
        mt=False,
        prompt_placeholder=u_prompt,
        response_placeholder=a_prompt
    )

    print(score)

--------------------------------------------------------------------------------
/baselines/example_codalab_dev.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eval4nlp/SharedTask2023/c42352793850a8c08e76d384bc3563b7906cb1ab/baselines/example_codalab_dev.zip
--------------------------------------------------------------------------------
/baselines/model_dict.py:
--------------------------------------------------------------------------------
import torch
from transformers import (
    LlamaForCausalLM,
    AutoTokenizer,
    AutoModelForCausalLM,
    LlamaTokenizer
)
from auto_gptq import AutoGPTQForCausalLM
from peft import PeftModel


def load_automodel(model_name, trust_remote_code=False, dtype=torch.float16, load_in_8bit=False):
    model = AutoModelForCausalLM.from_pretrained(
        model_name, device_map="auto", torch_dtype=dtype, trust_remote_code=trust_remote_code, load_in_8bit=load_in_8bit
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer


def load_llama_model(model_name):
    model = LlamaForCausalLM.from_pretrained(
        model_name, device_map="auto", torch_dtype=torch.float16
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer

def load_peft_model(model_name, orig):
    model = LlamaForCausalLM.from_pretrained(
        model_name, device_map="auto"
    )
    tokenizer = LlamaTokenizer.from_pretrained(model_name)
    model = PeftModel.from_pretrained(model, orig)
    return model, tokenizer


def load_gptq_model(model_name, trust_remote_code=False, quantize_config=None, inject_fused_attention=True):
    model = AutoGPTQForCausalLM.from_quantized(
        model_name,
        use_safetensors=True,
        trust_remote_code=trust_remote_code,
        device="cuda:0",
        use_triton=False,
        quantize_config=quantize_config,
        inject_fused_attention=inject_fused_attention
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return model, tokenizer


def load_from_catalogue(model_name):
    """
    A method to load the models of the shared task. A collection of instruction strings from the Hugging Face model cards
    is provided with the dictionary as "user_prompt" and "assistant_prompt". These are only examples; you don't have to
    use them.
    @param model_name: The model to instantiate
    @return: model, tokenizer, user_prompt, assistant_prompt
    """
    catalogue = {
        "NousResearch/Nous-Hermes-13b": {
            "load_method": load_llama_model,
            "user_prompt": "### Instruction:",
            "assistant_prompt": "### Response:",
        },
        "TheBloke/guanaco-65B-GPTQ": {
            "load_method": lambda x: load_gptq_model(
                x, trust_remote_code=True,  # might not need the trust remote code
            ),
            "user_prompt": "### Human:",
            "assistant_prompt": "### Assistant:",
        },
        "TheBloke/WizardLM-13B-V1.1-GPTQ": {
            "load_method": lambda x: load_gptq_model(
                x, trust_remote_code=True,  # might not need the trust remote code
            ),
            "user_prompt": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\n\n USER: ",
            "assistant_prompt": "Assistant: ",
        },
        "TheBloke/Platypus2-70B-Instruct-GPTQ": {
            "load_method": lambda x: load_gptq_model(
                x, trust_remote_code=False, inject_fused_attention=False
            ),
            "user_prompt": "### Instruction:",
            "assistant_prompt": "### Response:",
        },
        "Open-Orca/OpenOrca-Platypus2-13B": {
            "load_method": lambda x: load_automodel(x, trust_remote_code=True),
            "user_prompt": "### Instruction:",
            "assistant_prompt": "### Response:",
        },
        "psmathur/orca_mini_v3_7b": {
            "load_method": lambda x: load_automodel(x, trust_remote_code=True, load_in_8bit=True),
            "user_prompt": "### System: You are an AI assistant that follows instruction extremely well. Help as much as you can. \n\n ### User:",
            "assistant_prompt": "### Assistant:",
        }
    }


    model, tokenizer = catalogue[model_name]["load_method"](model_name)

    return model, tokenizer, catalogue[model_name]["user_prompt"], catalogue[model_name]["assistant_prompt"]
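
# Example usage (an illustrative addition, not part of the original module):
# loading a catalogue entry requires a CUDA GPU and downloads the checkpoint from
# the Hugging Face Hub on first use; the 8-bit 7B model below is one of the
# smallest entries in the catalogue.
if __name__ == "__main__":
    model, tokenizer, user_prompt, assistant_prompt = load_from_catalogue(
        "psmathur/orca_mini_v3_7b"
    )
    # The returned placeholder strings wrap a task prompt in the instruction/chat
    # format suggested by the respective model card.
    print(user_prompt)
    print(assistant_prompt)
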
--------------------------------------------------------------------------------
/baselines/random_score.py:
--------------------------------------------------------------------------------
import random, csv
import pandas as pd

for file in ["dev_summarization.tsv",
             "dev_zh_en.tsv",
             "dev_en_de.tsv"]:
    df = pd.read_csv(file, sep="\t", quoting=csv.QUOTE_NONE)
    df["random"] = [random.uniform(-1, 1) for i in range(len(df))]
    df["random"].to_csv(file + ".seg.scores", header=False, index=False)

--------------------------------------------------------------------------------
/baselines/sample_test_set_submission.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eval4nlp/SharedTask2023/c42352793850a8c08e76d384bc3563b7906cb1ab/baselines/sample_test_set_submission.zip
--------------------------------------------------------------------------------
/data/README.md:
--------------------------------------------------------------------------------
# Train and Dev Data

The zip files in this folder contain the train and dev sentences/documents of the Eval4NLP23 shared task. We re-use existing
datasets with a random train/dev split each. As we evaluate in a reference-free setting, we only provide the source and
no references. Specifically, we use:

1. The MQM *en-de* and *zh-en* language pairs of the WMT 22 metrics shared task, a work by Freitag et al., Results of WMT22 Metrics Shared Task: Stop Using BLEU -- Neural Metrics Are Better and More Robust. In: WMT22
2. The average aspect score of SummEval, a work by Fabbri et al., SummEval: Re-evaluating Summarization Evaluation. In: Transactions of the Association for Computational Linguistics

The dev sets can be evaluated on our CodaLab leaderboard. We do not provide their scores here, to make the dev phase more
interesting. Theoretically, you could match them back to their original datasets. As the dev phase has no influence on the
shared task results, please refrain from adding the ground truth to the leaderboards.

The licenses of the respective datasets are placed inside the zip files.

The tsv files can be loaded with pandas:

```python
import pandas as pd
import csv

df_source = pd.read_csv("<path to tsv file>", sep="\t", quoting=csv.QUOTE_NONE)
```
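
Metric outputs are expected as plain text files with one float per input segment, in the same order as the rows of the tsv file (this is how `baselines/random_score.py` writes its scores, and what the script in the `evaluation` folder reads). Below is a small sketch of producing such a file; the file names are placeholders, and `toy_metric` is only a dummy stand-in for a real metric:

```python
import csv
import pandas as pd

def toy_metric(src: str, hyp: str) -> float:
    # Dummy stand-in: negative length difference between source and hypothesis.
    return -abs(len(src.split()) - len(hyp.split()))

df = pd.read_csv("dev_en_de.tsv", sep="\t", quoting=csv.QUOTE_NONE)

# The dev tsv files provide the source/document as "SRC" and the translation/summary as "HYP".
scores = [toy_metric(src, hyp) for src, hyp in df[["SRC", "HYP"]].values.tolist()]

# One float per line, same order as the input rows.
pd.Series(scores).to_csv("dev_en_de.seg.scores", header=False, index=False)
```
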
--------------------------------------------------------------------------------
/data/dev_en_de.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eval4nlp/SharedTask2023/c42352793850a8c08e76d384bc3563b7906cb1ab/data/dev_en_de.zip
--------------------------------------------------------------------------------
/data/dev_summarization.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eval4nlp/SharedTask2023/c42352793850a8c08e76d384bc3563b7906cb1ab/data/dev_summarization.zip
--------------------------------------------------------------------------------
/data/dev_zh_en.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eval4nlp/SharedTask2023/c42352793850a8c08e76d384bc3563b7906cb1ab/data/dev_zh_en.zip
--------------------------------------------------------------------------------
/data/en_de.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eval4nlp/SharedTask2023/c42352793850a8c08e76d384bc3563b7906cb1ab/data/en_de.zip
--------------------------------------------------------------------------------
/data/summarization.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eval4nlp/SharedTask2023/c42352793850a8c08e76d384bc3563b7906cb1ab/data/summarization.zip
--------------------------------------------------------------------------------
/data/zh_en.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/eval4nlp/SharedTask2023/c42352793850a8c08e76d384bc3563b7906cb1ab/data/zh_en.zip
--------------------------------------------------------------------------------
/evaluation/README.md:
--------------------------------------------------------------------------------
# Evaluation

This directory contains the evaluation scripts of the shared task. In the dev phase, we will use the standard scipy implementation
of the Kendall correlation. Evaluation can be performed as follows:

```
python3 dev_evaluation.py metric_scores.txt golden_scores.txt output.txt
```

This will write the Kendall score to `output.txt`. Both input files `metric_scores.txt` and `golden_scores.txt` should
contain one corresponding float per input segment. The golden scores can be extracted from the train data in the `data`
folder.
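
If you prefer to compute the correlation inside your own code rather than via the command line, the following sketch does the same thing as `dev_evaluation.py` directly with scipy (the file names are placeholders):

```python
import numpy as np
import scipy.stats

def load_scores(path):
    # One float per line, as produced e.g. by the baseline scripts.
    with open(path) as f:
        return np.array([float(line) for line in f.read().split("\n") if line != ""])

metric_scores = load_scores("metric_scores.txt")
golden_scores = load_scores("golden_scores.txt")

tau, p_value = scipy.stats.kendalltau(metric_scores, golden_scores)
print("KENDALL:", tau)
```
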
-------------------------------------------------------------------------------- /evaluation/dev_evaluation.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | import scipy.stats 4 | 5 | def load_scores(input_file): 6 | with open(input_file) as f: 7 | return np.array([float(n) for n in f.read().split("\n") if n != ""]) 8 | 9 | 10 | def main(argv): 11 | _, input_file1, input_file2, output_file = argv 12 | 13 | res_scores = load_scores(input_file1) 14 | ref_scores = load_scores(input_file2) 15 | 16 | results = scipy.stats.kendalltau(res_scores, ref_scores)[0] 17 | 18 | with open(output_file, "w") as f: 19 | f.write("KENDALL: " + str(results)) 20 | 21 | 22 | # Run 23 | if __name__ == '__main__': 24 | main(sys.argv) 25 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | --index-url https://download.pytorch.org/whl/cu118 2 | torch 3 | torchvision 4 | torchaudio 5 | 6 | --index-url https://pypi.python.org/simple 7 | https://github.com/PanQiWei/AutoGPTQ/releases/download/v0.3.2/auto_gptq-0.3.2+cu118-cp310-cp310-linux_x86_64.whl 8 | guidance 9 | sentencepiece 10 | protobuf 11 | bitsandbytes 12 | scipy 13 | 14 | 15 | --------------------------------------------------------------------------------