├── .cache
│   ├── ChatGPT.pkl
│   ├── InstructGPT.pkl
│   ├── retrieval-finterms.json
│   └── retrieval-finterms.pkl
├── .gitignore
├── .gitmodules
├── LICENSE
├── README.es.md
├── README.ijcai_challenge.md
├── README.md
├── README.zh.md
├── docker
│   ├── DOCKERFILE
│   └── build_and_upload.sh
├── notebooks
│   ├── Finarg-ecc-auc+Edtsum_evaluation_sample.ipynb
│   └── evaluate.ipynb
├── requirements.txt
├── scripts
│   ├── docker_run.sh
│   ├── run_evaluation.sh
│   └── run_interface.sh
├── src
│   ├── chatlm.py
│   ├── eval.py
│   ├── evaluator.py
│   ├── factscore_package
│   │   ├── .cache
│   │   │   ├── demons.json
│   │   │   ├── demons.txt
│   │   │   ├── demons_complex.json
│   │   │   ├── demons_full.txt
│   │   │   ├── demos
│   │   │   │   └── demons.json
│   │   │   ├── fin_rare_terms.jsonl
│   │   │   └── finterms.jsonl
│   │   ├── __init__.py
│   │   ├── abstain_detection.py
│   │   ├── atomic_facts.py
│   │   ├── clm.py
│   │   ├── demons.json
│   │   ├── download_data.py
│   │   ├── en_core_web_sm-3.7.1.tar.gz
│   │   ├── factscorer.py
│   │   ├── lm.py
│   │   ├── npm.py
│   │   ├── openai_lm.py
│   │   ├── retrieval.py
│   │   └── utils.py
│   ├── interface.py
│   ├── model_prompt.py
│   ├── tasks
│   │   ├── __init__.py
│   │   ├── flare.py
│   │   ├── utils.py
│   │   └── zhutils.py
│   └── utils.py
└── static
    ├── av.jpg
    ├── cr.jpg
    ├── formula.jpg
    ├── md.jpg
    └── sr.jpg

/.cache/ChatGPT.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/The-FinAI/PIXIU/da45ac467ca3a828621315881e33c68bca3bbb1f/.cache/ChatGPT.pkl
--------------------------------------------------------------------------------
/.cache/InstructGPT.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/The-FinAI/PIXIU/da45ac467ca3a828621315881e33c68bca3bbb1f/.cache/InstructGPT.pkl
--------------------------------------------------------------------------------
/.cache/retrieval-finterms.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/The-FinAI/PIXIU/da45ac467ca3a828621315881e33c68bca3bbb1f/.cache/retrieval-finterms.pkl
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | __pycache__/
3 | lm_cache/
4 | *_results
5 | /*.json
6 | .hypothesis
7 | *_private.*
8 | *_private
9 | *_debug.*
10 | *.code-workspace
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "src/financial-evaluation"]
2 | path = src/financial-evaluation
3 | url = https://github.com/chancefocus/financial-evaluation.git
4 | [submodule "src/metrics/BARTScore"]
5 | path = src/metrics/BARTScore
6 | url = https://github.com/neulab/BARTScore.git
7 | [submodule "FinMem-LLM-StockTrading"]
8 | path = FinMem-LLM-StockTrading
9 | url = https://github.com/pipiku915/FinMem-LLM-StockTrading.git
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2023 乾阜资产
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do
so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.es.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 |
5 |
6 | Qianqian Xie1  7 | Weiguang Han2  8 | Zhengyu Chen2  9 | Ruoyu Xiang1  10 | Xiao Zhang1  11 | Yueru He1  12 | Mengxi Xiao2  13 | Dong Li2  14 | Yongfu Dai7  15 | Duanyu Feng7  16 | Yijing Xu1  17 | Haoqiang Kang5  18 | Ziyan Kuang12  19 | Chenhan Yuan3  20 | Kailai Yang3  21 | Zheheng Luo3  22 | Tianlin Zhang3  23 | Zhiwei Liu3  24 | Guojun Xiong10  25 | Zhiyang Deng9  26 | Yuechen Jiang9  27 | Zhiyuan Yao9  28 | Haohang Li9  29 | Yangyang Yu9  30 | Gang Hu8  31 | Jiajia Huang11  32 | Xiao-Yang Liu5  33 | Alejandro Lopez-Lira4  34 | Benyou Wang6  35 | Yanzhao Lai13  36 | Hao Wang7  37 | Min Peng2*  38 | Sophia Ananiadou3  39 | Jimin Huang1 40 |
41 |
42 | 43 |
44 | 1The Fin AI  45 | 2Wuhan University  46 | 3The University of Manchester  47 | 4University of Florida  48 | 5Columbia University  49 | 6The Chinese University of Hong Kong, Shenzhen  50 | 7Sichuan University  51 | 8Yunnan University  52 | 9Stevens Institute of Technology  53 | 10Stony Brook University  54 | 11Nanjing Audit University  55 | 12Jiangxi Normal University  56 | 13Southwest Jiaotong University 57 |
58 |
59 | 60 |
61 | Wuhan University Logo  62 | Manchester University Logo  63 | University of Florida Logo  64 | Columbia University Logo  65 | HK University (shenzhen) Logo  66 | Sichuan University  67 | Yunnan University  68 | Stevens Institute of Technology  69 | Stony Brook University  70 | Nanjing Audit University  71 | Jiangxi Normal University  72 | Southwest Jiaotong University Logo  73 |
74 | ----------------- 75 | 76 | ![](https://img.shields.io/badge/pixiu-v0.1-gold) 77 | ![](https://black.readthedocs.io/en/stable/_static/license.svg) 78 | [![Discord](https://img.shields.io/discord/1146837080798933112)](https://discord.gg/HRWpUmKB) 79 | 80 | [Pixiu Paper](https://arxiv.org/abs/2306.05443) | [FinBen Leaderboard](https://huggingface.co/spaces/finosfoundation/Open-Financial-LLM-Leaderboard) 81 | 82 | **Descargo de responsabilidad** 83 | 84 | Este repositorio y su contenido se proporcionan **únicamente con fines académicos y educativos**. Ninguno de los materiales constituye asesoramiento financiero, legal o de inversión. No se ofrecen garantías, explícitas o implícitas, respecto a la precisión, integridad o utilidad del contenido. Los autores y colaboradores no son responsables de errores, omisiones o cualquier consecuencia derivada del uso de la información aquí contenida. Los usuarios deben ejercer su propio juicio y consultar a profesionales antes de tomar cualquier decisión financiera, legal o de inversión. El uso del software e información contenida en este repositorio es bajo el propio riesgo del usuario. 85 | 86 | **Al utilizar o acceder a la información de este repositorio, usted acepta indemnizar, defender y eximir de responsabilidad a los autores, colaboradores y cualquier organización o persona afiliada por cualquier reclamo o daño.** 87 | 88 | 89 | 90 | 91 | **Puntos de control:** 92 | 93 | - [FinMA v0.1 (Full 7B version)](https://huggingface.co/ChanceFocus/finma-7b-full) 94 | 95 | **Idiomas** 96 | 97 | - [Inglés](README.md) 98 | - [Español](README.es.md) 99 | 100 | **Documentos** 101 | 102 | - [PIXIU: A Comprehensive Benchmark, Instruction Dataset and Large Language Model for Finance](https://arxiv.org/abs/2306.05443) 103 | - [The FinBen: An Holistic Financial Benchmark for Large Language Models](https://arxiv.org/abs/2402.12659) 104 | - [No Language is an Island: Unifying Chinese and English in Financial Large Language Models, Instruction Data, and Benchmarks](https://arxiv.org/abs/2403.06249) 105 | - [Dólares or Dollars? Unraveling the Bilingual Prowess of Financial LLMs Between Spanish and English](https://arxiv.org/abs/2402.07405) 106 | 107 | **Evaluaciones** (más detalles en la sección FinBen; véase más abajo un ejemplo mínimo de carga de estos conjuntos de datos): 108 | 109 | - [flare (flare-es-financees)](https://huggingface.co/datasets/TheFinAI/flare-es-financees) 110 | - [flare (flare-es-tsa)](https://huggingface.co/datasets/TheFinAI/flare-es-tsa) 111 | - [flare (flare-es-fns)](https://huggingface.co/datasets/TheFinAI/flare-es-fns) 112 | - [flare (flare-es-efpa)](https://huggingface.co/datasets/TheFinAI/flare-es-efpa) 113 | - [flare (flare-es-efp)](https://huggingface.co/datasets/TheFinAI/flare-es-efp) 114 | - [flare (flare-es-multifin)](https://huggingface.co/datasets/TheFinAI/flare-es-multifin) 115 | 116 | ## Descripción general 117 | 118 | **FinBen_ES** es una iniciativa fundamental enfocada en el dominio financiero español. FinBen_ES busca reforzar el progreso, perfeccionamiento y evaluación de Modelos de Lenguaje a Gran Escala (MLGs) diseñados específicamente para contextos financieros españoles. Como un segmento vital del esfuerzo más amplio de PIXIU, FinBen_ES se erige como un testimonio del compromiso por aprovechar las capacidades de los MLGs, asegurando que los profesionales y entusiastas financieros del mundo hispanohablante tengan a su disposición herramientas lingüísticas de primera categoría.
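Como referencia, este es un esbozo mínimo (no forma parte del repositorio original) de cómo cargar uno de los conjuntos de evaluación anteriores con la biblioteca `datasets` de Hugging Face; los nombres de particiones y de campos pueden variar según el conjunto de datos, por lo que conviene verificarlos en la página de cada uno:

```python
# Esbozo ilustrativo: cargar un conjunto de evaluación de FinBen_ES.
# Requiere `pip install datasets`; los nombres de particiones y campos
# son suposiciones y pueden variar según el conjunto de datos concreto.
from datasets import load_dataset

ds = load_dataset("TheFinAI/flare-es-tsa")
print(ds)  # muestra las particiones disponibles (p. ej., test)
primera_particion = list(ds.keys())[0]
print(ds[primera_particion][0])  # inspecciona un registro de ejemplo
```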
119 | 120 | ### Características clave 121 | 122 | - **Recursos abiertos**: PIXIU proporciona abiertamente el LLM financiero, los datos de instrucción de ajuste fino y los conjuntos de datos incluidos en el conjunto de evaluación de referencia para fomentar la investigación abierta y la transparencia. 123 | - **Multitarea**: Los datos de instrucción y el conjunto de referencia en PIXIU cubren un diverso conjunto de tareas financieras, que incluyen cuatro tareas de NLP financiero y una tarea de predicción financiera. 124 | - **Multimodalidad**: Los datos de instrucción y el conjunto de referencia de PIXIU consisten en datos financieros multimodales, que incluyen datos de series de tiempo de la tarea de predicción de movimientos de acciones. Cubre varios tipos de textos financieros, que incluyen informes, artículos de noticias, tweets y presentaciones regulatorias. 125 | - **Diversidad**: A diferencia de conjuntos de referencia anteriores que se centran principalmente en tareas de NLP financiero, el conjunto de evaluación de referencia de PIXIU incluye tareas críticas de predicción financiera alineadas con escenarios del mundo real, lo que lo hace más desafiante. 126 | 127 | --- 128 | 129 | ## FinBen_ES: Conjunto de evaluación de comprensión y predicción del lenguaje financiero 130 | 131 | En esta sección, proporcionamos un análisis de rendimiento detallado de FinMA en comparación con otros modelos líderes, incluyendo ChatGPT, GPT-4, lince-zero et al. Para este análisis, hemos elegido una gama de tareas y métricas que abarcan varios aspectos del Procesamiento del Lenguaje Natural financiero y de la predicción financiera. 132 | 133 | ### Tareas 134 | 135 | | Datos | Tarea | Bruto | Tipos de Datos | Modalidades | Licencia | Artículo | 136 | | --------------------- | ------------------------------ | ------ | ----------------------------------- | ----------------- | --------------- | -------- | 137 | | MultiFin | clasificación de titulares | 230 | titulares de noticias | texto | CC BY 4.0 | [1] | 138 | | FNS | respuesta a preguntas | 50 | informes de ganancias | texto | Público | [2] | 139 | | TSA | análisis de sentimientos | 3,829 | titulares de noticias | texto | CC BY 4.0 | [3] | 140 | | Financees | análisis de sentimientos | 6,539 | titulares de noticias | texto | Público | [4] | 141 | | EFP | respuesta a preguntas | 37 | preguntas de evaluación empresarial | texto | Público | | 142 | | EFPA | respuesta a preguntas | 228 | preguntas de evaluación empresarial | texto | Público | | 143 | 144 | 1. Rasmus Jørgensen, Oliver Brandt, Mareike Hartmann, Xiang Dai, Christian Igel, and Desmond Elliott. 2023. MultiFin: A Dataset for Multilingual Financial NLP. In Findings of the Association for Computational Linguistics: EACL 2023, 894–909. Association for Computational Linguistics, Dubrovnik, Croatia. 145 | 2. [FNS 2023. FNP 2023.](http://wp.lancs.ac.uk/cfie/fns2023/). 146 | 3. Pan R, García-Díaz JA, Garcia-Sanchez F, and Valencia-García R. 2023. Evaluation of transformer models for financial targeted sentiment analysis in Spanish. In PeerJ Computer Science, 9:e1377. https://doi.org/10.7717/peerj-cs.1377. 147 | 4. CodaLab. 2023. 
[Competition](https://codalab.lisn.upsaclay.fr/competitions/10052) 148 | 149 | 150 | ### Evaluación 151 | 152 | #### Preparación 153 | ##### Instalación local 154 | ```bash 155 | git clone https://github.com/TheFinAI/PIXIU.git --recursive 156 | cd PIXIU 157 | pip install -r requirements.txt 158 | cd PIXIU/src/financial-evaluation 159 | pip install -e .[multilingual] 160 | ``` 161 | ##### Imagen de Docker 162 | ```bash 163 | sudo bash scripts/docker_run.sh 164 | ``` 165 | El comando anterior inicia un contenedor Docker; puede modificar `docker_run.sh` para adaptarlo a su entorno. Proporcionamos una imagen precompilada, que puede obtener ejecutando `sudo docker pull tothemoon/pixiu:latest`. 166 | 167 | ```bash 168 | docker run --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ 169 | --network host \ 170 | --env https_proxy=$https_proxy \ 171 | --env http_proxy=$http_proxy \ 172 | --env all_proxy=$all_proxy \ 173 | --env HF_HOME=$hf_home \ 174 | -it [--rm] \ 175 | --name pixiu \ 176 | -v $pixiu_path:$pixiu_path \ 177 | -v $hf_home:$hf_home \ 178 | -v $ssh_pub_key:/root/.ssh/authorized_keys \ 179 | -w $workdir \ 180 | $docker_user/pixiu:$tag \ 181 | [--sshd_port 2201 --cmd "echo 'Hello, world!' && /bin/bash"] 182 | ``` 183 | Explicación de los argumentos: 184 | - `[]` significa argumentos ignorables 185 | - `HF_HOME`: directorio de caché de huggingface 186 | - `sshd_port`: puerto sshd del contenedor; puede ejecutar `ssh -i private_key -p $sshd_port root@$ip` para conectarse al contenedor, el valor predeterminado es 22001 187 | - `--rm`: elimina el contenedor al salir del contenedor (es decir, `CTRL + D`) 188 | 189 | #### Evaluación automatizada de tareas 190 | Antes de la evaluación, descargue el [punto de control BART](https://drive.google.com/u/0/uc?id=1_7JfF7KOInb7ZrxKHIigTMR4ChVET01m&export=download) en `src/metrics/BARTScore/bart_score.pth`. 191 | 192 | Para la evaluación automatizada, siga estas instrucciones: 193 | 194 | 1. Transformador Huggingface 195 | 196 | Para evaluar un modelo alojado en HuggingFace Hub (por ejemplo, finma-7b-full), use este comando: 197 | 198 | ```bash 199 | python eval.py \ 200 | --model "hf-causal-llama" \ 201 | --model_args "use_accelerate=True,pretrained=chancefocus/finma-7b-full,tokenizer=chancefocus/finma-7b-full,use_fast=False" \ 202 | --tasks "flare_ner,flare_sm_acl,flare_fpb" 203 | ``` 204 | 205 | Puede encontrar más detalles en la documentación de [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness). 206 | 207 | 2. API comerciales 208 | 209 | 210 | Tenga en cuenta que para tareas como NER, la evaluación automatizada se basa en un patrón específico. Esto podría no extraer información relevante en entornos de cero disparos, dando como resultado un rendimiento relativamente más bajo en comparación con los resultados anteriores anotados manualmente. 211 | 212 | ```bash 213 | export OPENAI_API_SECRET_KEY=YOUR_KEY_HERE 214 | python eval.py \ 215 | --model gpt-4 \ 216 | --tasks flare_ner,flare_sm_acl,flare_fpb 217 | ``` 218 | 219 | --- 220 | 221 | 222 | ## Licencia 223 | 224 | PIXIU tiene licencia [MIT]. Para más detalles, consulte el archivo [LICENSE](LICENSE).
225 | 226 | ## Historial de estrellas 227 | 228 | [![Star History Chart](https://api.star-history.com/svg?repos=The-FinAI/PIXIU&type=Date)](https://star-history.com/#The-FinAI/PIXIU&Date) 229 | 230 | -------------------------------------------------------------------------------- /README.ijcai_challenge.md: -------------------------------------------------------------------------------- 1 | # IJCAI2024-challenge starter-kit 2 | 3 | We're pleased to invite you to attend the IJCAI2024-challenge, ["Financial Challenges in Large Language Models - FinLLM"](https://sites.google.com/nlg.csie.ntu.edu.tw/finnlp-agentscen/shared-task-finllm). 4 | 5 | ## Outline 6 | - [Task 1 Financial Classification Starter Kit](#task-1-financial-classification-starter-kit) 7 | - [Task 2 Financial Text Summarization Starter Kit](#task-2-financial-text-summarization-starter-kit) 8 | - [Task 3 Single Stock Trading Starter Kit](#task-3-single-stock-trading-starter-kit) 9 | - [Fine-tune](#fine-tune) 10 | - [Model Cheating Detection](#model-cheating-detection) 11 | 12 | ## Task 1 Financial Classification Starter Kit 13 | ### Introduction 14 | This task focuses on argument unit classification to test the capabilities of LLMs to identify and categorize texts as premises or claims. Participants receive a financial text and two options; following the designed prompt template, they classify the text as a claim or premise. 15 | 16 | We provide 7.75k training examples and 969 test examples for categorizing sentences as claims or premises. 17 | 18 | We use the following prompt template to ask and answer the question in this task. 19 | 20 | Instruction: [task prompt] Text: [input text] Response: [output] 21 | 22 | [input text] denotes the financial text in the prompt; [output] is the classified label (i.e., "Claim" or "Premise"). 23 | 24 | ### Performance Metrics 25 | We use two metrics, F1 and Accuracy, to evaluate classification capability. 26 | We use the F1 score as the final ranking metric. 27 | 28 | ### Evaluation 29 | You can follow the instructions in the [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1ogcCmhMc5lPhUamCk6512H3PJwPEaBZN?usp=sharing) to do evaluations on Task 1: financial classification. 30 | 31 | ### Dataset Example 32 | | id | query | answer | text | choices | gold | 33 | | -- | ----- | ------ | ---- | ------- | ---- | 34 | | finargeccauc0 | Analyze sentences from earnings conference calls and identify their argumentative function. Each sentence is either a premise, offering evidence or reasoning, or a claim, asserting a conclusion or viewpoint. Return only premise or claim. Text: I mean, sometimes it's not that you came up with some brilliant strategy, it's just like really good work consistently over a long period of time. Answer: | premise | I mean, sometimes it's not that you came up with some brilliant strategy, it's just like really good work consistently over a long period of time. | [ "premise", "claim" ] | 0 | 35 | | finargeccauc1 | Analyze sentences from earnings conference calls and identify their argumentative function. Each sentence is either a premise, offering evidence or reasoning, or a claim, asserting a conclusion or viewpoint. Return only premise or claim. Text: Even while in International, we're continuing to invest in a lot of areas, we continue to frontload Prime benefits for the newer geographies, we continue to launch new countries as we launch Prime in Australia recently.
Answer: | claim | Even while in International, we're continuing to invest in a lot of areas, we continue to frontload Prime benefits for the newer geographies, we continue to launch new countries as we launch Prime in Australia recently. | [ "premise", "claim" ] | 1 | 36 | 37 | 38 | 39 | ## Task 2 Financial Text Summarization Starter Kit 40 | ### Introduction 41 | This task is designed to test the capabilities of LLMs to generate coherent summaries. Participants must produce a concise summary of a given financial news text, following the designed prompt template. 42 | 43 | We provide 8k training examples and 2k test examples for abstracting financial news articles into concise summaries. 44 | 45 | We use the following prompt template to ask and answer the question in this task. 46 | 47 | Instruction: [task prompt] Context: [input context] Response: [output] 48 | 49 | [input context] denotes the multi-sentence text of a financial news article; [output] is the abstractive summary of this text. 50 | 51 | ### Performance Metrics 52 | We use ROUGE (1, 2, and L) and BERTScore to evaluate generated summaries in terms of relevance. 53 | We use the ROUGE-1 score as the final ranking metric. 54 | 55 | ### Evaluation 56 | You can follow the instructions in the [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1ogcCmhMc5lPhUamCk6512H3PJwPEaBZN?usp=sharing) to do evaluations on Task 2: financial text summarization. 57 | 58 | ### Dataset Example 59 | | id | query | answer | text | 60 | | -- | ----- | ------ | ---- | 61 | | edtsum0 | You are given a text that consists of multiple sentences. Your task is to perform abstractive summarization on this text. Use your understanding of the content to express the main ideas and crucial details in a shorter, coherent, and natural sounding text. Text: MONROE, Conn., Dec. 16, 2020 /PRNewswire/ --Elidah, maker of ELITONE, a home-use treatment for incontinence, announced it was selected out of 7500 entries from 159 countries to win a Top Ten award from the global SLINGSHOT 2020 start-up competition. Elidah was the only company from the United States awarded this distinction, and one of two start-ups in the life science category.Normally held in Singapore,this year the event was virtual and offered a record $750,000 in cash prizes by StartUpSG. One hundred companies pitched at the live event, and from those ten finalists were selected. The award winners included start-ups from all over the world including Israel, United Kingdom, Singapore, and India, among others. Continue Reading Gloria Kolb holding ELITONE Elidah "We sometimes have the mindset that successful start-ups must come from Silicon Valley," said Gloria Kolb, co-founder and CEO of Elidah, "but innovation is flourishing in the rest of the world as entrepreneurial support systems expand. I was impressed by the other finalists, advancing technologies such as biometric security, artificial intelligence, and gene editing." Although the top prize went to another start-up, Ms. Kolb, as the only female entrepreneur in the finals, was happy to see a company focused on women's health receive recognition. "Women's health should not be a taboo subject, and I hope that investors realize it presents a large market, ripe for innovation." ELITONE is the first home-health device that performs the hard-to-do pelvic floor exercises for women.
It operates externally, without the invasiveness of vaginal probes. Exercises are needed to tone the pelvic floor muscles, but they can be hard to do correctly. The wearable nature of ELITONE allows women to do other activities while getting treatment at home, saving time, cost, and risk of infection. In a time when the clinics and pelvic floor physical therapy offices have shut down, at-home over-the-counter devices like ELITONE provide much-needed access to effective FDA-cleared treatments. About ElidahElidah is a women-owned FemTech company established to develop technologies that integrate recent advances in wearable devices, biomaterials, and mobile interfaces to deliver innovative therapeutic solutions.Elidah is led by entrepreneur Gloria Kolb, Founder and CEO, an MIT and Stanford trained engineer whose previous accolades include Boston's 40 under 40 and MIT Technology Review's World Top Innovators Under 35 (TR35). To learn more visit elitone.com.Contact: Gloria Kolb[emailprotected] 810 Main St., Ste C, Monroe, CT 06468978-435-4324SOURCE Elidah Answer: | Elidah Becomes the Only US Company to Win a Top Ten Prize in a Global Start-up Competition, Advancing Recognition for Women's Health | MONROE, Conn., Dec. 16, 2020 /PRNewswire/ --Elidah, maker of ELITONE, a home-use treatment for incontinence, announced it was selected out of 7500 entries from 159 countries to win a Top Ten award from the global SLINGSHOT 2020 start-up competition. Elidah was the only company from the United States awarded this distinction, and one of two start-ups in the life science category.Normally held in Singapore,this year the event was virtual and offered a record $750,000 in cash prizes by StartUpSG. One hundred companies pitched at the live event, and from those ten finalists were selected. The award winners included start-ups from all over the world including Israel, United Kingdom, Singapore, and India, among others. Continue Reading Gloria Kolb holding ELITONE Elidah ""We sometimes have the mindset that successful start-ups must come from Silicon Valley,"" said Gloria Kolb, co-founder and CEO of Elidah, ""but innovation is flourishing in the rest of the world as entrepreneurial support systems expand. I was impressed by the other finalists, advancing technologies such as biometric security, artificial intelligence, and gene editing."" Although the top prize went to another start-up, Ms. Kolb, as the only female entrepreneur in the finals, was happy to see a company focused on women's health receive recognition. ""Women's health should not be a taboo subject, and I hope that investors realize it presents a large market, ripe for innovation."" ELITONE is the first home-health device that performs the hard-to-do pelvic floor exercises for women. It operates externally, without the invasiveness of vaginal probes. Exercises are needed to tone the pelvic floor muscles, but they can be hard to do correctly. The wearable nature of ELITONE allows women to do other activities while getting treatment at home, saving time, cost, and risk of infection. In a time when the clinics and pelvic floor physical therapy offices have shut down, at-home over-the-counter devices like ELITONE provide much-needed access to effective FDA-cleared treatments. 
About ElidahElidah is a women-owned FemTech company established to develop technologies that integrate recent advances in wearable devices, biomaterials, and mobile interfaces to deliver innovative therapeutic solutions.Elidah is led by entrepreneur Gloria Kolb, Founder and CEO, an MIT and Stanford trained engineer whose previous accolades include Boston's 40 under 40 and MIT Technology Review's World Top Innovators Under 35 (TR35). To learn more visit elitone.com.Contact: Gloria Kolb[emailprotected] 810 Main St., Ste C, Monroe, CT 06468978-435-4324SOURCE Elidah | 62 | 63 | 64 | 65 | ## Task 3 Single Stock Trading Starter Kit 66 | ### Introduction 67 | This task aims to evaluate LLMs’ ability to make sophisticated decisions in trading activities, an area currently constrained by humans' limited ability to process large volumes of data rapidly. Participants receive a combination of open-source data for stocks and an ETF. The system should output one of the three trading decisions (“buy”, “sell”, “hold”) with reasoning. 68 | 69 | We provide 291 examples to evaluate LLMs on sophisticated stock-trading decisions. 70 | 71 | We use the following prompt template to ask and answer the question in this task. 72 | 73 | Instruction: [task prompt] Context: [input context] Response: [output] 74 | 75 | [input context] denotes the financial investment information in the prompt; [output] should strictly conform to the following JSON format without any additional content: {"investment_decision": string, "summary_reason": string, "short_memory_index": number, "middle_memory_index": number, "long_memory_index": number, "reflection_memory_index": number} 76 | 77 | ### Performance Metrics 78 | We offer a comprehensive assessment of profitability, risk management, and decision-making prowess through a series of metrics: Sharpe Ratio (SR), Cumulative Return (CR), Daily Volatility (DV), Annualized Volatility (AV), and Maximum Drawdown (MD). 79 | 80 | We use the Sharpe Ratio (SR) as the final ranking metric. 81 | 82 | The formulas are as follows: 83 | ![image](static/sr.jpg) 84 | ![image](static/cr.jpg) 85 | ![image](static/av.jpg) 86 | ![image](static/md.jpg) 87 | 88 | 89 | ### Evaluation 90 | You can follow the [instructions](https://github.com/The-FinAI/PIXIU?tab=readme-ov-file#finmem-a-performance-enhanced-llm-trading-agent) to do evaluations on Task 3: single stock trading. 91 | 92 | ### Dataset Example 93 | | id | date | price | filing_k | filing_q | news | 94 | | -- | ---- | ----- | -------- | -------- | ---- | 95 | | jnj_test0 | "2020-10-09" | { "DRIV": 17.52210235595703 } | { "FORM": "null" } | { "FORM": "null" } | { ""DRIV"": [ ""The global cloud enterprise content management market is expected to reach \\$62.4 billion by 2027, driven by a CAGR of 25.6% and significant growth in the U.S. and China. The positive score for this news is 2.3659735504111268e-08. The neutral score for this news is 0.9999990463256836. The negative score for this news is 9.636863751438796e-07."", ""The global emergency lighting batteries market is expected to reach \\$2.8 billion by 2027, growing at a CAGR of 10.8% despite the COVID-19 pandemic's impact. The positive score for this news is 1.1662441465887241e-05. The neutral score for this news is 0.9995514750480652. The negative score for this news is 0.000436866597738117."", ""Despite the impact of the COVID-19 pandemic, the global market for two-wheeler spark plugs is expected to reach 86.2 million units by 2027, growing at a CAGR of 4.9%.
The positive score for this news is 1.1285221262369305e-05. The neutral score for this news is 0.9988551139831543. The negative score for this news is 0.0011336031602695584."", ""Despite pandemic setbacks, the global market for two-wheeler upside-down forks is expected to reach 701.8 thousand units by 2027, driven by growth in China and the U.S. The positive score for this news is 9.909140175068387e-08. The neutral score for this news is 0.9999970197677612. The negative score for this news is 2.81238385468896e-06."", ""The global embedded analytics market is expected to reach \\$84.6 billion by 2027, driven by a 13% CAGR, with cloud-based solutions leading the growth. The positive score for this news is 6.070506231026229e-08. The neutral score for this news is 0.9999868869781494. The negative score for this news is 1.2994331882509869e-05."", ""Despite the COVID-19 pandemic, the global battery monitoring system market is expected to reach \\$9.8 billion by 2027, with significant growth in the US and China. The positive score for this news is 4.437213263486228e-08. The neutral score for this news is 0.9999984502792358. The negative score for this news is 1.6080473415058805e-06."", ""Despite the impact of the COVID-19 pandemic, the global microwave transmission equipment market is expected to reach \\$6.7 billion by 2027, with a CAGR of 3.2%. The positive score for this news is 0.00034257289371453226. The neutral score for this news is 0.004475872032344341. The negative score for this news is 0.9951815009117126."", ""Despite the impact of the COVID-19 pandemic, the global transfer membrane market is expected to reach \\$200.3 million by 2027, with the PVDF segment leading the growth. The positive score for this news is 1.5521750640346e-07. The neutral score for this news is 0.9999940395355225. The negative score for this news is 5.781918389402563e-06."", ""Despite the impact of COVID-19, the global thermal analysis market is expected to reach \\$739.1 million by 2027, with a CAGR of 4.4%. The positive score for this news is 0.00015923684986773878. The neutral score for this news is 0.0002189901570091024. The negative score for this news is 0.9996217489242554."" ] } | 96 | 97 | 98 | 99 | ## Fine-tune 100 | We recommend using [AutoTrain-Advanced](https://github.com/huggingface/autotrain-advanced.git) to train your models. 101 | 102 | ## Model Cheating Detection 103 | To measure the risk that the test set was leaked into a model's training data (model cheating), we have developed a new metric called the Data Leakage Test (DLT), building on existing research. 104 | 105 | The DLT calculates the difference in perplexity of a large language model (LLM) on the training and test data to determine its data generation tendencies. Specifically, we separately input the training set and the test set into the LLM, and calculate the perplexity on the training set (ppl-on-train) and the perplexity on the test set (ppl-on-test). The DLT value is then computed by subtracting the ppl-on-train from the ppl-on-test. A larger difference implies that the LLM is less likely to have seen the test set during training compared to the training set and suggests a lower likelihood of the model cheating. Conversely, a smaller difference implies that the LLM is more likely to have seen the test set during training and suggests a higher likelihood of the model cheating.
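As a rough illustration (not the challenge's official implementation), the sketch below estimates ppl-on-train and ppl-on-test with a Hugging Face causal LM and takes their difference; the model name and the two text lists are placeholders you would replace with the model under inspection and the task's actual splits:

```python
# Illustrative DLT sketch only; the model name and text lists below are
# placeholders, not part of this repository's official tooling.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # stand-in for the LLM being checked
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

train_texts = ["Example sentence drawn from the training split."]
test_texts = ["Example sentence drawn from the test split."]

def mean_perplexity(texts):
    """Average per-text perplexity from the causal-LM cross-entropy loss."""
    ppls = []
    for text in texts:
        enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
        with torch.no_grad():
            # Passing labels=input_ids makes the model return the mean loss
            loss = model(**enc, labels=enc["input_ids"]).loss
        ppls.append(torch.exp(loss).item())
    return sum(ppls) / len(ppls)

ppl_on_train = mean_perplexity(train_texts)
ppl_on_test = mean_perplexity(test_texts)
dlt = ppl_on_test - ppl_on_train  # smaller values suggest higher cheating risk
print(f"ppl-on-train={ppl_on_train:.2f}, ppl-on-test={ppl_on_test:.2f}, DLT={dlt:.2f}")
```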
106 | 107 | In the detection process, we will calculate the DLT values for several LLMs to establish a reference baseline for model cheating and to minimize the impact of generalization on the metric. The formula is as follows: 108 | ![image](static/formula.jpg) 109 | 110 | -------------------------------------------------------------------------------- /README.zh.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 |
5 |
6 | Qianqian Xie1  7 | Weiguang Han2  8 | Zhengyu Chen2  9 | Ruoyu Xiang1  10 | Xiao Zhang1  11 | Yueru He1  12 | Mengxi Xiao2  13 | Dong Li2  14 | Yongfu Dai7  15 | Duanyu Feng7  16 | Yijing Xu1  17 | Haoqiang Kang5  18 | Ziyan Kuang12  19 | Chenhan Yuan3  20 | Kailai Yang3  21 | Zheheng Luo3  22 | Tianlin Zhang3  23 | Zhiwei Liu3  24 | Guojun Xiong10  25 | Zhiyang Deng9  26 | Yuechen Jiang9  27 | Zhiyuan Yao9  28 | Haohang Li9  29 | Yangyang Yu9  30 | Gang Hu8  31 | Jiajia Huang11  32 | Xiao-Yang Liu5  33 | Alejandro Lopez-Lira4  34 | Benyou Wang6  35 | Yanzhao Lai13  36 | Hao Wang7  37 | Min Peng2*  38 | Sophia Ananiadou3  39 | Jimin Huang1 40 |
41 |
42 | 43 |
44 | 1The Fin AI  45 | 2Wuhan University  46 | 3The University of Manchester  47 | 4University of Florida  48 | 5Columbia University  49 | 6The Chinese University of Hong Kong, Shenzhen  50 | 7Sichuan University  51 | 8Yunnan University  52 | 9Stevens Institute of Technology  53 | 10Stony Brook University  54 | 11Nanjing Audit University  55 | 12Jiangxi Normal University  56 | 13Southwest Jiaotong University 57 |
58 |
59 | 60 |
61 | Wuhan University Logo  62 | Manchester University Logo  63 | University of Florida Logo  64 | Columbia University Logo  65 | HK University (shenzhen) Logo  66 | Sichuan University  67 | Yunnan University  68 | Stevens Institute of Technology  69 | Stony Brook University  70 | Nanjing Audit University  71 | Jiangxi Normal University  72 | Southwest Jiaotong University Logo  73 |
74 | 75 | ----------------- 76 | 77 | ![](https://img.shields.io/badge/pixiu-v0.1-gold) 78 | ![](https://black.readthedocs.io/en/stable/_static/license.svg) 79 | [![Discord](https://img.shields.io/discord/1146837080798933112)](https://discord.gg/HRWpUmKB) 80 | 81 | [Pixiu Paper](https://arxiv.org/abs/2306.05443) | [FinBen Leaderboard](https://huggingface.co/spaces/finosfoundation/Open-Financial-LLM-Leaderboard) 82 | 83 | **免责声明** 84 | 85 | 本资料库及其内容仅用于**学术和教育目的**。所有资料均不构成金融、法律或投资建议。不对内容的准确性、完整性或实用性提供任何明示或暗示的保证。作者和撰稿人不对任何错误、遗漏或因使用本网站信息而产生的任何后果负责。用户在做出任何财务、法律或投资决定之前,应自行判断并咨询专业人士。使用本资料库所含软件和信息的风险完全由用户自行承担。 86 | 87 | **使用或访问本资源库中的信息,即表示您同意对作者、撰稿人以及任何附属组织或个人的任何及所有索赔或损害进行赔偿、为其辩护并使其免受损害。** 88 | 89 | 90 | 91 | 92 | **检查点:** 93 | 94 | - [FinMA v0.1 (Full 7B version)](https://huggingface.co/TheFinAI/finma-7b-full) 95 | 96 | **语言** 97 | 98 | - [英文](README.md) 99 | - [中文](README.zh.md) 100 | - [西班牙语](README.es.md) 101 | 102 | **论文** 103 | 104 | - [PIXIU: A Comprehensive Benchmark, Instruction Dataset and Large Language Model for Finance](https://arxiv.org/abs/2306.05443) 105 | - [The FinBen: An Holistic Financial Benchmark for Large Language Models](https://arxiv.org/abs/2402.12659) 106 | - [No Language is an Island: Unifying Chinese and English in Financial Large Language Models, Instruction Data, and Benchmarks](https://arxiv.org/abs/2403.06249) 107 | - [Dólares or Dollars? Unraveling the Bilingual Prowess of Financial LLMs Between Spanish and English](https://arxiv.org/abs/2402.07405) 108 | 109 | **评估** (更多详情,请参阅FinBen部分): 110 | 111 | - [flare (flare-zh-afqmc)](https://huggingface.co/datasets/TheFinAI/flare-zh-afqmc) 112 | 113 | - [flare (flare-zh-stocka)](https://huggingface.co/datasets/TheFinAI/flare-zh-stocka) 114 | 115 | - [flare (flare-zh-corpus)](https://huggingface.co/datasets/TheFinAI/flare-zh-corpus) 116 | 117 | - [flare (flare-zh-fineval)](https://huggingface.co/datasets/TheFinAI/flare-zh-fineval) 118 | 119 | - [flare (flare-zh-fe)](https://huggingface.co/datasets/TheFinAI/flare-zh-fe) 120 | 121 | - [flare (flare-zh-nl)](https://huggingface.co/datasets/TheFinAI/flare-zh-nl) 122 | 123 | - [flare (flare-zh-nl2)](https://huggingface.co/datasets/TheFinAI/flare-zh-nl2) 124 | 125 | - [flare (flare-zh-nsp)](https://huggingface.co/datasets/TheFinAI/flare-zh-nsp) 126 | 127 | - [flare (flare-zh-re)](https://huggingface.co/datasets/TheFinAI/flare-zh-re) 128 | 129 | - [flare (flare-zh-stockb)](https://huggingface.co/datasets/TheFinAI/flare-zh-stockb) 130 | 131 | - [flare (flare-zh-qa)](https://huggingface.co/datasets/TheFinAI/flare-zh-qa) 132 | 133 | - [flare (flare-zh-na)](https://huggingface.co/datasets/TheFinAI/flare-zh-na) 134 | 135 | - [flare (flare-zh-19ccks)](https://huggingface.co/datasets/TheFinAI/flare-zh-19ccks) 136 | 137 | - [flare (flare-zh-20ccks)](https://huggingface.co/datasets/TheFinAI/flare-zh-20ccks) 138 | 139 | - [flare (flare-zh-21ccks)](https://huggingface.co/datasets/TheFinAI/flare-zh-21ccks) 140 | 141 | - [flare (flare-zh-22ccks)](https://huggingface.co/datasets/TheFinAI/flare-zh-22ccks) 142 | 143 | - [flare (flare-zh-ner)](https://huggingface.co/datasets/TheFinAI/flare-zh-ner) 144 | 145 | - [flare (flare-zh-fpb)](https://huggingface.co/datasets/TheFinAI/flare-zh-fpb) 146 | 147 | 148 | 149 | ## 概述 150 | 151 | **FinBen_ZH** 是一项专注于中文金融领域的基石计划,旨在促进专为中文金融环境定制的大型语言模型(LLMs)的进展、完善和评估。FinBen_ZH 是 PIXIU 更大范围工作的一个重要部分,证明了我们在利用 LLMs 能力方面的承诺,确保中文世界的金融专业人士和爱好者拥有顶级的语言工具。 152 | 153 | ### 主要特征 154 | 155 | - **公开资源**: PIXIU 公开提供财务 LLM、教学调整数据和评估基准中的数据集,以鼓励公开研究和透明度。 156 | - **多任务**: PIXIU 
中的指令调整数据和基准涵盖了一系列不同的金融任务。 157 | - **多模态**: PIXIU 的指令调整数据和基准由多模态金融数据组成,包括股票走势预测任务的时间序列数据。它涵盖各种类型的金融文本,包括报告、新闻报道、推特和监管文件。 158 | - **多样性**: 与以往主要侧重于金融 NLP 任务的基准不同,PIXIU 的评估基准包括与真实世界场景相一致的关键金融预测任务,因此更具挑战性。 159 | 160 | --- 161 | 162 | ## FinBen_ZH: 金融语言理解和预测评估基准 163 | 164 | 在本节中,我们将提供 FinMA 与其他领先模型(包括 ChatGPT、GPT-4、lince-zero 等)相比的详细性能分析。为了进行分析,我们选择了一系列任务和指标,涵盖了金融自然语言处理和金融预测的各个方面。 165 | 166 | ### 任务 167 | 168 | | 数据 | 任务类型 | 原始数据 | 数据类型 | 模式 | 许可证 | 论文 | 169 | | ------------------ | --------------------------- | ------- | -------------------------------- | ----------------- | ----------------- | ----- | 170 | | AFQMC | 语义匹配 | 38,650 | 提问数据, 对话 | 文本 | Apache-2.0 | [1] | 171 | | corpus | 语义匹配 | 120,000 | 提问数据, 对话 | 文本 | Public | [2] | 172 | | stockA | 股票分类 | 14,769 | 新闻, 历史价格 | 文本, 时间序列 | Public | [3] | 173 | | Fineval | 多项选择 | 1,115 | 金融考试 | 文本 | Apache-2.0 | [4] | 174 | | NL | 新闻分类 | 7,955 | 新闻报道 | 文本 | Public | [5] | 175 | | NL2 | 新闻分类 | 7,955 | 新闻报道 | 文本 | Public | [5] | 176 | | NSP | 负面新闻判断 | 4,499 | 新闻、社交媒体文本 | 文本 | Public | [5] | 177 | | RE | 关系识别 | 14,973 | 新闻、实体对 | 文本 | Public | [5] | 178 | | FE | 情感分析 | 18,177 | 金融社交媒体文本 | 文本 | Public | [5] | 179 | | stockB | 情感分析 | 9,812 | 金融社交媒体文本 | 文本 | Apache-2.0 | [6] | 180 | | QA | 金融问答 | 22,375 | 财经新闻公告 | 文本, 表格 | Public | [5] | 181 | | NA | 文本摘要 | 32,400 | 新闻文章、公告 | 文本 | Public | [5] | 182 | | 19CCKS | 事件主体提取 | 156,834 | 新闻报道 | 文本 | CC BY-SA 4.0 | [7] | 183 | | 20CCKS | 事件主体提取 | 372,810 | 新闻报道 | 文本 | CC BY-SA 4.0 | [8] | 184 | | 21CCKS | 事件因果关系抽取 | 8,000 | 新闻报道 | 文本 | CC BY-SA 4.0 | [9] | 185 | | 22CCKS | 事件主体提取 | 109,555 | 新闻报道 | 文本 | CC BY-SA 4.0 | [10] | 186 | | NER | 命名实体识别 | 1,685 | 新闻报道 | 文本 | Public | [11] | 187 | | FPB | 情感分析 | 4,845 | 新闻 | 文本 | MIT license | [12] | 188 | | FIQASA | 情感分析 | 1,173 | 新闻头条、推文 | 文本 | MIT license | [12] | 189 | | Headlines | 新闻标题分类 | 11,412 | 新闻头条 | 文本 | MIT license | [12] | 190 | | BigData | 股票走势预测 | 7,164 | 推文、历史价格 | 文本, 时间序列 | MIT license | [12] | 191 | | ACL | 股票走势预测 | 27,053 | 推文、历史价格 | 文本, 时间序列 | MIT license | [12] | 192 | | CIKM | 股票走势预测 | 4,967 | 推文、历史价格 | 文本, 时间序列 | MIT license | [12] | 193 | | FinQA | 金融问答 | 14,900 | 收益报告 | 文本, 表格 | MIT license | [12] | 194 | | ConvFinQA | 多轮问答 | 48,364 | 收益报告 | 文本, 表格 | MIT license | [12] | 195 | 196 | 197 | 1. Xu L, Hu H, Zhang X, et al. CLUE: A Chinese language understanding evaluation benchmark[J]. arXiv preprint arXiv:2004.05986, 2020. 198 | 2. Jing Chen, Qingcai Chen, Xin Liu, Haijun Yang, Daohe Lu, and Buzhou Tang. 2018. The BQ Corpus: A Large-scale Domain-specific Chinese Corpus For Sentence Semantic Equivalence Identification. In Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pages 4946–4951, Brussels, Belgium. Association for Computational Linguistics. 199 | 3. Jinan Zou, Haiyao Cao, Lingqiao Liu, Yuhao Lin, Ehsan Abbasnejad, and Javen Qinfeng Shi. 2022. Astock: A New Dataset and Automated Stock Trading based on Stock-specific News Analyzing Model. In Proceedings of the Fourth Workshop on Financial Technology and Natural Language Processing (FinNLP), pages 178–186, Abu Dhabi, United Arab Emirates (Hybrid). Association for Computational Linguistics. 200 | 4. Zhang L, Cai W, Liu Z, et al. FinEval: A Chinese Financial Domain Knowledge Evaluation Benchmark for Large Language Models[J]. arXiv preprint arXiv:2308.09975, 2023. 201 | 5. Lu D, Liang J, Xu Y, et al. BBT-Fin: Comprehensive Construction of Chinese Financial Domain Pre-trained Language Model, Corpus and Benchmark[J]. arXiv preprint arXiv:2302.09432, 2023. 202 | 6.
https://huggingface.co/datasets/kuroneko5943/stock11 203 | 7. https://www.biendata.xyz/competition/ccks_2019_4/ 204 | 8. https://www.biendata.xyz/competition/ccks_2020_4_1/ 205 | 9. https://www.biendata.xyz/competition/ccks_2021_task6_2/ 206 | 10. https://www.biendata.xyz/competition/ccks2022_eventext/ 207 | 11. Jia C, Shi Y, Yang Q, et al. Entity enhanced BERT pre-training for Chinese NER[C]//Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP). 2020: 6384-6396. 208 | 12. Xie Q, Han W, Zhang X, et al. PIXIU: A Large Language Model, Instruction Data and Evaluation Benchmark for Finance[J]. arXiv preprint arXiv:2306.05443, 2023. 209 | 210 | ### 评估 211 | 212 | #### 准备工作 213 | ##### 本地安装 214 | ```bash 215 | git clone https://github.com/TheFinAI/PIXIU.git --recursive 216 | cd PIXIU 217 | pip install -r requirements.txt 218 | cd PIXIU/src/financial-evaluation 219 | pip install -e .[multilingual] 220 | ``` 221 | ##### Docker 镜像 222 | ```bash 223 | sudo bash scripts/docker_run.sh 224 | ``` 225 | 以上命令会启动一个 docker 容器,你可以根据自己的环境修改 `docker_run.sh`。我们提供了预编译镜像,可运行 `sudo docker pull tothemoon/pixiu:latest` 获取。 226 | 227 | ```bash 228 | docker run --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ 229 | --network host \ 230 | --env https_proxy=$https_proxy \ 231 | --env http_proxy=$http_proxy \ 232 | --env all_proxy=$all_proxy \ 233 | --env HF_HOME=$hf_home \ 234 | -it [--rm] \ 235 | --name pixiu \ 236 | -v $pixiu_path:$pixiu_path \ 237 | -v $hf_home:$hf_home \ 238 | -v $ssh_pub_key:/root/.ssh/authorized_keys \ 239 | -w $workdir \ 240 | $docker_user/pixiu:$tag \ 241 | [--sshd_port 2201 --cmd "echo 'Hello, world!' && /bin/bash"] 242 | ``` 243 | 参数说明: 244 | - `[]` 表示可忽略的参数 245 | - `HF_HOME`: huggingface 缓存目录 246 | - `sshd_port`: 容器的 sshd 端口,可以运行 `ssh -i private_key -p $sshd_port root@$ip` 来连接容器,默认为 22001 247 | - `--rm`: 退出容器时移除容器(即 `CTRL + D`) 248 | 249 | #### 自动化任务评估 250 | 在评估前,请下载 [BART 检查点](https://drive.google.com/u/0/uc?id=1_7JfF7KOInb7ZrxKHIigTMR4ChVET01m&export=download) 到 `src/metrics/BARTScore/bart_score.pth`。 251 | 252 | 如需进行自动评估,请按照以下说明操作: 253 | 254 | 1. Huggingface Transformer 255 | 256 | 要评估 HuggingFace Hub 上托管的模型(例如,finma-7b-full),请使用此命令: 257 | 258 | ```bash 259 | python eval.py \ 260 | --model "hf-causal-llama" \ 261 | --model_args "use_accelerate=True,pretrained=TheFinAI/finma-7b-full,tokenizer=TheFinAI/finma-7b-full,use_fast=False" \ 262 | --tasks "flare_ner,flare_sm_acl,flare_fpb" 263 | ``` 264 | 265 | 更多详情,请参阅 [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness) 文档。 266 | 267 | 2.
商用接口 268 | 269 | 270 | 请注意,对于 NER 等任务,自动评估是基于特定模式进行的。这可能无法提取零样本设置中的相关信息,导致性能相对低于之前的人工标注结果。 271 | 272 | ```bash 273 | export OPENAI_API_SECRET_KEY=YOUR_KEY_HERE 274 | python eval.py \ 275 | --model gpt-4 \ 276 | --tasks flare_ner,flare_sm_acl,flare_fpb 277 | ``` 278 | 279 | --- 280 | 281 | ## 引用 282 | 283 | 如果您在项目中使用了 PIXIU,请引用我们的文章。 284 | 285 | ``` 286 | @misc{xie2023pixiu, 287 | title={PIXIU: A Large Language Model, Instruction Data and Evaluation Benchmark for Finance}, 288 | author={Qianqian Xie and Weiguang Han and Xiao Zhang and Yanzhao Lai and Min Peng and Alejandro Lopez-Lira and Jimin Huang}, 289 | year={2023}, 290 | eprint={2306.05443}, 291 | archivePrefix={arXiv}, 292 | primaryClass={cs.CL} 293 | } 294 | 295 | @misc{xie2024FinBen, 296 | title={The FinBen: An Holistic Financial Benchmark for Large Language Models}, 297 | author={Qianqian Xie and Weiguang Han and Zhengyu Chen and Ruoyu Xiang and Xiao Zhang and Yueru He and Mengxi Xiao and Dong Li and Yongfu Dai and Duanyu Feng and Yijing Xu and Haoqiang Kang and Ziyan Kuang and Chenhan Yuan and Kailai Yang and Zheheng Luo and Tianlin Zhang and Zhiwei Liu and Guojun Xiong and Zhiyang Deng and Yuechen Jiang and Zhiyuan Yao and Haohang Li and Yangyang Yu and Gang Hu and Jiajia Huang and Xiao-Yang Liu and Alejandro Lopez-Lira and Benyou Wang and Yanzhao Lai and Hao Wang and Min Peng and Sophia Ananiadou and Jimin Huang}, 298 | year={2024}, 299 | eprint={2402.12659}, 300 | archivePrefix={arXiv}, 301 | primaryClass={cs.CL} 302 | } 303 | ``` 304 | 305 | 306 | 307 | ## 许可证 308 | 309 | PIXIU 采用 [MIT] 许可。有关详细信息,请参阅 [LICENSE](LICENSE) 文件。 310 | 311 | ## 星标历史 312 | 313 | ![Star History Chart](https://api.star-history.com/svg?repos=The-FinAI/PIXIU&type=Date) 314 | 315 | -------------------------------------------------------------------------------- /docker/DOCKERFILE: -------------------------------------------------------------------------------- 1 | FROM tothemoon/llm 2 | 3 | RUN python3 -m pip install -U --no-cache-dir sqlitedict 4 | RUN python3 -m pip install -U --no-cache-dir omegaconf 5 | RUN python3 -m pip install -U --no-cache-dir pycountry 6 | RUN python3 -m pip install -U --no-cache-dir seqeval 7 | RUN python3 -m pip install -U --no-cache-dir ipywidgets 8 | RUN python3 -m pip install -U --no-cache-dir pytablewriter 9 | RUN python3 -m pip install -U --no-cache-dir git+https://github.com/Tiiiger/bert_score -------------------------------------------------------------------------------- /docker/build_and_upload.sh: -------------------------------------------------------------------------------- 1 | export https_proxy=... 2 | export http_proxy=... 3 | export all_proxy=... 4 | docker_user=... 5 | tag=$(date +%Y%m%d) 6 | 7 | docker build --network host --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy --build-arg all_proxy=$all_proxy -t pixiu -f DOCKERFILE .
8 | docker tag pixiu $docker_user/pixiu:$tag 9 | docker push $docker_user/pixiu:$tag 10 | docker tag pixiu $docker_user/pixiu:latest 11 | docker push $docker_user/pixiu:latest 12 | -------------------------------------------------------------------------------- /notebooks/evaluate.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import sys\n", 10 | "sys.path.append('/data/hanweiguang/Projects/PIXIU')" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 25, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from src.utils import MultiClient\n", 20 | "from sklearn.metrics import confusion_matrix, matthews_corrcoef, f1_score, accuracy_score\n", 21 | "import json" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 3, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "GENERATION_CONFIG = [\n", 31 | " 0.1, # int | float (numeric value between 0 and 1) in 'Temperature' Slider component\n", 32 | " 0.75, # int | float (numeric value between 0 and 1) in 'Top p' Slider component\n", 33 | " 40, # int | float (numeric value between 0 and 100) in 'Top k' Slider component\n", 34 | " 1, # int | float (numeric value between 1 and 4) in 'Beams Number' Slider component\n", 35 | " True, # do sample\n", 36 | " 8, # int | float (numeric value between 1 and 2000) in 'Max New Tokens' Slider component\n", 37 | " 1, # int | float (numeric value between 1 and 300) in 'Min New Tokens' Slider component\n", 38 | " 1.2, # int | float (numeric value between 1.0 and 2.0) in 'Repetition Penalty' Slider component\n", 39 | "]" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 4, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "with open(\"/data/hanweiguang/Projects/PIXIU/data/cikm18/test.jsonl\") as f:\n", 49 | " data = f.readlines()\n", 50 | " data = [json.loads(val) for val in data]" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 5, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "Loaded as API: http://127.0.0.1:17860/ ✔\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "worker_addrs = [\n", 68 | " f\"http://127.0.0.1:{17860 + i}\" for i in range(1)\n", 69 | "]\n", 70 | "clients = MultiClient(worker_addrs)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 6, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "name": "stderr", 80 | "output_type": "stream", 81 | "text": [ 82 | "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [03:21<00:00, 1.01s/it]\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "results = clients.predict(\n", 88 | " [\n", 89 | " [\n", 90 | " datum[\"conversations\"][0][\"value\"]\n", 91 | " ] + GENERATION_CONFIG for datum in data[:200]\n", 92 | " ]\n", 93 | ")" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 11, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "labels = [\n", 103 | " datum[\"label\"] for datum in data[:200]\n", 104 | "]" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 12, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "y_true = [1 if i == \"Rise\" else 
0 for i in labels]\n", 114 | "y_pred = [1 if i == \"Rise\" else 0 for i in results]" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 22, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "MCC: -0.05380001385625025\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "# Calculate confusion matrix\n", 132 | "tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()\n", 133 | "\n", 134 | "# Calculate Matthews correlation coefficient\n", 135 | "mcc = matthews_corrcoef(y_true, y_pred)\n", 136 | "print(f'MCC: {mcc}')" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 23, 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "F1: 0.4573069852941177\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "f1 = f1_score(y_true, y_pred, average='weighted')\n", 154 | "print(f'F1: {f1}')" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 27, 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "accuracy: 0.51\n" 167 | ] 168 | } 169 | ], 170 | "source": [ 171 | "accuracy = accuracy_score(y_true, y_pred)\n", 172 | "print(f'accuracy: {accuracy}')" 173 | ] 174 | } 175 | ], 176 | "metadata": { 177 | "kernelspec": { 178 | "display_name": "Python 3 (ipykernel)", 179 | "language": "python", 180 | "name": "python3" 181 | }, 182 | "orig_nbformat": 4 183 | }, 184 | "nbformat": 4, 185 | "nbformat_minor": 2 186 | } 187 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | peft 3 | gradio 4 | tqdm 5 | scikit-learn 6 | sentencepiece 7 | pandas 8 | seqeval 9 | numpy 10 | evaluate 11 | openai 12 | rank_bm25 13 | spacy 14 | sentence_transformers 15 | src/factscore_package/en_core_web_sm-3.7.1.tar.gz 16 | vllm==0.2.7 17 | -------------------------------------------------------------------------------- /scripts/docker_run.sh: -------------------------------------------------------------------------------- 1 | # 需要先安装container-toolkit 2 | # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html 3 | 4 | export https_proxy=... 5 | export http_proxy=... 6 | export all_proxy=... 7 | 8 | pixiu_path=... 9 | docker_user=tothemoon 10 | tag="latest" 11 | hf_home=... 12 | ssh_pub_key=... 13 | workdir="$pixiu_path" 14 | chown root:root $ssh_pub_key 15 | 16 | docker run --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ 17 | --network host \ 18 | --env https_proxy=$https_proxy \ 19 | --env http_proxy=$http_proxy \ 20 | --env all_proxy=$all_proxy \ 21 | --env HF_HOME=$hf_home \ 22 | -it --rm \ 23 | --name pixiu \ 24 | -v $pixiu_path:$pixiu_path \ 25 | -v $hf_home:$hf_home \ 26 | -v $ssh_pub_key:/root/.ssh/authorized_keys \ 27 | -w $workdir \ 28 | $docker_user/pixiu:$tag \ 29 | --sshd_port 2201 --cmd "echo 'Hello, world!' 
&& /bin/bash" -------------------------------------------------------------------------------- /scripts/run_evaluation.sh: -------------------------------------------------------------------------------- 1 | pixiu_path='/root/PIXIU' 2 | export PYTHONPATH="$pixiu_path/src:$pixiu_path/src/financial-evaluation:$pixiu_path/src/metrics/BARTScore" 3 | echo $PYTHONPATH 4 | export CUDA_VISIBLE_DEVICES="0" 5 | 6 | python src/eval.py \ 7 | --model hf-causal-vllm \ 8 | --tasks flare_en_fintern \ 9 | --model_args use_accelerate=True,pretrained=llama-2-7b-chat-hf,tokenizer=llama-2-7b-chat-hf,use_fast=False,max_gen_toks=1024,dtype=float16 \ 10 | --no_cache \ 11 | --batch_size 2 \ 12 | --model_prompt 'finma_prompt' \ 13 | --num_fewshot 0 \ 14 | --write_out 15 | -------------------------------------------------------------------------------- /scripts/run_interface.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES='0,1,2,3,4,7' 2 | export PYTHONPATH='.../PIXIU/src' 3 | 4 | model_name_or_path='...' 5 | 6 | python src/interface.py \ 7 | --model_name_or_path $model_name_or_path \ 8 | --llama 9 | -------------------------------------------------------------------------------- /src/chatlm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import asyncio 3 | import numpy as np 4 | import transformers 5 | from lm_eval.base import BaseLM 6 | from lm_eval import utils 7 | from tqdm import tqdm 8 | import time 9 | 10 | BACKOFF_TIME = 0.1 11 | 12 | async def single_chat(client, **kwargs): 13 | global BACKOFF_TIME 14 | backoff_time = BACKOFF_TIME 15 | while True: 16 | try: 17 | r = await client.post(**kwargs, timeout=20) 18 | json_response = r.json() 19 | s = json_response['choices'][0]["message"]['content'] 20 | time.sleep(backoff_time) 21 | return s 22 | except Exception: 23 | import traceback 24 | 25 | traceback.print_exc() 26 | time.sleep(backoff_time * 30) 27 | BACKOFF_TIME *= 1.05 28 | 29 | 30 | async def oa_completion(**kwargs): 31 | """Query OpenAI API for completion. 
32 | 33 | Retry with back-off until they respond 34 | """ 35 | import httpx 36 | 37 | async with httpx.AsyncClient() as client: 38 | tasks = [single_chat( 39 | client=client, 40 | url=kwargs["url"], headers=kwargs["headers"], 41 | json={ 42 | "temperature": kwargs["temperature"], "max_tokens": kwargs["max_tokens"], 43 | "model": kwargs["model"], "messages": [message,], 44 | } 45 | ) for message in kwargs["messages"]] 46 | results = await asyncio.gather(*tasks) 47 | return results 48 | 49 | 50 | class ChatLM(BaseLM): 51 | REQ_CHUNK_SIZE = 20 52 | 53 | def __init__(self, model, truncate=False): 54 | """ 55 | 56 | :param model: str 57 | :param truncate: bool 58 | Truncate input if too long (if False and input is too long, throw error) 59 | """ 60 | super().__init__() 61 | 62 | import openai 63 | 64 | self.model = model 65 | self.truncate = truncate 66 | # Read from environment variable OPENAI_API_SECRET_KEY 67 | api_key = os.environ["OPENAI_API_SECRET_KEY"] 68 | self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2") 69 | self.headers = { 70 | "Content-Type": "application/json", 71 | "Authorization": f"Bearer {api_key}" 72 | } 73 | 74 | @property 75 | def eot_token_id(self): 76 | return self.tokenizer.eos_token_id 77 | 78 | @property 79 | def max_length(self): 80 | # Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token 81 | return 4096 82 | 83 | @property 84 | def max_gen_toks(self): 85 | return 10 86 | 87 | @property 88 | def batch_size(self): 89 | # Isn't used because we override _loglikelihood_tokens 90 | raise NotImplementedError() 91 | 92 | @property 93 | def device(self): 94 | # Isn't used because we override _loglikelihood_tokens 95 | raise NotImplementedError() 96 | 97 | def tok_encode(self, string: str): 98 | return self.tokenizer.encode(string, add_special_tokens=False) 99 | 100 | def tok_decode(self, tokens): 101 | return self.tokenizer.decode(tokens) 102 | 103 | def _loglikelihood_tokens(self, requests, disable_tqdm=False): 104 | raise NotImplementedError() 105 | 106 | def greedy_until(self, requests): 107 | if not requests: 108 | return [] 109 | res = [] 110 | 111 | def _collate(x): 112 | toks = self.tok_encode(x[0]) 113 | return len(toks), x[0] 114 | 115 | re_ord = utils.Reorderer(requests, _collate) 116 | 117 | def sameuntil_chunks(xs, size): 118 | ret = [] 119 | lastuntil = "" 120 | for x in xs: 121 | if len(ret) >= size: 122 | yield ret, lastuntil 123 | ret = [] 124 | lastuntil = "" 125 | ret.append(x) 126 | 127 | if ret: 128 | yield ret, lastuntil 129 | 130 | # todo: more intelligent batching for heterogeneous `until` 131 | for chunk, until in tqdm( 132 | list(sameuntil_chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE)) 133 | ): 134 | inps = [] 135 | for context in chunk: 136 | inps.append(context[0]) 137 | 138 | responses = asyncio.run(oa_completion( 139 | url="https://api.openai.com/v1/chat/completions", 140 | headers=self.headers, 141 | model=self.model, 142 | messages=[{"role": "user", "content": inp} for inp in inps], 143 | max_tokens=self.max_gen_toks, 144 | temperature=0.0, 145 | # stop=until, 146 | )) 147 | 148 | for resp, context in zip(responses, chunk): 149 | s = resp 150 | 151 | # partial caching 152 | self.cache_hook.add_partial("greedy_until", (context, ""), s) 153 | 154 | res.append(s) 155 | 156 | return re_ord.get_original(res) 157 | 158 | def _model_call(self, inps): 159 | # Isn't used because we override _loglikelihood_tokens 160 | raise NotImplementedError() 161 | 162 | def 
_model_generate(self, context, max_length, eos_token_id): 163 | # Isn't used because we override greedy_until 164 | raise NotImplementedError() 165 | -------------------------------------------------------------------------------- /src/eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import os 5 | import tasks 6 | 7 | from lm_eval import utils 8 | import evaluator 9 | from model_prompt import MODEL_PROMPT_MAP 10 | 11 | logging.getLogger("openai").setLevel(logging.WARNING) 12 | 13 | def parse_args(): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("--model", required=True) 16 | parser.add_argument("--model_args", default="") 17 | parser.add_argument("--tasks", default=None, choices=utils.MultiChoice(tasks.ALL_TASKS)) 18 | parser.add_argument("--model_prompt", default="no_prompt", choices=list(MODEL_PROMPT_MAP.keys())) 19 | parser.add_argument("--provide_description", action="store_true") 20 | parser.add_argument("--num_fewshot", type=int, default=0) 21 | parser.add_argument("--batch_size", type=str, default=None) 22 | parser.add_argument("--max_batch_size", type=int, default=None, 23 | help="Maximal batch size to try with --batch_size auto") 24 | parser.add_argument("--device", type=str, default=None) 25 | parser.add_argument("--output_path", default=None) 26 | parser.add_argument("--limit", type=float, default=None, 27 | help="Limit the number of examples per task. " 28 | "If <1, limit is a percentage of the total number of examples.") 29 | parser.add_argument("--data_sampling", type=float, default=None) 30 | parser.add_argument("--no_cache", action="store_true") 31 | parser.add_argument("--decontamination_ngrams_path", default=None) 32 | parser.add_argument("--description_dict_path", default=None) 33 | parser.add_argument("--check_integrity", action="store_true") 34 | parser.add_argument("--write_out", action="store_true", default=False) 35 | parser.add_argument("--output_base_path", type=str, default=None) 36 | 37 | return parser.parse_args() 38 | 39 | 40 | def main(): 41 | args = parse_args() 42 | 43 | assert not args.provide_description # not implemented 44 | 45 | if args.limit: 46 | print( 47 | "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT." 
48 | ) 49 | 50 | if args.tasks is None: 51 | task_names = tasks.ALL_TASKS 52 | else: 53 | task_names = utils.pattern_match(args.tasks.split(","), tasks.ALL_TASKS) 54 | 55 | print(f"Selected Tasks: {task_names}") 56 | 57 | description_dict = {} 58 | if args.description_dict_path: 59 | with open(args.description_dict_path, "r") as f: 60 | description_dict = json.load(f) 61 | 62 | results = evaluator.simple_evaluate( 63 | model=args.model, 64 | model_args=args.model_args, 65 | tasks=task_names, 66 | num_fewshot=args.num_fewshot, 67 | batch_size=args.batch_size, 68 | max_batch_size=args.max_batch_size, 69 | device=args.device, 70 | no_cache=args.no_cache, 71 | limit=args.limit, 72 | description_dict=description_dict, 73 | decontamination_ngrams_path=args.decontamination_ngrams_path, 74 | check_integrity=args.check_integrity, 75 | write_out=args.write_out, 76 | output_base_path=args.output_base_path, 77 | model_prompt=args.model_prompt 78 | ) 79 | 80 | dumped = json.dumps(results, indent=2) 81 | print(dumped) 82 | 83 | if args.output_path: 84 | os.makedirs(os.path.dirname(args.output_path), exist_ok=True) 85 | with open(args.output_path, "w") as f: 86 | f.write(dumped) 87 | 88 | batch_sizes = ",".join(map(str, results["config"]["batch_sizes"])) 89 | print( 90 | f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, " 91 | f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}" 92 | ) 93 | print(evaluator.make_table(results)) 94 | 95 | 96 | if __name__ == "__main__": 97 | main() 98 | -------------------------------------------------------------------------------- /src/evaluator.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import itertools 3 | import numpy as np 4 | import random 5 | 6 | from lm_eval.utils import positional_deprecated, run_task_tests 7 | import lm_eval.metrics 8 | import lm_eval.models 9 | import lm_eval.tasks 10 | import lm_eval.base 11 | 12 | from model_prompt import MODEL_PROMPT_MAP 13 | from chatlm import ChatLM 14 | import tasks as ta 15 | 16 | @positional_deprecated 17 | def simple_evaluate( 18 | model, 19 | model_args=None, 20 | tasks=[], 21 | num_fewshot=0, 22 | batch_size=None, 23 | max_batch_size=None, 24 | device=None, 25 | no_cache=False, 26 | limit=None, 27 | bootstrap_iters=100, 28 | description_dict=None, 29 | check_integrity=False, 30 | decontamination_ngrams_path=None, 31 | write_out=False, 32 | output_base_path=None, 33 | model_prompt=None 34 | ): 35 | """Instantiate and evaluate a model on a list of tasks. 36 | 37 | :param model: Union[str, LM] 38 | Name of model or LM object, see lm_eval.models.get_model 39 | :param model_args: Optional[str] 40 | String arguments for each model class, see LM.create_from_arg_string. 41 | Ignored if `model` argument is a LM object. 42 | :param tasks: list[Union[str, Task]] 43 | List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise. 44 | :param num_fewshot: int 45 | Number of examples in few-shot context 46 | :param batch_size: int or str, optional 47 | Batch size for model 48 | :param max_batch_size: int, optional 49 | Maximal batch size to try with automatic batch size detection 50 | :param device: str, optional 51 | PyTorch device (e.g. 
"cpu" or "cuda:0") for running models 52 | :param no_cache: bool 53 | Whether or not to cache 54 | :param limit: int or float, optional 55 | Limit the number of examples per task (only use this for testing), If <1, limit is a percentage of the total number of examples. 56 | :param bootstrap_iters: 57 | Number of iterations for bootstrap statistics 58 | :param description_dict: dict[str, str] 59 | Dictionary of custom task descriptions of the form: `task_name: description` 60 | :param check_integrity: bool 61 | Whether to run the relevant part of the test suite for the tasks 62 | :param write_out: bool 63 | If True, write details about prompts and logits to json for all tasks 64 | :param output_base_path: str, optional 65 | Directory to which detailed eval info will be written. Defaults to present working dir. 66 | :return 67 | Dictionary of results 68 | """ 69 | random.seed(1234) 70 | np.random.seed(1234) 71 | 72 | assert len(tasks) != 0, "No tasks specified" 73 | 74 | if isinstance(model, str): 75 | if model_args is None: 76 | model_args = "" 77 | if model[:3] != "gpt": 78 | lm = lm_eval.models.get_model(model).create_from_arg_string( 79 | model_args, {"batch_size": batch_size, "max_batch_size": max_batch_size, "device": device} 80 | ) 81 | else: 82 | lm = ChatLM(model) 83 | else: 84 | assert isinstance(model, lm_eval.base.LM) 85 | lm = model 86 | 87 | if not no_cache: 88 | lm = lm_eval.base.CachingLM( 89 | lm, 90 | "lm_cache/" 91 | + (model if isinstance(model, str) else model.model.config._name_or_path) 92 | + "_" 93 | + model_args.replace("=", "-").replace(",", "_").replace("/", "-") 94 | + ".db", 95 | ) 96 | 97 | task_dict = ta.get_task_dict(tasks) 98 | 99 | if check_integrity: 100 | run_task_tests(task_list=tasks) 101 | 102 | results = evaluate( 103 | lm=lm, 104 | task_dict=task_dict, 105 | num_fewshot=num_fewshot, 106 | limit=limit, 107 | bootstrap_iters=bootstrap_iters, 108 | description_dict=description_dict, 109 | decontamination_ngrams_path=decontamination_ngrams_path, 110 | write_out=write_out, 111 | output_base_path=output_base_path, 112 | model_prompt=model_prompt 113 | ) 114 | 115 | # add info about the model and few shot config 116 | results["config"] = { 117 | "model": (model if isinstance(model, str) else model.model.config._name_or_path), 118 | "model_args": model_args, 119 | "num_fewshot": num_fewshot, 120 | "batch_size": batch_size, 121 | "batch_sizes": list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else [], 122 | "device": device, 123 | "no_cache": no_cache, 124 | "limit": limit, 125 | "bootstrap_iters": bootstrap_iters, 126 | "description_dict": description_dict, 127 | } 128 | 129 | return results 130 | 131 | 132 | decontaminate_suffix = "_decontaminate" 133 | 134 | 135 | @positional_deprecated 136 | def evaluate( 137 | lm, 138 | task_dict, 139 | provide_description=None, 140 | num_fewshot=0, 141 | limit=None, 142 | bootstrap_iters=100000, 143 | description_dict=None, 144 | decontamination_ngrams_path=None, 145 | write_out=False, 146 | output_base_path=None, 147 | model_prompt=None 148 | ): 149 | """Instantiate and evaluate a model on a list of tasks. 150 | 151 | :param lm: obj 152 | Language Model 153 | :param task_dict: dict[str, Task] 154 | Dictionary of tasks. Tasks will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise. 
155 | :param provide_description: bool 156 | Not implemented, and this option is deprecated and will be removed in a future version in favor of a different description providing method 157 | :param num_fewshot: int 158 | Number of examples in few-shot context 159 | :param limit: int, optional 160 | Limit the number of examples per task (only use this for testing) 161 | :param bootstrap_iters: 162 | Number of iterations for bootstrap statistics 163 | :param description_dict: dict[str, str] 164 | Dictionary of custom task descriptions of the form: `task_name: description` 165 | :param write_out: bool 166 | If True, write all prompts, logits and metrics to json for offline analysis 167 | :param output_base_path: str, optional 168 | Directory to which detailed eval info will be written. Defaults to present working dir 169 | :return 170 | Dictionary of results 171 | """ 172 | # TODO: completely refactor this entire function to not be a huge mess, ideally breaking it down into smaller pieces 173 | 174 | # TODO: todo: implement proper description-providing system 175 | assert not provide_description # not implemented. 176 | if provide_description is not None: 177 | # nudge people to not specify it at all 178 | print( 179 | "WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict" 180 | ) 181 | 182 | decontaminate = decontamination_ngrams_path is not None 183 | 184 | task_dict_items = [ 185 | (name, task) 186 | for name, task in task_dict.items() 187 | if (task.has_validation_docs() or task.has_test_docs()) 188 | ] 189 | 190 | results = collections.defaultdict(dict) 191 | versions = collections.defaultdict(dict) 192 | 193 | requests = collections.defaultdict(list) 194 | turn_requests = collections.defaultdict(dict) 195 | requests_origin = collections.defaultdict(list) 196 | 197 | overlaps = collections.defaultdict(list) # {task_name: contaminated_docs} 198 | 199 | # If we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger 200 | # memory, we can always modify this plumbing to support that, but I didn't want to include it just yet because 201 | # over-engineering is bad (or we could make it write the requests to disk and then read them back out again 202 | # - probably using an sqlite db because of all the moving parts we have 203 | 204 | # TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable 205 | docs = {} 206 | write_out_info = {} 207 | 208 | docs_for_decontamination = collections.defaultdict(list) 209 | 210 | # get lists of each type of request 211 | for task_name, task in task_dict_items: 212 | versions[task_name] = task.VERSION 213 | # default to test doc, fall back to val doc if validation unavailable 214 | # TODO: the test-fallback-to-val system isn't final, we should revisit it at some point 215 | if task.has_test_docs(): 216 | task_doc_func = task.test_docs 217 | task_set = "test" # Required for caching in the decontamination 218 | elif task.has_validation_docs(): 219 | task_set = "val" # Required for caching in the decontamination 220 | task_doc_func = task.validation_docs 221 | else: 222 | raise RuntimeError("Task has neither test_docs nor validation_docs") 223 | 224 | # deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order 225 | task_docs = list(task_doc_func()) 226 | rnd = random.Random() 227 | rnd.seed(42) 228 | rnd.shuffle(task_docs) 229 | print(f"Task: 
{task_name}; number of docs: {len(task_docs)}") 230 | 231 | if write_out: 232 | prompt_details = [] 233 | 234 | description = ( 235 | description_dict[task_name] 236 | if description_dict and task_name in description_dict 237 | else "" 238 | ) 239 | if limit is not None: 240 | limit = int(len(task_docs) * limit) if limit < 1.0 else int(limit) 241 | 242 | if model_prompt is None: 243 | model_prompt = 'no_prompt' 244 | 245 | for doc_id, doc in enumerate(itertools.islice(task_docs, 0, limit)): 246 | if decontaminate and task.should_decontaminate(): 247 | docs_for_decontamination[(task_name, task_set)].append( 248 | task.doc_to_decontamination_query(doc) 249 | ) 250 | 251 | docs[(task_name, doc_id)] = doc 252 | ctx = task.fewshot_context( 253 | doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description 254 | ) 255 | 256 | ctx = MODEL_PROMPT_MAP[model_prompt](ctx) 257 | 258 | reqs = task.construct_requests(doc, ctx) 259 | 260 | if write_out: 261 | prompt_details.append({"doc_id": doc_id}) 262 | 263 | # print the prompt for the first few documents 264 | if doc_id < 1: 265 | print( 266 | f"Task: {task_name}; document {doc_id}; context prompt (starting on next line):\n{ctx}\n(end of prompt on previous line)" 267 | ) 268 | print("Requests:", reqs) 269 | 270 | if not isinstance(reqs, (list, tuple)): 271 | reqs = [reqs] 272 | for i, req in enumerate(reqs): 273 | requests[req.request_type].append(req) 274 | # i: index in requests for a single task instance 275 | # doc_id: unique id that we can get back to a doc using `docs` 276 | diag_id = doc.get("dialogue_id", doc_id) 277 | turn = doc.get("turn", 0) 278 | turn_requests[(diag_id, turn)] = (task_name, doc, doc_id, req) 279 | requests_origin[req.request_type].append((i, task_name, doc, doc_id, diag_id, turn)) 280 | 281 | #print("req: " + str(req.args)) 282 | 283 | if write_out: 284 | prompt_details[-1][f"prompt_{i}"] = "".join( 285 | (map(lambda x: "".join(x), req.args)) 286 | ) 287 | 288 | #print("request:" + request[]) 289 | if write_out: 290 | write_out_info[task_name] = prompt_details 291 | 292 | # Compare all tasks/sets at once to ensure a single training set scan 293 | if decontaminate: 294 | from lm_eval.decontamination.decontaminate import get_train_overlap 295 | 296 | print("Finding train/test overlap, please wait...") 297 | overlaps = get_train_overlap( 298 | docs_for_decontamination, decontamination_ngrams_path, limit 299 | ) 300 | 301 | # all responses for each (task, doc) 302 | process_res_queue = collections.defaultdict(list) 303 | 304 | # execute each type of request 305 | for reqtype, reqs in requests.items(): 306 | # TODO: right now, this code runs multiple separate LM requests for multiple Requests differing 307 | # only in index. We could implement some kind of caching, but that would be more of a band-aid 308 | # solution. we could also implement some kind of auto-grouping here; 309 | # they should end up next to each other. 
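        # A summary of the turn-scheduled dispatch below (description only, no
        # added behavior): requests are replayed turn by turn. Turn 0 requests
        # go straight to the LM; for each later turn, task.reformulate_turn_req
        # rebuilds the request from the responses already cached in
        # turn_requests[(diag_id, t)], so dialogue tasks can condition on the
        # model's earlier answers. Docs without a "turn" field default to
        # turn 0 and are all handled in the first pass.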
310 | 311 | max_turns = max([val[-1] for val in requests_origin[reqtype]]) 312 | print("Running", reqtype, "requests") 313 | print(f"Maximum turn index: {max_turns}") 314 | task_turns = {} 315 | for cur_turn in range(max_turns+1): 316 | print(f"Running turn {cur_turn}") 317 | 318 | filtered_reqs = [] 319 | 320 | for req, (i, task_name, doc, doc_id, diag_id, turn) in zip(reqs, requests_origin[reqtype] 321 | ): 322 | if turn != cur_turn: 323 | continue 324 | task_turns[task_name] = max(turn, task_turns.get(task_name, -1)) 325 | task = task_dict[task_name] 326 | req = task.reformulate_turn_req(req, [(turn_requests.get((diag_id, t), None), t) for 327 | t in range(turn)], turn) 328 | filtered_reqs.append([req, (i, task_name, doc, doc_id, diag_id, turn)]) 329 | 330 | resps = getattr(lm, reqtype)([req[0].args for req in filtered_reqs])  # send only this turn's reformulated requests, keeping resps aligned with filtered_reqs below 331 | resps = [ 332 | x if req[0].index is None else x[req[0].index] for x, req in zip(resps, filtered_reqs 333 | ) 334 | ] 335 | 336 | for resp, req in zip(resps, filtered_reqs): 337 | i, task_name, doc, doc_id, diag_id, turn = req[1] 338 | task = task_dict[task_name] 339 | if not task.EVAL_LAST_TURN or turn == task_turns[task_name]: 340 | process_res_queue[(task_name, doc_id)].append((i, resp)) 341 | turn_requests[(diag_id, turn)] = resp 342 | 343 | if write_out: 344 | write_out_info[task_name][doc_id][f"logit_{i}"] = resp 345 | task = task_dict[task_name] 346 | if isinstance(task, lm_eval.base.MultipleChoiceTask): 347 | write_out_info[task_name][doc_id]["truth"] = doc["gold"] 348 | elif isinstance(task, lm_eval.tasks.winogrande.Winogrande): 349 | write_out_info[task_name][doc_id]["truth"] = task.answer_to_num[ 350 | doc["answer"] 351 | ] 352 | else: 353 | write_out_info[task_name][doc_id]["truth"] = task.doc_to_target(doc) 354 | vals = collections.defaultdict(list) 355 | 356 | # unpack results and sort back in order and return control to Task 357 | for (task_name, doc_id), requests in process_res_queue.items(): 358 | requests.sort(key=lambda x: x[0]) 359 | requests = [x[1] for x in requests] 360 | 361 | task = task_dict[task_name] 362 | doc = docs[(task_name, doc_id)] 363 | print("doc: "+ str(doc)) 364 | print("requests: "+ str(requests)) 365 | 366 | 367 | metrics = task.process_results(doc, requests) 368 | for metric, value in metrics.items(): 369 | vals[(task_name, metric)].append(value) 370 | 371 | if write_out: 372 | write_out_info[task_name][doc_id][metric] = str(value) 373 | 374 | # Re-use the evaluation for the decontaminated set by just ignoring the overlaps 375 | if decontaminate and task_name in overlaps: 376 | if doc_id not in overlaps[task_name]: 377 | vals[(task_name, metric + decontaminate_suffix)].append(value) 378 | 379 | # aggregate results 380 | for (task_name, metric), items in vals.items(): 381 | task = task_dict[task_name] 382 | real_metric = metric # key when looking up the metric with task.aggregation 383 | if metric.endswith(decontaminate_suffix): 384 | real_metric = metric.replace( 385 | decontaminate_suffix, "" 386 | ) # decontaminated still uses the same metric 387 | 388 | results[task_name][metric] = task.aggregation()[real_metric](items) 389 | # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap 390 | # so we run them for fewer iterations.
still looking for a cleaner way to do this 391 | 392 | stderr = lm_eval.metrics.stderr_for_metric( 393 | metric=task.aggregation()[real_metric], 394 | bootstrap_iters=min(bootstrap_iters, 1000) if metric in ["bleu", "chrf", "ter"] else bootstrap_iters, 395 | ) 396 | 397 | if stderr is not None: 398 | results[task_name][metric + "_stderr"] = stderr(items) 399 | 400 | if write_out: 401 | import json 402 | import pathlib 403 | 404 | output_base_path = ( 405 | pathlib.Path(output_base_path) 406 | if output_base_path is not None 407 | else pathlib.Path(".") 408 | ) 409 | try: 410 | output_base_path.mkdir(parents=True, exist_ok=False) 411 | except FileExistsError: 412 | pass 413 | 414 | for task_name, _ in task_dict_items: 415 | with open( 416 | output_base_path.joinpath(f"{task_name}_write_out_info.json"), 417 | "w", 418 | encoding="utf8", 419 | ) as fp: 420 | json.dump(write_out_info[task_name], fp, indent=4, ensure_ascii=False) 421 | 422 | return {"results": dict(results), "versions": dict(versions)} 423 | 424 | 425 | def make_table(result_dict): 426 | """Generate table of results.""" 427 | from pytablewriter import MarkdownTableWriter, LatexTableWriter 428 | 429 | md_writer = MarkdownTableWriter() 430 | latex_writer = LatexTableWriter() 431 | md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"] 432 | latex_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"] 433 | 434 | values = [] 435 | 436 | for k, dic in result_dict["results"].items(): 437 | version = result_dict["versions"][k] 438 | for m, v in dic.items(): 439 | if m.endswith("_stderr"): 440 | continue 441 | 442 | if m + "_stderr" in dic: 443 | se = dic[m + "_stderr"] 444 | values.append([k, version, m, "%.4f" % v, "±", "%.4f" % se]) 445 | else: 446 | values.append([k, version, m, "%.4f" % v, "", ""]) 447 | k = "" 448 | version = "" 449 | md_writer.value_matrix = values 450 | latex_writer.value_matrix = values 451 | 452 | # todo: make latex table look good 453 | # print(latex_writer.dumps()) 454 | 455 | return md_writer.dumps() 456 | -------------------------------------------------------------------------------- /src/factscore_package/.cache/demons.json: -------------------------------------------------------------------------------- 1 | {"He made his acting debut in the film The Moon is the Sun's Dream (1992), and continued to appear in small and supporting roles throughout the 1990s.": ["He made his acting debut in the film.", "He made his acting debut in The Moon is the Sun's Dream.", "The Moon is the Sun's Dream is a film.", "The Moon is the Sun's Dream was released in 1992.", "After his acting debut, he appeared in small and supporting roles.", "After his acting debut, he appeared in small and supporting roles throughout the 1990s."], "He is also a successful producer and engineer, having worked with a wide variety of artists, including Willie Nelson, Tim McGraw, and Taylor Swift.": ["He is successful.", "He is a producer.", "He is a engineer.", "He has worked with a wide variety of artists.", "Willie Nelson is an artist.", "He has worked with Willie Nelson.", "Tim McGraw is an artist.", "He has worked with Tim McGraw.", "Taylor Swift is an artist.", "He has worked with Taylor Swift."], "In 1963, Collins became one of the third group of astronauts selected by NASA and he served as the back-up Command Module Pilot for the Gemini 7 mission.": ["Collins became an astronaut.", "Collins became one of the third group of astronauts.", "Collins became one of the third group of astronauts selected.", 
"Collins became one of the third group of astronauts selected by NASA.", "Collins became one of the third group of astronauts selected by NASA in 1963.", "He served as the Command Module Pilot.", "He served as the back-up Command Module Pilot.", "He served as the Command Module Pilot for the Gemini 7 mission."], "In addition to his acting roles, Bateman has written and directed two short films and is currently in development on his feature debut.": ["Bateman has acting roles.", "Bateman has written two short films.", "Bateman has directed two short films.", "Bateman has written and directed two short films.", "Bateman is currently in development on his feature debut."], "Michael Collins (born October 31, 1930) is a retired American astronaut and test pilot who was the Command Module Pilot for the Apollo 11 mission in 1969.": ["Michael Collins was born on October 31, 1930.", "Michael Collins is retired.", "Michael Collins is an American.", "Michael Collins was an astronaut.", "Michael Collins was a test pilot.", "Michael Collins was the Command Module Pilot.", "Michael Collins was the Command Module Pilot for the Apollo 11 mission.", "Michael Collins was the Command Module Pilot for the Apollo 11 mission in 1969."], "He was an American composer, conductor, and musical director.": ["He was an American.", "He was a composer.", "He was a conductor.", "He was a musical director."], "She currently stars in the romantic comedy series, Love and Destiny, which premiered in 2019.": ["She currently stars in Love and Destiny.", "Love and Destiny is a romantic comedy series.", "Love and Destiny premiered in 2019. "], "His music has been described as a mix of traditional Mexican and Latin American styles, as well as jazz, folk, and rock.": ["His music has been described as a mix.", "His music has been described as a mix of traditional Mexican, Latin American styles, as well as jazz, folk, and rock."], "He also serves as an ambassador for the charity Leonard Cheshire Disability.": ["He serves as an ambassador.", "He serves as an ambassador for Leonard Cheshire Disability.", "Leonard Cheshire Disability is a charity."], "He began his career in Nashville in the late 1950s and has since released numerous albums, including a greatest hits collection in 1999.": ["He began his career in Nashville.", "He began his career in the late 1950s.", "He began his career in Nashville in the late 1950s.", "Since he began his career, he has released numerous albums.", "Since he began his career, he has released a greatest hits collection.", "Since he began his career, he has released a greatest hits collection in 1999."], "He has been performing since the age of 8, when he joined a band in his hometown of Guadalajara and has since gone on to record six studio albums and several singles of his own original material.": ["He has been performing since the age of 8.", "He joined a band.", "His hometown is Guadalajara.", "He joined a band in his hometown.", "He joined a band in his hometown of Guadalajara at the age of 8.", "He has gone on to record six studio albums.", "He has gone on to record six studio albums at the age of 8.", "He has gone on to record several singles of his own original material.", "He has gone on to record several singles of his own original material at the age of 8."], "She is also the former President of the Malaysian Chinese Association (MCA) from 2010 to 2013.": ["She is the former President.", "She is also the former President of the Malaysian Chinese Association (MCA)", "She is also the former 
President of the Malaysian Chinese Association (MCA) from 2010 to 2013."], "During his professional career, McCoy played for the Broncos, the San Diego Chargers, the Minnesota Vikings, and the Jacksonville Jaguars.": ["McCoy played for the Broncos.", "McCoy played for the Broncos during his professional career.", "McCoy played for the San Diego Chargers.", "McCoy played for the San Diego Chargers during his professional career.", "McCoy played for the Minnesota Vikings.", "McCoy played for the Minnesota Vikings during his professional career.", "McCoy played for the Jacksonville Jaguars.", "McCoy played for the Jacksonville Jaguars during his professional career."], "Miller has been described as the architect of Trump's controversial immigration policies, and has previously worked for Alabama Senator Jeff Sessions on immigration issues.": ["Miller has been described as the architect.", "Miller has been described as the architect of Trump's controversial immigration policies.", "Miller has previously worked for Alabama Senator Jeff Sessions.", "Miller has previously worked for Alabama Senator Jeff Sessions on immigration issues."], "Her work is often described as whimsical and dreamlike.": ["Her work is often described as whimsical.", "Her work is often described as dreamlike."], "He graduated from the United States Military Academy in 1952, and then went on to serve in the United States Air Force.": ["He graduated from the United States Military Academy.", "He graduated from the United States Military Academy in 1952.", "He went on to serve in the United States Air Force.", "He went on to serve in the United States Air Force after he graduated from the United States Military Academy."], "He is best known for his roles in the films Memories of Murder (2003), The Host (2006), (...) 
and Parasite (2019).": ["One of his best known roles is in Memories of Murder.", "Memories of Murder is a film.", "Memories of Murder was released in 2003.", "One of his best known roles is in The Host.", "The Host is a film.", "The Host was released in 2006.", "One of his best known roles is in Parasite.", "Parasite is a film.", "Parasite was released in 2019."], "Song Kang-ho was born in Gongju, South Korea in 1967.": ["Song Kang-ho was born in Gongju.", "Song Kang-ho was born in South Korea.", "Song Kang-ho was born in 1967."], "He studied theater at Chung-Ang University in Seoul.": ["He studied theater.", "He studied at Chung-Ang University.", "He studied at Chung-Ang University in Seoul."], "His breakthrough came with the leading role in the acclaimed crime-drama film Memories of Murder in 2003.": ["His breakthrough came with Memories of Murder.", "He was the leading role in Memories of Murder.", "Memories of Murder was released in 2003.", "Memories of Murder is a film.", "Memories of Murder is an acclaimed crime-drama film."], "This was followed by the monster movie The Host in 2006, which became the highest-grossing film in Korean history at the time.": ["This was followed by The Host.", "The Host is the movie.", "The Host is a monster movie.", "The Host was released in 2006.", "The Host became the highest-grossing film in Korean history at the time.", "The Host is not the highest-grossing film in Korean history anymore."]} 2 | -------------------------------------------------------------------------------- /src/factscore_package/.cache/demons.txt: -------------------------------------------------------------------------------- 1 | He made his acting debut in the film The Moon is the Sun's Dream (1992), and continued to appear in small and supporting roles throughout the 1990s. 2 | - He made his acting debut in the film. 3 | - He made his acting debut in The Moon is the Sun's Dream. 4 | - The Moon is the Sun's Dream is a film. 5 | - The Moon is the Sun's Dream was released in 1992. 6 | - After his acting debut, he appeared in small and supporting roles. 7 | - After his acting debut, he appeared in small and supporting roles throughout the 1990s. 8 | 9 | He is also a successful producer and engineer, having worked with a wide variety of artists, including Willie Nelson, Tim McGraw, and Taylor Swift. 10 | - He is successful. 11 | - He is a producer. 12 | - He is a engineer. 13 | - He has worked with a wide variety of artists. 14 | - Willie Nelson is an artist. 15 | - He has worked with Willie Nelson. 16 | - Tim McGraw is an artist. 17 | - He has worked with Tim McGraw. 18 | - Taylor Swift is an artist. 19 | - He has worked with Taylor Swift. 20 | 21 | In 1963, Collins became one of the third group of astronauts selected by NASA and he served as the back-up Command Module Pilot for the Gemini 7 mission. 22 | - Collins became an astronaut. 23 | - Collins became one of the third group of astronauts. 24 | - Collins became one of the third group of astronauts selected. 25 | - Collins became one of the third group of astronauts selected by NASA. 26 | - Collins became one of the third group of astronauts selected by NASA in 1963. 27 | - He served as the Command Module Pilot. 28 | - He served as the back-up Command Module Pilot. 29 | - He served as the Command Module Pilot for the Gemini 7 mission. 30 | 31 | In addition to his acting roles, Bateman has written and directed two short films and is currently in development on his feature debut. 32 | - Bateman has acting roles. 
33 | - Bateman has written two short films. 34 | - Bateman has directed two short films. 35 | - Bateman has written and directed two short films. 36 | - Bateman is currently in development on his feature debut. 37 | 38 | Michael Collins (born October 31, 1930) is a retired American astronaut and test pilot who was the Command Module Pilot for the Apollo 11 mission in 1969. 39 | - Michael Collins was born on October 31, 1930. 40 | - Michael Collins is retired. 41 | - Michael Collins is an American. 42 | - Michael Collins was an astronaut. 43 | - Michael Collins was a test pilot. 44 | - Michael Collins was the Command Module Pilot. 45 | - Michael Collins was the Command Module Pilot for the Apollo 11 mission. 46 | - Michael Collins was the Command Module Pilot for the Apollo 11 mission in 1969. 47 | 48 | He was an American composer, conductor, and musical director. 49 | - He was an American. 50 | - He was a composer. 51 | - He was a conductor. 52 | - He was a musical director. 53 | 54 | She currently stars in the romantic comedy series, Love and Destiny, which premiered in 2019. 55 | - She currently stars in Love and Destiny. 56 | - Love and Destiny is a romantic comedy series. 57 | - Love and Destiny premiered in 2019. 58 | 59 | His music has been described as a mix of traditional Mexican and Latin American styles, as well as jazz, folk, and rock. 60 | - His music has been described as a mix. 61 | - His music has been described as a mix of traditional Mexican, Latin American styles, as well as jazz, folk, and rock. 62 | 63 | He also serves as an ambassador for the charity Leonard Cheshire Disability. 64 | - He serves as an ambassador. 65 | - He serves as an ambassador for Leonard Cheshire Disability. 66 | - Leonard Cheshire Disability is a charity. 67 | 68 | He began his career in Nashville in the late 1950s and has since released numerous albums, including a greatest hits collection in 1999. 69 | - He began his career in Nashville. 70 | - He began his career in the late 1950s. 71 | - He began his career in Nashville in the late 1950s. 72 | - Since he began his career, he has released numerous albums. 73 | - Since he began his career, he has released a greatest hits collection. 74 | - Since he began his career, he has released a greatest hits collection in 1999. 75 | 76 | He has been performing since the age of 8, when he joined a band in his hometown of Guadalajara and has since gone on to record six studio albums and several singles of his own original material. 77 | - He has been performing since the age of 8. 78 | - He joined a band. 79 | - His hometown is Guadalajara. 80 | - He joined a band in his hometown. 81 | - He joined a band in his hometown of Guadalajara at the age of 8. 82 | - He has gone on to record six studio albums. 83 | - He has gone on to record six studio albums at the age of 8. 84 | - He has gone on to record several singles of his own original material. 85 | - He has gone on to record several singles of his own original material at the age of 8. 86 | 87 | She is also the former President of the Malaysian Chinese Association (MCA) from 2010 to 2013. 88 | - She is the former President. 89 | - She is also the former President of the Malaysian Chinese Association (MCA) 90 | - She is also the former President of the Malaysian Chinese Association (MCA) from 2010 to 2013. 91 | 92 | During his professional career, McCoy played for the Broncos, the San Diego Chargers, the Minnesota Vikings, and the Jacksonville Jaguars. 93 | - McCoy played for the Broncos. 
94 | - McCoy played for the Broncos during his professional career. 95 | - McCoy played for the San Diego Chargers. 96 | - McCoy played for the San Diego Chargers during his professional career. 97 | - McCoy played for the Minnesota Vikings. 98 | - McCoy played for the Minnesota Vikings during his professional career. 99 | - McCoy played for the Jacksonville Jaguars. 100 | - McCoy played for the Jacksonville Jaguars during his professional career. 101 | 102 | Miller has been described as the architect of Trump's controversial immigration policies, and has previously worked for Alabama Senator Jeff Sessions on immigration issues. 103 | - Miller has been described as the architect. 104 | - Miller has been described as the architect of Trump's controversial immigration policies. 105 | - Miller has previously worked for Alabama Senator Jeff Sessions. 106 | - Miller has previously worked for Alabama Senator Jeff Sessions on immigration issues. 107 | 108 | Her work is often described as whimsical and dreamlike. 109 | - Her work is often described as whimsical. 110 | - Her work is often described as dreamlike. 111 | 112 | He graduated from the United States Military Academy in 1952, and then went on to serve in the United States Air Force. 113 | - He graduated from the United States Military Academy. 114 | - He graduated from the United States Military Academy in 1952. 115 | - He went on to serve in the United States Air Force. 116 | - He went on to serve in the United States Air Force after he graduated from the United States Military Academy. 117 | 118 | He is best known for his roles in the films Memories of Murder (2003), The Host (2006), (...) and Parasite (2019). 119 | - One of his best known roles is in Memories of Murder. 120 | - Memories of Murder is a film. 121 | - Memories of Murder was released in 2003. 122 | - One of his best known roles is in The Host. 123 | - The Host is a film. 124 | - The Host was released in 2006. 125 | - One of his best known roles is in Parasite. 126 | - Parasite is a film. 127 | - Parasite was released in 2019. 128 | 129 | Song Kang-ho was born in Gongju, South Korea in 1967. 130 | - Song Kang-ho was born in Gongju. 131 | - Song Kang-ho was born in South Korea. 132 | - Song Kang-ho was born in 1967. 133 | 134 | He studied theater at Chung-Ang University in Seoul. 135 | - He studied theater. 136 | - He studied at Chung-Ang University. 137 | - He studied at Chung-Ang University in Seoul. 138 | 139 | His breakthrough came with the leading role in the acclaimed crime-drama film Memories of Murder in 2003. 140 | - His breakthrough came with Memories of Murder. 141 | - He was the leading role in Memories of Murder. 142 | - Memories of Murder was released in 2003. 143 | - Memories of Murder is a film. 144 | - Memories of Murder is an acclaimed crime-drama film. 145 | 146 | This was followed by the monster movie The Host in 2006, which became the highest-grossing film in Korean history at the time. 147 | - This was followed by The Host. 148 | - The Host is the movie. 149 | - The Host is a monster movie. 150 | - The Host was released in 2006. 151 | - The Host became the highest-grossing film in Korean history at the time. 152 | - The Host is not the highest-grossing film in Korean history anymore. 
153 | -------------------------------------------------------------------------------- /src/factscore_package/.cache/demons_complex.json: -------------------------------------------------------------------------------- 1 | {"He is also a successful producer and engineer, having worked with a wide variety of artists, including Willie Nelson, Tim McGraw, and Taylor Swift.": ["He is successful.", "He is a producer.", "He is a engineer.", "He has worked with a wide variety of artists.", "Willie Nelson is an artist.", "He has worked with Willie Nelson.", "Tim McGraw is an artist.", "He has worked with Tim McGraw.", "Taylor Swift is an artist.", "He has worked with Taylor Swift."], "Michael Collins (born October 31, 1930) is a retired American astronaut and test pilot who was the Command Module Pilot for the Apollo 11 mission in 1969.": ["Michael Collins was born on October 31, 1930.", "Michael Collins is retired.", "Michael Collins is an American.", "Michael Collins was an astronaut.", "Michael Collins was a test pilot.", "Michael Collins was the Command Module Pilot.", "Michael Collins was the Command Module Pilot for the Apollo 11 mission.", "Michael Collins was the Command Module Pilot for the Apollo 11 mission in 1969."], "He was an American composer, conductor, and musical director.": ["He was an American.", "He was a composer.", "He was a conductor.", "He was a musical director."], "In 1970, the Empire State Building in New York City was the tallest building in the United States and the world, standing at 1,250 feet tall.": ["The Empire State Building is in New York City.", "In 1970, the Empire State Building was the tallest building in the United States.", "In 1970, the Empire State Building was the tallest building in the world.", "The Empire State Building stands at 1,250 feet tall."], "The Willis Tower (formerly the Sears Tower) in Chicago was the first to do so, reaching 1,450 feet in 1973. ": ["The Willis Tower is formerly called the Sears Tower.", "The Willis Tower is in Chicago.", "The Willis Tower reached 1,450 feet in 1973."], "The current tallest building in the United States is One World Trade Center in New York City, which stands at 1,776 feet. ": ["The current tallest building in the United States is One World Trade Center.", "One World Trade Center is in New York City.", "One World Trade Center stands at 1,776 feet."], "William E. Moerner is an American physical chemist who was affiliated with the University of Sussex as a visiting professor. ": ["William E. Moerner is an American.", "William E. Moerner is an physical chemist.", "William E. Moerner was affiliated with the University of Sussex.", "William E. Moerner was affiliated with the University of Sussex as a visiting professor."], "Sir Harold Walter Kroto, an English chemist, shared the 1996 Nobel Prize in Chemistry with Robert Curl and Richard Smalley for their discovery of a new form of carbon, buckminsterfullerene, also known as buckyballs. 
": ["Sir Harold Walter Kroto is English.", "Sir Harold Walter Kroto is an chemist.", "Sir Harold Walter Kroto won the Nobel Prize in 1996.", "Sir Harold Walter Kroto won the Nobel Prize in Chemistry.", "Sir Harold Walter Kroto shared the Nobel Prize with Robert Curl and Richard Smalley.", "They won the prize for their discovery of a new form of carbon, buckminsterfullerene, also known as buckyballs."]} 2 | -------------------------------------------------------------------------------- /src/factscore_package/.cache/demons_full.txt: -------------------------------------------------------------------------------- 1 | During his professional career, McCoy played for the Broncos, the San Diego Chargers, the Minnesota Vikings, and the Jacksonville Jaguars. 2 | - McCoy played for the Broncos. 3 | - McCoy played for the Broncos during his professional career. 4 | - McCoy played for the San Diego Chargers. 5 | - McCoy played for the San Diego Chargers during his professional career. 6 | - McCoy played for the Minnesota Vikings. 7 | - McCoy played for the Minnesota Vikings during his professional career. 8 | - McCoy played for the Jacksonville Jaguars. 9 | - McCoy played for the Jacksonville Jaguars during his professional career. 10 | 11 | In addition to his acting roles, Bateman has written and directed two short films and is currently in development on his feature debut. 12 | - Bateman has acting roles. 13 | - Bateman has written two short films. 14 | - Bateman has directed two short films. 15 | - Bateman has written and directed two short films. 16 | - Bateman is currently in development on his feature debut. 17 | 18 | He is also a successful producer and engineer, having worked with a wide variety of artists, including Willie Nelson, Tim McGraw, and Taylor Swift. 19 | - He is a successful producer. 20 | - He is a successful engineer. 21 | - He has worked with a wide variety of artists. 22 | - Willie Nelson is an artist. 23 | - He has worked with Willie Nelson. 24 | - Tim McGraw is an artist. 25 | - He has worked with Tim McGraw. 26 | - Taylor Swift is an artist. 27 | - He has worked with Taylor Swift. 28 | 29 | He is the founder and Chairman of The Schneider Group, a multi-discipline consulting firm that provides strategic advice to clients in the areas of corporate strategy, operations, finance, and human resources. 30 | - He is the founder of The Schneider Group. 31 | - He is the Chairman of The Schneider Group. 32 | - The Schneider Group is a multi-discipline consulting firm. 33 | - The Schneider Group provides strategic advice to clients in the area of corporate strategy. 34 | - The Schneider Group provides strategic advice to clients. 35 | - The Schneider Group provides strategic advice to clients in the area of operations. 36 | - The Schneider Group provides strategic advice to clients in the area of finance. 37 | - The Schneider Group provides strategic advice to clients in the area of human resources. 38 | 39 | He began his career in Nashville in the late 1950s and has since released numerous albums, including a greatest hits collection in 1999. 40 | - He began his career in Nashville. 41 | - He began his career in the late 1950s. 42 | - He began his career in Nashville in the late 1950s. 43 | - Since he began his career, he has released numerous albums. 44 | - Since he began his career, he has released a greatest hits collection. 45 | - Since he began his career, he has released a greatest hits collection in 1999. 
46 | 47 | He has been producing music since the early 2000s, and has been featured on various albums and compilations, including the Grammy Award-winning album The Emancipation of Mimi, by Mariah Carey. 48 | - He has been producing music. 49 | - He has been producing music since the early 2000s. 50 | - He has been featured on various albums. 51 | - He has been featured on compilations. 52 | - The Emancipation of Mimi is by Mariah Carey. 53 | - The Emancipation of Mimi is the Grammy Award-winning album. 54 | - He has been featured on The Emancipation of Mimi. 55 | 56 | He played college football for the University of Oregon, where he was an All-Pac-12 selection and was named to the All-America team in 2016. 57 | - He played college football. 58 | - He played college football for the University of Oregon. 59 | - He was an All-Pac-12 selection. 60 | - He was an All-Pac-12 selection at the University of Oregon. 61 | - He was named to the All-America team. 62 | - He was named to the All-America team in 2016. 63 | - He was named to the All-America team in 2016 at the University of Oregon. 64 | 65 | He is also the author of the book “The Entrepreneur’s Journey: From Idea to IPO”, which chronicles his experience as an investor and entrepreneur. 66 | - “The Entrepreneur’s Journey: From Idea to IPO” is a book. 67 | - He is the author of the book “The Entrepreneur’s Journey: From Idea to IPO”. 68 | - “The Entrepreneur’s Journey: From Idea to IPO” chronicles his experience. 69 | - “The Entrepreneur’s Journey: From Idea to IPO” chronicles his experience as an investor. 70 | - “The Entrepreneur’s Journey: From Idea to IPO” chronicles his experience as an entrepreneur. 71 | 72 | He is an internationally renowned scientist who has made major contributions to the field of membrane trafficking, a fundamental process in cells that is essential for normal physiology and disease. 73 | - He is an internationally renowned scientist. 74 | - He has made major contributions to the field of membrane trafficking. 75 | - Membrane trafficking is a fundamental process in cells. 76 | - Membrane trafficking is essential for normal physiology. 77 | - Membrane trafficking is essential for normal disease. 78 | 79 | Patrick has also been a keynote speaker at numerous industry events and conferences and is a highly sought-after mentor for aspiring entrepreneurs. 80 | - Patrick has been keynote speaker. 81 | - Patrick has also been a keynote speaker at numerous industry events. 82 | - Patrick has also been a keynote speaker at numerous industry conferences. 83 | - Patrick is a highly sought-after mentor. 84 | - Patrick is a highly sought-after mentor for aspiring entrepreneurs. 85 | 86 | He began practicing law in Romney, West Virginia and was elected to the Virginia House of Delegates in 1823, where he served until 1827. 87 | - He began practicing law in Romney, West Virginia. 88 | - He was elected to the Virginia House of Delegates. 89 | - He was elected to the Virginia House of Delegates in 1823. 90 | - He served in the Virginia House of Delegates. 91 | - He served in the Virginia House of Delegates until 1827. 92 | 93 | He has been performing since the age of 8, when he joined a band in his hometown of Guadalajara and has since gone on to record six studio albums and several singles of his own original material. 94 | - He has been performing. 95 | - He has been performing since the age of 8. 96 | - He joined a band. 97 | - His hometown is Guadalajara. 98 | - He joined a band in his hometown. 
99 | - He joined a band in his hometown of Guadalajara at the age of 8. 100 | - He has gone on to record six studio albums. 101 | - He has gone on to record six studio albums at the age of 8. 102 | - He has gone on to record several singles of his own original material. 103 | - He has gone on to record several singles of his own original material at the age of 8. 104 | 105 | He started his professional career in Ghana, playing for Berekum Arsenal, Heart of Lions and Bechem United before moving to the United States in 2012. 106 | - He started his professional career in Ghana. 107 | - He played for Berekum Arsenal. 108 | - He played for Berekum Arsenal before moving to the United States. 109 | - He played for Berekum Arsenal before moving to the United States in 2021. 110 | - He played for Heart of Lions. 111 | - He played for Heart of Lions before moving to the United States. 112 | - He played for Heart of Lions before moving to the United States in 2021. 113 | - He played for Bechem United. 114 | - He played for Bechem United before moving to the United States. 115 | - He played for Bechem United before moving to the United States in 2021. 116 | 117 | His style is heavily influenced by modern classical, electronic, and ambient music, while also drawing from jazz, hip-hop, and world music. 118 | - His style is heavily influenced. 119 | - His style is heavily influenced by modern classical music. 120 | - His style is heavily influenced by modern electronic music. 121 | - His style is heavily influenced by modern ambient music. 122 | - His style draws from jazz. 123 | - His style draws from hip-hop. 124 | - His style draws from world music. 125 | 126 | He rose to prominence in 1600, when he was appointed to lead the Western Army in the Battle of Sekigahara, where he defeated the Eastern Army of Tokugawa Ieyasu, who would later become the first shogun of the Edo period. 127 | - He rose to prominence. 128 | - He rose to prominence in 1600. 129 | - He was appointed to lead the Western Army. 130 | - He was appointed to lead the Western Army in the Battle of Sekigahara. 131 | - He was appointed to lead the Western Army in the Battle of Sekigahara in 1600. 132 | - He defeated the Eastern Army of Tokugawa Ieyasu. 133 | - He defeated the Eastern Army of Tokugawa Ieyasu in the Battle of Sekigahara. 134 | - Tokugawa Ieyasu became the first shogun of the Edo period. 135 | - Tokugawa Ieyasu became the first shogun of the Edo period after 1600. 136 | 137 | He began his career at Manchester United, for whom he made his professional debut in 2011, and made 11 appearances over four seasons. 138 | - He began his career at Manchester United. 139 | - He made his professional debut in 2011. 140 | - He made his professional debut in 2011 at Manchester United. 141 | - He made 11 appearances. 142 | - He made 11 appearances over four seasons. 143 | - He made 11 appearances over four seasons at Manchester United. 
144 | -------------------------------------------------------------------------------- /src/factscore_package/.cache/demos/demons.json: -------------------------------------------------------------------------------- 1 | {"He made his acting debut in the film The Moon is the Sun's Dream (1992), and continued to appear in small and supporting roles throughout the 1990s.": ["He made his acting debut in the film.", "He made his acting debut in The Moon is the Sun's Dream.", "The Moon is the Sun's Dream is a film.", "The Moon is the Sun's Dream was released in 1992.", "After his acting debut, he appeared in small and supporting roles.", "After his acting debut, he appeared in small and supporting roles throughout the 1990s."], "He is also a successful producer and engineer, having worked with a wide variety of artists, including Willie Nelson, Tim McGraw, and Taylor Swift.": ["He is successful.", "He is a producer.", "He is a engineer.", "He has worked with a wide variety of artists.", "Willie Nelson is an artist.", "He has worked with Willie Nelson.", "Tim McGraw is an artist.", "He has worked with Tim McGraw.", "Taylor Swift is an artist.", "He has worked with Taylor Swift."], "In 1963, Collins became one of the third group of astronauts selected by NASA and he served as the back-up Command Module Pilot for the Gemini 7 mission.": ["Collins became an astronaut.", "Collins became one of the third group of astronauts.", "Collins became one of the third group of astronauts selected.", "Collins became one of the third group of astronauts selected by NASA.", "Collins became one of the third group of astronauts selected by NASA in 1963.", "He served as the Command Module Pilot.", "He served as the back-up Command Module Pilot.", "He served as the Command Module Pilot for the Gemini 7 mission."], "In addition to his acting roles, Bateman has written and directed two short films and is currently in development on his feature debut.": ["Bateman has acting roles.", "Bateman has written two short films.", "Bateman has directed two short films.", "Bateman has written and directed two short films.", "Bateman is currently in development on his feature debut."], "Michael Collins (born October 31, 1930) is a retired American astronaut and test pilot who was the Command Module Pilot for the Apollo 11 mission in 1969.": ["Michael Collins was born on October 31, 1930.", "Michael Collins is retired.", "Michael Collins is an American.", "Michael Collins was an astronaut.", "Michael Collins was a test pilot.", "Michael Collins was the Command Module Pilot.", "Michael Collins was the Command Module Pilot for the Apollo 11 mission.", "Michael Collins was the Command Module Pilot for the Apollo 11 mission in 1969."], "He was an American composer, conductor, and musical director.": ["He was an American.", "He was a composer.", "He was a conductor.", "He was a musical director."], "She currently stars in the romantic comedy series, Love and Destiny, which premiered in 2019.": ["She currently stars in Love and Destiny.", "Love and Destiny is a romantic comedy series.", "Love and Destiny premiered in 2019. 
"], "His music has been described as a mix of traditional Mexican and Latin American styles, as well as jazz, folk, and rock.": ["His music has been described as a mix.", "His music has been described as a mix of traditional Mexican, Latin American styles, as well as jazz, folk, and rock."], "He also serves as an ambassador for the charity Leonard Cheshire Disability.": ["He serves as an ambassador.", "He serves as an ambassador for Leonard Cheshire Disability.", "Leonard Cheshire Disability is a charity."], "He began his career in Nashville in the late 1950s and has since released numerous albums, including a greatest hits collection in 1999.": ["He began his career in Nashville.", "He began his career in the late 1950s.", "He began his career in Nashville in the late 1950s.", "Since he began his career, he has released numerous albums.", "Since he began his career, he has released a greatest hits collection.", "Since he began his career, he has released a greatest hits collection in 1999."], "He has been performing since the age of 8, when he joined a band in his hometown of Guadalajara and has since gone on to record six studio albums and several singles of his own original material.": ["He has been performing since the age of 8.", "He joined a band.", "His hometown is Guadalajara.", "He joined a band in his hometown.", "He joined a band in his hometown of Guadalajara at the age of 8.", "He has gone on to record six studio albums.", "He has gone on to record six studio albums at the age of 8.", "He has gone on to record several singles of his own original material.", "He has gone on to record several singles of his own original material at the age of 8."], "She is also the former President of the Malaysian Chinese Association (MCA) from 2010 to 2013.": ["She is the former President.", "She is also the former President of the Malaysian Chinese Association (MCA)", "She is also the former President of the Malaysian Chinese Association (MCA) from 2010 to 2013."], "During his professional career, McCoy played for the Broncos, the San Diego Chargers, the Minnesota Vikings, and the Jacksonville Jaguars.": ["McCoy played for the Broncos.", "McCoy played for the Broncos during his professional career.", "McCoy played for the San Diego Chargers.", "McCoy played for the San Diego Chargers during his professional career.", "McCoy played for the Minnesota Vikings.", "McCoy played for the Minnesota Vikings during his professional career.", "McCoy played for the Jacksonville Jaguars.", "McCoy played for the Jacksonville Jaguars during his professional career."], "Miller has been described as the architect of Trump's controversial immigration policies, and has previously worked for Alabama Senator Jeff Sessions on immigration issues.": ["Miller has been described as the architect.", "Miller has been described as the architect of Trump's controversial immigration policies.", "Miller has previously worked for Alabama Senator Jeff Sessions.", "Miller has previously worked for Alabama Senator Jeff Sessions on immigration issues."], "Her work is often described as whimsical and dreamlike.": ["Her work is often described as whimsical.", "Her work is often described as dreamlike."], "He graduated from the United States Military Academy in 1952, and then went on to serve in the United States Air Force.": ["He graduated from the United States Military Academy.", "He graduated from the United States Military Academy in 1952.", "He went on to serve in the United States Air Force.", "He went on to serve in the 
United States Air Force after he graduated from the United States Military Academy."], "He is best known for his roles in the films Memories of Murder (2003), The Host (2006), (...) and Parasite (2019).": ["One of his best known roles is in Memories of Murder.", "Memories of Murder is a film.", "Memories of Murder was released in 2003.", "One of his best known roles is in The Host.", "The Host is a film.", "The Host was released in 2006.", "One of his best known roles is in Parasite.", "Parasite is a film.", "Parasite was released in 2019."], "Song Kang-ho was born in Gongju, South Korea in 1967.": ["Song Kang-ho was born in Gongju.", "Song Kang-ho was born in South Korea.", "Song Kang-ho was born in 1967."], "He studied theater at Chung-Ang University in Seoul.": ["He studied theater.", "He studied at Chung-Ang University.", "He studied at Chung-Ang University in Seoul."], "His breakthrough came with the leading role in the acclaimed crime-drama film Memories of Murder in 2003.": ["His breakthrough came with Memories of Murder.", "He was the leading role in Memories of Murder.", "Memories of Murder was released in 2003.", "Memories of Murder is a film.", "Memories of Murder is an acclaimed crime-drama film."], "This was followed by the monster movie The Host in 2006, which became the highest-grossing film in Korean history at the time.": ["This was followed by The Host.", "The Host is the movie.", "The Host is a monster movie.", "The Host was released in 2006.", "The Host became the highest-grossing film in Korean history at the time.", "The Host is not the highest-grossing film in Korean history anymore."]} 2 | -------------------------------------------------------------------------------- /src/factscore_package/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-FinAI/PIXIU/da45ac467ca3a828621315881e33c68bca3bbb1f/src/factscore_package/__init__.py -------------------------------------------------------------------------------- /src/factscore_package/abstain_detection.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import re 3 | 4 | invalid_ppl_mentions = [ 5 | "I could not find any information", 6 | "The search results do not provide", 7 | "There is no information", 8 | "There are no search results", 9 | "there are no provided search results", 10 | "not provided in the search results", 11 | "is not mentioned in the provided search results", 12 | "There seems to be a mistake in the question", 13 | "Not sources found", 14 | "No sources found", 15 | "Try a more general question" 16 | ] 17 | 18 | def remove_citation(text): 19 | # text = re.sub(r'\[\d+\]', '', text) 20 | text = re.sub(r"\s*\[\d+\]\s*","", text) 21 | if text.startswith("According to , "): 22 | text = text.replace("According to , ", "According to the search results, ") 23 | return text 24 | 25 | def is_invalid_ppl(text): 26 | return np.any([text.lower().startswith(mention.lower()) for mention in invalid_ppl_mentions]) 27 | 28 | def is_invalid_paragraph_ppl(text): 29 | return len(text.strip())==0 or np.any([mention.lower() in text.lower() for mention in invalid_ppl_mentions]) 30 | 31 | def perplexity_ai_abstain_detect(generation): 32 | output = remove_citation(generation) 33 | if is_invalid_ppl(output): 34 | return True 35 | valid_paras = [] 36 | for para in output.split("\n\n"): 37 | if is_invalid_paragraph_ppl(para): 38 | break 39 | valid_paras.append(para.strip()) 40 | 41 | if 
len(valid_paras) == 0: 42 | return True 43 | else: 44 | return False 45 | 46 | def generic_abstain_detect(generation): 47 | return generation.startswith("I'm sorry") or "provide more" in generation 48 | 49 | def is_response_abstained(generation, fn_type): 50 | if fn_type == "perplexity_ai": 51 | return perplexity_ai_abstain_detect(generation) 52 | 53 | elif fn_type == "generic": 54 | return generic_abstain_detect(generation) 55 | 56 | else: 57 | return False 58 | -------------------------------------------------------------------------------- /src/factscore_package/atomic_facts.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import re 4 | import functools 5 | import string 6 | import spacy 7 | import sys 8 | import nltk 9 | import openai 10 | from rank_bm25 import BM25Okapi 11 | import os 12 | import time 13 | from nltk.tokenize import sent_tokenize 14 | 15 | from .openai_lm import OpenAIModel 16 | 17 | nltk.download("punkt") 18 | 19 | 20 | class AtomicFactGenerator(object): 21 | def __init__(self, key_path, demon_dir, gpt3_cache_file=None): 22 | self.nlp = spacy.load("en_core_web_sm") 23 | self.is_bio = True 24 | self.demon_path = os.path.join(demon_dir, "demons.json" if self.is_bio else "demons_complex.json") 25 | 26 | self.openai_lm = OpenAIModel("InstructGPT", cache_file=gpt3_cache_file, key=key_path) 27 | 28 | # get the demos from the path derived from demon_dir, not a hard-coded absolute path 29 | with open(self.demon_path, 'r') as f: 30 | self.demons = json.load(f) 31 | 32 | tokenized_corpus = [doc.split(" ") for doc in self.demons.keys()] 33 | self.bm25 = BM25Okapi(tokenized_corpus) 34 | 35 | def save_cache(self): 36 | self.openai_lm.save_cache() 37 | 38 | def run(self, generation, cost_estimate=None): 39 | """Convert the generation into a set of atomic facts. 
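When cost_estimate is None, returns (atomic_facts_pairs, para_breaks): one (sentence, facts) pair per sentence plus the indices of paragraph breaks.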
Return a total words cost if cost_estimate != None.""" 40 | assert isinstance(generation, str), "generation must be a string" 41 | paragraphs = [para.strip() for para in generation.split("\n") if len(para.strip()) > 0] 42 | return self.get_atomic_facts_from_paragraph(paragraphs, cost_estimate=cost_estimate) 43 | 44 | def get_atomic_facts_from_paragraph(self, paragraphs, cost_estimate=None): 45 | sentences = [] 46 | para_breaks = [] 47 | for para_idx, paragraph in enumerate(paragraphs): 48 | if para_idx > 0: 49 | para_breaks.append(len(sentences)) 50 | 51 | initials = detect_initials(paragraph) 52 | 53 | curr_sentences = sent_tokenize(paragraph) 54 | curr_sentences_2 = sent_tokenize(paragraph) 55 | 56 | curr_sentences = fix_sentence_splitter(curr_sentences, initials) 57 | curr_sentences_2 = fix_sentence_splitter(curr_sentences_2, initials) 58 | 59 | # checking this, just to ensure the credibility of the sentence splitter fixing algorithm 60 | assert curr_sentences == curr_sentences_2, (paragraph, curr_sentences, curr_sentences_2) 61 | 62 | sentences += curr_sentences 63 | 64 | atoms_or_estimate = self.get_init_atomic_facts_from_sentence([sent for i, sent in enumerate(sentences) if not (not self.is_bio and ( \ 65 | (i==0 and (sent.startswith("Sure") or sent.startswith("Here are"))) or \ 66 | (i==len(sentences)-1 and (sent.startswith("Please") or sent.startswith("I hope") or sent.startswith("Here are")))))], cost_estimate=cost_estimate) 67 | 68 | if cost_estimate: 69 | return atoms_or_estimate 70 | else: 71 | atoms = atoms_or_estimate 72 | 73 | atomic_facts_pairs = [] 74 | for i, sent in enumerate(sentences): 75 | if not self.is_bio and ( \ 76 | (i==0 and (sent.startswith("Sure") or sent.startswith("Here are"))) or \ 77 | (i==len(sentences)-1 and (sent.startswith("Please") or sent.startswith("I hope") or sent.startswith("Here are")))): 78 | atomic_facts_pairs.append((sent, [])) 79 | elif self.is_bio and sent.startswith("This sentence does not contain any facts"): 80 | atomic_facts_pairs.append((sent, [])) 81 | elif sent.startswith("Sure") or sent.startswith("Please") or (i==0 and sent.startswith("Here are")): 82 | atomic_facts_pairs.append((sent, [])) 83 | else: 84 | atomic_facts_pairs.append((sent, atoms[sent])) 85 | 86 | # postprocess_atomic_facts will fix minor issues from InstructGPT 87 | # it is supposed to handle the sentence splitter issue too, but since here 88 | # we fixed the sentence splitter issue already, 89 | # the new para_breaks should be identical to the original para_breaks 90 | if self.is_bio: 91 | atomic_facts_pairs, para_breaks = postprocess_atomic_facts(atomic_facts_pairs, list(para_breaks), self.nlp) 92 | 93 | return atomic_facts_pairs, para_breaks 94 | 95 | 96 | def get_init_atomic_facts_from_sentence(self, sentences, cost_estimate=None): 97 | """Get the initial atomic facts from the sentences. 
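For each sentence, the prompt is built from n fixed demonstrations plus the top-k BM25 matches from the demonstration corpus.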
Return a total words cost if cost_estimate != None.""" 98 | 99 | is_bio = self.is_bio 100 | demons = self.demons 101 | 102 | k = 1 if is_bio else 0 103 | n = 7 if is_bio else 8 104 | 105 | prompts = [] 106 | prompt_to_sent = {} 107 | atoms = {} 108 | for sentence in sentences: 109 | if sentence in atoms: 110 | continue 111 | top_matches = best_demos(sentence, self.bm25, list(demons.keys()), k) 112 | prompt = "" 113 | 114 | for i in range(n): 115 | prompt = prompt + "Please breakdown the following sentence into independent facts: {}\n".format(list(demons.keys())[i]) 116 | for fact in demons[list(demons.keys())[i]]: 117 | prompt = prompt + "- {}\n".format(fact) 118 | prompt = prompt + "\n" 119 | 120 | for match in top_matches: 121 | prompt = prompt + "Please breakdown the following sentence into independent facts: {}\n".format(match) 122 | for fact in demons[match]: 123 | prompt = prompt + "- {}\n".format(fact) 124 | prompt = prompt + "\n" 125 | prompt = prompt + "Please breakdown the following sentence into independent facts: {}\n".format(sentence) 126 | prompts.append(prompt) 127 | prompt_to_sent[prompt] = sentence 128 | 129 | if cost_estimate: 130 | total_words_estimate = 0 131 | for prompt in prompts: 132 | if cost_estimate == "consider_cache" and (prompt.strip() + "_0") in self.openai_lm.cache_dict: 133 | continue 134 | total_words_estimate += len(prompt.split()) 135 | return total_words_estimate 136 | else: 137 | for prompt in prompts: 138 | outputs = self.openai_lm.generate(prompt) 139 | #print(outputs) 140 | output, _ = outputs 141 | atoms[prompt_to_sent[prompt]] = text_to_sentences(output) 142 | 143 | for key, value in demons.items(): 144 | if key not in atoms: 145 | atoms[key] = value 146 | 147 | return atoms 148 | 149 | 150 | def best_demos(query, bm25, demons_sents, k): 151 | tokenized_query = query.split(" ") 152 | top_matches = bm25.get_top_n(tokenized_query, demons_sents, k) 153 | return top_matches 154 | 155 | 156 | # transform InstructGPT output into sentences 157 | def text_to_sentences(text): 158 | sentences = text.split("- ")[1:] 159 | sentences = [sent.strip() for sent in sentences] 160 | if len(sentences) > 0: 161 | if sentences[-1][-1] != '.': 162 | sentences[-1] = sentences[-1] + '.' 
163 | else: 164 | sentences = [] 165 | return sentences 166 | 167 | 168 | def normalize_answer(s): 169 | """Lower text and remove punctuation, articles and extra whitespace.""" 170 | def remove_articles(text): 171 | regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) 172 | return re.sub(regex, ' ', text) 173 | def white_space_fix(text): 174 | return ' '.join(text.split()) 175 | def remove_punc(text): 176 | exclude = set(string.punctuation) 177 | return ''.join(ch for ch in text if ch not in exclude) 178 | def lower(text): 179 | return text.lower() 180 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 181 | 182 | MONTHS = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"] 183 | MONTHS = [m.lower() for m in MONTHS] 184 | 185 | def is_num(text): 186 | try: 187 | text = int(text) 188 | return True 189 | except Exception: 190 | return False 191 | 192 | def is_date(text): 193 | text = normalize_answer(text) 194 | for token in text.split(" "): 195 | if (not is_num(token)) and token not in MONTHS: 196 | return False 197 | return True 198 | 199 | def extract_numeric_values(text): 200 | pattern = r'\b\d+\b' # regular expression pattern for integers 201 | numeric_values = re.findall(pattern, text) # find all numeric values in the text 202 | return set(numeric_values) # deduplicate and return the values as a set of strings 203 | 204 | 205 | def detect_entities(text, nlp): 206 | doc = nlp(text) 207 | entities = set() 208 | 209 | def _add_to_entities(text): 210 | if "-" in text: 211 | for _text in text.split("-"): 212 | entities.add(_text.strip()) 213 | else: 214 | entities.add(text) 215 | 216 | 217 | for ent in doc.ents: 218 | # spacy often has errors with other types of entities 219 | if ent.label_ in ["DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL"]: 220 | 221 | if is_date(ent.text): 222 | _add_to_entities(ent.text) 223 | else: 224 | for token in ent.text.split(): 225 | if is_date(token): 226 | _add_to_entities(token) 227 | 228 | for new_ent in extract_numeric_values(text): 229 | if not np.any([new_ent in ent for ent in entities]): 230 | entities.add(new_ent) 231 | 232 | return entities 233 | 234 | def postprocess_atomic_facts(_atomic_facts, para_breaks, nlp): 235 | 236 | verbs = ["born.", " appointed.", " characterized.", " described.", " known.", " member.", " advocate.", "served.", "elected."] 237 | permitted_verbs = ["founding member."] 238 | 239 | atomic_facts = [] 240 | new_atomic_facts = [] 241 | new_para_breaks = [] 242 | 243 | for i, (sent, facts) in enumerate(_atomic_facts): 244 | sent = sent.strip() 245 | if len(sent.split())==1 and i not in para_breaks and i > 0: 246 | assert i not in para_breaks 247 | atomic_facts[-1][0] += " " + sent 248 | atomic_facts[-1][1] += facts 249 | else: 250 | if i in para_breaks: 251 | new_para_breaks.append(len(atomic_facts)) 252 | atomic_facts.append([sent, facts]) 253 | 254 | for i, (sent, facts) in enumerate(atomic_facts): 255 | entities = detect_entities(sent, nlp) 256 | covered_entities = set() 257 | # print (entities) 258 | new_facts = [] 259 | for i, fact in enumerate(facts): 260 | if any([fact.endswith(verb) for verb in verbs]) and not any([fact.endswith(verb) for verb in permitted_verbs]): 261 | if any([fact[:-1] in other_fact for j, other_fact in enumerate(facts) if j != i]): 262 | continue 263 | sent_entities = detect_entities(fact, nlp) 264 | covered_entities |= set([e for e in sent_entities if e in entities]) 265 | 
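# Entities that appear in a fact but not in the source sentence are mapped back
# to the sentence entity they prefix; if no such entity exists, the fact is dropped.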
new_entities = sent_entities - entities 266 | if len(new_entities) > 0: 267 | do_pass = False 268 | for new_ent in new_entities: 269 | pre_ent = None 270 | for ent in entities: 271 | if ent.startswith(new_ent): 272 | pre_ent = ent 273 | break 274 | if pre_ent is None: 275 | do_pass = True 276 | break 277 | fact = fact.replace(new_ent, pre_ent) 278 | covered_entities.add(pre_ent) 279 | if do_pass: 280 | continue 281 | if fact in new_facts: 282 | continue 283 | new_facts.append(fact) 284 | try: 285 | assert entities==covered_entities 286 | except Exception: 287 | new_facts = facts # there is a bug in spacy entity linker, so just go with the previous facts 288 | 289 | new_atomic_facts.append((sent, new_facts)) 290 | 291 | return new_atomic_facts, new_para_breaks 292 | 293 | def is_integer(s): 294 | try: 295 | s = int(s) 296 | return True 297 | except Exception: 298 | return False 299 | 300 | def detect_initials(text): 301 | pattern = r"[A-Z]\. ?[A-Z]\." 302 | match = re.findall(pattern, text) 303 | return [m for m in match] 304 | 305 | def fix_sentence_splitter(curr_sentences, initials): 306 | for initial in initials: 307 | if not np.any([initial in sent for sent in curr_sentences]): 308 | alpha1, alpha2 = [t.strip() for t in initial.split(".") if len(t.strip())>0] 309 | for i, (sent1, sent2) in enumerate(zip(curr_sentences, curr_sentences[1:])): 310 | if sent1.endswith(alpha1 + ".") and sent2.startswith(alpha2 + "."): 311 | # merge sentence i and i+1 312 | curr_sentences = curr_sentences[:i] + [curr_sentences[i] + " " + curr_sentences[i+1]] + curr_sentences[i+2:] 313 | break 314 | sentences = [] 315 | combine_with_previous = None 316 | for sent_idx, sent in enumerate(curr_sentences): 317 | if len(sent.split())<=1 and sent_idx==0: 318 | assert not combine_with_previous 319 | combine_with_previous = True 320 | sentences.append(sent) 321 | elif len(sent.split())<=1: 322 | assert sent_idx > 0 323 | sentences[-1] += " " + sent 324 | combine_with_previous = False 325 | elif sent[0].isalpha() and not sent[0].isupper() and sent_idx > 0: 326 | assert sent_idx > 0, curr_sentences 327 | sentences[-1] += " " + sent 328 | combine_with_previous = False 329 | elif combine_with_previous: 330 | assert sent_idx > 0 331 | sentences[-1] += " " + sent 332 | combine_with_previous = False 333 | else: 334 | assert not combine_with_previous 335 | sentences.append(sent) 336 | return sentences 337 | 338 | 339 | def main(): 340 | pass 341 | #generator = AtomicFactGenerator("api.key", "demos", gpt3_cache_dir=None) 342 | #atomic_facts, para_breaks = generator.run("Thierry Henry (born 17 August 1977) is a French professional football coach, pundit, and former player. He is considered one of the greatest strikers of all time, and one the greatest players of the Premier League history. He has been named Arsenal F.C's greatest ever player.\n\nHenry made his professional debut with Monaco in 1994 before signing for defending Serie A champions Juventus. However, limited playing time, coupled with disagreements with the club's hierarchy, led to him signing for Premier League club Arsenal for £11 million in 1999.") 343 | 344 | #print(atomic_facts) 345 | #print(para_breaks) 346 | 347 | if __name__ == "__main__": 348 | main() 349 | -------------------------------------------------------------------------------- /src/factscore_package/clm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 
3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import os 8 | import math 9 | import time 10 | import json 11 | import numpy as np 12 | import torch 13 | from tqdm import tqdm 14 | from collections import defaultdict 15 | from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig, AutoConfig 16 | from torch import cuda, bfloat16 17 | import json 18 | from huggingface_hub import login 19 | from transformers import AutoModelForCausalLM 20 | from transformers import LlamaTokenizer 21 | 22 | from .utils import convert_model_to_int8_on_gpu 23 | from .lm import LM 24 | 25 | class CLM(LM): 26 | def __init__(self, model_name, model_dir, cache_file=None): 27 | self.model_name = model_name 28 | self.model_dir = model_dir 29 | if cache_file: 30 | super().__init__(cache_file) 31 | 32 | def load_model(self): 33 | model_id = 'meta-llama/Llama-2-7b-chat-hf' 34 | 35 | bnb_config = BitsAndBytesConfig( 36 | load_in_4bit=True, 37 | bnb_4bit_quant_type='nf4', 38 | bnb_4bit_use_double_quant=True, 39 | bnb_4bit_compute_dtype=bfloat16 40 | ) 41 | 42 | # begin initializing HF items; read the access token from the environment rather than hard-coding it 43 | hf_auth = os.environ.get("HF_TOKEN") 44 | model_config = AutoConfig.from_pretrained( 45 | model_id, 46 | use_auth_token=hf_auth 47 | ) 48 | 49 | model = AutoModelForCausalLM.from_pretrained( 50 | model_id, 51 | trust_remote_code=True, 52 | config=model_config, 53 | quantization_config=bnb_config, 54 | device_map='auto', 55 | use_auth_token=hf_auth 56 | ) 57 | #model_name_or_path = "TheBloke/Llama-2-70B-chat-AWQ" 58 | #model = AutoAWQForCausalLM.from_quantized(model_name_or_path, fuse_layers=True, 59 | #trust_remote_code=False, safetensors=True) 60 | # 2. 
Tie the weights 61 | #model.tie_weights() 62 | 63 | tokenizer = AutoTokenizer.from_pretrained( 64 | model_id, 65 | use_auth_token=hf_auth 66 | ) 67 | 68 | self.model = model 69 | self.tokenizer = tokenizer 70 | #self.model = AutoModelForCausalLM.from_pretrained(self.model_dir) 71 | #self.model = convert_model_to_int8_on_gpu(self.model, device='auto') 72 | #self.tokenizer = LlamaTokenizer.from_pretrained(self.model_dir) 73 | 74 | def _generate(self, prompts, max_sequence_length=2048, max_output_length=128, 75 | end_if_newline=False, end_if_second_newline=False, verbose=False): 76 | is_single = type(prompts)==str 77 | if is_single: 78 | prompts = [prompts] 79 | 80 | input_ids = self.tokenizer(prompts).input_ids 81 | if verbose: 82 | input_ids = tqdm(input_ids) 83 | 84 | generations = [] 85 | scores = [] 86 | for curr_input_ids in input_ids: 87 | if len(curr_input_ids) > max_sequence_length - max_output_length: 88 | curr_input_ids = curr_input_ids[-(max_sequence_length - max_output_length):] 89 | curr_input_ids = torch.LongTensor([curr_input_ids]) 90 | gen_outputs = self.model.generate( 91 | curr_input_ids, 92 | max_length=curr_input_ids.shape[1]+max_output_length, 93 | return_dict_in_generate=True, 94 | output_scores=True 95 | ) 96 | gen_tokens = gen_outputs["sequences"] 97 | # saving the logits for the very first token 98 | gen_scores = gen_outputs["scores"][0][0].detach().cpu().numpy() 99 | gen = self.tokenizer.decode(gen_tokens[0, curr_input_ids.shape[-1]:]) 100 | 101 | if end_if_newline: 102 | gen = gen.split("\n")[0].strip() 103 | elif end_if_second_newline: 104 | gen = "\n".join(gen.split("\n")[:2]).strip() 105 | 106 | if verbose and len(generations)==0: 107 | print ("Input:", prompts[0]) 108 | print ("Prediction:", gen) 109 | 110 | if self.model_name.startswith("llama-sni"): 111 | gen = gen.split("</s>")[0] 112 | 113 | generations.append(gen) 114 | scores.append(gen_scores) 115 | 116 | assert len(generations)==len(prompts)==len(scores) 117 | if is_single: 118 | return generations[0], scores[0] 119 | 120 | return generations, scores 121 | 122 | -------------------------------------------------------------------------------- /src/factscore_package/demons.json: -------------------------------------------------------------------------------- 1 | {"He made his acting debut in the film The Moon is the Sun's Dream (1992), and continued to appear in small and supporting roles throughout the 1990s.": ["He made his acting debut in the film.", "He made his acting debut in The Moon is the Sun's Dream.", "The Moon is the Sun's Dream is a film.", "The Moon is the Sun's Dream was released in 1992.", "After his acting debut, he appeared in small and supporting roles.", "After his acting debut, he appeared in small and supporting roles throughout the 1990s."], "He is also a successful producer and engineer, having worked with a wide variety of artists, including Willie Nelson, Tim McGraw, and Taylor Swift.": ["He is successful.", "He is a producer.", "He is a engineer.", "He has worked with a wide variety of artists.", "Willie Nelson is an artist.", "He has worked with Willie Nelson.", "Tim McGraw is an artist.", "He has worked with Tim McGraw.", "Taylor Swift is an artist.", "He has worked with Taylor Swift."], "In 1963, Collins became one of the third group of astronauts selected by NASA and he served as the back-up Command Module Pilot for the Gemini 7 mission.": ["Collins became an astronaut.", "Collins became one of the third group of astronauts.", "Collins became one of the third group of astronauts 
selected.", "Collins became one of the third group of astronauts selected by NASA.", "Collins became one of the third group of astronauts selected by NASA in 1963.", "He served as the Command Module Pilot.", "He served as the back-up Command Module Pilot.", "He served as the Command Module Pilot for the Gemini 7 mission."], "In addition to his acting roles, Bateman has written and directed two short films and is currently in development on his feature debut.": ["Bateman has acting roles.", "Bateman has written two short films.", "Bateman has directed two short films.", "Bateman has written and directed two short films.", "Bateman is currently in development on his feature debut."], "Michael Collins (born October 31, 1930) is a retired American astronaut and test pilot who was the Command Module Pilot for the Apollo 11 mission in 1969.": ["Michael Collins was born on October 31, 1930.", "Michael Collins is retired.", "Michael Collins is an American.", "Michael Collins was an astronaut.", "Michael Collins was a test pilot.", "Michael Collins was the Command Module Pilot.", "Michael Collins was the Command Module Pilot for the Apollo 11 mission.", "Michael Collins was the Command Module Pilot for the Apollo 11 mission in 1969."], "He was an American composer, conductor, and musical director.": ["He was an American.", "He was a composer.", "He was a conductor.", "He was a musical director."], "She currently stars in the romantic comedy series, Love and Destiny, which premiered in 2019.": ["She currently stars in Love and Destiny.", "Love and Destiny is a romantic comedy series.", "Love and Destiny premiered in 2019. "], "His music has been described as a mix of traditional Mexican and Latin American styles, as well as jazz, folk, and rock.": ["His music has been described as a mix.", "His music has been described as a mix of traditional Mexican, Latin American styles, as well as jazz, folk, and rock."], "He also serves as an ambassador for the charity Leonard Cheshire Disability.": ["He serves as an ambassador.", "He serves as an ambassador for Leonard Cheshire Disability.", "Leonard Cheshire Disability is a charity."], "He began his career in Nashville in the late 1950s and has since released numerous albums, including a greatest hits collection in 1999.": ["He began his career in Nashville.", "He began his career in the late 1950s.", "He began his career in Nashville in the late 1950s.", "Since he began his career, he has released numerous albums.", "Since he began his career, he has released a greatest hits collection.", "Since he began his career, he has released a greatest hits collection in 1999."], "He has been performing since the age of 8, when he joined a band in his hometown of Guadalajara and has since gone on to record six studio albums and several singles of his own original material.": ["He has been performing since the age of 8.", "He joined a band.", "His hometown is Guadalajara.", "He joined a band in his hometown.", "He joined a band in his hometown of Guadalajara at the age of 8.", "He has gone on to record six studio albums.", "He has gone on to record six studio albums at the age of 8.", "He has gone on to record several singles of his own original material.", "He has gone on to record several singles of his own original material at the age of 8."], "She is also the former President of the Malaysian Chinese Association (MCA) from 2010 to 2013.": ["She is the former President.", "She is also the former President of the Malaysian Chinese Association (MCA)", "She is also the 
former President of the Malaysian Chinese Association (MCA) from 2010 to 2013."], "During his professional career, McCoy played for the Broncos, the San Diego Chargers, the Minnesota Vikings, and the Jacksonville Jaguars.": ["McCoy played for the Broncos.", "McCoy played for the Broncos during his professional career.", "McCoy played for the San Diego Chargers.", "McCoy played for the San Diego Chargers during his professional career.", "McCoy played for the Minnesota Vikings.", "McCoy played for the Minnesota Vikings during his professional career.", "McCoy played for the Jacksonville Jaguars.", "McCoy played for the Jacksonville Jaguars during his professional career."], "Miller has been described as the architect of Trump's controversial immigration policies, and has previously worked for Alabama Senator Jeff Sessions on immigration issues.": ["Miller has been described as the architect.", "Miller has been described as the architect of Trump's controversial immigration policies.", "Miller has previously worked for Alabama Senator Jeff Sessions.", "Miller has previously worked for Alabama Senator Jeff Sessions on immigration issues."], "Her work is often described as whimsical and dreamlike.": ["Her work is often described as whimsical.", "Her work is often described as dreamlike."], "He graduated from the United States Military Academy in 1952, and then went on to serve in the United States Air Force.": ["He graduated from the United States Military Academy.", "He graduated from the United States Military Academy in 1952.", "He went on to serve in the United States Air Force.", "He went on to serve in the United States Air Force after he graduated from the United States Military Academy."], "He is best known for his roles in the films Memories of Murder (2003), The Host (2006), (...) 
and Parasite (2019).": ["One of his best known roles is in Memories of Murder.", "Memories of Murder is a film.", "Memories of Murder was released in 2003.", "One of his best known roles is in The Host.", "The Host is a film.", "The Host was released in 2006.", "One of his best known roles is in Parasite.", "Parasite is a film.", "Parasite was released in 2019."], "Song Kang-ho was born in Gongju, South Korea in 1967.": ["Song Kang-ho was born in Gongju.", "Song Kang-ho was born in South Korea.", "Song Kang-ho was born in 1967."], "He studied theater at Chung-Ang University in Seoul.": ["He studied theater.", "He studied at Chung-Ang University.", "He studied at Chung-Ang University in Seoul."], "His breakthrough came with the leading role in the acclaimed crime-drama film Memories of Murder in 2003.": ["His breakthrough came with Memories of Murder.", "He was the leading role in Memories of Murder.", "Memories of Murder was released in 2003.", "Memories of Murder is a film.", "Memories of Murder is an acclaimed crime-drama film."], "This was followed by the monster movie The Host in 2006, which became the highest-grossing film in Korean history at the time.": ["This was followed by The Host.", "The Host is the movie.", "The Host is a monster movie.", "The Host was released in 2006.", "The Host became the highest-grossing film in Korean history at the time.", "The Host is not the highest-grossing film in Korean history anymore."]} 2 | -------------------------------------------------------------------------------- /src/factscore_package/download_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import subprocess 4 | import torch 5 | import tqdm 6 | import transformers 7 | 8 | 9 | def download_file(_id, dest, cache_dir): 10 | if os.path.exists(dest) or os.path.exists(os.path.join(cache_dir, dest)): 11 | print ("[Already exists] Skipping", dest) 12 | print ("If you want to download the file in another location, please specify a different path") 13 | return 14 | 15 | if os.path.exists(dest.replace(".zip", "")) or os.path.exists(os.path.join(cache_dir, dest.replace(".zip", ""))): 16 | print ("[Already exists] Skipping", dest) 17 | print ("If you want to download the file in another location, please specify a different path") 18 | return 19 | 20 | if "/" in dest: 21 | dest_dir = "/".join(dest.split("/")[:-1]) 22 | if not os.path.isdir(dest_dir): 23 | os.makedirs(dest_dir) 24 | else: 25 | dest_dir = "." 26 | 27 | if _id.startswith("https://"): 28 | command = """wget -O %s %s""" % (dest, _id) 29 | else: 30 | command = """wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=%s' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\\1\\n/p')&id=%s" -O %s && rm -rf /tmp/cookies.txt""" % (_id, _id, dest) 31 | 32 | ret_code = subprocess.run([command], shell=True) 33 | if ret_code.returncode != 0: 34 | print("Download {} ... [Failed]".format(dest)) 35 | else: 36 | print("Download {} ... [Success]".format(dest)) 37 | 38 | if dest.endswith(".zip"): 39 | command = """unzip %s -d %s && rm %s""" % (dest, dest_dir, dest) 40 | 41 | ret_code = subprocess.run([command], shell=True) 42 | if ret_code.returncode != 0: 43 | print("Unzip {} ... [Failed]".format(dest)) 44 | else: 45 | print("Unzip {} ... 
[Success]".format(dest)) 46 | 47 | 48 | 49 | def smart_tokenizer_and_embedding_resize(special_tokens_dict, tokenizer, model): 50 | """Resize tokenizer and embedding. 51 | Note: This is the unoptimized version that may make your embedding size not be divisible by 64. 52 | """ 53 | num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict) 54 | model.resize_token_embeddings(len(tokenizer)) 55 | 56 | if num_new_tokens > 0: 57 | input_embeddings = model.get_input_embeddings().weight.data 58 | output_embeddings = model.get_output_embeddings().weight.data 59 | 60 | input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True) 61 | output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True) 62 | 63 | input_embeddings[-num_new_tokens:] = input_embeddings_avg 64 | output_embeddings[-num_new_tokens:] = output_embeddings_avg 65 | 66 | 67 | def recover_instruct_llama(path_raw, output_path, device="cpu", test_recovered_model=False): 68 | """Heavily adapted from https://github.com/tatsu-lab/stanford_alpaca/blob/main/weight_diff.py.""" 69 | 70 | model_raw = transformers.AutoModelForCausalLM.from_pretrained( 71 | path_raw, 72 | device_map={"": torch.device(device)}, 73 | torch_dtype=torch.float32, 74 | low_cpu_mem_usage=True, 75 | ) 76 | model_recovered = transformers.AutoModelForCausalLM.from_pretrained( 77 | "kalpeshk2011/instruct-llama-7b-wdiff", 78 | device_map={"": torch.device(device)}, 79 | torch_dtype=torch.float32, 80 | low_cpu_mem_usage=True, 81 | ) 82 | 83 | tokenizer_raw = transformers.AutoTokenizer.from_pretrained(path_raw) 84 | if tokenizer_raw.pad_token is None: 85 | smart_tokenizer_and_embedding_resize( 86 | special_tokens_dict=dict(pad_token="[PAD]"), 87 | model=model_raw, 88 | tokenizer=tokenizer_raw, 89 | ) 90 | tokenizer_recovered = transformers.AutoTokenizer.from_pretrained("kalpeshk2011/instruct-llama-7b-wdiff") 91 | 92 | state_dict_recovered = model_recovered.state_dict() 93 | state_dict_raw = model_raw.state_dict() 94 | for key in tqdm.tqdm(state_dict_recovered): 95 | state_dict_recovered[key].add_(state_dict_raw[key]) 96 | 97 | if output_path is not None: 98 | model_recovered.save_pretrained(output_path) 99 | tokenizer_recovered.save_pretrained(output_path) 100 | 101 | if test_recovered_model: 102 | input_text = ( 103 | "Below is an instruction that describes a task. 
" 104 | "Write a response that appropriately completes the request.\r\n\r\n" 105 | "### Instruction:\r\nList three technologies that make life easier.\r\n\r\n### Response:" 106 | ) 107 | inputs = tokenizer_recovered(input_text, return_tensors="pt") 108 | out = model_recovered.generate(inputs=inputs.input_ids, max_new_tokens=100) 109 | output_text = tokenizer_recovered.batch_decode(out, skip_special_tokens=True)[0] 110 | output_text = output_text[len(input_text) :] 111 | print(f"Input: {input_text}\nCompletion: {output_text}") 112 | 113 | return model_recovered, tokenizer_recovered 114 | 115 | if __name__ == '__main__': 116 | 117 | parser = argparse.ArgumentParser() 118 | parser.add_argument('--data_dir', 119 | type=str, 120 | default=".cache/factscore") 121 | parser.add_argument('--model_dir', 122 | type=str, 123 | default=".cache/factscore") 124 | parser.add_argument('--llama_7B_HF_path', 125 | type=str, 126 | default=None) 127 | 128 | args = parser.parse_args() 129 | 130 | if not os.path.exists(args.model_dir): 131 | os.makedirs(args.model_dir) 132 | 133 | if not os.path.exists(args.data_dir): 134 | os.makedirs(args.data_dir) 135 | 136 | download_file("1IseEAflk1qqV0z64eM60Fs3dTgnbgiyt", "demos.zip", args.data_dir) 137 | download_file("1enz1PxwxeMr4FRF9dtpCPXaZQCBejuVF", "data.zip", args.data_dir) 138 | download_file("1mekls6OGOKLmt7gYtHs0WGf5oTamTNat", "enwiki-20230401.db", args.data_dir) 139 | 140 | if args.llama_7B_HF_path: 141 | recover_instruct_llama(args.llama_7B_HF_path, os.path.join(args.model_dir, "inst-llama-7B")) 142 | 143 | # download the roberta_stopwords.txt file 144 | subprocess.run(["wget https://raw.githubusercontent.com/shmsw25/FActScore/main/roberta_stopwords.txt"], shell=True) 145 | 146 | # move the files to the data directory 147 | subprocess.run(["mv demos %s" % args.data_dir], shell=True) 148 | subprocess.run(["mv enwiki-20230401.db %s" % args.data_dir], shell=True) 149 | -------------------------------------------------------------------------------- /src/factscore_package/en_core_web_sm-3.7.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-FinAI/PIXIU/da45ac467ca3a828621315881e33c68bca3bbb1f/src/factscore_package/en_core_web_sm-3.7.1.tar.gz -------------------------------------------------------------------------------- /src/factscore_package/factscorer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import string 3 | import json 4 | import numpy as np 5 | import os 6 | import logging 7 | 8 | from tqdm import tqdm 9 | from .abstain_detection import is_response_abstained 10 | from .atomic_facts import AtomicFactGenerator 11 | from .clm import CLM 12 | from .npm import NPM 13 | from .openai_lm import OpenAIModel 14 | from .retrieval import DocDB, Retrieval 15 | 16 | class FactScorer(object): 17 | 18 | def __init__(self, 19 | model_name="retrieval+ChatGPT", 20 | data_dir=".cache", 21 | model_dir=".cache", 22 | cache_dir=".cache", 23 | openai_key="api.key", 24 | cost_estimate="consider_cache", 25 | abstain_detection_type=None, 26 | batch_size=256): 27 | assert model_name in ["retrieval+llama", "retrieval+llama+npm", "retrieval+ChatGPT", "retrieval+GPT4", "npm", "retrieval+ChatGPT+npm"] 28 | self.model_name = model_name 29 | 30 | self.db = {} 31 | self.retrieval = {} 32 | self.npm = {} 33 | self.batch_size = batch_size # batch size for retrieval 34 | self.openai_key = openai_key 35 | self.abstain_detection_type = 
abstain_detection_type 36 | 37 | self.data_dir = data_dir 38 | self.cache_dir = cache_dir 39 | if not os.path.exists(cache_dir): 40 | os.makedirs(cache_dir) 41 | 42 | self.af_generator = None 43 | self.cost_estimate = cost_estimate 44 | 45 | if "llama" in model_name: 46 | self.lm = CLM("Llama2-7B-chat", 47 | model_dir='meta-llama/Llama-2-7b-chat-hf', 48 | cache_file=os.path.join(cache_dir, "Llama2-7B-chat.pkl")) 49 | #self.lm = CLM("inst-llama-7B", 50 | #model_dir=os.path.join(model_dir, "inst-llama-7B"), 51 | #cache_file=os.path.join(cache_dir, "inst-llama-7B.pkl")) 52 | elif "ChatGPT" in model_name: 53 | self.lm = OpenAIModel("ChatGPT", 54 | cache_file=os.path.join(cache_dir, "ChatGPT.pkl"), 55 | key=openai_key) 56 | elif "GPT4" in model_name: 57 | self.lm = OpenAIModel("GPT4", 58 | cache_file=os.path.join(cache_dir, "GPT4.pkl"), 59 | key=openai_key) 60 | else: 61 | self.lm = None 62 | 63 | def save_cache(self): 64 | if self.lm: 65 | self.lm.save_cache() 66 | if "npm" in self.model_name: 67 | for k, v in self.npm.items(): 68 | v.save_cache() 69 | for k, v in self.retrieval.items(): 70 | v.save_cache() 71 | 72 | def register_knowledge_source(self, name="enwiki-20230401", db_path=None, data_path=None): 73 | assert name not in self.retrieval, f"{name} already registered" 74 | if db_path is None: 75 | db_path = os.path.join(self.data_dir, f"{name}.db") 76 | 77 | if data_path is None: 78 | data_path = os.path.join(self.data_dir, f"{name}.jsonl") 79 | 80 | cache_path = os.path.join(self.cache_dir, f"retrieval-{name}.json") 81 | embed_cache_path = os.path.join(self.cache_dir, f"retrieval-{name}.pkl") 82 | 83 | self.db[name] = DocDB(db_path=db_path, data_path=data_path) 84 | self.retrieval[name] = Retrieval(self.db[name], cache_path, embed_cache_path, batch_size=self.batch_size) 85 | if "npm" in self.model_name: 86 | cache_path = os.path.join(self.cache_dir, f"bm25-{name}.json") 87 | embed_cache_path = os.path.join(self.cache_dir, f"bm25-{name}.pkl") 88 | self.npm[name] = NPM(Retrieval(self.db[name], cache_path, embed_cache_path, "bm25"), 89 | "npm-single", 90 | cache_file=os.path.join(self.cache_dir, f"npm-{name}.pkl")) 91 | 92 | 93 | def print_cost_estimates(self, total_words, task, model): 94 | # https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them 95 | # the number of tokens is roughly 4/3 of the number of words 96 | total_tokens = total_words * 4.0 / 3 97 | 98 | # https://openai.com/pricing 99 | # if we use davinci-003, the cost is $0.02 per 1000 tokens 100 | # if we use gpt-3.5-turbo, the cost is $0.002 per 1000 tokens 101 | if model == "davinci-003": 102 | rate = 0.02 103 | elif model == "gpt-3.5-turbo": 104 | rate = 0.002 105 | 106 | total_cost = total_tokens * rate / 1000 107 | 108 | # print the total words, tokens, and cost along with rate 109 | logging.critical("Estimated OpenAI API cost for %s ($%.3f per 1000 tokens): $%.2f for %d words and %d tokens" % (task, rate, total_cost, total_words, total_tokens)) 110 | 111 | def get_score(self, 112 | topics, 113 | generations, 114 | gamma=10, 115 | atomic_facts=None, 116 | knowledge_source=None, 117 | verbose=False): 118 | if knowledge_source is None: 119 | # use the default knowledge source 120 | knowledge_source = "enwiki-20230401" 121 | 122 | if knowledge_source not in self.retrieval: 123 | self.register_knowledge_source(knowledge_source) 124 | 125 | if type(topics)==type(generations)==str: 126 | topics = [topics] 127 | generations = [generations] 128 | else: 129 | assert type(topics)==type(generations)==list, 
"`topics` and `generations` should be lists." 130 | assert len(topics)==len(generations), "`topics` and `generations` should have the same length" 131 | 132 | if atomic_facts is not None: 133 | assert len(topics)==len(atomic_facts), "`topics` and `atomic_facts` should have the same length" 134 | else: 135 | if self.af_generator is None: 136 | self.af_generator = AtomicFactGenerator(key_path=self.openai_key, 137 | demon_dir=os.path.join(self.data_dir, "demos"), 138 | gpt3_cache_file=os.path.join(self.cache_dir, "InstructGPT.pkl")) 139 | 140 | # estimate the total cost of atomic fact generation 141 | total_words = 0 142 | for gen in generations: 143 | total_words += self.af_generator.run(gen, cost_estimate=self.cost_estimate) 144 | 145 | self.print_cost_estimates(total_words, task="atomic fact generation", model="davinci-003") 146 | 147 | if verbose: 148 | topics = tqdm(topics) 149 | 150 | atomic_facts = [] 151 | for topic, gen in zip(topics, generations): 152 | # optionally, first detect if the response is abstained 153 | response_abstained = is_response_abstained(gen, self.abstain_detection_type) 154 | if response_abstained: 155 | atomic_facts.append(None) 156 | continue 157 | # continue only when the response is not abstained 158 | curr_afs, _ = self.af_generator.run(gen) 159 | curr_afs = [fact for _, facts in curr_afs for fact in facts] 160 | if len(curr_afs)==0: 161 | atomic_facts.append(None) 162 | else: 163 | atomic_facts.append(curr_afs) 164 | if len(atomic_facts) % 10 == 0: 165 | self.af_generator.save_cache() 166 | 167 | assert len(atomic_facts)==len(topics) 168 | self.af_generator.save_cache() 169 | 170 | respond_ratio = np.mean([facts is not None for facts in atomic_facts]) 171 | 172 | if "ChatGPT" in self.model_name: 173 | # estimate the total cost of response generation 174 | total_words = 0 175 | for topic, generation, facts in zip(topics, generations, atomic_facts): 176 | if facts is not None: 177 | total_words += self._get_score(topic, generation, facts, knowledge_source, cost_estimate=self.cost_estimate) 178 | 179 | self.print_cost_estimates(total_words, task="factscore evaluation", model="gpt-3.5-turbo") 180 | 181 | if verbose: 182 | topics = tqdm(topics) 183 | 184 | scores = [] 185 | init_scores = [] 186 | decisions = [] 187 | for topic, generation, facts in zip(topics, generations, atomic_facts): 188 | if facts is None: 189 | decisions.append(None) 190 | else: 191 | decision = self._get_score(topic, generation, facts, knowledge_source) 192 | score = np.mean([d["is_supported"] for d in decision]) 193 | 194 | if gamma: 195 | init_scores.append(score) 196 | penalty = 1.0 if len(facts)>gamma else np.exp(1-gamma/len(facts)) 197 | score = penalty * score 198 | 199 | decisions.append(decision) 200 | scores.append(score) 201 | if len(scores) % 10 == 0: 202 | self.save_cache() 203 | 204 | self.save_cache() 205 | 206 | out = {"score": np.mean(scores), 207 | "respond_ratio": respond_ratio, 208 | "decisions": decisions, 209 | "num_facts_per_response": np.mean([len(d) for d in decisions if d is not None])} 210 | 211 | if gamma: 212 | out["init_score"] = np.mean(init_scores) 213 | 214 | return out 215 | 216 | def _get_score(self, topic, generation, atomic_facts, knowledge_source, cost_estimate=None): 217 | decisions = [] 218 | total_words = 0 219 | for atom in atomic_facts: 220 | atom = atom.strip() 221 | if self.lm: 222 | passages = self.retrieval[knowledge_source].get_passages(topic, atom, k=5) 223 | definition = "Answer the question about {} based on the given 
context.\n\n".format(topic) 224 | context = "" 225 | for psg_idx, psg in enumerate(reversed(passages)): 226 | context += "Title: {}\nText: {}\n\n".format(psg["title"], psg["text"].replace("", "").replace("", "")) 227 | definition += context.strip() 228 | if not definition[-1] in string.punctuation: 229 | definition += "." 230 | prompt = "{}\n\nInput: {} True or False?\nOutput:".format(definition.strip(), atom.strip()) 231 | 232 | if cost_estimate: 233 | if cost_estimate == "consider_cache" and (prompt.strip() + "_0") not in self.lm.cache_dict: 234 | total_words += len(prompt.split()) 235 | elif cost_estimate == "ignore_cache": 236 | total_words += len(prompt.split()) 237 | continue 238 | 239 | output = self.lm.generate(prompt) 240 | 241 | if type(output[1])==np.ndarray: 242 | # when logits are available 243 | logits = np.array(output[1]) 244 | assert logits.shape[0] in [32000, 32001] 245 | true_score = logits[5852] 246 | false_score = logits[7700] 247 | is_supported = true_score > false_score 248 | else: 249 | # when logits are unavailable 250 | generated_answer = output[0].lower() 251 | if "true" in generated_answer or "false" in generated_answer: 252 | if "true" in generated_answer and "false" not in generated_answer: 253 | is_supported = True 254 | elif "false" in generated_answer and "true" not in generated_answer: 255 | is_supported = False 256 | else: 257 | is_supported = generated_answer.index("true") > generated_answer.index("false") 258 | else: 259 | is_supported = all([keyword not in generated_answer.lower().translate(str.maketrans("", "", string.punctuation)).split() for keyword in ["not", "cannot", "unknown", "information"]]) 260 | 261 | else: 262 | is_supported = True 263 | 264 | if is_supported and "npm" in self.model_name: 265 | npprob = self.npm[knowledge_source].get_probabilty(topic, atom) 266 | is_supported = npprob > 0.3 267 | 268 | decisions.append({"atom": atom, "is_supported": is_supported}) 269 | 270 | if cost_estimate: 271 | return total_words 272 | else: 273 | return decisions 274 | 275 | if __name__ == '__main__': 276 | 277 | parser = argparse.ArgumentParser() 278 | parser.add_argument('--input_path', 279 | type=str, 280 | default="data/labeled/InstructGPT.jsonl") 281 | parser.add_argument('--model_name', 282 | type=str, 283 | default="retrieval+ChatGPT") 284 | parser.add_argument('--gamma', 285 | type=int, 286 | default=10, 287 | help="hyperparameter for length penalty") 288 | 289 | parser.add_argument('--openai_key', 290 | type=str, 291 | default="api.key") 292 | parser.add_argument('--data_dir', 293 | type=str, 294 | default=".cache/factscore/") 295 | parser.add_argument('--model_dir', 296 | type=str, 297 | default=".cache/factscore/") 298 | parser.add_argument('--cache_dir', 299 | type=str, 300 | default=".cache/factscore/") 301 | parser.add_argument('--knowledge_source', 302 | type=str, 303 | default=None) 304 | 305 | 306 | parser.add_argument('--cost_estimate', 307 | type=str, 308 | default="consider_cache", 309 | choices=["consider_cache", "ignore_cache"]) 310 | parser.add_argument('--abstain_detection_type', 311 | type=str, 312 | default=None, 313 | choices=["perplexity_ai", "generic", "none"]) 314 | parser.add_argument('--use_atomic_facts', 315 | action="store_true") 316 | parser.add_argument('--verbose', 317 | action="store_true", 318 | help="for printing out the progress bar") 319 | parser.add_argument('--print_rate_limit_error', 320 | action="store_true", 321 | help="for printing out rate limit error when using OpenAI keys") 322 | 
parser.add_argument('--n_samples', 323 | type=int, 324 | default=None) 325 | 326 | args = parser.parse_args() 327 | 328 | logging.basicConfig(format='%(asctime)s - %(name)s - %(message)s', 329 | datefmt='%m/%d/%Y %H:%M:%S', 330 | level=logging.ERROR if args.print_rate_limit_error else logging.CRITICAL) 331 | 332 | fs = FactScorer(model_name=args.model_name, 333 | data_dir=args.data_dir, 334 | model_dir=args.model_dir, 335 | cache_dir=args.cache_dir, 336 | openai_key=args.openai_key, 337 | cost_estimate=args.cost_estimate, 338 | abstain_detection_type=args.abstain_detection_type) 339 | 340 | tot = 0 341 | topics, generations, atomic_facts = [], [], [] 342 | with open(args.input_path) as f: 343 | for line in f: 344 | dp = json.loads(line) 345 | tot += 1 346 | if args.use_atomic_facts: 347 | assert "annotations" in dp, "You can specify `--use_atomic_facts` only when atomic facts are available in the input data already." 348 | if dp["annotations"] is None: 349 | continue 350 | topics.append(dp["topic"]) 351 | generations.append(dp["output"]) 352 | atomic_facts.append([atom["text"] for sent in dp["annotations"] for atom in sent["model-atomic-facts"]]) 353 | else: 354 | topics.append(dp["topic"]) 355 | generations.append(dp["output"]) 356 | if args.n_samples is not None and tot==args.n_samples: 357 | break 358 | out = fs.get_score(topics=topics, 359 | generations=generations, 360 | gamma=args.gamma, 361 | atomic_facts=atomic_facts if args.use_atomic_facts else None, 362 | knowledge_source=args.knowledge_source, 363 | verbose=args.verbose) 364 | logging.critical("FActScore = %.1f%%" % (100*out["score"])) 365 | if "init_score" in out: 366 | logging.critical("FActScore w/o length penalty = %.1f%%" % (100*out["init_score"])) 367 | logging.critical("Respond ratio = %.1f%%" % (100*out["respond_ratio"])) 368 | logging.critical("# Atomic facts per valid response = %.1f" % (out["num_facts_per_response"])) 369 | 370 | # Save out as a json file 371 | with open(args.input_path.replace(".jsonl", "_factscore_output.json"), 'w') as f: 372 | f.write(json.dumps(out) + "\n") 373 | 374 | -------------------------------------------------------------------------------- /src/factscore_package/lm.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import time 4 | from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig, AutoConfig 5 | from torch import cuda, bfloat16 6 | import json 7 | from huggingface_hub import login 8 | 9 | 10 | class LM(): 11 | 12 | def __init__(self, cache_file): 13 | self.cache_file = cache_file 14 | self.cache_dict = self.load_cache() 15 | self.model = None 16 | self.save_interval = 100 17 | self.add_n = 0 18 | 19 | def load_model(self): 20 | # load the model and put it as self.model 21 | raise NotImplementedError() 22 | #model_id = 'meta-llama/Llama-2-7b-chat-hf' 23 | 24 | #bnb_config = BitsAndBytesConfig( 25 | #load_in_4bit=True, 26 | #bnb_4bit_quant_type='nf4', 27 | #bnb_4bit_use_double_quant=True, 28 | #bnb_4bit_compute_dtype=bfloat16 29 | #) 30 | 31 | # begin initializing HF items; read the access token from the environment rather than hard-coding it 32 | #hf_auth = os.environ.get("HF_TOKEN") 33 | #model_config = AutoConfig.from_pretrained( 34 | # model_id, 35 | #use_auth_token=hf_auth 36 | #) 37 | 38 | #model = AutoModelForCausalLM.from_pretrained( 39 | #model_id, 40 | ##trust_remote_code=True, 41 | #config=model_config, 42 | #quantization_config=bnb_config, 43 | #device_map='auto', 44 | #use_auth_token=hf_auth 45 | 
#) 46 | #model_name_or_path = "TheBloke/Llama-2-70B-chat-AWQ" 47 | #model = AutoAWQForCausalLM.from_quantized(model_name_or_path, fuse_layers=True, 48 | #trust_remote_code=False, safetensors=True) 49 | # 2. Tie the weights 50 | #model.tie_weights() 51 | 52 | #tokenizer = AutoTokenizer.from_pretrained( 53 | #model_id, 54 | #use_auth_token=hf_auth 55 | #) 56 | # 3. Create the pipeline using the model with tied weights. 57 | #generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto") 58 | #self.model = generator 59 | 60 | 61 | def generate(self, prompt, sample_idx=0, max_sequence_length=2048, max_output_length=128): 62 | prompt = prompt.strip() # it's important not to end with a whitespace 63 | cache_key = f"{prompt}_{sample_idx}" 64 | 65 | if cache_key in self.cache_dict: 66 | return self.cache_dict[cache_key] 67 | 68 | if self.model is None: 69 | self.load_model() 70 | 71 | if prompt.endswith(" True or False?\nAnswer:"): 72 | generated = self._generate(prompt, max_sequence_length=max_sequence_length, max_output_length=1) 73 | else: 74 | generated = self._generate(prompt, max_sequence_length=max_sequence_length, max_output_length=max_output_length) 75 | 76 | self.cache_dict[cache_key] = generated 77 | self.add_n += 1 78 | return generated 79 | 80 | """ 81 | def _generate(self, prompt, max_output_length): 82 | if self.add_n % self.save_interval == 0: 83 | self.save_cache() 84 | generate_kwargs = dict(max_new_tokens=max_output_length, do_sample=True, temperature=0.5) 85 | output = self.model(prompt, **generate_kwargs) 86 | #print(output) 87 | output = output[0]['generated_text'][len(prompt):].strip() 88 | if "\n" in output: 89 | output = output[:output.index("\n")] 90 | return output 91 | """ 92 | def save_cache(self): 93 | if self.add_n == 0: 94 | return 95 | 96 | # load the latest cache first, since if there were other processes running in parallel, cache might have been updated 97 | for k, v in self.load_cache().items(): 98 | self.cache_dict[k] = v 99 | 100 | with open(self.cache_file, "wb") as f: 101 | pickle.dump(self.cache_dict, f) 102 | 103 | def load_cache(self, allow_retry=True): 104 | if os.path.exists(self.cache_file): 105 | while True: 106 | try: 107 | with open(self.cache_file, "rb") as f: 108 | cache = pickle.load(f) 109 | break 110 | except Exception: 111 | if not allow_retry: 112 | assert False 113 | print ("Pickle Error: Retry in 5sec...") 114 | time.sleep(5) 115 | else: 116 | cache = {} 117 | return cache 118 | 119 | 120 | 121 | -------------------------------------------------------------------------------- /src/factscore_package/npm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import time 4 | from collections import defaultdict 5 | from transformers import AutoModelForMaskedLM, AutoTokenizer 6 | 7 | from .lm import LM 8 | from .retrieval import Retrieval 9 | 10 | def softmax(x): 11 | return(np.exp(x - np.max(x)) / np.exp(x - np.max(x)).sum()) 12 | 13 | class NPM(LM): 14 | 15 | def __init__(self, bm25, model_name, cache_file): 16 | assert model_name.startswith("npm") 17 | self.bm25 = bm25 18 | self.model_name = model_name 19 | self.model = None 20 | 21 | self.tokenizer = AutoTokenizer.from_pretrained("facebook/" + self.model_name) 22 | self.mask_id = self.tokenizer.mask_token_id 23 | 24 | with open("roberta_stopwords.txt", "r") as f: 25 | self.stopwords = set() 26 | for line in f: 27 | self.stopwords.add(int(line.strip())) 28 | 29 | 
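# NPM scores an atomic fact nonparametrically: each content token of the fact is
# masked in turn, and its probability is estimated from the token embeddings of
# BM25-retrieved passages (see get_probabilty below).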
super().__init__(cache_file=cache_file) 30 | 31 | def load_model(self): 32 | self.model = AutoModelForMaskedLM.from_pretrained("facebook/" + self.model_name) 33 | self.model.cuda() 34 | self.model.eval() 35 | 36 | def save_cache(self): 37 | super().save_cache() 38 | self.bm25.save_cache() 39 | 40 | def tokenize(self, texts, skip_special_tokens=False, padding=True): 41 | assert type(texts)==list 42 | all_input_ids = self.tokenizer(texts)["input_ids"] 43 | if skip_special_tokens: 44 | for i, input_ids in enumerate(all_input_ids): 45 | assert input_ids[0]==0 and input_ids[-1]==2 46 | all_input_ids[i] = input_ids[1:-1] 47 | if not padding: 48 | return all_input_ids 49 | max_length = np.max([len(_ids) for _ids in all_input_ids]) 50 | _all_input_ids = [] 51 | _all_attention_mask = [] 52 | for i, input_ids in enumerate(all_input_ids): 53 | n_valid = len(input_ids) 54 | n_masks = max_length - n_valid 55 | _all_input_ids.append(input_ids + [0 for _ in range(n_masks)]) 56 | _all_attention_mask.append([1 for _ in range(n_valid)] + [0 for _ in range(n_masks)]) 57 | return torch.LongTensor(_all_input_ids), torch.LongTensor(_all_attention_mask) 58 | 59 | def decode(self, input_ids): 60 | return self.tokenizer.decode(input_ids) 61 | 62 | def encode(self, texts, skip_special_tokens=False, gt_input_ids=None): 63 | assert type(texts)==list 64 | if self.model is None: 65 | self.load_model() 66 | if gt_input_ids is not None: 67 | assert len(texts)==len(gt_input_ids) 68 | all_input_ids, all_attention_mask = self.tokenize(texts, skip_special_tokens=skip_special_tokens) 69 | 70 | with torch.no_grad(): 71 | outputs = self.model(all_input_ids.cuda(), 72 | all_attention_mask.cuda(), 73 | output_hidden_states=True, 74 | return_dict=True) 75 | all_logits = outputs["logits"].detach().cpu().numpy() 76 | all_hidden_states = outputs["hidden_states"][-1].detach().cpu().numpy() 77 | 78 | results = [] 79 | for i, (text, input_ids, logits, hidden_states) in enumerate(zip(texts, all_input_ids, all_logits, all_hidden_states)): 80 | input_ids = input_ids.numpy().tolist() 81 | if self.mask_id in input_ids: 82 | idx = input_ids.index(self.mask_id) 83 | assert gt_input_ids is not None 84 | prob = softmax(logits[idx])[gt_input_ids[i]] 85 | results.append((prob, hidden_states[idx])) 86 | else: 87 | _input_ids = [_id for _id in input_ids if _id not in [0, 2]] 88 | _hidden_states = [h for _id, h in zip(input_ids, hidden_states) if _id not in [0, 2]] 89 | results.append((_input_ids, _hidden_states)) 90 | 91 | return results 92 | 93 | def get_probabilty(self, topic, question): 94 | passages = self.bm25.get_passages(topic, question, k=3) 95 | passages = [p["text"].strip() for p in passages] 96 | cache_key = question + "#" + "#".join(passages) 97 | 98 | if cache_key not in self.cache_dict: 99 | encoded = self.encode(passages, skip_special_tokens=True) 100 | stacked_passage_tokens, stacked_passage_vectors = [], [] 101 | for input_ids, vectors in encoded: 102 | stacked_passage_tokens += input_ids 103 | if len(vectors)>0: 104 | stacked_passage_vectors.append(vectors) 105 | stacked_passage_vectors = np.concatenate(stacked_passage_vectors, 0) 106 | 107 | question_input_ids = self.tokenize(["Fact: " + question], skip_special_tokens=False, padding=False)[0] 108 | if 2 in question_input_ids: 109 | question_input_ids = question_input_ids[:question_input_ids.index(2)] 110 | question_input_ids = question_input_ids[1:] 111 | 112 | ''' 113 | triples = [] 114 | prefix = True 115 | for i, input_id in enumerate(question_input_ids): 116 | if prefix: 117 | 
if input_id==35: # the end of the "Fact:" prefix
118 |                         prefix = False
119 |                     continue
120 |                 if input_id in [0, 2] or input_id in self.stopwords:
121 |                     continue
122 |                 new_question = self.decode(question_input_ids[:i] + [self.mask_id] + question_input_ids[i+1:])
123 |                 prob, vector = self.encode(new_question, gt_input_id=input_id)
124 |                 triples.append((prob, vector, input_id))
125 |             '''
126 |             triples = []
127 |             batch = []
128 |             gt_input_ids = []
129 |             prefix = True
130 |             for i, input_id in enumerate(question_input_ids):
131 |                 if prefix:
132 |                     if input_id==35: # the end of the "Fact:" prefix
133 |                         prefix = False
134 |                     continue
135 |                 if input_id in [0, 2] or input_id in self.stopwords:
136 |                     continue
137 |                 batch.append(self.decode(question_input_ids[:i] + [self.mask_id] + question_input_ids[i+1:]))
138 |                 gt_input_ids.append(input_id)
139 |             for (prob, vector), gt_input_id in zip(self.encode(batch, gt_input_ids=gt_input_ids), gt_input_ids):
140 |                 triples.append((prob, vector, gt_input_id))
141 |
142 |             stacked_question_vectors = np.stack([v for _, v, _ in triples], 0)
143 |             all_scores = np.exp(np.inner(stacked_question_vectors, stacked_passage_vectors) / np.sqrt(stacked_passage_vectors.shape[-1]))
144 |
145 |             probs = []
146 |             for (softmax_prob, vector, input_id), scores in zip(triples, all_scores):
147 |                 assert len(stacked_passage_tokens)==len(scores)
148 |                 if input_id not in stacked_passage_tokens:
149 |                     probs.append(0)
150 |                 else:
151 |                     aggregated_scores = defaultdict(list)
152 |                     for token, score in zip(stacked_passage_tokens, scores):
153 |                         aggregated_scores[token].append(score)
154 |                     tot = np.sum([np.sum(v) for v in aggregated_scores.values()])
155 |                     prob = np.sum(aggregated_scores[input_id]) / tot
156 |                     probs.append(prob)
157 |
158 |             self.cache_dict[cache_key] = np.mean(probs)
159 |             self.add_n += 1
160 |
161 |         return self.cache_dict[cache_key]
162 |
163 |
164 |
165 |
166 |
--------------------------------------------------------------------------------
/src/factscore_package/openai_lm.py:
--------------------------------------------------------------------------------
1 | from .lm import LM
2 | from openai import OpenAI
3 | import openai
4 | import sys
5 | import time
6 | import os
7 | import numpy as np
8 | import logging
9 |
10 | #os.environ["http_proxy"] = "http://localhost:27890"
11 | #os.environ["https_proxy"] = "http://localhost:27890"
12 |
13 | class OpenAIModel(LM):
14 |
15 |     def __init__(self, model_name, cache_file=None, key=""):
16 |         self.model_name = model_name
17 |         self.temp = 0.7
18 |         self.save_interval = 100
19 |         self.client = OpenAI(api_key=key.strip())
20 |         super().__init__(cache_file)
21 |
22 |     def load_model(self):
23 |         pass
24 |         # load api key
25 |         #key_path = self.key_path
26 |         #assert os.path.exists(key_path), f"Please place your OpenAI API Key in {key_path}."
27 |         #with open(key_path, 'r') as f:
28 |             #api_key = f.readline()
29 |         #self.client = OpenAI(api_key=api_key.strip())
30 |         #self.model = self.model_name
31 |
32 |     def _generate(self, prompt, max_sequence_length=2048, max_output_length=128):
33 |         if self.add_n % self.save_interval == 0:
34 |             self.save_cache()
35 |         # return a tuple of string (generated text) and metadata (any format)
36 |         # This generates a response for the prompt, independent of the downstream application
37 |         if self.model_name == "ChatGPT":
38 |             # Construct the prompt sent to ChatGPT
39 |             message = [{"role": "user", "content": prompt}]
40 |             # Call API
41 |             response = self.call_ChatGPT(message, temp=self.temp, max_len=max_sequence_length)
42 |             # Get the output from the response
43 |             output = response.choices[0].message.content
44 |             return output, response
45 |         elif self.model_name == "InstructGPT":
46 |             # Call API
47 |             response = self.call_GPT3(prompt, temp=self.temp)
48 |             # Get the output from the response
49 |             output = response.choices[0].text
50 |             return output, response
51 |         else:
52 |             raise NotImplementedError()
53 |
54 |     def call_ChatGPT(self, message, model_name="gpt-3.5-turbo", max_len=1024, temp=0.7, verbose=False):
55 |         # call the chat completions API until a result is returned
56 |         response = None
57 |         received = False
58 |         num_rate_errors = 0
59 |         while not received:
60 |             try:
61 |                 response = self.client.chat.completions.create(model=model_name,
62 |                                                                messages=message,
63 |                                                                max_tokens=max_len,
64 |                                                                temperature=temp)
65 |                 received = True
66 |             except Exception:
67 |                 # print(message)
68 |                 num_rate_errors += 1
69 |                 error = sys.exc_info()[0]
70 |                 if error == openai.BadRequestError:  # renamed from InvalidRequestError in openai>=1.0
71 |                     # something is wrong: e.g. prompt too long
72 |                     logging.critical(f"BadRequestError\nPrompt passed in:\n\n{message}\n\n")
73 |                     assert False
74 |
75 |                 logging.error("API error: %s (%d). Waiting %dsec" % (error, num_rate_errors, np.power(2, num_rate_errors)))
76 |                 time.sleep(np.power(2, num_rate_errors))
77 |         return response
78 |
79 |
80 |     def call_GPT3(self, prompt, model_name="text-davinci-003", max_len=512, temp=0.7, num_log_probs=0, echo=False, verbose=False):
81 |         # call the legacy completions API until a result is returned
82 |         response = None
83 |         received = False
84 |         num_rate_errors = 0
85 |         while not received:
86 |             try:
87 |                 response = self.client.completions.create(model="gpt-3.5-turbo-instruct",  # hardcoded; the model_name parameter is ignored
88 |                                                           prompt=prompt,
89 |                                                           max_tokens=max_len,
90 |                                                           temperature=temp,
91 |                                                           logprobs=num_log_probs,
92 |                                                           echo=echo)
93 |                 received = True
94 |             except Exception:
95 |                 error = sys.exc_info()[0]
96 |                 num_rate_errors += 1
97 |                 if error == openai.BadRequestError:  # renamed from InvalidRequestError in openai>=1.0
98 |                     # something is wrong: e.g. prompt too long
99 |                     logging.critical(f"BadRequestError\nPrompt passed in:\n\n{prompt}\n\n")
100 |                     assert False
101 |                 logging.error("API error: %s (%d)" % (error, num_rate_errors))
102 |                 time.sleep(np.power(2, num_rate_errors))
103 |         return response
104 |
--------------------------------------------------------------------------------
/src/factscore_package/retrieval.py:
--------------------------------------------------------------------------------
1 | import json
2 | import time
3 | import os
4 |
5 | import sqlite3
6 | import numpy as np
7 | import pickle as pkl
8 |
9 | from rank_bm25 import BM25Okapi
10 |
11 | SPECIAL_SEPARATOR = "####SPECIAL####SEPARATOR####"
12 | MAX_LENGTH = 256
13 |
14 | class DocDB(object):
15 |     """Sqlite backed document storage.
16 |     Implements get_text_from_title(title).
17 |     """
18 |
19 |     def __init__(self, db_path=None, data_path=None):
20 |         self.db_path = db_path
21 |         self.connection = sqlite3.connect(self.db_path, check_same_thread=False)
22 |
23 |         cursor = self.connection.cursor()
24 |         cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
25 |
26 |         if len(cursor.fetchall())==0:
27 |             assert data_path is not None, f"{self.db_path} is empty. Specify `data_path` in order to create a DB."
28 |             print (f"{self.db_path} is empty. Start building DB from {data_path}...")
29 |             self.build_db(self.db_path, data_path)
30 |
31 |     def __enter__(self):
32 |         return self
33 |
34 |     def __exit__(self, *args):
35 |         self.close()
36 |
37 |     def path(self):
38 |         """Return the path to the file that backs this database."""
39 |         return self.db_path
40 |
41 |     def close(self):
42 |         """Close the connection to the database."""
43 |         self.connection.close()
44 |
45 |     def build_db(self, db_path, data_path):
46 |         from transformers import RobertaTokenizer
47 |         tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
48 |
49 |         titles = set()
50 |         output_lines = []
51 |         tot = 0
52 |         start_time = time.time()
53 |         c = self.connection.cursor()
54 |         c.execute("CREATE TABLE documents (title PRIMARY KEY, text);")
55 |
56 |         with open(data_path, "r") as f:
57 |             for line in f:
58 |                 dp = json.loads(line)
59 |                 title = dp["title"]
60 |                 text = dp["text"]
61 |                 if title in titles:
62 |                     continue
63 |                 titles.add(title)
64 |                 if type(text)==str:
65 |                     text = [text]
66 |                 passages = [[]]
67 |                 for sent_idx, sent in enumerate(text):
68 |                     assert len(sent.strip())>0
69 |                     tokens = tokenizer(sent)["input_ids"]
70 |                     max_length = MAX_LENGTH - len(passages[-1])
71 |                     if len(tokens) <= max_length:
72 |                         passages[-1].extend(tokens)
73 |                     else:
74 |                         passages[-1].extend(tokens[:max_length])
75 |                         offset = max_length
76 |                         while offset < len(tokens):
77 |                             passages.append(tokens[offset:offset+MAX_LENGTH])
78 |                             offset += MAX_LENGTH
79 |
80 |                 psgs = [tokenizer.decode(tokens) for tokens in passages if np.sum([t not in [0, 2] for t in tokens])>0]
81 |                 text = SPECIAL_SEPARATOR.join(psgs)
82 |                 output_lines.append((title, text))
83 |                 tot += 1
84 |
85 |                 if len(output_lines) == 1000000:
86 |                     c.executemany("INSERT INTO documents VALUES (?,?)", output_lines)
87 |                     output_lines = []
88 |                     print ("Finished saving %dM documents (%dmin)" % (tot / 1000000, (time.time()-start_time)/60))
89 |
90 |         if len(output_lines) > 0:
91 |             c.executemany("INSERT INTO documents VALUES (?,?)", output_lines)
92 |             print ("Finished saving %dM documents (%dmin)" % (tot / 1000000, (time.time()-start_time)/60))
93 |
94 |         self.connection.commit()
95 |         self.connection.close()
96 |
97 |     def get_text_from_title(self, title):
98 |         """Fetch the passages of the document whose title is `title`."""
99 |         cursor = self.connection.cursor()
100 |         cursor.execute("SELECT text FROM documents WHERE title = ?", (title,))
101 |         results = cursor.fetchall()
102 |         results = [r for r in results]
103 |         cursor.close()
104 |         assert results is not None and len(results)==1, f"`topic` in your data ({title}) is likely not a valid title in the DB."
105 |         results = [{"title": title, "text": para} for para in results[0][0].split(SPECIAL_SEPARATOR)]
106 |         assert len(results)>0, f"`topic` in your data ({title}) is likely not a valid title in the DB."
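        # `results` now holds one dict per stored passage of this title, e.g.
        # (illustrative values only): [{"title": "Arbitrage", "text": "Arbitrage is ..."}, ...]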
107 |         return results
108 |
109 | class Retrieval(object):
110 |
111 |     def __init__(self, db, cache_path, embed_cache_path,
112 |                  retrieval_type="gtr-t5-large", batch_size=None):
113 |         self.db = db
114 |         self.cache_path = cache_path
115 |         self.embed_cache_path = embed_cache_path
116 |         self.retrieval_type = retrieval_type
117 |         self.batch_size = batch_size
118 |         assert retrieval_type=="bm25" or retrieval_type.startswith("gtr-")
119 |
120 |         self.encoder = None
121 |         self.load_cache()
122 |         self.add_n = 0
123 |         self.add_n_embed = 0
124 |
125 |     def load_encoder(self):
126 |         from sentence_transformers import SentenceTransformer
127 |         encoder = SentenceTransformer("sentence-transformers/" + self.retrieval_type)
128 |         encoder = encoder.cuda()
129 |         encoder = encoder.eval()
130 |         self.encoder = encoder
131 |         assert self.batch_size is not None
132 |
133 |     def load_cache(self):
134 |         if os.path.exists(self.cache_path):
135 |             with open(self.cache_path, "r") as f:
136 |                 self.cache = json.load(f)
137 |         else:
138 |             self.cache = {}
139 |         if os.path.exists(self.embed_cache_path):
140 |             with open(self.embed_cache_path, "rb") as f:
141 |                 self.embed_cache = pkl.load(f)
142 |         else:
143 |             self.embed_cache = {}
144 |
145 |     def save_cache(self):
146 |         if self.add_n > 0:
147 |             if os.path.exists(self.cache_path):  # merge entries other processes may have written since load
148 |                 with open(self.cache_path, "r") as f:
149 |                     new_cache = json.load(f)
150 |                 self.cache.update(new_cache)
151 |
152 |             with open(self.cache_path, "w") as f:
153 |                 json.dump(self.cache, f)
154 |
155 |         if self.add_n_embed > 0:
156 |             if os.path.exists(self.embed_cache_path):
157 |                 with open(self.embed_cache_path, "rb") as f:
158 |                     new_cache = pkl.load(f)
159 |                 self.embed_cache.update(new_cache)
160 |
161 |             with open(self.embed_cache_path, "wb") as f:
162 |                 pkl.dump(self.embed_cache, f)
163 |
164 |     def get_bm25_passages(self, topic, query, passages, k):
165 |         if topic in self.embed_cache:
166 |             bm25 = self.embed_cache[topic]
167 |         else:
168 |             bm25 = BM25Okapi([psg["text"].replace("<s>", "").replace("</s>", "").split() for psg in passages])  # strip boundary tokens left from decoding
169 |             self.embed_cache[topic] = bm25
170 |             self.add_n_embed += 1
171 |         scores = bm25.get_scores(query.split())
172 |         indices = np.argsort(-scores)[:k]
173 |         return [passages[i] for i in indices]
174 |
175 |     def get_gtr_passages(self, topic, retrieval_query, passages, k):
176 |         if self.encoder is None:
177 |             self.load_encoder()
178 |         if topic in self.embed_cache:
179 |             passage_vectors = self.embed_cache[topic]
180 |         else:
181 |             inputs = [psg["title"] + " " + psg["text"].replace("<s>", "").replace("</s>", "") for psg in passages]
182 |             passage_vectors = self.encoder.encode(inputs, batch_size=self.batch_size, device=self.encoder.device)
183 |             self.embed_cache[topic] = passage_vectors
184 |             self.add_n_embed += 1
185 |         query_vectors = self.encoder.encode([retrieval_query],
186 |                                             batch_size=self.batch_size,
187 |                                             device=self.encoder.device)[0]
188 |         scores = np.inner(query_vectors, passage_vectors)
189 |         indices = np.argsort(-scores)[:k]
190 |         return [passages[i] for i in indices]
191 |
192 |     def get_passages(self, topic, question, k):
193 |         retrieval_query = topic + " " + question.strip()
194 |         cache_key = topic + "#" + retrieval_query
195 |
196 |         if cache_key not in self.cache:
197 |             passages = self.db.get_text_from_title(topic)
198 |             if self.retrieval_type=="bm25":
199 |                 self.cache[cache_key] = self.get_bm25_passages(topic, retrieval_query, passages, k)
200 |             else:
201 |                 self.cache[cache_key] = self.get_gtr_passages(topic, retrieval_query, passages, k)
202 |             assert len(self.cache[cache_key]) in [k, len(passages)]
203 |             self.add_n += 1
204 |
205 |
206 |         return self.cache[cache_key]
207 |
208 |
209 |
210 |
211 |
--------------------------------------------------------------------------------
/src/factscore_package/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | import torch
7 |
8 | def assert_all_approx_close(a, b, rtol, atol, count):
9 |
10 |     idx = torch.isclose(a.float(), b.float(), rtol, atol)
11 |     sumval = (idx==0).sum().item()
12 |     if sumval > count:
13 |         print(f'Too many values not close: assert {sumval} < {count}')
14 |         try:
15 |             torch.testing.assert_allclose(a, b, rtol, atol)  # deprecated in newer torch; torch.testing.assert_close is the replacement
16 |         except Exception as e:
17 |             print(e)
18 |
19 |
20 | def get_memory_footprint(model, return_buffers=True):
21 |     """
22 |     Get the memory footprint of a model. This will return the memory footprint of the current model in bytes.
23 |     Useful to benchmark the memory footprint of the current model and design some tests. Solution inspired from the
24 |     PyTorch discussions: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822/2
25 |     Arguments:
26 |         return_buffers (`bool`, *optional*, defaults to `True`):
27 |             Whether to return the size of the buffer tensors in the computation of the memory footprint. Buffers
28 |             are tensors that do not require gradients and are not registered as parameters. E.g. mean and std in batch
29 |             norm layers. Please see: https://discuss.pytorch.org/t/what-pytorch-means-by-buffers/120266/2
30 |     """
31 |     mem = sum([param.nelement() * param.element_size() for param in model.parameters()])
32 |     if return_buffers:
33 |         mem_bufs = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])
34 |         mem = mem + mem_bufs
35 |     return mem
36 |
37 |
38 | def ـreplace_linear_with_int8linear(model, modules_to_not_convert="lm_head"):
39 |     for name, module in model.named_children():
40 |         ـreplace_linear_with_int8linear(module, modules_to_not_convert)
41 |
42 |         if isinstance(module, torch.nn.Linear) and name != modules_to_not_convert:
43 |             model._modules[name] = QuantizedLinearInt8(linear_layer=module)
44 |     return
45 |
46 |
47 | class QuantizedLinearInt8(torch.nn.Module):
48 |     '''
49 |     A simple but effective implementation of int8 quantization for linear layers.
50 |     The weights are quantized and stored as Int8, which saves ~50% of the gpu memory.
51 |     During the forward pass, the weights are de-quantized back to fp16 to do multiplication.
52 |     Pros:
53 |         - saves ~50% of the gpu memory
54 |         - accurate quantization because only the weights are quantized, and the weights don't suffer
55 |           from the "outliers" issue mentioned in the LLM.int8 paper; only the activations do.
56 |         - high precision results because the multiplication is done in fp16
57 |         - much faster than LLM.int8
58 |     Cons:
59 |         - a bit slower because of the added computation of dequantization in each forward pass. In practice, the slowdown
60 |           is not large because in the generation application, gpu utilization is not very high.
61 | ''' 62 | def __init__(self, linear_layer): 63 | super().__init__() 64 | self.bias = linear_layer.bias 65 | 66 | weight_bit_width = 8 67 | weight = linear_layer.weight 68 | 69 | self.weight_scale = torch.nn.Parameter( 70 | (weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)).half(), 71 | ) 72 | # print(self.weight_scale.max().item(), self.weight_scale.min().item(), self.weight_scale.mean().item()) 73 | # if self.weight_scale.max().item() > 0.002: 74 | # print(self.weight_scale.max().item()) 75 | self.weight = torch.nn.Parameter( 76 | torch.round(weight.float() / self.weight_scale[:, None]).char(), 77 | requires_grad=False 78 | ) 79 | 80 | def forward(self, x): 81 | weight = self.weight.half() * self.weight_scale[:, None] 82 | return torch.nn.functional.linear(x, weight, self.bias) 83 | 84 | 85 | def convert_model_to_int8_on_gpu(model, device): 86 | """ 87 | Quantize a model to int8 and move it to GPU using a simple method. 88 | """ 89 | if 'cuda' not in device: 90 | raise ValueError(f"Target device should be a gpu. Device {device} is not supported") 91 | 92 | model.half() 93 | 94 | memory_before_quantization = get_memory_footprint(model) # without lm_head 95 | 96 | ـreplace_linear_with_int8linear(model) # replace `Linear` with `QuantizedLinearInt8` 97 | 98 | model.to(device=device) 99 | memory_after_quantization = get_memory_footprint(model) # without lm_head 100 | 101 | saving = round(100 * memory_after_quantization/memory_before_quantization) 102 | memory_before_quantization = round(memory_before_quantization / 2**30, 2) # rounding for printing 103 | memory_after_quantization = round(memory_after_quantization / 2**30, 2) # rounding for printing 104 | 105 | print(f'Quantization memory - before: {memory_before_quantization} GB, after: {memory_after_quantization} GB ({saving}% of the size before)') 106 | return model 107 | -------------------------------------------------------------------------------- /src/interface.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from functools import partial 3 | import gradio as gr 4 | import torch 5 | from peft import PeftModel 6 | from transformers import ( 7 | AutoModelForCausalLM, 8 | AutoTokenizer, 9 | GenerationConfig, 10 | LlamaTokenizer, 11 | LlamaForCausalLM 12 | ) 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("--model_name_or_path", type=str, required=True) 16 | parser.add_argument("--ckpt_path", type=str, default=None) 17 | parser.add_argument("--use_lora", action="store_true") 18 | parser.add_argument("--llama", action="store_true") 19 | parser.add_argument("--base_port", default=17860, type=int) 20 | parser.add_argument("--use_raw_prompt", action="store_true") 21 | args = parser.parse_args() 22 | 23 | 24 | def generate_prompt(input_text): 25 | if not args.use_raw_prompt: 26 | return f"Human: \n{input_text}\n\nAssistant: \n" 27 | else: 28 | return input_text 29 | 30 | 31 | def evaluate( 32 | model, 33 | tokenizer, 34 | input: str, 35 | temperature=0.1, 36 | top_p=0.75, 37 | top_k=40, 38 | num_beams=4, 39 | do_sample=False, 40 | max_new_tokens=128, 41 | min_new_tokens=1, 42 | repetition_penalty=1.2, 43 | ): 44 | prompt = generate_prompt(input) 45 | inputs = tokenizer(prompt, add_special_tokens=False, return_tensors="pt") 46 | 47 | input_ids = inputs["input_ids"].to(getattr(model, "module", model).device) 48 | generation_config = GenerationConfig( 49 | temperature=temperature, 50 | top_p=top_p, 51 | top_k=top_k, 52 | num_beams=num_beams, 
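        # the fields below pin the tokenizer's special-token ids and the length
        # bounds; with do_sample=False (the form's default), num_beams>1 yields
        # deterministic beam search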
53 |         bos_token_id=tokenizer.bos_token_id,
54 |         eos_token_id=tokenizer.eos_token_id,
55 |         pad_token_id=tokenizer.pad_token_id,
56 |         max_new_tokens=max_new_tokens,  # max_length=max_new_tokens+input_sequence
57 |         min_new_tokens=min_new_tokens,  # min_length=min_new_tokens+input_sequence
58 |         repetition_penalty=repetition_penalty,
59 |         do_sample=do_sample,
60 |     )
61 |     with torch.no_grad():
62 |         generation_output = model.generate(
63 |             input_ids=input_ids,
64 |             generation_config=generation_config,
65 |             return_dict_in_generate=True,
66 |             output_scores=False,
67 |         )
68 |     output = generation_output.sequences[0]
69 |     output = tokenizer.decode(
70 |         output,
71 |         skip_special_tokens=True
72 |     )[len(prompt):].strip()
73 |     return output
74 |
75 |
76 | if __name__ == "__main__":
77 |     load_type = torch.float16  # sometimes torch.float32 is needed instead
78 |     if args.ckpt_path is None or args.ckpt_path == '':
79 |         args.ckpt_path = args.model_name_or_path
80 |
81 |     if args.llama:
82 |         tokenizer = LlamaTokenizer.from_pretrained(args.model_name_or_path)
83 |         tokenizer.add_special_tokens(
84 |             {   # standard Llama special-token strings
85 |                 "bos_token": "<s>",
86 |                 "eos_token": "</s>",
87 |                 "unk_token": "<unk>",
88 |                 "pad_token": "<pad>",
89 |             }
90 |         )
91 |     else:
92 |         tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
93 |
94 |     print("loading model...")
95 |
96 |     if args.llama:
97 |         model = LlamaForCausalLM.from_pretrained(args.ckpt_path, torch_dtype=load_type, device_map='auto')
98 |         model.config.use_flash_attention = True
99 |         model.config.pad_token_id = 0
100 |         model.config.eos_token_id = 2
101 |     else:
102 |         model = AutoModelForCausalLM.from_pretrained(args.ckpt_path, torch_dtype=load_type, device_map='auto')
103 |
104 |     # peft model
105 |     if args.use_lora:
106 |         model = PeftModel.from_pretrained(model, args.ckpt_path, torch_dtype=load_type)
107 |
108 |     if not torch.cuda.is_available():
109 |         device = torch.device("cpu")
110 |         model.float()
111 |         model.to(device)
112 |
113 |     model.eval()
114 |
115 |     print("Model loaded successfully")
116 |     # https://gradio.app/docs/
117 |     gr.Interface(
118 |         fn=partial(evaluate, model, tokenizer),
119 |         inputs=[
120 |             gr.components.Textbox(
121 |                 lines=2, label="Input", placeholder="Welcome to the FinMA model"
122 |             ),
123 |             gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
124 |             gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
125 |             gr.components.Slider(
126 |                 minimum=0, maximum=100, step=1, value=40, label="Top k"
127 |             ),
128 |             gr.components.Slider(
129 |                 minimum=1, maximum=4, step=1, value=1, label="Beams Number"
130 |             ),
131 |             gr.components.Checkbox(value=False, label="Do sample"),
132 |             gr.components.Slider(
133 |                 minimum=1, maximum=2000, step=10, value=512, label="Max New Tokens"
134 |             ),
135 |             gr.components.Slider(
136 |                 minimum=1, maximum=300, step=10, value=1, label="Min New Tokens"
137 |             ),
138 |             gr.components.Slider(
139 |                 minimum=1.0,
140 |                 maximum=2.0,
141 |                 step=0.1,
142 |                 value=1.2,
143 |                 label="Repetition Penalty",
144 |             ),
145 |         ],
146 |         outputs=[
147 |             gr.components.Textbox(
148 |                 lines=25,
149 |                 label="Output",
150 |             )
151 |         ],
152 |         title="FinMA: Financial Large Language Model",
153 |     ).queue().launch(
154 |         share=True, server_name="0.0.0.0", server_port=args.base_port
155 |     )
--------------------------------------------------------------------------------
/src/model_prompt.py:
--------------------------------------------------------------------------------
1 | def no_prompt(ctx):
2 |     return ctx
3 |
4 | def finma_prompt(ctx):
5 |     return f'Human: \n{ctx}\n\nAssistant: \n'
6 |
7 | MODEL_PROMPT_MAP = {
8 |     "no_prompt": no_prompt,
9 |     "finma_prompt": finma_prompt
10 | }
--------------------------------------------------------------------------------
/src/tasks/__init__.py:
--------------------------------------------------------------------------------
1 | from pprint import pprint
2 | from typing import List, Union
3 |
4 | import json
5 | import lm_eval.base
6 |
7 | from . import flare
8 |
9 | TASK_REGISTRY = {
10 |     "flare_es_financees": flare.ESFINANCEES,
11 |     "flare_es_multifin": flare.ESMultiFin,
12 |     "flare_es_efp": flare.ESEFP,
13 |     "flare_es_efpa": flare.ESEFPA,
14 |     "flare_es_fns": flare.ESFNS,
15 |     "flare_es_tsa": flare.ESTSA,
16 |     "flare_fpb": flare.FPB,
17 |     "flare_fiqasa": flare.FIQASA,
18 |     "flare_ner": flare.NER,
19 |     "flare_finqa": flare.FinQA,
20 |     "flare_convfinqa": flare.ConvFinQA,
21 |     "flare_headlines": flare.Headlines,
22 |     "flare_finer_ord": flare.FinerOrd,
23 |     "flare_fomc": flare.FOMC,
24 |     "flare_german": flare.German,
25 |     "flare_australian": flare.Australian,
26 |     # "flare_fomc": flare.FOMC,  # duplicate key; already registered above
27 |     "flare_ectsum": flare.ECTSUM,
28 |     "flare_edtsum": flare.EDTSUM,
29 |     # "flare_finarg_ecc_auc": flare.FinargECCAUC,  # duplicate key; the FINARGECCAUC entry below takes precedence
30 |     # "flare_finarg_ecc_arc": flare.FinargECCARC,  # duplicate key; the FINARGECCARC entry below takes precedence
31 |     "flare_cd": flare.CD,
32 |     "flare_multifin_en": flare.MultiFinEN,
33 |     "flare_tsa": flare.TSA,
34 |     "flare_cfa": flare.CFA,
35 |     "flare_ma": flare.MA,
36 |     "flare_causal20_sc": flare.Causal20SC,
37 |     "flare_finarg_ecc_arc": flare.FINARGECCARC,
38 |     "flare_finarg_ecc_auc": flare.FINARGECCAUC,
39 |     "flare_mlesg": flare.MLESG,
40 |     "flare_fnxl": flare.FNXL,
41 |     "flare_fsrl": flare.FSRL,
42 |     "flare_tatqa": flare.TATQA,
43 |     "flare_finred": flare.FinRED,
44 |     "flare_cra_lendingclub": flare.lendingclub,
45 |     "flare_cra_ccf": flare.ccf,
46 |     "flare_cra_ccfraud": flare.ccfraud,
47 |     "flare_cra_polish": flare.polish,
48 |     "flare_cra_taiwan": flare.taiwan,
49 |     "flare_cra_portoseguro": flare.portoseguro,
50 |     "flare_cra_travelinsurace": flare.travelinsurace,  # sic
51 |     "flare_sm_bigdata": flare.StockMovementBigData,
52 |     "flare_sm_acl": flare.StockMovementACL,
53 |     "flare_sm_cikm": flare.StockMovementCIKM,
54 |     "flare_en_finterm": flare.FINTERM,
55 |     "flare_en_acronym": flare.ACRONYM,
56 |     **flare.SM_TASKS,
57 |     "flare_finarg_ecc_auc_test": flare.FINARGECCAUC_test,
58 |     "flare_edtsum_test": flare.EDTSUM_test,
59 | }
60 |
61 | ALL_TASKS = sorted(list(TASK_REGISTRY))
62 |
63 | _EXAMPLE_JSON_PATH = "split:key:/absolute/path/to/data.json"
64 |
65 |
66 | def add_json_task(task_name):
67 |     """Add a JSON perplexity task if the given task name matches the
68 |     JSON task specification.
69 |
70 |     See `json.JsonPerplexity`.
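    For example (hypothetical path), "json=train:text:/data/my_corpus.json"
    registers a perplexity task over the "text" key of the "train" split.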
71 |     """
72 |     if not task_name.startswith("json"):
73 |         return
74 |
75 |     def create_json_task():
76 |         splits = task_name.split("=", 1)
77 |         if len(splits) != 2 or not splits[1]:
78 |             raise ValueError(
79 |                 "json tasks need a path argument pointing to the local "
80 |                 "dataset, specified like this: json="
81 |                 + _EXAMPLE_JSON_PATH
82 |                 + ' (if there are no splits, use "train")'
83 |             )
84 |
85 |         json_path = splits[1]
86 |         if json_path == _EXAMPLE_JSON_PATH:
87 |             raise ValueError(
88 |                 "please do not copy the example path directly, but substitute "
89 |                 "it with a path to your local dataset"
90 |             )
91 |         return lambda: json.JsonPerplexity(json_path)  # note: the stdlib `json` module has no JsonPerplexity; an lm-eval json task module is expected here
92 |
93 |     TASK_REGISTRY[task_name] = create_json_task()
94 |
95 |
96 | def get_task(task_name):
97 |     try:
98 |         add_json_task(task_name)
99 |         return TASK_REGISTRY[task_name]
100 |     except KeyError:
101 |         print("Available tasks:")
102 |         pprint(TASK_REGISTRY)
103 |         raise KeyError(f"Missing task {task_name}")
104 |
105 |
106 | def get_task_name_from_object(task_object):
107 |     for name, class_ in TASK_REGISTRY.items():
108 |         if class_ is task_object:
109 |             return name
110 |
111 |     # this gives a mechanism for non-registered tasks to have a custom name anyway when reporting
112 |     return (
113 |         task_object.EVAL_HARNESS_NAME
114 |         if hasattr(task_object, "EVAL_HARNESS_NAME")
115 |         else type(task_object).__name__
116 |     )
117 |
118 |
119 | def get_task_dict(task_name_list: List[Union[str, lm_eval.base.Task]]):
120 |     task_name_dict = {
121 |         task_name: get_task(task_name)()
122 |         for task_name in task_name_list
123 |         if isinstance(task_name, str)
124 |     }
125 |     task_name_from_object_dict = {
126 |         get_task_name_from_object(task_object): task_object
127 |         for task_object in task_name_list
128 |         if not isinstance(task_object, str)
129 |     }
130 |     assert set(task_name_dict.keys()).isdisjoint(set(task_name_from_object_dict.keys()))
131 |     return {**task_name_dict, **task_name_from_object_dict}
--------------------------------------------------------------------------------
/src/tasks/utils.py:
--------------------------------------------------------------------------------
1 | def process_text(entity_string, text):
2 |     # Initialize
3 |     entity_list = [(", ".join(val.split(", ")[:-1]), val.split(", ")[-1]) for val in entity_string.split("\n")]
4 |     text_words = text.split()
5 |     labels = ['O'] * len(text_words)
6 |     # text_lower = text.lower()
7 |     text_lower = text  # lowering is disabled above, so matching stays case-sensitive
8 |
9 |     # Create a list to store the start index of each word
10 |     word_indices = [0]
11 |     for word in text_words[:-1]:
12 |         word_indices.append(word_indices[-1] + len(word) + 1)
13 |
14 |     # Iterate over the entity list
15 |     print (entity_list)  # debug output
16 |     for entity, entity_type in entity_list:
17 |         entity_words = entity.split()
18 |         entity_lower = entity
19 |
20 |         # Find start and end index of each occurrence of the entity in the text
21 |         start = 0
22 |         while True:
23 |             start = text_lower.find(entity_lower, start)
24 |             if not entity or start == -1: break  # No more occurrences
25 |             end = start + len(entity) - 1
26 |
27 |             # Find the words included in this occurrence
28 |             try:
29 |                 start_word = next(i for i, ind in enumerate(word_indices) if ind >= start)
30 |                 end_word = next(i for i, ind in enumerate(word_indices) if ind > end)
31 |
32 |                 # Label the words
33 |                 labels[start_word] = 'B-' + entity_type
34 |                 for i in range(start_word+1, end_word):
35 |                     labels[i] = 'I-' + entity_type
36 |
37 |             except Exception:
38 |                 pass
39 |             # Move to the next character after the occurrence
40 |             start = end + 1
41 |
42 |     return labels
43 |
--------------------------------------------------------------------------------
/src/tasks/zhutils.py:
--------------------------------------------------------------------------------
1 | def process_zhtext(entity_string, text):
2 |     # Initialize
3 |     name = entity_string.split(',')[0]
4 |     if len(entity_string.split(',')) > 1 and entity_string.split(',')[1]:
5 |         entity_type = entity_string.split(',')[1].strip()
6 |     else:
7 |         entity_type = 0  # fallback when no type is given; note that 'B-' + entity_type below assumes a string
8 |     formatted_name = ' '.join(list(name))
9 |     formatted_result = f"{formatted_name}, {entity_type}"
10 |
11 |     entity_list = [(", ".join(val.split(", ")[:-1]), val.split(", ")[-1]) for val in formatted_result.split("\n")]
12 |     text_words = text.split()
13 |     labels = ['O'] * len(text_words)
14 |     text_lower = text
15 |
16 |     # Create a list to store the start index of each word
17 |     word_indices = [0]
18 |     for word in text_words[:-1]:
19 |         word_indices.append(word_indices[-1] + len(word) + 1)
20 |
21 |     # Iterate over the entity list
22 |     print ("entity_list:",entity_list)  # debug output
23 |     for entity, entity_type in entity_list:
24 |         entity_words = entity.split()
25 |         entity_lower = entity
26 |         # print ("entity_lower:", entity_lower)
27 |
28 |         # Find start and end index of each occurrence of the entity in the text
29 |         start = 0
30 |         while True:
31 |             start = text_lower.find(entity_lower, start)
32 |             if not entity or start == -1: break  # No more occurrences
33 |             end = start + len(entity) - 1
34 |
35 |             # Find the words included in this occurrence
36 |             try:
37 |                 start_word = next(i for i, ind in enumerate(word_indices) if ind >= start)
38 |                 end_word = next(i for i, ind in enumerate(word_indices) if ind > end)
39 |
40 |                 # Label the words
41 |                 labels[start_word] = 'B-' + entity_type
42 |                 for i in range(start_word+1, end_word):
43 |                     labels[i] = 'I-' + entity_type
44 |
45 |             except Exception:
46 |                 pass
47 |             # Move to the next character after the occurrence
48 |             start = end + 1
49 |
50 |     return labels
51 |
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
1 | import time
2 | from typing import Any, List
3 | import copy
4 | from gradio_client import Client
5 | from tqdm import tqdm
6 |
7 |
8 | class MultiClient(object):
9 |     def __init__(self, worker_addrs, synced_worker=False) -> None:
10 |         self.clients = [Client(addr) for addr in worker_addrs]
11 |         self.synced_worker = synced_worker
12 |
13 |     def predict(self, tasks: List[List], max_retries: int = 3) -> List[Any]:
14 |         assert len(tasks) >= 1, "No predict tasks!"
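        # Dispatch scheme: each Gradio worker starts with one task; as a job
        # finishes (or exhausts max_retries), the freed client immediately picks
        # up the next unsubmitted task, keeping the worker pool saturated.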
15 | num_tasks = len(tasks) 16 | if self.synced_worker and len(tasks) % len(self.clients) != 0: 17 | num_dummy_tasks = len(self.clients) - len(tasks) % len(self.clients) 18 | tasks.extend([copy.deepcopy(tasks[-1]) for _ in range(num_dummy_tasks)]) 19 | 20 | pbar = tqdm(total=len(tasks)) 21 | jobs = { 22 | client: (i, client.submit(*(tasks[i]), api_name="/predict")) 23 | for i, client in enumerate(self.clients) 24 | if i < len(tasks) 25 | } 26 | results = {} 27 | retries = {i: 0 for i in range(len(tasks))} 28 | 29 | while jobs: 30 | for client, (i, job) in list(jobs.items()): 31 | if job.done(): 32 | pbar.update(1) 33 | del jobs[client] 34 | try: 35 | result = job.result() 36 | results[i] = result 37 | except Exception as e: 38 | print("Job failed with error:", e) 39 | if retries[i] < max_retries: 40 | print("Retrying job...") 41 | retries[i] += 1 42 | new_job = client.submit( 43 | *tasks[i], api_name="/predict") 44 | jobs[client] = (i, new_job) 45 | continue # Skip the rest of the loop 46 | else: 47 | results[i] = None 48 | 49 | new_i = len(results) + len(jobs) 50 | if new_i < len(tasks): 51 | new_task = tasks[new_i] 52 | new_job = client.submit( 53 | *new_task, api_name="/predict") 54 | jobs[client] = (new_i, new_job) 55 | time.sleep(1) 56 | pbar.close() 57 | 58 | predicts = [results[i] for i in range(num_tasks)] 59 | 60 | return predicts 61 | -------------------------------------------------------------------------------- /static/av.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-FinAI/PIXIU/da45ac467ca3a828621315881e33c68bca3bbb1f/static/av.jpg -------------------------------------------------------------------------------- /static/cr.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-FinAI/PIXIU/da45ac467ca3a828621315881e33c68bca3bbb1f/static/cr.jpg -------------------------------------------------------------------------------- /static/formula.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-FinAI/PIXIU/da45ac467ca3a828621315881e33c68bca3bbb1f/static/formula.jpg -------------------------------------------------------------------------------- /static/md.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-FinAI/PIXIU/da45ac467ca3a828621315881e33c68bca3bbb1f/static/md.jpg -------------------------------------------------------------------------------- /static/sr.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-FinAI/PIXIU/da45ac467ca3a828621315881e33c68bca3bbb1f/static/sr.jpg --------------------------------------------------------------------------------
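The retrieval pieces above compose into a small pipeline: DocDB chunks a JSONL knowledge source (one object per line with `title` and `text` fields) into SQLite-backed passages, and Retrieval ranks a title's passages against a query. A minimal sketch of the flow, assuming such a JSONL file exists; the `finterms.db` path, the example topic, and the query string are illustrative, not artifacts shipped by the repo:

from src.factscore_package.retrieval import DocDB, Retrieval

# Build (or reopen) the SQLite passage store; build_db() splits every document
# into passages of at most MAX_LENGTH (256) RoBERTa tokens.
db = DocDB(db_path=".cache/finterms.db",
           data_path="src/factscore_package/.cache/finterms.jsonl")

# BM25 retrieval over the passages of a single title; ranked results and
# per-topic BM25 indices are persisted by save_cache().
retrieval = Retrieval(db,
                      cache_path=".cache/retrieval-finterms.json",
                      embed_cache_path=".cache/retrieval-finterms.pkl",
                      retrieval_type="bm25")

# Fetch the top-3 passages of the "Arbitrage" entry for a candidate fact.
passages = retrieval.get_passages(
    "Arbitrage", "Arbitrage is the simultaneous purchase and sale of the same asset.", k=3)
for psg in passages:
    print(psg["title"], "->", psg["text"][:80])

retrieval.save_cache()

Passing retrieval_type="gtr-t5-large" (with a batch_size) would instead rank passages by dense sentence-transformer similarity, as implemented in get_gtr_passages above.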